Introduction
scCertify is an R package for explainable confidence scoring of single-cell RNA-seq annotations.
The framework integrates:
1.Marker enrichment scoring
2.Neighborhood agreement
3.Entropy uncertainty
4.Doublet-aware confidence scoring
5.Confidence calibration
6.Explainable confidence attribution
Load Libraries
library(scCertify)
library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.6.0 but the current version is
#> 4.6.1; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#>
#> Attaching package: 'SeuratObject'
#> The following objects are masked from 'package:base':
#>
#> intersect, t
library(SingleR)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#>
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#>
#> colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#> colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#> colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#> colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#> colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#> colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#> colWeightedMeans, colWeightedMedians, colWeightedSds,
#> colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#> rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#> rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#> rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#> rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#> rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#> rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#> rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: generics
#>
#> Attaching package: 'generics'
#> The following objects are masked from 'package:base':
#>
#> as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
#> setequal, union
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following object is masked from 'package:utils':
#>
#> data
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, aperm, append, as.data.frame, basename, cbind,
#> colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
#> get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, saveRDS, scale, sequence, table,
#> tapply, transform, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:utils':
#>
#> findMatches
#> The following objects are masked from 'package:base':
#>
#> expand.grid, I, unname
#> Loading required package: IRanges
#>
#> Attaching package: 'IRanges'
#> The following object is masked from 'package:sp':
#>
#> %over%
#> Loading required package: Seqinfo
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#>
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#>
#> rowMedians
#> The following objects are masked from 'package:matrixStats':
#>
#> anyMissing, rowMedians
#>
#> Attaching package: 'SummarizedExperiment'
#> The following object is masked from 'package:Seurat':
#>
#> Assays
#> The following object is masked from 'package:SeuratObject':
#>
#> Assays
library(celldex)
#>
#> Attaching package: 'celldex'
#> The following objects are masked from 'package:SingleR':
#>
#> BlueprintEncodeData, DatabaseImmuneCellExpressionData,
#> HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
#> MouseRNAseqData, NovershternHematopoieticData
library(UCell)
library(scDblFinder)
#> Loading required package: SingleCellExperiment
library(SingleCellExperiment)Load Example Dataset
data("pbmc_small")
pbmc_small
#> An object of class Seurat
#> 230 features across 80 samples within 1 assay
#> Active assay: RNA (230 features, 20 variable features)
#> 3 layers present: counts, data, scale.data
#> 2 dimensional reductions calculated: pca, tsnePreprocessing
pbmc_small <- NormalizeData(
pbmc_small
)
pbmc_small <- FindVariableFeatures(
pbmc_small
)
pbmc_small <- ScaleData(
pbmc_small
)
#> Centering and scaling data matrix
pbmc_small <- RunPCA(
pbmc_small
)
#> Warning in svd.function(A = t(x = object), nv = npcs, ...): You're computing
#> too large a percentage of total singular values, use a standard svd instead.
#> PC_ 1
#> Positive: TYMP, LST1, AIF1, CST3, IFITM3, LYZ, FCGRT, GRN, BID, SERPINA1
#> S100A11, IFI30, CFP, S100A8, S100A9, HLA-DPA1, HLA-DPB1, HLA-DRB1, FCER1G, HCK
#> CTSS, FCN1, CFD, RNF130, TYROBP, HLA-DRB5, LINC00936, SMCO4, HLA-DRA, LGALS1
#> Negative: CCL5, LCK, CTSW, GZMA, GZMM, CST7, CD7, PRF1, CD3D, RARRES3
#> IL32, CD3E, CD247, FGFBP2, GNLY, LAMP1, GP9, NKG7, GZMH, NGFRAP1
#> TUBB1, GZMB, PF4, GNG11, KLRG1, HIST1H2AC, XBP1, CLU, PGRMC1, GYPC
#> PC_ 2
#> Positive: GNG11, CLU, SDPR, SPARC, GP9, PF4, HIST1H2AC, PPBP, CD9, ITGA2B
#> CA2, NRGN, TUBB1, TREML1, TMEM40, NGFRAP1, MYL9, PTCRA, PGRMC1, NCOA4
#> RUFY1, ODC1, TSC22D1, ACRBP, GPX1, TPM4, PARVB, FERMT3, TALDO1, SAT1
#> Negative: IFITM2, NKG7, CST7, CTSW, GZMA, GZMM, LCK, CD7, GNLY, FGFBP2
#> SSR2, XBP1, GZMB, IL32, PRF1, CD247, EIF4A2, HNRNPA3, SRSF7, SPON2
#> CD3D, GZMH, HNRNPF, KLRD1, CCL4, LAMP1, YWHAB, GIMAP1, EIF3G, GYPC
#> PC_ 3
#> Positive: MS4A1, CD79A, TCL1A, CD79B, LTB, FCER2, HLA-DQB1, HVCN1, PPAPDC1B, CYB561A3
#> CD180, HLA-DMB, FCRLA, LINC00926, KIAA0125, RP11-693J15.5, CXCR4, CD19, LY86, CD200
#> IGLL5, NCF1, SP100, HLA-DRB5, SNHG7, HLA-DQA1, NT5C, HLA-DRB1, HLA-DRA, BANK1
#> Negative: LGALS1, FCER1G, TYROBP, GZMB, PRF1, GNLY, CCL4, CCL5, FCGR3A, AKR1C3
#> GZMA, FGFBP2, KLRD1, PSAP, GZMH, TTC38, IGFBP7, NKG7, C12orf75, CST7
#> SPON2, RHOC, GZMM, CD247, PCMT1, LAMP1, CTSW, PTGDR, SAT1, GSTP1
#> PC_ 4
#> Positive: FGFBP2, AKR1C3, GZMB, CCL4, MS4A1, CLIC3, HLA-DRB1, TTC38, CD79A, NT5C
#> TCL1A, IL2RB, GNLY, KLRD1, GZMH, HLA-DMB, PRF1, CTSW, HLA-DPA1, NKG7
#> HLA-DPB1, ARHGDIA, HLA-DQB1, IGFBP7, CST7, C12orf75, XCL2, LY86, CD79B, PTGDR
#> Negative: CD3D, MAL, SAFB2, IL7R, MPHOSPH6, THYN1, ASNSD1, TAGAP, LDHB, CD2
#> NOSIP, TMEM204, DNAJB1, CCR7, CCDC104, ACAP1, TAF7, TMUB1, LTB, CD3E
#> TMEM123, PIK3IP1, IL32, ACSM3, SRSF7, EPC1, EIF4A2, GYPC, TNFAIP8, BLOC1S4
#> PC_ 5
#> Positive: FCGR3A, LILRA3, C5AR1, RP11-290F20.3, CD79B, TNFRSF1B, SMCO4, CFD, CD68, CTSS
#> ADAR, VSTM1, EPC1, HVCN1, RHOC, FCN1, SERPINA1, MPHOSPH6, MS4A7, NCF1
#> CLIC3, RBP7, WARS, FAM96A, STX10, FPR1, SCO2, AKR1C3, CD200, BLOC1S4
#> Negative: FCER1A, CLEC10A, RGS1, CD1C, LGALS2, IL1B, HLA-DMA, HLA-DQA2, FUOM, ZNF330
#> HLA-DQA1, MMADHC, RNF130, MS4A6A, POP7, HNRNPH1, RPL7L1, GPX1, ZFP36L1, LINC00936
#> HLA-DMB, LY86, SRSF7, HSP90AA1, GSTP1, SSR2, SATB1, GZMK, CCL5, HLA-DQB1
#> Warning: Number of dimensions changing from 19 to 50
pbmc_small <- RunUMAP(
pbmc_small,
dims = 1:10
)
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 15:45:42 UMAP embedding parameters a = 0.9922 b = 1.112
#> 15:45:42 Read 80 rows and found 10 numeric columns
#> 15:45:42 Using Annoy for neighbor search, n_neighbors = 30
#> 15:45:42 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 15:45:42 Writing NN index file to temp file /tmp/Rtmp1ZisKB/file256112f62fb6
#> 15:45:42 Searching Annoy index using 1 thread, search_k = 3000
#> 15:45:42 Annoy recall = 100%
#> 15:45:43 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 15:45:45 Initializing from normalized Laplacian + noise (using RSpectra)
#> 15:45:45 Commencing optimization for 500 epochs, with 2020 positive edges
#> 15:45:45 Using rng type: pcg
#> 15:45:46 Optimization finishedConvert to SingleCellExperiment
sce <- as.SingleCellExperiment(
pbmc_small
)
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'SingleR Annotation
ref <- HumanPrimaryCellAtlasData()
pred <- SingleR(
test = sce,
ref = ref,
labels = ref$label.main
)
pbmc_small$predicted_label <-
pred$labelsDoublet Detection
sce <- scDblFinder(
sce
)
#> Warning in .checkSCE(sce, coerce = is.null(samples)): Some cells in `sce` have
#> an extremely low read counts; note that these could trigger errors and might
#> best be filtered out
#> Warning in scDblFinder(sce): scDblFinder might not work well with very low
#> numbers of cells.
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Creating ~1500 artificial doublets...
#> Dimensional reduction
#> Evaluating kNN...
#> Training model...
#> iter=0, 2 cells excluded from training.
#> iter=1, 3 cells excluded from training.
#> iter=2, 3 cells excluded from training.
#> Threshold found:0.925
#> 3 (3.8%) doublets called
pbmc_small$doublet_score <-
colData(sce)$scDblFinder.score
pbmc_small$doublet_class <-
colData(sce)$scDblFinder.classMarker Database
markers <- list(
"B_cell" = c(
"MS4A1",
"CD79A"
),
"T_cells" = c(
"CD3D",
"IL7R"
),
"Monocyte" = c(
"LYZ",
"S100A8"
),
"NK_cell" = c(
"NKG7",
"GNLY"
),
"DC" = c(
"FCER1A",
"CST3"
)
)Entropy Calculation
pbmc_small$entropy_score <-
entropy_score(
pred$scores
)
pbmc_small$entropy_norm <- (
pbmc_small$entropy_score -
min(pbmc_small$entropy_score)
) / (
max(pbmc_small$entropy_score) -
min(pbmc_small$entropy_score)
)Run scCertify
pbmc_small <- cell_certify(
pbmc_small,
markers
)
#> Warning in grepl(marker_clean, label_clean): argument 'pattern' has length > 1
#> and only the first element will be used
#> Warning in marker_score(object, markers, label_column): No ontology match for:
#> HSC_-G-CSF
#> Warning in grepl(marker_clean, label_clean): argument 'pattern' has length > 1
#> and only the first element will be used
#> Warning in marker_score(object, markers, label_column): No ontology match for:
#> PlateletsConfidence Classes
Confidence Score Visualization
Confidence Class Visualization
Explain Confidence Attribution
cell_id <- colnames(
pbmc_small
)[1]
explain_confidence(
pbmc_small,
cell_id
)
#> [1] "High annotation uncertainty"Session Information
sessionInfo()
#> R version 4.6.1 (2026-06-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 26.04 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.32.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] scDblFinder_1.27.5 SingleCellExperiment_1.35.1
#> [3] UCell_2.17.0 celldex_1.23.0
#> [5] SingleR_2.15.1 SummarizedExperiment_1.43.0
#> [7] Biobase_2.73.1 GenomicRanges_1.65.0
#> [9] Seqinfo_1.3.0 IRanges_2.47.2
#> [11] S4Vectors_0.51.5 BiocGenerics_0.59.8
#> [13] generics_0.1.4 MatrixGenerics_1.25.0
#> [15] matrixStats_1.5.0 Seurat_5.5.1
#> [17] SeuratObject_5.4.0 sp_2.2-1
#> [19] scCertify_0.99.1 rmarkdown_2.31
#>
#> loaded via a namespace (and not attached):
#> [1] spatstat.sparse_3.2-0 bitops_1.0-9
#> [3] httr_1.4.8 RColorBrewer_1.1-3
#> [5] tools_4.6.1 sctransform_0.4.3
#> [7] alabaster.base_1.13.1 R6_2.6.1
#> [9] HDF5Array_1.41.0 lazyeval_0.2.3
#> [11] uwot_0.2.4 rhdf5filters_1.25.0
#> [13] withr_3.0.3 gridExtra_2.3.1
#> [15] progressr_0.19.0 cli_3.6.6
#> [17] spatstat.explore_3.8-1 fastDummies_1.7.6
#> [19] labeling_0.4.3 entropy_1.3.2
#> [21] alabaster.se_1.13.0 sass_0.4.10
#> [23] S7_0.2.2 spatstat.data_3.1-9
#> [25] ggridges_0.5.7 pbapply_1.7-4
#> [27] Rsamtools_2.29.0 scater_1.41.2
#> [29] parallelly_1.48.0 RSQLite_3.53.3
#> [31] FNN_1.1.4.1 BiocIO_1.23.3
#> [33] ica_1.0-3 spatstat.random_3.5-0
#> [35] dplyr_1.2.1 scrapper_1.7.3
#> [37] Matrix_1.7-5 ggbeeswarm_0.7.3
#> [39] abind_1.4-8 lifecycle_1.0.5
#> [41] yaml_2.3.12 rhdf5_2.57.1
#> [43] SparseArray_1.13.2 BiocFileCache_3.3.0
#> [45] Rtsne_0.17 grid_4.6.1
#> [47] blob_1.3.0 promises_1.5.0
#> [49] ExperimentHub_3.3.1 crayon_1.5.3
#> [51] miniUI_0.1.2 lattice_0.22-9
#> [53] beachmat_2.29.0 cowplot_1.2.0
#> [55] cigarillo_1.3.0 KEGGREST_1.53.4
#> [57] sys_3.4.3 maketools_1.3.2
#> [59] pillar_1.11.1 knitr_1.51
#> [61] rjson_0.2.23 xgboost_3.2.1.1
#> [63] future.apply_1.20.2 codetools_0.2-20
#> [65] glue_1.8.1 spatstat.univar_3.2-0
#> [67] data.table_1.18.4 vctrs_0.7.3
#> [69] png_0.1-9 gypsum_1.9.0
#> [71] spam_2.11-4 gtable_0.3.6
#> [73] cachem_1.1.0 xfun_0.59
#> [75] S4Arrays_1.13.0 mime_0.13
#> [77] survival_3.8-6 bluster_1.23.0
#> [79] fitdistrplus_1.2-6 ROCR_1.0-12
#> [81] nlme_3.1-169 bit64_4.8.2
#> [83] alabaster.ranges_1.13.0 filelock_1.0.3
#> [85] RcppAnnoy_0.0.23 GenomeInfoDb_1.49.1
#> [87] bslib_0.11.0 irlba_2.3.7
#> [89] vipor_0.4.7 KernSmooth_2.23-26
#> [91] otel_0.2.0 DBI_1.3.0
#> [93] tidyselect_1.2.1 bit_4.6.0
#> [95] compiler_4.6.1 curl_7.1.0
#> [97] httr2_1.2.3 BiocNeighbors_2.7.2
#> [99] h5mread_1.5.0 DelayedArray_0.39.3
#> [101] plotly_4.12.0 rtracklayer_1.73.0
#> [103] scales_1.4.0 lmtest_0.9-40
#> [105] rappdirs_0.3.4 stringr_1.6.0
#> [107] digest_0.6.39 goftest_1.2-3
#> [109] spatstat.utils_3.2-3 alabaster.matrix_1.13.0
#> [111] XVector_0.53.0 htmltools_0.5.9
#> [113] pkgconfig_2.0.3 sparseMatrixStats_1.25.0
#> [115] dbplyr_2.6.0 fastmap_1.2.0
#> [117] rlang_1.2.0 htmlwidgets_1.6.4
#> [119] UCSC.utils_1.9.0 shiny_1.14.0
#> [121] DelayedMatrixStats_1.35.0 farver_2.1.2
#> [123] jquerylib_0.1.4 zoo_1.8-15
#> [125] jsonlite_2.0.0 BiocParallel_1.47.0
#> [127] BiocSingular_1.29.0 RCurl_1.98-1.19
#> [129] magrittr_2.0.5 scuttle_1.23.1
#> [131] dotCall64_1.2 patchwork_1.3.2
#> [133] Rhdf5lib_2.1.0 Rcpp_1.1.1-1.1
#> [135] viridis_0.6.5 reticulate_1.46.0
#> [137] stringi_1.8.7 alabaster.schemas_1.13.0
#> [139] MASS_7.3-65 AnnotationHub_4.3.2
#> [141] plyr_1.8.9 parallel_4.6.1
#> [143] listenv_1.0.0 ggrepel_0.9.8
#> [145] deldir_2.0-4 Biostrings_2.81.3
#> [147] splines_4.6.1 tensor_1.5.1
#> [149] igraph_2.3.3 spatstat.geom_3.8-1
#> [151] RcppHNSW_0.7.0 buildtools_1.0.0
#> [153] reshape2_1.4.5 ScaledMatrix_1.21.0
#> [155] BiocVersion_3.24.0 XML_3.99-0.23
#> [157] evaluate_1.0.5 BiocManager_1.30.27
#> [159] httpuv_1.6.17 RANN_2.6.2
#> [161] tidyr_1.3.2 purrr_1.2.2
#> [163] polyclip_1.10-7 future_1.70.0
#> [165] scattermore_1.2 ggplot2_4.0.3
#> [167] BiocBaseUtils_1.15.1 rsvd_1.0.5
#> [169] xtable_1.8-8 restfulr_0.0.17
#> [171] RSpectra_0.16-2 later_1.4.8
#> [173] viridisLite_0.4.3 tibble_3.3.1
#> [175] beeswarm_0.4.0 memoise_2.0.1
#> [177] AnnotationDbi_1.75.0 GenomicAlignments_1.49.0
#> [179] cluster_2.1.8.2 globals_0.19.1