scCertify Workflow

Introduction

scCertify is an R package for explainable confidence scoring of single-cell RNA-seq annotations.

The framework integrates:

1.Marker enrichment scoring

2.Neighborhood agreement

3.Entropy uncertainty

4.Doublet-aware confidence scoring

5.Confidence calibration

6.Explainable confidence attribution

Load Libraries

library(scCertify)

library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.6.0 but the current version is
#> 4.6.1; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#> 
#> Attaching package: 'SeuratObject'
#> The following objects are masked from 'package:base':
#> 
#>     intersect, t

library(SingleR)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#> 
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#> 
#>     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#>     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#>     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#>     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#>     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#>     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#>     colWeightedMeans, colWeightedMedians, colWeightedSds,
#>     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#>     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#>     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#>     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#>     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#>     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#>     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#>     rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: generics
#> 
#> Attaching package: 'generics'
#> The following objects are masked from 'package:base':
#> 
#>     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
#>     setequal, union
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following object is masked from 'package:utils':
#> 
#>     data
#> The following objects are masked from 'package:base':
#> 
#>     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
#>     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
#>     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
#>     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#>     rbind, Reduce, rownames, sapply, saveRDS, scale, sequence, table,
#>     tapply, transform, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:utils':
#> 
#>     findMatches
#> The following objects are masked from 'package:base':
#> 
#>     expand.grid, I, unname
#> Loading required package: IRanges
#> 
#> Attaching package: 'IRanges'
#> The following object is masked from 'package:sp':
#> 
#>     %over%
#> Loading required package: Seqinfo
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> 
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#> 
#>     rowMedians
#> The following objects are masked from 'package:matrixStats':
#> 
#>     anyMissing, rowMedians
#> 
#> Attaching package: 'SummarizedExperiment'
#> The following object is masked from 'package:Seurat':
#> 
#>     Assays
#> The following object is masked from 'package:SeuratObject':
#> 
#>     Assays

library(celldex)
#> 
#> Attaching package: 'celldex'
#> The following objects are masked from 'package:SingleR':
#> 
#>     BlueprintEncodeData, DatabaseImmuneCellExpressionData,
#>     HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
#>     MouseRNAseqData, NovershternHematopoieticData

library(UCell)

library(scDblFinder)
#> Loading required package: SingleCellExperiment

library(SingleCellExperiment)

Load Example Dataset

data("pbmc_small")

pbmc_small
#> An object of class Seurat 
#> 230 features across 80 samples within 1 assay 
#> Active assay: RNA (230 features, 20 variable features)
#>  3 layers present: counts, data, scale.data
#>  2 dimensional reductions calculated: pca, tsne

Preprocessing

pbmc_small <- NormalizeData(
  pbmc_small
)

pbmc_small <- FindVariableFeatures(
  pbmc_small
)

pbmc_small <- ScaleData(
  pbmc_small
)
#> Centering and scaling data matrix

pbmc_small <- RunPCA(
  pbmc_small
)
#> Warning in svd.function(A = t(x = object), nv = npcs, ...): You're computing
#> too large a percentage of total singular values, use a standard svd instead.
#> PC_ 1 
#> Positive:  TYMP, LST1, AIF1, CST3, IFITM3, LYZ, FCGRT, GRN, BID, SERPINA1 
#>     S100A11, IFI30, CFP, S100A8, S100A9, HLA-DPA1, HLA-DPB1, HLA-DRB1, FCER1G, HCK 
#>     CTSS, FCN1, CFD, RNF130, TYROBP, HLA-DRB5, LINC00936, SMCO4, HLA-DRA, LGALS1 
#> Negative:  CCL5, LCK, CTSW, GZMA, GZMM, CST7, CD7, PRF1, CD3D, RARRES3 
#>     IL32, CD3E, CD247, FGFBP2, GNLY, LAMP1, GP9, NKG7, GZMH, NGFRAP1 
#>     TUBB1, GZMB, PF4, GNG11, KLRG1, HIST1H2AC, XBP1, CLU, PGRMC1, GYPC 
#> PC_ 2 
#> Positive:  GNG11, CLU, SDPR, SPARC, GP9, PF4, HIST1H2AC, PPBP, CD9, ITGA2B 
#>     CA2, NRGN, TUBB1, TREML1, TMEM40, NGFRAP1, MYL9, PTCRA, PGRMC1, NCOA4 
#>     RUFY1, ODC1, TSC22D1, ACRBP, GPX1, TPM4, PARVB, FERMT3, TALDO1, SAT1 
#> Negative:  IFITM2, NKG7, CST7, CTSW, GZMA, GZMM, LCK, CD7, GNLY, FGFBP2 
#>     SSR2, XBP1, GZMB, IL32, PRF1, CD247, EIF4A2, HNRNPA3, SRSF7, SPON2 
#>     CD3D, GZMH, HNRNPF, KLRD1, CCL4, LAMP1, YWHAB, GIMAP1, EIF3G, GYPC 
#> PC_ 3 
#> Positive:  MS4A1, CD79A, TCL1A, CD79B, LTB, FCER2, HLA-DQB1, HVCN1, PPAPDC1B, CYB561A3 
#>     CD180, HLA-DMB, FCRLA, LINC00926, KIAA0125, RP11-693J15.5, CXCR4, CD19, LY86, CD200 
#>     IGLL5, NCF1, SP100, HLA-DRB5, SNHG7, HLA-DQA1, NT5C, HLA-DRB1, HLA-DRA, BANK1 
#> Negative:  LGALS1, FCER1G, TYROBP, GZMB, PRF1, GNLY, CCL4, CCL5, FCGR3A, AKR1C3 
#>     GZMA, FGFBP2, KLRD1, PSAP, GZMH, TTC38, IGFBP7, NKG7, C12orf75, CST7 
#>     SPON2, RHOC, GZMM, CD247, PCMT1, LAMP1, CTSW, PTGDR, SAT1, GSTP1 
#> PC_ 4 
#> Positive:  FGFBP2, AKR1C3, GZMB, CCL4, MS4A1, CLIC3, HLA-DRB1, TTC38, CD79A, NT5C 
#>     TCL1A, IL2RB, GNLY, KLRD1, GZMH, HLA-DMB, PRF1, CTSW, HLA-DPA1, NKG7 
#>     HLA-DPB1, ARHGDIA, HLA-DQB1, IGFBP7, CST7, C12orf75, XCL2, LY86, CD79B, PTGDR 
#> Negative:  CD3D, MAL, SAFB2, IL7R, MPHOSPH6, THYN1, ASNSD1, TAGAP, LDHB, CD2 
#>     NOSIP, TMEM204, DNAJB1, CCR7, CCDC104, ACAP1, TAF7, TMUB1, LTB, CD3E 
#>     TMEM123, PIK3IP1, IL32, ACSM3, SRSF7, EPC1, EIF4A2, GYPC, TNFAIP8, BLOC1S4 
#> PC_ 5 
#> Positive:  FCGR3A, LILRA3, C5AR1, RP11-290F20.3, CD79B, TNFRSF1B, SMCO4, CFD, CD68, CTSS 
#>     ADAR, VSTM1, EPC1, HVCN1, RHOC, FCN1, SERPINA1, MPHOSPH6, MS4A7, NCF1 
#>     CLIC3, RBP7, WARS, FAM96A, STX10, FPR1, SCO2, AKR1C3, CD200, BLOC1S4 
#> Negative:  FCER1A, CLEC10A, RGS1, CD1C, LGALS2, IL1B, HLA-DMA, HLA-DQA2, FUOM, ZNF330 
#>     HLA-DQA1, MMADHC, RNF130, MS4A6A, POP7, HNRNPH1, RPL7L1, GPX1, ZFP36L1, LINC00936 
#>     HLA-DMB, LY86, SRSF7, HSP90AA1, GSTP1, SSR2, SATB1, GZMK, CCL5, HLA-DQB1
#> Warning: Number of dimensions changing from 19 to 50

pbmc_small <- RunUMAP(
  pbmc_small,
  dims = 1:10
)
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 15:45:42 UMAP embedding parameters a = 0.9922 b = 1.112
#> 15:45:42 Read 80 rows and found 10 numeric columns
#> 15:45:42 Using Annoy for neighbor search, n_neighbors = 30
#> 15:45:42 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 15:45:42 Writing NN index file to temp file /tmp/Rtmp1ZisKB/file256112f62fb6
#> 15:45:42 Searching Annoy index using 1 thread, search_k = 3000
#> 15:45:42 Annoy recall = 100%
#> 15:45:43 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 15:45:45 Initializing from normalized Laplacian + noise (using RSpectra)
#> 15:45:45 Commencing optimization for 500 epochs, with 2020 positive edges
#> 15:45:45 Using rng type: pcg
#> 15:45:46 Optimization finished

Convert to SingleCellExperiment

sce <- as.SingleCellExperiment(
  pbmc_small
)
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'

SingleR Annotation

ref <- HumanPrimaryCellAtlasData()

pred <- SingleR(
  test = sce,
  ref = ref,
  labels = ref$label.main
)

pbmc_small$predicted_label <-
  pred$labels

Doublet Detection

sce <- scDblFinder(
  sce
)
#> Warning in .checkSCE(sce, coerce = is.null(samples)): Some cells in `sce` have
#> an extremely low read counts; note that these could trigger errors and might
#> best be filtered out
#> Warning in scDblFinder(sce): scDblFinder might not work well with very low
#> numbers of cells.
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Creating ~1500 artificial doublets...
#> Dimensional reduction
#> Evaluating kNN...
#> Training model...
#> iter=0, 2 cells excluded from training.
#> iter=1, 3 cells excluded from training.
#> iter=2, 3 cells excluded from training.
#> Threshold found:0.925
#> 3 (3.8%) doublets called

pbmc_small$doublet_score <-
  colData(sce)$scDblFinder.score

pbmc_small$doublet_class <-
  colData(sce)$scDblFinder.class

Marker Database

markers <- list(
  "B_cell" = c(
    "MS4A1",
    "CD79A"
  ),
  "T_cells" = c(
    "CD3D",
    "IL7R"
  ),
  "Monocyte" = c(
    "LYZ",
    "S100A8"
  ),
  "NK_cell" = c(
    "NKG7",
    "GNLY"
  ),
  "DC" = c(
    "FCER1A",
    "CST3"
  )
)

Entropy Calculation

pbmc_small$entropy_score <-
  entropy_score(
    pred$scores
  )

pbmc_small$entropy_norm <- (
  pbmc_small$entropy_score -

    min(pbmc_small$entropy_score)
) / (
  max(pbmc_small$entropy_score) -

    min(pbmc_small$entropy_score)
)

Run scCertify

pbmc_small <- cell_certify(
  pbmc_small,
  markers
)
#> Warning in grepl(marker_clean, label_clean): argument 'pattern' has length > 1
#> and only the first element will be used
#> Warning in marker_score(object, markers, label_column): No ontology match for:
#> HSC_-G-CSF
#> Warning in grepl(marker_clean, label_clean): argument 'pattern' has length > 1
#> and only the first element will be used
#> Warning in marker_score(object, markers, label_column): No ontology match for:
#> Platelets

Confidence Classes

table(
  pbmc_small$confidence_class
)
#> 
#>     High      Low Moderate 
#>       17       39       24

Confidence Score Visualization

FeaturePlot(
  pbmc_small,
  features = "confidence_score"
)

Confidence Class Visualization

DimPlot(
  pbmc_small,
  group.by = "confidence_class"
)

Explain Confidence Attribution

cell_id <- colnames(
  pbmc_small
)[1]

explain_confidence(
  pbmc_small,
  cell_id
)
#> [1] "High annotation uncertainty"

Session Information

sessionInfo()
#> R version 4.6.1 (2026-06-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 26.04 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.32.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] scDblFinder_1.27.5          SingleCellExperiment_1.35.1
#>  [3] UCell_2.17.0                celldex_1.23.0             
#>  [5] SingleR_2.15.1              SummarizedExperiment_1.43.0
#>  [7] Biobase_2.73.1              GenomicRanges_1.65.0       
#>  [9] Seqinfo_1.3.0               IRanges_2.47.2             
#> [11] S4Vectors_0.51.5            BiocGenerics_0.59.8        
#> [13] generics_0.1.4              MatrixGenerics_1.25.0      
#> [15] matrixStats_1.5.0           Seurat_5.5.1               
#> [17] SeuratObject_5.4.0          sp_2.2-1                   
#> [19] scCertify_0.99.1            rmarkdown_2.31             
#> 
#> loaded via a namespace (and not attached):
#>   [1] spatstat.sparse_3.2-0     bitops_1.0-9             
#>   [3] httr_1.4.8                RColorBrewer_1.1-3       
#>   [5] tools_4.6.1               sctransform_0.4.3        
#>   [7] alabaster.base_1.13.1     R6_2.6.1                 
#>   [9] HDF5Array_1.41.0          lazyeval_0.2.3           
#>  [11] uwot_0.2.4                rhdf5filters_1.25.0      
#>  [13] withr_3.0.3               gridExtra_2.3.1          
#>  [15] progressr_0.19.0          cli_3.6.6                
#>  [17] spatstat.explore_3.8-1    fastDummies_1.7.6        
#>  [19] labeling_0.4.3            entropy_1.3.2            
#>  [21] alabaster.se_1.13.0       sass_0.4.10              
#>  [23] S7_0.2.2                  spatstat.data_3.1-9      
#>  [25] ggridges_0.5.7            pbapply_1.7-4            
#>  [27] Rsamtools_2.29.0          scater_1.41.2            
#>  [29] parallelly_1.48.0         RSQLite_3.53.3           
#>  [31] FNN_1.1.4.1               BiocIO_1.23.3            
#>  [33] ica_1.0-3                 spatstat.random_3.5-0    
#>  [35] dplyr_1.2.1               scrapper_1.7.3           
#>  [37] Matrix_1.7-5              ggbeeswarm_0.7.3         
#>  [39] abind_1.4-8               lifecycle_1.0.5          
#>  [41] yaml_2.3.12               rhdf5_2.57.1             
#>  [43] SparseArray_1.13.2        BiocFileCache_3.3.0      
#>  [45] Rtsne_0.17                grid_4.6.1               
#>  [47] blob_1.3.0                promises_1.5.0           
#>  [49] ExperimentHub_3.3.1       crayon_1.5.3             
#>  [51] miniUI_0.1.2              lattice_0.22-9           
#>  [53] beachmat_2.29.0           cowplot_1.2.0            
#>  [55] cigarillo_1.3.0           KEGGREST_1.53.4          
#>  [57] sys_3.4.3                 maketools_1.3.2          
#>  [59] pillar_1.11.1             knitr_1.51               
#>  [61] rjson_0.2.23              xgboost_3.2.1.1          
#>  [63] future.apply_1.20.2       codetools_0.2-20         
#>  [65] glue_1.8.1                spatstat.univar_3.2-0    
#>  [67] data.table_1.18.4         vctrs_0.7.3              
#>  [69] png_0.1-9                 gypsum_1.9.0             
#>  [71] spam_2.11-4               gtable_0.3.6             
#>  [73] cachem_1.1.0              xfun_0.59                
#>  [75] S4Arrays_1.13.0           mime_0.13                
#>  [77] survival_3.8-6            bluster_1.23.0           
#>  [79] fitdistrplus_1.2-6        ROCR_1.0-12              
#>  [81] nlme_3.1-169              bit64_4.8.2              
#>  [83] alabaster.ranges_1.13.0   filelock_1.0.3           
#>  [85] RcppAnnoy_0.0.23          GenomeInfoDb_1.49.1      
#>  [87] bslib_0.11.0              irlba_2.3.7              
#>  [89] vipor_0.4.7               KernSmooth_2.23-26       
#>  [91] otel_0.2.0                DBI_1.3.0                
#>  [93] tidyselect_1.2.1          bit_4.6.0                
#>  [95] compiler_4.6.1            curl_7.1.0               
#>  [97] httr2_1.2.3               BiocNeighbors_2.7.2      
#>  [99] h5mread_1.5.0             DelayedArray_0.39.3      
#> [101] plotly_4.12.0             rtracklayer_1.73.0       
#> [103] scales_1.4.0              lmtest_0.9-40            
#> [105] rappdirs_0.3.4            stringr_1.6.0            
#> [107] digest_0.6.39             goftest_1.2-3            
#> [109] spatstat.utils_3.2-3      alabaster.matrix_1.13.0  
#> [111] XVector_0.53.0            htmltools_0.5.9          
#> [113] pkgconfig_2.0.3           sparseMatrixStats_1.25.0 
#> [115] dbplyr_2.6.0              fastmap_1.2.0            
#> [117] rlang_1.2.0               htmlwidgets_1.6.4        
#> [119] UCSC.utils_1.9.0          shiny_1.14.0             
#> [121] DelayedMatrixStats_1.35.0 farver_2.1.2             
#> [123] jquerylib_0.1.4           zoo_1.8-15               
#> [125] jsonlite_2.0.0            BiocParallel_1.47.0      
#> [127] BiocSingular_1.29.0       RCurl_1.98-1.19          
#> [129] magrittr_2.0.5            scuttle_1.23.1           
#> [131] dotCall64_1.2             patchwork_1.3.2          
#> [133] Rhdf5lib_2.1.0            Rcpp_1.1.1-1.1           
#> [135] viridis_0.6.5             reticulate_1.46.0        
#> [137] stringi_1.8.7             alabaster.schemas_1.13.0 
#> [139] MASS_7.3-65               AnnotationHub_4.3.2      
#> [141] plyr_1.8.9                parallel_4.6.1           
#> [143] listenv_1.0.0             ggrepel_0.9.8            
#> [145] deldir_2.0-4              Biostrings_2.81.3        
#> [147] splines_4.6.1             tensor_1.5.1             
#> [149] igraph_2.3.3              spatstat.geom_3.8-1      
#> [151] RcppHNSW_0.7.0            buildtools_1.0.0         
#> [153] reshape2_1.4.5            ScaledMatrix_1.21.0      
#> [155] BiocVersion_3.24.0        XML_3.99-0.23            
#> [157] evaluate_1.0.5            BiocManager_1.30.27      
#> [159] httpuv_1.6.17             RANN_2.6.2               
#> [161] tidyr_1.3.2               purrr_1.2.2              
#> [163] polyclip_1.10-7           future_1.70.0            
#> [165] scattermore_1.2           ggplot2_4.0.3            
#> [167] BiocBaseUtils_1.15.1      rsvd_1.0.5               
#> [169] xtable_1.8-8              restfulr_0.0.17          
#> [171] RSpectra_0.16-2           later_1.4.8              
#> [173] viridisLite_0.4.3         tibble_3.3.1             
#> [175] beeswarm_0.4.0            memoise_2.0.1            
#> [177] AnnotationDbi_1.75.0      GenomicAlignments_1.49.0 
#> [179] cluster_2.1.8.2           globals_0.19.1