scCompoundDE is designed to fit into standard
Bioconductor single-cell workflows without introducing a
package-specific input object. The main function,
compoundDE, accepts a SingleCellExperiment
object, reads raw counts from an assay such as
assay(sce, "counts"), and uses sample, condition, broad
cell type, and subtype annotations stored in colData. This
allows users to pass the same object through quality control,
annotation, pseudo-bulk differential expression, and decomposition
steps.
A typical workflow is to perform batch-aware cell-level quality
control with scBatchQC, run donor-aware pseudo-bulk
differential expression with scFastDE or another
Bioconductor DE workflow, and then use scCompoundDE to
decide whether the observed broad pseudo-bulk signal is primarily
transcriptional or compositional. The result is returned as a
CDEResult S4 object containing a
S4Vectors::DataFrame of per-gene statistics, a subtype
proportion matrix, and per-subtype DE tables, so the output can be
filtered, plotted, or passed to downstream Bioconductor-style
analyses.
For each gene, compoundDE returns a
TC_ratio in \[0, 1\]:
| TC_ratio | Interpretation |
|---|---|
| ≈ 1.0 | Transcriptional — cells intrinsically changed expression |
| ≈ 0.0 | Compositional — proportion of subtypes shifted |
| ≈ 0.5 | Mixed — both mechanisms contribute |
Genes with low TC_ratio should be treated with caution: they may be DE in a standard analysis but reflect composition rather than biology.
We first demonstrate the tool on a dataset where the DE signal is deliberately transcriptional: the same proportions of TypeA and TypeB cells appear in both conditions, but TypeA cells intrinsically upregulate genes 1–10 in treatment.
library(scCompoundDE)
library(SingleCellExperiment)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#>
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#>
#> colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#> colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#> colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#> colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#> colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#> colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#> colWeightedMeans, colWeightedMedians, colWeightedSds,
#> colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#> rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#> rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#> rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#> rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#> rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#> rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#> rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: generics
#>
#> Attaching package: 'generics'
#> The following objects are masked from 'package:base':
#>
#> as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
#> setequal, union
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following object is masked from 'package:utils':
#>
#> data
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, aperm, append, as.data.frame, basename, cbind,
#> colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
#> get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, saveRDS, scale, sequence, table,
#> tapply, transform, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:utils':
#>
#> findMatches
#> The following objects are masked from 'package:base':
#>
#> expand.grid, I, unname
#> Loading required package: IRanges
#> Loading required package: Seqinfo
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#>
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#>
#> rowMedians
#> The following objects are masked from 'package:matrixStats':
#>
#> anyMissing, rowMedians
set.seed(42)
n_genes <- 150
n_donors <- 6
counts <- matrix(rpois(n_genes * 240L, 8L),
nrow = n_genes, ncol = 240L)
rownames(counts) <- paste0("Gene", seq_len(n_genes))
colnames(counts) <- paste0("Cell", seq_len(240L))
# First 10 genes × 4 in TypeA treatment cells only
# TypeA cells: cols 61–120 in treatment
counts[seq_len(10), 61:120] <- counts[seq_len(10), 61:120] * 5L
sce <- SingleCellExperiment(assays = list(counts = counts))
sce$donor <- rep(rep(paste0("D", seq_len(n_donors)), each = 10L), 4L)
sce$cell_type <- "T_cell"
sce$subtype <- rep(rep(c("TypeA","TypeB"),
each = n_donors * 10L), 2L)
sce$condition <- rep(c("ctrl","treat"), each = 120L)
# Check the design
table(sce$subtype, sce$condition)
#>
#> ctrl treat
#> TypeA 60 60
#> TypeB 60 60result <- compoundDE(
sce,
broad_type = "T_cell",
subtype_col = "subtype",
broad_col = "cell_type",
donor = "donor",
condition = "condition",
min_cells = 3L,
min_subtypes = 2L,
min_donors = 2L
)
#> compoundDE: broad_type='T_cell' | 2 subtypes: TypeA, TypeB
#> Computing subtype proportions...
#> Computing mean subtype expression...
#> Running per-subtype differential expression...
#> Subtype: TypeA
#> Subtype: TypeB
#> Valid subtypes for decomposition: TypeA, TypeB
#> Running broad pseudo-bulk DE...
#> Broad DE: 150 genes tested.
#> Decomposing DE signal into T and C components...
#> Decomposition complete: 150 transcriptional, 0 compositional.
result
#> CDEResult
#> Broad type : T_cell
#> Subtypes : TypeA, TypeB
#> Genes tested : 150
#> Significant : 110 (adj.P.Val < 0.05)
#> -- Decomposition ------------------
#> Transcriptional: 150 genes (TC_ratio >= 0.8 )
#> Compositional : 0 genes (TC_ratio <= 0.2 )
#> Mixed : 0 genes
#> Condition : conditionThe deTable accessor returns one row per tested gene. In
addition to the broad pseudo-bulk DE statistics (logFC,
P.Value, and adj.P.Val), it contains the
decomposition scores used by scCompoundDE:
T_score for the transcriptional component,
C_score for the compositional component,
TC_ratio for their relative contribution, and
source for the final classification.
dt <- as.data.frame(deTable(result))
dt_sorted <- dt[order(dt$adj.P.Val), ]
head(dt_sorted[, c("gene","logFC","adj.P.Val",
"T_score","C_score","TC_ratio","source")], 15)
#> gene logFC adj.P.Val T_score C_score TC_ratio
#> Gene3 Gene3 -1.4635124 2.034657e-16 -0.9970198 0 1
#> Gene10 Gene10 -1.4121116 2.034657e-16 -1.0096664 0 1
#> Gene2 Gene2 -1.4621673 4.616920e-16 -0.9726888 0 1
#> Gene9 Gene9 -1.4455129 4.616920e-16 -1.0341775 0 1
#> Gene7 Gene7 -1.3025249 5.831894e-16 -0.9030888 0 1
#> Gene1 Gene1 -1.3462222 6.369800e-16 -0.9465582 0 1
#> Gene4 Gene4 -1.4263297 6.837822e-16 -1.0133586 0 1
#> Gene6 Gene6 -1.4012873 2.071942e-15 -1.0184999 0 1
#> Gene5 Gene5 -1.3359805 2.446513e-14 -0.9571568 0 1
#> Gene8 Gene8 -1.3090724 5.022715e-14 -0.9002502 0 1
#> Gene93 Gene93 0.3136655 2.108143e-04 0.3035815 0 1
#> Gene117 Gene117 0.3119877 7.416939e-04 0.3053132 0 1
#> Gene96 Gene96 0.2940292 7.453923e-04 0.2842360 0 1
#> Gene126 Gene126 0.2930015 7.453923e-04 0.2860481 0 1
#> Gene65 Gene65 0.2814824 7.813075e-04 0.2752704 0 1
#> source
#> Gene3 transcriptional
#> Gene10 transcriptional
#> Gene2 transcriptional
#> Gene9 transcriptional
#> Gene7 transcriptional
#> Gene1 transcriptional
#> Gene4 transcriptional
#> Gene6 transcriptional
#> Gene5 transcriptional
#> Gene8 transcriptional
#> Gene93 transcriptional
#> Gene117 transcriptional
#> Gene96 transcriptional
#> Gene126 transcriptional
#> Gene65 transcriptionalThe injected genes (Gene1–Gene10) should have high TC_ratio values,
confirming they were detected as transcriptionally driven. In a real
analysis, genes classified as transcriptional are the
strongest candidates for cell-intrinsic biology, genes classified as
compositional should be interpreted as possible
subtype-abundance effects, and genes classified as mixed
require inspection of both scores.
injected <- dt[dt$gene %in% paste0("Gene", seq_len(10)), ]
cat("Median TC_ratio of injected genes:",
round(median(injected$TC_ratio, na.rm = TRUE), 3), "\n")
#> Median TC_ratio of injected genes: 1
cat("All injected genes classified as transcriptional:",
all(injected$source == "transcriptional", na.rm = TRUE), "\n")
#> All injected genes classified as transcriptional: TRUEThe plotting functions provide complementary diagnostics for the
decomposition. plotDecomposition shows whether significant
genes are driven mainly by the transcriptional or compositional axis,
plotTCRatio summarizes the distribution of gene
classifications, and plotProportion checks whether subtype
proportions differ between conditions enough to explain compositional
signals.
Scatter of transcriptional vs compositional z-scores. Blue = transcriptional, red = compositional, orange = mixed. Injected genes appear with high T and low C scores.
In the transcriptional simulation, the top injected genes should have large transcriptional scores and relatively small compositional scores. Points near the compositional axis would instead indicate genes whose broad pseudo-bulk fold-change is explained by subtype abundance shifts.
Distribution of TC_ratio scores. Vertical lines show classification thresholds at 0.2 and 0.8.
The TC_ratio histogram is useful as a quick summary of the experiment. A distribution concentrated near 1 suggests that most significant genes are cell-intrinsic, whereas a distribution concentrated near 0 suggests that the broad DE result is strongly affected by composition.
Subtype proportions per condition. Balanced design — equal TypeA and TypeB in both conditions.
The proportion plot should be read together with the decomposition plot. If one subtype expands or contracts between conditions, genes expressed mainly in that subtype are more likely to receive low TC_ratio values.
Now we demonstrate the tool on a dataset where the apparent DE is caused entirely by a subtype proportion shift. Gene1 is a marker of TypeB cells. In control, most cells are TypeA (low Gene1). In treatment, most cells become TypeB (high Gene1). A standard DE tool would call Gene1 as significantly upregulated — but it is a compositional artefact.
set.seed(99)
n_genes <- 150
# Unequal proportions: ctrl = 75% TypeA, treat = 75% TypeB
n_a_ctrl <- 72L; n_b_ctrl <- 24L
n_a_treat <- 24L; n_b_treat <- 72L
n_cells <- n_a_ctrl + n_b_ctrl + n_a_treat + n_b_treat
counts2 <- matrix(rpois(n_genes * n_cells, 5L),
nrow = n_genes, ncol = n_cells)
rownames(counts2) <- paste0("Gene", seq_len(n_genes))
colnames(counts2) <- paste0("Cell", seq_len(n_cells))
# Gene1 is a marker of TypeB only — no transcriptional change
typeB_idx <- c(
seq(n_a_ctrl + 1L, n_a_ctrl + n_b_ctrl),
seq(n_a_ctrl + n_b_ctrl + n_a_treat + 1L, n_cells)
)
counts2[1L, typeB_idx] <- counts2[1L, typeB_idx] * 8L
sce2 <- SingleCellExperiment(assays = list(counts = counts2))
sce2$donor <- c(
rep(paste0("D", 1:6), each = 12L),
rep(paste0("D", 1:6), each = 4L),
rep(paste0("D", 1:6), each = 4L),
rep(paste0("D", 1:6), each = 12L)
)
sce2$cell_type <- "T_cell"
sce2$subtype <- c(rep("TypeA", n_a_ctrl), rep("TypeB", n_b_ctrl),
rep("TypeA", n_a_treat), rep("TypeB", n_b_treat))
sce2$condition <- c(rep("ctrl", n_a_ctrl + n_b_ctrl),
rep("treat", n_a_treat + n_b_treat))
table(sce2$subtype, sce2$condition)
#>
#> ctrl treat
#> TypeA 72 24
#> TypeB 24 72result2 <- compoundDE(
sce2,
broad_type = "T_cell",
subtype_col = "subtype",
broad_col = "cell_type",
donor = "donor",
condition = "condition",
min_cells = 3L,
min_subtypes = 2L,
min_donors = 2L
)
#> compoundDE: broad_type='T_cell' | 2 subtypes: TypeA, TypeB
#> Computing subtype proportions...
#> Computing mean subtype expression...
#> Running per-subtype differential expression...
#> Subtype: TypeA
#> Subtype: TypeB
#> Valid subtypes for decomposition: TypeA, TypeB
#> Running broad pseudo-bulk DE...
#> Broad DE: 150 genes tested.
#> Decomposing DE signal into T and C components...
#> Decomposition complete: 46 transcriptional, 15 compositional.
result2
#> CDEResult
#> Broad type : T_cell
#> Subtypes : TypeA, TypeB
#> Genes tested : 150
#> Significant : 1 (adj.P.Val < 0.05)
#> -- Decomposition ------------------
#> Transcriptional: 46 genes (TC_ratio >= 0.8 )
#> Compositional : 15 genes (TC_ratio <= 0.2 )
#> Mixed : 89 genes
#> Condition : conditiondt2 <- as.data.frame(deTable(result2))
gene1 <- dt2[dt2$gene == "Gene1", c("logFC","adj.P.Val","TC_ratio","source")]
cat("Gene1 (compositional marker):\n")
#> Gene1 (compositional marker):
print(gene1)
#> logFC adj.P.Val TC_ratio source
#> Gene1 1.025473 1.854896e-17 0.02242488 compositionalGene1 will have a significant p-value (broad DE is strong) but a low
TC_ratio, correctly identifying it as a compositional artefact. This is
the case where scCompoundDE is most useful: the broad
pseudo-bulk test detects a real difference between conditions, but the
decomposition shows that the difference is caused by the changing
mixture of subtypes rather than a cell-intrinsic expression shift.
With compositional signal, Gene1 sits at high C_score / low T_score.
Here, Gene1 should move toward the compositional side of the plot. The interpretation is different from an ordinary DE table: a small adjusted p-value alone is not enough to call the gene transcriptionally altered.
Clear proportion shift: TypeB increases dramatically in treatment.
The subtype proportion plot provides the biological reason for the low TC_ratio. TypeB cells are more common in treatment, and Gene1 is a TypeB marker, so the broad pseudo-bulk signal is explained by composition.
After running compoundDE, use
filterGenesBySource to extract gene lists by
classification:
# Genes with transcriptionally-driven DE (real biology)
transcriptional_genes <- filterGenesBySource(result,
source = "transcriptional",
fdr_thresh = 0.05)
cat("Transcriptional genes (FDR < 0.05):", nrow(transcriptional_genes), "\n")
#> Transcriptional genes (FDR < 0.05): 110
head(transcriptional_genes[, c("gene","logFC","adj.P.Val","TC_ratio")])
#> gene logFC adj.P.Val TC_ratio
#> Gene3 Gene3 -1.463512 2.034657e-16 1
#> Gene10 Gene10 -1.412112 2.034657e-16 1
#> Gene2 Gene2 -1.462167 4.616920e-16 1
#> Gene9 Gene9 -1.445513 4.616920e-16 1
#> Gene7 Gene7 -1.302525 5.831894e-16 1
#> Gene1 Gene1 -1.346222 6.369800e-16 1
# Genes that are artefactual (compositional)
compositional_genes <- filterGenesBySource(result,
source = "compositional",
fdr_thresh = 1) # include all, regardless of significance
cat("\nCompositional genes (any FDR):", nrow(compositional_genes), "\n")
#>
#> Compositional genes (any FDR): 0The same pattern applies to real Bioconductor workflows.
scBatchQC can be used first to add batch-aware QC calls to
colData, scFastDE can then perform fast
donor-weighted pseudo-bulk DE on the cleaned
SingleCellExperiment, and scCompoundDE can
decompose the resulting broad signal for a selected cell population.
Users may also substitute other Bioconductor QC, annotation, or
pseudo-bulk tools, provided the input object contains counts and the
required sample annotations.
# Step 1: QC with scBatchQC
library(scBatchQC)
sce <- batchAwareQCMetrics(sce, batch = "batch")
sce_clean <- sce[, !sce$scBatchQC_outlier]
# Step 2: Fast DE with scFastDE (or any pseudo-bulk tool)
library(scFastDE)
de_result <- fastDE(sce_clean,
donor = "donor",
cell_type = "cell_type",
condition = "condition",
target_type = "T_cell")
# Step 3: Decompose with scCompoundDE
cde_result <- compoundDE(
sce_clean,
broad_type = "T_cell",
subtype_col = "cell_subtype",
broad_col = "cell_type",
donor = "donor",
condition = "condition"
)
# Step 4: Retain only transcriptionally-driven genes
real_biology <- filterGenesBySource(cde_result,
source = "transcriptional", fdr_thresh = 0.05)sessionInfo()
#> R version 4.6.1 (2026-06-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 26.04 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.32.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] SingleCellExperiment_1.35.1 SummarizedExperiment_1.43.0
#> [3] Biobase_2.73.1 GenomicRanges_1.65.0
#> [5] Seqinfo_1.3.0 IRanges_2.47.2
#> [7] S4Vectors_0.51.5 BiocGenerics_0.59.8
#> [9] generics_0.1.4 MatrixGenerics_1.25.0
#> [11] matrixStats_1.5.0 scCompoundDE_0.99.1
#> [13] BiocStyle_2.41.0
#>
#> loaded via a namespace (and not attached):
#> [1] sass_0.4.10 SparseArray_1.13.2 lattice_0.22-9
#> [4] digest_0.6.39 evaluate_1.0.5 grid_4.6.1
#> [7] RColorBrewer_1.1-3 fastmap_1.2.0 jsonlite_2.0.0
#> [10] Matrix_1.7-5 limma_3.69.2 BiocManager_1.30.27
#> [13] scales_1.4.0 codetools_0.2-20 jquerylib_0.1.4
#> [16] abind_1.4-8 cli_3.6.6 rlang_1.2.0
#> [19] XVector_0.53.0 withr_3.0.3 cachem_1.1.0
#> [22] DelayedArray_0.39.3 yaml_2.3.12 otel_0.2.0
#> [25] S4Arrays_1.13.0 tools_4.6.1 parallel_4.6.1
#> [28] BiocParallel_1.47.0 ggplot2_4.0.3 vctrs_0.7.3
#> [31] buildtools_1.0.0 R6_2.6.1 lifecycle_1.0.5
#> [34] bslib_0.11.0 gtable_0.3.6 glue_1.8.1
#> [37] statmod_1.5.2 xfun_0.59 sys_3.4.3
#> [40] knitr_1.51 farver_2.1.2 htmltools_0.5.9
#> [43] labeling_0.4.3 rmarkdown_2.31 maketools_1.3.2
#> [46] compiler_4.6.1 S7_0.2.2