For each gene, compoundDE returns a
TC_ratio in \[0, 1\]:
| TC_ratio | Interpretation |
|---|---|
| ≈ 1.0 | Transcriptional — cells intrinsically changed expression |
| ≈ 0.0 | Compositional — proportion of subtypes shifted |
| ≈ 0.5 | Mixed — both mechanisms contribute |
Genes with low TC_ratio should be treated with caution: they may be DE in a standard analysis but reflect composition rather than biology.
We first demonstrate the tool on a dataset where the DE signal is deliberately transcriptional: the same proportions of TypeA and TypeB cells appear in both conditions, but TypeA cells intrinsically upregulate genes 1–10 in treatment.
library(scCompoundDE)
library(SingleCellExperiment)
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#>
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#>
#> colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#> colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#> colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#> colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#> colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#> colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#> colWeightedMeans, colWeightedMedians, colWeightedSds,
#> colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#> rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#> rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#> rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#> rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#> rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#> rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#> rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: generics
#>
#> Attaching package: 'generics'
#> The following objects are masked from 'package:base':
#>
#> as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
#> setequal, union
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following object is masked from 'package:utils':
#>
#> data
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, aperm, append, as.data.frame, basename, cbind,
#> colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
#> get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, saveRDS, scale, sequence, table,
#> tapply, transform, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:utils':
#>
#> findMatches
#> The following objects are masked from 'package:base':
#>
#> expand.grid, I, unname
#> Loading required package: IRanges
#> Loading required package: Seqinfo
#> Loading required package: Biobase
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.
#>
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#>
#> rowMedians
#> The following objects are masked from 'package:matrixStats':
#>
#> anyMissing, rowMedians
set.seed(42)
n_genes <- 150
n_donors <- 6
counts <- matrix(rpois(n_genes * 240L, 8L),
nrow = n_genes, ncol = 240L)
rownames(counts) <- paste0("Gene", seq_len(n_genes))
colnames(counts) <- paste0("Cell", seq_len(240L))
# First 10 genes × 4 in TypeA treatment cells only
# TypeA cells: cols 61–120 in treatment
counts[seq_len(10), 61:120] <- counts[seq_len(10), 61:120] * 5L
sce <- SingleCellExperiment(assays = list(counts = counts))
sce$donor <- rep(rep(paste0("D", seq_len(n_donors)), each = 10L), 4L)
sce$cell_type <- "T_cell"
sce$subtype <- rep(rep(c("TypeA","TypeB"),
each = n_donors * 10L), 2L)
sce$condition <- rep(c("ctrl","treat"), each = 120L)
# Check the design
table(sce$subtype, sce$condition)
#>
#> ctrl treat
#> TypeA 60 60
#> TypeB 60 60result <- compoundDE(
sce,
broad_type = "T_cell",
subtype_col = "subtype",
broad_col = "cell_type",
donor = "donor",
condition = "condition",
min_cells = 3L,
min_subtypes = 2L,
min_donors = 2L
)
#> compoundDE: broad_type='T_cell' | 2 subtypes: TypeA, TypeB
#> Computing subtype proportions...
#> Computing mean subtype expression...
#> Running per-subtype differential expression...
#> Subtype: TypeA
#> Subtype: TypeB
#> Valid subtypes for decomposition: TypeA, TypeB
#> Running broad pseudo-bulk DE...
#> Broad DE: 150 genes tested.
#> Decomposing DE signal into T and C components...
#> Decomposition complete: 150 transcriptional, 0 compositional.
result
#> CDEResult
#> Broad type : T_cell
#> Subtypes : TypeA, TypeB
#> Genes tested : 150
#> Significant : 110 (adj.P.Val < 0.05)
#> -- Decomposition ------------------
#> Transcriptional: 150 genes (TC_ratio >= 0.8 )
#> Compositional : 0 genes (TC_ratio <= 0.2 )
#> Mixed : 0 genes
#> Condition : conditiondt <- as.data.frame(deTable(result))
dt_sorted <- dt[order(dt$adj.P.Val), ]
head(dt_sorted[, c("gene","logFC","adj.P.Val","TC_ratio","source")], 15)
#> gene logFC adj.P.Val TC_ratio source
#> Gene3 Gene3 -1.4635124 2.034657e-16 1 transcriptional
#> Gene10 Gene10 -1.4121116 2.034657e-16 1 transcriptional
#> Gene2 Gene2 -1.4621673 4.616920e-16 1 transcriptional
#> Gene9 Gene9 -1.4455129 4.616920e-16 1 transcriptional
#> Gene7 Gene7 -1.3025249 5.831894e-16 1 transcriptional
#> Gene1 Gene1 -1.3462222 6.369800e-16 1 transcriptional
#> Gene4 Gene4 -1.4263297 6.837822e-16 1 transcriptional
#> Gene6 Gene6 -1.4012873 2.071942e-15 1 transcriptional
#> Gene5 Gene5 -1.3359805 2.446513e-14 1 transcriptional
#> Gene8 Gene8 -1.3090724 5.022715e-14 1 transcriptional
#> Gene93 Gene93 0.3136655 2.108143e-04 1 transcriptional
#> Gene117 Gene117 0.3119877 7.416939e-04 1 transcriptional
#> Gene96 Gene96 0.2940292 7.453923e-04 1 transcriptional
#> Gene126 Gene126 0.2930015 7.453923e-04 1 transcriptional
#> Gene65 Gene65 0.2814824 7.813075e-04 1 transcriptionalThe injected genes (Gene1–Gene10) should have high TC_ratio values, confirming they were detected as transcriptionally driven.
injected <- dt[dt$gene %in% paste0("Gene", seq_len(10)), ]
cat("Median TC_ratio of injected genes:",
round(median(injected$TC_ratio, na.rm = TRUE), 3), "\n")
#> Median TC_ratio of injected genes: 1
cat("All injected genes classified as transcriptional:",
all(injected$source == "transcriptional", na.rm = TRUE), "\n")
#> All injected genes classified as transcriptional: TRUEScatter of transcriptional vs compositional z-scores. Blue = transcriptional, red = compositional, orange = mixed. Injected genes appear in the top-right (high T, low C).
Distribution of TC_ratio scores. Vertical lines show classification thresholds at 0.2 and 0.8.
Subtype proportions per condition. Balanced design — equal TypeA and TypeB in both conditions.
Now we demonstrate the tool on a dataset where the apparent DE is caused entirely by a subtype proportion shift. Gene1 is a marker of TypeB cells. In control, most cells are TypeA (low Gene1). In treatment, most cells become TypeB (high Gene1). A standard DE tool would call Gene1 as significantly upregulated — but it is a compositional artefact.
set.seed(99)
n_genes <- 150
# Unequal proportions: ctrl = 75% TypeA, treat = 75% TypeB
n_a_ctrl <- 72L; n_b_ctrl <- 24L
n_a_treat <- 24L; n_b_treat <- 72L
n_cells <- n_a_ctrl + n_b_ctrl + n_a_treat + n_b_treat
counts2 <- matrix(rpois(n_genes * n_cells, 5L),
nrow = n_genes, ncol = n_cells)
rownames(counts2) <- paste0("Gene", seq_len(n_genes))
colnames(counts2) <- paste0("Cell", seq_len(n_cells))
# Gene1 is a marker of TypeB only — no transcriptional change
typeB_idx <- c(
seq(n_a_ctrl + 1L, n_a_ctrl + n_b_ctrl),
seq(n_a_ctrl + n_b_ctrl + n_a_treat + 1L, n_cells)
)
counts2[1L, typeB_idx] <- counts2[1L, typeB_idx] * 8L
sce2 <- SingleCellExperiment(assays = list(counts = counts2))
sce2$donor <- c(
rep(paste0("D", 1:6), each = 12L),
rep(paste0("D", 1:6), each = 4L),
rep(paste0("D", 1:6), each = 4L),
rep(paste0("D", 1:6), each = 12L)
)
sce2$cell_type <- "T_cell"
sce2$subtype <- c(rep("TypeA", n_a_ctrl), rep("TypeB", n_b_ctrl),
rep("TypeA", n_a_treat), rep("TypeB", n_b_treat))
sce2$condition <- c(rep("ctrl", n_a_ctrl + n_b_ctrl),
rep("treat", n_a_treat + n_b_treat))
table(sce2$subtype, sce2$condition)
#>
#> ctrl treat
#> TypeA 72 24
#> TypeB 24 72result2 <- compoundDE(
sce2,
broad_type = "T_cell",
subtype_col = "subtype",
broad_col = "cell_type",
donor = "donor",
condition = "condition",
min_cells = 3L,
min_subtypes = 2L,
min_donors = 2L
)
#> compoundDE: broad_type='T_cell' | 2 subtypes: TypeA, TypeB
#> Computing subtype proportions...
#> Computing mean subtype expression...
#> Running per-subtype differential expression...
#> Subtype: TypeA
#> Subtype: TypeB
#> Valid subtypes for decomposition: TypeA, TypeB
#> Running broad pseudo-bulk DE...
#> Broad DE: 150 genes tested.
#> Decomposing DE signal into T and C components...
#> Decomposition complete: 46 transcriptional, 15 compositional.
result2
#> CDEResult
#> Broad type : T_cell
#> Subtypes : TypeA, TypeB
#> Genes tested : 150
#> Significant : 1 (adj.P.Val < 0.05)
#> -- Decomposition ------------------
#> Transcriptional: 46 genes (TC_ratio >= 0.8 )
#> Compositional : 15 genes (TC_ratio <= 0.2 )
#> Mixed : 89 genes
#> Condition : conditiondt2 <- as.data.frame(deTable(result2))
gene1 <- dt2[dt2$gene == "Gene1", c("logFC","adj.P.Val","TC_ratio","source")]
cat("Gene1 (compositional marker):\n")
#> Gene1 (compositional marker):
print(gene1)
#> logFC adj.P.Val TC_ratio source
#> Gene1 1.025473 1.854896e-17 0.02242488 compositionalGene1 will have a significant p-value (broad DE is strong) but a low TC_ratio, correctly identifying it as a compositional artefact.
With compositional signal, Gene1 sits at high C_score / low T_score.
Clear proportion shift: TypeB increases dramatically in treatment.
After running compoundDE, use
filterGenesBySource to extract gene lists by
classification:
# Genes with transcriptionally-driven DE (real biology)
transcriptional_genes <- filterGenesBySource(result,
source = "transcriptional",
fdr_thresh = 0.05)
cat("Transcriptional genes (FDR < 0.05):", nrow(transcriptional_genes), "\n")
#> Transcriptional genes (FDR < 0.05): 110
head(transcriptional_genes[, c("gene","logFC","adj.P.Val","TC_ratio")])
#> gene logFC adj.P.Val TC_ratio
#> Gene3 Gene3 -1.463512 2.034657e-16 1
#> Gene10 Gene10 -1.412112 2.034657e-16 1
#> Gene2 Gene2 -1.462167 4.616920e-16 1
#> Gene9 Gene9 -1.445513 4.616920e-16 1
#> Gene7 Gene7 -1.302525 5.831894e-16 1
#> Gene1 Gene1 -1.346222 6.369800e-16 1
# Genes that are artefactual (compositional)
compositional_genes <- filterGenesBySource(result,
source = "compositional",
fdr_thresh = 1) # include all, regardless of significance
cat("\nCompositional genes (any FDR):", nrow(compositional_genes), "\n")
#>
#> Compositional genes (any FDR): 0# Step 1: QC with scBatchQC
library(scBatchQC)
sce <- batchAwareQCMetrics(sce, batch = "batch")
sce_clean <- sce[, !sce$scBatchQC_outlier]
# Step 2: Fast DE with scFastDE (or any pseudo-bulk tool)
library(scFastDE)
de_result <- fastDE(sce_clean,
donor = "donor",
cell_type = "cell_type",
condition = "condition",
target_type = "T_cell")
# Step 3: Decompose with scCompoundDE
cde_result <- compoundDE(
sce_clean,
broad_type = "T_cell",
subtype_col = "cell_subtype",
broad_col = "cell_type",
donor = "donor",
condition = "condition"
)
# Step 4: Retain only transcriptionally-driven genes
real_biology <- filterGenesBySource(cde_result,
source = "transcriptional", fdr_thresh = 0.05)sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] SingleCellExperiment_1.35.1 SummarizedExperiment_1.43.0
#> [3] Biobase_2.73.1 GenomicRanges_1.65.0
#> [5] Seqinfo_1.3.0 IRanges_2.47.2
#> [7] S4Vectors_0.51.3 BiocGenerics_0.59.7
#> [9] generics_0.1.4 MatrixGenerics_1.25.0
#> [11] matrixStats_1.5.0 scCompoundDE_0.99.0
#> [13] BiocStyle_2.41.0
#>
#> loaded via a namespace (and not attached):
#> [1] sass_0.4.10 SparseArray_1.13.2 lattice_0.22-9
#> [4] digest_0.6.39 evaluate_1.0.5 grid_4.6.0
#> [7] RColorBrewer_1.1-3 fastmap_1.2.0 jsonlite_2.0.0
#> [10] Matrix_1.7-5 limma_3.69.2 BiocManager_1.30.27
#> [13] scales_1.4.0 codetools_0.2-20 jquerylib_0.1.4
#> [16] abind_1.4-8 cli_3.6.6 rlang_1.2.0
#> [19] XVector_0.53.0 withr_3.0.2 cachem_1.1.0
#> [22] DelayedArray_0.39.3 yaml_2.3.12 otel_0.2.0
#> [25] S4Arrays_1.13.0 tools_4.6.0 parallel_4.6.0
#> [28] BiocParallel_1.47.0 ggplot2_4.0.3 vctrs_0.7.3
#> [31] buildtools_1.0.0 R6_2.6.1 lifecycle_1.0.5
#> [34] bslib_0.11.0 gtable_0.3.6 glue_1.8.1
#> [37] statmod_1.5.2 xfun_0.58 sys_3.4.3
#> [40] knitr_1.51 farver_2.1.2 htmltools_0.5.9
#> [43] labeling_0.4.3 rmarkdown_2.31 maketools_1.3.2
#> [46] compiler_4.6.0 S7_0.2.2