tTEscanR tRNA-Specific Preprocessing Module


1. Overview

tTEscanR includes a dedicated preprocessing module that provides several functions for generating ready-to-use count matrices. The primary goal of this module is to facilitate the transformation of (sc)ATAC-seq count matrices into tRNA abundance count matrices, in which the tRNA genes are the rows and the conditions are the columns. Additionally, it helps annotate the tRNA genes.

# remotes::install_github("avarassanchez/tTEscanR")
library(tTEscanR)

To illustrate the functionality of the tRNA-specific preprocessing module, we generated a toy example dataset representing a peak count matrix, in which the fragment regions are placed as rows and the samples as columns. We have also defined the corresponding metadata.

library(Matrix)

# Dimensions of the toy dataset. The number of cells is kept above the
# `min.cells = 10` threshold used when the chromatin assay is created.
num_peaks <- 100
num_cells <- 20

# Fix the RNG state so the simulated data are reproducible.
set.seed(42)

# Peak identifiers use the "chr-start-end" layout: a randomly drawn
# chromosome and 300 bp windows whose starts are spaced 5 kb apart.
chroms <- sample(paste0("chr", 1:3), size = num_peaks, replace = TRUE)
starts <- seq(from = 1000, by = 5000, length.out = num_peaks)
ends <- starts + 300
peak_names <- paste0(chroms, "-", starts, "-", ends)

# Simulate Poisson counts and store them as a sparse peaks x cells matrix.
counts <- Matrix(
    data = rpois(num_peaks * num_cells, lambda = 5),
    nrow = num_peaks,
    ncol = num_cells,
    sparse = TRUE
)
dimnames(counts) <- list(peak_names, paste0("Cell_", seq_len(num_cells)))

# Persist the matrix to disk so it can be read back in later.
saveRDS(counts, file = "foo.RDS")
library(dplyr)
library(Rsamtools)
#> Loading required package: Seqinfo
#> Loading required package: BiocGenerics
#> Loading required package: generics
#> 
#> Attaching package: 'generics'
#> The following object is masked from 'package:dplyr':
#> 
#>     explain
#> The following objects are masked from 'package:base':
#> 
#>     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
#>     setequal, union
#> 
#> Attaching package: 'BiocGenerics'
#> The following object is masked from 'package:dplyr':
#> 
#>     combine
#> The following object is masked from 'package:tTEscanR':
#> 
#>     updateObject
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following object is masked from 'package:utils':
#> 
#>     data
#> The following objects are masked from 'package:base':
#> 
#>     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
#>     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
#>     get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
#>     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#>     rbind, Reduce, rownames, sapply, saveRDS, scale, sequence, table,
#>     tapply, transform, unique, unsplit, which.max, which.min
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following objects are masked from 'package:Matrix':
#> 
#>     expand, unname
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, rename
#> The following object is masked from 'package:utils':
#> 
#>     findMatches
#> The following objects are masked from 'package:base':
#> 
#>     expand.grid, I, unname
#> Loading required package: IRanges
#> 
#> Attaching package: 'IRanges'
#> The following objects are masked from 'package:dplyr':
#> 
#>     collapse, desc, slice
#> Loading required package: Biostrings
#> Loading required package: XVector
#> 
#> Attaching package: 'Biostrings'
#> The following object is masked from 'package:base':
#> 
#>     strsplit

# Generate fragments based on our counts matrix.
#
# Every (peak, cell) entry with count n > 0 is expanded into n fragment
# records. The k-th record (k = 1..n) is the peak interval shrunk by k bases
# on each side, so the fragments of one cell within one peak are distinct.
# The expansion is vectorised instead of appending one data.frame per
# fragment inside nested loops.

# Split each "chr-start-end" peak name once.
peak_parts <- strsplit(rownames(counts), "-")
peak_chr <- vapply(peak_parts, `[`, character(1), 1L)
# Coordinates are kept as integers: write.table() prints integers in full,
# whereas doubles can be written in scientific notation (e.g. 1e+05), which
# is not a valid genomic position.
peak_start <- as.integer(vapply(peak_parts, `[`, character(1), 2L))
peak_end <- as.integer(vapply(peak_parts, `[`, character(1), 3L))

# Flatten the matrix column by column: cells vary slowest, peaks fastest.
count_vec <- as.integer(as.matrix(counts))
peak_idx <- rep(seq_len(nrow(counts)), times = ncol(counts))
cell_idx <- rep(seq_len(ncol(counts)), each = nrow(counts))

# Repeat each entry `count` times (zero counts drop out) and number the
# repeats 1..count within each entry.
entry <- rep(seq_along(count_vec), times = count_vec)
shift <- sequence(count_vec)
frag_peak <- peak_idx[entry]

fragments_df <- data.frame(
    chr = peak_chr[frag_peak],
    start = peak_start[frag_peak] + shift,
    end = peak_end[frag_peak] - shift,
    cell = colnames(counts)[cell_idx[entry]],
    count = rep(1, length(entry))
) %>%
    arrange(chr, start) # Sorting is mandatory for indexing

# Dump the fragments as a headerless, unquoted, tab-separated text file.
temp_txt <- "foo.fragments.txt"
write.table(
    x = fragments_df,
    file = temp_txt,
    quote = FALSE,
    sep = "\t",
    row.names = FALSE,
    col.names = FALSE
)

# Compress with bgzip and build the tabix index (both need Rsamtools).
# The results are foo.fragments.txt.gz and foo.fragments.txt.gz.tbi.
bgzip(file = temp_txt, dest = "foo.fragments.txt.gz", overwrite = TRUE)
#> [1] "foo.fragments.txt.gz"
indexTabix(file = "foo.fragments.txt.gz", format = "bed")
#> [1] "foo.fragments.txt.gz.tbi"

# The uncompressed intermediate file is no longer needed.
file.remove(temp_txt)
#> [1] TRUE
library(Signac)
library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 
#> Attaching package: 'sp'
#> The following object is masked from 'package:IRanges':
#> 
#>     %over%
#> 'SeuratObject' was built under R 4.6.0 but the current version is
#> 4.6.1; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#> 
#> Attaching package: 'SeuratObject'
#> The following object is masked from 'package:Biostrings':
#> 
#>     intersect
#> The following object is masked from 'package:GenomicRanges':
#> 
#>     intersect
#> The following object is masked from 'package:IRanges':
#> 
#>     intersect
#> The following object is masked from 'package:S4Vectors':
#> 
#>     intersect
#> The following object is masked from 'package:Seqinfo':
#> 
#>     intersect
#> The following object is masked from 'package:BiocGenerics':
#> 
#>     intersect
#> The following objects are masked from 'package:base':
#> 
#>     intersect, t

# Read the saved peak matrix back in and point to the indexed fragment file.
counts_loaded <- readRDS("foo.RDS")
fragment_file <- "foo.fragments.txt.gz"

# Build the chromatin assay; `sep` lists the separators used in the peak
# names (between chromosome and start, and between start and end).
chrom_assay <- CreateChromatinAssay(
    counts = counts_loaded,
    fragments = fragment_file,
    sep = c("-", "-"),
    min.cells = 10,
    min.features = 0
)
#> Computing hash

# Wrap the assay in a Seurat object under the assay name "peaks".
chrom_obj <- CreateSeuratObject(counts = chrom_assay, assay = "peaks")

print(chrom_obj)
#> An object of class Seurat 
#> 100 features across 20 samples within 1 assay 
#> Active assay: peaks (100 features, 0 variable features)
#>  2 layers present: counts, data

2. Obtaining the tRNA matrix

The first step is to generate the tRNA matrix and translate the peak fragments into tRNA gene names.

# Build the tRNA matrix from the "peaks" assay of the Seurat object.
# `confidence_set = NULL` selects the default hg38 confidence set.
tRNA_matrix <- tRNAGetMatrix(
    data = chrom_obj,
    assay = "peaks",
    confidence_set = NULL,
    species = "hg38"
)
#> 1 . Importing the high-confidence tRNA annotations.
#> The default hg38 'confidence_set' will be used.
#> 1 . COMPLETED
#> 2 . Filtering tRNA genes with unknown annotations.
#> Loading required namespace: Structstrings
#> The default hg38 'tRNA_name_map' will be used.
#> 2 . COMPLETED
#> 3 . Finding overlaps and aggregating tRNA counts.
#> Extracting reads overlapping genomic regions
#> 3 . COMPLETED
#> 4 . Exporting tRNA expression matrix.
#> - Parameter 'out_directory' has not been specified.
#> The file will be stored in the current working directory.
#> - Parameter 'out_name' has not been specified.
#> A standard name will be used.
#> - The generated file will be save in: /tmp/RtmpTo8Kzi/Rbuild2b423d366a71/tTEscanR/vignettes/tRNA_expression_matrix.rds
#> 4 . COMPLETED

3. Identifying the optimal tRNA cutoff

#> R version 4.6.1 (2026-06-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 26.04 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.32.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] Seurat_5.5.1         SeuratObject_5.4.0   sp_2.2-1            
#>  [4] Signac_1.17.1        Rsamtools_2.29.0     Biostrings_2.81.3   
#>  [7] XVector_0.53.0       GenomicRanges_1.65.0 IRanges_2.47.2      
#> [10] S4Vectors_0.51.5     Seqinfo_1.3.0        BiocGenerics_0.59.8 
#> [13] generics_0.1.4       Matrix_1.7-5         dplyr_1.2.1         
#> [16] biomaRt_2.69.0       tTEscanR_0.99.0      BiocStyle_2.41.0    
#> 
#> loaded via a namespace (and not attached):
#>   [1] RcppAnnoy_0.0.23            splines_4.6.1              
#>   [3] later_1.4.8                 bitops_1.0-9               
#>   [5] filelock_1.0.3              tibble_3.3.1               
#>   [7] polyclip_1.10-7             fastDummies_1.7.6          
#>   [9] lifecycle_1.0.5             httr2_1.2.3                
#>  [11] rstatix_0.7.3               globals_0.19.1             
#>  [13] lattice_0.22-9              MASS_7.3-65                
#>  [15] backports_1.5.1             magrittr_2.0.5             
#>  [17] plotly_4.12.0               sass_0.4.10                
#>  [19] rmarkdown_2.31              jquerylib_0.1.4            
#>  [21] yaml_2.3.12                 httpuv_1.6.17              
#>  [23] otel_0.2.0                  sctransform_0.4.3          
#>  [25] spam_2.11-4                 spatstat.sparse_3.2-0      
#>  [27] reticulate_1.46.0           cowplot_1.2.0              
#>  [29] pbapply_1.7-4               DBI_1.3.0                  
#>  [31] buildtools_1.0.0            RColorBrewer_1.1-3         
#>  [33] abind_1.4-8                 Rtsne_0.17                 
#>  [35] purrr_1.2.2                 rappdirs_0.3.4             
#>  [37] ggrepel_0.9.8               irlba_2.3.7                
#>  [39] spatstat.utils_3.2-3        listenv_1.0.0              
#>  [41] maketools_1.3.2             goftest_1.2-3              
#>  [43] RSpectra_0.16-2             spatstat.random_3.5-0      
#>  [45] fitdistrplus_1.2-6          parallelly_1.47.0          
#>  [47] codetools_0.2-20            DelayedArray_0.39.3        
#>  [49] RcppRoll_0.3.2              tidyselect_1.2.1           
#>  [51] UCSC.utils_1.9.0            farver_2.1.2               
#>  [53] spatstat.explore_3.8-1      matrixStats_1.5.0          
#>  [55] BiocFileCache_3.3.0         jsonlite_2.0.0             
#>  [57] progressr_0.19.0            Formula_1.2-5              
#>  [59] ggridges_0.5.7              survival_3.8-6             
#>  [61] tools_4.6.1                 progress_1.2.3             
#>  [63] ica_1.0-3                   Rcpp_1.1.1-1.1             
#>  [65] glue_1.8.1                  gridExtra_2.3.1            
#>  [67] SparseArray_1.13.2          xfun_0.59                  
#>  [69] DESeq2_1.53.0               MatrixGenerics_1.25.0      
#>  [71] GenomeInfoDb_1.49.1         withr_3.0.3                
#>  [73] BiocManager_1.30.27         fastmap_1.2.0              
#>  [75] digest_0.6.39               R6_2.6.1                   
#>  [77] mime_0.13                   scattermore_1.2            
#>  [79] tensor_1.5.1                spatstat.data_3.1-9        
#>  [81] RSQLite_3.53.2              tidyr_1.3.2                
#>  [83] data.table_1.18.4           Structstrings_1.29.0       
#>  [85] prettyunits_1.2.0           httr_1.4.8                 
#>  [87] htmlwidgets_1.6.4           S4Arrays_1.13.0            
#>  [89] uwot_0.2.4                  pkgconfig_2.0.3            
#>  [91] gtable_0.3.6                blob_1.3.0                 
#>  [93] lmtest_0.9-40               S7_0.2.2                   
#>  [95] sys_3.4.3                   htmltools_0.5.9            
#>  [97] carData_3.0-6               dotCall64_1.2              
#>  [99] scales_1.4.0                Biobase_2.73.1             
#> [101] png_0.1-9                   spatstat.univar_3.2-0      
#> [103] knitr_1.51                  reshape2_1.4.5             
#> [105] nlme_3.1-169                curl_7.1.0                 
#> [107] cachem_1.1.0                zoo_1.8-15                 
#> [109] stringr_1.6.0               KernSmooth_2.23-26         
#> [111] parallel_4.6.1              miniUI_0.1.2               
#> [113] AnnotationDbi_1.75.0        pillar_1.11.1              
#> [115] grid_4.6.1                  vctrs_0.7.3                
#> [117] RANN_2.6.2                  promises_1.5.0             
#> [119] ggpubr_0.6.3                car_3.1-5                  
#> [121] dbplyr_2.6.0                xtable_1.8-8               
#> [123] cluster_2.1.8.2             evaluate_1.0.5             
#> [125] cli_3.6.6                   locfit_1.5-9.12            
#> [127] compiler_4.6.1              rlang_1.2.0                
#> [129] crayon_1.5.3                future.apply_1.20.2        
#> [131] ggsignif_0.6.4              labeling_0.4.3             
#> [133] plyr_1.8.9                  stringi_1.8.7              
#> [135] deldir_2.0-4                viridisLite_0.4.3          
#> [137] BiocParallel_1.47.0         lazyeval_0.2.3             
#> [139] spatstat.geom_3.8-1         RcppHNSW_0.7.0             
#> [141] hms_1.1.4                   patchwork_1.3.2            
#> [143] sparseMatrixStats_1.25.0    bit64_4.8.2                
#> [145] future_1.70.0               ggplot2_4.0.3              
#> [147] KEGGREST_1.53.4             shiny_1.14.0               
#> [149] SummarizedExperiment_1.43.0 ROCR_1.0-12                
#> [151] igraph_2.3.3                broom_1.0.13               
#> [153] memoise_2.0.1               bslib_0.11.0               
#> [155] fastmatch_1.1-8             bit_4.6.0