plyranges API

Overview

dbSequence provides plyranges-compatible functions that translate to SQL queries. Operations execute in DuckDB, not R, enabling analysis of datasets larger than memory.

Core Functions

| Function | Description | Returns |
|---|---|---|
| filter_by_overlaps(x, y) | Ranges in x overlapping y | dbSequence (lazy) |
| compute_coverage(x, region, window) | Binned coverage counts | data.frame |
| as_granges(x) | Convert to GRanges | GRanges (collected) |

filter_by_overlaps

Find ranges overlapping a region of interest:

# Load example data
bed_file <- system.file("extdata", "example.bed", package = "dbSequence")
fragments <- read_bed(bed_file)

# Define region of interest (example.bed has features at chr1:100-200 and
# chr1:300-400 in BED coordinates; they print as 101-200 and 301-400 below)
region <- GRanges("chr1:100-500")

# Filter to overlapping fragments - stays lazy
overlapping <- filter_by_overlaps(fragments, region)

overlapping
#> # Class:    dbSequence
#> # Source:   SQL [?? x 6]
#> # Database: DuckDB 1.5.2 [unknown@Linux 6.17.0-1018-azure:R 4.6.0/:memory:]
#>   seqnames start   end name     score strand
#>   <chr>    <int> <int> <chr>    <dbl> <chr> 
#> 1 chr1       301   400 feature2   750 -     
#> 2 chr1       101   200 feature1   500 +

Multiple Regions

# Filter against multiple peaks
peaks <- GRanges(c("chr1:100-150", "chr1:300-350"))
overlapping <- filter_by_overlaps(fragments, peaks)

dbSequence to dbSequence

Both arguments can be lazy. To join them, they must reside in the same database and share the same connection object.

# Use a file-backed (on-disk) database stored in a temporary location
temp_db <- tempfile(fileext = ".duckdb")
dest <- DuckDBFile(temp_db)

# Import both tables to the same database file
peaks_orig <- read_bed(bed_file, dest = dest, table_name = "peaks", lazy = FALSE)
frags_orig <- read_bed(bed_file, dest = dest, table_name = "fragments", lazy = FALSE)

# Create a shared connection (reuse the one from peaks_orig)
# This is required because dplyr queries cannot span multiple connections
con <- dbProject::conn(peaks_orig)
frags_shared <- dbSequence("fragments", file_source = dest, .conn = con)

# Now both objects share 'con', so we can join them lazily
overlapping <- filter_by_overlaps(frags_shared, peaks_orig)

compute_coverage

Calculate binned coverage for a region:

# 50bp bins across region
coverage <- compute_coverage(
    fragments,
    region = "chr1:0-500",
    window = 50
)

# Returns data.frame with bin_start, bin_end, count
head(coverage)
#>   bin_start bin_end count
#> 1         1      50     0
#> 2        51     100     0
#> 3       101     150     1
#> 4       151     200     1
#> 5       201     250     0
#> 6       251     300     0

as_granges

Convert to GRanges when you need Bioconductor operations:

# Collect subset to GRanges
gr <- fragments |>
    filter_by_overlaps(region) |>
    as_granges()

# Now use standard GenomicRanges operations
GenomicRanges::reduce(gr)
#> GRanges object with 2 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1   101-200      +
#>   [2]     chr1   301-400      -
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths

Note: as_granges() collects the data into R memory, so filter first to ensure that only the subset you need is materialized.

Workflow Example

Typical analysis pattern:

# 1. Load data (lazy) - using example.bed
fragments <- read_bed(bed_file)

# 2. Define analysis region
promoter <- GRanges("chr1:50-250")

# 3. Filter to region (still lazy)
promoter_fragments <- filter_by_overlaps(fragments, promoter)

# 4. Compute coverage
coverage <- compute_coverage(promoter_fragments, promoter, window = 10)

# 5. Collect if needed for custom analysis
gr <- as_granges(promoter_fragments)

File Type Restrictions

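BAM/CRAM files are blocked from filter_by_overlaps due to their indexed nature—use Rsamtools instead. Note that in the output below, read_bam() itself fails because the exonr package is not installed, so bam_data is never created and the subsequent filter_by_overlaps() call reports a missing object rather than the file-type restriction: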

bam_file <- system.file("extdata", "example.bam", package = "dbSequence")
bam_data <- read_bam(bam_file)
#> Error in `.import_with_duckdb()`:
#> ! File type 'bam' requires exonr package for import. Please install exonr for comprehensive genomic file support.

# This should error
filter_by_overlaps(bam_data, region)
#> Error:
#> ! object 'bam_data' not found

Next Steps

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] dbplyr_2.5.2         rtracklayer_1.73.0   GenomicRanges_1.65.0
#>  [4] Seqinfo_1.3.0        IRanges_2.47.2       S4Vectors_0.51.3    
#>  [7] BiocGenerics_0.59.7  generics_0.1.4       BiocIO_1.23.3       
#> [10] dbSequence_0.99.0    rmarkdown_2.31      
#> 
#> loaded via a namespace (and not attached):
#>  [1] KEGGREST_1.53.0             SummarizedExperiment_1.43.0
#>  [3] rjson_0.2.23                xfun_0.58                  
#>  [5] bslib_0.11.0                Biobase_2.73.1             
#>  [7] lattice_0.22-9              vctrs_0.7.3                
#>  [9] tools_4.6.0                 bitops_1.0-9               
#> [11] curl_7.1.0                  parallel_4.6.0             
#> [13] tibble_3.3.1                AnnotationDbi_1.75.0       
#> [15] RSQLite_3.53.1              blob_1.3.0                 
#> [17] pkgconfig_2.0.3             BiocBaseUtils_1.15.1       
#> [19] Matrix_1.7-5                BSgenome_1.81.0            
#> [21] cigarillo_1.3.0             rscontract_0.1.2           
#> [23] lifecycle_1.0.5             compiler_4.6.0             
#> [25] Rsamtools_2.29.0            Biostrings_2.81.3          
#> [27] codetools_0.2-20            htmltools_0.5.9            
#> [29] sys_3.4.3                   buildtools_1.0.0           
#> [31] sass_0.4.10                 RCurl_1.98-1.19            
#> [33] yaml_2.3.12                 pillar_1.11.1              
#> [35] crayon_1.5.3                jquerylib_0.1.4            
#> [37] BiocParallel_1.47.0         DelayedArray_0.39.3        
#> [39] cachem_1.1.0                connections_0.2.1          
#> [41] abind_1.4-8                 tidyselect_1.2.1           
#> [43] digest_0.6.39               duckdb_1.5.2               
#> [45] dplyr_1.2.1                 purrr_1.2.2                
#> [47] restfulr_0.0.17             VariantAnnotation_1.59.0   
#> [49] pins_1.4.2                  maketools_1.3.2            
#> [51] fastmap_1.2.0               grid_4.6.0                 
#> [53] cli_3.6.6                   SparseArray_1.13.2         
#> [55] magrittr_2.0.5              S4Arrays_1.13.0            
#> [57] GenomicFeatures_1.65.0      utf8_1.2.6                 
#> [59] XML_3.99-0.23               withr_3.0.2                
#> [61] bit64_4.8.2                 XVector_0.53.0             
#> [63] httr_1.4.8                  matrixStats_1.5.0          
#> [65] bit_4.6.0                   otel_0.2.0                 
#> [67] png_0.1-9                   dbProject_0.1.1            
#> [69] memoise_2.0.1               evaluate_1.0.5             
#> [71] knitr_1.51                  rlang_1.2.0                
#> [73] glue_1.8.1                  DBI_1.3.0                  
#> [75] jsonlite_2.0.0              R6_2.6.1                   
#> [77] MatrixGenerics_1.25.0       GenomicAlignments_1.49.0