Data Ingestion

Overview

dbSequence stores genomic data in DuckDB, enabling analysis of datasets larger than available memory. Data stays in the database—R operates on lazy references.

Supported Formats

| Format  | Function     | BiocIO Class |
|---------|--------------|--------------|
| BED     | `read_bed()` | `BEDFile`    |
| VCF     | `read_vcf()` | `VcfFile`    |
| GFF/GTF | `read_gff()` | `GFFFile`    |
| BAM     | `read_bam()` | `BamFile`    |

Quick Start

library(dbSequence)

bed_file <- system.file("extdata", "example.bed", package = "dbSequence")

# Import a BED file
db_seq <- read_bed(bed_file)

# Data stays in DuckDB - this is a lazy reference
db_seq
#> # Class:    dbSequence
#> # Source:   SQL [?? x 6]
#> # Database: DuckDB 1.5.2 [unknown@Linux 6.17.0-1018-azure:R 4.6.0/:memory:]
#>    seqnames start   end name      score strand
#>    <chr>    <int> <int> <chr>     <dbl> <chr> 
#>  1 chr1       101   200 feature1    500 +     
#>  2 chr1       301   400 feature2    750 -     
#>  3 chr2       151   250 feature3    600 +     
#>  4 chr2       501   600 feature4    800 -     
#>  5 chr3      1001  1100 feature5    650 +     
#>  6 chr3      2001  2100 feature6    900 +     
#>  7 chr4       501   700 feature7    550 -     
#>  8 chr4      1501  1600 feature8    720 +     
#>  9 chr5       801   900 feature9    680 +     
#> 10 chr5      1201  1300 feature10   590 -

BiocIO Pattern

For more control over where the data is stored, such as the destination database file and the table name, use the BiocIO import() function:

library(BiocIO)
library(rtracklayer)

bed_file <- system.file("extdata", "example.bed", package = "dbSequence")
db_file <- tempfile(fileext = ".duckdb")

# Specify destination database
db_seq <- import(
    BEDFile(bed_file),
    dest = DuckDBFile(db_file),
    table_name = "fragments"
)

Lazy Evaluation

The key feature: data is not loaded into R memory until you explicitly ask for it.

# This does NOT load data
db_seq <- read_bed(bed_file)

# Still no data in memory - just adds a filter condition
region <- GenomicRanges::GRanges("chr1:100-500")
filtered <- filter_by_overlaps(db_seq, region)

# Data only loads when you explicitly collect
result <- as_granges(filtered) # NOW data enters R

Multiple Tables

Store multiple datasets in one database:

db <- DuckDBFile(tempfile(fileext = ".duckdb"))
gff_file <- system.file("extdata", "example.gff3", package = "dbSequence")

# Import different files to same database
peaks <- import(BEDFile(bed_file), dest = db, table_name = "peaks")
genes <- import(GFFFile(gff_file), dest = db, table_name = "genes")

Connection Management

dbSequence manages connections automatically, but for long sessions you can access the underlying connection or create and close one yourself:

# Access the underlying connection
con <- dbProject::conn(db_seq)

# For manual control, create your own connection
con <- DBI::dbConnect(duckdb::duckdb(), tempfile(fileext = ".duckdb"))
# ... do work ...
DBI::dbDisconnect(con, shutdown = TRUE)

Next Steps

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] rtracklayer_1.73.0   GenomicRanges_1.65.0 Seqinfo_1.3.0       
#>  [4] IRanges_2.47.2       S4Vectors_0.51.3     BiocGenerics_0.59.7 
#>  [7] generics_0.1.4       BiocIO_1.23.3        dbSequence_0.99.0   
#> [10] rmarkdown_2.31      
#> 
#> loaded via a namespace (and not attached):
#>  [1] KEGGREST_1.53.0             SummarizedExperiment_1.43.0
#>  [3] rjson_0.2.23                xfun_0.58                  
#>  [5] bslib_0.11.0                Biobase_2.73.1             
#>  [7] lattice_0.22-9              vctrs_0.7.3                
#>  [9] tools_4.6.0                 bitops_1.0-9               
#> [11] curl_7.1.0                  parallel_4.6.0             
#> [13] tibble_3.3.1                AnnotationDbi_1.75.0       
#> [15] RSQLite_3.53.1              blob_1.3.0                 
#> [17] pkgconfig_2.0.3             BiocBaseUtils_1.15.1       
#> [19] Matrix_1.7-5                BSgenome_1.81.0            
#> [21] dbplyr_2.5.2                cigarillo_1.3.0            
#> [23] rscontract_0.1.2            lifecycle_1.0.5            
#> [25] compiler_4.6.0              Rsamtools_2.29.0           
#> [27] Biostrings_2.81.3           codetools_0.2-20           
#> [29] htmltools_0.5.9             sys_3.4.3                  
#> [31] buildtools_1.0.0            sass_0.4.10                
#> [33] RCurl_1.98-1.19             yaml_2.3.12                
#> [35] pillar_1.11.1               crayon_1.5.3               
#> [37] jquerylib_0.1.4             BiocParallel_1.47.0        
#> [39] DelayedArray_0.39.3         cachem_1.1.0               
#> [41] connections_0.2.1           abind_1.4-8                
#> [43] tidyselect_1.2.1            digest_0.6.39              
#> [45] duckdb_1.5.2                dplyr_1.2.1                
#> [47] purrr_1.2.2                 restfulr_0.0.17            
#> [49] VariantAnnotation_1.59.0    pins_1.4.2                 
#> [51] maketools_1.3.2             fastmap_1.2.0              
#> [53] grid_4.6.0                  cli_3.6.6                  
#> [55] SparseArray_1.13.2          magrittr_2.0.5             
#> [57] S4Arrays_1.13.0             GenomicFeatures_1.65.0     
#> [59] utf8_1.2.6                  XML_3.99-0.23              
#> [61] withr_3.0.2                 bit64_4.8.2                
#> [63] XVector_0.53.0              httr_1.4.8                 
#> [65] matrixStats_1.5.0           bit_4.6.0                  
#> [67] otel_0.2.0                  png_0.1-9                  
#> [69] dbProject_0.1.1             memoise_2.0.1              
#> [71] evaluate_1.0.5              knitr_1.51                 
#> [73] rlang_1.2.0                 glue_1.8.1                 
#> [75] DBI_1.3.0                   jsonlite_2.0.0             
#> [77] R6_2.6.1                    MatrixGenerics_1.25.0      
#> [79] GenomicAlignments_1.49.0