Local Analysis of Plant Genomes with PlantTxDbHub

Introduction

This vignette demonstrates how to use the PlantTxDbHub package to analyse transcript‑level annotations for three plant species:

  • Arabidopsis thaliana (TAIR10)
  • Glycine max (Wm82 v2.1)
  • Oryza sativa (IRGSP‑1.0)

The annotations were generated from Ensembl Plants release 62 GTF files and stored as standalone TxDb SQLite databases on Zenodo.
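The helper function downloadPlantTxDbs() caches the databases in your user data directory; subsequent analyses use standard TxDb methods.
The examples below assume that db_dir holds the path to the directory containing the downloaded .sqlite files, and that the GenomicFeatures and AnnotationDbi packages are attached (loadDb() and select() come from AnnotationDbi).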

Load a TxDb (Arabidopsis example)

ath_file <- file.path(db_dir, "TxDb.Athaliana.TAIR10.v62.sqlite")
txdb_ath <- loadDb(ath_file)
txdb_ath
#> TxDb object:
#> # Db type: TxDb
#> # Supporting package: GenomicFeatures
#> # Data source: Ensembl Plants release 62 GTF
#> # Organism: Arabidopsis thaliana
#> # Taxonomy ID: 3702
#> # Source: Ensembl Plants
#> # Ensembl release: 62
#> # GTF file: Arabidopsis_thaliana.TAIR10.62.gtf.gz
#> # Genome assembly: TAIR10
#> # Creation date: 2026-06-09
#> # Created by: 
#> # Contact: Kabilan S <[email protected]>
#> # Genome: NA
#> # Nb of transcripts: 54013
#> # Db created by: txdbmaker package from Bioconductor
#> # Creation time: 2026-06-09 12:36:03 +0500 (Tue, 09 Jun 2026)
#> # txdbmaker version at creation time: 1.6.2
#> # RSQLite version at creation time: 3.52.0
#> # DBSCHEMAVERSION: 1.2

Available columns and keys

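TxDb objects support the AnnotationDbi select() interface: columns() lists the fields that can be retrieved, and keytypes() lists the fields that can be used as lookup keys.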

columns(txdb_ath)
#>  [1] "CDSCHROM"   "CDSEND"     "CDSID"      "CDSNAME"    "CDSPHASE"  
#>  [6] "CDSSTART"   "CDSSTRAND"  "EXONCHROM"  "EXONEND"    "EXONID"    
#> [11] "EXONNAME"   "EXONRANK"   "EXONSTART"  "EXONSTRAND" "GENEID"    
#> [16] "TXCHROM"    "TXEND"      "TXID"       "TXNAME"     "TXSTART"   
#> [21] "TXSTRAND"   "TXTYPE"
keytypes(txdb_ath)
#> [1] "CDSID"    "CDSNAME"  "EXONID"   "EXONNAME" "GENEID"   "TXID"     "TXNAME"

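The TXTYPE column holds the transcript type stored in the database. Note that in this database every transcript has the generic value transcript, not an Ensembl biotype such as protein_coding.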

Retrieve all genes

genes() returns a GRanges with gene‑level information.
Here the database provides the gene_id as the only metadata column.

gene_gr <- genes(txdb_ath)
head(gene_gr)
#> GRanges object with 6 ranges and 1 metadata column:
#>             seqnames      ranges strand |     gene_id
#>                <Rle>   <IRanges>  <Rle> | <character>
#>   AT1G01010        1   3631-5899      + |   AT1G01010
#>   AT1G01020        1   6788-9130      - |   AT1G01020
#>   AT1G01030        1 11649-13714      - |   AT1G01030
#>   AT1G01040        1 23121-31227      + |   AT1G01040
#>   AT1G01046        1 28500-28706      + |   AT1G01046
#>   AT1G01050        1 31170-33171      - |   AT1G01050
#>   -------
#>   seqinfo: 7 sequences (2 circular) from an unspecified genome

Retrieve all transcripts

tx_gr <- transcripts(txdb_ath)
head(tx_gr)
#> GRanges object with 6 ranges and 2 metadata columns:
#>       seqnames      ranges strand |     tx_id     tx_name
#>          <Rle>   <IRanges>  <Rle> | <integer> <character>
#>   [1]        1   3631-5899      + |         1 AT1G01010.1
#>   [2]        1 11101-11372      + |         2 AT1G03987.1
#>   [3]        1 23121-31227      + |         3 AT1G01040.1
#>   [4]        1 23416-31120      + |         4 AT1G01040.2
#>   [5]        1 28500-28706      + |         5   at1g01046
#>   [6]        1 32727-33009      + |         6 AT1G03997.1
#>   -------
#>   seqinfo: 7 sequences (2 circular) from an unspecified genome

Retrieve all exons

ex_gr <- exons(txdb_ath, columns = "exon_id")
head(ex_gr)
#> GRanges object with 6 ranges and 1 metadata column:
#>       seqnames    ranges strand |   exon_id
#>          <Rle> <IRanges>  <Rle> | <integer>
#>   [1]        1 3631-3913      + |         1
#>   [2]        1 3996-4276      + |         2
#>   [3]        1 4486-4605      + |         3
#>   [4]        1 4706-5095      + |         4
#>   [5]        1 5174-5326      + |         5
#>   [6]        1 5439-5899      + |         6
#>   -------
#>   seqinfo: 7 sequences (2 circular) from an unspecified genome

Filter by gene ID

To retrieve the ranges of specific genes, pass a named list to the filter argument of genes():

my_genes <- c("AT1G01010", "AT1G01020")
genes(txdb_ath, filter = list(gene_id = my_genes))
#> GRanges object with 2 ranges and 1 metadata column:
#>             seqnames    ranges strand |     gene_id
#>                <Rle> <IRanges>  <Rle> | <character>
#>   AT1G01010        1 3631-5899      + |   AT1G01010
#>   AT1G01020        1 6788-9130      - |   AT1G01020
#>   -------
#>   seqinfo: 7 sequences (2 circular) from an unspecified genome

If the filter argument is not available (older GenomicFeatures), fall back to select(), which returns a data.frame rather than a GRanges:

sel <- select(txdb_ath,
  keys = my_genes,
  columns = c("GENEID", "TXID", "TXTYPE", "EXONID"),
  keytype = "GENEID"
)
#> 'select()' returned 1:many mapping between keys and columns
head(sel)
#>      GENEID EXONID TXID     TXTYPE
#> 1 AT1G01010      1    1 transcript
#> 2 AT1G01010      2    1 transcript
#> 3 AT1G01010      3    1 transcript
#> 4 AT1G01010      4    1 transcript
#> 5 AT1G01010      5    1 transcript
#> 6 AT1G01010      6    1 transcript

This returns a 1:many mapping: each gene is repeated on one row for every transcript/exon combination that belongs to it.

Retrieving transcript types

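The TXTYPE column holds the transcript type recorded when the TxDb was built. You can extract it for all transcripts with select(); as the table below shows, this Arabidopsis database stores the generic value transcript for every entry rather than an Ensembl biotype such as protein_coding: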

tx_info <- select(txdb_ath,
  keys = keys(txdb_ath, "TXID"),
  columns = c("TXID", "TXTYPE"),
  keytype = "TXID"
)
#> 'select()' returned 1:1 mapping between keys and columns
table(tx_info$TXTYPE)
#> 
#> transcript 
#>      54013

Working with chromosome names

The database stores nuclear chromosome names as bare numbers (1, 2, …) and the organelle genomes as Mt and Pt. To restrict the genes to the five nuclear chromosomes (dropping every range on Mt and Pt) and then add the standard Arabidopsis Chr prefix:

gene_gr_nuc <- keepSeqlevels(gene_gr,
  value = c("1", "2", "3", "4", "5"),
  pruning.mode = "coarse"
)
seqlevels(gene_gr_nuc) <- paste0("Chr", seqlevels(gene_gr_nuc))
seqlevels(gene_gr_nuc)
#> [1] "Chr1" "Chr2" "Chr3" "Chr4" "Chr5"

Soybean (Glycine max) example

gmx_file <- file.path(db_dir, "TxDb.Gmax.Wm82.v62.sqlite")
txdb_gmx <- loadDb(gmx_file)
head(genes(txdb_gmx))
#> GRanges object with 6 ranges and 1 metadata column:
#>                   seqnames          ranges strand |         gene_id
#>                      <Rle>       <IRanges>  <Rle> |     <character>
#>   ENSRNA049760019       11 2675651-2675723      + | ENSRNA049760019
#>   ENSRNA049760020       11 2794530-2794602      + | ENSRNA049760020
#>   ENSRNA049760021       11 5126230-5126303      + | ENSRNA049760021
#>   ENSRNA049760022       11 8090262-8090334      + | ENSRNA049760022
#>   ENSRNA049760023       11 8422999-8423082      + | ENSRNA049760023
#>   ENSRNA049760024       11 9000534-9000605      + | ENSRNA049760024
#>   -------
#>   seqinfo: 22 sequences (2 circular) from an unspecified genome

Rice (Oryza sativa) example

osa_file <- file.path(db_dir, "TxDb.Osativa.IRGSP.v62.sqlite")
txdb_osa <- loadDb(osa_file)
head(genes(txdb_osa))
#> GRanges object with 6 ranges and 1 metadata column:
#>                   seqnames            ranges strand |         gene_id
#>                      <Rle>         <IRanges>  <Rle> |     <character>
#>   ENSRNA049440515        1 33589412-33589483      - | ENSRNA049440515
#>   ENSRNA049440716        1 35264198-35264278      - | ENSRNA049440716
#>   ENSRNA049441102        1 37693591-37693662      - | ENSRNA049441102
#>   ENSRNA049441259        1 40086054-40086126      - | ENSRNA049441259
#>   ENSRNA049441339        1 40793762-40793842      - | ENSRNA049441339
#>   ENSRNA049441421        1 42318166-42318238      - | ENSRNA049441421
#>   -------
#>   seqinfo: 14 sequences (2 circular) from an unspecified genome

Session information

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] GenomeInfoDb_1.49.1    GenomicFeatures_1.65.0 AnnotationDbi_1.75.0  
#>  [4] Biobase_2.73.1         GenomicRanges_1.65.0   Seqinfo_1.3.0         
#>  [7] IRanges_2.47.2         S4Vectors_0.51.3       BiocGenerics_0.59.7   
#> [10] generics_0.1.4         PlantTxDbHub_0.99.1    BiocStyle_2.41.0      
#> 
#> loaded via a namespace (and not attached):
#>  [1] KEGGREST_1.53.0             SummarizedExperiment_1.43.0
#>  [3] rjson_0.2.23                xfun_0.58                  
#>  [5] bslib_0.11.0                lattice_0.22-9             
#>  [7] vctrs_0.7.3                 tools_4.6.0                
#>  [9] bitops_1.0-9                curl_7.1.0                 
#> [11] parallel_4.6.0              RSQLite_3.53.1             
#> [13] blob_1.3.0                  pkgconfig_2.0.3            
#> [15] BiocBaseUtils_1.15.1        Matrix_1.7-5               
#> [17] cigarillo_1.3.0             lifecycle_1.0.5            
#> [19] compiler_4.6.0              Rsamtools_2.29.0           
#> [21] Biostrings_2.81.3           codetools_0.2-20           
#> [23] htmltools_0.5.9             sys_3.4.3                  
#> [25] buildtools_1.0.0            sass_0.4.10                
#> [27] RCurl_1.98-1.19             yaml_2.3.12                
#> [29] crayon_1.5.3                jquerylib_0.1.4            
#> [31] BiocParallel_1.47.0         DelayedArray_0.39.3        
#> [33] cachem_1.1.0                abind_1.4-8                
#> [35] digest_0.6.39               restfulr_0.0.16            
#> [37] maketools_1.3.2             fastmap_1.2.0              
#> [39] grid_4.6.0                  SparseArray_1.13.2         
#> [41] cli_3.6.6                   S4Arrays_1.13.0            
#> [43] XML_3.99-0.23               UCSC.utils_1.9.0           
#> [45] bit64_4.8.2                 rmarkdown_2.31             
#> [47] XVector_0.53.0              httr_1.4.8                 
#> [49] matrixStats_1.5.0           bit_4.6.0                  
#> [51] otel_0.2.0                  png_0.1-9                  
#> [53] memoise_2.0.1               evaluate_1.0.5             
#> [55] knitr_1.51                  BiocIO_1.23.3              
#> [57] rtracklayer_1.73.0          rlang_1.2.0                
#> [59] DBI_1.3.0                   BiocManager_1.30.27        
#> [61] jsonlite_2.0.0              R6_2.6.1                   
#> [63] MatrixGenerics_1.25.0       GenomicAlignments_1.49.0