--- title: "TxParq.Hs.gencode.v49: Parquet transformation of TxDb.Hs with Gencode v49" shorttitle: "outlier dytection methods" author: "Vincent J. Carey, stvjc at channing.harvard.edu" date: "`r format(Sys.time(), '%B %d, %Y')`" vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{TxParq.Hs.gencode.v49: Parquet transformation of TxDb.Hs with Gencode v49} %\VignetteEncoding{UTF-8} output: BiocStyle::html_document: highlight: pygments number_sections: yes theme: united toc: yes --- # Introduction This is an experiment to evaluate an approach to TxDb production that is decoupled from AnnotationDbi and is based on serializations of information in Gencode GTFs to Parquet. The code that transformed Gencode GTF to parquet is in inst/python in the sources of this package. The code was generated by prompting Anthropic Claude Opus 4.5 in March of 2026. ```{r getlibs, message=FALSE} library(GenomicRanges) library(arrow) library(dplyr) library(TxParq.Hs.gencode.v49) parqloc = system.file("gc49", package="TxParq.Hs.gencode.v49") ``` ```{r dotasks} gtf <- GTFParquet(parqloc) print(gtf) # View GTF metadata (version, date, provider) gtf_metadata(gtf) ``` # Rich Gene Queries ## Get ALL genes with FULL attributes ```{r attr1} gr_genes <- genes(gtf) head(gr_genes) ``` ## Filter by gene_type ```{r lktype} protein_coding <- genes(gtf, filter = list(gene_type = "protein_coding")) length(protein_coding) lncRNAs <- genes(gtf, filter = list(gene_type = "lncRNA")) length(lncRNAs) # Convenience functions pc_genes <- protein_coding_genes(gtf) lnc_genes <- lncRNA_genes(gtf) # See what gene types are available gene_types(gtf) # lncRNA miRNA protein_coding pseudogene ... ``` ## Filter by annotation confidence level - Level 1: verified loci (experimental evidence) - Level 2: manually annotated loci - Level 3: automatically annotated loci ```{r lkfilt2} verified_genes <- genes(gtf, filter = list(level = 1)) # Filter by source (HAVANA = manual, ENSEMBL = automatic) manual_genes <- genes(gtf, filter = list(source = "HAVANA")) # Filter by tags # Common tags: basic, CCDS, overlaps_pseudogene, etc. # Note: tags is a semicolon-separated string, so use grep for complex filters all_genes <- genes(gtf) basic_genes <- all_genes[grep("basic", mcols(all_genes)$tags)] # Combine filters chr1_protein_coding <- genes(gtf, filter = list( chrom = "chr1", gene_type = "protein_coding" )) ``` # Rich Transcript Queries ```{r lktx1} # Transcripts include transcript_type, transcript_name, support level, etc. tx <- transcripts(gtf) head(mcols(tx)) # transcript_name transcript_type gene_id gene_name source level tags # transcript_support_level havana_transcript ccdsid protein_id # See transcript types transcript_types(gtf) # Filter by transcript type coding_tx <- transcripts(gtf, filter = list(transcript_type = "protein_coding")) # Filter by transcript support level (TSL) # TSL1 = all splice junctions supported by RNA-seq # TSL2 = best supporting transcript is flagged as suspect # TSL3 = not all splice junctions supported # TSL4/5 = poor support high_confidence_tx <- transcripts(gtf, filter = list(transcript_support_level = "1")) # Find transcripts with CCDS IDs (highly curated) all_tx <- transcripts(gtf) ccds_tx <- all_tx[!is.na(mcols(all_tx)$ccdsid)] length(ccds_tx) ``` ## Exon Queries with Context ```{r doex} # Exons include exon_number, transcript_id, gene_id ex <- exons(gtf) head(mcols(ex)) # Exons for a specific gene brca1_exons <- exons(gtf, filter = list(gene_id_stripped = "ENSG00000012048")) # Exons grouped by transcript exons_by_tx <- exonsBy(gtf, by = "tx") length(exons_by_tx) # Number of transcripts # Look at one transcript's exons (sorted by exon_number) exons_by_tx[[1]] # Exons grouped by gene exons_by_gene <- exonsBy(gtf, by = "gene") ``` ## CDS and Protein Information ```{r docds} # CDS includes protein_id and frame cds_regions <- cds(gtf) head(mcols(cds_regions)) # transcript_id gene_id protein_id exon_number frame source level tags # CDS grouped by transcript cds_by_tx <- cdsBy(gtf, by = "tx") # Find all CDS for a specific protein all_cds <- cds(gtf) # protein_ids <- unique(mcols(all_cds)$protein_id) ``` ## UTRs and Codons ```{r doutr} # 5' UTRs utr5 <- utrs(gtf, type = "5prime") # 3' UTRs utr3 <- utrs(gtf, type = "3prime") # Start codons start_codons <- codons(gtf, type = "start") # Stop codons stop_codons <- codons(gtf, type = "stop") ``` ## Transcripts by Gene ```{r dotx} # Get transcripts grouped by gene tx_by_gene <- transcriptsBy(gtf, by = "gene") # How many transcripts per gene? tx_counts <- lengths(tx_by_gene) summary(tx_counts) # Genes with most isoforms head(sort(tx_counts, decreasing = TRUE), 20) ``` # Region Queries ```{r doreg} # Find genes in a region region <- GRanges("chr1", IRanges(1e6, 2e6)) genes_in_query <- genes_in_region(gtf, region) # Find transcripts in a region tx_in_query <- transcripts_in_region(gtf, region) ``` ```{r doprom} # Get promoter regions (now with gene_type!) promoters <- promoters(genes(gtf), upstream = 2000, downstream = 200) # promoters still has all mcols from genes head(mcols(promoters)$gene_type) # Filter for protein-coding promoters pc_promoters <- promoters[mcols(promoters)$gene_type == "protein_coding"] head(pc_promoters) ``` # Performance Tips - Use filters to reduce data loaded ``` genes(gtf, filter = list(chrom = "chr1")) # Fast genes(gtf)[seqnames(genes(gtf)) == "chr1"] # Slower (loads all first) ``` - Select only columns you need ``` genes(gtf, columns = c("gene_name", "gene_type")) ``` - Use versioned IDs when you need them ``` genes(gtf, use_versioned_ids = TRUE) # ENSG00000290825.2 genes(gtf, use_versioned_ids = FALSE) # ENSG00000290825 (default) ``` - For large operations, consider working with the Parquet directly We have different field names for different components. ```{r lkpp} library(arrow) ds = open_dataset(parqloc) getflds = function(x) { open_dataset(x) |> names() } uu = sapply(ds$files, getflds, USE.NAMES=TRUE) names(uu) = sapply(names(uu), basename) uu ``` # Session information ```{r lksess} sessionInfo() ```