---
title: "TxParq.Hs.gencode.v49: Parquet transformation of TxDb.Hs with Gencode v49"
shorttitle: "outlier dytection methods"
author: "Vincent J. Carey, stvjc at channing.harvard.edu"
date: "`r format(Sys.time(), '%B %d, %Y')`"
vignette: >
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteIndexEntry{TxParq.Hs.gencode.v49: Parquet transformation of TxDb.Hs with Gencode v49}
  %\VignetteEncoding{UTF-8}
output:
  BiocStyle::html_document:
    highlight: pygments
    number_sections: yes
    theme: united
    toc: yes
---

# Introduction

This is an experiment to evaluate an approach to TxDb production
that is decoupled from AnnotationDbi and is based on serializations of
information in Gencode GTFs to Parquet.

The code that transformed Gencode GTF to parquet is in inst/python
in the sources of this package.  The code was generated by prompting
Anthropic Claude Opus 4.5 in March of 2026.

```{r getlibs, message=FALSE}
library(GenomicRanges)
library(arrow)
library(dplyr)
library(TxParq.Hs.gencode.v49)
parqloc = system.file("gc49", package="TxParq.Hs.gencode.v49")
```


```{r dotasks}
gtf <- GTFParquet(parqloc)
print(gtf)

# View GTF metadata (version, date, provider)
gtf_metadata(gtf)
```


# Rich Gene Queries


## Get ALL genes with FULL attributes

```{r attr1}
gr_genes <- genes(gtf)
head(gr_genes)
```


## Filter by gene_type

```{r lktype}
protein_coding <- genes(gtf, filter = list(gene_type = "protein_coding"))
length(protein_coding)

lncRNAs <- genes(gtf, filter = list(gene_type = "lncRNA"))
length(lncRNAs)

# Convenience functions
pc_genes <- protein_coding_genes(gtf)
lnc_genes <- lncRNA_genes(gtf)

# See what gene types are available
gene_types(gtf)
#    lncRNA      miRNA     protein_coding    pseudogene ...
```

## Filter by annotation confidence level

- Level 1: verified loci (experimental evidence)
- Level 2: manually annotated loci
- Level 3: automatically annotated loci

```{r lkfilt2}
verified_genes <- genes(gtf, filter = list(level = 1))

# Filter by source (HAVANA = manual, ENSEMBL = automatic)
manual_genes <- genes(gtf, filter = list(source = "HAVANA"))

# Filter by tags
# Common tags: basic, CCDS, overlaps_pseudogene, etc.
# Note: tags is a semicolon-separated string, so use grep for complex filters
all_genes <- genes(gtf)
basic_genes <- all_genes[grep("basic", mcols(all_genes)$tags)]

# Combine filters
chr1_protein_coding <- genes(gtf, filter = list(
  chrom = "chr1",
  gene_type = "protein_coding"
))
```

# Rich Transcript Queries

```{r lktx1}
# Transcripts include transcript_type, transcript_name, support level, etc.
tx <- transcripts(gtf)
head(mcols(tx))
#   transcript_name transcript_type gene_id gene_name source level tags
#   transcript_support_level havana_transcript ccdsid protein_id

# See transcript types
transcript_types(gtf)

# Filter by transcript type
coding_tx <- transcripts(gtf, filter = list(transcript_type = "protein_coding"))

# Filter by transcript support level (TSL)
# TSL1 = all splice junctions supported by RNA-seq
# TSL2 = best supporting transcript is flagged as suspect
# TSL3 = not all splice junctions supported
# TSL4/5 = poor support
high_confidence_tx <- transcripts(gtf, filter = list(transcript_support_level = "1"))

# Find transcripts with CCDS IDs (highly curated)
all_tx <- transcripts(gtf)
ccds_tx <- all_tx[!is.na(mcols(all_tx)$ccdsid)]
length(ccds_tx)
```

## Exon Queries with Context

```{r doex}
# Exons include exon_number, transcript_id, gene_id
ex <- exons(gtf)
head(mcols(ex))

# Exons for a specific gene
brca1_exons <- exons(gtf, filter = list(gene_id_stripped = "ENSG00000012048"))

# Exons grouped by transcript
exons_by_tx <- exonsBy(gtf, by = "tx")
length(exons_by_tx)  # Number of transcripts

# Look at one transcript's exons (sorted by exon_number)
exons_by_tx[[1]]

# Exons grouped by gene
exons_by_gene <- exonsBy(gtf, by = "gene")
```


## CDS and Protein Information


```{r docds}
# CDS includes protein_id and frame
cds_regions <- cds(gtf)
head(mcols(cds_regions))
#   transcript_id gene_id protein_id exon_number frame source level tags

# CDS grouped by transcript
cds_by_tx <- cdsBy(gtf, by = "tx")

# Find all CDS for a specific protein
all_cds <- cds(gtf)
# protein_ids <- unique(mcols(all_cds)$protein_id)
```


## UTRs and Codons


```{r doutr}
# 5' UTRs
utr5 <- utrs(gtf, type = "5prime")

# 3' UTRs
utr3 <- utrs(gtf, type = "3prime")

# Start codons
start_codons <- codons(gtf, type = "start")

# Stop codons
stop_codons <- codons(gtf, type = "stop")
```


## Transcripts by Gene


```{r dotx}
# Get transcripts grouped by gene
tx_by_gene <- transcriptsBy(gtf, by = "gene")

# How many transcripts per gene?
tx_counts <- lengths(tx_by_gene)
summary(tx_counts)

# Genes with most isoforms
head(sort(tx_counts, decreasing = TRUE), 20)
```


# Region Queries


```{r doreg}

# Find genes in a region
region <- GRanges("chr1", IRanges(1e6, 2e6))
genes_in_query <- genes_in_region(gtf, region)

# Find transcripts in a region
tx_in_query <- transcripts_in_region(gtf, region)
```


```{r doprom}
# Get promoter regions (now with gene_type!)
promoters <- promoters(genes(gtf), upstream = 2000, downstream = 200)
# promoters still has all mcols from genes
head(mcols(promoters)$gene_type)

# Filter for protein-coding promoters
pc_promoters <- promoters[mcols(promoters)$gene_type == "protein_coding"]
head(pc_promoters)
```


# Performance Tips

- Use filters to reduce data loaded
```
    genes(gtf, filter = list(chrom = "chr1"))  # Fast
    genes(gtf)[seqnames(genes(gtf)) == "chr1"] # Slower (loads all first)
```

- Select only columns you need
```
    genes(gtf, columns = c("gene_name", "gene_type"))
```

- Use versioned IDs when you need them
```
    genes(gtf, use_versioned_ids = TRUE)  # ENSG00000290825.2
    genes(gtf, use_versioned_ids = FALSE) # ENSG00000290825 (default)
```

-  For large operations, consider working with the Parquet directly

We have different field names for different components.

```{r lkpp}
library(arrow)
ds = open_dataset(parqloc)
getflds = function(x) { open_dataset(x) |> names() }
uu = sapply(ds$files, getflds, USE.NAMES=TRUE)
names(uu) = sapply(names(uu), basename)
uu
```

# Session information

```{r lksess}
sessionInfo()
```