--- title: "CMEnt End-to-End Example" author: "CMEnt Package" date: "`r Sys.Date()`" output: BiocStyle::html_document: code_folding: hide vignette: > %\VignetteIndexEntry{CMEnt End-to-End Example} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r vignette_setup, include=FALSE} knitr::opts_chunk$set( echo = TRUE, warning = FALSE, message = FALSE, fig.width = 12, fig.height = 8, dpi = 300, fig.retina = 2 ) ``` ```{r html_styles, echo=FALSE, results='asis'} cat(" ") ``` ## Goal This notebook showcases all CMEnt capabilities, including DMR scoring, motif extraction, motif-based DMR interactions, and visualizations. ## Setup ```{r setup} suppressPackageStartupMessages({ library(CMEnt) library(GenomicRanges) library(ggplot2) }) genome <- "hg19" ``` ## Load Example Inputs ```{r load_inputs} # loadExampleInputData fetches example resources from DMRsegaldata via # ExperimentHub and assigns them into the current environment. loadExampleInputDataChr5And11("beta") loadExampleInputDataChr5And11("pheno") loadExampleInputDataChr5And11("array_type") # DMPs are produced from minfi, and have the form of a dataframe loadExampleInputDataChr5And11("dmps") cat(sprintf("Beta matrix dimensions: %d sites x %d samples\n", nrow(beta), ncol(beta))) cat(sprintf("Phenotype data dimensions: %d samples x %d columns\n", nrow(pheno), ncol(pheno))) cat(sprintf("Array type: %s\n", array_type)) cat(sprintf("Number of DMPs (seeds): %d\n", nrow(dmps))) cat(sprintf("Head of phenotype data:\n")) print(head(pheno)) cat(sprintf("Head of DMPs:\n")) print(head(dmps)) # The beta handler is used to retrieve beta values for the DMRs and their seeds, and to perform optional SVM-based scoring of the DMRs. # It is better to initialize it once and reuse it, as it performs some pre-processing steps that are common across the DMRs. # The beta handler can be supplied as the `beta` argument for the DMR calling and scoring functions, to avoid redundant initialization. beta_handler <- getBetaHandler(beta, array = array_type, genome = genome) ``` ## DMR Assembly from Seeds The `buildDMRs` function is used to build DMRs from the provided seeds, which in this case are the DMPs. The function performs a correlation-based expansion of the regions around the seeds, considering both statistical significance and biological relevance of methylation changes. To keep the vignette fast and reproducible during package checks, we load a bundled example result generated with this workflow. The resulting DMRs are stored in a GRanges object, which includes metadata columns for the number of seeds and sites in each DMR, delta-beta between sample groups, and DMR-level `pval`/`qval` when seed p-values were supplied. The number of DMRs found is reported. ```{r build_dmrs} pheno[, "case_control"] <- pheno$Sample_Group == "cancer" dmrs <- readRDS(system.file("extdata", "example_outputChr5And11.rds", package = "CMEnt")) cat(sprintf("DMRs found: %d\n", length(dmrs))) cat(sprintf("DMRs metadata columns: %s\n", paste(colnames(mcols(dmrs)), collapse = ", "))) cat(sprintf("DMRs genomic range example: %s\n", paste0(seqnames(dmrs)[1], ":", start(dmrs)[1], "-", end(dmrs)[1]))) ``` ## DMR Summary DMRs are primarily summarized by genomic support, DMR-level `pval`/`qval` when available, and methylation effect size. The optional SVM-derived `score` and `cv_accuracy` columns are complementary measures of how well the DMR methylation pattern discriminates sample groups. In addition, score trajectories are smoothed per chromosome and segmented, enabling assignment of localized block IDs that capture saddle-like peaks in Manhattan plots. ```{r dmr_summary} dmr_meta <- as.data.frame(mcols(dmrs)) summary_tbl <- data.frame( chr = as.character(seqnames(dmrs)), start = start(dmrs), end = end(dmrs), width = width(dmrs), seeds_num = mcols(dmrs)$seeds_num, sites_num = mcols(dmrs)$sites_num, delta_beta = round(mcols(dmrs)$delta_beta, 4), pval = if ("pval" %in% colnames(dmr_meta)) signif(dmr_meta$pval, 3) else NA_real_, qval = if ("qval" %in% colnames(dmr_meta)) signif(dmr_meta$qval, 3) else NA_real_, score = if ("score" %in% colnames(dmr_meta)) round(dmr_meta$score, 4) else NA_real_, cv_accuracy = if ("cv_accuracy" %in% colnames(dmr_meta)) round(dmr_meta$cv_accuracy, 4) else NA_real_, block_id = if ("block_id" %in% colnames(dmr_meta)) dmr_meta$block_id else NA_character_ ) if (any(is.finite(summary_tbl$qval))) { summary_tbl <- summary_tbl[order(summary_tbl$qval, -abs(summary_tbl$delta_beta), -summary_tbl$score, na.last = TRUE), ] } else { summary_tbl <- summary_tbl[order(-abs(summary_tbl$delta_beta), -summary_tbl$score, na.last = TRUE), ] } knitr::kable(head(summary_tbl, 5), caption = "Top 5 DMRs by statistical evidence/effect size, with SVM score as a complementary measure") ``` ## Motif Discovery Based on the seeds of the DMRs, motifs are extracted using the `extractDMRMotifs` function (called by default when `buildDMRs` is used). The function retrieves the sequences around the seeds, applies a flank size of 5 bp, and identifies consensus sequences. The number of DMRs with extracted motifs and the number of unique motif consensus sequences are reported. ```{r motif_discovery} n_motif_consensus <- sum(!is.na(mcols(dmrs)$consensus_seq)) n_unique_motif_consensus <- length(unique(stats::na.omit(mcols(dmrs)$consensus_seq))) cat(sprintf("Motifs extracted for %d/%d DMRs.\n", n_motif_consensus, length(dmrs))) cat(sprintf("Unique motif consensus sequences: %d\n", n_unique_motif_consensus)) ``` ## Motif-Based DMR Interactions Once motifs are extracted, the `computeDMRsInteraction` function can be used to identify "interactions" between DMRs based on motif similarity. The rationale is that DMRs with similar motifs may be co-regulated or functionally related, even if they are not in close genomic proximity. The function computes pairwise similarities between the PWMs of the extracted motifs, excluding the center site positions, corrects for microarray design biases, applies a minimum similarity threshold of 0.8, and identifies connected components of interacting DMRs. The resulting components are annotated with JASPAR motifs if available. The number of interactions, components, and the size of the largest component are reported. Additionally, the number of components with JASPAR annotation is provided. ```{r motif_interactions} query_with_jaspar <- requireNamespace("JASPAR2024", quietly = TRUE) interaction_results <- computeDMRsInteraction( dmrs = dmrs, genome = genome, array = array_type, min_similarity = 0.8, motif_site_flank_size = 5, min_component_size = 2, query_components_with_jaspar = query_with_jaspar ) interaction_tbl <- interaction_results$interactions component_tbl <- interaction_results$components dmrs <- interaction_results$dmrs # The input DMRs are returned with additional metadata columns for the interactions and components. cat(sprintf("Interactions found: %d\n", nrow(interaction_tbl))) cat(sprintf("Components found: %d\n", nrow(component_tbl))) if (nrow(component_tbl) > 0) { largest_component <- max(component_tbl$size, na.rm = TRUE) cat( sprintf( "Largest component size: %d / %d DMRs (%.2f%%)\n", largest_component, length(dmrs), 100 * largest_component / length(dmrs) ) ) } annotated_components_m <- !is.na(component_tbl$jaspar_names) & nzchar(component_tbl$jaspar_names) annotated_components <- sum(annotated_components_m) cat(sprintf("Components with JASPAR annotation: %d / %d\n", annotated_components, nrow(component_tbl))) if (nrow(interaction_tbl) > 5) { knitr::kable( head(interaction_tbl[order(-interaction_tbl$sim), ], 5), caption = "Top 5 motif-similarity DMR interactions" ) } if (annotated_components > 0) { components_tbl <- component_tbl[annotated_components_m, c("component_id", "size", "consensus_seq", "jaspar_names")] components_tbl <- components_tbl[order(components_tbl$size, decreasing = TRUE), , drop = FALSE] if (nrow(components_tbl) > 0) { knitr::kable( components_tbl, caption = "Motif interaction components with JASPAR annotation" ) } } ``` ## Visualizations ### DMR Score Manhattan Plot A manhattan plot of the DMRs is drawn, showing complementary SVM scores, their annotation as promoter or gene body DMRs, and translucent block squares for localized score peaks identified by the scoring block extension. The DMRs that belong to both promoter and gene body are omitted. ```{r manhattan_plot, fig.height=6, fig.width=14, dpi=150} plotDMRsManhattan( dmrs = dmrs, promoter_col = "in_promoter_of", gene_body_col = "in_gene_body_of" ) ``` ### Single DMR Plot Each DMR can be plotted with the `plotDMR` function, which shows the beta values of the sites in the DMR, the seeds, and the motif-based interactions with other DMRs, along with JASPAR annotations if existing. Here we select a DMR that has more than one merged DMR and has an extension beyond the seed region, in both sides. ```{r single_dmr_plot, fig.height=10} merged_counts <- suppressWarnings(as.numeric(as.character(mcols(dmrs)$merged_dmrs_num))) has_extension <- (mcols(dmrs)$start_seed != mcols(dmrs)$start_site) & (mcols(dmrs)$end_seed != mcols(dmrs)$end_site) complex_candidates <- which(is.finite(merged_counts) & merged_counts > 1 & has_extension) if (length(complex_candidates) == 0) { complex_candidates <- which(has_extension) } if (length(complex_candidates) == 0) { complex_candidates <- 1L } selected_dmr_idx <- complex_candidates[1] cat( sprintf( "Selected DMR index %d (score=%s, seeds_num=%s, sites_num=%s)\n", selected_dmr_idx, as.character(mcols(dmrs)$score[selected_dmr_idx]), as.character(mcols(dmrs)$seeds_num[selected_dmr_idx]), as.character(mcols(dmrs)$sites_num[selected_dmr_idx]) ) ) plotDMR( dmrs = dmrs, dmr_index = selected_dmr_idx, beta = beta_handler, pheno = pheno, sample_group_col = "Sample_Group", genome = genome, array = array_type, motif_site_flank_size = 5 ) ``` ### Summary Circos Plot A circos plot consisting of the extracted information from CMEnt is drawn along the genome or a specified region. The information includes: - The DMRs and their complementary SVM scores (at most 40 DMRs per chromosome are shown) - Selected beta values of the DMRs (at most 5 sites per DMR, scoring by absolute delta-beta, are shown) - The motif-based interactions between DMRs, with links direction going from higher scoring to lower scoring (at most 120 interactions groups are shown) - The JASPAR annotation of the motif-based groups, if available, on the legend. ```{r circos_plot, eval=TRUE, fig.height=10, fig.width=14, dpi=300, fig.retina=1.5} circos_dmrs <- dmrs circos_summary_plotted <- FALSE tryCatch( { plotDMRsCircos( dmrs = circos_dmrs, beta = beta_handler, pheno = pheno, genome = genome, array = array_type, sample_group_col = "Sample_Group", min_similarity = 0.8, motif_site_flank_size = 5, max_num_samples_per_group = 5, max_dmrs_per_chr = 40, max_sites_per_dmr = 5, max_components = 120, #region = "chr11:2Mb-5Mb, chr11:72Mb-83Mb, chr5:1Mb-12Mb", # nolint degenerate_resolution = 1e6, verbose = 1 ) circos_summary_plotted <- TRUE }, error = function(e) { message("Circos plot generation failed: ", conditionMessage(e)) NULL } ) if (!circos_summary_plotted) { plot.new() text(0.5, 0.5, "Circos summary plot unavailable for this dataset.", cex = 1.1) } ``` It is also possible to plot the circos for one or multiple regions to assess connectivity, e.g, for chr17:2Mb-5Mb, chr17:72Mb-83Mb and chr19:1Mb-12Mb, which are regions that in the earlier plot were shown to interact. (The reported score in the legend is computed based on the visible DMRs of each component): ```{r circos_plot_region, eval=TRUE, fig.height=10, fig.width=14, dpi=300, fig.retina=1.5} circos_region_plotted <- FALSE tryCatch( { plotDMRsCircos( dmrs = circos_dmrs, beta = beta_handler, pheno = pheno, genome = genome, array = array_type, sample_group_col = "Sample_Group", min_similarity = 0.8, motif_site_flank_size = 5, max_num_samples_per_group = 5, max_dmrs_per_chr = 40, max_sites_per_dmr = 5, max_components = 120, region = "chr17:2Mb-5Mb, chr17:72Mb-83Mb, chr19:1Mb-12Mb", degenerate_resolution = 10e2, # If the width of the DMR is larger than that, the DMRs is shown as rectangles and ribbons are used instead of links to connect the DMRs. verbose = 1 ) circos_region_plotted <- TRUE }, error = function(e) { message("Circos plot generation failed: ", conditionMessage(e)) NULL } ) if (!circos_region_plotted) { plot.new() text(0.5, 0.5, "Circos region plot unavailable for selected regions.", cex = 1.1) } ``` ### CMEnt interactive Shiny App The CMEnt interactive Shiny app can be launched with the `launchCMEntViewer` function, which allows for interactive exploration of the DMRs and the aforementioned plots. In order to use it, the `output_prefix` argument should be provided in the `buildDMRs` function, as the app looks for the output files with this prefix. ```{r launch_shiny_app, eval=FALSE} launchCMEntViewer("tmp/CMEnt_example") ``` ## Session Info ```{r session_info} sessionInfo() ```