Skip to content

Commit

Permalink
rerun methylation analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
komalsrathi committed Sep 22, 2024
1 parent d1badc7 commit 2a1d163
Show file tree
Hide file tree
Showing 19 changed files with 78 additions and 1,371 deletions.
19 changes: 9 additions & 10 deletions analyses/methylation_analysis/01-limma_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ suppressPackageStartupMessages({
library(msigdbr)
library(limma)
library(optparse)
library(data.table)
library(dplyr)
})

# increase memory to read large files
mem.maxVSize(vsize = 102400)

# parse command line options
option_list <- list(
make_option(c("--methyl_mat"), type = "character", help = "methylation data matrix, preferably m-values (.rds) "),
Expand All @@ -22,18 +23,16 @@ opt <- parse_args(OptionParser(option_list = option_list, add_help_option = TRUE
output_dir <- opt$output_dir
dir.create(output_dir, showWarnings = F, recursive = T)

# read m-values
methyl_m_values_full <- readRDS(opt$methyl_mat) %>%
dplyr::slice_head(n = 500000) %>%
na.omit() %>%
dplyr::filter(!duplicated(Probe_ID))
# read m-values and remove rows with NA values
methyl_m_values_full <- readRDS(opt$methyl_mat)
methyl_m_values_full <- methyl_m_values_full %>%
tibble::column_to_rownames('Probe_ID')
tibble::column_to_rownames(var = 'Probe_ID')
methyl_m_values_full <- methyl_m_values_full[complete.cases(methyl_m_values_full),]

# read annotation
methyl_annot_full <- data.table::fread(opt$methyl_annot) %>%
dplyr::filter(!duplicated(Probe_ID))

distinct(Probe_ID, .keep_all = TRUE)
# create generalized function for gene feature analysis
run_analysis <- function(methyl_m_values_full,
methyl_annot_full,
Expand Down
30 changes: 17 additions & 13 deletions analyses/methylation_analysis/02-dms_gsameth_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ suppressPackageStartupMessages({
library(missMethyl)
})

# increase memory to read large files
mem.maxVSize(vsize = 102400)

# parse command line options
option_list <- list(
make_option(c("--methyl_mat"), type = "character", help = "methylation data matrix, preferably m-values (.rds) "),
Expand Down Expand Up @@ -50,13 +53,14 @@ gene_set_entrez <- base::split(gene_set$entrez_gene, list(gene_set$gs_name))
# read differentially methylated sites
diffexpr_sites <- read_tsv(diffexpr_sites)

# read methylation matrix for pulling all probes used in the analysis
methyl_mat <- readRDS(methyl_mat) %>%
dplyr::slice_head(n = 500000) %>%
na.omit() %>%
dplyr::filter(!duplicated(Probe_ID))
# read m-values and remove rows with NA values
methyl_mat <- readRDS(methyl_mat)
methyl_mat <- methyl_mat %>%
tibble::column_to_rownames('Probe_ID')
tibble::column_to_rownames(var = 'Probe_ID')
methyl_mat <- methyl_mat[complete.cases(methyl_mat),]

# pull all probes to be used in the analysis
all_cpgs <- rownames(methyl_mat)

# use a for-loop
clusters <- unique(diffexpr_sites$cluster)
Expand All @@ -68,14 +72,14 @@ pdf(
)
for (i in 1:length(clusters)) {
sigCpGs <- diffexpr_sites %>%
dplyr::filter(cluster == clusters[i])
filter(cluster == clusters[i])

# pathway enrichment using gsameth
# take top 10000 instead of all significant probes because the latter results in too many probes and hence pathway enrichment does not work
sigCpGs <- sigCpGs %>%
dplyr::arrange(adj.P.Val) %>%
dplyr::slice_head(n = 10000) %>%
dplyr::pull(probes)
slice_head(n = 10000) %>%
pull(probes)
pathway_df <- gsameth(
sig.cpg = sigCpGs,
all.cpg = all_cpgs,
Expand All @@ -87,10 +91,10 @@ for (i in 1:length(clusters)) {
# significant pathways
pathway_df <- pathway_df %>%
as.data.frame() %>%
dplyr::mutate(cluster = clusters[i]) %>%
tibble::rownames_to_column("pathway") %>%
mutate(cluster = clusters[i]) %>%
rownames_to_column("pathway") %>%
dplyr::arrange(FDR) %>%
dplyr::filter(FDR < 0.1) # not many significant pathways, so use a loose cutoff
filter(FDR < 0.1) # not many significant pathways, so use a loose cutoff

# combine with full output
all_pathway_df <- rbind(all_pathway_df, pathway_df)
Expand All @@ -99,7 +103,7 @@ for (i in 1:length(clusters)) {
if (nrow(pathway_df) > 0) {
pathway_df <- pathway_df %>%
dplyr::arrange(FDR) %>%
dplyr::slice_head(n = 50)
slice_head(n = 50)
pathway_df$pathway <- gsub("_", " ", pathway_df$pathway)
pathway_df$pathway <- factor(pathway_df$pathway, levels = pathway_df$pathway)
p <- ggplot(pathway_df, aes(pathway, y = (-1) * log10(FDR), fill = FDR)) +
Expand Down
12 changes: 10 additions & 2 deletions analyses/methylation_analysis/03-dmr_gsaregion_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@ suppressPackageStartupMessages({
library(msigdbr)
library(limma)
library(DMRcate)
library(DMRcatedata)
library(missMethyl)
library(optparse)
})

# increase memory to read large files
mem.maxVSize(vsize = 102400)

# parse command line options
option_list <- list(
make_option(c("--methyl_mat"), type = "character", help = "methylation data matrix, preferably m-values (.rds) "),
Expand Down Expand Up @@ -46,8 +50,11 @@ gene_set <- msigdbr::msigdbr(species = "Homo sapiens",
gene_set <- gene_set %>% dplyr::select(gs_name, entrez_gene)
gene_set_entrez <- base::split(gene_set$entrez_gene, list(gene_set$gs_name))

# read m-values
# read m-values and remove rows with NA values
methyl_m_values_full <- readRDS(opt$methyl_mat)
methyl_m_values_full <- methyl_m_values_full %>%
tibble::column_to_rownames(var = 'Probe_ID')
methyl_m_values_full <- methyl_m_values_full[complete.cases(methyl_m_values_full),]

# read annotation
methyl_annot_full <- data.table::fread(opt$methyl_annot)
Expand Down Expand Up @@ -110,7 +117,8 @@ run_analysis <- function(methyl_m_values_full,
arraytype = "EPICv1",
analysis.type = "differential",
design = design,
coef = 2
coef = 2,
fdr = 0.1 # increase threshold as 0.05 returned no significant CpGs
)
DMRs <- dmrcate(myAnnotation, lambda = 1000, C = 2)
results.ranges <- extractRanges(DMRs)
Expand Down
Loading

0 comments on commit 2a1d163

Please sign in to comment.