From 2e008b095d15cace6b0126ccfb736e26c4e231e7 Mon Sep 17 00:00:00 2001 From: KamilMaliszArdigen Date: Tue, 15 Oct 2024 17:01:36 +0200 Subject: [PATCH] Extension of limma module with voom and IHW correction --- .../limma/differential/environment.yml | 7 +- modules/nf-core/limma/differential/main.nf | 1 + .../limma/differential/templates/limma_de.R | 93 ++++++++++++++++--- 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/modules/nf-core/limma/differential/environment.yml b/modules/nf-core/limma/differential/environment.yml index dcc1aef2ed3..bce4ef9cbe1 100644 --- a/modules/nf-core/limma/differential/environment.yml +++ b/modules/nf-core/limma/differential/environment.yml @@ -1,4 +1,9 @@ channels: + - conda-forge - bioconda dependencies: - - bioconductor-limma:3.54.0 + - bioconda::bioconductor-edger=4.0.16 + - bioconda::bioconductor-ihw=1.28.0 + - bioconda::bioconductor-limma=3.58.1 + - conda-forge::r-dplyr=1.1.4 + - conda-forge::r-readr=2.1.5 diff --git a/modules/nf-core/limma/differential/main.nf b/modules/nf-core/limma/differential/main.nf index 384e4649871..b3f304aac4c 100644 --- a/modules/nf-core/limma/differential/main.nf +++ b/modules/nf-core/limma/differential/main.nf @@ -17,6 +17,7 @@ process LIMMA_DIFFERENTIAL { tuple val(meta), path("*.MArrayLM.limma.rds") , emit: rdata tuple val(meta), path("*.limma.model.txt") , emit: model tuple val(meta), path("*.R_sessionInfo.log") , emit: session_info + tuple val(meta), path("*.normalised_counts.tsv") , emit: normalised_counts, optional: true path "versions.yml" , emit: versions when: diff --git a/modules/nf-core/limma/differential/templates/limma_de.R b/modules/nf-core/limma/differential/templates/limma_de.R index 1d0195694df..a4171db9ac6 100755 --- a/modules/nf-core/limma/differential/templates/limma_de.R +++ b/modules/nf-core/limma/differential/templates/limma_de.R @@ -77,6 +77,9 @@ opt <- list( subset_to_contrast_samples = FALSE, exclude_samples_col = NULL, exclude_samples_values = NULL, + use_voom = TRUE, + IHW_correction = TRUE, + number = Inf, ndups = NULL, # lmFit spacing = NULL, # lmFit block = NULL, # lmFit @@ -142,6 +145,11 @@ opt[vector_opt] = lapply(strsplit(unlist(opt[vector_opt]), ','), as.numeric) ################################################ library(limma) +library(edgeR) +library(readr) +library(dplyr) +library(tibble) +library(IHW) ################################################ ################################################ @@ -287,11 +295,38 @@ colnames(design) <- sub( paste0(contrast_variable, '.'), colnames(design) ) +# Perform voom normalisation for RNA-seq data +if (!is.null(opt\$use_voom) && opt\$use_voom) { + # Create a DGEList object for RNA-seq data + dge <- DGEList(counts = intensities.table) + + # Normalize counts using TMM + dge <- calcNormFactors(dge, method = "TMM") + + # Run voom to transform the data + voom_result <- voom(dge, design) + data_for_fit <- voom_result + + # Write the normalized counts matrix to a TSV file + normalized_counts <- voom_result\$E + normalized_counts_with_genes <- data.frame(Gene = rownames(normalized_counts), normalized_counts, row.names = NULL) + colnames(normalized_counts_with_genes)[1] <- opt\$probe_id_col + write.table(normalized_counts_with_genes, + file = paste(opt\$output_prefix, "normalised_counts.tsv", sep = '.'), + sep = "\t", + quote = FALSE, + row.names = FALSE) + +} else { + # Use as.matrix for regular microarray analysis + data_for_fit <- as.matrix(intensities.table) +} + # Prepare for and run lmFit() lmfit_args = c( list( - object = as.matrix(intensities.table), + object = data_for_fit, design = design ), opt[c('ndups', 'spacing', 'block', 'method')] @@ -322,18 +357,54 @@ ebayes_args = c( fit2 <- do.call(eBayes, ebayes_args) -# Run topTable() to generate a results data frame +if ((!is.null(opt\$use_voom) && opt\$use_voom) && (!is.null(opt\$IHW_correction) && opt\$IHW_correction)) { -toptable_args = c( - list( - fit = fit2, - sort.by = 'none', - number = nrow(intensities.table) - ), - opt[c('adjust.method', 'p.value', 'lfc', 'confint')] -) + # Define the column name for the group comparison + name_con_ref <- glue::glue("{contrast_variable}.{opt\$target_level}-{contrast_variable}.{opt\$reference_level}") + + # Calculate the median gene expression + median_expression <- apply(dge\$counts, 1, median) + # Apply IHW for multiple testing correction using median expression as covariate + ihw_result <- ihw(fit2\$p.value[, name_con_ref] ~ median_expression, alpha = 0.05) + ihw_result@df <- as_tibble(ihw_result@df, rownames = opt\$probe_id_col) + + # Convert fit\$p.value to a tibble, including rownames as a new column 'Gene' + fit_df <- as_tibble(fit2\$p.value, rownames = opt\$probe_id_col) + + fit_df <- fit_df %>% + left_join(ihw_result@df %>% select(!!sym(opt\$probe_id_col),adj_pvalue),by=opt\$probe_id_col) %>% + rename(ihw.adj.p.value = adj_pvalue) -comp.results <- do.call(topTable, toptable_args)[rownames(intensities.table),] + # Convert back to a matrix if necessary (not needed for merging, but for consistency later) + fit\$p.value <- as.matrix(fit_df %>% select(-!!sym(opt\$probe_id_col))) # Keep the original structure without probe_id_col column + + # Generate the topTable including the new adjusted p-values + toptable_args <- list(fit = fit2, coef = name_con_ref, number = opt\$number, adjust.method = opt\$adjust.method, + p.value = opt\$p.value, lfc = opt\$lfc, confint = opt\$confint) + + comp.results <- do.call(topTable, toptable_args) + # Add the IHW-adjusted p-values to the topTable results by merging based on probe_id_col + comp.results <- comp.results %>% + rownames_to_column(opt\$probe_id_col) %>% # Ensure probe_id_col column exists in comp.results + left_join(fit_df, by = opt\$probe_id_col) %>% # Merge based on probe_id_col column + column_to_rownames(var=opt\$probe_id_col) %>% + select(-sym(name_con_ref)) + +} else { + + # Run topTable() to generate a results data frame + + toptable_args = c( + list( + fit = fit2, + sort.by = 'none', + number = nrow(intensities.table) + ), + opt[c('adjust.method', 'p.value', 'lfc', 'confint')] + ) + + comp.results <- do.call(topTable, toptable_args)[rownames(intensities.table),] +} ################################################ ################################################