From 1702918a3723e10410cdb6aad7be98eac74df1bd Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 18 Nov 2024 14:47:11 -0800 Subject: [PATCH 01/24] Fixed group means and stds --- .../NF_AmpIllumina-B/README.md | 2 +- .../workflow_code/bin/pairwise_ancombc1.R | 82 ++- .../workflow_code/bin/pairwise_ancombc2.R | 64 +- .../workflow_code/bin/run_deseq2.R | 638 +++++++++--------- .../NF_AmpIllumina-B/workflow_code/main.nf | 35 +- .../workflow_code/modules/ancombc.nf | 6 +- 6 files changed, 426 insertions(+), 401 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md index a8794823..e8f0f4e8 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md @@ -105,7 +105,7 @@ nextflow run main.nf --help ``` > Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --input_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. - +> Please Note: This workflow assumes that all your raw reads end with the same suffix. If they don't, please modify your input read filenames to have the same suffix as shown in [SE_file.csv](workflow_code/SE_file.csv) and [PE_file.csv](workflow_code/PE_file.csv).
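+>
+> For reference, a paired-end input file following the PE_file.csv template pairs each unique sample id with its read paths, paired-end flag, and group label. The sample names, paths, and group labels below are hypothetical, shown only to illustrate the expected layout:
+>
+> ```
+> sample_id,forward,reverse,paired,groups
+> Sample-1,/path/to/reads/Sample-1_R1_raw.fastq.gz,/path/to/reads/Sample-1_R2_raw.fastq.gz,true,Ground_Control
+> Sample-2,/path/to/reads/Sample-2_R1_raw.fastq.gz,/path/to/reads/Sample-2_R2_raw.fastq.gz,true,Space_Flight
+> ```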
#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD or GLDS accession as input diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc1.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc1.R index e136bd8e..23ddb63b 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc1.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc1.R @@ -148,7 +148,6 @@ library(ANCOMBC) library(DescTools) library(taxize) library(glue) -library(here) library(mia) library(phyloseq) library(utils) @@ -390,7 +389,7 @@ pairwise_comp.m <- utils::combn(metadata[,group] %>% unique, 2) pairwise_comp_df <- pairwise_comp.m %>% as.data.frame colnames(pairwise_comp_df) <- map_chr(pairwise_comp_df, - \(col) str_c(col, collapse = ".vs.")) + \(col) str_c(col, collapse = "v")) comparisons <- colnames(pairwise_comp_df) names(comparisons) <- comparisons @@ -443,7 +442,7 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ select(-contains("Intercept")) %>% set_names( c("taxon", - glue("lfc_({group2}).vs.({group1})")) + glue("logFC_({group2})v({group1})")) ) # SE @@ -452,7 +451,7 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ select(-contains("Intercept")) %>% set_names( c("taxon", - glue("se_({group2}).vs.({group1})")) + glue("lfcSE_({group2})v({group1})")) ) # W @@ -461,7 +460,7 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ select(-contains("Intercept")) %>% set_names( c("taxon", - glue("W_({group2}).vs.({group1})")) + glue("Wstat_({group2})v({group1})")) ) # p_val @@ -470,7 +469,7 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ select(-contains("Intercept")) %>% set_names( c("taxon", - glue("p_({group2}).vs.({group1})")) + glue("pvalue_({group2})v({group1})")) ) # q_val @@ -479,7 +478,7 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ select(-contains("Intercept")) %>% set_names( c("taxon", - glue("q_({group2}).vs.({group1})")) + glue("qvalue_({group2})v({group1})")) ) @@ -489,7 +488,7 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ select(-contains("Intercept")) %>% set_names( c("taxon", - glue("diff_({group2}).vs.({group1})")) + glue("diff_({group2})v({group1})")) ) @@ -524,14 +523,13 @@ walk(comparisons[names(final_results_bc1)], .f = function(comparison){ # Sort ASVs in ascending order merged_stats_df <- merged_stats_df %>% rename(!!feature := taxon) %>% - #filter(str_detect(ASV, "ASV")) %>% mutate(!!feature := SortMixed(!!sym(feature))) comp_names <- merged_stats_df %>% - select(starts_with("lfc")) %>% - colnames() %>% str_remove_all("lfc_") + select(starts_with("logFC")) %>% + colnames() %>% str_remove_all("logFC_") names(comp_names) <- comp_names message("Making volcano plots...") @@ -539,11 +537,11 @@ message("Making volcano plots...") volcano_plots <- map(comp_names, function(comparison){ comp_col <- c( - glue("lfc_{comparison}"), - glue("se_{comparison}"), - glue("W_{comparison}"), - glue("p_{comparison}"), - glue("q_{comparison}"), + glue("logFC_{comparison}"), + glue("lfcSE_{comparison}"), + glue("Wstat_{comparison}"), + glue("pvalue_{comparison}"), + glue("qvalue_{comparison}"), glue("diff_{comparison}") ) @@ -554,8 +552,8 @@ volcano_plots <- map(comp_names, function(comparison){ pattern = "(.+)_.+", replacement = "\\1") - p <- ggplot(sub_res_df, aes(x=lfc, y=-log10(p), color=diff, label=!!sym(feature))) + - geom_point(size=4) + 
geom_point(size=4) + + p <- ggplot(sub_res_df, aes(x=logFC, y=-log10(pvalue), color=diff, label=!!sym(feature))) + + geom_point(size=4) + scale_color_manual(values=c("TRUE"="cyan2", "FALSE"="red")) + geom_hline(yintercept = -log10(0.05), linetype = "dashed") + ggrepel::geom_text_repel() + @@ -563,7 +561,7 @@ volcano_plots <- map(comp_names, function(comparison){ title = comparison, color="Significant") + publication_format ggsave(filename = glue("{output_prefix}{comparison}_volcano{assay_suffix}.png"), plot = p, device = "png", - width = 6, height = 8, units = "in", dpi = 300, path=diff_abund_out_dir) + width = 6, height = 8, units = "in", dpi = 300, path = diff_abund_out_dir) return(p) }) @@ -580,7 +578,7 @@ p <- wrap_plots(volcano_plots, ncol = 2) try( ggsave(filename = glue("{output_prefix}{feature}_volcano{assay_suffix}.png"), plot = p, device = "png", width = 16, height = fig_height, units = "in", dpi = 300, - path=diff_abund_out_dir, limitsize = FALSE) + path = diff_abund_out_dir, limitsize = FALSE) ) # Add NCBI id to feature i.e. ASV @@ -602,25 +600,37 @@ normalized_table <- as.data.frame(feature_table + 1) %>% mutate(across( where(is.numeric), log ) ) -group_means_df <- normalized_table[feature] +samples <- metadata[[samples_column]] +samplesdropped <- setdiff(x = samples, y = colnames(normalized_table)[-1]) +missing_df <- data.frame(ASV=normalized_table[[feature]], + matrix(data = NA, + nrow = nrow(normalized_table), + ncol = length(samplesdropped) + ) +) +colnames(missing_df) <- c(feature,samplesdropped) + -walk(pairwise_comp_df, function(col){ +group_levels <- metadata[, group] %>% unique() %>% sort() +group_means_df <- normalized_table[feature] +walk(group_levels, function(group_level){ - group1 <- col[1] - group2 <- col[2] - mean_col <- glue("Group.Mean_({group2}).vs.({group1})") - std_col <- glue("Group.Stdev_({group2}).vs.({group1})") + mean_col <- glue("Group.Mean_({group_level})") + std_col <- glue("Group.Stdev_({group_level})") + # Samples that belong to the current group Samples <- metadata %>% - filter(!!sym(group) %in% c(group1, group2)) %>% - pull(!!samples_column) + filter(!!sym(group) == group_level) %>% + pull(!!sym(samples_column)) + # Samples that belong to the current group that are in the normalized table + Samples <- intersect(colnames(normalized_table), Samples) - temp_df <- normalized_table %>% select(!!feature, !!Samples) %>% + temp_df <- normalized_table %>% select(!!feature, all_of(Samples)) %>% rowwise() %>% mutate(!!mean_col := mean(c_across(where(is.numeric))), !!std_col := sd(c_across(where(is.numeric))) ) %>% - select(!!feature, !!sym(mean_col), !!sym(std_col)) + select(!!feature,!!sym(mean_col), !!sym(std_col)) group_means_df <<- group_means_df %>% left_join(temp_df) @@ -631,7 +641,9 @@ walk(pairwise_comp_df, function(col){ normalized_table <- normalized_table %>% rowwise() %>% mutate(All.Mean=mean(c_across(where(is.numeric))), - All.Stdev=sd(c_across(where(is.numeric))) ) + All.Stdev=sd(c_across(where(is.numeric))) )%>% + left_join(missing_df, by = feature) %>% + select(!!feature, all_of(samples), All.Mean, All.Stdev) merged_df <- df %>% @@ -643,9 +655,9 @@ merged_df <- df %>% merged_df <- merged_df %>% select(!!sym(feature):NCBI_id) %>% - left_join(normalized_table) %>% + left_join(normalized_table, by = feature) %>% left_join(merged_df) %>% - left_join(group_means_df) %>% + left_join(group_means_df, by = feature) %>% mutate(across(where(is.numeric), ~round(.x, digits=3))) %>% mutate(across(where(is.matrix), as.numeric)) @@ -697,8 +709,4 @@ 
ggsave(filename = glue("{output_prefix}{feature}_boxplots{assay_suffix}.png"), p ) -# Error: One or both dimensions exceed the maximum (50000px). -# - Use `options(ragg.max_dim = ...)` to change the max -# Warning: May cause the R session to crash -# Execution halted message("Run completed sucessfully.") diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc2.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc2.R index 227f8fb2..389635a5 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc2.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc2.R @@ -149,7 +149,6 @@ library(ANCOMBC) library(DescTools) library(taxize) library(glue) -library(here) library(mia) library(phyloseq) library(utils) @@ -474,13 +473,25 @@ new_colnames <- map_chr(output$res_pair %>% colnames, if(str_count(colname,group) == 1){ str_replace_all(string=colname, pattern=glue("(.+)_{group}(.+)"), - replacement=glue("\\1_(\\2).vs.({ref_group})")) + replacement=glue("\\1_(\\2)v({ref_group})")) %>% + str_replace(pattern = "^lfc_", replacement = "logFC_") %>% + str_replace(pattern = "^se_", replacement = "lfcSE_") %>% + str_replace(pattern = "^W_", replacement = "Wstat_") %>% + str_replace(pattern = "^p_", replacement = "pvalue_") %>% + str_replace(pattern = "^q_", replacement = "qvalue_") + # Columns with normal two groups comparison } else if(str_count(colname,group) == 2){ str_replace_all(string=colname, pattern=glue("(.+)_{group}(.+)_{group}(.+)"), - replacement=glue("\\1_(\\2).vs.(\\3)")) + replacement=glue("\\1_(\\2)v(\\3)")) %>% + str_replace(pattern = "^lfc_", replacement = "logFC_") %>% + str_replace(pattern = "^se_", replacement = "lfcSE_") %>% + str_replace(pattern = "^W_", replacement = "Wstat_") %>% + str_replace(pattern = "^p_", replacement = "pvalue_") %>% + str_replace(pattern = "^q_", replacement = "qvalue_") + # Feature/ ASV column } else{ @@ -489,7 +500,6 @@ new_colnames <- map_chr(output$res_pair %>% colnames, } ) - # Change the column named taxon to the feature name e.g. 
ASV new_colnames[match("taxon", new_colnames)] <- feature @@ -539,20 +549,29 @@ normalized_table <- output$bias_correct_log_table %>% mutate(across(where(is.numeric), ~replace_na(.x, replace=0))) +samples <- metadata[[samples_column]] +samplesdropped <- setdiff(x = samples, y = colnames(normalized_table)[-1]) +missing_df <- data.frame(ASV=normalized_table[[feature]], + matrix(data = NA, + nrow = nrow(normalized_table), + ncol = length(samplesdropped) + ) + ) +colnames(missing_df) <- c(feature,samplesdropped) + group_means_df <- normalized_table[feature] -walk(uniq_comps, function(comp){ +walk(group_levels, function(group_level){ - group1 <- str_replace(comp, "\\((.+)\\).vs.\\((.+)\\)", "\\1") - group2 <- str_replace(comp, "\\((.+)\\).vs.\\((.+)\\)", "\\2") - mean_col <- glue("Group.Mean_({group1}).vs.({group2})") - std_col <- glue("Group.Stdev_({group1}).vs.({group2})") + mean_col <- glue("Group.Mean_({group_level})") + std_col <- glue("Group.Stdev_({group_level})") + # Samples that belong to the current group Samples <- metadata %>% - filter(!!sym(group) %in% c(group1, group2)) %>% + filter(!!sym(group) == group_level) %>% pull(!!sym(samples_column)) - + # Samples that belong to the current group that are in the normalized table Samples <- intersect(colnames(normalized_table), Samples) temp_df <- normalized_table %>% select(!!feature, all_of(Samples)) %>% @@ -570,7 +589,9 @@ walk(uniq_comps, function(comp){ normalized_table <- normalized_table %>% rowwise() %>% mutate(All.Mean=mean(c_across(where(is.numeric))), - All.Stdev=sd(c_across(where(is.numeric))) ) + All.Stdev=sd(c_across(where(is.numeric))) ) %>% + left_join(missing_df, by = feature) %>% + select(!!feature, all_of(samples), All.Mean, All.Stdev) # Append the taxonomy table to the ncbi and stats table merged_df <- df %>% @@ -598,11 +619,11 @@ message("Making volcano plots...") volcano_plots <- map(uniq_comps, function(comparison){ comp_col <- c( - glue("lfc_{comparison}"), - glue("se_{comparison}"), - glue("W_{comparison}"), - glue("p_{comparison}"), - glue("q_{comparison}"), + glue("logFC_{comparison}"), + glue("lfcSE_{comparison}"), + glue("Wstat_{comparison}"), + glue("pvalue_{comparison}"), + glue("qvalue_{comparison}"), glue("diff_{comparison}"), glue("passed_ss_{comparison}") ) @@ -614,16 +635,17 @@ volcano_plots <- map(uniq_comps, function(comparison){ pattern = "(.+)_.+", replacement = "\\1") - p <- ggplot(sub_res_df, aes(x=lfc, y=-log10(p), color=diff, label=!!sym(feature))) + + p <- ggplot(sub_res_df, aes(x=logFC, y=-log10(pvalue), color=diff, label=!!sym(feature))) + geom_point(size=4) + geom_point(size=4) + scale_color_manual(values=c("TRUE"="cyan2", "FALSE"="red")) + geom_hline(yintercept = -log10(0.05), linetype = "dashed") + ggrepel::geom_text_repel() + labs(x="logFC", y="-log10(Pvalue)", title = comparison, color="Significant") + publication_format - + + ggsave(filename = glue("{output_prefix}{comparison}_volcano{assay_suffix}.png"), plot = p, device = "png", - width = 6, height = 8, units = "in", dpi = 300, path=diff_abund_out_dir) + width = 6, height = 8, units = "in", dpi = 300, path = diff_abund_out_dir) return(p) @@ -664,7 +686,7 @@ boxplots <- map(res_df[[feature]], function(feature){ legend.title = element_text(face = "bold", size=12)) ggsave(filename = glue("{output_prefix}{feature}_boxplot{assay_suffix}.png"), plot = p, device = "png", - width = 8, height = 5, units = "in", dpi = 300, path =diff_abund_out_dir) + width = 8, height = 5, units = "in", dpi = 300, path = diff_abund_out_dir) return(p) }) diff 
--git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/run_deseq2.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/run_deseq2.R index d534ab18..0de06145 100755 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/run_deseq2.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/run_deseq2.R @@ -1,319 +1,319 @@ -#!/usr/bin/env Rscript -############################################################################### -# AUTHOR : OLABIYI ADEREMI OBAYOMI -# DESCRIPTION: A script to generate taxonomy plots at different taxonomy levels -# E-mail: obadbotanist@yahoo.com -# Created: November 2024 -# example: Rscript taxonomy_plots.R \ -# --metadata-table 'mapping/GLDS-487_amplicon_v1_runsheet.csv' \ -# --feature-table 'data/counts_GLAmpSeq.tsv' \ -# --taxonomy-table 'data/taxonomy_GLAmpSeq.tsv' \ -# --group 'groups' \ -# --samples-column 'Sample Name' -############################################################################### - -library(optparse) - - - -######## -------- Get input variables from the command line ----############## - -version <- 1.0 - -# Input options -option_list <- list( - - make_option(c("-m", "--metadata-table"), type="character", default=NULL, - help="path to a comma separated samples metadata file with the - group/treatment to be analyzed.", - metavar="path"), - - make_option(c("-f", "--feature-table"), type="character", default=NULL, - help="path to a tab separated samples feature table - i.e. ASV or OTU table.", - metavar="path"), - - make_option(c("-t", "--taxonomy-table"), type="character", default=NULL, - help="path to feature taxonomy table i.e. ASV taxonomy table.", - metavar="path"), - - make_option(c("-g", "--group"), type="character", default="groups", - help="Column in metadata to be analyzed", - metavar="groups"), - - make_option(c("-s", "--samples-column"), type="character", default="Sample Name", - help="Column in metadata containing the sample names in the feature table. \ - Deafault: 'Sample Name' ", - metavar="Sample Name"), - - make_option(c("-o", "--output-prefix"), type="character", default="", - help="Unique name to tag onto output files. Default: empty string.", - metavar=""), - - make_option(c("-c", "--abundance-cutoff"), type="numeric", default=0.2, - help="A fraction defining how abundant features most be to be \ - analyzes. Default: 1/5. ", - metavar="0.2"), - - make_option(c("-r", "--remove-rare"), type="logical", default=FALSE, - help="Should rare features be filtered out?. \ - Default: FALSE. 
", action= "store_true", - metavar="FALSE"), - - make_option(c("-y", "--assay-suffix"), type="character", default="_GLAmpSeq", - help="Genelab assay suffix.", metavar="GLAmpSeq"), - - make_option(c("--version"), action = "store_true", type="logical", - default=FALSE, - help="Print out version number and exit.", metavar = "boolean") -) - - -opt_parser <- OptionParser( - option_list=option_list, - usage = "Rscript %prog \\ - --metadata-table 'mapping/GLDS-487_amplicon_v1_runsheet.csv' \\ - --feature-table 'data/counts_GLAmpSeq.tsv' \\ - --taxonomy-table 'data/taxonomy_GLAmpSeq.tsv' \\ - --group 'groups' \\ - --samples-column 'Sample Name' ", - description = paste("Author: Olabiyi Aderemi Obayomi", - "\nEmail: olabiyi.a.obayomi@nasa.gov", - "\nA script to generate taxonomy plots at different taxonomy levels.", - "\nIt outputs sample and group taxonomy plots ", sep="") -) - - - - -opt <- parse_args(opt_parser) - - -if (opt$version) { - cat("taxonomy_plots.R version: ", version, "\n") - options_tmp <- options(show.error.messages=FALSE) - on.exit(options(options_tmp)) - stop() -} - - - -if(is.null(opt[["metadata-table"]])) { - stop("Path to a metadata file must be set.") -} - -if(is.null(opt[["feature-table"]])) { - stop("Path to a feature table e.g. ASV table file must be set.") -} - - -if(is.null(opt[["taxonomy-table"]])) { - stop("Path to a metadata file must be set.") -} - -if(opt[["group"]] == "groups") { - message("Alpha diversity will be run on the default 'groups' column \n") -} - -if(opt[["samples-column"]] == "Sample Name") { - message("I will assume that the sample names are in a column named 'Sample Name' \n") -} -library(tidyverse) -library(dendextend) -library(DESeq2) - - - -# ------ Collecting the required input variables ---------- # - -# Group in metadata to analyze -group <- opt[["group"]] # "groups" -samples_column <- opt[["samples-column"]] # "Sample Name" -threads <- opt[["cpus"]] # 8 -metadata_file <- opt[["metadata-table"]] -taxonomy_file <- opt[["taxonomy-table"]] -feature_table_file <- opt[["feature-table"]] -feature <- opt[["feature-type"]] # "ASV" -output_prefix <- opt[["output-prefix"]] -assay_suffix <- opt[["assay-suffix"]] - -# taxon / ASV prevalence cutoff -prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%) -# sample / library read count cutoff -library_cutoff <- opt[["library-cutoff"]] # 100 -diff_abund_out_dir <- "differential_abundance/" -if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir) - - -de_out_dir <- file.path(plots_dir, "da") -abundance_out_dir <- file.path(de_out_dir, "differential_abundance") -volcano_out_dir <- file.path(de_out_dir, "volcano") - - - -# ------------------------ Read metadata ---------------------------------- # -metadata <- read_csv(metadata_file) %>% as.data.frame() -rownames(metadata) <- metadata[[samples_column]] - - - -# -------------------------- Read Feature table -------------------------- # -feature_table <- read_delim(file = feature_table_file) %>% as.data.frame() - -# Set the feature id column as the row names of the feature table -# This assumes that the first column contains the feature ids e.g. 
ASV ID -rownames(feature_table) <- feature_table[,1] -feature_names <- feature_table[,1] -# Drop the feature column -feature_table <- feature_table[, -1] %>% as.data.frame() -rownames(feature_table) <- feature_names - - -# ------------------------ Read Taxonomy table ---------------------------- # -taxonomy <- read_delim(file = taxonomy_file) %>% as.data.frame() -# Set the feature id column as the row names of the taxonomy table -# This assumes that the first column contains the feature ids e.g. ASV ID -rownames(taxonomy) <- taxonomy[,1] -taxonomy_table <- taxonomy[, -1] -feature_names <- rownames(taxonomy_table) -taxonomy_table <- process_taxonomy(taxonomy_table) -rownames(taxonomy_table) <- feature_names - -print(glue("There are {sum(taxonomy_table$domain == 'Other')} features without - taxonomy assignments. Dropping them ...")) - -# Dropping features that couldn't be assigned taxonomy -taxonomy_table <- taxonomy_table[-which(taxonomy_table$domain == 'Other'),] - -# Get long asv taxonomy names and clean -species <- taxonomy_table %>% - unite(species,domain:species,sep = ";") %>% # Generalize this line -------- - pull %>% str_replace_all("Other", "_") - -taxonomy_table <- fix_names(taxonomy_table, "Other", ";_") - -taxonomy_table[,"species"] <- species - - -# ---------------------- Subset tables ------------------------------------- # - -# Get features common to the taxonomy and feature table -common_ids <- intersect(rownames(feature_table), rownames(taxonomy_table)) - -# Subset the feature and taxonomy tables to contain -# only features found in both table -feature_table <- feature_table[common_ids,] -taxonomy_table <- taxonomy_table[common_ids,] - - - - - -# 6 Statistically testing for differences - -#### pairwise comparisons -unique_groups <- unique(runsheet$groups) -deseq_obj <- phyloseq_to_deseq2(physeq = ASV_physeq, design = ~groups) - -# add pseudocount if any 0 count samples are present -if (sum(colSums(counts(deseq_obj)) == 0) > 0) { - count_data <- counts(deseq_obj) + 1 - - count_data <- as.matrix(apply(count_data, 2, as.integer)) - rownames(count_data) <- rownames(counts(deseq_obj)) - colnames(count_data) <- colnames(counts(deseq_obj)) - counts(deseq_obj) <- count_data -} -# https://rdrr.io/bioc/phyloseq/src/inst/doc/phyloseq-mixture-models.R -deseq_modeled <- tryCatch({ - # Attempt to run DESeq - DESeq(deseq_obj) -}, error = function(e) { - message("Error encountered in DESeq, applying alternative method for size factor estimation...") - - # Define the geometric mean function - gm_mean = function(x, na.rm=TRUE) { - exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) - } - geoMeans = apply(counts(deseq_obj), 1, gm_mean) - - # Apply the alternative size factor estimation method - deseq_obj <- estimateSizeFactors(deseq_obj, geoMeans=geoMeans) - - # Call DESeq again with alternative geom mean size est - DESeq(deseq_obj) -}) - -# save final differential abundance counts, individual group comparison results - -write.table(counts(deseq_modeled, normalized=TRUE), - file = file.path(de_out_dir, paste0(output_prefix, - "normalized_counts", - assay_suffix, ".tsv")), - sep="\t", row.names=TRUE, quote=FALSE) -# make the volcanoplot -plot_comparison <- function(group1, group2) { - plot_width_inches = 11.1 - plot_height_inches = 8.33 - - deseq_res <- results(deseq_modeled, contrast = c("groups", group1, group2)) - norm_tab <- counts(deseq_modeled, normalized = TRUE) %>% data.frame() - - volcano_data <- as.data.frame(deseq_res) - - p_val <- 0.1 - volcano_data <- 
volcano_data[!is.na(volcano_data$padj), ] - volcano_data$significant <- volcano_data$padj <= p_val #also logfc cutoff? - - ######Long x-axis label adjustments########## - x_label <- paste("Log2 Fold Change\n(",group1," vs ",group2,")") - label_length <- nchar(x_label) - max_allowed_label_length = plot_width_inches * 10 - - # Construct x-axis label with new line breaks if was too long - if (label_length > max_allowed_label_length){ - x_label <- paste("Log2 Fold Change\n\n(", group1, "\n vs \n", group2, ")", sep="") - } - ####################################### - - # ASVs promoted in space on right, reduced on left - p <- ggplot(volcano_data, aes(x=log2FoldChange, y=-log10(padj), color=significant)) + - geom_point(alpha=0.7, size=2) + - scale_color_manual(values=c("black", "red"), - labels=c(paste0("padj > ", p_val), - paste0("padj \u2264 ", p_val))) + - theme_bw() + - labs(title="Volcano Plot", - x=x_label, - y="-Log10 P-value", - color=paste0("")) + - theme(legend.position="top") - - # label points and plot - top_points <- volcano_data %>% - arrange(padj) %>% - filter(significant) %>% - head(10) - - volcano_plot <- p + geom_text_repel(data=top_points, aes(label=row.names(top_points)), size=3) - ggsave(filename=file.path(volcano_out_dir, paste0(output_prefix, - "volcano_", - gsub(" ", "_", group1), - "_vs_", - gsub(" ", "_", group2), ".png")), - plot=volcano_plot, - width = plot_width_inches, height = plot_height_inches, dpi = 300) - - write.csv(deseq_res, file = file.path(abundance_out_dir, - paste0(output_prefix, - gsub(" ", "_", group1), - "_vs_", gsub(" ", "_", group2), - ".csv"))) -} - - -# setting up pairwise comparisons and running -comparisons <- expand.grid(group1 = unique_groups, group2 = unique_groups) -comparisons <- subset(comparisons, group1 != group2) - -apply(comparisons, 1, function(pair) plot_comparison(pair['group1'], pair['group2'])) +#!/usr/bin/env Rscript +############################################################################### +# AUTHOR : OLABIYI ADEREMI OBAYOMI +# DESCRIPTION: A script to generate taxonomy plots at different taxonomy levels +# E-mail: obadbotanist@yahoo.com +# Created: November 2024 +# example: Rscript taxonomy_plots.R \ +# --metadata-table 'mapping/GLDS-487_amplicon_v1_runsheet.csv' \ +# --feature-table 'data/counts_GLAmpSeq.tsv' \ +# --taxonomy-table 'data/taxonomy_GLAmpSeq.tsv' \ +# --group 'groups' \ +# --samples-column 'Sample Name' +############################################################################### + +library(optparse) + + + +######## -------- Get input variables from the command line ----############## + +version <- 1.0 + +# Input options +option_list <- list( + + make_option(c("-m", "--metadata-table"), type="character", default=NULL, + help="path to a comma separated samples metadata file with the + group/treatment to be analyzed.", + metavar="path"), + + make_option(c("-f", "--feature-table"), type="character", default=NULL, + help="path to a tab separated samples feature table + i.e. ASV or OTU table.", + metavar="path"), + + make_option(c("-t", "--taxonomy-table"), type="character", default=NULL, + help="path to feature taxonomy table i.e. ASV taxonomy table.", + metavar="path"), + + make_option(c("-g", "--group"), type="character", default="groups", + help="Column in metadata to be analyzed", + metavar="groups"), + + make_option(c("-s", "--samples-column"), type="character", default="Sample Name", + help="Column in metadata containing the sample names in the feature table. 
\ + Deafault: 'Sample Name' ", + metavar="Sample Name"), + + make_option(c("-o", "--output-prefix"), type="character", default="", + help="Unique name to tag onto output files. Default: empty string.", + metavar=""), + + make_option(c("-c", "--abundance-cutoff"), type="numeric", default=0.2, + help="A fraction defining how abundant features most be to be \ + analyzes. Default: 1/5. ", + metavar="0.2"), + + make_option(c("-r", "--remove-rare"), type="logical", default=FALSE, + help="Should rare features be filtered out?. \ + Default: FALSE. ", action= "store_true", + metavar="FALSE"), + + make_option(c("-y", "--assay-suffix"), type="character", default="_GLAmpSeq", + help="Genelab assay suffix.", metavar="GLAmpSeq"), + + make_option(c("--version"), action = "store_true", type="logical", + default=FALSE, + help="Print out version number and exit.", metavar = "boolean") +) + + +opt_parser <- OptionParser( + option_list=option_list, + usage = "Rscript %prog \\ + --metadata-table 'mapping/GLDS-487_amplicon_v1_runsheet.csv' \\ + --feature-table 'data/counts_GLAmpSeq.tsv' \\ + --taxonomy-table 'data/taxonomy_GLAmpSeq.tsv' \\ + --group 'groups' \\ + --samples-column 'Sample Name' ", + description = paste("Author: Olabiyi Aderemi Obayomi", + "\nEmail: olabiyi.a.obayomi@nasa.gov", + "\nA script to generate taxonomy plots at different taxonomy levels.", + "\nIt outputs sample and group taxonomy plots ", sep="") +) + + + + +opt <- parse_args(opt_parser) + + +if (opt$version) { + cat("taxonomy_plots.R version: ", version, "\n") + options_tmp <- options(show.error.messages=FALSE) + on.exit(options(options_tmp)) + stop() +} + + + +if(is.null(opt[["metadata-table"]])) { + stop("Path to a metadata file must be set.") +} + +if(is.null(opt[["feature-table"]])) { + stop("Path to a feature table e.g. 
ASV table file must be set.") +} + + +if(is.null(opt[["taxonomy-table"]])) { + stop("Path to a metadata file must be set.") +} + +if(opt[["group"]] == "groups") { + message("Alpha diversity will be run on the default 'groups' column \n") +} + +if(opt[["samples-column"]] == "Sample Name") { + message("I will assume that the sample names are in a column named 'Sample Name' \n") +} +library(tidyverse) +library(dendextend) +library(DESeq2) + + + +# ------ Collecting the required input variables ---------- # + +# Group in metadata to analyze +group <- opt[["group"]] # "groups" +samples_column <- opt[["samples-column"]] # "Sample Name" +threads <- opt[["cpus"]] # 8 +metadata_file <- opt[["metadata-table"]] +taxonomy_file <- opt[["taxonomy-table"]] +feature_table_file <- opt[["feature-table"]] +feature <- opt[["feature-type"]] # "ASV" +output_prefix <- opt[["output-prefix"]] +assay_suffix <- opt[["assay-suffix"]] + +# taxon / ASV prevalence cutoff +prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%) +# sample / library read count cutoff +library_cutoff <- opt[["library-cutoff"]] # 100 +diff_abund_out_dir <- "differential_abundance/" +if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir) + + +de_out_dir <- file.path(plots_dir, "da") +abundance_out_dir <- file.path(de_out_dir, "differential_abundance") +volcano_out_dir <- file.path(de_out_dir, "volcano") + + + +# ------------------------ Read metadata ---------------------------------- # +metadata <- read_csv(metadata_file) %>% as.data.frame() +rownames(metadata) <- metadata[[samples_column]] + + + +# -------------------------- Read Feature table -------------------------- # +feature_table <- read_delim(file = feature_table_file) %>% as.data.frame() + +# Set the feature id column as the row names of the feature table +# This assumes that the first column contains the feature ids e.g. ASV ID +rownames(feature_table) <- feature_table[,1] +feature_names <- feature_table[,1] +# Drop the feature column +feature_table <- feature_table[, -1] %>% as.data.frame() +rownames(feature_table) <- feature_names + + +# ------------------------ Read Taxonomy table ---------------------------- # +taxonomy <- read_delim(file = taxonomy_file) %>% as.data.frame() +# Set the feature id column as the row names of the taxonomy table +# This assumes that the first column contains the feature ids e.g. ASV ID +rownames(taxonomy) <- taxonomy[,1] +taxonomy_table <- taxonomy[, -1] +feature_names <- rownames(taxonomy_table) +taxonomy_table <- process_taxonomy(taxonomy_table) +rownames(taxonomy_table) <- feature_names + +print(glue("There are {sum(taxonomy_table$domain == 'Other')} features without + taxonomy assignments. 
Dropping them ...")) + +# Dropping features that couldn't be assigned taxonomy +taxonomy_table <- taxonomy_table[-which(taxonomy_table$domain == 'Other'),] + +# Get long asv taxonomy names and clean +species <- taxonomy_table %>% + unite(species,domain:species,sep = ";") %>% # Generalize this line -------- + pull %>% str_replace_all("Other", "_") + +taxonomy_table <- fix_names(taxonomy_table, "Other", ";_") + +taxonomy_table[,"species"] <- species + + +# ---------------------- Subset tables ------------------------------------- # + +# Get features common to the taxonomy and feature table +common_ids <- intersect(rownames(feature_table), rownames(taxonomy_table)) + +# Subset the feature and taxonomy tables to contain +# only features found in both table +feature_table <- feature_table[common_ids,] +taxonomy_table <- taxonomy_table[common_ids,] + + + + + +# 6 Statistically testing for differences + +#### pairwise comparisons +unique_groups <- unique(runsheet$groups) +deseq_obj <- phyloseq_to_deseq2(physeq = ASV_physeq, design = ~groups) + +# add pseudocount if any 0 count samples are present +if (sum(colSums(counts(deseq_obj)) == 0) > 0) { + count_data <- counts(deseq_obj) + 1 + + count_data <- as.matrix(apply(count_data, 2, as.integer)) + rownames(count_data) <- rownames(counts(deseq_obj)) + colnames(count_data) <- colnames(counts(deseq_obj)) + counts(deseq_obj) <- count_data +} +# https://rdrr.io/bioc/phyloseq/src/inst/doc/phyloseq-mixture-models.R +deseq_modeled <- tryCatch({ + # Attempt to run DESeq + DESeq(deseq_obj) +}, error = function(e) { + message("Error encountered in DESeq, applying alternative method for size factor estimation...") + + # Define the geometric mean function + gm_mean = function(x, na.rm=TRUE) { + exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) + } + geoMeans = apply(counts(deseq_obj), 1, gm_mean) + + # Apply the alternative size factor estimation method + deseq_obj <- estimateSizeFactors(deseq_obj, geoMeans=geoMeans) + + # Call DESeq again with alternative geom mean size est + DESeq(deseq_obj) +}) + +# save final differential abundance counts, individual group comparison results + +write.table(counts(deseq_modeled, normalized=TRUE), + file = file.path(de_out_dir, paste0(output_prefix, + "normalized_counts", + assay_suffix, ".tsv")), + sep="\t", row.names=TRUE, quote=FALSE) +# make the volcanoplot +plot_comparison <- function(group1, group2) { + plot_width_inches = 11.1 + plot_height_inches = 8.33 + + deseq_res <- results(deseq_modeled, contrast = c("groups", group1, group2)) + norm_tab <- counts(deseq_modeled, normalized = TRUE) %>% data.frame() + + volcano_data <- as.data.frame(deseq_res) + + p_val <- 0.1 + volcano_data <- volcano_data[!is.na(volcano_data$padj), ] + volcano_data$significant <- volcano_data$padj <= p_val #also logfc cutoff? 
+ + ######Long x-axis label adjustments########## + x_label <- paste("Log2 Fold Change\n(",group1," vs ",group2,")") + label_length <- nchar(x_label) + max_allowed_label_length = plot_width_inches * 10 + + # Construct x-axis label with new line breaks if was too long + if (label_length > max_allowed_label_length){ + x_label <- paste("Log2 Fold Change\n\n(", group1, "\n vs \n", group2, ")", sep="") + } + ####################################### + + # ASVs promoted in space on right, reduced on left + p <- ggplot(volcano_data, aes(x=log2FoldChange, y=-log10(padj), color=significant)) + + geom_point(alpha=0.7, size=2) + + scale_color_manual(values=c("black", "red"), + labels=c(paste0("padj > ", p_val), + paste0("padj \u2264 ", p_val))) + + theme_bw() + + labs(title="Volcano Plot", + x=x_label, + y="-Log10 P-value", + color=paste0("")) + + theme(legend.position="top") + + # label points and plot + top_points <- volcano_data %>% + arrange(padj) %>% + filter(significant) %>% + head(10) + + volcano_plot <- p + geom_text_repel(data=top_points, aes(label=row.names(top_points)), size=3) + ggsave(filename=file.path(volcano_out_dir, paste0(output_prefix, + "volcano_", + gsub(" ", "_", group1), + "_vs_", + gsub(" ", "_", group2), ".png")), + plot=volcano_plot, + width = plot_width_inches, height = plot_height_inches, dpi = 300) + + write.csv(deseq_res, file = file.path(abundance_out_dir, + paste0(output_prefix, + gsub(" ", "_", group1), + "_vs_", gsub(" ", "_", group2), + ".csv"))) +} + + +# setting up pairwise comparisons and running +comparisons <- expand.grid(group1 = unique_groups, group2 = unique_groups) +comparisons <- subset(comparisons, group1 != group2) + +apply(comparisons, 1, function(pair) plot_comparison(pair['group1'], pair['group2'])) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf index 06df549b..e709df06 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf @@ -24,20 +24,20 @@ if (params.help) { println(" > nextflow run main.nf -resume -profile conda --accession GLDS-487 --target_region 16S --conda.qc ") println() println("Required arguments:") - println("""-profile [STRING] What profile should be used to run the workflow. Options are [singularity, docker, conda, slurm]. - singularity, docker and conda will run the pipelne locally using singularity, docker, and conda, respectively. - To combine profiles, pass them together separated by comma. For example, to run jobs using slurm in singularity containers use 'slurm,singularity' . """) - println("--input_file [PATH] A 4-column (single-end) or 5-column (paired-end) input file (sample_id, forward, [reverse,] paired, groups). 
Mandatory if a GLDS accession is not provided.") - println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.") - println(" The sample_id column should contain unique sample ids.") - println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.") - println(" The paired column should be true for paired-end or anything else for single-end reads.") - println(" The groups column contain group levels / treatments to be compared during diversity and differential abundance testing analysis.") - println("--target_region [STRING] What is the amplicon target region to be analyzed. Options are one of [16S, 18S, ITS]. Default: 16S.") - println("--trim_primers [BOOLEAN] Should primers be trimmed? true or false. Default: true.") + println(""" -profile [STRING] What profile should be used to run the workflow. Options are [singularity, docker, conda, slurm]. + singularity, docker and conda will run the pipelne locally using singularity, docker, and conda, respectively. + To combine profiles, pass them together separated by comma. For example, to run jobs using slurm in singularity containers use 'slurm,singularity' . """) + println(" --input_file [PATH] A 4-column (single-end) or 5-column (paired-end) input file (sample_id, forward, [reverse,] paired, groups). Mandatory if a GLDS or OSD accession is not provided.") + println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.") + println(" The sample_id column should contain unique sample ids.") + println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.") + println(" The paired column should be true for paired-end or anything else for single-end reads.") + println(" The groups column contain group levels / treatments to be compared during diversity and differential abundance testing analysis.") + println(" --target_region [STRING] What is the amplicon target region to be analyzed. Options are one of [16S, 18S, ITS]. Default: 16S.") + println(" --trim_primers [BOOLEAN] Should primers be trimmed? true or false. Default: true.") println("PLEASE NOTE: This workflow assumes that all your raw reads end with the same suffix. If they don't please modify your filenames to have the same suffix as shown below.") - println("--raw_R1_suffix [STRING] Raw forward reads suffix (region following the unique part of the sample names). e.g. _R1_raw.fastq.gz.") - println("--raw_R2_suffix [STRING] Raw reverse reads suffix (region following the unique part of the sample names). e.g. _R2_raw.fastq.gz.") + println(" --raw_R1_suffix [STRING] Raw forward reads suffix (region following the unique part of the sample names). e.g. _R1_raw.fastq.gz.") + println(" --raw_R2_suffix [STRING] Raw reverse reads suffix (region following the unique part of the sample names). e.g. _R2_raw.fastq.gz.") println() println("Cutadapt (trimming) parameters:") println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG. Default: emptry string.") @@ -50,8 +50,6 @@ if (params.help) { println(" --help Print this help message and exit.") println(" --publishDir_mode [STRING] How should nextflow publish file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir. Default: link.") println(" --errorStrategy [STRING] How should nextflow handle errors. 
Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: terminate") - println(" --enable_visualizations [BOOLEAN] Should ASV plots be made? true or false. if true supply a path to the ruhnsheet for plotting to the --runsheet option. Default: false.") - //println(" --runsheet [PATH] A 4-column file with these exact headers [Sample Name, read1_path, raw_R1_suffix, groups] for plotting. Only relevant if --enable_visualizations is true. Default: null.") println(" --multiqc_config [PATH] Path to a custome multiqc config file. Default: config/multiqc.config.") println() println("Dada2 parameters passed to filterAndTrim() function:") @@ -95,6 +93,7 @@ if (params.help) { println(" --conda.genelab [PATH] Path to a conda environment containing genlab-utils. Default: null.") println(" --conda.cutadapt [PATH] Path to a conda environment containing cutadapt. Default: null.") println(" --conda.diversity [PATH] Path to a conda environment containing R packages required for diversity and differential abundance testing. Default: null.") + println() print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number of cpus, memory per task etc.") exit 0 } @@ -106,7 +105,7 @@ log.info """ You have set the following parameters: Input csv file : ${params.input_file} - GLDS_accession : ${params.accession} + GLDS or OSD accession : ${params.accession} Amplicon target region : ${params.target_region} Nextflow Directory publishing mode: ${params.publishDir_mode} Trim Primers: ${params.trim_primers} @@ -171,7 +170,7 @@ include { FASTQC as RAW_FASTQC ; MULTIQC as RAW_MULTIQC } from './modules/quali include { CUTADAPT; COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE } from './modules/quality_assessment.nf' include { FASTQC as TRIMMED_FASTQC ; MULTIQC as TRIMMED_MULTIQC } from './modules/quality_assessment.nf' -// Cluster ASvs +// Cluster ASVs include { RUN_R_TRIM; RUN_R_NOTRIM } from './modules/run_dada.nf' include { ZIP_BIOM } from './modules/zip_biom.nf' @@ -313,7 +312,7 @@ workflow { ZIP_BIOM.out.version | mix(software_versions_ch) | set{software_versions_ch} - // Diversity, diffrential abundance testing and their corresponding visualizations + // Diversity, differential abundance testing and their corresponding visualizations if(params.accession){ meta = Channel.of(["samples": "Sample Name", "group" : "groups", diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf index 6d1cb2a9..bff7e497 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf @@ -37,15 +37,11 @@ process ANCOMBC { --assay-suffix '${meta.assay_suffix}' \\ --output-prefix '${meta.output_prefix}' \\ --cpus ${task.cpus} - - Rscript -e "VERSION=sprintf('ANCOMBC %s', packageVersion('ANCOMBC')); \\ - write(x=VERSION, file='versions.txt', append=TRUE)" - Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nANCOMBC %s\\nhere %s\\nphyloseq %s\\nmia %s\\ntaxize %s\\nDescTools %s\\npatchwork %s\\nggrepel %s\\n', \\ + Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nANCOMBC %s\\nphyloseq %s\\nmia %s\\ntaxize %s\\nDescTools %s\\npatchwork %s\\nggrepel %s\\n', \\ packageVersion('tidyverse'), \\ packageVersion('glue'), \\ packageVersion('ANCOMBC'), \\ - packageVersion('here'), \\ 
packageVersion('phyloseq'), \\
                     packageVersion('mia'), \\
                     packageVersion('taxize'), \\

From 8a7bbaaddeae38e107d1148a814efc0d1931cb05 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Mon, 18 Nov 2024 15:33:09 -0800
Subject: [PATCH 02/24] set ANCOMBC image to 2.6.0

---
 .../NF_AmpIllumina-B/workflow_code/nextflow.config | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
index af7ee721..fa505fd4 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
@@ -194,6 +194,12 @@ process {
         publishDir = [path: "${params.final_outputs_dir}${params.output_prefix}", mode: params.publishDir_mode]
     }
 
+    withName: ANCOMBC {
+
+        container = "quay.io/nasa_genelab/ancombc:2.6.0"
+
+    }
+
 }

From a4d15050c52fbfa920e73278591b982497ce2823 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Wed, 27 Nov 2024 15:28:09 -0800
Subject: [PATCH 03/24] added run workflow script

---
 .../NF_AmpIllumina-B/README.md                |  68 +-
 .../NF_AmpIllumina-B/workflow_code/main.nf    |  38 +-
 .../workflow_code/nextflow.config             |  24 +-
 .../workflow_code/run_workflow.py             | 696 ++++++++++++++++++
 .../workflow_code/slurm_submit.slurm          |   6 +-
 5 files changed, 802 insertions(+), 30 deletions(-)
 create mode 100644 Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py

diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md
index e8f0f4e8..20fc479c 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md
@@ -16,17 +16,23 @@ The current GeneLab Illumina amplicon sequencing data processing pipeline (AmpIl
 
 3. [Fetch Singularity Images](#3-fetch-singularity-images)
 
-4. [Run the workflow](#4-run-the-workflow)
+4. [Run the workflow directly with nextflow](#4-run-the-workflow-directly-with-nextflow)
    4a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input)
    4b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input)
    4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments)
    4d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file)
 
-5. [Workflow outputs](#5-workflow-outputs)
-   5a. [Main outputs](#5a-main-outputs)
-   5b. [Resource logs](#5b-resource-logs)
+5. [Run the workflow indirectly using the python wrapper script](#5-run-the-workflow-indirectly-using-the-python-wrapper-script)
+   5a. [Approach 1: Use an OSD or Genelab accession as input](#5a-approach-1-use-an-osd-or-genelab-accession-as-input)
+   5b. [Approach 2: Use a csv file as input to the workflow](#5b-approach-2-use-a-csv-file-as-input-to-the-workflow)
+   5c. 
[Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run](#5c-approach-3-use-a-csv-file-as-input-to-the-workflow-and-supply-extra-arguments-to-nextflow-run)
+   5d. [Approach 4: Just create an edited nextflow.config file but don't run the workflow](#5d-approach-4-just-create-an-edited-nextflowconfig-file-but-dont-run-the-workflow)
+
+6. [Workflow outputs](#6-workflow-outputs)
+   6a. [Main outputs](#6a-main-outputs)
+   6b. [Resource logs](#6b-resource-logs)
+
+7. [Post Processing](#7-post-processing)
@@ -96,7 +102,7 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity
 
 ---
 
-### 4. Run the Workflow
+### 4. Run the workflow directly with nextflow
 
 For options and detailed help on how to run the workflow, run the following command:
 
@@ -165,13 +171,55 @@ Once you've downloaded the workflow template, you can modify the parameters in t
 
 ---
 
-### 5. Workflow outputs
+### 5. Run the workflow indirectly using the python wrapper script
+
+For options and detailed help on how to run the workflow using the script, run the following command:
+
+```bash
+python run_workflow.py
+```
+
+#### 5a. Approach 1: Use an OSD or Genelab accession as input
+
+```bash
+python run_workflow.py --run --target-region 16S --accession GLDS-487 --profile slurm,singularity
+```
+
+#### 5b. Approach 2: Use a csv file as input to the workflow
+
+```bash
+python run_workflow.py --run --target-region 16S --input-file PE_file.csv --F-primer AGAGTTTGATCCTGGCTCAG --R-primer CTGCCTCCCGTAGGAGT --profile singularity
+```
+
+#### 5c. Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run
+
+Here we want to monitor our jobs with Nextflow Tower.
+
+```bash
+export TOWER_ACCESS_TOKEN=
+export TOWER_WORKSPACE_ID=
+python run_workflow.py --run --target-region 16S --input-file PE_file.csv --F-primer AGAGTTTGATCCTGGCTCAG --R-primer CTGCCTCCCGTAGGAGT --profile slurm,conda --extra 'with-tower'
+```
+
+#### 5d. Approach 4: Just create an edited nextflow.config file but don't run the workflow
+
+```bash
+python run_workflow.py --target-region 16S --accession GLDS-487 --profile slurm,singularity
+```
+
+> Note: When using the wrapper script, all outputs generated by the workflow will be in a directory specified by the `--output-dir` parameter. This will be a directory named `./workflow_output/` by default.
+
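+> For example, assuming the edited config produced by Approach 4 is written to `nextflow.config` in the launch directory (Nextflow automatically loads a `nextflow.config` found there), a possible follow-up is to review the file and then start the run yourself:
+
+```bash
+# check the generated configuration, then launch the workflow with it
+less nextflow.config
+nextflow run main.nf -resume -profile slurm,singularity
+```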
+ +--- + +### 6. Workflow outputs -#### 5a. Main outputs +#### 6a. Main outputs The outputs from this pipeline are documented in the [GL-DPPD-7104-B](../../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md) processing protocol. -#### 5b. Resource logs +#### 6b. Resource logs Standard nextflow resource usage logs are also produced as follows: @@ -186,7 +234,7 @@ Standard nextflow resource usage logs are also produced as follows: --- -### 6. Post Processing +### 7. Post Processing For options and detailed help on how to run the post-processing workflow, run the following command: diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf index e709df06..c96d638d 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf @@ -27,7 +27,7 @@ if (params.help) { println(""" -profile [STRING] What profile should be used to run the workflow. Options are [singularity, docker, conda, slurm]. singularity, docker and conda will run the pipelne locally using singularity, docker, and conda, respectively. To combine profiles, pass them together separated by comma. For example, to run jobs using slurm in singularity containers use 'slurm,singularity' . """) - println(" --input_file [PATH] A 4-column (single-end) or 5-column (paired-end) input file (sample_id, forward, [reverse,] paired, groups). Mandatory if a GLDS or OSD accession is not provided.") + println(" --input_file [PATH] A 4-column (single-end) or 5-column (paired-end) input file (sample_id, forward, [reverse,] paired, groups). Mandatory if a GLDS or OSD accession is not provided. Default: null") println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.") println(" The sample_id column should contain unique sample ids.") println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.") @@ -40,11 +40,11 @@ if (params.help) { println(" --raw_R2_suffix [STRING] Raw reverse reads suffix (region following the unique part of the sample names). e.g. _R2_raw.fastq.gz.") println() println("Cutadapt (trimming) parameters:") - println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG. Default: emptry string.") - println(" --R_primer [STRING] Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT. Default: emptry string.") + println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG. Default: null.") + println(" --R_primer [STRING] Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT. Default: null.") println(" --min_cutadapt_len [INTEGER] What should be the minimum read length after quality trimming with cutadapt. Default: 130.") - println(" --primers_linked [STRING] Are the primers linked?. https://cutadapt.readthedocs.io/en/stable/recipes.html#trimming-amplicon-primers-from-paired-end-reads. Default: TRUE. ") - println(" --discard_untrimmed [STRING] Should untrimmed reads be discarded? Any supplied string except TRUE will not discard them. Default: TRUE.") + println(" --primers_linked [STRING] Are the primers linked?. https://cutadapt.readthedocs.io/en/stable/recipes.html#trimming-amplicon-primers-from-paired-end-reads. Default: 'TRUE'. ") + println(" --discard_untrimmed [STRING] Should untrimmed reads be discarded? Any supplied string except TRUE will not discard them. 
Default: 'TRUE'.")
     println()
     println("Optional arguments:")
     println("  --help  Print this help message and exit.")
     println("  --publishDir_mode [STRING] How should nextflow publish file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir. Default: link.")
@@ -83,7 +83,7 @@ if (params.help) {
     println("  --final_outputs_dir [PATH] Where should most outputs and summary reports be stored. Default: ../workflow_output/Final_Outputs/")
     println()
     println("Genelab specific arguements:")
-    println("  --accession [STRING] A Genelab accession number if the --input_file parameter is not set. If this parameter is set, it will ignore the --input_file parameter.")
+    println("  --accession [STRING] A Genelab accession number if the --input_file parameter is not set. If this parameter is set, it will ignore the --input_file parameter. Default: null")
     println("  --assay_suffix [STRING]  Genelabs assay suffix. Default: GLAmpSeq.")
     println("  --output_prefix [STRING] Unique name to tag onto output files. Default: empty string.")
     println()
@@ -192,6 +192,32 @@ def deleteWS(string){
 
 workflow {
 
+
+    // --------------------- Sanity Checks ------------------------------------- //
+    // Test input requirement
+    if (params.accession == null && params.input_file == null){
+
+        error("""
+              Please supply either an accession (OSD or Genelab number) or an input CSV file
+              by passing either to the --accession or --input_file parameter, respectively.
+              """)
+    }
+
+    // Test input csv file
+    if(params.input_file){
+        // Test primers
+        if(params.F_primer == null || params.R_primer == null){
+
+            error("""
+                 When using a csv file as input (--input_file) to this workflow you must provide
+                 forward and reverse primer sequences. Please provide your forward
+                 and reverse primer sequences as arguments to the --F_primer
+                 and --R_primer parameters, respectively.
+                 """)
+        }
+    }
+
     // Capture software versions
     software_versions_ch = Channel.empty()
 
diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
index fa505fd4..755c7604 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
@@ -10,15 +10,15 @@ params {
 
     // -------- Required only if --accession is false ---------------//
     // A 4-column (single-end) or 5-column (paired-end) input csv file with the following headers ( sample_id, forward, [reverse,] paired, groups)
-    input_file = "PE_file.csv"
+    input_file = null
 
 
     // Cutadapt parameters
     min_cutadapt_len = 130
     primers_linked = "TRUE"
    discard_untrimmed = "TRUE"
-    F_primer = ""
-    R_primer = ""
+    F_primer = null
+    R_primer = null
 
     // Dada2 parameters
     left_trunc = 0
@@ -40,7 +40,7 @@ params {
 
 
     // Mandatory parameters if using GLDS or OSD accession as input
-    accession = false
+    accession = null
 
     assay_suffix = "_GLAmpSeq"
     output_prefix = ""
@@ -135,14 +135,14 @@ process {
 
 //************************* Accession runsheet and input file retrieval **************************************//
     withName: GET_RUNSHEET {
-        conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"}
+        conda = {params.conda.genelab ? params.conda.genelab : "envs/genelab.yaml"}
         container = "olabiyi/genelab-utils:1.3.22"
         publishDir = [path: params.genelab_dir, mode: params.publishDir_mode]
     }
 
 //********************************** Read quality control and assesment ********************************************//
     withLabel: fastqc {
-        conda = {params.conda.qc != null ? params.conda.qc : "envs/qc.yaml"}
+        conda = {params.conda.qc ? 
params.conda.qc : "envs/qc.yaml"} container = "staphb/fastqc:0.12.1" } @@ -151,13 +151,13 @@ process { } withName: "RAW_MULTIQC|TRIMMED_MULTIQC" { - conda = {params.conda.qc != null ? params.conda.qc : "envs/qc.yaml"} + conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} container = "staphb/multiqc:1.19" publishDir = [path: params.fastqc_out_dir, mode: params.publishDir_mode] } withName: "CUTADAPT|COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE" { - conda = {params.conda.cutadapt != null ? params.conda.cutadapt : "envs/cutadapt.yaml"} + conda = {params.conda.cutadapt ? params.conda.cutadapt : "envs/cutadapt.yaml"} container = "zavolab/cutadapt:1.16" memory = "10 GB" publishDir = [path: params.trimmed_reads_dir, mode: params.publishDir_mode] @@ -169,9 +169,9 @@ process { //********************************** ASV table creation********************************************// withName: "RUN_R_TRIM|RUN_R_NOTRIM" { - conda = {params.conda.R != null ? params.conda.R : "envs/R.yaml"} + conda = {params.conda.R ? params.conda.R : "envs/R.yaml"} container = "olabiyi/r-dada-decipher-biomformat:1.0" - memory = "10 GB" + memory = "20 GB" cpus = 10 publishDir = [[path: params.filtered_reads_dir, pattern: "Filtered_Sequence_Data/*", mode: params.publishDir_mode, saveAs: { fn -> fn.substring(fn.lastIndexOf('/')+1)}], @@ -180,14 +180,14 @@ process { } withName: ZIP_BIOM { - conda = {params.conda.qc != null ? params.conda.qc : "envs/qc.yaml"} + conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} container = "staphb/multiqc:1.19" publishDir = [path: "${params.final_outputs_dir}${params.output_prefix}", mode: params.publishDir_mode] } //********************************** Diversity and differential abundance testing ********************************************// withLabel: visualization { - conda = {params.conda.diversity != null ? params.conda.diversity : "envs/diversity.yaml"} + conda = {params.conda.diversity ? 
params.conda.diversity : "envs/diversity.yaml"} container = "quay.io/nasa_genelab/r-diversity:1.0" cpus = 5 memory = '10 GB' diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py new file mode 100644 index 00000000..3ef70ed4 --- /dev/null +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py @@ -0,0 +1,696 @@ +#!/usr/bin/env python +import argparse +import subprocess +import os +import sys + + + +def create_config(target_region, raw_R1_suffix, raw_R2_suffix, trim_primers, input_file, min_cutadapt_len, primers_linked, + discard_untrimmed, F_primer, R_primer, left_trunc, right_trunc, left_maxEE, right_maxEE, concatenate_reads_only, + conda_genelab, conda_qc, conda_R, conda_cutadapt, conda_diversity, accession, assay_suffix, output_prefix, publishDir_mode, + primer_trimmed_R1_suffix, primer_trimmed_R2_suffix, filtered_R1_suffix, filtered_R2_suffix, output_dir, diff_abund_method, + group, samples_column, rarefaction_depth, errorStrategy, queueSize, default_cpus, default_memory, cutadapt_memory, R_cpus, + R_memory, diversity_cpus, diversity_memory, container_genelab, container_fastqc, container_multiqc, container_cutadapt, + container_dada, container_ancom, container_diversity, singularity_cacheDir): + """ A function to create nextflow.config file by string interploation using the supplied arguements""" + + config = \ + f""" +//***************************************** Global parameters *******************************************// +params {{ + // Mandatory parameters + target_region = "{target_region}" + raw_R1_suffix = "{raw_R1_suffix}" + raw_R2_suffix = "{raw_R2_suffix}" + raw_reads_dir = "{output_dir}/Raw_Sequence_Data/" + trim_primers = "{trim_primers}" == "TRUE" ? true : false + + + // -------- Required only if --accession is false ---------------// + // A 4-column (single-end) or 5-column (paired-end) input csv file with the following headers ( sample_id, forward, [reverse,] paired, groups) + input_file = "{input_file}" == "null" ? null : "{input_file}" + + + // Cutadapt parameters + min_cutadapt_len = {min_cutadapt_len} + primers_linked = "{primers_linked}" + discard_untrimmed = "{discard_untrimmed}" + F_primer = "{F_primer}" == "null" ? null : "{F_primer}" + R_primer = "{R_primer}" == "null" ? null : "{R_primer}" + + // Dada2 parameters + left_trunc = {left_trunc} + right_trunc = {right_trunc} + left_maxEE = {left_maxEE} + right_maxEE = {right_maxEE} + concatenate_reads_only = "{concatenate_reads_only}" + + + // If using conda environments specify their locations so new ones won't be created + conda{{ + // Specify the paths to existing conda environments (/path/to/envs/genelab-utils) + // leave as is if you want to create a new conda environment + genelab = "{conda_genelab}" == "null" ? null : "{conda_genelab}" // /path/to/envs/genelab-utils + genelab = "{conda_genelab}" == "null" ? null : "{conda_genelab}" // /path/to/envs/genelab-utils + qc = "{conda_qc}" == "null" ? null : "{conda_qc}" // /path/to/envs/qc + R = "{conda_R}" == "null" ? null : "{conda_R}" // /path/to/envs/R + cutadapt = "{conda_cutadapt}" == "null" ? null : "{conda_cutadapt}" // /path/to/envs/cutadapt + diversity = "{conda_diversity}" == "null" ? null : "{conda_diversity}" // /path/to/envs/R_diversity + }} + + + // Mandatory parameters if using GLDS or OSD accession as input + accession = "{accession}" == "null" ? 
null : "{accession}" + + assay_suffix = "{assay_suffix}" + output_prefix = "{output_prefix}" + publishDir_mode = "{publishDir_mode}" + + // Suffixes + primer_trimmed_R1_suffix = "{primer_trimmed_R1_suffix}" + primer_trimmed_R2_suffix = "{primer_trimmed_R2_suffix}" + filtered_R1_suffix = "{filtered_R1_suffix}" + filtered_R2_suffix = "{filtered_R2_suffix}" + + + // Directories + fastqc_out_dir = "{output_dir}/FastQC_Outputs/" + trimmed_reads_dir = "{output_dir}/Trimmed_Sequence_Data/" + filtered_reads_dir = "{output_dir}/Filtered_Sequence_Data/" + info_out_dir = "{output_dir}/Metadata/" + final_outputs_dir = "{output_dir}/Final_Outputs/" + metadata_dir = "{output_dir}/Metadata/" + genelab_dir = "{output_dir}/GeneLab/" + + // Multiqc + multiqc_config = "${{baseDir}}/config/multiqc.config" + + // -------- Differential abundance parameters ----- // + diff_abund_method = "{diff_abund_method}" + group = "{group}" + samples_column = "{samples_column}" + // Minimum desired sample rarefaction depth for diversity analysis + rarefaction_depth = {rarefaction_depth} + + + errorStrategy = "{errorStrategy}" + debug = false // set to true if you'd like to see the values of your set parameters +}} + +// Setting the default container engine as singularity +params.containerEngine = "singularity" +// Conda shouldn't be used by default except when using conda-based profiles +params.use_conda = false + + +/******************************************************************************************************* +*************************************** Workflow Profiles ********************************************** +********************************************************************************************************/ + +profiles {{ + + slurm {{ + process.executor = 'slurm' + }} + + conda {{ + conda.enabled = true + params.use_conda = true + }} + + singularity {{ + singularity.enabled = true + singularity.autoMounts = true + singularity.cacheDir = "{singularity_cacheDir}" + params.containerEngine = "singularity" + }} + + docker {{ + docker.enabled = true + docker.runOptions = '-u $(id -u):$(id -g)' + docker.userEmulation = true + params.containerEngine = "docker" + }} + +}} + +// Maximum number of jobs to submit in parallel +executor.queueSize = {queueSize} + + +/****************************************************************************************************************** +***************** Tune process specific resources (cpu, container, memory etc.) *********************************** +*******************************************************************************************************************/ + +process {{ + + //******************* Default process settings ************************// + errorStrategy = {{ params.errorStrategy ? params.errorStrategy : "ignore" }} + maxRetries = 2 + cpus = {default_cpus} + memory = "{default_memory}" + cache = 'lenient' + //debug = true // uncomment to see what is being emitted to the standard output + +//************************* Accession runsheet and input file retrieval **************************************// + withName: GET_RUNSHEET {{ + conda = {{params.conda.genelab ? params.conda.genelab : "envs/genelab.yaml"}} + container = "{container_genelab}" + publishDir = [path: params.genelab_dir, mode: params.publishDir_mode] + }} + +//********************************** Read quality control and assesment ********************************************// + withLabel: fastqc {{ + conda = {{params.conda.qc ? 
params.conda.qc : "envs/qc.yaml"}} + container = "{container_fastqc}" + }} + + withName: RAW_FASTQC {{ + publishDir = [path: params.raw_reads_dir, mode: params.publishDir_mode] + }} + + withName: "RAW_MULTIQC|TRIMMED_MULTIQC" {{ + conda = {{params.conda.qc ? params.conda.qc : "envs/qc.yaml"}} + container = "{container_multiqc}" + publishDir = [path: params.fastqc_out_dir, mode: params.publishDir_mode] + }} + + withName: "CUTADAPT|COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE" {{ + conda = {{params.conda.cutadapt ? params.conda.cutadapt : "envs/cutadapt.yaml"}} + container = "{container_cutadapt}" + memory = "{cutadapt_memory}" + publishDir = [path: params.trimmed_reads_dir, mode: params.publishDir_mode] + }} + + withName: TRIMMED_FASTQC {{ + publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode ] + }} + +//********************************** ASV table creation********************************************// + withName: "RUN_R_TRIM|RUN_R_NOTRIM" {{ + conda = {{params.conda.R ? params.conda.R : "envs/R.yaml"}} + container = "{container_dada}" + memory = "{R_memory}" + cpus = {R_cpus} + publishDir = [[path: params.filtered_reads_dir, pattern: "Filtered_Sequence_Data/*", + mode: params.publishDir_mode, saveAs: {{ fn -> fn.substring(fn.lastIndexOf('/')+1) }} ], + [path: params.final_outputs_dir , pattern: "final_outputs/*.{{tsv,biom,fasta}}", + mode: params.publishDir_mode, saveAs: {{ fn -> fn.substring(fn.lastIndexOf('/')+1)}} ]] + }} + + withName: ZIP_BIOM {{ + conda = {{params.conda.qc ? params.conda.qc : "envs/qc.yaml"}} + container = "{container_multiqc}" + publishDir = [path: "${{params.final_outputs_dir}}${{params.output_prefix}}", mode: params.publishDir_mode] + }} + +//********************************** Diversity and differential abundance testing ********************************************// + withLabel: visualization {{ + conda = {{params.conda.diversity ? params.conda.diversity : "envs/diversity.yaml"}} + container = "{container_diversity}" + cpus = {diversity_cpus} + memory = "{diversity_memory}" + publishDir = [path: "${{params.final_outputs_dir}}${{params.output_prefix}}", mode: params.publishDir_mode] + }} + + + withName: ANCOMBC {{ + + container = "{container_ancom}" + + }} + +}} + + +/***************************************************************************** +********************** Workflow Resource Usage Capturing ********************* +******************************************************************************/ + +// Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline {{ + enabled = true + file = "../Resource_Usage/execution_timeline_${{trace_timestamp}}.html" +}} +report {{ + enabled = true + file = "../Resource_Usage/execution_report_${{trace_timestamp}}.html" +}} +trace {{ + enabled = true + file = "../Resource_Usage/execution_trace_${{trace_timestamp}}.txt" +}} + + + +/****************************************************************************** +**************************** Workflow Metadata ******************************** +*******************************************************************************/ + +manifest {{ + author = 'Olabiyi Aderemi Obayomi, Mike D. 
Lee'
+    homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Amplicon/'
+    description = 'Amplicon Illumina workflow for pipeline document GL-DPPD-7104-B'
+    mainScript = 'main.nf'
+    defaultBranch = 'main'
+    nextflowVersion = '>=22.10.1'
+    version = '1.0.0'
+}}
+
+    """
+
+    return config
+
+
+
+def main():
+    # Argument parser setup with short argument names and an automatic help option
+    parser = argparse.ArgumentParser(
+        description='Run workflow for GeneLab data processing.',
+        add_help=True,
+        usage="""
+        Example 1: Use an OSD or Genelab accession as input
+        %(prog)s --run --target-region 16S --accession GLDS-487 --profile slurm,singularity [options]
+
+        Example 2: Use a csv file as input to the workflow
+        %(prog)s --run --target-region 16S --input-file PE_file.csv --F-primer AGAGTTTGATCCTGGCTCAG --R-primer CTGCCTCCCGTAGGAGT --profile singularity [options]
+
+        Example 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run.
+        Here we want to monitor our jobs with nextflow tower.
+        export TOWER_ACCESS_TOKEN=
+        export TOWER_WORKSPACE_ID=
+        %(prog)s --run --target-region 16S --input-file PE_file.csv --F-primer AGAGTTTGATCCTGGCTCAG --R-primer CTGCCTCCCGTAGGAGT --profile slurm,conda --extra 'with-tower' [options]
+
+        Example 4: Dry run: Just create an edited nextflow.config file but don't run the workflow
+        %(prog)s --target-region 16S --accession GLDS-487 --profile slurm,singularity [options]
+        """
+    )
+
+    parser.add_argument('-x', '--run',
+                        action='store_true',
+                        help="Set this flag if you would like to run the workflow")
+
+    parser.add_argument('-a', '--accession',
+                        metavar='osd_number',
+                        default="null",
+                        help="""
+                        A Genelab or OSD accession number if the --input-file parameter is not set.
+                        If this parameter is set, it will ignore the --input-file parameter. Default: null
+                        """,
+                        type=str)
+
+    parser.add_argument('-r', '--input-file',
+                        metavar='/path/to/input_file.csv',
+                        default="null",
+                        help="""
+                        A 4-column (single-end) or 5-column (paired-end) input file (sample_id, forward, [reverse,] paired, groups). Mandatory if a GLDS or OSD accession is not provided.
+                        Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively. The sample_id column should contain unique sample ids.
+                        The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.
+                        The paired column should be true for paired-end or anything else for single-end reads.
+                        The groups column should contain group levels / treatments to be compared during diversity and differential abundance testing analysis.
+                        """,
+                        type=str)
+
+    parser.add_argument('-p', '--profile',
+                        metavar='profile',
+                        default="null",
+                        help="""
+                        What profile(s) should be used to run the workflow? Options are [singularity, docker, conda, slurm].
+                        singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively.
+                        To combine profiles, pass them together separated by comma. For example, to run jobs using slurm in singularity containers use 'slurm,singularity'.
+                        Default: null """,
+                        type=str)
+
+    parser.add_argument('-t', '--target-region',
+                        default="16S",
+                        choices=['16S', '18S', 'ITS'],
+                        help="""
+                        Specify the genomic target for the assay. Options: 16S, 18S, ITS. This is used to select the appropriate
+                        dataset from an OSD study when multiple options are available and also determines the database to use for taxonomy assignment.
Default: 16S
+                        """,
+                        type=str)
+
+    parser.add_argument('--raw-R1-suffix',
+                        default='_R1_raw.fastq.gz',
+                        help='Raw forward reads suffix (region following the unique part of the sample names). Default: _R1_raw.fastq.gz',
+                        metavar='raw_R1_suffix',
+                        type=str)
+
+    parser.add_argument('--raw-R2-suffix',
+                        default='_R2_raw.fastq.gz',
+                        help='Raw reverse reads suffix (region following the unique part of the sample names). Default: _R2_raw.fastq.gz',
+                        metavar='raw_R2_suffix',
+                        type=str)
+
+    parser.add_argument('--primer-trimmed-R1-suffix',
+                        default='_R1_trimmed.fastq.gz',
+                        help='Suffix to use for naming your primer trimmed forward reads. Default: _R1_trimmed.fastq.gz',
+                        metavar='primer_trimmed_R1_suffix',
+                        type=str)
+
+    parser.add_argument('--primer-trimmed-R2-suffix',
+                        default='_R2_trimmed.fastq.gz',
+                        help='Suffix to use for naming your primer trimmed reverse reads. Default: _R2_trimmed.fastq.gz',
+                        metavar='primer_trimmed_R2_suffix',
+                        type=str)
+
+    parser.add_argument('--filtered-R1-suffix',
+                        default='_R1_filtered.fastq.gz',
+                        help='Suffix to use for naming your quality filtered forward reads. Default: _R1_filtered.fastq.gz',
+                        metavar='filtered_R1_suffix',
+                        type=str)
+
+    parser.add_argument('--filtered-R2-suffix',
+                        default='_R2_filtered.fastq.gz',
+                        help='Suffix to use for naming your quality filtered reverse reads. Default: _R2_filtered.fastq.gz',
+                        metavar='filtered_R2_suffix',
+                        type=str)
+
+    parser.add_argument('--trim-primers',
+                        choices=["TRUE", "FALSE"],
+                        default="TRUE",
+                        help='Specifies to trim primers (TRUE) or not (FALSE) using cutadapt. Default: "TRUE" ',
+                        type=str)
+
+    parser.add_argument('--F-primer',
+                        default='null',
+                        help='Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG. Default: null.',
+                        metavar='FORWARD PRIMER',
+                        type=str)
+
+    parser.add_argument('--R-primer',
+                        default='null',
+                        help='Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT. Default: null.',
+                        metavar='REVERSE PRIMER',
+                        type=str)
+
+    parser.add_argument('--group-column',
+                        default='groups',
+                        help='Column in input csv file with treatments to be compared. Default: groups',
+                        metavar='group column',
+                        type=str)
+
+    parser.add_argument('--samples-column',
+                        default='sample_id',
+                        help="Column in input csv file with sample names belonging to each treatment group. Default: sample_id",
+                        metavar='samples column',
+                        type=str)
+
+    parser.add_argument('--rarefaction-depth',
+                        metavar='depth',
+                        default=500,
+                        help='The minimum desired sample rarefaction depth for diversity analysis. Default: 500',
+                        type=int)
+
+    parser.add_argument('--diff-abund-method',
+                        choices=["ancombc1", "ancombc2", "deseq2"],
+                        default="ancombc2",
+                        help="The method to use for differential abundance testing. Either ancombc1, ancombc2, or deseq2. Default: ancombc2",
+                        type=str)
+
+    parser.add_argument('-d', '--output-dir',
+                        metavar='/path/to/output_dir/',
+                        default='workflow_output',
+                        help='Specifies the output directory for the output files generated by the workflow. Default: workflow_output',
+                        type=str)
+
+    parser.add_argument('-m', '--min-cutadapt-len',
+                        metavar='length',
+                        default=130,
+                        help='Specifies the MINIMUM length of trimmed reads to pass to cutadapt. For paired-end data: if one read gets filtered, both reads are discarded. Default: 130',
+                        type=int)
+
+    parser.add_argument('--primers-linked',
+                        choices=["TRUE", "FALSE"],
+                        default="TRUE",
+                        help='If set to TRUE, instructs cutadapt to treat the primers as linked.
Default: TRUE',
+                        type=str)
+
+
+    parser.add_argument('--discard-untrimmed',
+                        choices=['TRUE', 'FALSE'],
+                        default='TRUE',
+                        help='If set to TRUE, instructs cutadapt to remove reads if the primers were not found in the expected location; if FALSE, these reads are kept. Default: TRUE',
+                        type=str)
+
+    parser.add_argument('--left-trunc',
+                        default=0,
+                        help='Specifies the length of the forward reads, bases beyond this length will be truncated and reads shorter than this length are discarded. Default: 0 (no truncation)',
+                        metavar='length',
+                        type=int)
+
+    parser.add_argument('--right-trunc',
+                        default=0,
+                        help='Specifies the length of the reverse reads, bases beyond this length will be truncated and reads shorter than this length are discarded. Default: 0 (no truncation)',
+                        metavar='length',
+                        type=int)
+
+    parser.add_argument('--left-maxEE',
+                        default=1,
+                        help='Specifies the maximum expected error (maxEE) allowed for each forward read, reads with higher than maxEE will be discarded. Default: 1',
+                        metavar='left_max_error',
+                        type=int)
+
+    parser.add_argument('--right-maxEE',
+                        default=1,
+                        help='Specifies the maximum expected error (maxEE) allowed for each reverse read, reads with higher than maxEE will be discarded. Default: 1',
+                        metavar='right_max_error',
+                        type=int)
+
+    parser.add_argument('--concatenate-reads-only',
+                        choices=['TRUE', 'FALSE'],
+                        default='FALSE',
+                        help='If set to TRUE, specifies to concatenate forward and reverse reads only with dada2 instead of merging paired reads. Default: FALSE',
+                        type=str)
+
+    parser.add_argument('--output-prefix',
+                        default='',
+                        help='Specifies the prefix to use on all output files to distinguish multiple primer sets, leave as an empty string if only one primer set is being processed. Default: empty string',
+                        metavar='prefix',
+                        type=str)
+
+    parser.add_argument('--assay-suffix',
+                        default='_GLAmpSeq',
+                        help="GeneLab's assay suffix. Default: _GLAmpSeq",
+                        metavar='suffix',
+                        type=str)
+
+    parser.add_argument('--publishDir-mode',
+                        choices=['link', 'copy', 'sym'],
+                        default='link',
+                        help="How should nextflow publish file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir. Default: link",
+                        metavar='publishDir_mode',
+                        type=str)
+
+    parser.add_argument('--errorStrategy',
+                        default='terminate',
+                        choices=['terminate', 'ignore'],
+                        help="How should nextflow handle errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: terminate",
+                        metavar='errorStrategy',
+                        type=str)
+
+    parser.add_argument('--queueSize',
+                        default=20,
+                        help="Maximum number of jobs to submit in parallel. Default: 20",
+                        metavar='queueSize',
+                        type=int)
+
+    parser.add_argument('--default-cpus',
+                        default=2,
+                        help="Default number of cpus for each job. Default: 2",
+                        metavar='default_cpus',
+                        type=int)
+
+    parser.add_argument('--default-memory',
+                        default="5 GB",
+                        help="Default amount of memory to use for each job. Default: '5 GB' ",
+                        metavar='default_memory',
+                        type=str)
+
+    parser.add_argument('--R-cpus',
+                        default=10,
+                        help="Number of cpus to run R dada2 and DECIPHER. Default: 10",
+                        metavar='R_cpus',
+                        type=int)
+
+    parser.add_argument('--R-memory',
+                        default="20 GB",
+                        help="Amount of memory R uses to run Dada2 and DECIPHER. Default: '20 GB' ",
+                        metavar='R_memory',
+                        type=str)
+
+    parser.add_argument('--diversity-cpus',
+                        default=5,
+                        help="Number of cpus to run differential abundance and diversity analysis.
Default: 5", + metavar='ancom_cpus', + type=int) + + parser.add_argument('--diversity-memory', + default="10 GB", + help="Amount of memory to use for each job. Default: '10 GB' ", + metavar='ancom_memory', + type=str) + + parser.add_argument('--cutadapt-memory', + default="10 GB", + help="Amount of memory used by cutadapt for read trimming. Default: '10 GB' ", + metavar='cutadapt_memory', + type=str) + + parser.add_argument('--conda-genelab', + metavar='/path/to/envs/genelab-utils', + default='null', + help="Path to a conda environment containing genlab-utils. Default: null", + type=str) + + parser.add_argument('--conda-qc', + metavar='/path/to/envs/qc', + default='null', + help="Path to a conda environment containing fastqc, multiqc, zip and python. Default: null", + type=str) + + parser.add_argument('--conda-R', + metavar='/path/to/envs/R', + default='null', + help="Path to a conda environment containing R along with the packages decipher and biomformat installed. Default: null", + type=str) + + parser.add_argument('--conda-cutadapt', + metavar='/path/to/envs/cutadapt', + default='null', + help="Path to a conda environment containing cutadapt. Default: null", + type=str) + + parser.add_argument('--conda-diversity', + metavar='/path/to/envs/R_diversity', + default='null', + help="Path to a conda environment containing R packages required for diversity and differential abundance testing. Default: null", + type=str) + + parser.add_argument('--container-genelab', + default="olabiyi/genelab-utils:1.3.22", + help="Genelab utils container to be used to download raw sequences and retrieve sample metadata. Default: olabiyi/genelab-utils:1.3.22", + metavar='container_genelab', + type=str) + + parser.add_argument('--container-fastqc', + default="staphb/fastqc:0.12.1", + help="A docker container to be used to run fastqc. Default: staphb/fastqc:0.12.1", + metavar='container_fastqc', + type=str) + + parser.add_argument('--container-multiqc', + default="staphb/multiqc:1.19", + help="A docker container to be used to run multiqc. Default: staphb/multiqc:1.19", + metavar='container_multiqc', + type=str) + + parser.add_argument('--container-cutadapt', + default="zavolab/cutadapt:1.16", + help="A docker container to be used to run cutadapt. Default: zavolab/cutadapt:1.16", + metavar='container_cutadapt', + type=str) + + parser.add_argument('--container-dada', + default="olabiyi/r-dada-decipher-biomformat:1.0", + help="A docker container to be used to run dada2 and DECIPHER. Default: olabiyi/r-dada-decipher-biomformat:1.0", + metavar='container_dada', + type=str) + + parser.add_argument('--container-ancom', + default="quay.io/nasa_genelab/ancombc:2.6.0", + help="A docker container containing ancombc v2.6.0 . Default: quay.io/nasa_genelab/ancombc:2.6.0", + metavar='container_ancom', + type=str) + + parser.add_argument('--container-diversity', + default="quay.io/nasa_genelab/r-diversity:1.0", + help="A docker container to be used to run diversity analysis. Default: quay.io/nasa_genelab/r-diversity:1.0 ", + metavar='container_diversity', + type=str) + + parser.add_argument('--singularity-cacheDir', + default="singularity/", + help="A directory to store and retrieve singularity images. Default: singularity/", + metavar='singularity_directory', + type=str) + + parser.add_argument('--extra', + default="", + help="A comma separated string of extra arguement(s) to nextflow run e.g 'with-tower,name test'. \ + Please run nextflow run -h for a full list of available options. 
Default: '' ", + metavar='singularity_directory', + type=str) + + # Check if no arguments were provided + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + try: + args = parser.parse_args() + except SystemExit: + parser.print_help() + sys.exit(1) + + # Test input requirement + if (args.accession == "null" and args.input_file == "null"): + print(""" + Please supply either an accession (OSD or Genelab number) or an input CSV file + by passing either to the --accession or --input-file parameter, respectively. + """) + sys.exit(1) + + # Test input csv file + if(args.input_file != "null"): + # Test if input file exists + if(not os.path.exists(args.input_file)): + print(f"{args.input_file} does not exist. Please provide a correct path.") + sys.exit(1) + # Test primers + if(args.F_primer == "null" or args.R_primer == "null"): + print(""" + When using a csv file as input (--input-file) to this workflow you must provide + foward and reverse primer sequences. Please provide your forward + and reverse primer sequences as arguements to the --F-primer + and --R-primer parameters, respectively. + """) + sys.exit(1) + + + # Test profile + if(args.profile == "null"): + print("Please provide a valid combination of profiles (conda, slurm, docker and singularity) to the --profile parameter.") + sys.exit(1) + + # Create nextflow.config + config_file = create_config(args.target_region, args.raw_R1_suffix, args.raw_R2_suffix, args.trim_primers, + args.input_file, args.min_cutadapt_len, args.primers_linked, args.discard_untrimmed, args.F_primer, + args.R_primer, args.left_trunc, args.right_trunc, args.left_maxEE, args.right_maxEE, + args.concatenate_reads_only, args.conda_genelab, args.conda_qc, args.conda_R, args.conda_cutadapt, + args.conda_diversity, args.accession, args.assay_suffix, args.output_prefix, args.publishDir_mode, + args.primer_trimmed_R1_suffix, args.primer_trimmed_R2_suffix, args.filtered_R1_suffix, + args.filtered_R2_suffix, args.output_dir, args.diff_abund_method, args.group_column, args.samples_column, + args.rarefaction_depth, args.errorStrategy, args.queueSize, args.default_cpus, args.default_memory, + args.cutadapt_memory, args.R_cpus, args.R_memory, args.diversity_cpus, args.diversity_memory, + args.container_genelab, args.container_fastqc, args.container_multiqc, args.container_cutadapt, + args.container_dada, args.container_ancom, args.container_diversity, args.singularity_cacheDir) + + with open("nextflow.config", "w") as file: + print(config_file, file=file) + print("Nextflow workflow setup is complete.") + + # Run the nextflow workflow if --run is used + if args.run: + # Get extra arguement(s) to nextflow run + extra = "" + for opt in args.extra.split(","): + extra += f"-{opt} " + command = f"nextflow run main.nf -resume -profile {args.profile} {extra}" + print(f"Running this nextflow command: {command}") + subprocess.run(command, shell=True, check=True) + +if __name__ == "__main__": + main() diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/slurm_submit.slurm b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/slurm_submit.slurm index 9d6b9e1c..042a12af 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/slurm_submit.slurm +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/slurm_submit.slurm @@ -6,7 +6,7 @@ #SBATCH --partition=normal ## Specifies the job queue to use, for urgent jobs change normal to priority ## #SBATCH --mem=20G ## Memory required to run 
the job in MB, this example is showing 10,000 MB or 10GB, change this number based on how much RAM you need ## #SBATCH --cpus-per-task=1 ## Number of CPUs to run the job, this example is showing 5 CPUs, change this number based on how many CPUs you need ## -#SBATCH --mail-user=olabiyi.a.obayomi@nasa.gov ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ## +#SBATCH --mail-user=name@domain ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ## #SBATCH --mail-type=END ## Tells slurm to e-mail the address above when the job has completed ## . ~/.profile @@ -42,7 +42,9 @@ export NXF_SINGULARITY_CACHEDIR=singularity/ export TOWER_ACCESS_TOKEN= export TOWER_WORKSPACE_ID= #nextflow run main.nf -resume -profile slurm,singularity --input_file PE_file.csv --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT -with-tower ## Replace command with the command(s) you want to run ## -nextflow run main.nf -resume -profile slurm,conda --accession GLDS-487 --target_region 16S -with-tower +#nextflow run main.nf -resume -profile slurm,conda --accession GLDS-487 --target_region 16S -with-tower +#python3 run_workflow.py --run --target-region 16S --input-file PE_file.csv --F-primer GTGCCAGCMGCCGCGGTAA --R-primer GGACTACHVGGGTWTCTAAT --profile slurm,singularity --singularity-cacheDir /path/to/singularity_images/ --extra 'with-tower' --R-memory '20 GB' +python3 run_workflow.py --run --target-region 16S --accession GLDS-487 --profile slurm,singularity --singularity-cacheDir /path/to/singularity_images/ --extra 'with-tower' --R-memory '20 GB' ## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ## echo "" end=$(date +%s) From 7a974043506cc671c0b3617fc1b579f8045a11c5 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 27 Nov 2024 15:38:35 -0800 Subject: [PATCH 04/24] Edited README --- .../Workflow_Documentation/NF_AmpIllumina-B/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md index 20fc479c..03fb9dc0 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md @@ -22,11 +22,11 @@ The current GeneLab Illumina amplicon sequencing data processing pipeline (AmpIl 4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) 4d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) -5. [Run the workflow indirectly using the python wrapper script](#5-run-the-workflow-indirectly-using-the-python-wrapper-script) - 5a. [Approach 1: Use an OSD or Genelab acession as input](#5a-approach-1-use-an-osd-or-Genelab-acession-as-input) - 5b. [Approach 2: Use a csv file as input to the workflow](#5b-approach-2-use-a-csv-file-as-input-to-the-workflow) - 5c. [Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run](#5c-approach-3-use-a-csv-file-as-input-to-the-workflow-and-supply-extra-arguments-to-nextflow-run) - 5d. 
[Approach 4: Just create an edited nextflow.config file but dont run the workflow](#5d-approach-4-just-create-an-edited-nextflow.config-file-but-dont-run-the-workflow) +5. [Run the workflow indirectly using the python wrapper script](#5-run-the-workflow-indirectly-using-the-python-wrapper-script) + 5a. [Approach 1: Use an OSD or Genelab acession as input](#5a-approach-1-use-an-osd-or-Genelab-acession-as-input) + 5b. [Approach 2: Use a csv file as input to the workflow](#5b-approach-2-use-a-csv-file-as-input-to-the-workflow) + 5c. [Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run](#5c-approach-3-use-a-csv-file-as-input-to-the-workflow-and-supply-extra-arguments-to-nextflow-run) + 5d. [Approach 4: Just create an edited nextflow.config file but dont run the workflow](#5d-approach-4-just-create-an-edited-nextflow.config-file-but-dont-run-the-workflow) 6. [Workflow outputs](#6-workflow-outputs) 6a. [Main outputs](#6a-main-outputs) From f7df02f8f9f1ec5891dcfd5b83b7d99c45e6436a Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 27 Nov 2024 15:40:54 -0800 Subject: [PATCH 05/24] Edited README --- .../Workflow_Documentation/NF_AmpIllumina-B/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md index 03fb9dc0..5b4e16f0 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md @@ -26,7 +26,7 @@ The current GeneLab Illumina amplicon sequencing data processing pipeline (AmpIl 5a. [Approach 1: Use an OSD or Genelab acession as input](#5a-approach-1-use-an-osd-or-Genelab-acession-as-input) 5b. [Approach 2: Use a csv file as input to the workflow](#5b-approach-2-use-a-csv-file-as-input-to-the-workflow) 5c. [Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run](#5c-approach-3-use-a-csv-file-as-input-to-the-workflow-and-supply-extra-arguments-to-nextflow-run) - 5d. [Approach 4: Just create an edited nextflow.config file but dont run the workflow](#5d-approach-4-just-create-an-edited-nextflow.config-file-but-dont-run-the-workflow) + 5d. [Approach 4: Just create an edited nextflow config file but dont run the workflow](#5d-approach-4-just-create-an-edited-nextflow-config-file-but-dont-run-the-workflow) 6. [Workflow outputs](#6-workflow-outputs) 6a. [Main outputs](#6a-main-outputs) @@ -201,7 +201,7 @@ export TOWER_WORKSPACE_ID= python run_workflow.py --run --target-region 16S --input-file PE_file.csv --F-primer AGAGTTTGATCCTGGCTCAG --R-primer CTGCCTCCCGTAGGAGT --profile slurm,conda --extra 'with-tower' ``` -#### 5d. Aproach 4: Just create an edited nextflow.config file but dont run the workflow +#### 5d. 
Approach 4: Just create an edited nextflow config file but dont run the workflow

```bash
python run_workflow.py --target-region 16S --accession GLDS-487 --profile slurm,singularity
```
From addf4d6e310fc3fa54028d2ccee857a120507877 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Wed, 27 Nov 2024 16:07:32 -0800
Subject: [PATCH 06/24] Fixed no extra argument to nextflow bug

---
 .../NF_AmpIllumina-B/README.md                     |  2 +-
 .../NF_AmpIllumina-B/workflow_code/run_workflow.py | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md
index 5b4e16f0..bcf6c37a 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md
@@ -191,7 +191,7 @@ python run_workflow.py --run --target-region 16S --accession GLDS-487 --profile
 python run_workflow.py --run --target-region 16S --input-file PE_file.csv --F-primer AGAGTTTGATCCTGGCTCAG --R-primer CTGCCTCCCGTAGGAGT --profile singularity
 ```
 
-#### 5c. Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run.
+#### 5c. Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run
 
 Here we want to monitor our jobs with nextflow tower.
 
diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py
index 3ef70ed4..169d4158 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py
@@ -684,11 +684,14 @@ def main():
 
     # Run the nextflow workflow if --run is used
     if args.run:
-        # Get extra argument(s) to nextflow run
-        extra = ""
-        for opt in args.extra.split(","):
-            extra += f"-{opt} "
-        command = f"nextflow run main.nf -resume -profile {args.profile} {extra}"
+        if(args.extra != ""):
+            # Get extra argument(s) to nextflow run
+            extra = ""
+            for opt in args.extra.split(","):
+                extra += f"-{opt} "
+            command = f"nextflow run main.nf -resume -profile {args.profile} {extra}"
+        else:
+            command = f"nextflow run main.nf -resume -profile {args.profile}"
         print(f"Running this nextflow command: {command}")
         subprocess.run(command, shell=True, check=True)
 
From 7a9c637f1551033a1956d10ba913717f77950061 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Thu, 5 Dec 2024 09:21:24 -0800
Subject: [PATCH 07/24] updated UNITE_DB link

---
 .../workflow_code/bin/Illumina-PE-R-processing.R | 6 +++---
 .../workflow_code/bin/Illumina-SE-R-processing.R | 7 +++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/Illumina-PE-R-processing.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/Illumina-PE-R-processing.R
index 55210ce2..9a0eb905 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/Illumina-PE-R-processing.R
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/Illumina-PE-R-processing.R
@@ -173,11 +173,11 @@ if ( target_region == "16S" ) {
 
 } else if (target_region == "ITS" ) {
 
-    download.file("https://figshare.com/ndownloader/files/46245586", "UNITE_v2020_February2020.RData")
+    download.file("https://figshare.com/ndownloader/files/49181545",
"UNITE_v2023_July2023.RData") # loading reference taxonomy object - load("UNITE_v2020_February2020.RData") + load("UNITE_v2023_July2023.RData") # removing downloaded file - #file.remove("UNITE_v2020_February2020.RData") + #file.remove("UNITE_v2023_July2023.RData") ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species") diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/Illumina-SE-R-processing.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/Illumina-SE-R-processing.R index b97820b8..f91d77be 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/Illumina-SE-R-processing.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/Illumina-SE-R-processing.R @@ -142,12 +142,11 @@ if ( target_region == "16S" ) { } else if (target_region == "ITS" ) { - download.file("https://figshare.com/ndownloader/files/46245586", "UNITE_v2020_February2020.RData") + download.file("https://figshare.com/ndownloader/files/49181545", "UNITE_v2023_July2023.RData") # loading reference taxonomy object - load("UNITE_v2020_February2020.RData") + load("UNITE_v2023_July2023.RData") # removing downloaded file - #file.remove("UNITE_v2020_February2020.RData") - + #file.remove("UNITE_v2023_July2023.RData") ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species") } else if (target_region == "18S" ) { From 10afc84a57c140b576614b6a1928285cd7155b4c Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 23 Dec 2024 21:46:21 -0600 Subject: [PATCH 08/24] added deseq --- .../workflow_code/bin/alpha_diversity.R | 50 +- .../workflow_code/bin/beta_diversity.R | 348 ++++++++----- .../workflow_code/bin/pairwise_ancombc1.R | 56 ++- .../workflow_code/bin/pairwise_ancombc2.R | 61 ++- .../workflow_code/bin/plot_taxonomy.R | 44 +- .../workflow_code/bin/run_deseq2.R | 462 ++++++++++++++---- .../NF_AmpIllumina-B/workflow_code/main.nf | 60 ++- .../workflow_code/modules/ancombc.nf | 26 +- .../workflow_code/modules/deseq.nf | 10 +- .../workflow_code/nextflow.config | 11 +- 10 files changed, 834 insertions(+), 294 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/alpha_diversity.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/alpha_diversity.R index 7fce672c..a3c26de5 100755 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/alpha_diversity.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/alpha_diversity.R @@ -49,7 +49,7 @@ option_list <- list( Deafault: 'Sample Name' ", metavar="Sample Name"), - make_option(c("-p", "--output-prefix"), type="character", default="", + make_option(c("-o", "--output-prefix"), type="character", default="", help="Unique name to tag onto output files. Default: empty string.", metavar=""), @@ -57,18 +57,28 @@ option_list <- list( help="Minimum rarefaction depth for alpha diversity estimation. \ Default: 500. ", metavar="500"), - - make_option(c("-c", "--abundance-cutoff"), type="numeric", default=0.2, - help="A fraction defining how abundant features most be to be \ - analyzes. Default: 1/5. ", - metavar="0.2"), - + make_option(c("-r", "--remove-rare"), type="logical", default=FALSE, help="Should rare features be filtered out?. \ Default: FALSE. 
", action= "store_true", - metavar="FALSE"), + metavar="FALSE"), + + make_option(c("-p", "--prevalence-cutoff"), type="numeric", default=0.15, + help="If --remove-rare, a numerical fraction between 0 and 1. Taxa with prevalences + (the proportion of samples in which the taxon is present) less + than --prevalence-cutoff will be excluded in the analysis. + Default is 0.15, i.e. exclude taxa / features that are not present + in at least 15% of the samples.", + metavar="0.15"), + + make_option(c("-l", "--library-cutoff"), type="numeric", default=100, + help="If --remove-rare, a numerical threshold for filtering samples based on library + sizes. Samples with library sizes less than lib_cut will be + excluded in the analysis. Default is 100. + if you do not want to discard any sample then set to 0.", + metavar="100"), - make_option(c("-l", "--legend-title"), type="character", default="Groups", + make_option(c("-e", "--legend-title"), type="character", default="Groups", help="Legend title for alpha diversity plots.", metavar="Groups"), @@ -213,8 +223,6 @@ custom_palette <- custom_palette[-c(21:23, grep(pattern = pattern_to_filter, metadata_file <- opt[["metadata-table"]] features_file <- opt[["feature-table"]] taxonomy_file <- opt[["taxonomy-table"]] -alpha_diversity_out_dir <-"alpha_diversity/" -if(!dir.exists(alpha_diversity_out_dir)) dir.create(alpha_diversity_out_dir) # Metadata group column name to compare groups_colname <- opt[["group"]] sample_colname <- opt[["samples-column"]] @@ -223,8 +231,12 @@ assay_suffix <- opt[["assay-suffix"]] legend_title <- opt[["legend-title"]] rarefaction_depth <- opt[["rarefaction-depth"]] remove_rare <- opt[["remove-rare"]] -abundance_cutoff <- opt[["abundance-cutoff"]] - +# taxon / ASV prevalence cutoff +prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%) +# sample / library read count cutoff +library_cutoff <- opt[["library-cutoff"]] # 100 +alpha_diversity_out_dir <-"alpha_diversity/" +if(!dir.exists(alpha_diversity_out_dir)) dir.create(alpha_diversity_out_dir) @@ -273,11 +285,17 @@ feature_table <- read.table(file = features_file, header = TRUE, row.names = 1, sep = "\t") if(remove_rare){ -# Remove rare ASVs -feature_table <- remove_rare_features(feature_table, - cut_off_percent=abundance_cutoff) + + # Remove samples with less than library-cutoff + message(glue("Dropping samples with less than {library_cutoff} read counts")) + feature_table <- feature_table[,colSums(feature_table) >= library_cutoff] + # Remove rare ASVs + message(glue("Dropping features with prevalence less than {prevalence_cutoff * 100}%")) + feature_table <- remove_rare_features(feature_table, + cut_off_percent = prevalence_cutoff) } + # Taxonomy taxonomy_table <- read.table(file = taxonomy_file, header = TRUE, row.names = 1, sep = "\t") diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/beta_diversity.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/beta_diversity.R index 53ce4128..bf1a7777 100755 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/beta_diversity.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/beta_diversity.R @@ -49,7 +49,7 @@ option_list <- list( Deafault: 'Sample Name' ", metavar="Sample Name"), - make_option(c("-p", "--output-prefix"), type="character", default="", + make_option(c("-o", "--output-prefix"), type="character", default="", help="Unique name to tag onto output files. 
Default: empty string.", metavar=""), @@ -58,16 +58,27 @@ option_list <- list( Default: 500. ", metavar="500"), - make_option(c("-c", "--abundance-cutoff"), type="numeric", default=0.2, - help="A fraction defining how abundant features most be to be \ - analyzes. Default: 1/5. ", - metavar="0.2"), make_option(c("-r", "--remove-rare"), type="logical", default=FALSE, help="Should rare features be filtered out?. \ Default: FALSE. ", action= "store_true", metavar="FALSE"), - make_option(c("-l", "--legend-title"), type="character", default="Groups", + make_option(c("-p", "--prevalence-cutoff"), type="numeric", default=0.15, + help="If --remove-rare, a numerical fraction between 0 and 1. Taxa with prevalences + (the proportion of samples in which the taxon is present) less + than --prevalence-cutoff will be excluded in the analysis. + Default is 0.15, i.e. exclude taxa / features that are not present + in at least 15% of the samples.", + metavar="0.15"), + + make_option(c("-l", "--library-cutoff"), type="numeric", default=100, + help="If --remove-rare, a numerical threshold for filtering samples based on library + sizes. Samples with library sizes less than lib_cut will be + excluded in the analysis. Default is 100. + if you do not want to discard any sample then set to 0.", + metavar="100"), + + make_option(c("-e", "--legend-title"), type="character", default="Groups", help="Legend title for alpha diversity plots.", metavar="Groups"), @@ -94,7 +105,7 @@ opt_parser <- OptionParser( description = paste("Author: Olabiyi Aderemi Obayomi", "\nEmail: olabiyi.a.obayomi@nasa.gov", "\n A script to perform ASV beta diversity analysis.", - "\nIt outputs a dendogram, pcoa and statistics tables. ", + "\nIt outputs a dendograms, pcoas and statistics tables. ", sep="") ) @@ -125,7 +136,7 @@ if(is.null(opt[["taxonomy-table"]])) { } if(opt[["group"]] == "groups") { - message("Alpha diversity will be run on the default 'groups' column \n") + message("Beta diversity will be run on the default 'groups' column \n") } if(opt[["samples-column"]] == "Sample Name") { @@ -141,29 +152,172 @@ library(DESeq2) library(ggdendro) library(RColorBrewer) library(broom) +library(ggrepel) library(tidyverse) # ----------------------------- Functions ----------------------------------- # +# A a function to create a phyloseq object with the appropriate +# sample count transformation depending on the supplied transformation method +# i.e. either 'rarefy' or 'vst' +transform_phyloseq <- function( feature_table, metadata, method, rarefaction_depth=500){ + # feature_table [DATAFRAME] ~ Feature / ASV count table with samples as columns and features as rows + # metadata [DATAFRAME] ~ Samples metadata with samples as row names + # method [STRING] ~ Distance transformation method to use. + # Either 'rarefy' or 'vst' for rarefaction and variance + # stabilizing transformation, respectively. 
+ # rarefaction_depth [INT] ~ Sample rarefaction to even depth when method is 'bray' + + if(method == 'rarefy'){ + # Create phyloseq object + ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), + sample_data(metadata)) + + + seq_per_sample <- colSums(feature_table) %>% sort() + # Minimum value + depth <- min(seq_per_sample) + + for (count in seq_per_sample) { + # Get the count equal to rarefaction_depth or nearest to it + if(count >= rarefaction_depth) { + depth <- count + break + } + + } + + #----- Rarefy sample counts to even depth per sample + ps <- rarefy_even_depth(physeq = ASV_physeq, + sample.size = depth, + rngseed = 1, + replace = FALSE, + verbose = FALSE) + + }else if(method == "vst"){ + + # Using deseq + # Keep only ASVs with at least 1 count + feature_table <- feature_table[rowSums(feature_table) > 0, ] + # Add +1 pseudocount for VST for vst transformation + feature_table <- feature_table + 1 + + # Make the order of samples in metadata match the order in feature table + metadata <- metadata[colnames(feature_table),] + + # Create VST normalized counts matrix + # ~1 means no design + deseq_counts <- DESeqDataSetFromMatrix(countData = feature_table, + colData = metadata, + design = ~1) + deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts) + vst_trans_count_tab <- assay(deseq_counts_vst) + + # Making a phyloseq object with our transformed table + vst_count_phy <- otu_table(object = vst_trans_count_tab, taxa_are_rows = TRUE) + sample_info_tab_phy <- sample_data(metadata) + ps <- phyloseq(vst_count_phy, sample_info_tab_phy) + }else{ + + stop("Please supply a valid normalization method, either 'rarefy' or 'vst' ") + } + + + return(ps) +} + +# ----------- Hierarchical Clustering and dendogram plotting +make_dendogram <- function(dist_obj, metadata, groups_colname, + group_colors, legend_title){ + + + sample_clust <- hclust(d = dist_obj, method = "ward.D2") + + # Extract clustering data + hcdata <- dendro_data(sample_clust, type = "rectangle") + segment_data <- segment(hcdata) + label_data <- label(hcdata) %>% + left_join(metadata %>% + rownames_to_column("label")) + + dendogram <- ggplot() + + geom_segment(data = segment_data, + aes(x = x, y = y, xend = xend, yend = yend) + ) + + geom_text(data = label_data , + aes(x = x, y = y, label = label, + color = !!sym(groups_colname) , hjust = 0), + size = 4.5, key_glyph = "rect") + + scale_color_manual(values = group_colors) + + coord_flip() + + scale_y_reverse(expand = c(0.2, 0)) + + labs(color = legend_title) + + theme_dendro() + + guides(colour = guide_legend(override.aes = list(size = 5)))+ + theme(legend.key = element_rect(fill=NA), + text = element_text(face = 'bold'), + legend.title = element_text(size = 12, face='bold'), + legend.text = element_text(face = 'bold', size = 11)) + + + return(dendogram) + +} + +# Run variance test and adonis test +run_stats <- function(dist_obj, metadata, groups_colname){ + + samples <- attr(dist_obj, "Label") + metadata <- metadata[samples,] + variance_test <- betadisper(d = dist_obj, + group = metadata[[groups_colname]]) %>% + anova() %>% + broom::tidy() %>% + mutate(across(where(is.numeric), ~round(.x, digits = 2))) + + + adonis_res <- adonis2(formula = dist_obj ~ metadata[[groups_colname]]) + adonis_test <- adonis_res %>% + broom::tidy() %>% + mutate(across(where(is.numeric), ~round(.x, digits = 2))) + + return(list(variance = variance_test, adonis = adonis_test)) +} -plot_pcoa <- function(phy, pcoa, addtext=FALSE) { +# Make PCoA +plot_pcoa <- function(ps, stats_res, 
distance_method, + groups_colname, sample_colname, + group_colors, legend_title, + addtext=FALSE) { + + + + # Generating a PCoA with phyloseq + pcoa <- ordinate(physeq = ps, method = "PCoA", distance = distance_method) + eigen_vals <- pcoa$values$Eigenvalues - p <- plot_ordination(phy, pcoa, - color = groups_colname) + + # Calculate the percentage of variance + percent_variance <- eigen_vals / sum(eigen_vals) * 100 + + # Retrieving plot labels + r2_value <- stats_res$adonis[["R2"]][1] + prf_value <- stats_res$adonis[["p.value"]][1] + label_PC1 <- sprintf("PC1 [%.1f%%]", percent_variance[1]) + label_PC2 <- sprintf("PC2 [%.1f%%]", percent_variance[2]) + + p <- plot_ordination(ps, pcoa, color = groups_colname) + geom_point(size = 1) if(addtext){ - p <- p + geom_text(aes(label = sample_names), - show.legend = FALSE, + sample_colname <- make.names(sample_colname) + sample_names <- p$data[[sample_colname]] + p <- p + geom_text(aes(label = sample_names), show.legend = FALSE, hjust = 0.3, vjust = -0.4, size = 4) } - p + - labs( - x = label_PC1, - y = label_PC2, - col = legend_title - ) + + + + p <- p + labs(x = label_PC1, y = label_PC2, color = legend_title) + coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + scale_color_manual(values = group_colors) + theme_bw() + theme(text = element_text(size = 15, face="bold"), @@ -176,6 +330,9 @@ plot_pcoa <- function(phy, pcoa, addtext=FALSE) { annotate("text", x = Inf, y = -Inf, label = paste("Pr(>F)", toString(round(prf_value,4))), hjust = 1.1, vjust = -0.5, size = 4) + ggtitle("PCoA") + + + return(p) } remove_rare_features <- function(feature_table, cut_off_percent=3/4){ @@ -232,8 +389,12 @@ sample_colname <- opt[["samples-column"]] output_prefix <- opt[["output-prefix"]] assay_suffix <- opt[["assay-suffix"]] legend_title <- opt[["legend-title"]] +rarefaction_depth <- opt[["rarefaction-depth"]] remove_rare <- opt[["remove-rare"]] -abundance_cutoff <- opt[["abundance-cutoff"]] +# taxon / ASV prevalence cutoff +prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%) +# sample / library read count cutoff +library_cutoff <- opt[["library-cutoff"]] # 100 # Read in processed data @@ -265,7 +426,7 @@ group_colors <- setNames(colors, group_levels) metadata <- metadata %>% mutate(color = map_chr(!!sym(groups_colname), function(group) { group_colors[group] } - ) + ) ) sample_names <- rownames(metadata) deseq2_sample_names <- make.names(sample_names, unique = TRUE) @@ -280,15 +441,20 @@ sample_info_tab <- metadata %>% # Feature or ASV table feature_table <- read.table(file = features_file, header = TRUE, row.names = 1, sep = "\t") + +# ----------------- Preprocess ASV and taxonomy tables if(remove_rare){ + + # Remove samples with less than library-cutoff + message(glue("Dropping samples with less than {library_cutoff} read counts")) + feature_table <- feature_table[,colSums(feature_table) >= library_cutoff] # Remove rare ASVs + message(glue("Dropping features with prevalence less than {prevalence_cutoff * 100}%")) feature_table <- remove_rare_features(feature_table, - cut_off_percent=abundance_cutoff) + cut_off_percent = prevalence_cutoff) } -# Preprocess ASV and taxonomy tables - # Taxonomy taxonomy_table <- read.table(file = taxonomy_file, header = TRUE, row.names = 1, sep = "\t") @@ -317,113 +483,61 @@ common_ids <- intersect(rownames(feature_table), rownames(taxonomy_table)) feature_table <- feature_table[common_ids,] taxonomy_table <- taxonomy_table[common_ids,] +distance_methods <- c("euclidean", "bray") # "bray" # "euclidean" +normalization_methods <- 
c("vst", "rarefy") +legend_title <- NULL +options(warn=-1) # ignore warnings +# Run the analysis +walk2(.x = normalization_methods, .y = distance_methods, + .f = function(normalization_method, distance_method){ + +# Create transformed phyloseq object +ps <- transform_phyloseq(feature_table, metadata, + method = normalization_method, + rarefaction_depth = rarefaction_depth) -# Using deseq - - -# Keep only ASVs with at least 1 count -feature_table <- feature_table[rowSums(feature_table) > 0, ] -# Add +1 pseudocount for VST for vst transformation -feature_table <- feature_table + 1 - -# Make the order of samples in metadata match the order in feature table -metadata <- metadata[colnames(feature_table),] - -# Create VST normalized counts matrix -# ~1 means no design -deseq_counts <- DESeqDataSetFromMatrix(countData = feature_table, - colData = metadata, - design = ~1) -deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts) -vst_trans_count_tab <- assay(deseq_counts_vst) +# ---------Clustering and dendogram plotting +# Extract normalized count table +count_tab <- otu_table(ps) +# Calculate distance between samples +dist_obj <- vegdist(t(count_tab), method = distance_method) +# Make dendogram +dendogram <- make_dendogram(dist_obj, metadata, groups_colname, + group_colors, legend_title) -# ----------- Hierarchical Clustering and dendogram plotting -euc_dist <- dist(t(vst_trans_count_tab)) -euc_clust <- hclust(d = euc_dist, method = "ward.D2") - -# Extract clustering data -hcdata <- dendro_data(euc_clust, type = "rectangle") -segment_data <- segment(hcdata) -label_data <- label(hcdata) %>% - left_join(sample_info_tab %>% - rownames_to_column("label")) - -dendogram <- ggplot() + - geom_segment(data = segment_data, - aes(x = x, y = y, xend = xend, yend = yend) - ) + - geom_text(data = label_data , - aes(x = x, y = y, label = label, - color = !!sym(groups_colname) , hjust = 0), - size = 4.5, key_glyph = "rect") + - scale_color_manual(values = group_colors) + - coord_flip() + - scale_y_reverse(expand = c(0.2, 0)) + - labs(color = legend_title) + - theme_dendro() + - guides(colour = guide_legend(override.aes = list(size = 5)))+ - theme(legend.key=element_rect(fill=NA), - text = element_text(face = 'bold'), - legend.title = element_text(size = 12, face='bold'), - legend.text = element_text(face = 'bold', size = 11)) - -ggsave(filename = glue("{beta_diversity_out_dir}/{output_prefix}dendrogram_by_group{assay_suffix}.png"), +# Save dendogram +ggsave(filename = glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_dendrogram{assay_suffix}.png"), plot = dendogram, width = 14, height = 10, dpi = 300, units = "in") -##### Making PCoA - -# Making a phyloseq object with our transformed table -vst_count_phy <- otu_table(object = vst_trans_count_tab, taxa_are_rows = TRUE) -sample_info_tab_phy <- sample_data(sample_info_tab) -vst_physeq <- phyloseq(vst_count_phy, sample_info_tab_phy) - -# generating a PCoA with phyloseq -vst_pcoa <- ordinate(physeq = vst_physeq, - method = "PCoA", - distance = "euclidean") -eigen_vals <- vst_pcoa$values$Eigenvalues - -# Calculate the percentage of variance -percent_variance <- eigen_vals / sum(eigen_vals) * 100 - -message("Checking homogeneity of variance across groups") -variance_test <- betadisper(d = euc_dist, group = sample_info_tab[[groups_colname]]) %>% - anova() %>% - broom::tidy() %>% - mutate(across(where(is.numeric), ~round(.x, digits = 2))) - -write_csv(x = variance_test, - file = 
glue("{beta_diversity_out_dir}/{output_prefix}variance_table{assay_suffix}.csv")) - - - -adonis_res <- adonis2(formula = euc_dist ~ sample_info_tab[[groups_colname]]) -adonis_test <- adonis_res %>% - broom::tidy() %>% - mutate(across(where(is.numeric), ~round(.x, digits = 2))) -write_csv(x = adonis_test, - file = glue("{beta_diversity_out_dir}/{output_prefix}adonis_table{assay_suffix}.csv")) - - -# Retrieving plot labels -r2_value <- adonis_res$R2[1] -prf_value <- adonis_res$`Pr(>F)`[1] -label_PC1 <- sprintf("PC1 [%.1f%%]", percent_variance[1]) -label_PC2 <- sprintf("PC2 [%.1f%%]", percent_variance[2]) +#---------------------------- Run stats +# Checking homogeneity of variance and comparing groups using adonis test +stats_res <- run_stats(dist_obj, metadata, groups_colname) +write_csv(x = stats_res$variance, + file = glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_variance_table{assay_suffix}.csv")) +write_csv(x = stats_res$adonis, + file = glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_adonis_table{assay_suffix}.csv")) +#---------------------------- Make PCoA # Unlabeled PCoA plot -ordination_plot_u <- plot_pcoa(vst_physeq, vst_pcoa) -ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}PCoA_without_labels{assay_suffix}.png"), +ordination_plot_u <- plot_pcoa(ps, stats_res, distance_method, + groups_colname, sample_colname, + group_colors, legend_title) +ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_PCoA_without_labels{assay_suffix}.png"), plot=ordination_plot_u, width = 14, height = 8.33, dpi = 300, units = "in") # Labeled PCoA plot -ordination_plot <- plot_pcoa(vst_physeq, vst_pcoa, addtext=TRUE) -ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}PCoA_w_labels{assay_suffix}.png"), +ordination_plot <- plot_pcoa(ps, stats_res, distance_method, + groups_colname, sample_colname, + group_colors, legend_title, + addtext=TRUE) +ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_PCoA_w_labels{assay_suffix}.png"), plot=ordination_plot, width = 14, height = 8.33, dpi = 300, units = "in") + +}) \ No newline at end of file diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc1.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc1.R index 23ddb63b..663289d3 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc1.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc1.R @@ -59,6 +59,11 @@ option_list <- list( This name will be used to name the feature column in the final table.", metavar="ASV"), + make_option(c("-r", "--target-region"), type="character", default="16S", + help="Amplicon target region. 
Options are either 16S, 18S or ITS \
+              Default: 16S",
+              metavar="16S"),
+
   make_option(c("-c", "--cpus"), type="numeric", default=1,
               help="Number of cpus to use for parallel processing.",
               metavar="1"),

@@ -258,9 +263,17 @@ fix_names<- function(taxonomy,stringToReplace,suffix){
 taxize_options(ncbi_sleep = 0.8)
 # A function to retrieve the NCBI taxonomy id for a given taxonomy name
-get_ncbi_ids <- function(taxonomy){
+get_ncbi_ids <- function(taxonomy, target_region){
+
+  if(target_region == "ITS"){
+    search_string <- "fungi"
+  }else if(target_region == "18S"){
+    search_string <- "eukaryote"
+  }else{
+    search_string <- "bacteria"
+  }

-  uid <- get_uid(taxonomy, division_filter = "bacteria")
+  uid <- get_uid(taxonomy, division_filter = search_string)

   tax_ids <- uid[1:length(uid)]

@@ -293,6 +306,7 @@ metadata_file <- opt[["metadata-table"]]
 taxonomy_file <- opt[["taxonomy-table"]]
 feature_table_file <- opt[["feature-table"]]
 feature <- opt[["feature-type"]] # "ASV"
+target_region <- opt[["target-region"]] # 16S
 output_prefix <- opt[["output-prefix"]]
 assay_suffix <- opt[["assay-suffix"]]

@@ -300,8 +314,8 @@ assay_suffix <- opt[["assay-suffix"]]
 prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%)
 # sample / library read count cutoff
 library_cutoff <- opt[["library-cutoff"]] # 100
-diff_abund_out_dir <- "differential_abundance/"
-if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir)
+diff_abund_out_dir <- "differential_abundance/ancombc1/"
+if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE)

 # ------------------------ Read metadata ---------------------------------- #

@@ -332,11 +346,29 @@ feature_names <- rownames(taxonomy_table)
 taxonomy_table <- process_taxonomy(taxonomy_table)
 rownames(taxonomy_table) <- feature_names

-print(glue("There are {sum(taxonomy_table$domain == 'Other')} features without
+print(glue("There are {sum(taxonomy_table$phylum == 'Other')} features without
            taxonomy assignments. Dropping them ..."))

 # Dropping features that couldn't be assigned taxonomy
-taxonomy_table <- taxonomy_table[-which(taxonomy_table$domain == 'Other'),]
+taxonomy_table <- taxonomy_table[-which(taxonomy_table$phylum == 'Other'),]
+
+# Handle case where no domain was assigned but a phylum was.
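+# Note: the check below reads the unprocessed `taxonomy` table rather than
+# `taxonomy_table`, because process_taxonomy() has already replaced NA ranks
+# with "Other". If no feature was assigned a domain, a domain consistent with
+# the amplicon target region (--target-region) is assumed and filled in.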
+if(all(is.na(taxonomy$domain))){ + + if(target_region == "ITS"){ + + taxonomy_table$domain <- "Fungi" + + }else if(target_region == "18S"){ + + taxonomy_table$domain <- "Eukaryotes" + + }else{ + + taxonomy_table$domain <- "Bacteria" + } + +} # Removing Chloroplast and Mitochondria Organelle DNA contamination asvs2drop <- taxonomy_table %>% @@ -554,7 +586,7 @@ volcano_plots <- map(comp_names, function(comparison){ p <- ggplot(sub_res_df, aes(x=logFC, y=-log10(pvalue), color=diff, label=!!sym(feature))) + geom_point(size=4) + - scale_color_manual(values=c("TRUE"="cyan2", "FALSE"="red")) + + scale_color_manual(values=c("TRUE"="red", "FALSE"="black")) + geom_hline(yintercept = -log10(0.05), linetype = "dashed") + ggrepel::geom_text_repel() + labs(x="logFC", y="-log10(Pvalue)", @@ -588,9 +620,15 @@ tax_names <- map_chr(str_replace_all(taxonomy_table$species, ";_","") %>% df <- data.frame(ASV=rownames(taxonomy_table), best_taxonomy=tax_names) +# Pull NCBI IDS for unique taxonomy names +df2 <- data.frame(best_taxonomy = df$best_taxonomy %>% + unique()) %>% + mutate(NCBI_id=get_ncbi_ids(best_taxonomy, target_region), + .after = best_taxonomy) + df <- df %>% - right_join(merged_stats_df) %>% - mutate(NCBI_id=get_ncbi_ids(best_taxonomy), .after = best_taxonomy) + left_join(df2, join_by("best_taxonomy")) %>% + right_join(merged_stats_df) # Manually creating a normalized table because normalized diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc2.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc2.R index 389635a5..0957558f 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc2.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/pairwise_ancombc2.R @@ -55,9 +55,15 @@ option_list <- list( help="Column in metadata containing the sample names in the feature table", metavar="Sample Name"), + make_option(c("-r", "--target-region"), type="character", default="16S", + help="Amplicon target region. Options are either 16S, 18S or ITS \ + Default: 16S", + metavar="16S"), + make_option(c("-a", "--feature-type"), type="character", default="ASV", help="What feature counts are in the feature table i.e ASV, OTU etc. - This name will be used to name the feature column in the final table.", + This name will be used to name the feature column in the final table. \ + Default: ASV", metavar="ASV"), make_option(c("-c", "--cpus"), type="numeric", default=1, @@ -314,9 +320,17 @@ ancombc2 <- function(data, ...) 
{
 taxize_options(ncbi_sleep = 0.8)
 # A function to retrieve the NCBI taxonomy id for a given taxonomy name
-get_ncbi_ids <- function(taxonomy){
+get_ncbi_ids <- function(taxonomy, target_region){
+
+  if(target_region == "ITS"){
+    search_string <- "fungi"
+  }else if(target_region == "18S"){
+    search_string <- "eukaryote"
+  }else{
+    search_string <- "bacteria"
+  }

-  uid <- get_uid(taxonomy, division_filter = "bacteria")
+  uid <- get_uid(taxonomy, division_filter = search_string)

   tax_ids <- uid[1:length(uid)]

@@ -349,6 +363,7 @@ metadata_file <- opt[["metadata-table"]]
 taxonomy_file <- opt[["taxonomy-table"]]
 feature_table_file <- opt[["feature-table"]]
 feature <- opt[["feature-type"]] # "ASV"
+target_region <- opt[["target-region"]] # 16S
 output_prefix <- opt[["output-prefix"]]
 assay_suffix <- opt[["assay-suffix"]]

@@ -356,8 +371,8 @@ assay_suffix <- opt[["assay-suffix"]]
 prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%)
 # sample / library read count cutoff
 library_cutoff <- opt[["library-cutoff"]] # 100
-diff_abund_out_dir <- "differential_abundance/"
-if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir)
+diff_abund_out_dir <- "differential_abundance/ancombc2/"
+if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE)

 # ------------------------ Read metadata ---------------------------------- #
 metadata <- read_csv(metadata_file) %>% as.data.frame()

@@ -387,11 +402,29 @@ feature_names <- rownames(taxonomy_table)
 taxonomy_table <- process_taxonomy(taxonomy_table)
 rownames(taxonomy_table) <- feature_names

-print(glue("There are {sum(taxonomy_table$domain == 'Other')} features without
+print(glue("There are {sum(taxonomy_table$phylum == 'Other')} features without
            taxonomy assignments. Dropping them ..."))

 # Dropping features that couldn't be assigned taxonomy
-taxonomy_table <- taxonomy_table[-which(taxonomy_table$domain == 'Other'),]
+taxonomy_table <- taxonomy_table[-which(taxonomy_table$phylum == 'Other'),]
+
+# Handle case where no domain was assigned but a phylum was.
+if(all(is.na(taxonomy$domain))){ + + if(target_region == "ITS"){ + + taxonomy_table$domain <- "Fungi" + + }else if(target_region == "18S"){ + + taxonomy_table$domain <- "Eukaryotes" + + }else{ + + taxonomy_table$domain <- "Bacteria" + } + +} # Removing Chloroplast and Mitochondria Organelle DNA contamination asvs2drop <- taxonomy_table %>% @@ -522,7 +555,7 @@ walk(uniq_comps, function(comp){ # Get the results for a comparison temp_df <- paired_stats_df %>% select(ASV, contains(comp)) - # Merge the current comparison to previous comparions by feature/ASV id + # Merge the current comparison to previous comparisons by feature/ASV id res_df <<- res_df %>% left_join(temp_df) }) @@ -538,9 +571,15 @@ tax_names <- map_chr(str_replace_all(taxonomy_table$species, ";_","") %>% df <- data.frame(ASV=rownames(taxonomy_table), best_taxonomy=tax_names) message("Querying NCBI...") +# Pull NCBI IDS for unique taxonomy names +df2 <- data.frame(best_taxonomy = df$best_taxonomy %>% + unique()) %>% + mutate(NCBI_id=get_ncbi_ids(best_taxonomy, target_region), + .after = best_taxonomy) + df <- df %>% - right_join(res_df) %>% - mutate(NCBI_id=get_ncbi_ids(best_taxonomy), .after = best_taxonomy) + left_join(df2, join_by("best_taxonomy")) %>% + right_join(res_df) # Retrieve the normalized table @@ -637,7 +676,7 @@ volcano_plots <- map(uniq_comps, function(comparison){ p <- ggplot(sub_res_df, aes(x=logFC, y=-log10(pvalue), color=diff, label=!!sym(feature))) + geom_point(size=4) + geom_point(size=4) + - scale_color_manual(values=c("TRUE"="cyan2", "FALSE"="red")) + + scale_color_manual(values=c("TRUE"="red", "FALSE"="black")) + geom_hline(yintercept = -log10(0.05), linetype = "dashed") + ggrepel::geom_text_repel() + labs(x="logFC", y="-log10(Pvalue)", diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/plot_taxonomy.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/plot_taxonomy.R index 717950cd..a5471943 100755 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/plot_taxonomy.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/plot_taxonomy.R @@ -48,20 +48,30 @@ option_list <- list( Deafault: 'Sample Name' ", metavar="Sample Name"), - make_option(c("-p", "--output-prefix"), type="character", default="", + make_option(c("-o", "--output-prefix"), type="character", default="", help="Unique name to tag onto output files. Default: empty string.", metavar=""), - make_option(c("-c", "--abundance-cutoff"), type="numeric", default=0.2, - help="A fraction defining how abundant features most be to be \ - analyzes. Default: 1/5. ", - metavar="0.2"), - make_option(c("-r", "--remove-rare"), type="logical", default=FALSE, help="Should rare features be filtered out?. \ Default: FALSE. ", action= "store_true", metavar="FALSE"), + make_option(c("-p", "--prevalence-cutoff"), type="numeric", default=0.15, + help="If --remove-rare, a numerical fraction between 0 and 1. Taxa with prevalences + (the proportion of samples in which the taxon is present) less + than --prevalence-cutoff will be excluded in the analysis. + Default is 0.15, i.e. exclude taxa / features that are not present + in at least 15% of the samples.", + metavar="0.15"), + + make_option(c("-l", "--library-cutoff"), type="numeric", default=100, + help="If --remove-rare, a numerical threshold for filtering samples based on library + sizes. Samples with library sizes less than lib_cut will be + excluded in the analysis. Default is 100. 
+ if you do not want to discard any sample then set to 0.", + metavar="100"), + make_option(c("-a", "--assay-suffix"), type="character", default="_GLAmpSeq", help="Genelab assay suffix.", metavar="GLAmpSeq"), @@ -92,7 +102,7 @@ opt <- parse_args(opt_parser) if (opt$version) { - cat("taxonomy_plots.R version: ", version, "\n") + cat("plot_taxonomy.R version: ", version, "\n") options_tmp <- options(show.error.messages=FALSE) on.exit(options(options_tmp)) stop() @@ -114,7 +124,7 @@ if(is.null(opt[["taxonomy-table"]])) { } if(opt[["group"]] == "groups") { - message("Alpha diversity will be run on the default 'groups' column \n") + message("Taxonomy plots will be grouped by the default 'groups' column \n") } if(opt[["samples-column"]] == "Sample Name") { @@ -364,16 +374,18 @@ custom_palette <- custom_palette[-c(21:23, grep(pattern = pattern_to_filter, metadata_file <- opt[["metadata-table"]] features_file <- opt[["feature-table"]] taxonomy_file <- opt[["taxonomy-table"]] -taxonomy_plots_out_dir <- "taxonomy_plots/" -if(!dir.exists(taxonomy_plots_out_dir)) dir.create(taxonomy_plots_out_dir) # Metadata group column name to compare groups_colname <- opt[["group"]] sample_colname <- opt[["samples-column"]] output_prefix <- opt[["output-prefix"]] assay_suffix <- opt[["assay-suffix"]] remove_rare <- opt[["remove-rare"]] -abundance_cutoff <- opt[["abundance-cutoff"]] - +# taxon / ASV prevalence cutoff +prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%) +# sample / library read count cutoff +library_cutoff <- opt[["library-cutoff"]] # 100 +taxonomy_plots_out_dir <- "taxonomy_plots/" +if(!dir.exists(taxonomy_plots_out_dir)) dir.create(taxonomy_plots_out_dir) # Read in processed data @@ -388,11 +400,17 @@ feature_table <- read.table(file = features_file, header = TRUE, row.names = 1, sep = "\t") if(remove_rare){ + + # Remove samples with less than library-cutoff + message(glue("Dropping samples with less than {library_cutoff} read counts")) + feature_table <- feature_table[,colSums(feature_table) >= library_cutoff] # Remove rare ASVs + message(glue("Dropping features with prevalence less than {prevalence_cutoff * 100}%")) feature_table <- remove_rare_features(feature_table, - cut_off_percent=abundance_cutoff) + cut_off_percent = prevalence_cutoff) } + # Taxonomy taxonomy_table <- read.table(file = taxonomy_file, header = TRUE, row.names = 1, sep = "\t") diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/run_deseq2.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/run_deseq2.R index 0de06145..f1417e2d 100755 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/run_deseq2.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/run_deseq2.R @@ -1,10 +1,10 @@ #!/usr/bin/env Rscript ############################################################################### # AUTHOR : OLABIYI ADEREMI OBAYOMI -# DESCRIPTION: A script to generate taxonomy plots at different taxonomy levels +# DESCRIPTION: A script to perform differential abundance testing using DESeq2 # E-mail: obadbotanist@yahoo.com -# Created: November 2024 -# example: Rscript taxonomy_plots.R \ +# Created: December 2024 +# Example: Rscript run_deseq2.R \ # --metadata-table 'mapping/GLDS-487_amplicon_v1_runsheet.csv' \ # --feature-table 'data/counts_GLAmpSeq.tsv' \ # --taxonomy-table 'data/taxonomy_GLAmpSeq.tsv' \ @@ -25,7 +25,7 @@ option_list <- list( make_option(c("-m", "--metadata-table"), type="character", 
default=NULL, help="path to a comma separated samples metadata file with the - group/treatment to be analyzed.", + group/treatment to be compared.", metavar="path"), make_option(c("-f", "--feature-table"), type="character", default=NULL, @@ -33,16 +33,26 @@ option_list <- list( i.e. ASV or OTU table.", metavar="path"), + make_option(c("-a", "--feature-type"), type="character", default="ASV", + help="What feature type is in the feature table i.e ASV, OTU etc. + This will be used to name the feature column in the output table.", + metavar="ASV"), + + make_option(c("-R", "--target-region"), type="character", default="16S", + help="Amplicon target region. Options are either 16S, 18S or ITS \ + Default: 16S", + metavar="16S"), + make_option(c("-t", "--taxonomy-table"), type="character", default=NULL, help="path to feature taxonomy table i.e. ASV taxonomy table.", metavar="path"), make_option(c("-g", "--group"), type="character", default="groups", - help="Column in metadata to be analyzed", + help="Column in metadata to be analyzed / compared.", metavar="groups"), make_option(c("-s", "--samples-column"), type="character", default="Sample Name", - help="Column in metadata containing the sample names in the feature table. \ + help="Column in metadata containing sample names in the feature table. \ Deafault: 'Sample Name' ", metavar="Sample Name"), @@ -50,19 +60,30 @@ option_list <- list( help="Unique name to tag onto output files. Default: empty string.", metavar=""), - make_option(c("-c", "--abundance-cutoff"), type="numeric", default=0.2, - help="A fraction defining how abundant features most be to be \ - analyzes. Default: 1/5. ", - metavar="0.2"), - make_option(c("-r", "--remove-rare"), type="logical", default=FALSE, - help="Should rare features be filtered out?. \ - Default: FALSE. ", action= "store_true", - metavar="FALSE"), + help="Should rare features be filtered out?. If this flag is set \ + then rare features will be filtered out. Default: FALSE. ", + action= "store_true", metavar="FALSE"), make_option(c("-y", "--assay-suffix"), type="character", default="_GLAmpSeq", help="Genelab assay suffix.", metavar="GLAmpSeq"), + make_option(c("-p", "--prevalence-cutoff"), type="numeric", default=0.15, + help="If --remove-rare, a numerical fraction between 0 and 1. Taxa with prevalences + (the proportion of samples in which the taxon is present) less + than --prevalence-cutoff will be excluded in the analysis. + Default is 0.15, i.e. exclude taxa / features that are not present + in at least 15% of the samples.", + metavar="0.15"), + + make_option(c("-l", "--library-cutoff"), type="numeric", default=100, + help="If --remove-rare, a numerical threshold for filtering samples based on library + sizes. Samples with library sizes less than lib_cut will be + excluded in the analysis. Default is 100. + if you do not want to discard any sample then set to 0.", + metavar="100"), + + make_option(c("--version"), action = "store_true", type="logical", default=FALSE, help="Print out version number and exit.", metavar = "boolean") @@ -79,18 +100,17 @@ opt_parser <- OptionParser( --samples-column 'Sample Name' ", description = paste("Author: Olabiyi Aderemi Obayomi", "\nEmail: olabiyi.a.obayomi@nasa.gov", - "\nA script to generate taxonomy plots at different taxonomy levels.", - "\nIt outputs sample and group taxonomy plots ", sep="") + "\nA script to perform differential abundance using DESeq2.", + "\nIt outputs a differential abundance statistics table \ + and volcano plots. 
", sep="") ) - - opt <- parse_args(opt_parser) if (opt$version) { - cat("taxonomy_plots.R version: ", version, "\n") + cat("run_deseq.R version: ", version, "\n") options_tmp <- options(show.error.messages=FALSE) on.exit(options(options_tmp)) stop() @@ -112,43 +132,155 @@ if(is.null(opt[["taxonomy-table"]])) { } if(opt[["group"]] == "groups") { - message("Alpha diversity will be run on the default 'groups' column \n") + message("Differential abundance testing will be run on the default 'groups' column \n") } if(opt[["samples-column"]] == "Sample Name") { message("I will assume that the sample names are in a column named 'Sample Name' \n") } -library(tidyverse) -library(dendextend) + +library(glue) +library(phyloseq) library(DESeq2) +library(taxize) +library(ggrepel) +library(tidyverse) + +# --------------------------- Functions -------------------------------------# + +# Define the geometric mean function +gm_mean <- function(x, na.rm=TRUE) { + exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) +} + + +remove_rare_features <- function(feature_table, cut_off_percent=3/4){ + + # feature_table [MATRIX] feature table matrix with samples as columns and + # features as rows + # cut_off_percent [NUMERIC] cut-off fraction or decimal between 0.001 to 1 + # of the total number of samples to determine the + # most abundant features. By default it removes + # features that are not present in 3/4 of the total + # number of samples + + # Filter by occurrence in a fraction of samples + # Define a cut-off for determining what's rare + cut_off <- cut_off_percent * ncol(feature_table) + # Get the occurrence for each feature + feature_occurence <- rowSums(feature_table > 0) + # Get names of the abundant features + abund_features <- names(feature_occurence[feature_occurence >= cut_off]) + # Remove rare features + abun_features.m <- feature_table[abund_features,] + return(abun_features.m) +} + + +process_taxonomy <- function(taxonomy, prefix='\\w__') { + #function to process a metaphlan2 taxonopmy assigment table + #1. 
~ file_path is a string specifying the taxonomic assignment file name + #2 prefix ~ is a regular expression specifying the characters to remove + # from the taxon names '\\w__' for greengenes and 'D_\\d__' for SILVA + + #taxon_levels <- c("kingdom","phylum","class","order", + # "family","genus","species", "strain") + + taxonomy <- apply(X = taxonomy, MARGIN = 2, FUN = as.character) + + #taxonomy[,'species'] <- paste(taxonomy[,'genus'],taxonomy[,'species']) + # replace NAa with Other and delete the D_num__ prefix from the taxonomy names + for (rank in colnames(taxonomy)) { + #delete the taxonomy prefix + taxonomy[,rank] <- gsub(pattern = prefix, x = taxonomy[, rank], + replacement = '') + indices <- which(is.na(taxonomy[,rank])) + taxonomy[indices, rank] <- rep(x = "Other", times=length(indices)) + #replace empty cell + indices <- which(taxonomy[,rank] == "") + taxonomy[indices,rank] <- rep(x = "Other", times=length(indices)) + } + taxonomy <- apply(X = taxonomy,MARGIN = 2, + FUN = gsub,pattern = "_",replacement = " ") %>% + as.data.frame(stringAsfactor=F) + return(taxonomy) +} + + + +# Function for format a taxonomy assignment table by appending suffix +# to a known name +format_taxonomy_table <- function(taxonomy=taxonomy.m,stringToReplace="Other", + suffix=";Other") { + + for (taxa_index in seq_along(taxonomy)) { + #indices <- which(taxonomy[,taxa_index] == stringToReplace) + + indices <- grep(x = taxonomy[,taxa_index], pattern = stringToReplace) + + taxonomy[indices,taxa_index] <- + paste0(taxonomy[indices,taxa_index-1], + rep(x = suffix, times=length(indices))) + + } + return(taxonomy) +} + +fix_names<- function(taxonomy,stringToReplace,suffix){ + #1~ taxonomy is a taxonomy dataframe with taxonomy ranks as column names + #2~ stringToReplace is a vector of regex strings specifying what to replace + #3~ suffix is a string specifying the replacement value + + + for(index in seq_along(stringToReplace)){ + taxonomy <- format_taxonomy_table(taxonomy = taxonomy, + stringToReplace=stringToReplace[index], + suffix=suffix[index]) + } + return(taxonomy) +} +taxize_options(ncbi_sleep = 0.8) +# A function to retrieve the NCBI taxonomy id for a given taxonomy name +get_ncbi_ids <- function(taxonomy, target_region){ + + if(target_region == "ITS"){ + search_string <- "fungi" + }else if(target_region == "18S"){ + search_string <- "eukaryote" + }else{ + search_string <- "bacteria" + } + + uid <- get_uid(taxonomy, division_filter = search_string) + + tax_ids <- uid[1:length(uid)] + + return(tax_ids) + +} + # ------ Collecting the required input variables ---------- # # Group in metadata to analyze group <- opt[["group"]] # "groups" samples_column <- opt[["samples-column"]] # "Sample Name" -threads <- opt[["cpus"]] # 8 metadata_file <- opt[["metadata-table"]] taxonomy_file <- opt[["taxonomy-table"]] feature_table_file <- opt[["feature-table"]] feature <- opt[["feature-type"]] # "ASV" +target_region <- opt[["target-region"]] # 16S output_prefix <- opt[["output-prefix"]] assay_suffix <- opt[["assay-suffix"]] - +remove_rare <- opt[["remove-rare"]] # taxon / ASV prevalence cutoff prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%) # sample / library read count cutoff library_cutoff <- opt[["library-cutoff"]] # 100 -diff_abund_out_dir <- "differential_abundance/" -if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir) - - -de_out_dir <- file.path(plots_dir, "da") -abundance_out_dir <- file.path(de_out_dir, "differential_abundance") -volcano_out_dir <- file.path(de_out_dir, "volcano") - 
+diff_abund_out_dir <- "differential_abundance/deseq2/" +if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE) # ------------------------ Read metadata ---------------------------------- # @@ -168,6 +300,16 @@ feature_names <- feature_table[,1] feature_table <- feature_table[, -1] %>% as.data.frame() rownames(feature_table) <- feature_names +if(remove_rare){ + + # Remove samples with less than library-cutoff + message(glue("Dropping samples with less than {library_cutoff} read counts")) + feature_table <- feature_table[,colSums(feature_table) >= library_cutoff] + # Remove rare ASVs + message(glue("Dropping features with prevalence less than {prevalence_cutoff * 100}%")) + feature_table <- remove_rare_features(feature_table, + cut_off_percent = prevalence_cutoff) +} # ------------------------ Read Taxonomy table ---------------------------- # taxonomy <- read_delim(file = taxonomy_file) %>% as.data.frame() @@ -179,16 +321,41 @@ feature_names <- rownames(taxonomy_table) taxonomy_table <- process_taxonomy(taxonomy_table) rownames(taxonomy_table) <- feature_names -print(glue("There are {sum(taxonomy_table$domain == 'Other')} features without +message(glue("There are {sum(taxonomy_table$phylum == 'Other')} features without taxonomy assignments. Dropping them ...")) # Dropping features that couldn't be assigned taxonomy -taxonomy_table <- taxonomy_table[-which(taxonomy_table$domain == 'Other'),] +taxonomy_table <- taxonomy_table[-which(taxonomy_table$phylum == 'Other'),] + +# Handle case where no domain was assigned but a phylum wasn't. +if(all(is.na(taxonomy$domain))){ + + if(target_region == "ITS"){ + + taxonomy_table$domain <- "Fungi" + + }else if(target_region == "18S"){ + + taxonomy_table$domain <- "Eukaryotes" + + }else{ + + taxonomy_table$domain <- "Bacteria" + } + +} + +# Removing Chloroplast and Mitochondria Organelle DNA contamination +asvs2drop <- taxonomy_table %>% + unite(col="taxonomy",domain:species) %>% + filter(str_detect(taxonomy, "[Cc]hloroplast|[Mn]itochondria")) %>% + row.names() +taxonomy_table <- taxonomy_table[!(rownames(taxonomy_table) %in% asvs2drop),] # Get long asv taxonomy names and clean species <- taxonomy_table %>% unite(species,domain:species,sep = ";") %>% # Generalize this line -------- - pull %>% str_replace_all("Other", "_") +pull %>% str_replace_all("Other", "_") taxonomy_table <- fix_names(taxonomy_table, "Other", ";_") @@ -205,17 +372,18 @@ common_ids <- intersect(rownames(feature_table), rownames(taxonomy_table)) feature_table <- feature_table[common_ids,] taxonomy_table <- taxonomy_table[common_ids,] +#### pairwise comparisons +unique_groups <- unique(metadata[[group]]) +# Create phyloseq object +ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), + tax_table(as.matrix(taxonomy_table)), + sample_data(metadata)) +deseq_obj <- phyloseq_to_deseq2(physeq = ASV_physeq, + design = reformulate(group)) - -# 6 Statistically testing for differences - -#### pairwise comparisons -unique_groups <- unique(runsheet$groups) -deseq_obj <- phyloseq_to_deseq2(physeq = ASV_physeq, design = ~groups) - -# add pseudocount if any 0 count samples are present +# Add pseudocount if any 0 count samples are present if (sum(colSums(counts(deseq_obj)) == 0) > 0) { count_data <- counts(deseq_obj) + 1 @@ -224,18 +392,17 @@ if (sum(colSums(counts(deseq_obj)) == 0) > 0) { colnames(count_data) <- colnames(counts(deseq_obj)) counts(deseq_obj) <- count_data } + +# Run Deseq # 
https://rdrr.io/bioc/phyloseq/src/inst/doc/phyloseq-mixture-models.R deseq_modeled <- tryCatch({ # Attempt to run DESeq DESeq(deseq_obj) }, error = function(e) { - message("Error encountered in DESeq, applying alternative method for size factor estimation...") + message("Error encountered in DESeq, applying alternative \ + method for size factor estimation...") - # Define the geometric mean function - gm_mean = function(x, na.rm=TRUE) { - exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) - } - geoMeans = apply(counts(deseq_obj), 1, gm_mean) + geoMeans <- apply(counts(deseq_obj), 1, gm_mean) # Apply the alternative size factor estimation method deseq_obj <- estimateSizeFactors(deseq_obj, geoMeans=geoMeans) @@ -244,44 +411,174 @@ deseq_modeled <- tryCatch({ DESeq(deseq_obj) }) -# save final differential abundance counts, individual group comparison results -write.table(counts(deseq_modeled, normalized=TRUE), - file = file.path(de_out_dir, paste0(output_prefix, - "normalized_counts", - assay_suffix, ".tsv")), - sep="\t", row.names=TRUE, quote=FALSE) -# make the volcanoplot -plot_comparison <- function(group1, group2) { - plot_width_inches = 11.1 - plot_height_inches = 8.33 +# Get unique group comparison as a matrix +pairwise_comp.m <- utils::combn(metadata[,group] %>% unique, 2) +pairwise_comp_df <- pairwise_comp.m %>% as.data.frame + +colnames(pairwise_comp_df) <- map_chr(pairwise_comp_df, + \(col) str_c(col, collapse = "v")) +comparisons <- colnames(pairwise_comp_df) +names(comparisons) <- comparisons + +# Retrieve statistics table +merged_stats_df <- data.frame(ASV=rownames(feature_table)) +colnames(merged_stats_df) <- feature + +walk(pairwise_comp_df, function(col){ + + group1 <- col[1] + group2 <- col[2] + +df <- results(deseq_modeled, contrast = c(group, group1, group2)) %>% + data.frame() %>% + rownames_to_column(feature) %>% + set_names(c(feature , + glue("baseMean_({group1})v({group2})"), + glue("log2FC_({group1})v({group2})"), + glue("lfcSE_({group1})v({group2})"), + glue("stat_({group1})v({group2})"), + glue("pvalue_({group1})v({group2})"), + glue("padj_({group1})v({group2})") + )) + + + merged_stats_df <<- merged_stats_df %>% + dplyr::left_join(df, join_by("ASV")) +}) + + + +# Add NCBI id to feature i.e. 
ASV +tax_names <- map_chr(str_replace_all(taxonomy_table$species, ";_","") %>% + str_split(";"), + function(row) row[length(row)]) + +df <- data.frame(ASV=rownames(taxonomy_table), best_taxonomy=tax_names) + +# Pull NCBI IDS for unique taxonomy names +df2 <- data.frame(best_taxonomy = df$best_taxonomy %>% + unique()) %>% + mutate(NCBI_id=get_ncbi_ids(best_taxonomy, target_region), + .after = best_taxonomy) + +df <- df %>% + left_join(df2, join_by("best_taxonomy")) %>% + right_join(merged_stats_df) + + + + +group_levels <- metadata[, group] %>% unique() %>% sort() +normalized_table <- counts(deseq_modeled, normalized=TRUE) %>% + as.data.frame() %>% + rownames_to_column(feature) + + +samples <- metadata[[samples_column]] +samplesdropped <- setdiff(x = samples, y = colnames(normalized_table)[-1]) +missing_df <- data.frame(ASV=normalized_table[[feature]], + matrix(data = NA, + nrow = nrow(normalized_table), + ncol = length(samplesdropped) + ) +) +colnames(missing_df) <- c(feature,samplesdropped) + + +group_means_df <- normalized_table[feature] +walk(group_levels, function(group_level){ + + + mean_col <- glue("Group.Mean_({group_level})") + std_col <- glue("Group.Stdev_({group_level})") + + # Samples that belong to the current group + Samples <- metadata %>% + filter(!!sym(group) == group_level) %>% + pull(!!sym(samples_column)) + # Samples that belong to the current group that are in the normalized table + Samples <- intersect(colnames(normalized_table), Samples) + + temp_df <- normalized_table %>% select(!!feature, all_of(Samples)) %>% + rowwise() %>% + mutate(!!mean_col := mean(c_across(where(is.numeric))), + !!std_col := sd(c_across(where(is.numeric))) ) %>% + select(!!feature,!!sym(mean_col), !!sym(std_col)) + + group_means_df <<- group_means_df %>% left_join(temp_df) + +}) + + +# Append Mean and standard deviation +normalized_table <- normalized_table %>% + rowwise() %>% + mutate(All.Mean=mean(c_across(where(is.numeric))), + All.Stdev=sd(c_across(where(is.numeric))) )%>% + left_join(missing_df, by = feature) %>% + select(!!feature, all_of(samples), All.Mean, All.Stdev) + + +# Add taxonomy +merged_df <- df %>% + left_join(taxonomy_table %>% + as.data.frame() %>% + rownames_to_column(feature)) %>% + select(!!feature, domain:species,everything()) # Try to generalize + +# Merge all prepared tables +merged_df <- merged_df %>% + select(!!sym(feature):NCBI_id) %>% + left_join(normalized_table, by = feature) %>% + left_join(merged_df) %>% + left_join(group_means_df, by = feature) %>% + mutate(across(where(is.numeric), ~round(.x, digits=3))) %>% + mutate(across(where(is.matrix), as.numeric)) + + +output_file <- glue("{diff_abund_out_dir}/{output_prefix}deseq2_differential_abundance{assay_suffix}.csv") +message("Writing out results of differential abundance using DESeq2...") +write_csv(merged_df,output_file) + + + +# Make volcano plots +walk(pairwise_comp_df, function(col){ - deseq_res <- results(deseq_modeled, contrast = c("groups", group1, group2)) - norm_tab <- counts(deseq_modeled, normalized = TRUE) %>% data.frame() + group1 <- col[1] + group2 <- col[2] + plot_width_inches <- 11.1 + plot_height_inches <- 8.33 + p_val <- 0.1 #also logfc cutoff? + + deseq_res <- results(deseq_modeled, contrast = c(group, group1, group2)) volcano_data <- as.data.frame(deseq_res) - p_val <- 0.1 + volcano_data <- volcano_data[!is.na(volcano_data$padj), ] volcano_data$significant <- volcano_data$padj <= p_val #also logfc cutoff? 
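+# A possible way to fold in an effect-size cutoff, per the TODO above -- an
+# illustrative sketch only; `lfc_cutoff` is an assumed threshold and not a
+# parameter of this script:
+# lfc_cutoff <- 1
+# volcano_data$significant <- volcano_data$padj <= p_val &
+#   abs(volcano_data$log2FoldChange) >= lfc_cutoff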
######Long x-axis label adjustments########## x_label <- paste("Log2 Fold Change\n(",group1," vs ",group2,")") label_length <- nchar(x_label) - max_allowed_label_length = plot_width_inches * 10 + max_allowed_label_length <- plot_width_inches * 10 # Construct x-axis label with new line breaks if was too long if (label_length > max_allowed_label_length){ x_label <- paste("Log2 Fold Change\n\n(", group1, "\n vs \n", group2, ")", sep="") } ####################################### - + # ASVs promoted in space on right, reduced on left - p <- ggplot(volcano_data, aes(x=log2FoldChange, y=-log10(padj), color=significant)) + + p <- ggplot(volcano_data, aes(x=log2FoldChange, y=-log10(padj), + color=significant)) + geom_point(alpha=0.7, size=2) + + geom_hline(yintercept = -log10(p_val), linetype = "dashed") + scale_color_manual(values=c("black", "red"), labels=c(paste0("padj > ", p_val), - paste0("padj \u2264 ", p_val))) + + paste0(" padj \u2264 ", p_val))) + theme_bw() + labs(title="Volcano Plot", x=x_label, @@ -295,25 +592,16 @@ plot_comparison <- function(group1, group2) { filter(significant) %>% head(10) - volcano_plot <- p + geom_text_repel(data=top_points, aes(label=row.names(top_points)), size=3) - ggsave(filename=file.path(volcano_out_dir, paste0(output_prefix, - "volcano_", - gsub(" ", "_", group1), - "_vs_", - gsub(" ", "_", group2), ".png")), - plot=volcano_plot, - width = plot_width_inches, height = plot_height_inches, dpi = 300) + volcano_plot <- p + geom_text_repel(data=top_points, + aes(label=row.names(top_points)), + size=3) - write.csv(deseq_res, file = file.path(abundance_out_dir, - paste0(output_prefix, - gsub(" ", "_", group1), - "_vs_", gsub(" ", "_", group2), - ".csv"))) -} - - -# setting up pairwise comparisons and running -comparisons <- expand.grid(group1 = unique_groups, group2 = unique_groups) -comparisons <- subset(comparisons, group1 != group2) - -apply(comparisons, 1, function(pair) plot_comparison(pair['group1'], pair['group2'])) + + + + ggsave(filename=glue("{diff_abund_out_dir}{output_prefix}volcano_{group1}_vs_{group2}.png"), + plot=volcano_plot, + width = plot_width_inches, + height = plot_height_inches, + dpi = 300) +}) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf index c96d638d..50d32ce5 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf @@ -27,12 +27,12 @@ if (params.help) { println(""" -profile [STRING] What profile should be used to run the workflow. Options are [singularity, docker, conda, slurm]. singularity, docker and conda will run the pipelne locally using singularity, docker, and conda, respectively. To combine profiles, pass them together separated by comma. For example, to run jobs using slurm in singularity containers use 'slurm,singularity' . """) - println(" --input_file [PATH] A 4-column (single-end) or 5-column (paired-end) input file (sample_id, forward, [reverse,] paired, groups). Mandatory if a GLDS or OSD accession is not provided. Default: null") + println(" --input_file [PATH] A 4-column (single-end) or 5-column (paired-end) input file (sample_id, forward, [reverse,] paired, groups). 
Mandatory if a GLDS or OSD accession is not provided.")
 println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.")
 println(" The sample_id column should contain unique sample ids.")
 println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.")
 println(" The paired column should be true for paired-end or anything else for single-end reads.")
-println(" The groups column contain group levels / treatments to be compared during diversity and differential abundance testing analysis.")
+println(" The groups column should contain the group levels / treatments to be compared during diversity and differential abundance testing analysis. Default: null")
 println(" --target_region [STRING] What is the amplicon target region to be analyzed. Options are one of [16S, 18S, ITS]. Default: 16S.")
 println(" --trim_primers [BOOLEAN] Should primers be trimmed? true or false. Default: true.")
 println("PLEASE NOTE: This workflow assumes that all your raw reads end with the same suffix. If they don't, please modify your filenames to have the same suffix as shown below.")
@@ -43,8 +43,8 @@
 println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG. Default: null.")
 println(" --R_primer [STRING] Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT. Default: null.")
 println(" --min_cutadapt_len [INTEGER] What should be the minimum read length after quality trimming with cutadapt. Default: 130.")
-println(" --primers_linked [STRING] Are the primers linked?. https://cutadapt.readthedocs.io/en/stable/recipes.html#trimming-amplicon-primers-from-paired-end-reads. Default: 'TRUE'. ")
-println(" --discard_untrimmed [STRING] Should untrimmed reads be discarded? Any supplied string except TRUE will not discard them. Default: 'TRUE'.")
+println(" --primers_linked [STRING] Are the primers linked? https://cutadapt.readthedocs.io/en/stable/recipes.html#trimming-amplicon-primers-from-paired-end-reads. Default: TRUE. ")
+println(" --discard_untrimmed [STRING] Should untrimmed reads be discarded? Any supplied string except TRUE will not discard them. Default: TRUE.")
 println()
 println("Optional arguments:")
 println(" --help Print this help message and exit.")
@@ -63,7 +63,7 @@
 println(" Values are TRUE or FALSE. Default: FALSE.")
 println()
 println("Diversity and Differential abundance testing parameters:")
-println(" --diff_abund_method [STRING] The method to use for differential abundance testing. Either ['ancombc1', 'ancombc2', or 'deseq2'] respectively. Default: 'ancombc2' ")
+println(" --diff_abund_method [STRING] The method to use for differential abundance testing. One of ['all', 'ancombc1', 'ancombc2', or 'deseq2']. Default: 'all' ")
 println(" --rarefaction_depth [INTEGER] The minimum desired sample rarefaction depth for diversity analysis. Default: 500.")
 println(" --group [STRING] Column in input csv file with treatments to be compared. Default: 'groups' ")
 println(" --samples_column [STRING] Column in input csv file with sample names belonging to each treatment group. Default: 'sample_id' ")
@@ -79,11 +79,11 @@
 println(" --fastqc_out_dir [PATH] Where should multiqc outputs be stored. Default: ../workflow_output/FastQC_Outputs/")
 println(" --trimmed_reads_dir [PATH] Where should your cutadapt trimmed reads be stored. 
Default: ../workflow_output/Trimmed_Sequence_Data/") println(" --filtered_reads_dir [PATH] Where should your filtered reads be stored. Default: ../workflow_output/Filtered_Sequence_Data/") - println(" --info_out_dir [PATH] Where should output metadata be stored. Default: ../workflow_output/Metadata/") + println(" --metadata_dir [PATH] Where should output metadata be stored. Default: ../Metadata/") println(" --final_outputs_dir [PATH] Where should most outputs and summary reports be stored. Default: ../workflow_output/Final_Outputs/") println() println("Genelab specific arguements:") - println(" --accession [STRING] A Genelab accession number if the --input_file parameter is not set. If this parameter is set, it will ignore the --input_file parameter. Default: null") + println(" --accession [STRING] A Genelab accession number if the --input_file parameter is not set. If this parameter is set, it will ignore the --input_file parameter.") println(" --assay_suffix [STRING] Genelabs assay suffix. Default: GLAmpSeq.") println(" --output_prefix [STRING] Unique name to tag onto output files. Default: empty string.") println() @@ -147,7 +147,7 @@ log.info """ FastQC: ${params.fastqc_out_dir} Trimmed Reads: ${params.trimmed_reads_dir} Filtered Reads: ${params.filtered_reads_dir} - Metadata: ${params.info_out_dir} + Metadata: ${params.metadata_dir} Reports: ${params.final_outputs_dir} Genelab Assay Suffix: ${params.assay_suffix} @@ -177,7 +177,8 @@ include { ZIP_BIOM } from './modules/zip_biom.nf' // Diversity, differential abundance and visualizations include { ALPHA_DIVERSITY; BETA_DIVERSITY } from './modules/diversity.nf' include { PLOT_TAXONOMY } from './modules/taxonomy_plots.nf' -include { ANCOMBC } from './modules/ancombc.nf' +include { ANCOMBC as ANCOMBC1 } from './modules/ancombc.nf' +include { ANCOMBC as ANCOMBC2 } from './modules/ancombc.nf' include { DESEQ } from './modules/deseq.nf' @@ -196,7 +197,7 @@ workflow { // --------------------- Sanity Checks ------------------------------------- // // Test input requirement - if (params.accession == null && params.input_file == null){ + if (!params.accession && !params.input_file){ error(""" Please supply either an accession (OSD or Genelab number) or an input CSV file @@ -207,7 +208,7 @@ workflow { // Test input csv file if(params.input_file){ // Test primers - if(params.F_primer == null || params.R_primer == null){ + if(!params.F_primer || !params.R_primer){ error(""" When using a csv file as input (--input_file) to this workflow you must provide @@ -344,7 +345,8 @@ workflow { "group" : "groups", "depth" : params.rarefaction_depth, "assay_suffix" : params.assay_suffix, - "output_prefix" : params.output_prefix + "output_prefix" : params.output_prefix, + "target_region" : params.target_region ]) metadata = GET_RUNSHEET.out.runsheet @@ -355,7 +357,8 @@ workflow { "group" : params.group, "depth" : params.rarefaction_depth, "assay_suffix" : params.assay_suffix, - "output_prefix" : params.output_prefix + "output_prefix" : params.output_prefix, + "target_region" : params.target_region ]) metadata = Channel.fromPath(params.input_file, checkIfExists: true) @@ -372,20 +375,35 @@ workflow { BETA_DIVERSITY.out.version | mix(software_versions_ch) | set{software_versions_ch} PLOT_TAXONOMY.out.version | mix(software_versions_ch) | set{software_versions_ch} - - // Differential abundance testing - if (params.diff_abund_method == "deseq2"){ + // Differential abundance testing + method = Channel.of(params.diff_abund_method) + if (params.diff_abund_method == 
"deseq2"){ DESEQ(meta, dada_counts, dada_taxonomy, metadata) DESEQ.out.version | mix(software_versions_ch) | set{software_versions_ch} - }else{ - - ANCOMBC(meta, dada_counts, dada_taxonomy, metadata) - ANCOMBC.out.version | mix(software_versions_ch) | set{software_versions_ch} - } + }else if (params.diff_abund_method == "ancombc1"){ + ANCOMBC1(method, meta, dada_counts, dada_taxonomy, metadata) + ANCOMBC1.out.version | mix(software_versions_ch) | set{software_versions_ch} + + }else if (params.diff_abund_method == "ancombc2"){ + + ANCOMBC2(method, meta, dada_counts, dada_taxonomy, metadata) + ANCOMBC2.out.version | mix(software_versions_ch) | set{software_versions_ch} + + }else{ + + ANCOMBC1("ancombc1", meta, dada_counts, dada_taxonomy, metadata) + ANCOMBC1.out.version | mix(software_versions_ch) | set{software_versions_ch} + ANCOMBC2("ancombc2", meta, dada_counts, dada_taxonomy, metadata) + ANCOMBC2.out.version | mix(software_versions_ch) | set{software_versions_ch} + + DESEQ(meta, dada_counts, dada_taxonomy, metadata) + DESEQ.out.version | mix(software_versions_ch) | set{software_versions_ch} + + } // Software Version Capturing - combining all captured sofware versions diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf index bff7e497..c4048901 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf @@ -12,23 +12,29 @@ nextflow.enable.dsl = 2 process ANCOMBC { - tag "Running ${params.diff_abund_method} for differential abundance testing..." + tag "Running ${method} for differential abundance testing..." label "visualization" input: + val(method) val(meta) path(feature_table) path(taxonomy) path(metadata) output: - path("differential_abundance/"), emit: output_dir + path("differential_abundance/${method}/"), emit: output_dir path("versions.txt"), emit: version - script: - def script_name = params.diff_abund_method == "ancombc1" ? 
"pairwise_ancombc1.R" : "pairwise_ancombc2.R" + script: """ - ${script_name} \\ + if [ ${method} == "ancombc1" ]; then + script_name='pairwise_ancombc1.R' + else + script_name='pairwise_ancombc2.R' + fi + + \${script_name} \\ --metadata-table '${metadata}' \\ --feature-table '${feature_table}' \\ --taxonomy-table '${taxonomy}' \\ @@ -36,7 +42,8 @@ process ANCOMBC { --samples-column '${meta.samples}' \\ --assay-suffix '${meta.assay_suffix}' \\ --output-prefix '${meta.output_prefix}' \\ - --cpus ${task.cpus} + --cpus ${task.cpus} \\ + --target-region '${meta.target_region}' Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nANCOMBC %s\\nphyloseq %s\\nmia %s\\ntaxize %s\\nDescTools %s\\npatchwork %s\\nggrepel %s\\n', \\ packageVersion('tidyverse'), \\ @@ -61,7 +68,8 @@ workflow { meta = Channel.of(["samples": params.samples_column, "group" : params.group, "assay_suffix" : params.assay_suffix, - "output_prefix" : params.output_prefix + "output_prefix" : params.output_prefix, + "target_region" : params.target_region ]) @@ -69,8 +77,8 @@ workflow { asv_table = Channel.fromPath(params.asv_table, checkIfExists: true) taxonomy = Channel.fromPath(params.taxonomy, checkIfExists: true) - - ANCOMBC(meta, asv_table, taxonomy, metadata) + method = Channel.of(params.diff_abund_method) + ANCOMBC(method, meta, asv_table, taxonomy, metadata) emit: version = ANCOMBC.out.version diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf index c9d01c40..a005d169 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf @@ -14,24 +14,24 @@ process DESEQ { path(metadata) output: - path("differential_abundance/"), emit: output_dir + path("differential_abundance/deseq2/"), emit: output_dir path("versions.txt"), emit: version script: """ - run_deseq.R \\ + run_deseq2.R \\ --metadata-table '${metadata}' \\ --feature-table '${feature_table}' \\ --taxonomy-table '${taxonomy_table}' \\ --group '${meta.group}' \\ --samples-column '${meta.samples}' \\ --assay-suffix '${meta.assay_suffix}' \\ - --output-prefix '${meta.output_prefix}' + --output-prefix '${meta.output_prefix}' \\ + --target-region '${meta.target_region}' - Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nhere %s\\nDESeq2 %s\\nRColorBrewer %s\\n', \\ + Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nDESeq2 %s\\nRColorBrewer %s\\n', \\ packageVersion('tidyverse'), \\ packageVersion('glue'), \\ - packageVersion('here'), \\ packageVersion('DESeq2'), \\ packageVersion('RColorBrewer')); \\ write(x=VERSIONS, file='versions.txt', append=TRUE)" diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config index 755c7604..6c978ebf 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config @@ -4,7 +4,6 @@ params { target_region = "16S" // "16S", "18S", "ITS" raw_R1_suffix = "_R1_raw.fastq.gz" raw_R2_suffix = "_R2_raw.fastq.gz" - raw_reads_dir = "../Raw_Sequence_Data/" trim_primers = true // true or false @@ -54,19 +53,19 @@ params { // Directories + raw_reads_dir = "../Raw_Sequence_Data/" + metadata_dir = "../Metadata/" + 
genelab_dir = "../GeneLab/" fastqc_out_dir = "../workflow_output/FastQC_Outputs/" trimmed_reads_dir = "../workflow_output/Trimmed_Sequence_Data/" filtered_reads_dir = "../workflow_output/Filtered_Sequence_Data/" - info_out_dir = "../workflow_output/Metadata/" final_outputs_dir = "../workflow_output/Final_Outputs/" - metadata_dir = "../Metadata/" - genelab_dir = "../GeneLab/" // Multiqc - multiqc_config = "${baseDir}/config/multiqc.config" + multiqc_config = "${projectDir}/config/multiqc.config" // -------- Differential abundance parameters ----- // - diff_abund_method = "ancombc2" // ["ancombc1", "ancombc2", or "deseq2"] - it runs ancombc2 by default + diff_abund_method = "all" // ["all", "ancombc1", "ancombc2", or "deseq2"] - it runs all three by default group = "groups" // column in input csv file to be compared samples_column = "sample_id" // column in input csv file containing sample names // Minimum desired sample rarefaction depth for diversity analysis From b8ea78833522260f142f433dfbf7c9c16ddd1573 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 24 Dec 2024 13:37:10 -0600 Subject: [PATCH 09/24] Fixed beta diversity --- .../workflow_code/bin/beta_diversity.R | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/beta_diversity.R b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/beta_diversity.R index bf1a7777..cbc536cf 100755 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/beta_diversity.R +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/beta_diversity.R @@ -287,12 +287,9 @@ run_stats <- function(dist_obj, metadata, groups_colname){ # Make PCoA plot_pcoa <- function(ps, stats_res, distance_method, - groups_colname, sample_colname, - group_colors, legend_title, + groups_colname, group_colors, legend_title, addtext=FALSE) { - - # Generating a PCoA with phyloseq pcoa <- ordinate(physeq = ps, method = "PCoA", distance = distance_method) eigen_vals <- pcoa$values$Eigenvalues @@ -306,13 +303,25 @@ plot_pcoa <- function(ps, stats_res, distance_method, label_PC1 <- sprintf("PC1 [%.1f%%]", percent_variance[1]) label_PC2 <- sprintf("PC2 [%.1f%%]", percent_variance[2]) - p <- plot_ordination(ps, pcoa, color = groups_colname) + - geom_point(size = 1) + vectors_df <- pcoa$vectors %>% + as.data.frame() %>% + rownames_to_column("samples") + + plot_df <- sample_data(ps) %>% + as.matrix() %>% + as.data.frame() %>% + rownames_to_column("samples") %>% + select(samples, !!groups_colname) %>% + right_join(vectors_df, join_by("samples")) + + p <- ggplot(plot_df, aes(x=Axis.1, y=Axis.2, + color=!!sym(groups_colname), + label=samples)) + + geom_point(size=1) + if(addtext){ - sample_colname <- make.names(sample_colname) - sample_names <- p$data[[sample_colname]] - p <- p + geom_text(aes(label = sample_names), show.legend = FALSE, + p <- p + geom_text(show.legend = FALSE, hjust = 0.3, vjust = -0.4, size = 4) } @@ -526,16 +535,14 @@ write_csv(x = stats_res$adonis, #---------------------------- Make PCoA # Unlabeled PCoA plot -ordination_plot_u <- plot_pcoa(ps, stats_res, distance_method, - groups_colname, sample_colname, - group_colors, legend_title) +ordination_plot_u <- plot_pcoa(ps, stats_res, distance_method, + groups_colname,group_colors, legend_title) ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_PCoA_without_labels{assay_suffix}.png"), plot=ordination_plot_u, width = 
14, height = 8.33, dpi = 300, units = "in")

 # Labeled PCoA plot
 ordination_plot <- plot_pcoa(ps, stats_res, distance_method,
-                             groups_colname, sample_colname,
-                             group_colors, legend_title,
+                             groups_colname, group_colors, legend_title,
                              addtext=TRUE)
 ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_PCoA_w_labels{assay_suffix}.png"),
        plot=ordination_plot, width = 14, height = 8.33, dpi = 300, units = "in")

From 51bdac9b1dde7156dc3707daddc5caa44d10dca2 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Thu, 2 Jan 2025 12:34:57 -0800
Subject: [PATCH 10/24] Updated pipeline document

---
 .../GL-DPPD-7104-B.md | 2492 ++++++++++++++---
 1 file changed, 2151 insertions(+), 341 deletions(-)

diff --git a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
index 29bef590..3eb74d25 100644
--- a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
+++ b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
@@ -4,16 +4,16 @@

 ---

-**Date:** December 14, 2023
+**Date:** December 30, 2024
 **Revision:** B
 **Document Number:** GL-DPPD-7104-B

 **Submitted by:**
-Alexis Torres and Michael D. Lee (GeneLab Data Processing Team)
+Olabiyi Obayomi, Alexis Torres, and Michael D. Lee (GeneLab Data Processing Team)

 **Approved by:**
-Sylvain Costes (GeneLab Project Manager)
-Samrawit Gebre (GeneLab Deputy Project Manager and Acting Genelab Configuration Manager)
+Samrawit Gebre (GeneLab Project Manager)
+Danielle Lopez (GeneLab Deputy Project Manager)
 Lauren Sanders (OSDR Project Scientist)
 Amanda Saravia-Butler (GeneLab Data Processing Lead)

@@ -23,22 +23,39 @@ Amanda Saravia-Butler (GeneLab Data Processing Lead)

 - Additional software (R packages) used:
-  - vegan
-  - tidyverse
-  - dendextend
+  - ANCOMBC
+  - broom
+  - DESeq2
+  - DescTools
+  - FSA
+  - ggdendro
+  - glue
   - ggrepel
-  - dplyr
-  - RColorBrewer
+  - mia
+  - multcompView
+  - optparse
+  - patchwork
   - phyloseq
-- Inclusion of additional steps and outputs starting from ([step 6](#6-beta-diversity)):
-  - Beta Diversity with Hierarchical Clustering ([6a](#6a-hierarchical-clustering)) and Ordination ([6b](#6b-ordination)).
-  - Alpha Diversity with Rarefaction Curves ([7a](#7a-rarefaction-curves)) and Richness and Diversity Estimates ([7b](#7b-richness-and-diversity-estimates)).
-  - Groupwise and Samplewise Taxonomic Summary Plots ([step 8](#8-taxonomic-summaries)).
-  - Differential Abundance Analysis ([step 9](#9-differential-abundance-analysis)) Including Betadisper, Permutational ANOVA ([9a](#9a-betadisper-and-permutational-anova)), DESeq2 ([9b](#9b-differential-abundance-analysis-with-deseq2)) and Volcano Plots ([9c](#9c-volcano-plots)).
+  - RColorBrewer
+  - rstatix
+  - taxize
+  - tidyverse
+  - tools
+  - utils
+  - vegan
+
+
+
+- Inclusion of additional steps and outputs starting from ([step 6](#6-amplicon-seq-data-analysis-set-up)):
+  - Alpha Diversity Analysis ([step 7](#7-alpha-diversity-analysis)).
+  - Beta Diversity Analysis ([step 8](#8-beta-diversity-analysis)).
+  - Groupwise and Samplewise Taxonomic Summary Plots ([step 9](#9-taxonomy-plots)).
+  - Differential Abundance Testing ([step 10](#10-differential-abundance-testing)) with ANCOMBC 1 ([10a](#10a-ancombc-1)), ANCOMBC 2 ([10b](#10b-ancombc-2)), and DESeq2 ([10c](#10c-deseq2)). 
-- Assay-specific suffixes were added where needed for GeneLab repo ("GLAmpSeq") +- Assay-specific suffixes were added where needed for GeneLab repo ("_GLAmpSeq") - The ITS UNITE reference database used was updated to "UNITE_v2023_July2023.RData", from http://www2.decipher.codes/Classification/TrainingSets/ +- Persistent Reference links to RDATA databases on Figshare replaced reference links on DECIPHER's [website](http://www2.decipher.codes/Classification/TrainingSets/) for [SILVA SSU r138](https://figshare.com/ndownloader/files/46245217), [UNITE v2023](https://figshare.com/ndownloader/files/49181545) and [PR2 v4.13](https://figshare.com/ndownloader/files/46241917) - Several program versions were updated (all versions listed in [Software used](#software-used) below) # Table of contents @@ -62,18 +79,19 @@ Amanda Saravia-Butler (GeneLab Data Processing Lead) - [5g. Generating and Writing Standard Outputs](#5g-generating-and-writing-standard-outputs) - [6. Amplicon Seq Data Analysis Set Up](#6-amplicon-seq-data-analysis-set-up) - [6a. Create Sample Runsheet](#6a-create-sample-runsheet) - - [6b. Environment Set Up](#6b-environment-set-up) - - [7. Beta Diversity](#7-beta-diversity) - - [7a. Hierarchical Clustering](#7a-hierarchical-clustering) - - [7b. Ordination](#7b-ordination) - - [8. Alpha Diversity](#8-alpha-diversity) - - [8a. Rarefaction Curves](#8a-rarefaction-curves) - - [8b. Richness and Diversity Estimates](#8b-richness-and-diversity-estimates) - - [9. Taxonomic Summaries](#9-taxonomic-summaries) - - [10. Differential Abundance Analysis](#10-differential-abundance-analysis) - - [10a. Betadisper and Permutational ANOVA](#10a-betadisper-and-permutational-anova) - - [10b. Differential Abundance Analysis with DESeq2](#10b-differential-abundance-analysis-with-deseq2) - - [10c. Volcano Plots](#10c-volcano-plots) + - [6b. R Environment Set Up](#6b-r-environment-set-up) + - [Load Libraries](#load-libraries) + - [Load Functions](#load-functions) + - [Set Variables](#set-variables) + - [Read-in Input Tables](#read-in-input-tables) + - [Preprocessing](#preprocessing) + - [7. Alpha Diversity Analysis](#7-alpha-diversity-analysis) + - [8. Beta Diversity Analysis](#8-beta-diversity-analysis) + - [9. Taxonomy Plots](#9-taxonomy-plots) + - [10. Differential Abundance Testing](#10-differential-abundance-testing) + - [10a. ANCOMBC 1](#10a-ancombc-1) + - [10b. ANCOMBC 2](#10b-ancombc-2) + - [10c. 
DESeq2 ](#10c-deseq2) --- @@ -84,26 +102,34 @@ Amanda Saravia-Butler (GeneLab Data Processing Lead) |FastQC|`0.12.1`|[https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)| |MultiQC|`1.19`|[https://multiqc.info/](https://multiqc.info/)| |Cutadapt|`4.6`|[https://cutadapt.readthedocs.io/en/stable/](https://cutadapt.readthedocs.io/en/stable/)| +|R-base|`4.4.1`|[https://www.r-project.org/](https://www.r-project.org/)| |DADA2|`1.30.0`|[https://www.bioconductor.org/packages/release/bioc/html/dada2.html](https://www.bioconductor.org/packages/release/bioc/html/dada2.html)| |DECIPHER|`2.30.0`|[https://bioconductor.org/packages/release/bioc/html/DECIPHER.html](https://bioconductor.org/packages/release/bioc/html/DECIPHER.html)| |biomformat|`1.30.0`|[https://github.com/joey711/biomformat](https://github.com/joey711/biomformat)| -|R-base|`4.3.2`|[https://www.r-project.org/](https://www.r-project.org/)| -|vegan|`2.6.4`|[https://cran.r-project.org/package=vegan](https://cran.r-project.org/package=vegan)| +|ANCOMBC|`2.6.0`|[https://github.com/FrederickHuangLin/ANCOMBC](https://github.com/FrederickHuangLin/ANCOMBC)| +|broom|`1.0.7`|[https://CRAN.R-project.org/package=broom](https://CRAN.R-project.org/package=broom)| +|DescTools|`0.99.57`|[https://andrisignorell.github.io/DescTools/](https://andrisignorell.github.io/DescTools/)| +|DESeq2|`1.42.0`|[https://bioconductor.org/packages/release/bioc/html/DESeq2.html](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)| +|FSA|`0.9.5`|[https://CRAN.R-project.org/package=FSA](https://CRAN.R-project.org/package=FSA)| +|ggdendro|`0.2.0`|[https://CRAN.R-project.org/package=ggdendro](https://CRAN.R-project.org/package=ggdendro)| +|ggrepel|`0.9.6`|[https://CRAN.R-project.org/package=ggrepel](https://CRAN.R-project.org/package=ggrepel)| +|glue|`1.8.0`|[https://glue.tidyverse.org/](https://glue.tidyverse.org/)| +|mia|`1.12.0`|[https://github.com/microbiome/mia](https://github.com/microbiome/mia)| +|phyloseq|`1.46.0`|[https://bioconductor.org/packages/release/bioc/html/phyloseq.html](https://bioconductor.org/packages/release/bioc/html/phyloseq.html)| +|rcolorbrewer|`1.1_3`|[https://CRAN.R-project.org/package=RColorBrewer](https://CRAN.R-project.org/package=RColorBrewer)| +|taxize|`0.9.100.1`|[https://docs.ropensci.org/taxize/](https://docs.ropensci.org/taxize/)| |tidyverse|`2.0.0`|[https://CRAN.R-project.org/package=tidyverse](https://CRAN.R-project.org/package=tidyverse)| -|dendextend|`1.17.1`|[https://CRAN.R-project.org/package=dendextend](https://CRAN.R-project.org/package=dendextend)| -|ggrepel|`0.9.4`|[https://CRAN.R-project.org/package=ggrepel](https://CRAN.R-project.org/package=ggrepel)| -|dplyr|`1.1.3`|[https://CRAN.R-project.org/package=dplyr](https://CRAN.R-project.org/package=dplyr)| -|rcolorbrewer|`1.1.3`|[https://CRAN.R-project.org/package=RColorBrewer](https://CRAN.R-project.org/package=RColorBrewer)| -|DESeq2|`1.40.2`|[https://bioconductor.org/packages/release/bioc/html/DESeq2.html](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)| -|phyloseq|`1.44.0`|[https://bioconductor.org/packages/release/bioc/html/phyloseq.html](https://bioconductor.org/packages/release/bioc/html/phyloseq.html)| +|tools|`4.4.1`|[https://www.R-project.org/](https://www.R-project.org/)| +|utils|`4.4.1`|[https://www.R-project.org/](https://www.R-project.org/)| +|vegan|`2.6.4`|[https://cran.r-project.org/package=vegan](https://cran.r-project.org/package=vegan)| # Reference databases used |Program 
used| Database| Relevant Links| |:-----|:-----:|--------:| -|DECIPHER| SILVA SSU r138 | [http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData](http://www2.decipher.codes/Classification/TrainingSets/)| -|DECIPHER| UNITE v2020 | [http://www2.decipher.codes/Classification/TrainingSets/UNITE_v2020_February2020.RData](http://www2.decipher.codes/Classification/TrainingSets/)| - +|DECIPHER| SILVA SSU r138 | [SILVA_SSU_r138_2019.RData](https://figshare.com/ndownloader/files/46245217)| +|DECIPHER| UNITE v2023 | [UNITE_v2023_July2023.RData](https://figshare.com/ndownloader/files/49181545)| +|DECIPHER| PR2 v4.13 | [PR2_v4_13_March2021.RData](https://figshare.com/ndownloader/files/46241917)| --- # General processing overview with example commands @@ -551,15 +577,13 @@ write.table(tax_and_count_tab, "taxonomy-and-counts_GLAmpSeq.tsv", sep="\t", quo --- -## 6. Amplicon Seq Data Analysis Set Up - -> The remainder of this document is performed in R. +## 6. Amplicon Seq Data Analysis Set Up
### 6a. Create Sample Runsheet -> Note: Rather than running the command below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/SW_AmpIllumina-B/examples/runsheet/README.md). +> Note: Rather than running the command below to create the runsheet needed for processing, the runsheet may also be created manually by following the examples for [Paired-end](../Workflow_Documentation/NF_AmpIllumina-B/workflow_code/PE_file.csv) and [Single-end](../Workflow_Documentation/NF_AmpIllumina-B/workflow_code/SE_file.csv) samples. ```bash ### Download the *ISA.zip file from the OSDR ### @@ -595,502 +619,2288 @@ dpt-isa-to-runsheet --accession OSD-### \
-### 6b. Environment Set Up -```R -### Import libraries used for processing ### -library(tidyverse) -library(phyloseq) +> The remainder of this document is performed in R. + + +### 6b. R Environment Set Up + + +#### Load Libraries + +```R library(vegan) -library(dendextend) +library(phyloseq) +library(glue) +library(FSA) +library(multcompView) +library(rstatix) +library(patchwork) +library(RColorBrewer) library(DESeq2) +library(ggdendro) +library(broom) +library(ggrepel) +library(tools) +library(ANCOMBC) +library(DescTools) +library(taxize) +library(mia) +library(utils) +library(tidyverse) +``` +#### Load Functions + +```R +# Function to calculate text size for plotting +calculate_text_size <- function(num_samples, start_samples = 25, min_size = 3) { + max_size = 11 # Maximum size for up to start_samples + slope = -0.15 + + if (num_samples <= start_samples) { + return(max_size) + } else { + # Calculate the current size with the hard coded slope + current_size = max_size + slope * (num_samples - start_samples) + + # Ensure the size doesn't go below the minimum + return(max(current_size, min_size)) + } +} -### Read in the runsheet containing the metadata required for processing ### +# A function to create a phyloseq object with the appropriate +# sample count transformation depending on the supplied transformation method +# i.e. either 'rarefy' or 'vst' +transform_phyloseq <- function( feature_table, metadata, method, rarefaction_depth=500){ + # feature_table [DATAFRAME] ~ Feature / ASV count table with samples as columns and features as rows + # metadata [DATAFRAME] ~ Samples metadata with samples as row names + # method [STRING] ~ Distance transformation method to use. + # Either 'rarefy' or 'vst' for rarefaction and variance + # stabilizing transformation, respectively. 
+ # rarefaction_depth [INT] ~ Sample rarefaction to even depth when method is 'bray' + + if(method == 'rarefy'){ + # Create phyloseq object + ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), + sample_data(metadata)) + + + seq_per_sample <- colSums(feature_table) %>% sort() + # Minimum value + depth <- min(seq_per_sample) + + for (count in seq_per_sample) { + # Get the count equal to rarefaction_depth or nearest to it + if(count >= rarefaction_depth) { + depth <- count + break + } + + } + + #----- Rarefy sample counts to even depth per sample + ps <- rarefy_even_depth(physeq = ASV_physeq, + sample.size = depth, + rngseed = 1, + replace = FALSE, + verbose = FALSE) + + }else if(method == "vst"){ + + # Using deseq + # Keep only ASVs with at least 1 count + feature_table <- feature_table[rowSums(feature_table) > 0, ] + # Add +1 pseudocount for VST for vst transformation + feature_table <- feature_table + 1 + + # Make the order of samples in metadata match the order in feature table + metadata <- metadata[colnames(feature_table),] + + # Create VST normalized counts matrix + # ~1 means no design + deseq_counts <- DESeqDataSetFromMatrix(countData = feature_table, + colData = metadata, + design = ~1) + deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts) + vst_trans_count_tab <- assay(deseq_counts_vst) + + # Making a phyloseq object with our transformed table + vst_count_phy <- otu_table(object = vst_trans_count_tab, taxa_are_rows = TRUE) + sample_info_tab_phy <- sample_data(metadata) + ps <- phyloseq(vst_count_phy, sample_info_tab_phy) + }else{ + + stop("Please supply a valid normalization method, either 'rarefy' or 'vst' ") + } + + + return(ps) +} -runsheet <- read.table(file = "*runsheet.csv", - header = TRUE, row.names = 1, sep = ",") +# ----------- Hierarchical Clustering and dendogram plotting +make_dendogram <- function(dist_obj, metadata, groups_colname, + group_colors, legend_title){ + + + sample_clust <- hclust(d = dist_obj, method = "ward.D2") + + # Extract clustering data + hcdata <- dendro_data(sample_clust, type = "rectangle") + segment_data <- segment(hcdata) + label_data <- label(hcdata) %>% + left_join(metadata %>% + rownames_to_column("label")) + + # Plot dendogram + dendogram <- ggplot() + + geom_segment(data = segment_data, + aes(x = x, y = y, xend = xend, yend = yend) + ) + + geom_text(data = label_data , + aes(x = x, y = y, label = label, + color = !!sym(groups_colname) , hjust = 0), + size = 4.5, key_glyph = "rect") + + scale_color_manual(values = group_colors) + + coord_flip() + + scale_y_reverse(expand = c(0.2, 0)) + + labs(color = legend_title) + + theme_dendro() + + guides(colour = guide_legend(override.aes = list(size = 5)))+ + theme(legend.key = element_rect(fill=NA), + text = element_text(face = 'bold'), + legend.title = element_text(size = 12, face='bold'), + legend.text = element_text(face = 'bold', size = 11)) + + + return(dendogram) + +} +# Run variance test and adonis test +run_stats <- function(dist_obj, metadata, groups_colname){ + + samples <- attr(dist_obj, "Label") + metadata <- metadata[samples,] + variance_test <- betadisper(d = dist_obj, + group = metadata[[groups_colname]]) %>% + anova() %>% + broom::tidy() %>% + mutate(across(where(is.numeric), ~round(.x, digits = 2))) + + + adonis_res <- adonis2(formula = dist_obj ~ metadata[[groups_colname]]) + adonis_test <- adonis_res %>% + broom::tidy() %>% + mutate(across(where(is.numeric), ~round(.x, digits = 2))) + + return(list(variance = variance_test, adonis = adonis_test)) 
+} -### Create a sample info table from the runsheet containing the group names and a column of unique colors for each group ### +# Make PCoA +plot_pcoa <- function(ps, stats_res, distance_method, + groups_colname, group_colors, legend_title, + addtext=FALSE) { + + # Generating a PCoA with phyloseq + pcoa <- ordinate(physeq = ps, method = "PCoA", distance = distance_method) + eigen_vals <- pcoa$values$Eigenvalues + + # Calculate the percentage of variance + percent_variance <- eigen_vals / sum(eigen_vals) * 100 + + # Retrieving plot labels + r2_value <- stats_res$adonis[["R2"]][1] + prf_value <- stats_res$adonis[["p.value"]][1] + label_PC1 <- sprintf("PC1 [%.1f%%]", percent_variance[1]) + label_PC2 <- sprintf("PC2 [%.1f%%]", percent_variance[2]) + + vectors_df <- pcoa$vectors %>% + as.data.frame() %>% + rownames_to_column("samples") + + plot_df <- sample_data(ps) %>% + as.matrix() %>% + as.data.frame() %>% + rownames_to_column("samples") %>% + select(samples, !!groups_colname) %>% + right_join(vectors_df, join_by("samples")) + + p <- ggplot(plot_df, aes(x=Axis.1, y=Axis.2, + color=!!sym(groups_colname), + label=samples)) + + geom_point(size=1) -num_colors <- length(unique(runsheet$groups)) -if (num_colors > 9) { - custom_palette <- colorRampPalette(brewer.pal(9, "Set1"))(num_colors) - colors <- custom_palette -} else { - colors <- brewer.pal(num_colors, "Set1") + + if(addtext){ + p <- p + geom_text(show.legend = FALSE, + hjust = 0.3, vjust = -0.4, size = 4) + } + + + p <- p + labs(x = label_PC1, y = label_PC2, color = legend_title) + + coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + + scale_color_manual(values = group_colors) + + theme_bw() + theme(text = element_text(size = 15, face="bold"), + legend.direction = "vertical", + legend.justification = "center", + legend.title = element_text(hjust=0.1)) + + annotate("text", x = Inf, y = -Inf, + label = paste("R2:", toString(round(r2_value, 3))), + hjust = 1.1, vjust = -2, size = 4)+ + annotate("text", x = Inf, y = -Inf, + label = paste("Pr(>F)", toString(round(prf_value,4))), + hjust = 1.1, vjust = -0.5, size = 4) + ggtitle("PCoA") + + + return(p) } -group_colors <- setNames(colors, unique(runsheet$groups)) -runsheet <- runsheet %>% - mutate(!!color_colname := group_colors[.data$groups]) -sample_info_tab <- runsheet[, c($groups, $color)] +# A function to filter out rare features from a feature table depending +# on the supplied cut off. +remove_rare_features <- function(feature_table, cut_off_percent=3/4){ + + # feature_table [MATRIX] feature table matrix with samples as columns and + # features as rows + # cut_off_percent [NUMERIC] cut-off fraction or decimal between 0.001 to 1 + # of the total number of samples to determine the + # most abundant features. By default it removes + # features that are not present in 3/4 of the total + # number of samples + + # Filter by occurrence in a fraction of samples + # Define a cut-off for determining what's rare + cut_off <- cut_off_percent * ncol(feature_table) + # Get the occurrence for each feature + feature_occurence <- rowSums(feature_table > 0) + # Get names of the abundant features + abund_features <- names(feature_occurence[feature_occurence >= cut_off]) + # Remove rare features + abun_features.m <- feature_table[abund_features,] + return(abun_features.m) +} -### Read in the ASV count table, containing the counts of each ASV in each sample ### +process_taxonomy <- function(taxonomy, prefix='\\w__') { + # Function to process a taxonopmy assignment table + #1. 
~ taxonomy is a string specifying the taxonomic assignment file name + #2 prefix ~ is a regular expression specifying the characters to remove + # from the taxon names '\\w__' for greengenes and 'D_\\d__' for SILVA + + + taxonomy <- apply(X = taxonomy, MARGIN = 2, FUN = as.character) + + for (rank in colnames(taxonomy)) { + #delete the taxonomy prefix + taxonomy[,rank] <- gsub(pattern = prefix, x = taxonomy[, rank], + replacement = '') + indices <- which(is.na(taxonomy[,rank])) + taxonomy[indices, rank] <- rep(x = "Other", times=length(indices)) + #replace empty cell + indices <- which(taxonomy[,rank] == "") + taxonomy[indices,rank] <- rep(x = "Other", times=length(indices)) + } + taxonomy <- apply(X = taxonomy,MARGIN = 2, + FUN = gsub,pattern = "_",replacement = " ") %>% + as.data.frame(stringAsfactor=F) + return(taxonomy) +} -count_tab <- read.table(file = "counts_GLAmpSeq.tsv", - header = TRUE, row.names = 1, sep = "\t") -``` +# Function to format a taxonomy assignment table by appending a suffix +# to a known name +format_taxonomy_table <- function(taxonomy=taxonomy.m,stringToReplace="Other", + suffix=";Other") { + + for (taxa_index in seq_along(taxonomy)) { + + indices <- grep(x = taxonomy[,taxa_index], pattern = stringToReplace) + + taxonomy[indices,taxa_index] <- + paste0(taxonomy[indices,taxa_index-1], + rep(x = suffix, times=length(indices))) + + } + return(taxonomy) +} -**Input Data:** -* \*runsheet.csv (runsheet containing sample metadata required for processing, output from [step 6a](#6a-create-sample-runsheet)) -* counts_GLAmpSeq.tsv (ASV counts table, output from [step 5g](#5g-generating-and-writing-standard-outputs)) +fix_names<- function(taxonomy,stringToReplace,suffix){ + #1~ taxonomy is a taxonomy dataframe with taxonomy ranks as column names + #2~ stringToReplace is a vector of regex strings specifying what to replace + #3~ suffix is a string specifying the replacement value -**Output Data:** -* `count_tab` (variable containing the ASV counts table created from counts.tsv) -* `runsheet` (variable containing sample metadata required for processing) -* `sample_info_tab` (variable containing a subtable of the runsheet, including the 'groups' column and an additional 'color' column with a color for each unique group) -
+ for(index in seq_along(stringToReplace)){ + taxonomy <- format_taxonomy_table(taxonomy = taxonomy, + stringToReplace=stringToReplace[index], + suffix=suffix[index]) + } + return(taxonomy) +} -___ +# A function to generate taxon level count matrix based on a taxonomy table and +# an existing feature table +make_feature_table <- function(count_matrix,taxonomy, + taxon_level, samples2keep=NULL){ + + # EAMPLE: + # make_feature_table(count_matrix = feature_counts_matrix, + # taxonomy = taxonomy_table, taxon_level = "Phylum") + + feature_counts_df <- data.frame(taxon_level=taxonomy[,taxon_level], + count_matrix, check.names = FALSE, + stringsAsFactors = FALSE) + + feature_counts_df <- aggregate(.~taxon_level,data = feature_counts_df, + FUN = sum) + rownames(feature_counts_df) <- feature_counts_df[,"taxon_level"] + feature_table <- feature_counts_df[,-1] + # Retain only taxa found in at least one sample + taxa2keep <- rowSums(feature_table) > 0 + feature_table <- feature_table[taxa2keep,] + + if(!is.null(samples2keep)){ + feature_table <- feature_table[,samples2keep] + # Retain only taxa found in at least one sample + taxa2keep <- rowSums(feature_table) > 0 + feature_table <- feature_table[taxa2keep,] + } + + return(feature_table) +} -## 7. Beta Diversity -Beta diversity measures the variation in species composition between different samples or environments. A common practice in working with a new dataset is to generate some exploratory visualizations like ordinations and hierarchical clusterings. These give us a quick overview of how our samples relate to each other and can be a way to check for problems like batch effects. +# Function to group rare taxa or return a table with the rare taxa +group_low_abund_taxa <- function(abund_table, threshold=0.05, + rare_taxa=FALSE) { + # abund_table is a relative abundance matrix with taxa as columns and samples as rows + #rare_taxa is a boolean specifying if only rare taxa should be returned + #If set to TRU then a table with only the rare taxa will be returned + #intialize an empty vector that will contain the indices for the + #low abundance columns/ taxa to group + taxa_to_group <- c() + #intialize the index variable of species with low abundance (taxa/columns) + index <- 1 + + # Loop over every column or taxa then check to see if the max abundance is less than the set threshold + # if true, save the index in the taxa_to_group vector variable + while(TRUE){ + + for (column in seq_along(abund_table)){ + if(max(abund_table[,column]) < threshold ){ + taxa_to_group[index] <- column + index = index + 1 + } + } + if(is.null(taxa_to_group)){ + threshold <- readline("please provide a higher threshold for grouping rare taxa, only numbers are allowed ") + threshold <- as.numeric(threshold) + }else{ + break + } + + } + + + if(rare_taxa){ + abund_table <- abund_table[,taxa_to_group,drop=FALSE] + }else{ + #remove the low abundant taxa or columns + abundant_taxa <-abund_table[,-(taxa_to_group), drop=FALSE] + #get the rare taxa + # rare_taxa <-abund_table[,taxa_to_group] + rare_taxa <- subset(x = abund_table, select = taxa_to_group) + #get the proportion of each sample that makes up the rare taxa + rare <- rowSums(rare_taxa) + #bind the abundant taxa to the rae taxa + abund_table <- cbind(abundant_taxa,rare) + #rename the columns i.e the taxa + colnames(abund_table) <- c(colnames(abundant_taxa),"Rare") + } + + return(abund_table) +} -Create a DESeq2 object from the counts and the runsheet and apply the Variance Stabilization Transformation (VST): -```R 
-deseq_counts <- DESeqDataSetFromMatrix(countData = count_tab, - colData = runsheet, - design = ~1) -deseq_counts_vst <- varianceStabilizingTransformation(deseq_counts) -vst_counts <- assay(deseq_counts_vst) -``` +# Function to collapse the samples in an oTU table with a defined function(fun) +# based on a group in metadata +collapse_samples <- function(taxon_table,metadata,group,fun=sum, + convertToRelativeAbundance=FALSE){ + # function to collapse the samples in an oTU table with a defined function(fun) based on a group in metadata + # taxon_table - a matrix count table with samples as rows and features/OTUs as columns + # metadata - a dataframe to containing the group to collapse samples by. Sample names must be the rownames of the metadata + # group - an independent factor variable within the metadata to collapse the samples by + # fun - a function without brackets to apply in order to collapse the samples + # convertToRelativeAbundance - a boolean set to TRUE OR FALSE if the taxon_table shout be converted to relative abundance + # default is FALSE + common.ids <- intersect(rownames(taxon_table),rownames(metadata)) + metadata <- droplevels(metadata[common.ids,,drop=FALSE]) + taxon_table <- taxon_table[common.ids,,drop=FALSE] + taxon_table <- cbind(subset(x = metadata, select=group),taxon_table) + + taxon_table <- aggregate(reformulate(termlabels = group, response = '.'), + data = taxon_table, FUN = fun) + rownames(taxon_table) <- taxon_table[,1] + taxon_table <- taxon_table[,-1] + if(convertToRelativeAbundance){ + taxon_table <- t(apply(X = taxon_table, MARGIN = 1, FUN = function(x) x/sum(x))) + } + + final <- list(taxon_table,metadata) + names(final) <- c("taxon_table","metadata") + return(final) +} -**Input Data:** -* `count_tab` (variable containing the ASV counts table, output from [step 6b](#6b-environment-set-up)) -* `runsheet` (variable containing sample metadata required for processing, output from [step 6b](#6b-environment-set-up)) +taxize_options(ncbi_sleep = 0.8) +# A function to retrieve NCBI taxonomy id for a given taxonomy name +get_ncbi_ids <- function(taxonomy, target_region){ -**Output Data:** -* `vst_counts` (variable holding the VST-normalized ASV counts) + if(target_region == "ITS"){ + search_string <- "fungi" + }else if(target_region == "18S"){ + search_string <- "eukaryote" + }else{ + search_string <- "bacteria" + } -
+ uid <- get_uid(taxonomy, division_filter = search_string) + tax_ids <- uid[1:length(uid)] + return(tax_ids) + +} + +# Error handling function when running ANCOMBC2 +find_bad_taxa <- function(cnd){ + + if(split_res == "replacement has 0 rows, data has 1" || + split_res == "All taxa contain structural zeros") { + + return( + list(res=data.frame(taxon=split_res, lfc=NA, se=NA, + W=NA, p=NA, q=NA, diff=NA, pass_ss=NA)) + ) + } + + bad_taxa <- split_res[[c(1L, 2L)]] + bad_taxa <- .subset2(strsplit(bad_taxa, ", "), 1L) + return(bad_taxa) +} + +# A function to run ANCOMBC2 while handlixnxg commxon +ancombc2 <- function(data, ...) { + tryCatch( + ANCOMBC::ancombc2(data = data, ...), + error = function(cnd) { + + res <- find_bad_taxa(cnd) + if( is.data.frame(res[[1]]) ){ + # Returns a manually created empty data.frame + return(res) + }else{ + # Returns the names of the bad taxa to exclude from further analysis + bad_taxa <- res # renaming for readability + } + + # Second error catcher in case it fails in first one + tryCatch( + ANCOMBC::ancombc2(data = data[!rownames(data) %in% bad_taxa, ], ...), + + error = function(cnd) { + # Returns a manually created empty data.frame + find_bad_taxa(cnd) + }) + + } + ) +} -### 7a. Hierarchical Clustering +# Geometric mean function used when running DESeq2 +gm_mean <- function(x, na.rm=TRUE) { + exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) +} -Create a euclidean distance matrix and perform hierarchical clustering. +``` + +#### Set Variables ```R -euc_dist <- dist(t(vst_counts)) -euc_clust <- hclust(d = euc_dist, method = "ward.D2") +# Define a custom palette for plotting +custom_palette <- c("#1F78B4","#33A02C","#FB9A99","#E31A1C","#6A3D9A", + "#FDBF6F", "#FF7F00","#CAB2D6","#FF00FFFF", "#B15928", + "#000000","#FFC0CBFF", "#A6CEE3", "#8B864EFF","#F0027F", + "#666666","#1B9E77", "#E6AB02","#A6761D","#FFFF00FF", + "#00FFFFFF", "#FFFF99", "#B2182B","#FDDBC7","#D1E5F0", + "#B2DF8A","#CC0033","#FF00CC","#330033", "#999933", + "#FF9933", "#FFFAFAFF",colors()) + +# Remove white colors +pattern_to_filter <- "white|snow|azure|gray|#FFFAFAFF|aliceblue" +custom_palette <- custom_palette[-c(21:23, grep(pattern = pattern_to_filter, + x = custom_palette, + ignore.case = TRUE))] +# Custom theme for plotting +publication_format <- theme_bw() + + theme(panel.grid = element_blank()) + + theme(axis.ticks.length=unit(-0.15, "cm"), + axis.text.x=element_text(margin=ggplot2::margin(t=0.5,r=0,b=0,l=0,unit ="cm")), + axis.text.y=element_text(margin=ggplot2::margin(t=0,r=0.5,b=0,l=0,unit ="cm")), + axis.title = element_text(size = 18,face ='bold.italic', color = 'black'), + axis.text = element_text(size = 16,face ='bold', color = 'black'), + legend.position = 'right', + legend.title = element_text(size = 15,face ='bold', color = 'black'), + legend.text = element_text(size = 14,face ='bold', color = 'black'), + strip.text = element_text(size = 14,face ='bold', color = 'black')) ``` -**Parameter Definitions:** +#### Read-in Input Tables + +```R +# Read-in metadata +metadata <- read_csv(file = metadata_file) %>% as.data.frame() +row.names(metadata) <- metadata[[sample_colname]] +metadata[,sample_colname] <- NULL +group_column_values <- metadata %>% pull(!!sym(groups_colname)) +group_levels <- unique(group_column_values) + +# Add colors to metadata equals to the number of levels +# in the factor groups column +num_colors <- length(group_levels) +palette <- 'Set1' +number_of_colors_in_palette <- 9 +if(num_colors <= number_of_colors_in_palette){ + colors <- RColorBrewer::brewer.pal(n = 
num_colors, name = palette) +}else{ + colors <- custom_palette[1:num_colors] +} -* `euc_clust <-` – specifying the variable that will hold the euclidean distance object +# Metadata +group_colors <- setNames(colors, group_levels) +metadata <- metadata %>% + mutate(color = map_chr(!!sym(groups_colname), + function(group) { group_colors[group] } + ) + ) +sample_names <- rownames(metadata) +deseq2_sample_names <- make.names(sample_names, unique = TRUE) -* `hclust()` – the hclust function we are using for hierarchical clustering +sample_info_tab <- metadata %>% + select(!!groups_colname, color) %>% + arrange(!!sym(groups_colname)) -* `d=` – specifying the the input dissimilarity or distance object -* `method=` - specifying the method of clustering to use. "Ward.D2" is one that is commonly used +values <- sample_info_tab %>% pull(color) %>% unique() -Create a dendrogram object. -```R -euc_dend <- as.dendrogram(euc_clust, hang = 0.1) + +# Feature or ASV table +feature_table <- read.table(file = features_file, header = TRUE, + row.names = 1, sep = "\t") + +# Taxonomy table +taxonomy_table <- read.table(file = taxonomy_file, header = TRUE, + row.names = 1, sep = "\t") ``` -**Parameter Definitions:** -* `euc_dend <-` – specifying the variable that will hold the dendrogram object +#### Preprocessing + +```R +if(remove_rare){ + + # Remove samples with less than library-cutoff + message(glue("Dropping samples with less than {library_cutoff} read counts")) + feature_table <- feature_table[,colSums(feature_table) >= library_cutoff] + # Remove rare ASVs + message(glue("Dropping features with prevalence less than {prevalence_cutoff * 100}%")) + feature_table <- remove_rare_features(feature_table, + cut_off_percent = prevalence_cutoff) +} + +# Preprocess ASV and taxonomy tables -* `as.dendrogram()` – the dendrogram function we are using to create a dendrogram +message(glue("There are {sum(is.na(taxonomy_table$domain))} features without + taxonomy assignments. Dropping them...")) -* `euc_clust` – an object that can be converted into a dendrogram +# Dropping features that couldn't be assigned taxonomy +taxonomy_table <- taxonomy_table[-which(is.na(taxonomy_table$domain)),] -* `hang=` - numeric indicating how the leaves hight should be computed from the height of their parents +# Handle case where no domain was assigned but a phylum was. +if(all(is.na(taxonomy$domain))){ + + if(target_region == "ITS"){ + taxonomy_table$domain <- "Fungi" + }else if(target_region == "18S"){ + taxonomy_table$domain <- "Eukaryotes" + }else{ + taxonomy_table$domain <- "Bacteria" + } -Color the sample branches by group and plot the dendrogram. 
+} + +# Removing Chloroplast and Mitochondria Organelle DNA contamination +asvs2drop <- taxonomy_table %>% + unite(col="taxonomy",domain:species) %>% + filter(str_detect(taxonomy, "[Cc]hloroplast|[Mn]itochondria")) %>% + row.names() +taxonomy_table <- taxonomy_table[!(rownames(taxonomy_table) %in% asvs2drop),] + +# Clean taxonomy names +feature_names <- rownames(taxonomy_table) +taxonomy_table <- process_taxonomy(taxonomy_table) +rownames(taxonomy_table) <- feature_names +taxonomy_table <- fix_names(taxonomy_table, "Other", ";_") + + +# Get long asv taxonomy names and clean +species <- taxonomy_table %>% + unite(species,domain:species,sep = ";") %>% +pull %>% str_replace_all("Other", "_") + +taxonomy_table <- fix_names(taxonomy_table, "Other", ";_") + +taxonomy_table[,"species"] <- species + + +# Subset tables +# Get features common to the taxonomy and feature table +common_ids <- intersect(rownames(feature_table), rownames(taxonomy_table)) + +# Subset the feature and taxonomy tables to contain +# only features found in both table +feature_table <- feature_table[common_ids,] +taxonomy_table <- taxonomy_table[common_ids,] +``` + + +## 7. Alpha Diversity Analysis + +Alpha diversity examines the variety and abundance of taxa within individual samples. Rarefaction curves are utilized to visually represent this diversity, plotting the number of unique sequences (ASVs) identified against the total number of sequences sampled, offering a perspective on the saturation and completeness of sampling. Metrics like Chao1 richness estimates and Shannon diversity indices are employed to quantify the richness (total number of unique sequences) and diversity (combination of richness and evenness) within these samples. + +```bash +Rscript alpha_diversity.R \ + --metadata-table amplicon_runsheet.csv \ + --feature-table counts_GLAmpSeq.tsv \ + --taxonomy-table taxonomy_GLAmpSeq.tsv \ + --group groups \ + --samples-column 'Sample Name' \ + --rarefaction-depth 500 +``` +**Parameter Definitions:** + +* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed +* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table +* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table +* `--group` – specifies the group column in metadata to be analyzed +* `--samples-column` – specifies the column in metadata containing the sample names in the feature table +* `--rarefaction-depth` – specifies the minimum rarefaction depth for alpha diversity estimation + +Type `Rscript alpha_diversity.R --help` at the commandline for a full list of available parameters + +Content of `alpha_diversity.R` ```R -dend_cols <- sample_info_tab$color[order.dendrogram(euc_dend)] -labels_colors(euc_dend) <- dend_cols -png("dendrogram_by_group_GLAmpSeq.png") -euc_dend %>% set("labels_cex", max_cex) %>% plot(ylab = "VST Euc. 
dist.") -dev.off() +# Create output directory if it doesn't already exist +alpha_diversity_out_dir <- "alpha_diversity/" +if(!dir.exists(alpha_diversity_out_dir)) dir.create(alpha_diversity_out_dir) + +# Create phyloseq object +ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), + tax_table(as.matrix(taxonomy_table)), + sample_data(sample_info_tab)) + +seq_per_sample <- colSums(feature_table) %>% sort() + +# Get rarefaction depth +# minimum value +depth <- min(seq_per_sample) + +for (count in seq_per_sample) { + + if(count >= rarefaction_depth) { + depth <- count + break + } + +} + +# -------------------- Rarefy sample counts to even depth per sample +ps.rarefied <- rarefy_even_depth(physeq = ASV_physeq, + sample.size = depth, + rngseed = 1, + replace = FALSE, + verbose = FALSE) + + +# ------------------- Rarefaction curve +# Calculate a rough estimate of the step sample step size for plotting. +# This is meant to keep plotting time constant regardless of sample depth +step <- (50*depth)/1000 + +p <- rarecurve(t(otu_table(ps.rarefied)) %>% as.data.frame(), + step = step, + col = sample_info_tab[["color"]], + lwd = 2, ylab = "ASVs", cex=0.5, + label = FALSE, tidy = TRUE) + + +sample_info_tab_names <- sample_info_tab %>% rownames_to_column("Site") + +p <- p %>% left_join(sample_info_tab_names, by = "Site") + +# Sample rarefaction curves + +rareplot <- ggplot(p, aes(x = Sample, y = Species, + group = Site, color = !!sym(groups_colname))) + + geom_line() + + scale_color_manual(values = group_colors) + + labs(x = "Number of Sequences", y = "Number of ASVs", color = legend_title) + + theme_bw() + + theme(legend.position = "right", + text = element_text(face = 'bold', size = 15), + legend.text = element_text(face = 'bold', size = 14), + legend.direction = "vertical", + legend.justification = "center", + legend.box.just = "center", + legend.title = element_text(size = 15, face='bold'), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + plot.margin = margin(t = 10, r = 20, b = 10, l = 10, unit = "pt")) + +ggsave(filename = glue("{alpha_diversity_out_dir}/{output_prefix}rarefaction_curves{assay_suffix}.png"), + plot=rareplot, width = 14, height = 8.33, dpi = 300) + +# ------------------ Richness and diversity estimates ------------------# + +# Statistics table +diversity_metrics <- c("Observed", "Chao1", "Shannon", "Simpson") +names(diversity_metrics) <- diversity_metrics +diversity.df <- estimate_richness(ps.rarefied, + measures = diversity_metrics) %>% + select(-se.chao1) %>% + rownames_to_column("samples") + + +merged_table <- metadata %>% + rownames_to_column("samples") %>% + inner_join(diversity.df) + +diversity_stats <- map_dfr(.x = diversity_metrics, function(metric){ + + res <- dunnTest(merged_table[,metric],merged_table[,groups_colname]) + + df <- res$res %>% + separate(col = Comparison, into = c("group1", "group2"), sep = " - ") %>% + mutate(Metric=metric) %>% + rename(p=P.unadj, p.adj=P.adj) %>% + mutate(p.format=round(p,digits = 2)) + + add_significance(df, p.col='p', output.col = 'p.signif') %>% + select(Metric,group1, group2, Z, p, p.adj, p.format, p.signif) %>% + mutate(across(where(is.numeric), ~round(.x, digits = 2))) + +}) + + +# Write diversity statistics table to file +write_csv(x = diversity_stats, + file = glue("{alpha_diversity_out_dir}/{output_prefix}statistics_table{assay_suffix}.csv")) + +# Get diffrent letters compare groups for every diversity metric +comp_letters <- data.frame(group = group_levels) +colnames(comp_letters) <- 
groups_colname + +walk(.x = diversity_metrics, function(metric=.x){ + + sub_comp <- diversity_stats %>% filter(Metric == metric) + p_values <- sub_comp$p + names(p_values) <- paste(sub_comp$group1,sub_comp$group2, sep = "-") + + letters_df <- enframe(multcompView::multcompLetters(p_values)$Letters, + name = groups_colname, + value = glue("{metric}_letter")) + comp_letters <<- comp_letters %>% left_join(letters_df) + +}) + + +# Summary table +diversity_table <- metadata %>% + rownames_to_column("samples") %>% + inner_join(diversity.df) %>% + group_by(!!sym(groups_colname)) %>% + summarise(N=n(), across(Observed:Simpson, + .fns=list(mean=mean, se=se), + .names = "{.col}_{.fn}")) %>% + mutate(across(where(is.numeric), ~round(.x, digits = 2))) %>% + left_join(comp_letters) %>% + mutate(Observed = glue("{Observed_mean} ± {Observed_se}{Observed_letter}"), + Chao1 = glue("{Chao1_mean} ± {Chao1_se}{Chao1_letter}"), + Shannon = glue("{Shannon_mean} ± {Shannon_se}{Shannon_letter}"), + Simpson = glue("{Simpson_mean} ± {Simpson_se}{Simpson_letter}") + ) %>% + select (-contains("_")) + +# Write diversity summary table to file +write_csv(x = diversity_table, + file = glue("{alpha_diversity_out_dir}/{output_prefix}summary_table{assay_suffix}.csv")) + + +# ------------------ Make richness by sample dot plots ---------------------- # + +number_of_samples <- length(rownames(sample_info_tab)) +richness_sample_label_size <- calculate_text_size(number_of_samples) +metrics2plot <- c("Observed", "Shannon") +names(metrics2plot) <- metrics2plot + +samples_order <- metadata %>% arrange(!!sym(groups_colname)) %>% rownames() + +richness_by_sample <- plot_richness(ps.rarefied, color = groups_colname, + measures = metrics2plot) + +richness_by_sample <- ggplot(richness_by_sample$data %>% + mutate(samples = factor(samples, + levels=samples_order)), + aes(x=samples, y=value, colour = !!sym(groups_colname))) + + geom_point() + + geom_errorbar(aes(ymin=value-se, ymax = value+se), + width=0.2, position=position_dodge(0.9)) + + facet_wrap(~variable, scales = "free_y") + + scale_color_manual(values = group_colors) + + theme_bw() +labs(x = NULL, color = legend_title, y="Alpha Diversity Measure") + + theme( + text = element_text(face = 'bold', size = 15), + legend.text = element_text(face = 'bold', size = 14), + legend.position = "bottom", + legend.direction = "vertical", + legend.justification = "center", + legend.box.just = "center", + legend.title = element_text(face = 'bold', size = 15, hjust = 0.09), + axis.text.x = element_text(angle = 90, + size = richness_sample_label_size, + vjust = 0.5, # Vertically center the text + hjust = 1), + axis.ticks.length=unit(-0.15, "cm"), + strip.text = element_text(size = 14,face ='bold') + ) + +# Save sample plot +ggsave(filename = glue("{alpha_diversity_out_dir}/{output_prefix}richness_and_diversity_estimates_by_sample{assay_suffix}.png"), + plot=richness_by_sample, width = 14, height = 8.33, dpi = 300, units = "in") + + +# ------------------- Make richness by group box plots ----------------------- # +richness_by_group <- plot_richness(ps.rarefied, x = groups_colname, + color = groups_colname, + measures = metrics2plot) + +p <- map(.x = metrics2plot, .f = function(metric){ + + p <- ggplot(richness_by_group$data %>% filter(variable == metric), + aes(x=!!sym(groups_colname), y=value, fill=!!sym(groups_colname)) + ) + + geom_point() + + geom_boxplot() + + scale_fill_manual(values = group_colors) + + theme_bw() + labs(fill = legend_title, x = NULL, y= metric) + + theme( + text = 
element_text(size = 15, face = 'bold'),
          legend.text = element_text(face = 'bold', size = 14),
          legend.position = "right",
          legend.direction = "vertical",
          legend.justification = "center",
          legend.box.just = "center",
          legend.title = element_text(face = 'bold', size = 15),
          axis.text.x = element_blank(),
          axis.ticks.length = unit(-0.15, "cm"),
          strip.text = element_text(size = 14, face = 'bold')
        )


  # Join the compact letters for this metric, keyed by the groups column
  summary_table <- p$data %>%
    select(!!sym(groups_colname), value) %>%
    group_by(!!sym(groups_colname)) %>%
    summarise(max=max(value), range=max(value)-min(value)) %>%
    left_join(comp_letters %>%
                select(!!sym(groups_colname),
                       label = !!sym( glue("{metric}_letter") )
                )
    )
  text_size <- 6

  # Calculate a constant to add to the max value of each group
  # to determine where each group's text will be added
  toAdd <- if_else(condition = max(summary_table$range) <= 5,
                   true = min(summary_table$range),
                   false = (median(summary_table$range) - min(summary_table$range)) / 20
  )

  # Add text to plot
  p + geom_text(data=summary_table,
                mapping = aes(y=max+toAdd,
                              label=label,
                              fontface = "bold"),
                size = text_size)
})

richness_by_group <- wrap_plots(p, ncol = 2, guides = 'collect')

# Save group plot
width <- 3.6 * length(group_levels)
ggsave(filename = glue("{alpha_diversity_out_dir}/{output_prefix}richness_and_diversity_estimates_by_group{assay_suffix}.png"),
       plot=richness_by_group, width = width, height = 8.33, dpi = 300, units = "in")
```

**Input Data:**

* **amplicon_runsheet.csv** (metadata table, e.g. {OSD-Accession-ID}_AmpSeq_v{version}_runsheet.csv)
* **counts_GLAmpSeq.tsv** (count table)
* **taxonomy_GLAmpSeq.tsv** (taxonomy table)

**Output Data:**

* **alpha_diversity/rarefaction_curves_GLAmpSeq.png** (Rarefaction curves)
* **alpha_diversity/statistics_table_GLAmpSeq.csv** (Statistics Table)
* **alpha_diversity/summary_table_GLAmpSeq.csv** (Summary Table)
* **alpha_diversity/richness_and_diversity_estimates_by_sample_GLAmpSeq.png** (Samples Dot Plot)
* **alpha_diversity/richness_and_diversity_estimates_by_group_GLAmpSeq.png** (Group boxplot)
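For readers who want to try the post-hoc logic on its own, here is a minimal, self-contained sketch of the `dunnTest()` → `multcompLetters()` chain used above. The data frame, group names, and values are toy inputs invented for illustration, not workflow outputs:

```R
# Minimal sketch of the compact-letter-display logic used above
# (toy data; group names and values are hypothetical).
library(FSA)          # dunnTest()
library(multcompView) # multcompLetters()

set.seed(1)
toy <- data.frame(
  groups  = factor(rep(c("Flight", "Ground", "Control"), each = 6)),
  Shannon = c(rnorm(6, 2.4, 0.2), rnorm(6, 3.1, 0.2), rnorm(6, 3.0, 0.2))
)

# Pairwise Dunn's test (Kruskal-Wallis post-hoc), as in alpha_diversity.R
res <- dunnTest(Shannon ~ groups, data = toy)$res

# Name each p-value "group1-group2", then derive compact letters
p_values <- setNames(res$P.unadj, gsub(" - ", "-", res$Comparison))
multcompView::multcompLetters(p_values)$Letters
```

Groups that share a letter in the output are not significantly different, which is exactly how the letters are appended to the group means in the summary table above.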
-### 7b. Ordination +--- -Ordination techniques like PCoA help in reducing the dimensionality of the data, allowing us to visualize complex relationships between samples. -Create a physeq object with an OTU table using VST-transformed counts and sample info table. Ordinate the counts using PCoA and euclidean distance. +## 8. Beta Diversity Analysis -```R -vst_count_phy <- otu_table(object = vst_counts, taxa_are_rows = TRUE) -sample_info_tab_phy <- sample_data(sample_info_tab) -vst_physeq <- phyloseq(vst_count_phy, sample_info_tab_phy) +Beta diversity measures the variation in species composition between different samples or environments. A common practice in working with a new dataset is to generate some exploratory visualizations like ordinations and hierarchical clusterings. These give us a quick overview of how our samples relate to each other and can be a way to check for problems like batch effects. -vst_pcoa <- ordinate(physeq = vst_physeq, method = "PCoA", distance = "euclidean") +```bash +Rscript beta_diversity.R \ + --metadata-table amplicon_runsheet.csv \ + --feature-table counts_GLAmpSeq.tsv \ + --taxonomy-table taxonomy_GLAmpSeq.tsv \ + --group groups \ + --samples-column 'Sample Name' \ + --rarefaction-depth 500 ``` +**Parameter Definitions:** -**Parameter Definitions:** +* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed +* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table +* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table +* `--group` – specifies the group column in metadata to be analyzed +* `--samples-column` – specifies the column in metadata containing the sample names in the feature table +* `--rarefaction-depth` – specifies the minimum rarefaction depth for diversity estimation. 
Relevant only for Bray Curtis distance calculation between samples

Type `Rscript beta_diversity.R --help` at the commandline for a full list of available parameters

Content of `beta_diversity.R`

```R

beta_diversity_out_dir <- "beta_diversity/"
if(!dir.exists(beta_diversity_out_dir)) dir.create(beta_diversity_out_dir)

distance_methods <- c("euclidean", "bray")
normalization_methods <- c("vst", "rarefy")
legend_title <- NULL

options(warn=-1) # ignore warnings
# Run the analysis
walk2(.x = normalization_methods, .y = distance_methods,
      .f = function(normalization_method, distance_method){

# Create transformed phyloseq object
ps <- transform_phyloseq(feature_table, metadata,
                         method = normalization_method,
                         rarefaction_depth = rarefaction_depth)

# --------- Clustering and dendrogram plotting

# Extract normalized count table
count_tab <- otu_table(ps)

# Calculate distance between samples
dist_obj <- vegdist(t(count_tab), method = distance_method)

# Make dendrogram
dendogram <- make_dendogram(dist_obj, metadata, groups_colname,
                            group_colors, legend_title)

# Save dendrogram
ggsave(filename = glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_dendrogram{assay_suffix}.png"),
       plot = dendogram, width = 14, height = 10, dpi = 300, units = "in")

#---------------------------- Run stats
# Checking homogeneity of variance and comparing groups using adonis test

stats_res <- run_stats(dist_obj, metadata, groups_colname)
write_csv(x = stats_res$variance,
          file = glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_variance_table{assay_suffix}.csv"))

write_csv(x = stats_res$adonis,
          file = glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_adonis_table{assay_suffix}.csv"))

#---------------------------- Make PCoA
# Unlabeled PCoA plot
ordination_plot_u <- plot_pcoa(ps, stats_res, distance_method,
                               groups_colname, group_colors, legend_title)
ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_PCoA_without_labels{assay_suffix}.png"),
       plot=ordination_plot_u, width = 14, height = 8.33, dpi = 300, units = "in")

# Labeled PCoA plot
ordination_plot <- plot_pcoa(ps, stats_res, distance_method,
                             groups_colname, group_colors, legend_title,
                             addtext=TRUE)
ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_PCoA_w_labels{assay_suffix}.png"),
       plot=ordination_plot, width = 14, height = 8.33, dpi = 300, units = "in")

})
```

**Input Data:**

* **amplicon_runsheet.csv** (metadata table)
* **counts_GLAmpSeq.tsv** (count table)
* **taxonomy_GLAmpSeq.tsv** (taxonomy table)

**Output Data:**

* **beta_diversity/{distance_method}_dendrogram_GLAmpSeq.png** (Dendrogram)
* 
**beta_diversity/{distance_method}_variance_table_GLAmpSeq.csv** (Variance Stats Table)
* **beta_diversity/{distance_method}_adonis_table_GLAmpSeq.csv** (Adonis Stats Table)
* **beta_diversity/{distance_method}_PCoA_without_labels_GLAmpSeq.png** (Unlabeled PCoA)
* **beta_diversity/{distance_method}_PCoA_w_labels_GLAmpSeq.png** (Labeled PCoA)

where `{distance_method}` is either `bray` or `euclidean` for Bray Curtis and Euclidean distance, respectively.
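To see what `run_stats()` reports in the variance and adonis tables, the two tests can also be run in isolation. Below is a minimal sketch with simulated counts; the sample sizes, group labels, and values are hypothetical, not taken from the workflow:

```R
# Minimal sketch of the betadisper()/adonis2() pair used in run_stats()
# (simulated counts; group labels are hypothetical).
library(vegan)

set.seed(1)
counts <- matrix(rpois(8 * 20, lambda = 5), nrow = 8)  # 8 samples x 20 ASVs
groups <- factor(rep(c("Flight", "Ground"), each = 4))

dist_obj <- vegdist(counts, method = "bray")  # samples as rows

# Homogeneity of group dispersions (a PERMANOVA assumption)
anova(betadisper(dist_obj, groups))

# PERMANOVA: does group membership explain between-sample variation?
adonis2(dist_obj ~ groups)
```

A significant betadisper result is a warning that a significant PERMANOVA may reflect unequal within-group dispersions rather than a true shift in community composition.

<br>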
-___ +--- -## 8. Alpha Diversity -Alpha diversity examines the variety and abundance of taxa within individual samples. Rarefaction curves are utilized to visually represent this diversity, plotting the number of unique sequences (ASVs) identified against the total number of sequences sampled, offering a perspective on the saturation and completeness of sampling. Metrics like Chao1 richness estimates and Shannon diversity indices are employed to quantify the richness (total number of unique sequences) and diversity (combination of richness and evenness) within these samples. +## 9. Taxonomy Plots -
-### 8a. Rarefaction Curves +```bash +Rscript plot_taxonomy.R \ + --metadata-table mapping/GLDS-487_amplicon_v1_runsheet.csv \ + --feature-table data/counts_GLAmpSeq.tsv \ + --taxonomy-table data/taxonomy_GLAmpSeq.tsv \ + --group groups \ + --samples-column 'Sample Name' \ + --remove-rare FALSE \ + --prevalence-cutoff 0.15 \ + --library-cutoff 100 + +``` +**Parameter Definitions:** + +* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed +* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table +* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table +* `--group` – specifies the group column in metadata to be analyzed +* `--samples-column` – specifies the column in metadata containing the sample names in the feature table +* `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed +* `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1. + Taxa with prevalences(the proportion of samples in which the taxon is present) less than --prevalence-cutoff will be excluded in the analysis. Default is 0.15, i.e. exclude taxa / features that are not present in at least 15% of the samples. +* `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes. + Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100. + if you do not want to discard any sample then set to 0. +type `Rscript plot_taxonomy.R --help` at the commandline for a full list of available parameters + +Content of `plot_taxonomy.R` ```R -rare_curve <- rarecurve(x = t(count_tab), step = 100, col = sample_info_tab$color, - lwd = 2, ylab = "ASVs", label = FALSE) -png("rarefaction_curves_GLAmpSeq.png") -plot(rare_curve) -dev.off() +taxonomy_plots_out_dir <- "taxonomy_plots/" +if(!dir.exists(taxonomy_plots_out_dir)) dir.create(taxonomy_plots_out_dir) + +# -------------------------Prepare feature tables -------------------------- # +taxon_levels <- colnames(taxonomy_table) +names(taxon_levels) <- taxon_levels +taxon_tables <- map(.x = taxon_levels, + .f = make_feature_table, + count_matrix = feature_table, + taxonomy = taxonomy_table) + + +# ----------------------- Sample abundance plots -------------------------- # +group_rare <- TRUE +samples_order <- metadata %>% arrange(!!sym(groups_colname)) %>% rownames() +dont_group <- c("phylum") +# In percentage +thresholds <- c(phylum=1,class=3, order=3, family=8, genus=8, species=9) +# Convert from wide to long format +# -1 drops the kingdom level since all the microbes are bacteria +relAbundace_tbs_rare_grouped <- map2(.x = taxon_levels[-1], + .y = taxon_tables[-1], + .f = function(taxon_level=.x, + taxon_table=.y){ + + + taxon_table <- apply(X = taxon_table, MARGIN = 2, + FUN = function(x) x/sum(x)) * 100 + + + taxon_table <- as.data.frame(taxon_table %>% t()) + if(group_rare && !(taxon_level %in% dont_group)){ + + taxon_table <- group_low_abund_taxa(taxon_table %>% + as.data.frame(check.names=FALSE, + stringAsFactor=FALSE), + threshold = thresholds[taxon_level]) + + } + taxon_table$samples <- rownames(taxon_table) + + + # Change data frame from wide to long format + taxon_table <- taxon_table %>% + pivot_longer(cols = -samples, names_to = taxon_level, values_to = "relativeAbundance") + taxon_table$samples <- factor(x = taxon_table$samples, + levels = samples_order) + 
return(taxon_table) + }) + + +x_lab <- "Samples" +y_lab <- "Relative abundance (%)" +x <- 'samples' +y <- "relativeAbundance" +facet_by <- reformulate(groups_colname) +number_of_samples <- length(samples_order) +plot_width <- 0.6 * number_of_samples + +# Make sample plots +walk2(.x = relAbundace_tbs_rare_grouped, .y = taxon_levels[-1], + .f = function(relAbundace_tb, taxon_level){ + + df <- relAbundace_tb %>% + left_join(metadata %>% rownames_to_column("samples")) + + p <- ggplot(data = df, mapping = aes(x= !!sym(x), y=!!sym(y) )) + + geom_col(aes(fill = !!sym(taxon_level) )) + + facet_wrap(facet_by, scales = "free", nrow = 1) + + publication_format + + labs(x = x_lab , y = y_lab, fill= tools::toTitleCase(taxon_level)) + + scale_fill_manual(values = custom_palette) + + theme(axis.text.x=element_text( + margin=margin(t=0.5,r=0,b=0,l=0,unit ="cm"), + angle = 90, + hjust = 0.5, vjust = 0.5)) + + labs(x=NULL) + + ggsave(filename = glue("{taxonomy_plots_out_dir}/{output_prefix}samples_{taxon_level}{assay_suffix}.png"), + plot=p, width = plot_width, height = 8.5, dpi = 300) + + }) + + + +# ------------------------ Group abundance plots ----------------------------- # +# In percentage +thresholds <- c(phylum=1,class=2, order=2, family=2, genus=2, species=2) + +# Convert from wide to long format for every treatment group of interest +group_rare <- TRUE +maximum_number_of_taxa <- 500 + +group_relAbundace_tbs <- map2(.x = taxon_levels[-1], .y = taxon_tables[-1], + .f = function(taxon_level=.x, taxon_table=.y){ + + taxon_table <- as.data.frame(taxon_table %>% t()) + taxon_table <- (collapse_samples(taxon_table = taxon_table, + metadata = metadata, group = groups_colname, + convertToRelativeAbundance = TRUE)$taxon_table * 100 ) %>% + as.data.frame(check.names=FALSE) + + if(ncol(taxon_table) > maximum_number_of_taxa){ + group_rare <- TRUE + } + + if(group_rare){ + taxon_table <- group_low_abund_taxa(taxon_table %>% + as.data.frame(check.names=FALSE, + stringAsFactor=FALSE), + threshold = thresholds[taxon_level]) + group_rare <- FALSE + } + + taxon_table[,groups_colname] <- rownames(taxon_table) + + + # Change from wide to long format + taxon_table <- taxon_table %>% + pivot_longer(cols = -!!sym(groups_colname), + names_to = taxon_level, + values_to = "relativeAbundance") + + return(taxon_table) + + }) + + +# Make bar plots +y_lab <- "Relative abundance (%)" +y <- "relativeAbundance" +number_of_groups <- length(group_levels) +plot_width <- 2 * number_of_groups +walk2(.x = group_relAbundace_tbs, .y = taxon_levels[-1], + .f = function(relAbundace_tb=.x, taxon_level=.y){ + + p <- ggplot(data = relAbundace_tb, mapping = aes(x= !!sym(groups_colname), y = !!sym(y) )) + + geom_col(aes(fill = !!sym(taxon_level))) + + publication_format + + theme(axis.text.x=element_text( + margin=margin(t=0.5,r=0,b=0,l=0,unit ="cm"), + angle = 90, + hjust = 0.5, vjust = 0.5)) + + labs(x = NULL , y = y_lab, fill = tools::toTitleCase(taxon_level)) + + scale_fill_manual(values = custom_palette) + ggsave(filename = glue("{taxonomy_plots_out_dir}/{output_prefix}groups_{taxon_level}{assay_suffix}.png"), + plot=p, width = plot_width, height = 10, dpi = 300) + }) ``` -**Parameter Definitions:** +**Input Data:** -* `rare_curve <-` – specifies the variable that will store the results within in our R environment +* **amplicon_metdata.csv** (metadata table) +* **counts_GLAmpSeq.tsv** (count table) +* **taxonomy_GLAmpSeq.tsv** (taxonomy table) -* `rarecurve()` – the rarefy function we are calling, with the following parameters set within it 
+**Output Data:**

-* `x=` - specifies the input data for the rarefaction curve, which should be the transposed counts
+* **taxonomy_plots/samples__GLAmpSeq.png** (samples barplots)
+* **taxonomy_plots/groups__GLAmpSeq.png** (groups barplots)

-* `step=` - specifies the step size for sample sizes in rarefaction curves
+where taxon_level is one of phylum, class, order, family, genus, and species.

-* `col=` - assigns a color to each line on the rarefaction curve for visual differentiation of sample groups
+> Please Note: the species-level plots should be taken with a grain of salt, as short amplicon sequences cannot be used to accurately predict species.

-* `lwd=` - sets the line width for the curves in the plot
+
-* `ylab=` - defines the label for the y-axis of the plot +--- -* `label=` - indicates whether the lines in the plot should be kept -**Input Data:** +## 10. Differential Abundance Testing -* `vst_counts` (variable holding the VST-normalized ASV counts, output from [step 7](#7-beta-diversity)) -* `sample_info_tab` (variable containing a subtable of the runsheet, including the 'groups' and 'color' columns, output from [step 6b](#6b-environment-set-up)) -**Output Data:** +### 10a. ANCOMBC 1 +```bash +Rscript pairwise_ancombc1.R \ + --metadata-table amplicon_runsheet.csv \ + --feature-table counts_GLAmpSeq.tsv \ + --taxonomy-table taxonomy_GLAmpSeq.tsv \ + --group groups \ + --samples-column 'Sample Name' \ + --target-region 16S \ + --remove-rare FALSE \ + --prevalence-cutoff 0.15 \ + --library-cutoff 100 \ + --cpus 5 + +``` +**Parameter Definitions:** -* **rarefaction_curves_GLAmpSeq.png** (Rarefaction curves plot for all samples) +* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed +* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table +* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table +* `--group` – specifies the group column in metadata to be analyzed +* `--samples-column` – specifies the column in metadata containing the sample names in the feature table +* `--target-region` – specifies the amplicon target region. Options are either 16S, 18S or ITS +* `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed +* `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1. + Taxa with prevalences(the proportion of samples in which the taxon is present) less than --prevalence-cutoff will be excluded in the analysis. Default is 0.15, i.e. exclude taxa / features that are not present in at least 15% of the samples. +* `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes. + Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100. + if you do not want to discard any sample then set to 0. +* `--cpus ` - Specifies the number of cpus to use for parallel processing. -
+Type `Rscript pairwise_ancombc1.R --help` at the commandline for a full list of available parameters -### 8b. Richness and Diversity Estimates +Content of `pairwise_ancombc1.R` ```R -count_tab_phy <- otu_table(count_tab, taxa_are_rows = TRUE) -tax_tab_phy <- tax_table(as.matrix(tax_tab)) -ASV_physeq <- phyloseq(count_tab_phy, tax_tab_phy, sample_info_tab_phy) -richness_and_diversity_estimates_by_sample <- plot_richness(ASV_physeq, color = "groups", measures = c("Chao1", "Shannon")) -richness_and_diversity_estimates_by_group <- plot_richness(ASV_physeq, x = "groups", color = "groups", measures = c("Chao1", "Shannon")) - -ggsave(filename = "richness_and_diversity_estimates_by_sample_GLAmpSeq.png", plot = richness_and_diversity_estimates_by_sample) -ggsave(filename = "richness_and_diversity_estimates_by_group_GLAmpSeq.png", plot = richness_and_diversity_estimates_by_group) -``` +# Create output directory if it doesn't already exist +diff_abund_out_dir <- "differential_abundance/ancombc1/" +if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE) -**Parameter Definitions:** -* `plot_richness()` – the phyloseq function we are calling, with the following parameters set within it +# Create phyloseq object +ps <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), + sample_data(metadata), + tax_table(as.matrix(taxonomy_table))) -* `x=` - an optional variable to map to the horizontal axis of the plot +# Convert phyloseq to tree summarized experiment object +tse <- mia::makeTreeSummarizedExperimentFromPhyloseq(ps) -* `color=` - specifies a variable for determining the coloring scheme of the plot -* `measures=` - determines which of the available alpha-diversity measures to include in the plot +# Get unique group comparison as a matrix +pairwise_comp.m <- utils::combn(metadata[,group] %>% unique, 2) +pairwise_comp_df <- pairwise_comp.m %>% as.data.frame -**Input Data:** +colnames(pairwise_comp_df) <- map_chr(pairwise_comp_df, + \(col) str_c(col, collapse = "v")) +comparisons <- colnames(pairwise_comp_df) +names(comparisons) <- comparisons -* `count_tab` (variable containing the ASV counts table, output from [step 6b](#6b-environment-set-up)) -* `sample_info_tab` (variable containing a subtable of the runsheet, including the 'groups' and 'color' columns, output from [step 6b](#6b-environment-set-up)) -* `tax_tab` (variable containing the taxonomy table, created in [step 5g](#5g-generating-and-writing-standard-outputs)) -**Output Data:** +message("Running ANCOMBC1....") +set.seed(123) +final_results_bc1 <- map(pairwise_comp_df, function(col){ + + group1 <- col[1] + group2 <- col[2] + + tse_sub <- tse[, tse[[group]] %in% c(group1, group2)] + + # Note that by default, levels of a categorical variable in R are sorted + # alphabetically. + # Changing the reference group by reordering the factor levels + tse_sub[[group]] <- factor(tse_sub[[group]] , levels = c(group1, group2)) + + # data - input data. TreeSummarizedExperiment or Phyloseq object + # assay_name - name of count table in the input data object. + # tax_level - taxonomy level for aggregation and analysis + # prv_cut - prevalence cut-off. proportion of samples in which taxon is present. + # lib_cut - a numerical threshold for filtering samples based on library sizes. + # p_adj_method - p-value adjustment method for multiple comparisons + # struc_zero - should group-wise rare taxa be detected + # neg_lb - whether to classify a taxon as a structural zero using its asymptotic lower bound. 
i.e.the best the algorithm can possibly achieve + # group - name of the group variable in metadata. Only important you'd like to perform global test can be set to NULL. + # alpha - significance level + # n_cl - number of processes to run in parallel + # global - should a global test be performed to detect significant differences between at least 2 groups (ANOVA-like comparison) + # tol - iteration convergence tolerance for the E-M algorithm. + # max_iter - max iteration + # formula - fixed effects formula + # conserve - should a conservative variance estimator be used for the test statistic? + # it is recommended to set to TRUE if your sample size is small and the number of expected differentially abundant taxa is large. + + out <- ancombc(data = tse_sub, assay_name = "counts", + tax_level = NULL, phyloseq = NULL, + formula = group, + p_adj_method = "fdr", prv_cut = prevalence_cutoff, + lib_cut = library_cutoff, + group = group, struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, + max_iter = 100, conserve = TRUE, alpha = 0.05, global = FALSE, + n_cl = threads, verbose = TRUE) + + # ------ Set data frame names ---------# + # LFC + lfc <- out$res$lfc %>% + as.data.frame() %>% + select(-contains("Intercept")) %>% + set_names( + c("taxon", + glue("logFC_({group2})v({group1})")) + ) + + # SE + se <- out$res$se %>% + as.data.frame() %>% + select(-contains("Intercept")) %>% + set_names( + c("taxon", + glue("lfcSE_({group2})v({group1})")) + ) + + # W + W <- out$res$W %>% + as.data.frame() %>% + select(-contains("Intercept")) %>% + set_names( + c("taxon", + glue("Wstat_({group2})v({group1})")) + ) + + # p_val + p_val <- out$res$p_val %>% + as.data.frame() %>% + select(-contains("Intercept")) %>% + set_names( + c("taxon", + glue("pvalue_({group2})v({group1})")) + ) + + # q_val + q_val <- out$res$q_val %>% + as.data.frame() %>% + select(-contains("Intercept")) %>% + set_names( + c("taxon", + glue("qvalue_({group2})v({group1})")) + ) + + + # Diff_abn + diff_abn <- out$res$diff_abn %>% + as.data.frame() %>% + select(-contains("Intercept")) %>% + set_names( + c("taxon", + glue("diff_({group2})v({group1})")) + ) + + + res <-lfc %>% + left_join(se) %>% + left_join(W) %>% + left_join(p_val) %>% + left_join(q_val) %>% + left_join(diff_abn) + + + return(res) + +}) + -* **richness_and_diversity_estimates_by_sample_GLAmpSeq.png** (Richness and diversity estimates plot for all samples) -* **richness_and_diversity_estimates_by_group_GLAmpSeq.png** (Richness and diversity estimates plot for all groups) -* `ASV_physeq` (variable contiaining the Phyloseq object created using an OTU table based on the vst_counts) + +# Create merged stats pairwise dataframe +# initialize the merged stats dataframe to contain the taxon column for joining +merged_stats_df <- final_results_bc1[[names(final_results_bc1)[1]]] %>% + as.data.frame() %>% select(taxon) + +walk(comparisons[names(final_results_bc1)], .f = function(comparison){ -
+ df <- final_results_bc1[[comparison]] %>% as.data.frame() + + merged_stats_df <<- merged_stats_df %>% + dplyr::full_join(df, by = join_by("taxon")) + +}) -___ +# Sort ASVs in ascending order +merged_stats_df <- merged_stats_df %>% + rename(!!feature := taxon) %>% + mutate(!!feature := SortMixed(!!sym(feature))) -## 9. Taxonomic Summaries -Taxonomic summaries provide insights into the composition of microbial communities at various taxonomic levels. -```R -proportions_physeq <- transform_sample_counts(ASV_physeq, function(ASV) ASV / sum(ASV)) +comp_names <- merged_stats_df %>% + select(starts_with("logFC")) %>% + colnames() %>% str_remove_all("logFC_") +names(comp_names) <- comp_names -relative_phyla <- plot_bar(proportions_physeq, x = "groups", fill = "phylum") -relative_classes <- plot_bar(proportions_physeq, x = "groups", fill = "class") +message("Making volcano plots...") +# -------------- Make volcano plots ------------------ # +volcano_plots <- map(comp_names, function(comparison){ + + comp_col <- c( + glue("logFC_{comparison}"), + glue("lfcSE_{comparison}"), + glue("Wstat_{comparison}"), + glue("pvalue_{comparison}"), + glue("qvalue_{comparison}"), + glue("diff_{comparison}") + ) + + sub_res_df <- merged_stats_df %>% + select(!!feature, all_of(comp_col)) %>% drop_na() + colnames(sub_res_df) <- str_replace_all(colnames(sub_res_df), + pattern = "(.+)_.+", + replacement = "\\1") + + p <- ggplot(sub_res_df, aes(x=logFC, y=-log10(pvalue), color=diff, label=!!sym(feature))) + + geom_point(size=4) + + scale_color_manual(values=c("TRUE"="red", "FALSE"="black")) + + geom_hline(yintercept = -log10(0.05), linetype = "dashed") + + ggrepel::geom_text_repel() + + labs(x="logFC", y="-log10(Pvalue)", + title = comparison, color="Significant") + publication_format + + ggsave(filename = glue("{output_prefix}{comparison}_volcano{assay_suffix}.png"), plot = p, device = "png", + width = 6, height = 8, units = "in", dpi = 300, path = diff_abund_out_dir) + + return(p) +}) + +number_of_columns <- 2 +number_of_rows = ceiling(length(comp_names) / number_of_columns) +fig_height = 7.5 * number_of_rows + +p <- wrap_plots(volcano_plots, ncol = 2) +# Try to combine all the volcano plots in one figure +try( +ggsave(filename = glue("{output_prefix}{feature}_volcano{assay_suffix}.png"), plot = p, device = "png", + width = 16, height = fig_height, units = "in", dpi = 300, + path = diff_abund_out_dir, limitsize = FALSE) +) + +# Add NCBI id to feature i.e. 
ASV +tax_names <- map_chr(str_replace_all(taxonomy_table$species, ";_","") %>% + str_split(";"), + function(row) row[length(row)]) + +df <- data.frame(ASV=rownames(taxonomy_table), best_taxonomy=tax_names) + +# Pull NCBI IDS for unique taxonomy names +df2 <- data.frame(best_taxonomy = df$best_taxonomy %>% + unique()) %>% + mutate(NCBI_id=get_ncbi_ids(best_taxonomy, target_region), + .after = best_taxonomy) + +df <- df %>% + left_join(df2, join_by("best_taxonomy")) %>% + right_join(merged_stats_df) + + +# Manually creating a normalized table because normalized +# tables differ by comparison +normalized_table <- as.data.frame(feature_table + 1) %>% + rownames_to_column(feature) %>% + mutate(across( where(is.numeric), log ) ) + + +samples <- metadata[[samples_column]] +samplesdropped <- setdiff(x = samples, y = colnames(normalized_table)[-1]) +missing_df <- data.frame(ASV=normalized_table[[feature]], + matrix(data = NA, + nrow = nrow(normalized_table), + ncol = length(samplesdropped) + ) +) +colnames(missing_df) <- c(feature,samplesdropped) + + +group_levels <- metadata[, group] %>% unique() %>% sort() +group_means_df <- normalized_table[feature] +walk(group_levels, function(group_level){ + + + mean_col <- glue("Group.Mean_({group_level})") + std_col <- glue("Group.Stdev_({group_level})") + + # Samples that belong to the current group + Samples <- metadata %>% + filter(!!sym(group) == group_level) %>% + pull(!!sym(samples_column)) + # Samples that belong to the current group that are in the normalized table + Samples <- intersect(colnames(normalized_table), Samples) + + temp_df <- normalized_table %>% select(!!feature, all_of(Samples)) %>% + rowwise() %>% + mutate(!!mean_col := mean(c_across(where(is.numeric))), + !!std_col := sd(c_across(where(is.numeric))) ) %>% + select(!!feature,!!sym(mean_col), !!sym(std_col)) + + group_means_df <<- group_means_df %>% left_join(temp_df) + +}) + + +# Append Mean and standard deviation +normalized_table <- normalized_table %>% + rowwise() %>% + mutate(All.Mean=mean(c_across(where(is.numeric))), + All.Stdev=sd(c_across(where(is.numeric))) )%>% + left_join(missing_df, by = feature) %>% + select(!!feature, all_of(samples), All.Mean, All.Stdev) + + +merged_df <- df %>% + left_join(taxonomy_table %>% + as.data.frame() %>% + rownames_to_column(feature)) %>% + select(!!feature, domain:species,everything()) # Try to generalize + + +merged_df <- merged_df %>% + select(!!sym(feature):NCBI_id) %>% + left_join(normalized_table, by = feature) %>% + left_join(merged_df) %>% + left_join(group_means_df, by = feature) %>% + mutate(across(where(is.numeric), ~round(.x, digits=3))) %>% + mutate(across(where(is.matrix), as.numeric)) + +output_file <- glue("{diff_abund_out_dir}/{output_prefix}ancombc1_differential_abundance{assay_suffix}.csv") +message("Writing out results of differential abundance using ANCOMBC1...") +write_csv(merged_df,output_file) + + +# --------------- Make log abundance box plots ------------------ # + +df2 <- (metadata %>% select(!!samples_column, !!group)) %>% + left_join(feature_table %>% + t %>% + as.data.frame %>% + rownames_to_column(samples_column)) + +message("Making abundance box plots...") +boxplots <- map( merged_stats_df[[feature]], function(feature){ + + p <- ggplot(df2, aes(x=!!sym(group), y=log(!!sym(feature)+1), fill=!!sym(group) )) + + geom_boxplot() + + labs(x=NULL, y="Log Abundance", fill=tools::toTitleCase(group), title = feature) + + theme_light() + + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), + axis.title.y 
= element_text(face = "bold", size=12),
+        legend.text = element_text(face = "bold", size=10),
+        legend.title = element_text(face = "bold", size=12))
+
+  # Save feature boxplot as separate figures
+  ggsave(plot = p, filename = glue("{output_prefix}{feature}_boxplot{assay_suffix}.png"), device = "png",
+         width = 8, height = 5, units = "in", dpi = 300, path = diff_abund_out_dir)
+
+  return(p)
+})
+
+p <- wrap_plots(boxplots, ncol = 2, guides = 'collect')
+
+number_of_features <- merged_stats_df[[feature]] %>% length
+number_of_columns <- 2
+number_of_rows = ceiling(number_of_features / number_of_columns)
+fig_height = 5 * number_of_rows
+
+# Try to plot all features / ASVs in one figure
+try(
+ggsave(filename = glue("{output_prefix}{feature}_boxplots{assay_suffix}.png"), plot = p, device = "png",
+       width = 14, height = fig_height, units = "in", dpi = 300,
+       limitsize = FALSE, path = diff_abund_out_dir) # There may be too many features to plot

-samplewise_phyla <- plot_bar(proportions_physeq, fill = "phylum")
-samplewise_classes <- plot_bar(proportions_physeq, fill = "class")
+)

-ggsave(filename = "relative_phyla_GLAmpSeq.png", plot = relative_phyla)
-ggsave(filename = "relative_classes_GLAmpSeq.png", plot = relative_classes)
-ggsave(filename = "samplewise_relative_phyla_GLAmpSeq.png", plot = samplewise_phyla)
-ggsave(filename = "samplewise_relative_classes_GLAmpSeq.png", plot = samplewise_classes)
 ```

 **Input Data:**
-
-* `ASV_physeq` (variable contiaining the Phyloseq object, output from [step 8b](#8b-richness-and-diversity-estimates))
+* **amplicon_metadata.csv** (metadata table)
+* **counts_GLAmpSeq.tsv** (count table)
+* **taxonomy_GLAmpSeq.tsv** (taxonomy table)

 **Output Data:**

-* **relative_phyla_GLAmpSeq.png** (taxonomic summaries plot based on phyla, for all samples)
-* **relative_classes_GLAmpSeq.png** (taxonomic summaries plot based on class, for all samples)
-
-* **samplewise_phyla_GLAmpSeq.png** (taxonomic summaries plot based on phyla, for all samples)
-* **samplewise_classes_GLAmpSeq.png** (taxonomic summaries plot based on class, for all samples)
+* **differential_abundance/ancombc1/_volcano_GLAmpSeq.png** (Comparison Volcano Plot)
+* **differential_abundance/ancombc1/_volcano_GLAmpSeq.png** (optional - Combined Volcano Plots)
+* **differential_abundance/ancombc1/ancombc1_differential_abundance_GLAmpSeq.csv** (Statistics Table)
+* **differential_abundance/ancombc1/_boxplot_GLAmpSeq.png** (ASV Boxplots)
+* **differential_abundance/ancombc1/_boxplots_GLAmpSeq.png** (Combined Boxplots)
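+
+> For reference, the comparison labels embedded in the statistics table column names (e.g. `logFC_(Group2)v(Group1)`) come from all unique two-group combinations of the `--group` column. Below is a minimal sketch of how these labels are generated, using hypothetical group names that are not from any actual dataset:
+
+```R
+library(purrr)
+library(stringr)
+
+# Hypothetical group levels (illustration only)
+groups <- c("Ground", "Space", "Control")
+
+# Each column of this matrix is one pairwise comparison
+pairwise_comp.m <- utils::combn(groups, 2)
+
+# Comparison names join the two group labels with "v"
+map_chr(as.data.frame(pairwise_comp.m), \(col) str_c(col, collapse = "v"))
+# returns "GroundvSpace", "GroundvControl", "SpacevControl"
+```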
-___ +--- + +### 10b. ANCOMBC 2 +```bash +Rscript pairwise_ancombc2.R \ + --metadata-table amplicon_runsheet.csv \ + --feature-table counts_GLAmpSeq.tsv \ + --taxonomy-table taxonomy_GLAmpSeq.tsv \ + --group groups \ + --samples-column 'Sample Name' \ + --target-region 16S \ + --remove-rare FALSE \ + --prevalence-cutoff 0.15 \ + --library-cutoff 100 \ + --cpus 5 -## 10. Differential Abundance Analysis +``` +**Parameter Definitions:** -Using Betadisper, permutational ANOVA, and DESeq2, we aim to uncover specific taxa that exhibit notable variations across different conditions, complemented by visualizations like volcano plots to illustrate these disparities and their implications on ASV expression and overall microbial community dynamics. +* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed +* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table +* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table +* `--group` – specifies the group column in metadata to be analyzed +* `--samples-column` – specifies the column in metadata containing the sample names in the feature table +* `--target-region` – specifies the amplicon target region. Options are either 16S, 18S or ITS +* `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed +* `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1. + Taxa with prevalences(the proportion of samples in which the taxon is present) less than --prevalence-cutoff will be excluded in the analysis. Default is 0.15, i.e. exclude taxa / features that are not present in at least 15% of the samples. +* `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes. + Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100. + if you do not want to discard any sample then set to 0. -
+* `--cpus ` - Specifies the number of cpus to use for parallel processing. -### 10a. Betadisper and Permutational ANOVA +Type `Rscript pairwise_ancombc2.R --help` at the commandline for a full list of available parameters -Use betadisper to check whether variability of data points in each group is similar. +Content of `pairwise_ancombc2.R` ```R -betadisper(d = euc_dist, group = sample_info_tab$groups) %>% anova() -``` +diff_abund_out_dir <- "differential_abundance/ancombc2/" +if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE) + +# Create phyloseq object +ps <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), + sample_data(metadata), + tax_table(as.matrix(taxonomy_table))) + +# Convert phyloseq to tree summarized experiment object +tse <- mia::makeTreeSummarizedExperimentFromPhyloseq(ps) + +# Getting the reference group and making sure that it is the reference +# used in the analysis +group_levels <- metadata[, group] %>% unique() %>% sort() +ref_group <- group_levels[1] +tse[[group]] <- factor(tse[[group]] , levels = group_levels) + +message("Running ANCOMBC2....") +# Run acombc2 +output <- ancombc2(data = tse, assay_name = "counts", tax_level = NULL, + fix_formula = group, rand_formula = NULL, + p_adj_method = "fdr", pseudo_sens = TRUE, + prv_cut = prevalence_cutoff, + lib_cut = library_cutoff, s0_perc = 0.05, + group = group, struc_zero = TRUE, neg_lb = FALSE, + alpha = 0.05, n_cl = threads, verbose = TRUE, + global = TRUE, pairwise = TRUE, + dunnet = TRUE, trend = FALSE, + iter_control = list(tol = 1e-5, max_iter = 20, + verbose = FALSE), + em_control = list(tol = 1e-5, max_iter = 100), + mdfdr_control = list(fwer_ctrl_method = "fdr", B = 100), + lme_control = NULL, trend_control = NULL) + + + +# Create new column names - the original column names given by ANCOMBC are +# difficult to understand +new_colnames <- map_chr(output$res_pair %>% colnames, + function(colname) { + # Columns comparing a group to the reference group + if(str_count(colname,group) == 1){ + str_replace_all(string=colname, + pattern=glue("(.+)_{group}(.+)"), + replacement=glue("\\1_(\\2)v({ref_group})")) %>% + str_replace(pattern = "^lfc_", replacement = "logFC_") %>% + str_replace(pattern = "^se_", replacement = "lfcSE_") %>% + str_replace(pattern = "^W_", replacement = "Wstat_") %>% + str_replace(pattern = "^p_", replacement = "pvalue_") %>% + str_replace(pattern = "^q_", replacement = "qvalue_") + + # Columns with normal two groups comparison + } else if(str_count(colname,group) == 2){ + + str_replace_all(string=colname, + pattern=glue("(.+)_{group}(.+)_{group}(.+)"), + replacement=glue("\\1_(\\2)v(\\3)")) %>% + str_replace(pattern = "^lfc_", replacement = "logFC_") %>% + str_replace(pattern = "^se_", replacement = "lfcSE_") %>% + str_replace(pattern = "^W_", replacement = "Wstat_") %>% + str_replace(pattern = "^p_", replacement = "pvalue_") %>% + str_replace(pattern = "^q_", replacement = "qvalue_") + + # Feature/ ASV column + } else{ + + return(colname) + } + } ) + + +# Change the column named taxon to the feature name e.g. 
ASV +new_colnames[match("taxon", new_colnames)] <- feature + + +# Round numeric values and rename columns +paired_stats_df <- output$res_pair %>% + mutate(across(where(is.numeric), ~round(.x, digits=3))) %>% + set_names(new_colnames) + +# Get the unique comparison names +uniq_comps <- str_replace_all(new_colnames, ".+_(\\(.+\\))", "\\1") %>% unique() +uniq_comps <- uniq_comps[-match(feature, uniq_comps)] + + +# ------ Sort columns by group comparisons --------# +# Create a data frame containing only the feature/ASV column +res_df <- paired_stats_df[1] +walk(uniq_comps, function(comp){ + + # Get the results for a comparison + temp_df <- paired_stats_df %>% select(ASV, contains(comp)) + + # Merge the current comparison to previous comparisons by feature/ASV id + res_df <<- res_df %>% left_join(temp_df) +}) -**Parameter Definitions:** -* `betadisper()` – the vegan function we are calling, with the following parameters set within it -* `d=` - specifies the input distance object +# --------- Add NCBI id to feature ---------------# -* `group=` - specifies the sample grouping information +# Get the best taxonomy assigned to each ASV +tax_names <- map_chr(str_replace_all(taxonomy_table$species, ";_","") %>% + str_split(";"), + function(row) row[length(row)]) -* `%>% anova()` - Sends the output object from betadisper() to the anova() function to perform the permutational ANOVA test +df <- data.frame(ASV=rownames(taxonomy_table), best_taxonomy=tax_names) -Use adonis2 to test whether the mean of data differs significantly between groups. +message("Querying NCBI...") +# Pull NCBI IDS for unique taxonomy names +df2 <- data.frame(best_taxonomy = df$best_taxonomy %>% + unique()) %>% + mutate(NCBI_id=get_ncbi_ids(best_taxonomy, target_region), + .after = best_taxonomy) -```R -adonis_res <- adonis2(formula = euc_dist ~ sample_info_tab$groups) -``` +df <- df %>% + left_join(df2, join_by("best_taxonomy")) %>% + right_join(res_df) -**Parameter Definitions:** -* `adonis_res <-` – specifies the variable that will store the results within in our R environment +# Retrieve the normalized table +normalized_table <- output$bias_correct_log_table %>% + rownames_to_column(feature) %>% + mutate(across(where(is.numeric), ~replace_na(.x, replace=0))) -* `adonis2()` – the vegan function we are calling, with the following parameters set within it -* `formula=` - specifies the model formula or data matrix +samples <- metadata[[samples_column]] +samplesdropped <- setdiff(x = samples, y = colnames(normalized_table)[-1]) +missing_df <- data.frame(ASV=normalized_table[[feature]], + matrix(data = NA, + nrow = nrow(normalized_table), + ncol = length(samplesdropped) + ) + ) +colnames(missing_df) <- c(feature,samplesdropped) -Statistics from Adonis2 testing can be incorporated into PCoA visualizations using vst_pcoa which was made earlier. 
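+
+# Note: `get_ncbi_ids()` (called above to populate the NCBI_id column) is a
+# helper defined elsewhere in the workflow scripts. A minimal sketch of what
+# such a lookup could do, assuming it wraps the taxize package (hypothetical
+# illustration, not the workflow's exact implementation):
+#
+# get_ncbi_ids_sketch <- function(taxonomy_names, target_region){
+#   # target_region is kept for interface parity; a plain NCBI taxonomy
+#   # UID lookup does not need it
+#   uids <- taxize::get_uid(taxonomy_names)
+#   as.character(uids)
+# }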
-```R -r2_value <- adonis_res$R2[1] -prf_value <- adonis_res$`Pr(>F)`[1] - -label_PC1 <- sprintf("PC1 [%.1f%%]", percent_variance[1]) -label_PC2 <- sprintf("PC2 [%.1f%%]", percent_variance[2]) - -ordination_plot <- plot_ordination(vst_physeq, vst_pcoa, color = "groups") + - geom_point(size = 1) + - labs( - col = "Groups", - x = label_PC1, - y = label_PC2 - ) + - geom_text(aes(label = rownames(sample_info_tab)), show.legend = FALSE, hjust = 0.3, vjust = -0.4, size = 4) + - coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + - scale_color_manual(values = unique(sample_info_tab$color), - labels = unique(sample_info_tab$groups)) + - theme_bw() + theme(legend.position = "bottom", text = element_text(size = 15, ), - legend.direction = "vertical", - legend.justification = "center", - legend.box.just = "center", - legend.title.align = 0.5) + - annotate("text", x = Inf, y = -Inf, label = paste("R2:", toString(round(r2_value, 3))), hjust = 1.1, vjust = -2, size = 4)+ - annotate("text", x = Inf, y = -Inf, label = paste("Pr(>F)", toString(round(prf_value,4))), hjust = 1.1, vjust = -0.5, size = 4)+ ggtitle("PCoA") - -ordination_plot_u <- plot_ordination(vst_physeq, vst_pcoa, color = "groups") + - geom_point(size = 1) + - labs( - x = label_PC1, - y = label_PC2, - col = "Groups" - ) + - coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + - scale_color_manual(values = unique(sample_info_tab[[color_colname]]), - labels = unique(sample_info_tab$short_group_labels)) + - theme_bw() + theme(legend.position = "bottom", text = element_text(size = 15, ), - legend.direction = "vertical", - legend.justification = "center", - legend.box.just = "center", - legend.title.align = 0.5) + - annotate("text", x = Inf, y = -Inf, label = paste("R2:", toString(round(r2_value, 3))), hjust = 1.1, vjust = -2, size = 4)+ - annotate("text", x = Inf, y = -Inf, label = paste("Pr(>F)", toString(round(prf_value,4))), hjust = 1.1, vjust = -0.5, size = 4)+ ggtitle("PCoA") - -ggsave(filename="PCoA_w_labels_GLAmpSeq.png", plot=ordination_plot) -ggsave(filename="PCoA_without_labels_GLAmpSeq.png", plot=ordination_plot_u) +group_means_df <- normalized_table[feature] +walk(group_levels, function(group_level){ + + + mean_col <- glue("Group.Mean_({group_level})") + std_col <- glue("Group.Stdev_({group_level})") + + # Samples that belong to the current group + Samples <- metadata %>% + filter(!!sym(group) == group_level) %>% + pull(!!sym(samples_column)) + # Samples that belong to the current group that are in the normalized table + Samples <- intersect(colnames(normalized_table), Samples) + + temp_df <- normalized_table %>% select(!!feature, all_of(Samples)) %>% + rowwise() %>% + mutate(!!mean_col := mean(c_across(where(is.numeric))), + !!std_col := sd(c_across(where(is.numeric))) ) %>% + select(!!feature,!!sym(mean_col), !!sym(std_col)) + + group_means_df <<- group_means_df %>% left_join(temp_df) + +}) + + +# Calculate global mean and standard deviation +normalized_table <- normalized_table %>% + rowwise() %>% + mutate(All.Mean=mean(c_across(where(is.numeric))), + All.Stdev=sd(c_across(where(is.numeric))) ) %>% + left_join(missing_df, by = feature) %>% + select(!!feature, all_of(samples), All.Mean, All.Stdev) + +# Append the taxonomy table to the ncbi and stats table +merged_df <- df %>% + left_join(taxonomy_table %>% + as.data.frame() %>% + rownames_to_column(feature)) %>% + select(!!feature,domain:species,everything()) + +# Add group means and normalized table +merged_df <- merged_df %>% + select(!!sym(feature):NCBI_id) %>% + 
left_join(normalized_table, by = feature) %>%
+  left_join(merged_df) %>%
+  left_join(group_means_df, by = feature) %>%
+  mutate(across(where(is.numeric), ~round(.x, digits=3)))
+
+message("Writing out results of differential abundance using ANCOMBC2...")
+output_file <- glue("{diff_abund_out_dir}/{output_prefix}ancombc2_differential_abundance{assay_suffix}.csv")
+write_csv(merged_df,output_file)
+
+
+# ---------------------- Visualization --------------------------------------- #
+message("Making volcano plots...")
+# ------------ Make volcano ---------------- #
+volcano_plots <- map(uniq_comps, function(comparison){
+
+  comp_col <- c(
+    glue("logFC_{comparison}"),
+    glue("lfcSE_{comparison}"),
+    glue("Wstat_{comparison}"),
+    glue("pvalue_{comparison}"),
+    glue("qvalue_{comparison}"),
+    glue("diff_{comparison}"),
+    glue("passed_ss_{comparison}")
+  )
+
+
+  sub_res_df <- res_df %>%
+    select(!!feature, all_of(comp_col))
+  colnames(sub_res_df) <- str_replace_all(colnames(sub_res_df),
+                                          pattern = "(.+)_.+",
+                                          replacement = "\\1")
+
+  p <- ggplot(sub_res_df, aes(x=logFC, y=-log10(pvalue), color=diff, label=!!sym(feature))) +
+    geom_point(size=4) +
+    scale_color_manual(values=c("TRUE"="red", "FALSE"="black")) +
+    geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
+    ggrepel::geom_text_repel() +
+    labs(x="logFC", y="-log10(Pvalue)",
+         title = comparison, color="Significant")  + publication_format
+
+
+  ggsave(filename = glue("{output_prefix}{comparison}_volcano{assay_suffix}.png"), plot = p, device = "png",
+         width = 6, height = 8, units = "in", dpi = 300, path = diff_abund_out_dir)
+
+  return(p)
+
+})
+
+p <- wrap_plots(volcano_plots, ncol = 2)
+
+
+number_of_columns <- 2
+number_of_rows = ceiling(length(uniq_comps) / number_of_columns)
+fig_height = 7.5 * number_of_rows
+# Try to combine all the volcano plots in one figure
+try(
+ggsave(filename = glue("{output_prefix}{feature}_volcano{assay_suffix}.png"), plot = p, device = "png",
+       width = 16, height = fig_height, units = "in",
+       dpi = 300, limitsize = FALSE, path=diff_abund_out_dir)
+)
+
+# ------------- Box plots ---------------- #
+
+df2 <- (metadata %>% select(!!samples_column, !!group)) %>%
+  left_join(feature_table %>%
+              t %>%
+              as.data.frame %>%
+              rownames_to_column(samples_column))
+
+message("Making abundance box plots...")
+boxplots <- map(res_df[[feature]], function(feature){
+
+  p <- ggplot(df2, aes(x=!!sym(group), y=log(!!sym(feature)+1), fill=!!sym(group) )) +
+    geom_boxplot() +
+    labs(x=NULL, y="Log Abundance", fill=tools::toTitleCase(group), title = feature) +
+    theme_light() +
+    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
+          axis.title.y = element_text(face = "bold", size=12),
+          legend.text = element_text(face = "bold", size=10),
+          legend.title = element_text(face = "bold", size=12))
+
+  ggsave(filename = glue("{output_prefix}{feature}_boxplot{assay_suffix}.png"), plot = p, device = "png",
+         width = 8, height = 5, units = "in", dpi = 300, path = diff_abund_out_dir)
+
+  return(p)
+})
+
+
+p <- wrap_plots(boxplots, ncol = 2, guides = 'collect')
+
+number_of_features <- res_df[[feature]] %>% length
+number_of_columns <- 2
+number_of_rows = ceiling(number_of_features / number_of_columns)
+fig_height = 5 * number_of_rows
+
+# Try to plot all features / ASVs in one figure
+try(
+ggsave(filename = glue("{output_prefix}{feature}_boxplots{assay_suffix}.png"), plot = p, device = "png",
+       width = 14, height = fig_height, units = "in", dpi = 300,
+       path = diff_abund_out_dir, limitsize = FALSE)
+)
 ```

 **Input Data:**

-* `euc_dist` (variable containing the samplewise euclidean distance matrix of transposed VST-normalized counts, output from [step 7a](#7a-hierarchical-clustering))
-* `sample_info_tab` (variable containing a subtable of the runsheet, including the 'groups' and 'color' columns, output from [step 6b](#6b-environment-set-up))
-* `vst_physeq` (variable holding the Phyloseq object, output from [step 7b](#7b-ordination))
+* **amplicon_metadata.csv** (metadata table)
+* **counts_GLAmpSeq.tsv** (count table)
+* **taxonomy_GLAmpSeq.tsv** (taxonomy table)

 **Output Data:**

-* **PCoA_w_labels_GLAmpSeq.png** (principle Coordinates Analysis plot of VST transformed ASV counts, with sample labels)
-* **PCoA_without_labels_GLAmpSeq.png** (principle Coordinates Analysis plot of VST transformed ASV counts, without sample labels)
+* **differential_abundance/ancombc2/_volcano_GLAmpSeq.png** (Comparison Volcano Plot)
+* **differential_abundance/ancombc2/_volcano_GLAmpSeq.png** (optional - Combined Volcano Plots)
+* **differential_abundance/ancombc2/ancombc2_differential_abundance_GLAmpSeq.csv** (Statistics Table)
+* **differential_abundance/ancombc2/_boxplot_GLAmpSeq.png** (ASV Boxplots)
+* **differential_abundance/ancombc2/_boxplots_GLAmpSeq.png** (Combined Boxplots)
+
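+
+> The raw pairwise column names returned by ANCOMBC2 embed the name of the group column (e.g. `lfc_groupsSpace` when the group column is called `groups`); the renaming step in the script above converts them to the `logFC_(GroupA)v(GroupB)` convention used in the final table. A minimal sketch of that conversion with hypothetical names (a `groups` column, a `Space` level, and a `Ground` reference level):
+
+```R
+library(stringr)
+library(glue)
+
+group <- "groups"             # hypothetical group column name
+ref_group <- "Ground"         # hypothetical reference level
+colname <- "lfc_groupsSpace"  # example raw ANCOMBC2 column name
+
+# A column comparing one level to the reference mentions the group name once
+new_name <- str_replace_all(colname,
+                            pattern = glue("(.+)_{group}(.+)"),
+                            replacement = glue("\\1_(\\2)v({ref_group})"))
+str_replace(new_name, pattern = "^lfc_", replacement = "logFC_")
+# [1] "logFC_(Space)v(Ground)"
+```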
-### 10b. Differential abundance analysis with DESeq2 +--- -DESeq2 can be used to identify specific ASVs that have significantly different copy-number counts between sample groups. +### 10c. DESeq2 + +```bash +Rscript run_deseq2.R \ + --metadata-table amplicon_runsheet.csv \ + --feature-table counts_GLAmpSeq.tsv \ + --taxonomy-table taxonomy_GLAmpSeq.tsv \ + --group groups \ + --samples-column 'Sample Name' \ + --target-region 16S \ + --remove-rare FALSE \ + --prevalence-cutoff 0.15 \ + --library-cutoff 100 -```R -deseq_obj <- phyloseq_to_deseq2(physeq = ASV_physeq, design = ~groups) ``` +**Parameter Definitions:** -**Parameter Definitions:** +* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed +* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table +* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table +* `--group` – specifies the group column in metadata to be analyzed +* `--samples-column` – specifies the column in metadata containing the sample names in the feature table +* `--target-region` – specifies the amplicon target region. Options are either 16S, 18S or ITS +* `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed +* `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1. + Taxa with prevalences(the proportion of samples in which the taxon is present) less than --prevalence-cutoff will be excluded in the analysis. Default is 0.15, i.e. exclude taxa / features that are not present in at least 15% of the samples. +* `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes. + Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100. + if you do not want to discard any sample then set to 0. -* `deseq_obj <-` – specifies the variable that will store the results within in our R environment +type `Rscript run_deseq2.R --help` at the commandline for a full list of available parameters -* `phyloseq_to_deseq2()` – the phyloseq function we are calling, with the following parameters set within it +Content of `run_deseq2.R` -* `physeq=` - specifies the phyloseq-class object +```R +# Create output directory if it doesn't already exist +diff_abund_out_dir <- "differential_abundance/deseq2/" +if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE) -* `design=` - a formula specifying the design of the experiment +#### pairwise comparisons +unique_groups <- unique(metadata[[group]]) -Run the DESeq() function to normalize for sample read-depth and composition, transform the data, and test for differential abundance between the groups. Save the size-factor-normalized counts. 
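+
+# Note: `gm_mean` used in the error handler above is a geometric-mean helper
+# assumed to be defined earlier in the full script. A minimal sketch of the
+# zero-tolerant form commonly used with phyloseq/DESeq2 (an assumption, not
+# necessarily this workflow's exact implementation):
+#
+# gm_mean <- function(x, na.rm = TRUE){
+#   exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
+# }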
+# Create phyloseq object +ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), + tax_table(as.matrix(taxonomy_table)), + sample_data(metadata)) +deseq_obj <- phyloseq_to_deseq2(physeq = ASV_physeq, + design = reformulate(group)) -```R -deseq_modeled <- DESeq(deseq_obj) +# Add pseudocount if any 0 count samples are present +if (sum(colSums(counts(deseq_obj)) == 0) > 0) { + count_data <- counts(deseq_obj) + 1 + + count_data <- as.matrix(apply(count_data, 2, as.integer)) + rownames(count_data) <- rownames(counts(deseq_obj)) + colnames(count_data) <- colnames(counts(deseq_obj)) + counts(deseq_obj) <- count_data +} -write.table(counts(deseq_modeled, normalized=TRUE), file = "normalized_counts_GLAmpSeq.tsv", sep="\t", row.names=TRUE, quote=FALSE) -``` +# Run Deseq +# https://rdrr.io/bioc/phyloseq/src/inst/doc/phyloseq-mixture-models.R +deseq_modeled <- tryCatch({ + # Attempt to run DESeq + DESeq(deseq_obj) +}, error = function(e) { + message("Error encountered in DESeq, applying alternative \ + method for size factor estimation...") + + geoMeans <- apply(counts(deseq_obj), 1, gm_mean) + + # Apply the alternative size factor estimation method + deseq_obj <- estimateSizeFactors(deseq_obj, geoMeans=geoMeans) + + # Call DESeq again with alternative geom mean size est + DESeq(deseq_obj) +}) -**Input Data:** -* `ASV_physeq` (variable contiaining the Phyloseq object, output from [step 8b](#8b-richness-and-diversity-estimates)) +# Get unique group comparison as a matrix +pairwise_comp.m <- utils::combn(metadata[,group] %>% unique, 2) +pairwise_comp_df <- pairwise_comp.m %>% as.data.frame + +colnames(pairwise_comp_df) <- map_chr(pairwise_comp_df, + \(col) str_c(col, collapse = "v")) +comparisons <- colnames(pairwise_comp_df) +names(comparisons) <- comparisons + +# Retrieve statistics table +merged_stats_df <- data.frame(ASV=rownames(feature_table)) +colnames(merged_stats_df) <- feature + +walk(pairwise_comp_df, function(col){ -**Output Data:** -* **normalized_counts_GLAmpSeq.tsv** (size factor normalized ASV counts table) -* `deseq_modeled` (variable holding the DESeq2 output data) + group1 <- col[1] + group2 <- col[2] + +df <- results(deseq_modeled, contrast = c(group, group1, group2)) %>% + data.frame() %>% + rownames_to_column(feature) %>% + set_names(c(feature , + glue("baseMean_({group1})v({group2})"), + glue("log2FC_({group1})v({group2})"), + glue("lfcSE_({group1})v({group2})"), + glue("stat_({group1})v({group2})"), + glue("pvalue_({group1})v({group2})"), + glue("padj_({group1})v({group2})") + )) -
+ + merged_stats_df <<- merged_stats_df %>% + dplyr::left_join(df, join_by("ASV")) +}) -### 10c. Volcano Plots -Define the function for creating the volcano plot and saving the normalized counts for a given contrast. -```R -plot_comparison <- function(group1, group2) { +# Add NCBI id to feature i.e. ASV +tax_names <- map_chr(str_replace_all(taxonomy_table$species, ";_","") %>% + str_split(";"), + function(row) row[length(row)]) + +df <- data.frame(ASV=rownames(taxonomy_table), best_taxonomy=tax_names) + +# Pull NCBI IDS for unique taxonomy names +df2 <- data.frame(best_taxonomy = df$best_taxonomy %>% + unique()) %>% + mutate(NCBI_id=get_ncbi_ids(best_taxonomy, target_region), + .after = best_taxonomy) + +df <- df %>% + left_join(df2, join_by("best_taxonomy")) %>% + right_join(merged_stats_df) + + + + +group_levels <- metadata[, group] %>% unique() %>% sort() +normalized_table <- counts(deseq_modeled, normalized=TRUE) %>% + as.data.frame() %>% + rownames_to_column(feature) + +# Creating a dataframe of samples that were dropped because they didn't +# meet are cut-off criteria +samples <- metadata[[samples_column]] +samplesdropped <- setdiff(x = samples, y = colnames(normalized_table)[-1]) +missing_df <- data.frame(ASV=normalized_table[[feature]], + matrix(data = NA, + nrow = nrow(normalized_table), + ncol = length(samplesdropped) + ) +) +colnames(missing_df) <- c(feature,samplesdropped) + + +group_means_df <- normalized_table[feature] +walk(group_levels, function(group_level){ + + + mean_col <- glue("Group.Mean_({group_level})") + std_col <- glue("Group.Stdev_({group_level})") + + # Samples that belong to the current group + Samples <- metadata %>% + filter(!!sym(group) == group_level) %>% + pull(!!sym(samples_column)) + # Samples that belong to the current group that are in the normalized table + Samples <- intersect(colnames(normalized_table), Samples) - deseq_res <- results(deseq_modeled, contrast = c("groups", group1, group2)) - norm_tab <- counts(deseq_modeled, normalized = TRUE) %>% data.frame() + temp_df <- normalized_table %>% select(!!feature, all_of(Samples)) %>% + rowwise() %>% + mutate(!!mean_col := mean(c_across(where(is.numeric))), + !!std_col := sd(c_across(where(is.numeric))) ) %>% + select(!!feature,!!sym(mean_col), !!sym(std_col)) + + group_means_df <<- group_means_df %>% left_join(temp_df) + +}) + + +# Append Mean and standard deviation +normalized_table <- normalized_table %>% + rowwise() %>% + mutate(All.Mean=mean(c_across(where(is.numeric))), + All.Stdev=sd(c_across(where(is.numeric))) )%>% + left_join(missing_df, by = feature) %>% + select(!!feature, all_of(samples), All.Mean, All.Stdev) + + +# Add taxonomy +merged_df <- df %>% + left_join(taxonomy_table %>% + as.data.frame() %>% + rownames_to_column(feature)) %>% + select(!!feature, domain:species,everything()) # Try to generalize + +# Merge all prepared tables +merged_df <- merged_df %>% + select(!!sym(feature):NCBI_id) %>% + left_join(normalized_table, by = feature) %>% + left_join(merged_df) %>% + left_join(group_means_df, by = feature) %>% + mutate(across(where(is.numeric), ~round(.x, digits=3))) %>% + mutate(across(where(is.matrix), as.numeric)) + + +output_file <- glue("{diff_abund_out_dir}/{output_prefix}deseq2_differential_abundance{assay_suffix}.csv") +message("Writing out results of differential abundance using DESeq2...") +write_csv(merged_df,output_file) + + + +# Make volcano plots +walk(pairwise_comp_df, function(col){ + group1 <- col[1] + group2 <- col[2] + + plot_width_inches <- 11.1 + 
plot_height_inches <- 8.33 + p_val <- 0.1 #also logfc cutoff? + + deseq_res <- results(deseq_modeled, contrast = c(group, group1, group2)) volcano_data <- as.data.frame(deseq_res) - p_val <- 0.1 + volcano_data <- volcano_data[!is.na(volcano_data$padj), ] - volcano_data$significant <- volcano_data$padj <= p_val + volcano_data$significant <- volcano_data$padj <= p_val #also logfc cutoff? - p <- ggplot(volcano_data, aes(x=log2FoldChange, y=-log10(padj), color=significant)) + + ###### Long x-axis label adjustments ########## + x_label <- paste("Log2 Fold Change\n(",group1," vs ",group2,")") + label_length <- nchar(x_label) + max_allowed_label_length <- plot_width_inches * 10 + + # Construct x-axis label with new line breaks if was too long + if (label_length > max_allowed_label_length){ + x_label <- paste("Log2 Fold Change\n\n(", group1, "\n vs \n", group2, ")", sep="") + } + ####################################### + + # ASVs promoted in space on right, reduced on left + p <- ggplot(volcano_data, aes(x=log2FoldChange, y=-log10(padj), + color=significant)) + geom_point(alpha=0.7, size=2) + - scale_color_manual(values=c("black", "red"), labels=c(paste0("padj > ", p_val), paste0("padj \u2264 ", p_val))) + + geom_hline(yintercept = -log10(p_val), linetype = "dashed") + + scale_color_manual(values=c("black", "red"), + labels=c(paste0("padj > ", p_val), + paste0(" padj \u2264 ", p_val))) + theme_bw() + labs(title="Volcano Plot", - x=paste("Log2 Fold Change\n(",group1," vs ",group2,")"), + x=x_label, y="-Log10 P-value", color=paste0("")) + theme(legend.position="top") + # label points and plot top_points <- volcano_data %>% arrange(padj) %>% filter(significant) %>% head(10) - volcano_plot <- p + geom_text_repel(data=top_points, aes(label=row.names(top_points)), size=3) - ggsave(filename=paste0("volcano_", - gsub(" ", "_", group1), - "_vs_", - gsub(" ", "_", group2), ".png"), - plot=volcano_plot, - width = 11.1, height = 8.33, dpi = 300) + volcano_plot <- p + geom_text_repel(data=top_points, + aes(label=row.names(top_points)), + size=3) - write.csv(deseq_res, file = paste0(gsub(" ", "_", group1), - "_vs_", - gsub(" ", "_", group2), ".csv")) -} -``` - -Create volcano plots for all pairwise comparisons. 
-
-```R
-comparisons <- expand.grid(group1 = unique_groups, group2 = unique_groups)
-comparisons <- subset(comparisons, group1 != group2)
-
-apply(comparisons, 1, function(pair) plot_comparison(pair['group1'], pair['group2']))
+  # Save volcano plot
+  ggsave(filename=glue("{diff_abund_out_dir}{output_prefix}volcano_{group1}_vs_{group2}.png"),
+         plot=volcano_plot,
+         width = plot_width_inches,
+         height = plot_height_inches,
+         dpi = 300)
+})
 ```

-**Parameter Definitions:**
-
-* `apply()` – the function we are calling, with the following parameters set within it
+**Input Data:**

-* `physeq=` - the data matrix or array on which the function is to be applied
+* **amplicon_metadata.csv** (metadata table)
+* **counts_GLAmpSeq.tsv** (count table)
+* **taxonomy_GLAmpSeq.tsv** (taxonomy table)

-* `1` – indicates that the function should be applied to each row
+**Output Data:**

-* `function(pair) plot_comparison(pair['group1'], pair['group2'])` – an anonymous function which takes a pair of values as input and executes the plot_comparison function on these values
+* **differential_abundance/deseq2/volcano__vs_.png** (Comparison Volcano Plot)
+* **differential_abundance/deseq2/deseq2_differential_abundance_GLAmpSeq.csv** (Statistics Table)

-**Input Data:**
-* `deseq_modeled` (variable holding the DESeq2 output data, output from [step 10b](#910-differential-abundance-analysis-with-deseq2))
-
-**Output Data:**
+
-* **group1_vs_group2.csv** (differential abundance tables for all pairwise contrasts of groups) -* **volcano_group1_vs_group2.png** (volcano plots for all pairwise contrasts of groups) +--- \ No newline at end of file From 0f68848d06523aac60c684de4bcf46f483cfad48 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 3 Jan 2025 15:47:18 -0800 Subject: [PATCH 11/24] Added more comments to pipeline document --- .../GL-DPPD-7104-B.md | 188 +++++++++++------- 1 file changed, 114 insertions(+), 74 deletions(-) diff --git a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md index 3eb74d25..97a9ce64 100644 --- a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md +++ b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md @@ -1677,6 +1677,7 @@ where distance_method is either bray or euclidean for Bray Curtis and Euclidean ## 9. Taxonomy Plots +Taxonomic summaries provide insights into the composition of microbial communities at various taxonomic levels. ```bash Rscript plot_taxonomy.R \ @@ -1880,6 +1881,8 @@ where taxon_level is one of phylum, class, order, family, genus and species. ## 10. Differential Abundance Testing +Using ANCOMBC 1, ANCOMBC 2, and DESeq2, we aim to uncover specific taxa that exhibit notable variations across different conditions, complemented by visualizations like volcano plots to illustrate these disparities and their implications on ASV expression and overall microbial community dynamics. + ### 10a. ANCOMBC 1 ```bash @@ -1901,8 +1904,8 @@ Rscript pairwise_ancombc1.R \ * `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed * `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table * `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table -* `--group` – specifies the group column in metadata to be analyzed -* `--samples-column` – specifies the column in metadata containing the sample names in the feature table +* `--group` – specifies the group column in the metadata to be analyzed +* `--samples-column` – specifies the column in the metadata containing the sample names in the feature table * `--target-region` – specifies the amplicon target region. Options are either 16S, 18S or ITS * `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed * `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1. @@ -1910,7 +1913,7 @@ Rscript pairwise_ancombc1.R \ * `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes. Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100. if you do not want to discard any sample then set to 0. -* `--cpus ` - Specifies the number of cpus to use for parallel processing. +* `--cpus ` - specifies the number of cpus to use for parallel processing. 
Type `Rscript pairwise_ancombc1.R --help` at the commandline for a full list of available parameters @@ -1922,7 +1925,7 @@ diff_abund_out_dir <- "differential_abundance/ancombc1/" if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE) -# Create phyloseq object +# Create phyloseq object from feature, taxonomy and sample metadata tables ps <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), sample_data(metadata), tax_table(as.matrix(taxonomy_table))) @@ -1934,20 +1937,22 @@ tse <- mia::makeTreeSummarizedExperimentFromPhyloseq(ps) # Get unique group comparison as a matrix pairwise_comp.m <- utils::combn(metadata[,group] %>% unique, 2) pairwise_comp_df <- pairwise_comp.m %>% as.data.frame - +# Name the columns in the pairwise matrix as group1vgroup2 colnames(pairwise_comp_df) <- map_chr(pairwise_comp_df, \(col) str_c(col, collapse = "v")) comparisons <- colnames(pairwise_comp_df) names(comparisons) <- comparisons -message("Running ANCOMBC1....") +# Running ANCOMBC 1 set.seed(123) final_results_bc1 <- map(pairwise_comp_df, function(col){ group1 <- col[1] group2 <- col[2] + # Subset the treeSummarizedExperiment object to contain only samples + # in group1 and group2 tse_sub <- tse[, tse[[group]] %in% c(group1, group2)] # Note that by default, levels of a categorical variable in R are sorted @@ -2038,7 +2043,7 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ glue("diff_({group2})v({group1})")) ) - + # Merge the dataframes to one results dataframe res <-lfc %>% left_join(se) %>% left_join(W) %>% @@ -2053,15 +2058,19 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ -# Create merged stats pairwise dataframe -# initialize the merged stats dataframe to contain the taxon column for joining +# ------------ Create merged stats pairwise dataframe ----------------- # +# Initialize the merged stats dataframe to contain the taxon column for joining merged_stats_df <- final_results_bc1[[names(final_results_bc1)[1]]] %>% as.data.frame() %>% select(taxon) +# Loop over the results of every comparion and join it the pre-existing +# stats table walk(comparisons[names(final_results_bc1)], .f = function(comparison){ + # Get comparison specific statistics df <- final_results_bc1[[comparison]] %>% as.data.frame() + # Merge it to the pre-existing statistics table merged_stats_df <<- merged_stats_df %>% dplyr::full_join(df, by = join_by("taxon")) @@ -2073,16 +2082,20 @@ merged_stats_df <- merged_stats_df %>% mutate(!!feature := SortMixed(!!sym(feature))) - +# ------ Get comparison names +# Since all column groups i.e. logFC, pval, W, etc. have the same +# suffixes as comparison names, we only need to extract the comparion names +# from one of them. 
Here we extract them from the "logFC" prefixed columns comp_names <- merged_stats_df %>% select(starts_with("logFC")) %>% colnames() %>% str_remove_all("logFC_") names(comp_names) <- comp_names -message("Making volcano plots...") + # -------------- Make volcano plots ------------------ # volcano_plots <- map(comp_names, function(comparison){ + # Construct column names for columns to be selected comp_col <- c( glue("logFC_{comparison}"), glue("lfcSE_{comparison}"), @@ -2091,13 +2104,15 @@ volcano_plots <- map(comp_names, function(comparison){ glue("qvalue_{comparison}"), glue("diff_{comparison}") ) - + + # Subset the statistics table for the current comparison sub_res_df <- merged_stats_df %>% select(!!feature, all_of(comp_col)) %>% drop_na() colnames(sub_res_df) <- str_replace_all(colnames(sub_res_df), pattern = "(.+)_.+", replacement = "\\1") + # Make plot p <- ggplot(sub_res_df, aes(x=logFC, y=-log10(pvalue), color=diff, label=!!sym(feature))) + geom_point(size=4) + scale_color_manual(values=c("TRUE"="red", "FALSE"="black")) + @@ -2105,13 +2120,14 @@ volcano_plots <- map(comp_names, function(comparison){ ggrepel::geom_text_repel() + labs(x="logFC", y="-log10(Pvalue)", title = comparison, color="Significant") + publication_format - + # Save the volcano plot ggsave(filename = glue("{output_prefix}{comparison}_volcano{assay_suffix}.png"), plot = p, device = "png", width = 6, height = 8, units = "in", dpi = 300, path = diff_abund_out_dir) return(p) }) +# Set dimensions for saving faceted plot number_of_columns <- 2 number_of_rows = ceiling(length(comp_names) / number_of_columns) fig_height = 7.5 * number_of_rows @@ -2124,7 +2140,8 @@ ggsave(filename = glue("{output_prefix}{feature}_volcano{assay_suffix}.png"), pl path = diff_abund_out_dir, limitsize = FALSE) ) -# Add NCBI id to feature i.e. ASV +# ------------------- Add NCBI id to feature i.e. 
ASV -------------- # +# Get the best/least possible taxonomy name for the ASVs tax_names <- map_chr(str_replace_all(taxonomy_table$species, ";_","") %>% str_split(";"), function(row) row[length(row)]) @@ -2160,6 +2177,7 @@ missing_df <- data.frame(ASV=normalized_table[[feature]], colnames(missing_df) <- c(feature,samplesdropped) +# Create mean and standard deviation table group_levels <- metadata[, group] %>% unique() %>% sort() group_means_df <- normalized_table[feature] walk(group_levels, function(group_level){ @@ -2186,7 +2204,7 @@ walk(group_levels, function(group_level){ }) -# Append Mean and standard deviation +# Append Mean and standard deviation to normalized table normalized_table <- normalized_table %>% rowwise() %>% mutate(All.Mean=mean(c_across(where(is.numeric))), @@ -2194,14 +2212,14 @@ normalized_table <- normalized_table %>% left_join(missing_df, by = feature) %>% select(!!feature, all_of(samples), All.Mean, All.Stdev) - +# Merge the taxonomy table to the stats table merged_df <- df %>% left_join(taxonomy_table %>% as.data.frame() %>% rownames_to_column(feature)) %>% - select(!!feature, domain:species,everything()) # Try to generalize - + select(!!feature, domain:species,everything()) +# Merge all the pre-combined dataframes merged_df <- merged_df %>% select(!!sym(feature):NCBI_id) %>% left_join(normalized_table, by = feature) %>% @@ -2210,20 +2228,21 @@ merged_df <- merged_df %>% mutate(across(where(is.numeric), ~round(.x, digits=3))) %>% mutate(across(where(is.matrix), as.numeric)) +# Writing out results of differential abundance using ANCOMBC 1 output_file <- glue("{diff_abund_out_dir}/{output_prefix}ancombc1_differential_abundance{assay_suffix}.csv") -message("Writing out results of differential abundance using ANCOMBC1...") write_csv(merged_df,output_file) # --------------- Make log abundance box plots ------------------ # +# Merge the metadata with the feature table df2 <- (metadata %>% select(!!samples_column, !!group)) %>% left_join(feature_table %>% t %>% as.data.frame %>% rownames_to_column(samples_column)) -message("Making abundance box plots...") +# Making abundance box plots boxplots <- map( merged_stats_df[[feature]], function(feature){ p <- ggplot(df2, aes(x=!!sym(group), y=log(!!sym(feature)+1), fill=!!sym(group) )) + @@ -2242,6 +2261,7 @@ boxplots <- map( merged_stats_df[[feature]], function(feature){ return(p) }) +# Make faceted plot p <- wrap_plots(boxplots, ncol = 2, guides = 'collect') number_of_features <- merged_stats_df[[feature]] %>% length @@ -2332,6 +2352,25 @@ tse[[group]] <- factor(tse[[group]] , levels = group_levels) message("Running ANCOMBC2....") # Run acombc2 + + # data - input data. TreeSummarizedExperiment or Phyloseq object + # assay_name - name of count table in the input data object. + # tax_level - taxonomy level for aggregation and analysis + # prv_cut - prevalence cut-off. proportion of samples in which taxon is present. + # lib_cut - a numerical threshold for filtering samples based on library sizes. + # p_adj_method - p-value adjustment method for multiple comparisons + # struc_zero - should group-wise rare taxa be detected + # neg_lb - whether to classify a taxon as a structural zero using its asymptotic lower bound. i.e.the best the algorithm can possibly achieve + # group - name of the group variable in metadata. Only important you'd like to perform global test can be set to NULL. 
+  # alpha - significance level
+  # n_cl - number of processes to run in parallel
+  # global - should a global test be performed to detect significant differences between at least 2 groups (ANOVA-like comparison)
+  # tol - iteration convergence tolerance for the E-M algorithm.
+  # max_iter - maximum number of iterations
+  # formula - fixed effects formula
+  # conserve - should a conservative variance estimator be used for the test statistic?
+  # It is recommended to set this to TRUE if your sample size is small and the number of expected differentially abundant taxa is large.
+
 output <- ancombc2(data = tse, assay_name = "counts", tax_level = NULL,
                    fix_formula = group, rand_formula = NULL,
                    p_adj_method = "fdr", pseudo_sens = TRUE,
@@ -2636,8 +2675,8 @@ Rscript run_deseq2.R \
 * `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed
 * `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table
 * `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table
-* `--group` – specifies the group column in metadata to be analyzed
-* `--samples-column` – specifies the column in metadata containing the sample names in the feature table
+* `--group` – specifies the group column in the metadata to be analyzed
+* `--samples-column` – specifies the column in the metadata containing the sample names in the feature table
 * `--target-region` – specifies the amplicon target region. Options are either 16S, 18S or ITS
 * `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed
 * `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1.
@@ -2655,21 +2694,19 @@ Content of `run_deseq2.R`
 
 diff_abund_out_dir <- "differential_abundance/deseq2/"
 if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE)
 
-#### pairwise comparisons
-unique_groups <- unique(metadata[[group]])
-
-# Create phyloseq object
+# Create phyloseq object from the feature, taxonomy and metadata tables
 ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE),
                        tax_table(as.matrix(taxonomy_table)),
                        sample_data(metadata))
-
+# Convert the phyloseq object to a DESeq2 object
 deseq_obj <- phyloseq_to_deseq2(physeq = ASV_physeq, design = reformulate(group))
 
 # Add pseudocount if any 0 count samples are present
 if (sum(colSums(counts(deseq_obj)) == 0) > 0) {
+  # Add pseudo count of 1
   count_data <- counts(deseq_obj) + 1
-
+  # Make all columns integer type
   count_data <- as.matrix(apply(count_data, 2, as.integer))
   rownames(count_data) <- rownames(counts(deseq_obj))
   colnames(count_data) <- colnames(counts(deseq_obj))
@@ -2679,7 +2716,8 @@ if (sum(colSums(counts(deseq_obj)) == 0) > 0) {
 # Run Deseq
 # https://rdrr.io/bioc/phyloseq/src/inst/doc/phyloseq-mixture-models.R
 deseq_modeled <- tryCatch({
-  # Attempt to run DESeq
+  # Attempt to run DESeq; if an error occurs, then attempt an alternative
+  # size factor estimation method
   DESeq(deseq_obj)
 }, error = function(e) {
   message("Error encountered in DESeq, applying alternative \
@@ -2698,7 +2736,7 @@ deseq_modeled <- tryCatch({
 # Get unique group comparison as a matrix
 pairwise_comp.m <- utils::combn(metadata[,group] %>% unique, 2)
 pairwise_comp_df <- pairwise_comp.m %>% as.data.frame
-
+# Set the colnames as group1vgroup2
 colnames(pairwise_comp_df) <- map_chr(pairwise_comp_df,
                                       \(col) str_c(col, collapse = "v"))
 comparisons <- colnames(pairwise_comp_df)
@@ -2712,8 +2750,9 @@
walk(pairwise_comp_df, function(col){
 
 group1 <- col[1]
 group2 <- col[2]
-
-df <- results(deseq_modeled, contrast = c(group, group1, group2)) %>%
+
+# Retrieve the statistics table for the current pair and rename the columns
+df <- results(deseq_modeled, contrast = c(group, group1, group2)) %>% # Get stats
   data.frame() %>%
   rownames_to_column(feature) %>%
   set_names(c(feature ,
@@ -2723,16 +2762,16 @@ df <- results(deseq_modeled, contrast = c(group, group1, group2)) %>%
               glue("stat_({group1})v({group2})"),
               glue("pvalue_({group1})v({group2})"),
               glue("padj_({group1})v({group2})")
-              ))
+              )) # rename the columns
 
 merged_stats_df <<- merged_stats_df %>%
   dplyr::left_join(df, join_by("ASV"))
 
 })
 
+# ---------------------- Add NCBI id to feature i.e. ASV
-
-# Add NCBI id to feature i.e. ASV
+# Get the best / lowest possible taxonomy assignment for the features i.e. ASVs
 tax_names <- map_chr(str_replace_all(taxonomy_table$species, ";_","") %>%
                        str_split(";"),
                      function(row) row[length(row)])
@@ -2749,16 +2788,13 @@ df <- df %>%
   left_join(df2, join_by("best_taxonomy")) %>%
   right_join(merged_stats_df)
-
-
-
-group_levels <- metadata[, group] %>% unique() %>% sort()
+# -------- Retrieve the DESeq2 normalized table from the DESeq2 model
 normalized_table <- counts(deseq_modeled, normalized=TRUE) %>%
   as.data.frame() %>%
   rownames_to_column(feature)
 
 # Creating a dataframe of samples that were dropped because they didn't
-# meet are cut-off criteria
+# meet our cut-off criteria
 samples <- metadata[[samples_column]]
 samplesdropped <- setdiff(x = samples, y = colnames(normalized_table)[-1])
 missing_df <- data.frame(ASV=normalized_table[[feature]],
@@ -2769,21 +2805,24 @@ missing_df <- data.frame(ASV=normalized_table[[feature]],
                          )
 colnames(missing_df) <- c(feature,samplesdropped)
 
-
+# Calculate mean and standard deviation of all ASVs for each group in
+# a dataframe called group_means_df
+group_levels <- metadata[, group] %>% unique() %>% sort()
 group_means_df <- normalized_table[feature]
 walk(group_levels, function(group_level){
-
+  # Initializing mean and std column names
   mean_col <- glue("Group.Mean_({group_level})")
   std_col <- glue("Group.Stdev_({group_level})")
 
-  # Samples that belong to the current group
+  # Get a vector of samples that belong to the current group
   Samples <- metadata %>% filter(!!sym(group) == group_level) %>%
     pull(!!sym(samples_column))
 
-  # Samples that belong to the current group that are in the normalized table
+  # Retain only samples that belong to the current group that are in the normalized table
   Samples <- intersect(colnames(normalized_table), Samples)
+  # Calculate the means and standard deviations for the current group
   temp_df <- normalized_table %>% select(!!feature, all_of(Samples)) %>%
     rowwise() %>%
     mutate(!!mean_col := mean(c_across(where(is.numeric))),
@@ -2795,67 +2834,67 @@ walk(group_levels, function(group_level){
 })
 
-# Append Mean and standard deviation
+# Append mean, standard deviation and missing samples to the normalized table
 normalized_table <- normalized_table %>%
   rowwise() %>%
   mutate(All.Mean=mean(c_across(where(is.numeric))),
-         All.Stdev=sd(c_across(where(is.numeric))) )%>%
-  left_join(missing_df, by = feature) %>%
-  select(!!feature, all_of(samples), All.Mean, All.Stdev)
+         All.Stdev=sd(c_across(where(is.numeric))) ) %>% # calculate mean and std
+  left_join(missing_df, by = feature) %>% # append missing samples
+  select(!!feature, all_of(samples), All.Mean, All.Stdev) # select columns of interest
 
 # Add taxonomy
-merged_df <- df %>%
+merged_df <- df %>% # statistics table
left_join(taxonomy_table %>%
              as.data.frame() %>%
-              rownames_to_column(feature)) %>%
-  select(!!feature, domain:species,everything()) # Try to generalize
+              rownames_to_column(feature)) %>% # append taxonomy table
+  select(!!feature, domain:species,everything()) # select columns of interest
 
 # Merge all prepared tables
 merged_df <- merged_df %>%
-  select(!!sym(feature):NCBI_id) %>%
-  left_join(normalized_table, by = feature) %>%
-  left_join(merged_df) %>%
-  left_join(group_means_df, by = feature) %>%
-  mutate(across(where(is.numeric), ~round(.x, digits=3))) %>%
-  mutate(across(where(is.matrix), as.numeric))
-
-
+  select(!!sym(feature):NCBI_id) %>% # select only the features and NCBI ids
+  left_join(normalized_table, by = feature) %>% # append the normalized table
+  left_join(merged_df) %>% # append the stats table
+  left_join(group_means_df, by = feature) %>% # append the group means and stds
+  mutate(across(where(is.numeric), ~round(.x, digits=3))) %>% # round numeric columns
+  mutate(across(where(is.matrix), as.numeric)) # convert matrix columns to numeric columns
+
+# Defining the output file
 output_file <- glue("{diff_abund_out_dir}/{output_prefix}deseq2_differential_abundance{assay_suffix}.csv")
-message("Writing out results of differential abundance using DESeq2...")
+# Writing out results of differential abundance using DESeq2
 write_csv(merged_df,output_file)
 
-# Make volcano plots
+# ------------------------- Make volcano plots ------------------------ #
+# Loop over group pairs and make a volcano plot comparing the pair
 walk(pairwise_comp_df, function(col){
 
   group1 <- col[1]
   group2 <- col[2]
 
+  # Setting plot dimensions
   plot_width_inches <- 11.1
   plot_height_inches <- 8.33
 
-  p_val <- 0.1 #also logfc cutoff?
+  p_val <- 0.1 # adjusted p-value significance cutoff
 
+  # Retrieve data for plotting
   deseq_res <- results(deseq_modeled, contrast = c(group, group1, group2))
   volcano_data <- as.data.frame(deseq_res)
-
-  volcano_data <- volcano_data[!is.na(volcano_data$padj), ]
-  volcano_data$significant <- volcano_data$padj <= p_val #also logfc cutoff?
+  volcano_data$significant <- volcano_data$padj <= p_val
 
-  ###### Long x-axis label adjustments ##########
+  # -------- Long x-axis label adjustments
   x_label <- paste("Log2 Fold Change\n(",group1," vs ",group2,")")
   label_length <- nchar(x_label)
   max_allowed_label_length <- plot_width_inches * 10
 
-  # Construct x-axis label with new line breaks if was too long
+  # Construct x-axis label with new line breaks if it is too long
   if (label_length > max_allowed_label_length){
     x_label <- paste("Log2 Fold Change\n\n(", group1, "\n vs \n", group2, ")", sep="")
   }
-  #######################################
 
-  # ASVs promoted in space on right, reduced on left
+  # -------- Compose the plot
   p <- ggplot(volcano_data, aes(x=log2FoldChange, y=-log10(padj), color=significant)) +
     geom_point(alpha=0.7, size=2) +
@@ -2870,19 +2909,20 @@ walk(pairwise_comp_df, function(col){
          color=paste0("")) +
     theme(legend.position="top")
 
-  # label points and plot
+  # Subset the 10 most significant ASVs based on adjusted p-values
   top_points <- volcano_data %>%
     arrange(padj) %>%
     filter(significant) %>%
     head(10)
 
+  # Add text of the top 10 ASVs to the volcano plot
   volcano_plot <- p +
     geom_text_repel(data=top_points,
                     aes(label=row.names(top_points)), size=3)
 
   # Save volcano plot
-  ggsave(filename=glue("{diff_abund_out_dir}{output_prefix}volcano_{group1}_vs_{group2}.png"),
-         plot=volcano_plot,
+  ggsave(filename = glue("{diff_abund_out_dir}{output_prefix}volcano_{group1}_vs_{group2}.png"),
+         plot = volcano_plot,
          width = plot_width_inches,
         height = plot_height_inches,
         dpi = 300)
@@ -2891,13 +2931,13 @@ walk(pairwise_comp_df, function(col){
 
**Input Data:**
 
-* **amplicon_metdata.csv** (metadata table)
+* **amplicon_runsheet.csv** (metadata table)
 * **counts_GLAmpSeq.tsv** (count table)
 * **taxonomy_GLAmpSeq.tsv** (taxonomy table)
 
**Output Data:**
 
-* **differential_abundance/deseq2/volcano__vs_.png** (Comparion Volcano Plot)
+* **differential_abundance/deseq2/volcano___vs__.png** (Comparison Volcano Plot)
 * **differential_abundance/deseq2/deseq2_differential_abundance_GLAmpSeq.csv** (Statistics Table)
 
 
From d1670ff4846957a817973592110831ef9d389cb5 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Mon, 6 Jan 2025 14:04:01 -0800
Subject: [PATCH 12/24] Fixed taxize's api rate error
---
 .../NF_AmpIllumina-B/workflow_code/main.nf       | 14 ++++++++------
 .../workflow_code/modules/ancombc.nf             |  7 +++++--
 .../workflow_code/modules/deseq.nf               |  7 ++++---
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
index 50d32ce5..96ee44ac 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
@@ -300,6 +300,7 @@ workflow {
         dada_counts = RUN_R_TRIM.out.counts
         dada_taxonomy = RUN_R_TRIM.out.taxonomy
         dada_biom = RUN_R_TRIM.out.biom
+        r_version = RUN_R_TRIM.out.version
 
         CUTADAPT.out.version | mix(software_versions_ch) | set{software_versions_ch}
         TRIMMED_FASTQC.out.version | mix(software_versions_ch) | set{software_versions_ch}
@@ -327,6 +328,7 @@ workflow {
         dada_counts = RUN_R_NOTRIM.out.counts
         dada_taxonomy = RUN_R_NOTRIM.out.taxonomy
         dada_biom = RUN_R_NOTRIM.out.biom
+        r_version = RUN_R_NOTRIM.out.version
 
         RUN_R_NOTRIM.out.version | mix(software_versions_ch) | set{software_versions_ch}
 
@@ -379,28 +381,28 @@ workflow {
     method = Channel.of(params.diff_abund_method)
     if
(params.diff_abund_method == "deseq2"){ - DESEQ(meta, dada_counts, dada_taxonomy, metadata) + DESEQ(meta, dada_counts, dada_taxonomy, metadata, r_version) DESEQ.out.version | mix(software_versions_ch) | set{software_versions_ch} }else if (params.diff_abund_method == "ancombc1"){ - ANCOMBC1(method, meta, dada_counts, dada_taxonomy, metadata) + ANCOMBC1(method, meta, dada_counts, dada_taxonomy, metadata, r_version) ANCOMBC1.out.version | mix(software_versions_ch) | set{software_versions_ch} }else if (params.diff_abund_method == "ancombc2"){ - ANCOMBC2(method, meta, dada_counts, dada_taxonomy, metadata) + ANCOMBC2(method, meta, dada_counts, dada_taxonomy, metadata, r_version) ANCOMBC2.out.version | mix(software_versions_ch) | set{software_versions_ch} }else{ - ANCOMBC1("ancombc1", meta, dada_counts, dada_taxonomy, metadata) + ANCOMBC1("ancombc1", meta, dada_counts, dada_taxonomy, metadata, r_version) ANCOMBC1.out.version | mix(software_versions_ch) | set{software_versions_ch} - ANCOMBC2("ancombc2", meta, dada_counts, dada_taxonomy, metadata) + ANCOMBC2("ancombc2", meta, dada_counts, dada_taxonomy, metadata, ANCOMBC1.out.version) ANCOMBC2.out.version | mix(software_versions_ch) | set{software_versions_ch} - DESEQ(meta, dada_counts, dada_taxonomy, metadata) + DESEQ(meta, dada_counts, dada_taxonomy, metadata, ANCOMBC2.out.version) DESEQ.out.version | mix(software_versions_ch) | set{software_versions_ch} } diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf index c4048901..42978058 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf @@ -21,7 +21,7 @@ process ANCOMBC { path(feature_table) path(taxonomy) path(metadata) - + path(version) // dummy path to ensure dependency between this step and the step that generates this file output: path("differential_abundance/${method}/"), emit: output_dir path("versions.txt"), emit: version @@ -76,9 +76,12 @@ workflow { metadata = Channel.fromPath(params.input_file, checkIfExists: true) asv_table = Channel.fromPath(params.asv_table, checkIfExists: true) taxonomy = Channel.fromPath(params.taxonomy, checkIfExists: true) + // Dummy file + version = Channel.fromPath(params.taxonomy, checkIfExists: true) + method = Channel.of(params.diff_abund_method) - ANCOMBC(method, meta, asv_table, taxonomy, metadata) + ANCOMBC(method, meta, asv_table, taxonomy, metadata, version) emit: version = ANCOMBC.out.version diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf index a005d169..9b42a876 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf @@ -12,7 +12,7 @@ process DESEQ { path(feature_table) path(taxonomy_table) path(metadata) - + path(version) // dummy path to ensure dependency between this step and the step that generates this file output: path("differential_abundance/deseq2/"), emit: output_dir path("versions.txt"), emit: version @@ -52,9 +52,10 @@ workflow { metadata = Channel.fromPath(params.metadata, checkIfExists: true) asv_table = Channel.fromPath(params.asv_table, checkIfExists: true) taxonomy = 
Channel.fromPath(params.taxonomy, checkIfExists: true) - + // Dummy file + version = Channel.fromPath(params.taxonomy, checkIfExists: true) - DESEQ(meta, metadata, asv_table, taxonomy) + DESEQ(meta, metadata, asv_table, taxonomy, version) emit: version = DESEQ.out.version From e579cba657ef1399cfd8b37575722b2cf50dbae9 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 6 Jan 2025 15:27:32 -0800 Subject: [PATCH 13/24] fixed post processing bug --- .../bin/GL-gen-amplicon-file-associations-table | 6 +++--- .../NF_AmpIllumina-B/workflow_code/post_processing.config | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-amplicon-file-associations-table b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-amplicon-file-associations-table index dafe9887..de176dcc 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-amplicon-file-associations-table +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-amplicon-file-associations-table @@ -253,7 +253,7 @@ def get_read_counts_from_raw_multiqc(mapping_tab, raw_multiqc_stats_file_path, # Reading in zip_file = zipfile.ZipFile(curr_file_path) - curr_df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,5]) + curr_df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6]) curr_df.columns = ["sample", "counts"] curr_df.set_index("sample", inplace = True) @@ -268,7 +268,7 @@ def get_read_counts_from_raw_multiqc(mapping_tab, raw_multiqc_stats_file_path, else: input_zip = os.path.join(fastqc_dir, output_prefix + raw_multiqc_zip) zip_file = zipfile.ZipFile(input_zip) - df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,5]) + df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6]) df.columns = ["sample", "counts"] df.set_index("sample", inplace = True) @@ -531,4 +531,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config index 919e6d8b..cbb08445 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config @@ -53,9 +53,9 @@ params { main = "./main.nf" config = "./nextflow.config" samples = "./unique-sample-IDs.txt" - assay_table = "" // e.g. "../Genelab/a_GLDS-487_amplicon-sequencing_16s_illumina-1.txt" - isa_zip = "" // e.g. "../Genelab/OSD-487_metadata_GLDS-487-ISA.zip" - runsheet = "" // e.g "../Genelab/GLfile.csv" + assay_table = "" // e.g. "../GeneLab/a_GLDS-487_amplicon-sequencing_16s_illumina-1.txt" + isa_zip = "" // e.g. 
"../GeneLab/OSD-487_metadata_GLDS-487-ISA.zip" + runsheet = "" // e.g "../GeneLab/GLfile.csv" software_versions = "../Metadata/software_versions.txt" } From e48589afa4fbad08841d9a426d0be8df1583261f Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 8 Jan 2025 14:15:00 -0800 Subject: [PATCH 14/24] Added remove rare functionality --- .../workflow_code/bin/generate_protocol.sh | 20 ++++--- .../NF_AmpIllumina-B/workflow_code/main.nf | 47 +++++++++++----- .../workflow_code/modules/ancombc.nf | 22 +++----- .../workflow_code/modules/deseq.nf | 21 ++++--- .../workflow_code/modules/diversity.nf | 49 +++++++++-------- .../workflow_code/modules/genelab.nf | 2 + .../workflow_code/modules/taxonomy_plots.nf | 20 ++++--- .../workflow_code/nextflow.config | 7 ++- .../workflow_code/run_workflow.py | 55 ++++++++++++++----- 9 files changed, 151 insertions(+), 92 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/generate_protocol.sh b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/generate_protocol.sh index 6976655c..280eb1da 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/generate_protocol.sh +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/generate_protocol.sh @@ -1,17 +1,19 @@ #!/usr/bin/env bash -FASTQC=`grep -i 'fastqc' $1 | awk '{print $2}' |sed -E 's/v//'` -MULTIQC=`grep -i 'multiqc' $1 | awk '{print $3}' |sed -E 's/v//'` -DADA=`grep -i 'dada2' $1 | awk '{print $2}' |sed -E 's/v//'` -DECIPHER=`grep -i 'decipher' $1 | awk '{print $2}' |sed -E 's/v//'` -CUTADAPT=`grep -i 'cutadapt' $1 | awk '{print $2}' |sed -E 's/v//'` +FASTQC=`grep -i 'fastqc' $1 | sort -u |awk '{print $2}' |sed -E 's/v//'` +MULTIQC=`grep -i 'multiqc' $1 | sort -u |awk '{print $3}' |sed -E 's/v//'` +DADA=`grep -i 'dada2' $1 | sort -u |awk '{print $2}' |sed -E 's/v//'` +DECIPHER=`grep -i 'decipher' $1 | sort -u | awk '{print $2}' |sed -E 's/v//'` +CUTADAPT=`grep -i 'cutadapt' $1 | sort -u |awk '{print $2}' |sed -E 's/v//'` +ANCOMBC=`grep -i 'ancombc' $1 | sort -u |awk '{print $2}' |sed -E 's/v//'` +DESEQ=`grep -i 'deseq' $1 | sort -u |awk '{print $2}' |sed -E 's/v//'` -PROTOCOL_ID=$2 # GL-DPPD-7104-A +PROTOCOL_ID=$2 # GL-DPPD-7104-B PROTOCOL="Data were processed as described in ${PROTOCOL_ID} (https://github.com/nasa/GeneLab_Data_Processing/blob/master/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/${PROTOCOL_ID}.md), \ - using workflow NF_AmpIllumina v1.0.0 (https://github.com/nasa/GeneLab_Data_Processing/tree/NF_AmpIllumina_1.0.0/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina). \ + using workflow NF_AmpIllumina v1.0.0 (https://github.com/nasa/GeneLab_Data_Processing/tree/NF_AmpIllumina_1.0.0/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B). \ Quality assessment of reads was performed with FastQC v${FASTQC} and reports were summarized with MultiQC v${MULTIQC}. Primers were removed from raw reads using cutadapt v${CUTADAPT}. \ - DADA2 v${DADA} was utilized for quality-trimming, filtering and inference of amplicon sequence variants (ASVs), and taxonomy was assigned with DECIPHER v${DECIPHER} against the SILVA r138 reference database." + DADA2 v${DADA} was utilized for quality trimming, filtering and inference of amplicon sequence variants (ASVs), and taxonomy was assigned with DECIPHER v${DECIPHER} against the SILVA r138 reference database. Alpha and Beta diversity analyses were performed to detect within and between samples diversity differeneces, respectively. 
Taxonomy summary plots were made to compare and visualize sample and group microbial compositions. Finally, differential abundance testing to find significantly different ASVs between groups was carried out using a combination of DESeq2 v${DESEQ} and ANCOMBC1 and ANCOMBC2 from ANCOMBC v${ANCOMBC}."
 
-echo ${PROTOCOL}
\ No newline at end of file
+echo ${PROTOCOL}
diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
index 96ee44ac..38405305 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
@@ -67,6 +67,9 @@ if (params.help) {
     println("  --rarefaction_depth [INTEGER]  The Minimum desired sample rarefaction depth for diversity analysis. Default: 500.")
     println("  --group [STRING]  Column in input csv file with treatments to be compared. Default: 'groups' ")
     println("  --samples_column [STRING]  Column in input csv file with sample names belonging to each treatment group. Default: 'sample_id' ")
+    println("  --remove_rare [BOOLEAN] Should rare features be filtered out prior to analysis? If true, rare features will be removed. Options are true or false. Default: false.")
+    println("  --prevalence_cutoff [FLOAT] If --remove_rare is true, a numerical fraction between 0 and 1. Taxa with prevalences (the proportion of samples in which the taxon is present) less than this will be excluded from diversity and differential abundance analysis. Default is 0, i.e. do not exclude any taxa. For example, to exclude taxa that are not present in at least 15% of the samples set it to 0.15.")
+    println("  --library_cutoff [INTEGER] If --remove-rare is true, a numerical threshold for filtering samples based on library sizes. Samples with library sizes less than this number will be excluded in the analysis. Default is 0, i.e. do not remove any sample. For example, if you want to discard samples with library sizes less than 100 then set to 100.")
     println()
     println("File Suffixes:")
     println("  --primer_trimmed_R1_suffix [STRING]  Suffix to use for naming your primer trimmed forward reads. Default: _R1_trimmed.fastq.gz.")
@@ -84,7 +87,7 @@ if (params.help) {
     println()
     println("Genelab specific arguements:")
     println("  --accession [STRING]  A Genelab accession number if the --input_file parameter is not set. If this parameter is set, it will ignore the --input_file parameter.")
-    println("  --assay_suffix [STRING]  Genelabs assay suffix. Default: GLAmpSeq.")
+    println("  --assay_suffix [STRING]  GeneLab's assay suffix. Default: _GLAmpSeq.")
     println("  --output_prefix [STRING]  Unique name to tag onto output files.
Default: empty string.") println() println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/") @@ -138,6 +141,9 @@ log.info """ Diversity and Differential abundance Parameters: Method: ${params.diff_abund_method} Rarefaction Depth: ${params.rarefaction_depth} + Remove Rare Taxa and Samples: ${params.remove_rare} + Taxa Prevalence Cut Off: ${params.prevalence_cutoff} + Sample Library Cut Off: ${params.library_cutoff} Groups to Comapre Column: ${params.group} Samples Column: ${params.samples_column} @@ -343,25 +349,36 @@ workflow { // Diversity, differential abundance testing and their corresponding visualizations if(params.accession){ - meta = Channel.of(["samples": "Sample Name", - "group" : "groups", - "depth" : params.rarefaction_depth, - "assay_suffix" : params.assay_suffix, - "output_prefix" : params.output_prefix, - "target_region" : params.target_region - ]) + + values = ["samples": "Sample Name", + "group" : "groups", + "depth" : params.rarefaction_depth, + "assay_suffix" : params.assay_suffix, + "output_prefix" : params.output_prefix, + "target_region" : params.target_region, + "library_cutoff" : params.library_cutoff, + "prevalence_cutoff" : params.prevalence_cutoff, + "extra" : params.remove_rare ? "--remove-rare" : "" + ] + + meta = Channel.of(values) metadata = GET_RUNSHEET.out.runsheet }else{ - meta = Channel.of(["samples": params.samples_column, - "group" : params.group, - "depth" : params.rarefaction_depth, - "assay_suffix" : params.assay_suffix, - "output_prefix" : params.output_prefix, - "target_region" : params.target_region - ]) + values = ["samples": params.samples_column, + "group" : params.group, + "depth" : params.rarefaction_depth, + "assay_suffix" : params.assay_suffix, + "output_prefix" : params.output_prefix, + "target_region" : params.target_region, + "library_cutoff" : params.library_cutoff, + "prevalence_cutoff" : params.prevalence_cutoff, + "extra" : params.remove_rare ? 
"--remove-rare" : "" + ] + + meta = Channel.of(values) metadata = Channel.fromPath(params.input_file, checkIfExists: true) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf index 42978058..0d0c40d0 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf @@ -22,6 +22,7 @@ process ANCOMBC { path(taxonomy) path(metadata) path(version) // dummy path to ensure dependency between this step and the step that generates this file + output: path("differential_abundance/${method}/"), emit: output_dir path("versions.txt"), emit: version @@ -43,18 +44,11 @@ process ANCOMBC { --assay-suffix '${meta.assay_suffix}' \\ --output-prefix '${meta.output_prefix}' \\ --cpus ${task.cpus} \\ - --target-region '${meta.target_region}' + --target-region '${meta.target_region}' \\ + --prevalence-cutoff ${meta.prevalence_cutoff} \\ + --library-cutoff ${meta.library_cutoff} - Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nANCOMBC %s\\nphyloseq %s\\nmia %s\\ntaxize %s\\nDescTools %s\\npatchwork %s\\nggrepel %s\\n', \\ - packageVersion('tidyverse'), \\ - packageVersion('glue'), \\ - packageVersion('ANCOMBC'), \\ - packageVersion('phyloseq'), \\ - packageVersion('mia'), \\ - packageVersion('taxize'), \\ - packageVersion('DescTools'), \\ - packageVersion('patchwork'), \\ - packageVersion('ggrepel')); \\ + Rscript -e "VERSIONS=sprintf('ANCOMBC %s\\n', packageVersion('ANCOMBC')) write(x=VERSIONS, file='versions.txt', append=TRUE)" """ @@ -69,7 +63,10 @@ workflow { "group" : params.group, "assay_suffix" : params.assay_suffix, "output_prefix" : params.output_prefix, - "target_region" : params.target_region + "target_region" : params.target_region, + "library_cutoff" : params.library_cutoff, + "prevalence_cutoff" : params.prevalence_cutoff, + "extra" : params.remove_rare ? 
"--remove-rare" : "" ]) @@ -79,7 +76,6 @@ workflow { // Dummy file version = Channel.fromPath(params.taxonomy, checkIfExists: true) - method = Channel.of(params.diff_abund_method) ANCOMBC(method, meta, asv_table, taxonomy, metadata, version) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf index 9b42a876..f86d0ce9 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf @@ -13,6 +13,7 @@ process DESEQ { path(taxonomy_table) path(metadata) path(version) // dummy path to ensure dependency between this step and the step that generates this file + output: path("differential_abundance/deseq2/"), emit: output_dir path("versions.txt"), emit: version @@ -27,13 +28,12 @@ process DESEQ { --samples-column '${meta.samples}' \\ --assay-suffix '${meta.assay_suffix}' \\ --output-prefix '${meta.output_prefix}' \\ - --target-region '${meta.target_region}' + --target-region '${meta.target_region}' \\ + --prevalence-cutoff ${meta.prevalence_cutoff} \\ + --library-cutoff ${meta.library_cutoff} ${meta.extra} + - Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nDESeq2 %s\\nRColorBrewer %s\\n', \\ - packageVersion('tidyverse'), \\ - packageVersion('glue'), \\ - packageVersion('DESeq2'), \\ - packageVersion('RColorBrewer')); \\ + Rscript -e "VERSIONS=sprintf('DESeq2 %s\\n', packageVersion('DESeq2')); \\ write(x=VERSIONS, file='versions.txt', append=TRUE)" """ } @@ -44,8 +44,13 @@ workflow { meta = Channel.of(["samples": params.samples_column, "group" : params.group, + "depth" : params.rarefaction_depth, "assay_suffix" : params.assay_suffix, - "output_prefix" : params.output_prefix + "output_prefix" : params.output_prefix, + "target_region" : params.target_region, + "library_cutoff" : params.library_cutoff, + "prevalence_cutoff" : params.prevalence_cutoff, + "extra" : params.remove_rare ? 
"--remove-rare" : "" ]) @@ -54,7 +59,7 @@ workflow { taxonomy = Channel.fromPath(params.taxonomy, checkIfExists: true) // Dummy file version = Channel.fromPath(params.taxonomy, checkIfExists: true) - + DESEQ(meta, metadata, asv_table, taxonomy, version) emit: diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/diversity.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/diversity.nf index 71231be4..0d37d6ba 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/diversity.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/diversity.nf @@ -35,19 +35,15 @@ process ALPHA_DIVERSITY { --samples-column '${meta.samples}' \\ --rarefaction-depth ${meta.depth} \\ --assay-suffix '${meta.assay_suffix}' \\ - --output-prefix '${meta.output_prefix}' + --output-prefix '${meta.output_prefix}' \\ + --prevalence-cutoff ${meta.prevalence_cutoff} \\ + --library-cutoff ${meta.library_cutoff} ${meta.extra} + - Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nvegan %s\\nhere %s\\nphyloseq %s\\nFSA %s\\nmultcompView %s\\nrstatix %s\\npatchwork %s\\nRColorBrewer %s\\n', \\ - packageVersion('tidyverse'), \\ - packageVersion('glue'), \\ - packageVersion('vegan'), \\ - packageVersion('here'), \\ - packageVersion('phyloseq'), \\ + Rscript -e "VERSIONS=sprintf('FSA %s\\nmultcompView %s\\nrstatix %s\\n', \\ packageVersion('FSA'), \\ packageVersion('multcompView'), \\ - packageVersion('rstatix'), \\ - packageVersion('patchwork'), \\ - packageVersion('RColorBrewer')); \\ + packageVersion('rstatix')); \\ write(x=VERSIONS, file='versions.txt', append=TRUE)" """ @@ -82,18 +78,21 @@ process BETA_DIVERSITY { --samples-column '${meta.samples}' \\ --rarefaction-depth ${meta.depth} \\ --assay-suffix '${meta.assay_suffix}' \\ - --output-prefix '${meta.output_prefix}' + --output-prefix '${meta.output_prefix}' \\ + --prevalence-cutoff ${meta.prevalence_cutoff} \\ + --library-cutoff ${meta.library_cutoff} ${meta.extra} - Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\nvegan %s\\nhere %s\\nphyloseq %s\\nDESeq2 %s\\nggdendro %s\\nbroom %s\\nRColorBrewer %s\\n', \\ - packageVersion('tidyverse'), \\ - packageVersion('glue'), \\ + Rscript -e "VERSIONS=sprintf('vegan %s\\nmia %s\\nphyloseq %s\\nggdendro %s\\nbroom %s\\nRColorBrewer %s\\ntaxize %s\\nDescTools %s\\npatchwork %s\\nggrepel %s\\n', \\ packageVersion('vegan'), \\ - packageVersion('here'), \\ + packageVersion('mia'), \\ packageVersion('phyloseq'), \\ - packageVersion('DESeq2'), \\ packageVersion('ggdendro'), \\ packageVersion('broom'), \\ - packageVersion('RColorBrewer')); \\ + packageVersion('RColorBrewer'), \\ + packageVersion('taxize'), \\ + packageVersion('DescTools'), \\ + packageVersion('patchwork'), \\ + packageVersion('ggrepel')); \\ write(x=VERSIONS, file='versions.txt', append=TRUE)" """ @@ -104,11 +103,15 @@ workflow{ meta = Channel.of(["samples": params.samples_column, - "group" : params.group, - "depth" : params.rarefaction_depth, - "assay_suffix" : params.assay_suffix, - "output_prefix" : params.output_prefix - ]) + "group" : params.group, + "depth" : params.rarefaction_depth, + "assay_suffix" : params.assay_suffix, + "output_prefix" : params.output_prefix, + "target_region" : params.target_region, + "library_cutoff" : params.library_cutoff, + "prevalence_cutoff" : params.prevalence_cutoff, + "extra" : params.remove_rare ? 
"--remove-rare" : "" + ]) asv_table = Channel.fromPath(params.asv_table, checkIfExists: true) @@ -126,4 +129,4 @@ workflow{ emit: version = software_versions_ch -} \ No newline at end of file +} diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/genelab.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/genelab.nf index 96b828d1..b3db074c 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/genelab.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/genelab.nf @@ -190,6 +190,7 @@ process GENERATE_CURATION_TABLE { script: def INPUT_TABLE = "${params.files.assay_table}" == "" ? "--isa-zip ${input_table}" : "--assay-table ${input_table}" """ + GL-gen-amplicon-file-associations-table ${INPUT_TABLE} \\ --output '${GLDS_accession}_${output_prefix}-associated-file-names.tsv' \\ --GLDS-ID '${GLDS_accession}' \\ @@ -211,6 +212,7 @@ process GENERATE_CURATION_TABLE { --filtered_reads_dir '${filtered_reads_dir}' \\ --trimmed_reads_dir '${trimmed_reads_dir}' \\ --final_outputs_dir '${final_outputs_dir}' ${params.file_association_extra} + """ } diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/taxonomy_plots.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/taxonomy_plots.nf index aaa0650e..4a8fd692 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/taxonomy_plots.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/taxonomy_plots.nf @@ -36,11 +36,10 @@ process PLOT_TAXONOMY { --assay-suffix '${meta.assay_suffix}' \\ --output-prefix '${meta.output_prefix}' - Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\ntools %s\\nhere %s\\n', \\ + Rscript -e "VERSIONS=sprintf('tidyverse %s\\nglue %s\\ntools %s\\n', \\ packageVersion('tidyverse'), \\ packageVersion('glue'), \\ - packageVersion('tools'), \\ - packageVersion('here')); \\ + packageVersion('tools')); \\ write(x=VERSIONS, file='versions.txt', append=TRUE)" """ @@ -51,10 +50,15 @@ workflow { meta = Channel.of(["samples": params.samples_column, - "group" : params.group, - "assay_suffix" : params.assay_suffix, - "output_prefix" : params.output_prefix - ]) + "group" : params.group, + "depth" : params.rarefaction_depth, + "assay_suffix" : params.assay_suffix, + "output_prefix" : params.output_prefix, + "target_region" : params.target_region, + "library_cutoff" : params.library_cutoff, + "prevalence_cutoff" : params.prevalence_cutoff, + "extra" : params.remove_rare ? 
"--remove-rare" : "" + ]) asv_table = Channel.fromPath(params.asv_table, checkIfExists: true) @@ -67,4 +71,4 @@ workflow { emit: version = PLOT_TAXONOMY.out.version -} \ No newline at end of file +} diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config index 6c978ebf..42b29ada 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config @@ -68,6 +68,11 @@ params { diff_abund_method = "all" // ["all", "ancombc1", "ancombc2", or "deseq2"] - it runs all three by default group = "groups" // column in input csv file to be compared samples_column = "sample_id" // column in input csv file containing sample names + // Should rare features and samples be discarded. Values are true or false. If set to true then set the cutoffs below + remove_rare = false + prevalence_cutoff = 0 // a fraction between 0 and 1 that represents the prevalance in percentage of taxa to be retained + library_cutoff = 0 // Samples with library sizes less than this number will be excluded in the analysis + // Minimum desired sample rarefaction depth for diversity analysis rarefaction_depth = 500 @@ -233,6 +238,6 @@ manifest { description = 'Amplicon Illumina workflow for pipeline document GL-DPPD-7104-B' mainScript = 'main.nf' defaultBranch = 'main' - nextflowVersion = '>=22.10.1' + nextflowVersion = '>=24.04.4' version = '1.0.0' } diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py index 169d4158..e13154c4 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py @@ -12,7 +12,7 @@ def create_config(target_region, raw_R1_suffix, raw_R2_suffix, trim_primers, inp primer_trimmed_R1_suffix, primer_trimmed_R2_suffix, filtered_R1_suffix, filtered_R2_suffix, output_dir, diff_abund_method, group, samples_column, rarefaction_depth, errorStrategy, queueSize, default_cpus, default_memory, cutadapt_memory, R_cpus, R_memory, diversity_cpus, diversity_memory, container_genelab, container_fastqc, container_multiqc, container_cutadapt, - container_dada, container_ancom, container_diversity, singularity_cacheDir): + container_dada, container_ancom, container_diversity, singularity_cacheDir, remove_rare, prevalence_cutoff, library_cutoff): """ A function to create nextflow.config file by string interploation using the supplied arguements""" config = \ @@ -23,7 +23,6 @@ def create_config(target_region, raw_R1_suffix, raw_R2_suffix, trim_primers, inp target_region = "{target_region}" raw_R1_suffix = "{raw_R1_suffix}" raw_R2_suffix = "{raw_R2_suffix}" - raw_reads_dir = "{output_dir}/Raw_Sequence_Data/" trim_primers = "{trim_primers}" == "TRUE" ? 
true : false @@ -75,21 +74,28 @@ def create_config(target_region, raw_R1_suffix, raw_R2_suffix, trim_primers, inp // Directories - fastqc_out_dir = "{output_dir}/FastQC_Outputs/" - trimmed_reads_dir = "{output_dir}/Trimmed_Sequence_Data/" - filtered_reads_dir = "{output_dir}/Filtered_Sequence_Data/" - info_out_dir = "{output_dir}/Metadata/" - final_outputs_dir = "{output_dir}/Final_Outputs/" + + fastqc_out_dir = "{output_dir}/workflow_output/FastQC_Outputs/" + trimmed_reads_dir = "{output_dir}/workflow_output/Trimmed_Sequence_Data/" + filtered_reads_dir = "{output_dir}/workflow_output/Filtered_Sequence_Data/" + final_outputs_dir = "{output_dir}/workflow_output/Final_Outputs/" + + raw_reads_dir = "{output_dir}/Raw_Sequence_Data/" metadata_dir = "{output_dir}/Metadata/" genelab_dir = "{output_dir}/GeneLab/" // Multiqc - multiqc_config = "${{baseDir}}/config/multiqc.config" + multiqc_config = "${{projectDir}}/config/multiqc.config" // -------- Differential abundance parameters ----- // diff_abund_method = "{diff_abund_method}" group = "{group}" samples_column = "{samples_column}" + // Should rare features and samples be discarded. Values are true or false. If set to true then set the cutoffs below + remove_rare = "{remove_rare}" == "TRUE" ? true : false + prevalence_cutoff = {prevalence_cutoff} // a fraction between 0 and 1 that represents the prevalance in percentage of taxa to be retained + library_cutoff = {library_cutoff} // Samples with library sizes less than this number will be excluded in the analysis + // Minimum desired sample rarefaction depth for diversity analysis rarefaction_depth = {rarefaction_depth} @@ -255,7 +261,7 @@ def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') description = 'Amplicon Illumina workflow for pipeline document GL-DPPD-7104-B' mainScript = 'main.nf' defaultBranch = 'main' - nextflowVersion = '>=22.10.1' + nextflowVersion = '>=24.04.4' version = '1.0.0' }} @@ -405,15 +411,15 @@ def main(): type=int) parser.add_argument('--diff-abund-method', - choices=["ancombc1", "ancombc2", "deseq2"], - default="ancombc2", + choices=["all", "ancombc1", "ancombc2", "deseq2"], + default="all", help="The method to use for differential abundance testing. Either ancombc1, ancombc2, or deseq2. Default: ancombc2" , type=str) parser.add_argument('-d', '--output-dir', metavar='/path/to/output_dir/', - default='workflow_output', - help='Specifies the output directory for the output files generated by the workflow. Default: workflow_output', + default='..', + help='Specifies the output directory for the output files generated by the workflow. Default: ..', type=str) parser.add_argument('-m', '--min-cutadapt-len', @@ -465,6 +471,24 @@ def main(): help='If set to TRUE, specifies to concatenate forward and reverse reads only with dada2 instead of merging paired reads. Default: FALSE', type=str) + parser.add_argument('--remove-rare', + choices=['TRUE', 'FALSE'], + default='FALSE', + help='If set to TRUE, rare features and samples with library size less than cutoff will be removed. Default: FALSE', + type=str) + + parser.add_argument('--prevalence-cutoff', + default=0, + help='If --remove-rare is TRUE, a numerical fraction between 0 and 1. Taxa with prevalences(the proportion of samples in which the taxon is present) less than this will be excluded from diversity and differential abundance analysis. Default is 0 , i.e. do not exclude any taxa. 
For example, to exclude taxa that are not present in at least 15% of the samples set it to 0.15.', + metavar='prevalence_cutoff', + type=float) + + parser.add_argument('--library-cutoff', + default=0, + help='If --remove-rare is TRUE, a numerical threshold for filtering samples based on library sizes. Samples with library sizes less than this number will be excluded in the analysis. Default is 0 i.e do not remove any sample. For example, if you want to discard samples with library sizes less than 100, then set to 100.', + metavar='library_cutoff', + type=int) + parser.add_argument('--output-prefix', default='', help='Specifies the prefix to use on all output files to distinguish multiple primer sets, leave as an empty string if only one primer set is being processed. Default: empty string', @@ -621,7 +645,7 @@ def main(): default="", help="A comma separated string of extra arguement(s) to nextflow run e.g 'with-tower,name test'. \ Please run nextflow run -h for a full list of available options. Default: '' ", - metavar='singularity_directory', + metavar='extra_args', type=str) # Check if no arguments were provided @@ -676,7 +700,8 @@ def main(): args.rarefaction_depth, args.errorStrategy, args.queueSize, args.default_cpus, args.default_memory, args.cutadapt_memory, args.R_cpus, args.R_memory, args.diversity_cpus, args.diversity_memory, args.container_genelab, args.container_fastqc, args.container_multiqc, args.container_cutadapt, - args.container_dada, args.container_ancom, args.container_diversity, args.singularity_cacheDir) + args.container_dada, args.container_ancom, args.container_diversity, args.singularity_cacheDir, + args.remove_rare, args.prevalence_cutoff, args.library_cutoff) with open("nextflow.config", "w") as file: print(config_file, file=file) From b61a9b5f983ae3403ff8acba1ce6576a5048e666 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 8 Jan 2025 15:05:25 -0800 Subject: [PATCH 15/24] updated nextflow version --- .../NF_AmpIllumina-B/workflow_code/post_processing.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config index cbb08445..6a069647 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config @@ -82,8 +82,8 @@ params { } // Used as base for clean file paths -params.baseDir = "${baseDir}" -parent_dir = "${baseDir.getParent()}" +params.baseDir = "${projectDir}" +parent_dir = "${projectDir.getParent()}" // Setting the default container engine as singularity params.containerEngine = "singularity" // Conda shouldn't be used by default except when using conda-based profiles @@ -150,6 +150,6 @@ manifest { description = 'Amplicon Illumina post-processing workflow' mainScript = 'post_processing.nf' defaultBranch = 'main' - nextflowVersion = '>=22.10.1' + nextflowVersion = '>=24.04.4' version = '1.0.0' } From 1998ed25d1864edb42a8522dab61221394720867 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 8 Jan 2025 15:12:28 -0800 Subject: [PATCH 16/24] Fixed reads count column bug --- .../workflow_code/bin/GL-validate-processed-amplicon-data | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data 
b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data index 5a3e9d6f..26ee3670 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data @@ -312,8 +312,8 @@ def get_read_count_stats(validation_log, prefix, multiqc_zip, multiqc_stats_file """ Grabs read counts and summarizes """ zip_file = zipfile.ZipFile(multiqc_zip) - - df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = [5]) + read_count_column = 6 + df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = [read_count_column]) df.columns = ["counts"] counts = df.counts.tolist() @@ -486,4 +486,4 @@ def main(): get_read_count_stats(validation_log, filtered_prefix, filtered_multiqc_zip, filtered_multiqc_stats_file_path) if __name__ == "__main__": - main() \ No newline at end of file + main() From e11e03ff6a6c7a92fcc10553e1cafe288aedc7ee Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 9 Jan 2025 15:24:42 -0800 Subject: [PATCH 17/24] Updated readme and variables --- .../NF_AmpIllumina-B/README.md | 2 +- .../GL-gen-amplicon-file-associations-table | 8 ++++-- .../bin/GL-validate-processed-amplicon-data | 7 ++--- .../NF_AmpIllumina-B/workflow_code/main.nf | 25 +++++++++--------- .../workflow_code/modules/ancombc.nf | 6 ++--- .../workflow_code/modules/create_runsheet.nf | 2 +- .../workflow_code/modules/deseq.nf | 6 ++--- .../workflow_code/modules/genelab.nf | 6 ++--- .../modules/quality_assessment.nf | 10 +++---- .../workflow_code/modules/run_dada.nf | 2 +- .../workflow_code/post_processing.config | 26 +++++++++---------- 11 files changed, 53 insertions(+), 47 deletions(-) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md index bcf6c37a..656b44d2 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md @@ -207,7 +207,7 @@ python run_workflow.py --run --target-region 16S --input-file PE_file.csv --F-pr python run_workflow.py --target-region 16S --accession GLDS-487 --profile slurm,singularity ``` -> Note: When using the wrapper script, all outputs generated by the workflow will be in a directory specified by the `--output-dir` parameter. This will be a directory named `./workflow_output/` by default. +> Note: When using the wrapper script, all outputs generated by the workflow will be in a directory specified by the `--output-dir` parameter. This will be the parent directory `..` by default.
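
The read-count fixes in this series ([PATCH 13/24], [PATCH 16/24], and the two bin-script hunks below) all chase the same problem: the position of the read-count column in MultiQC's multiqc_general_stats.txt shifts between MultiQC versions, so hard-coded indices such as usecols = [5] or usecols = [0,6] can silently grab the wrong column. The earlier patches bump the hard-coded index; the hunks below instead select columns by position from the full table (first column for sample names, last column for read counts, as these reports are laid out). A minimal standalone sketch of that pattern follows; the function name and file paths are illustrative only, not the workflow's actual output names:

import zipfile
import pandas as pd

def read_sample_counts(multiqc_zip: str, stats_path: str) -> pd.DataFrame:
    """Return a sample-to-read-count table from a zipped MultiQC report."""
    # Read the general stats table packaged inside the MultiQC report zip
    with zipfile.ZipFile(multiqc_zip) as zip_file:
        with zip_file.open(stats_path) as handle:
            df = pd.read_csv(handle, sep="\t")
    # Select by position: first column = sample names, last column = read counts.
    # Keeping the indexer a list makes the result a DataFrame rather than a Series.
    df = df.iloc[:, [0, -1]]
    df.columns = ["sample", "counts"]
    return df.set_index("sample")

# Hypothetical usage (paths are illustrative only):
# counts = read_sample_counts("raw_multiqc_GLAmpSeq_report.zip",
#                             "raw_multiqc_report/raw_multiqc_data/multiqc_general_stats.txt")

The Series-versus-DataFrame detail matters here: df.iloc[:, -1] with a bare integer returns a Series, on which the subsequent df.columns = ["counts"] assignment would not work, which is why the GL-validate-processed-amplicon-data hunk below selects with iloc[:,[-1]] rather than a bare -1.
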
diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-amplicon-file-associations-table b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-amplicon-file-associations-table
index de176dcc..052497bc 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-amplicon-file-associations-table
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-amplicon-file-associations-table
@@ -253,7 +253,9 @@ def get_read_counts_from_raw_multiqc(mapping_tab, raw_multiqc_stats_file_path,
 
             # Reading in
             zip_file = zipfile.ZipFile(curr_file_path)
-            curr_df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6])
+            #curr_df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6])
+            curr_df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t")
+            curr_df = curr_df.iloc[:,[0,-1]] # retrieve the samples column [0] and the last column [-1], which is the read counts column
             curr_df.columns = ["sample", "counts"]
             curr_df.set_index("sample", inplace = True)
 
@@ -268,7 +270,9 @@ def get_read_counts_from_raw_multiqc(mapping_tab, raw_multiqc_stats_file_path,
     else:
         input_zip = os.path.join(fastqc_dir, output_prefix + raw_multiqc_zip)
         zip_file = zipfile.ZipFile(input_zip)
-        df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6])
+        #df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6])
+        df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t")
+        df = df.iloc[:,[0,-1]] # retrieve the samples column [0] and the last column [-1], which is the read counts column
         df.columns = ["sample", "counts"]
         df.set_index("sample", inplace = True)
 
diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data
index 26ee3670..5ab85a3d 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data
@@ -312,9 +312,10 @@ def get_read_count_stats(validation_log, prefix, multiqc_zip, multiqc_stats_file
     """ Grabs read counts and summarizes """
 
     zip_file = zipfile.ZipFile(multiqc_zip)
-    read_count_column = 6
-    df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = [read_count_column])
-
+    #read_count_column = 6
+    #df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = [read_count_column])
+    df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t")
+    df = df.iloc[:,[-1]] # retrieve the last column (the read counts); the list indexer keeps it a DataFrame
     df.columns = ["counts"]
 
     counts = df.counts.tolist()
 
diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
index 38405305..39fb8c6e 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
@@ -69,7 +69,7 @@ if (params.help) {
     println("  --samples_column [STRING]  Column in input csv file with sample names belonging to each treatment group.
Default: 'sample_id' ")
     println("  --remove_rare [BOOLEAN] Should rare features be filtered out prior to analysis? If true, rare features will be removed. Options are true or false. Default: false.")
     println("  --prevalence_cutoff [FLOAT] If --remove_rare is true, a numerical fraction between 0 and 1. Taxa with prevalences (the proportion of samples in which the taxon is present) less than this will be excluded from diversity and differential abundance analysis. Default is 0, i.e. do not exclude any taxa. For example, to exclude taxa that are not present in at least 15% of the samples set it to 0.15.")
-    println("  --library_cutoff [INTEGER] If --remove-rare is true, a numerical threshold for filtering samples based on library sizes. Samples with library sizes less than this number will be excluded in the analysis. Default is 0, i.e. do not remove any sample. For example, if you want to discard samples with library sizes less than 100 then set to 100.")
+    println("  --library_cutoff [INTEGER] If --remove-rare is true, a numerical threshold for filtering samples based on library sizes. Samples with library sizes less than this number will be excluded in the analysis. Default is 0, i.e. do not remove any sample. For example, if you want to discard samples with library sizes less than 100, then set to 100.")
     println()
     println("File Suffixes:")
@@ -266,7 +266,7 @@ workflow {
 
     // Generating a file with sample ids on a new line
     file_ch.map{row -> "${row.sample_id}"}
-                     .collectFile(name: "${baseDir}/unique-sample-IDs.txt", newLine: true)
+                     .collectFile(name: "${projectDir}/unique-sample-IDs.txt", newLine: true)
                      .set{sample_ids_ch}
 
     // Read quality check and trimming
@@ -306,7 +306,7 @@ workflow {
         dada_counts = RUN_R_TRIM.out.counts
         dada_taxonomy = RUN_R_TRIM.out.taxonomy
         dada_biom = RUN_R_TRIM.out.biom
-        r_version = RUN_R_TRIM.out.version
+        filtered_count = RUN_R_TRIM.out.filtered_count
 
         CUTADAPT.out.version | mix(software_versions_ch) | set{software_versions_ch}
         TRIMMED_FASTQC.out.version | mix(software_versions_ch) | set{software_versions_ch}
@@ -334,7 +334,7 @@ workflow {
         dada_counts = RUN_R_NOTRIM.out.counts
         dada_taxonomy = RUN_R_NOTRIM.out.taxonomy
         dada_biom = RUN_R_NOTRIM.out.biom
-        r_version = RUN_R_NOTRIM.out.version
+        filtered_count = RUN_R_NOTRIM.out.filtered_count
 
         RUN_R_NOTRIM.out.version | mix(software_versions_ch) | set{software_versions_ch}
 
@@ -398,41 +398,42 @@ workflow {
     method = Channel.of(params.diff_abund_method)
     if (params.diff_abund_method == "deseq2"){
-        DESEQ(meta, dada_counts, dada_taxonomy, metadata, r_version)
+        DESEQ(meta, dada_counts, dada_taxonomy, metadata, filtered_count)
 
         DESEQ.out.version | mix(software_versions_ch) | set{software_versions_ch}
 
     }else if (params.diff_abund_method == "ancombc1"){
-        ANCOMBC1(method, meta, dada_counts, dada_taxonomy, metadata, r_version)
+        ANCOMBC1(method, meta, dada_counts, dada_taxonomy, metadata, filtered_count)
 
         ANCOMBC1.out.version | mix(software_versions_ch) | set{software_versions_ch}
 
     }else if (params.diff_abund_method == "ancombc2"){
-        ANCOMBC2(method, meta, dada_counts, dada_taxonomy, metadata, r_version)
+        ANCOMBC2(method, meta, dada_counts, dada_taxonomy, metadata, filtered_count)
 
         ANCOMBC2.out.version | mix(software_versions_ch) | set{software_versions_ch}
 
     }else{
 
-        ANCOMBC1("ancombc1", meta, dada_counts, dada_taxonomy, metadata, r_version)
+        ANCOMBC1("ancombc1", meta, dada_counts, dada_taxonomy,
metadata, filtered_count) ANCOMBC1.out.version | mix(software_versions_ch) | set{software_versions_ch} - ANCOMBC2("ancombc2", meta, dada_counts, dada_taxonomy, metadata, ANCOMBC1.out.version) + ANCOMBC2("ancombc2", meta, dada_counts, dada_taxonomy, metadata, ANCOMBC1.out.output_dir) ANCOMBC2.out.version | mix(software_versions_ch) | set{software_versions_ch} - DESEQ(meta, dada_counts, dada_taxonomy, metadata, ANCOMBC2.out.version) + DESEQ(meta, dada_counts, dada_taxonomy, metadata, ANCOMBC2.out.output_dir) DESEQ.out.version | mix(software_versions_ch) | set{software_versions_ch} } // Software Version Capturing - combining all captured sofware versions - nf_version = "Nextflow Version:".concat("${nextflow.version}\n<><><>\n") + nf_version = "Nextflow Version ".concat("${nextflow.version}") nextflow_version_ch = Channel.value(nf_version) // Write software versions to file - software_versions_ch | map { it.text + "\n<><><>\n"} + software_versions_ch | map { it.text.strip() } | unique | mix(nextflow_version_ch) + | unique | collectFile(name: "${params.metadata_dir}/software_versions.txt", newLine: true, cache: false) | set{final_software_versions_ch} diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf index 0d0c40d0..1023e9c3 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/ancombc.nf @@ -21,7 +21,7 @@ process ANCOMBC { path(feature_table) path(taxonomy) path(metadata) - path(version) // dummy path to ensure dependency between this step and the step that generates this file + path(dummy) // dummy path to ensure dependency between this step and the step that generates this file output: path("differential_abundance/${method}/"), emit: output_dir @@ -74,10 +74,10 @@ workflow { asv_table = Channel.fromPath(params.asv_table, checkIfExists: true) taxonomy = Channel.fromPath(params.taxonomy, checkIfExists: true) // Dummy file - version = Channel.fromPath(params.taxonomy, checkIfExists: true) + dummy = Channel.fromPath(params.taxonomy, checkIfExists: true) method = Channel.of(params.diff_abund_method) - ANCOMBC(method, meta, asv_table, taxonomy, metadata, version) + ANCOMBC(method, meta, asv_table, taxonomy, metadata, dummy) emit: version = ANCOMBC.out.version diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/create_runsheet.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/create_runsheet.nf index 4cc45f65..750cf53e 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/create_runsheet.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/create_runsheet.nf @@ -6,7 +6,7 @@ nextflow.enable.dsl = 2 process GET_RUNSHEET { - beforeScript "chmod +x ${baseDir}/bin/create_runsheet.py" + beforeScript "chmod +x ${projectDir}/bin/create_runsheet.py" tag "Retrieving raw sequences and metadata for ${accession}..." 
input: tuple val(accession), val(target_region) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf index f86d0ce9..e39852bd 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/deseq.nf @@ -12,7 +12,7 @@ process DESEQ { path(feature_table) path(taxonomy_table) path(metadata) - path(version) // dummy path to ensure dependency between this step and the step that generates this file + path(dummy) // dummy path to ensure dependency between this step and the step that generates this file output: path("differential_abundance/deseq2/"), emit: output_dir @@ -58,9 +58,9 @@ workflow { asv_table = Channel.fromPath(params.asv_table, checkIfExists: true) taxonomy = Channel.fromPath(params.taxonomy, checkIfExists: true) // Dummy file - version = Channel.fromPath(params.taxonomy, checkIfExists: true) + dummy = Channel.fromPath(params.taxonomy, checkIfExists: true) - DESEQ(meta, metadata, asv_table, taxonomy, version) + DESEQ(meta, metadata, asv_table, taxonomy, dummy) emit: version = DESEQ.out.version diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/genelab.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/genelab.nf index b3db074c..3dfc5800 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/genelab.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/genelab.nf @@ -84,7 +84,7 @@ process PACKAGE_PROCESSING_INFO { process GENERATE_README { - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" tag "Generating README for ${OSD_accession}" input: tuple val(name), val(email), @@ -167,7 +167,7 @@ process VALIDATE_PROCESSING { process GENERATE_CURATION_TABLE { - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" tag "Generating a file association table for curation..." input: @@ -245,7 +245,7 @@ process GENERATE_MD5SUMS { process GENERATE_PROTOCOL { - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" tag "Generating your analysis protocol..." input: diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/quality_assessment.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/quality_assessment.nf index 9d727bbb..2f36e812 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/quality_assessment.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/quality_assessment.nf @@ -6,15 +6,15 @@ nextflow.enable.dsl = 2 ****************************************************************************************/ // A 3-column (single-end) or 4-column (paired-end) file -//params.csv_file = "${baseDir}/file.csv" +//params.csv_file = "${projectDir}/PE_file.csv" //params.prefix = "raw" -//params.multiqc_config = "${baseDir}/config/multiqc.config" +//params.multiqc_config = "${projectDir}/config/multiqc.config" // FastQC performed on reads process FASTQC { tag "Running fastqc on ${sample_id}..." 
- beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" label "fastqc" input: @@ -35,7 +35,7 @@ process FASTQC { process MULTIQC { tag "Running multiqc on the ${prefix} files..." - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" input: val(prefix) @@ -65,7 +65,7 @@ process MULTIQC { process CUTADAPT { tag "Trimming off primers for ${sample_id} using cutadapt..." - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" input: tuple val(sample_id), path(reads), val(isPaired) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/run_dada.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/run_dada.nf index ff0b9ee1..2bcd1b57 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/run_dada.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/run_dada.nf @@ -96,7 +96,7 @@ process RUN_R_TRIM { process RUN_R_NOTRIM { tag "Running dada2 on the raw reads..." - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" input: tuple path(sample_IDs_file), val(isPaired) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config index 6a069647..172a32b1 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config @@ -3,12 +3,12 @@ params { publishDir_mode = "link" // "copy", "link", "symlink" //-------- Parameters used to generate README.txt ------------------// - name = "FirstName M. LastName" // name of analyst - email = "name@nasa.gov" // email of analyst + name = "Olabiyi A. Obayomi" // name of analyst + email = "olabiyi.a.obayomi@nasa.gov" // email of analyst // Genelab pipeline document protocol id used to process the data protocol_id = "GL-DPPD-7104-B" - GLDS_accession = "" // e.g "GLDS-487" - OSD_accession = "" // e.g "OSD-487" + GLDS_accession = "GLDS-487" // e.g "GLDS-487" + OSD_accession = "OSD-487" // e.g "OSD-487" assay_suffix = "_GLAmpSeq" readme = "README${params.assay_suffix}.txt" processing_zip_file = "processing_info${params.assay_suffix}.zip" @@ -54,9 +54,9 @@ params { config = "./nextflow.config" samples = "./unique-sample-IDs.txt" assay_table = "" // e.g. "../GeneLab/a_GLDS-487_amplicon-sequencing_16s_illumina-1.txt" - isa_zip = "" // e.g. 
"../GeneLab/OSD-487_metadata_GLDS-487-ISA.zip" - runsheet = "" // e.g "../GeneLab/GLfile.csv" - software_versions = "../Metadata/software_versions.txt" + isa_zip = "workflow_output/GeneLab/OSD-487_metadata_GLDS-487-ISA.zip" + runsheet = "workflow_output/GeneLab/GLfile.csv" + software_versions = "workflow_output/Metadata/software_versions.txt" } // Make sure you always end the directory names with a forward slash "/" and that if you use @@ -67,11 +67,11 @@ params { modules = "./modules/" Output_dir = "../Post_Processing/" config = "./config/" - Raw_Sequence_Data = "../Raw_Sequence_Data/" - FastQC_Outputs = "../workflow_output/FastQC_Outputs/" - Trimmed_Sequence_Data = "../workflow_output/Trimmed_Sequence_Data/" - Filtered_Sequence_Data = "../workflow_output/Filtered_Sequence_Data/" - Final_Outputs = "../workflow_output/Final_Outputs/" + Raw_Sequence_Data = "workflow_output/Raw_Sequence_Data/" + FastQC_Outputs = "workflow_output/FastQC_Outputs/" + Trimmed_Sequence_Data = "workflow_output/Trimmed_Sequence_Data/" + Filtered_Sequence_Data = "workflow_output/Filtered_Sequence_Data/" + Final_Outputs = "workflow_output/Final_Outputs/" } conda{ @@ -106,7 +106,7 @@ profiles { singularity { singularity.enabled = true singularity.autoMounts = true - singularity.cacheDir = "singularity/" // local singularity images location + singularity.cacheDir = "/global/data/temp_scratch/oobayomi/singularity_images/" // local singularity images location params.containerEngine = "singularity" } From 4c44697a25f24861286c38280956faf7dc239f9c Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 10 Jan 2025 10:59:53 -0800 Subject: [PATCH 18/24] Fixed software versions redundancy bug --- .../bin/GL-validate-processed-amplicon-data | 2 +- .../NF_AmpIllumina-B/workflow_code/main.nf | 8 +++--- .../workflow_code/modules/utils.nf | 20 ++++++++++++++ .../workflow_code/nextflow.config | 4 +++ .../workflow_code/post_processing.config | 26 +++++++++---------- .../workflow_code/run_workflow.py | 7 +++-- 6 files changed, 47 insertions(+), 20 deletions(-) create mode 100644 Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/utils.nf diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data index 5ab85a3d..ca05c926 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-validate-processed-amplicon-data @@ -315,7 +315,7 @@ def get_read_count_stats(validation_log, prefix, multiqc_zip, multiqc_stats_file #read_count_column = 6 #df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = [read_count_column]) df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t") - df = df.iloc[:,-1] # retrieve the last column which is reads counts column + df = df.iloc[:,[-1]] # retrieve the last column which is reads counts column df.columns = ["counts"] counts = df.counts.tolist() diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf index 39fb8c6e..01347530 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf @@ -186,6 
+186,7 @@ include { PLOT_TAXONOMY } from './modules/taxonomy_plots.nf' include { ANCOMBC as ANCOMBC1 } from './modules/ancombc.nf' include { ANCOMBC as ANCOMBC2 } from './modules/ancombc.nf' include { DESEQ } from './modules/deseq.nf' +include { SOFTWARE_VERSIONS } from './modules/utils.nf' @@ -425,7 +426,7 @@ workflow { } - // Software Version Capturing - combining all captured sofware versions + // Software Version Capturing - combining all captured software versions nf_version = "Nextflow Version ".concat("${nextflow.version}") nextflow_version_ch = Channel.value(nf_version) @@ -433,9 +434,8 @@ workflow { software_versions_ch | map { it.text.strip() } | unique | mix(nextflow_version_ch) - | unique - | collectFile(name: "${params.metadata_dir}/software_versions.txt", newLine: true, cache: false) - | set{final_software_versions_ch} + | collectFile({it -> it}, newLine: true, cache: false) + | SOFTWARE_VERSIONS } diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/utils.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/utils.nf new file mode 100644 index 00000000..46402872 --- /dev/null +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/modules/utils.nf @@ -0,0 +1,20 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +process SOFTWARE_VERSIONS { + + tag "Writing out software versions..." + label "fastqc" //unix environment + + input: + path(software_versions) + + output: + path("software_versions.txt") + + script: + """ + # Delete white spaces and write out unique software versions + grep -v "^\$" ${software_versions} | sort -u > software_versions.txt + """ +} diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config index 42b29ada..af60e555 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config @@ -204,6 +204,10 @@ process { } + withName: SOFTWARE_VERSIONS { + publishDir = [path: params.metadata_dir, mode: params.publishDir_mode] + } + } diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config index 172a32b1..19bc9ea5 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/post_processing.config @@ -3,12 +3,12 @@ params { publishDir_mode = "link" // "copy", "link", "symlink" //-------- Parameters used to generate README.txt ------------------// - name = "Olabiyi A. Obayomi" // name of analyst - email = "olabiyi.a.obayomi@nasa.gov" // email of analyst + name = "First M. 
Last" // name of analyst + email = "NASA@nasa.gov" // email of analyst // Genelab pipeline document protocol id used to process the data protocol_id = "GL-DPPD-7104-B" - GLDS_accession = "GLDS-487" // e.g "GLDS-487" - OSD_accession = "OSD-487" // e.g "OSD-487" + GLDS_accession = "" // e.g "GLDS-487" + OSD_accession = "" // e.g "OSD-487" assay_suffix = "_GLAmpSeq" readme = "README${params.assay_suffix}.txt" processing_zip_file = "processing_info${params.assay_suffix}.zip" @@ -54,9 +54,9 @@ params { config = "./nextflow.config" samples = "./unique-sample-IDs.txt" assay_table = "" // e.g. "../GeneLab/a_GLDS-487_amplicon-sequencing_16s_illumina-1.txt" - isa_zip = "workflow_output/GeneLab/OSD-487_metadata_GLDS-487-ISA.zip" - runsheet = "workflow_output/GeneLab/GLfile.csv" - software_versions = "workflow_output/Metadata/software_versions.txt" + isa_zip = "" // e.g. "../GeneLab/OSD-487_metadata_GLDS-487-ISA.zip" + runsheet = "" // e.g. "../GeneLab/GLfile.csv" + software_versions = "../Metadata/software_versions.txt" } // Make sure you always end the directory names with a forward slash "/" and that if you use @@ -67,11 +67,11 @@ params { modules = "./modules/" Output_dir = "../Post_Processing/" config = "./config/" - Raw_Sequence_Data = "workflow_output/Raw_Sequence_Data/" - FastQC_Outputs = "workflow_output/FastQC_Outputs/" - Trimmed_Sequence_Data = "workflow_output/Trimmed_Sequence_Data/" - Filtered_Sequence_Data = "workflow_output/Filtered_Sequence_Data/" - Final_Outputs = "workflow_output/Final_Outputs/" + Raw_Sequence_Data = "../Raw_Sequence_Data/" + FastQC_Outputs = "../workflow_output/FastQC_Outputs/" + Trimmed_Sequence_Data = "../workflow_output/Trimmed_Sequence_Data/" + Filtered_Sequence_Data = "../workflow_output/Filtered_Sequence_Data/" + Final_Outputs = "../workflow_output/Final_Outputs/" } conda{ @@ -106,7 +106,7 @@ profiles { singularity { singularity.enabled = true singularity.autoMounts = true - singularity.cacheDir = "/global/data/temp_scratch/oobayomi/singularity_images/" // local singularity images location + singularity.cacheDir = "singularity/" // local singularity images location params.containerEngine = "singularity" } diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py index e13154c4..19479364 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/run_workflow.py @@ -51,7 +51,6 @@ def create_config(target_region, raw_R1_suffix, raw_R2_suffix, trim_primers, inp // Specify the paths to existing conda environments (/path/to/envs/genelab-utils) // leave as is if you want to create a new conda environment genelab = "{conda_genelab}" == "null" ? null : "{conda_genelab}" // /path/to/envs/genelab-utils - genelab = "{conda_genelab}" == "null" ? null : "{conda_genelab}" // /path/to/envs/genelab-utils qc = "{conda_qc}" == "null" ? null : "{conda_qc}" // /path/to/envs/qc R = "{conda_R}" == "null" ? null : "{conda_R}" // /path/to/envs/R cutadapt = "{conda_cutadapt}" == "null" ? 
null : "{conda_cutadapt}" // /path/to/envs/cutadapt @@ -221,12 +220,16 @@ def create_config(target_region, raw_R1_suffix, raw_R2_suffix, trim_primers, inp }} - withName: ANCOMBC {{ + withName: ANCOMBC {{ container = "{container_ancom}" }} + withName: SOFTWARE_VERSIONS {{ + publishDir = [path: params.metadata_dir, mode: params.publishDir_mode] + }} + }} From 39bd0f25830a117653645250bf9b8cb858496872 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 13 Jan 2025 15:18:00 -0800 Subject: [PATCH 19/24] Added diversity and differential abundance testing to README post-processing --- .../workflow_code/bin/GL-gen-processed-amplicon-readme | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-processed-amplicon-readme b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-processed-amplicon-readme index 0273a828..7bc7923b 100644 --- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-processed-amplicon-readme +++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/bin/GL-gen-processed-amplicon-readme @@ -134,6 +134,10 @@ def write_amplicon_body(output, output_file, assay_suffix, processing_zip_file, # Outputs output.write(" {:<41} {:>0}".format("- " + str(final_outputs_dir), "- primary output files (may or may not have additional prefix)\n")) + output.write(" {:<37} {:>0}".format(f"- alpha_diversity/", "- directory containing alpha diversity plots and statistics tables\n")) + output.write(" {:<37} {:>0}".format(f"- beta_diversity/", "- directory containing beta diversity plots and statistics tables\n")) + output.write(" {:<37} {:>0}".format(f"- differential_abundance/", "- directory containing the results (tables and plots) of differential abundance testing using one of or all of ANCOMBC1, ANCOMBC2 and DESeq2 \n")) + output.write(" {:<37} {:>0}".format(f"- taxonomy_plots/", "- directory containing sample-wise and group-wise taxonomy relative abundance stacked bar plots from phylum to specie level\n")) output.write(" {:<37} {:>0}".format(f"- *{assay_suffix}.fasta", "- fasta file of recovered sequences\n")) output.write(" {:<37} {:>0}".format(f"- *counts{assay_suffix}.tsv", "- count table of sequences across samples\n")) output.write(" {:<37} {:>0}".format(f"- *taxonomy{assay_suffix}.tsv", "- assigned taxonomy of recovered sequences\n")) From 409b6f8f6644636cebe20d10a246ecb59627b21e Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 15 Jan 2025 14:47:30 -0800 Subject: [PATCH 20/24] updated pipeline document --- .../GL-DPPD-7104-B.md | 551 +++++++++--------- 1 file changed, 289 insertions(+), 262 deletions(-) diff --git a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md index 97a9ce64..f68b86c7 100644 --- a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md +++ b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md @@ -12,10 +12,11 @@ Olabiyi Obayomi, Alexis Torres, and Michael D. 
Lee (GeneLab Data Processing Team) **Approved by:** -Samrawit Gebre (GeneLab Project Manager) -Danielle Lopez (GeneLab Deputy Project Manager) -Lauren Sanders (OSDR Project Scientist) -Amanda Saravia-Butler (GeneLab Data Processing Lead) +Samrawit Gebre (OSDR Project Manager) +Danielle Lopez (OSDR Deputy Project Manager) +Lauren Sanders (OSDR Project Scientist) +Amanda Saravia-Butler (GeneLab Science Lead) +Barbara Novak (GeneLab Data Processing Lead) --- @@ -1150,10 +1151,20 @@ publication_format <- theme_bw() + legend.text = element_text(size = 14,face ='bold', color = 'black'), strip.text = element_text(size = 14,face ='bold', color = 'black')) ``` +**Output Variables:** +* `custom_palette` - custom color palette for coloring plots +* `publication_format` - custom ggplot theme for plotting #### Read-in Input Tables ```R +custom_palette <-{COLOR_VECTOR} +groups_colname <- "groups" +sample_colname <- "Sample Name" +metadata_file <- file.path("amplicon_runsheet.csv") +features_file <- file.path("counts_GLAmpSeq.tsv") +taxonomy_file <- file.path("taxonomy_GLAmpSeq.tsv") + # Read-in metadata metadata <- read_csv(file = metadata_file) %>% as.data.frame() row.names(metadata) <- metadata[[sample_colname]] @@ -1186,11 +1197,8 @@ sample_info_tab <- metadata %>% select(!!groups_colname, color) %>% arrange(!!sym(groups_colname)) - values <- sample_info_tab %>% pull(color) %>% unique() - - # Feature or ASV table feature_table <- read.table(file = features_file, header = TRUE, row.names = 1, sep = "\t") @@ -1200,10 +1208,37 @@ taxonomy_table <- read.table(file = taxonomy_file, header = TRUE, row.names = 1, sep = "\t") ``` +**Parameter Definitions:** + +* `metadata_table` – path to a comma separated samples metadata file with the group/treatment to be analyzed. [{OSD-Accession-ID}_AmpSeq_v{version}_runsheet.csv](#6a-create-sample-runsheet) +* `feature_table` – path to a tab separated samples feature table i.e. ASV or OTU table [counts_GLAmpSeq.tsv](#5g-generating-and-writing-standard-outputs) +* `taxonomy_table` – path to a feature taxonomy table i.e. ASV taxonomy table [taxonomy_GLAmpSeq.tsv](#5g-generating-and-writing-standard-outputs) +* `groups_colname` – group column in metadata to be analyzed +* `sample_colname` – column in metadata containing the sample names in the feature table +* `custom_palette` - color palette defined in [Set Variables](#set-variables) + +**Output Variables:** +* `metadata_table` – samples metadata dataframe with the group/treatment to be analyzed +* `feature_table` – samples feature table, i.e. ASV counts dataframe. +* `taxonomy_table` – feature taxonomy table i.e. ASV taxonomy dataframe. +* `sample_info_tab` - dataframe of sample information i.e. a subset of samples metadata +* `values` - character vector of unique color values. 
+* `sample_names` - character vector of sample names
+* `deseq2_sample_names` - character vector of sample names for deseq2
+* `group_colors` - named character vector of colors for each group
+* `group_levels` - unique group levels within `groups_colname` to be compared
 
 
 #### Preprocessing
 
 ```R
+feature_table <- {DATAFRAME} # from step [Read-in Input Tables]
+taxonomy_table <- {DATAFRAME} # from step [Read-in Input Tables]
+target_region <- "16S" # 16S, 18S, or ITS
+remove_rare <- FALSE # TRUE OR FALSE
+prevalence_cutoff <- 0
+library_cutoff <- 0
+
+
 if(remove_rare){
 
 # Remove samples with less than library-cutoff
@@ -1270,37 +1305,54 @@ common_ids <- intersect(rownames(feature_table), rownames(taxonomy_table))
 feature_table <- feature_table[common_ids,]
 taxonomy_table <- taxonomy_table[common_ids,]
 ```
+**Parameter Definitions:**
+* `remove_rare` - should rare features and samples be filtered out prior to analysis? If true, rare features and samples will be removed
+  according to the cutoffs set below.
+* `prevalence_cutoff` - If `remove_rare` is true, a numerical fraction between 0 and 1.
+  Taxa with prevalences (the proportion of samples in which the taxon is present) less than this will be excluded from the analysis. Default is 0, i.e. do not exclude any taxa / features.
+* `library_cutoff` - If `remove_rare` is true, a numerical threshold for filtering samples based on library sizes.
+  Samples with library sizes less than this will be excluded from the analysis. Default is 0, i.e. no sample will be dropped. For example, to discard samples with library sizes less than 100, set it to 100.
+* `target_region` - amplicon target region. Options are 16S, 18S, or ITS
+* `feature_table` - ASV count table variable from [Read-in Input Tables](#read-in-input-tables)
+* `taxonomy_table` - ASV taxonomy table variable from [Read-in Input Tables](#read-in-input-tables)
+
+**Output Variables:**
+* `metadata_table` – samples metadata dataframe with the group/treatment to be analyzed
+* `feature_table` – samples feature table, i.e. ASV counts dataframe.
+* `taxonomy_table` – feature taxonomy table, i.e. ASV taxonomy dataframe.
+* `sample_info_tab` - dataframe of sample information, i.e. a subset of the samples metadata
+* `values` - unique color values.
+* `sample_names` - character vector of sample names
+* `deseq2_sample_names` - character vector of sample names for deseq2
+* `group_colors` - named character vector of colors for each group
+* `group_levels` - unique group levels within `groups_colname` to be compared
 
 
 ## 7. Alpha Diversity Analysis
 
 Alpha diversity examines the variety and abundance of taxa within individual samples. Rarefaction curves are utilized to visually represent this diversity, plotting the number of unique sequences (ASVs) identified against the total number of sequences sampled, offering a perspective on the saturation and completeness of sampling. Metrics like Chao1 richness estimates and Shannon diversity indices are employed to quantify the richness (total number of unique sequences) and diversity (combination of richness and evenness) within these samples. 
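> For intuition, both headline metrics can be computed by hand. The following is a minimal sketch on hypothetical counts (not pipeline output): it computes the Shannon index and the bias-corrected Chao1 estimate for a single sample, the same quantities the pipeline derives for all samples from the phyloseq object built in the code below.

```R
# Toy example (hypothetical counts): Shannon and Chao1 for one sample
counts  <- c(120, 55, 10, 3, 2, 1, 1)  # ASV counts observed in a single sample
p       <- counts / sum(counts)        # relative abundances
shannon <- -sum(p * log(p))            # Shannon diversity index
f1      <- sum(counts == 1)            # number of singleton ASVs
f2      <- sum(counts == 2)            # number of doubleton ASVs
chao1   <- length(counts) + f1 * (f1 - 1) / (2 * (f2 + 1))  # bias-corrected Chao1 richness
round(c(shannon = shannon, chao1 = chao1), 3)
```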
-```bash
-Rscript alpha_diversity.R \
-    --metadata-table amplicon_runsheet.csv \
-    --feature-table counts_GLAmpSeq.tsv \
-    --taxonomy-table taxonomy_GLAmpSeq.tsv \
-    --group groups \
-    --samples-column 'Sample Name' \
-    --rarefaction-depth 500
-```
-**Parameter Definitions:**
-
-* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed
-* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table
-* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table
-* `--group` – specifies the group column in metadata to be analyzed
-* `--samples-column` – specifies the column in metadata containing the sample names in the feature table
-* `--rarefaction-depth` – specifies the minimum rarefaction depth for alpha diversity estimation
-
-Type `Rscript alpha_diversity.R --help` at the commandline for a full list of available parameters
+> Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**:
+> [Load Libraries](#load-libraries)
+> [Load Functions](#load-functions)
+> [Set Variables](#set-variables)
+> [Read-in Input Tables](#read-in-input-tables)
+> [Preprocessing](#preprocessing)
 
-Content of `alpha_diversity.R`
 ```R
 # Create output directory if it doesn't already exist
 alpha_diversity_out_dir <- "alpha_diversity/"
 if(!dir.exists(alpha_diversity_out_dir)) dir.create(alpha_diversity_out_dir)
 
+metadata <- {DATAFRAME}
+sample_info_tab <- {DATAFRAME}
+feature_table <- {DATAFRAME}
+taxonomy_table <- {DATAFRAME}
+group_colors <- {NAMED_VECTOR}
+groups_colname <- "groups"
+rarefaction_depth <- 500
+legend_title <- "Groups"
+assay_suffix <- "_GLAmpSeq"
+output_prefix <- ""
 
 # Create phyloseq object
 ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE),
@@ -1534,10 +1586,8 @@ p <- map(.x = metrics2plot, .f = function(metric){
 
   # Add text to plot
   p + geom_text(data=summary_table,
-                mapping = aes(y=max+toAdd,
-                              label=label,
-                              fontface = "bold"),
-                size = text_size)
+                mapping = aes(y=max+toAdd, label=label, fontface = "bold"),
+                size = text_size)
 })
 
 richness_by_group <- wrap_plots(p, ncol = 2, guides = 'collect')
@@ -1547,12 +1597,19 @@ width <- 3.6 * length(group_levels)
 ggsave(filename = glue("{alpha_diversity_out_dir}/{output_prefix}richness_and_diversity_estimates_by_group{assay_suffix}.png"),
        plot=richness_by_group, width = width, height = 8.33, dpi = 300, units = "in")
 ```
+**Input Data and Parameter Definitions:**
 
-**Input Data:**
+* `metadata` - samples metadata dataframe with the group/treatment to be analyzed from [Preprocessing](#preprocessing)
+* `sample_info_tab` - dataframe of sample information, i.e. a subset of the samples metadata, from [Preprocessing](#preprocessing)
+* `feature_table` - ASV counts table dataframe from [Preprocessing](#preprocessing)
+* `taxonomy_table` - taxonomy dataframe from [Preprocessing](#preprocessing)
+* `rarefaction_depth` – minimum rarefaction depth for alpha diversity estimation
+* `groups_colname` - group column in metadata to be analyzed
+* `group_colors` - named character vector of colors for each group
+* `legend_title` - legend title for plotting
+* `assay_suffix` - GeneLab assay suffix. Default: "_GLAmpSeq"
+* `output_prefix` - additional prefix to be added to output files. 
Default: ""
 
 
-**Input Data:**
 
-* **amplicon_runsheet.csv** (metadata table - e.g {OSD-Accession-ID}_AmpSeq_v{version}_runsheet.csv )
-* **counts_GLAmpSeq.tsv** (count table)
-* **taxonomy_GLAmpSeq.tsv** (taxonomy table)
 
 **Output Data:**
 
@@ -1566,38 +1623,28 @@ ggsave(filename = glue("{alpha_diversity_out_dir}/{output_prefix}richness_and_di
 
 ---
 
-
 ## 8. Beta Diversity Analysis
 
 Beta diversity measures the variation in species composition between different samples or environments. A common practice in working with a new dataset is to generate some exploratory visualizations like ordinations and hierarchical clusterings. These give us a quick overview of how our samples relate to each other and can be a way to check for problems like batch effects.
 
-```bash
-Rscript beta_diversity.R \
-    --metadata-table amplicon_runsheet.csv \
-    --feature-table counts_GLAmpSeq.tsv \
-    --taxonomy-table taxonomy_GLAmpSeq.tsv \
-    --group groups \
-    --samples-column 'Sample Name' \
-    --rarefaction-depth 500
-```
-**Parameter Definitions:**
-
-* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed
-* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table
-* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table
-* `--group` – specifies the group column in metadata to be analyzed
-* `--samples-column` – specifies the column in metadata containing the sample names in the feature table
-* `--rarefaction-depth` – specifies the minimum rarefaction depth for diversity estimation. Relavant only for Bray Curtis distance calculation between samples
-
-type `Rscript beta_diversity.R --help` at the commandline for a full list of available parameters
-
-Content of `beta_diversity.R`
+> Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**:
+> [Load Libraries](#load-libraries)
+> [Load Functions](#load-functions)
+> [Set Variables](#set-variables)
+> [Read-in Input Tables](#read-in-input-tables)
+> [Preprocessing](#preprocessing)
 
 ```R
-
 beta_diversity_out_dir <- "beta_diversity/"
 if(!dir.exists(beta_diversity_out_dir)) dir.create(beta_diversity_out_dir)
 
-
+metadata <- {DATAFRAME}
+feature_table <- {DATAFRAME}
+group_colors <- {NAMED_VECTOR}
+groups_colname <- "groups"
+rarefaction_depth <- 500
+legend_title <- "Groups"
+assay_suffix <- "_GLAmpSeq"
+output_prefix <- ""
 distance_methods <- c("euclidean", "bray")
 normalization_methods <- c("vst", "rarefy")
-legend_title <- NULL
@@ -1654,12 +1701,18 @@ ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_
 })
 ```
 
-
-**Input Data:**
-
-* **amplicon_runsheet.csv** (metadata table)
-* **counts_GLAmpSeq.tsv** (count table)
-* **taxonomy_GLAmpSeq.tsv** (taxonomy table)
+**Input Data and Parameter Definitions:**
+
+* `metadata` - samples metadata dataframe with the group/treatment to be analyzed from [Preprocessing](#preprocessing)
+* `feature_table` - ASV counts table dataframe from [Preprocessing](#preprocessing)
+* `rarefaction_depth` – minimum rarefaction depth when using Bray Curtis distance
+* `groups_colname` - group column in metadata to be analyzed
+* `group_colors` - named character vector of colors for each group
+* `legend_title` - legend title for plotting
+* `assay_suffix` - GeneLab's amplicon assay suffix. Default: "_GLAmpSeq"
+* `output_prefix` - additional prefix to be added to output files. 
Default: ""
+* `distance_methods` - method used to calculate the distance between samples. "euclidean" and "bray" for Euclidean and Bray Curtis distance, respectively.
+* `normalization_methods` - method for normalizing sample counts. "vst" and "rarefy" for variance stabilizing transformation and rarefaction, respectively.
 
 **Output Data:**
 
@@ -1668,7 +1721,7 @@ ggsave(filename=glue("{beta_diversity_out_dir}/{output_prefix}{distance_method}_
 * **beta_diversity/_PCoA_without_labels_GLAmpSeq.png** (Unlabeled PCoA)
 * **beta_diversity/_PCoA_w_labels_GLAmpSeq.png** (Labeled PCoA)
 
-where distance_method is either bray or euclidean for Bray Curtis and Euclidean distance, respectively.
+Where distance_method is either bray or euclidean for Bray Curtis and Euclidean distance, respectively.
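> To make the distance and normalization pairings concrete, here is a minimal, self-contained sketch of the Bray Curtis branch on toy counts. It is hypothetical data, and it assumes the `vegan` package is available for `vegdist()`; it is not necessarily the implementation used above. Pairwise dissimilarities are computed between samples, then ordinated with classical PCoA via `cmdscale()`.

```R
library(vegan)  # assumed dependency; provides vegdist()

set.seed(123)
# Four hypothetical samples x ten ASVs (vegdist expects samples as rows)
toy_counts <- matrix(rpois(40, lambda = 10), nrow = 4,
                     dimnames = list(paste0("Sample", 1:4), paste0("ASV", 1:10)))

bray_dist <- vegdist(toy_counts, method = "bray")    # pairwise Bray Curtis dissimilarities
pcoa      <- cmdscale(bray_dist, k = 2, eig = TRUE)  # classical PCoA on the distance matrix
round(pcoa$points, 3)                                # sample coordinates on the first two axes
```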
@@ -1679,38 +1732,26 @@ where distance_method is either bray or euclidean for Bray Curtis and Euclidean Taxonomic summaries provide insights into the composition of microbial communities at various taxonomic levels. -```bash -Rscript plot_taxonomy.R \ - --metadata-table mapping/GLDS-487_amplicon_v1_runsheet.csv \ - --feature-table data/counts_GLAmpSeq.tsv \ - --taxonomy-table data/taxonomy_GLAmpSeq.tsv \ - --group groups \ - --samples-column 'Sample Name' \ - --remove-rare FALSE \ - --prevalence-cutoff 0.15 \ - --library-cutoff 100 -``` -**Parameter Definitions:** - -* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed -* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table -* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table -* `--group` – specifies the group column in metadata to be analyzed -* `--samples-column` – specifies the column in metadata containing the sample names in the feature table -* `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed -* `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1. - Taxa with prevalences(the proportion of samples in which the taxon is present) less than --prevalence-cutoff will be excluded in the analysis. Default is 0.15, i.e. exclude taxa / features that are not present in at least 15% of the samples. -* `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes. - Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100. - if you do not want to discard any sample then set to 0. -type `Rscript plot_taxonomy.R --help` at the commandline for a full list of available parameters - -Content of `plot_taxonomy.R` +> Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**: +> [Load Libraries](#load-libraries) +> [Load Functions](#load-functions) +> [Set Variables](#set-variables) +> [Read-in Input Tables](#read-in-input-tables) +> [Preprocessing](#preprocessing) ```R taxonomy_plots_out_dir <- "taxonomy_plots/" if(!dir.exists(taxonomy_plots_out_dir)) dir.create(taxonomy_plots_out_dir) +metadata <- {DATAFRAME} +feature_table <- {DATAFRAME} +taxonomy_table <- {DATAFRAME} +custom_palette <-{COLOR_VECTOR} +publication_format <- {GGPLOT_THEME} +groups_colname <- "groups" +assay_suffix <- "_GLAmpSeq" +output_prefix <- "" + # -------------------------Prepare feature tables -------------------------- # taxon_levels <- colnames(taxonomy_table) @@ -1726,6 +1767,7 @@ group_rare <- TRUE samples_order <- metadata %>% arrange(!!sym(groups_colname)) %>% rownames() dont_group <- c("phylum") # In percentage +# phylum 1%, class 3%, order 3%, family 8%, genus 8% and species 9% thresholds <- c(phylum=1,class=3, order=3, family=8, genus=8, species=9) # Convert from wide to long format # -1 drops the kingdom level since all the microbes are bacteria @@ -1796,11 +1838,12 @@ walk2(.x = relAbundace_tbs_rare_grouped, .y = taxon_levels[-1], # ------------------------ Group abundance plots ----------------------------- # # In percentage +# phylum 1% and 2% for class to species. 
thresholds <- c(phylum=1,class=2, order=2, family=2, genus=2, species=2)
 
 # Convert from wide to long format for every treatment group of interest
-group_rare <- TRUE
-maximum_number_of_taxa <- 500
+group_rare <- TRUE # should rare taxa be grouped?
+maximum_number_of_taxa <- 500 # If the number of taxa exceeds this, rare taxa will be grouped anyway.
 
 group_relAbundace_tbs <- map2(.x = taxon_levels[-1], .y = taxon_tables[-1],
                        .f = function(taxon_level=.x, taxon_table=.y){
@@ -1858,19 +1901,23 @@ walk2(.x = group_relAbundace_tbs, .y = taxon_levels[-1],
                 plot=p, width = plot_width, height = 10, dpi = 300)
 })
 ```
+**Input Data and Parameter Definitions:**
 
-**Input Data:**
-
-* **amplicon_metdata.csv** (metadata table)
-* **counts_GLAmpSeq.tsv** (count table)
-* **taxonomy_GLAmpSeq.tsv** (taxonomy table)
+* `metadata` - samples metadata dataframe with the group/treatment to be analyzed from [Preprocessing](#preprocessing)
+* `feature_table` - ASV counts table dataframe from [Preprocessing](#preprocessing)
+* `taxonomy_table` - taxonomy dataframe from [Preprocessing](#preprocessing)
+* `groups_colname` - group column in metadata to be analyzed
+* `assay_suffix` - GeneLab assay suffix. Default: "_GLAmpSeq"
+* `output_prefix` - additional prefix to be added to output files. Default: ""
+* `custom_palette` - color palette defined in [Set Variables](#set-variables)
+* `publication_format` - ggplot theme defined in [Set Variables](#set-variables)
 
 **Output Data:**
 
 * **taxonomy_plots/samples__GLAmpSeq.png** (samples barplots)
 * **taxonomy_plots/groups__GLAmpSeq.png** (groups barplots)
 
-where taxon_level is one of phylum, class, order, family, genus and species.
+Where taxon_level is each of phylum, class, order, family, genus, and species.
 
 > please note that species plot should only be taken with a grain of salt as short amplicon sequences can't be used to accurately predict species.
 
 
@@ -1883,46 +1930,32 @@ where taxon_level is one of phylum, class, order, family, genus and species.
 
 Using ANCOMBC 1, ANCOMBC 2, and DESeq2, we aim to uncover specific taxa that exhibit notable variations across different conditions, complemented by visualizations like volcano plots to illustrate these disparities and their implications on ASV expression and overall microbial community dynamics.
 
+> Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**:
+> [Load Libraries](#load-libraries)
+> [Load Functions](#load-functions)
+> [Set Variables](#set-variables)
+> [Read-in Input Tables](#read-in-input-tables)
+> [Preprocessing](#preprocessing)
 
 ### 10a. ANCOMBC 1
 
-```bash
-Rscript pairwise_ancombc1.R \
-    --metadata-table amplicon_runsheet.csv \
-    --feature-table counts_GLAmpSeq.tsv \
-    --taxonomy-table taxonomy_GLAmpSeq.tsv \
-    --group groups \
-    --samples-column 'Sample Name' \
-    --target-region 16S \
-    --remove-rare FALSE \
-    --prevalence-cutoff 0.15 \
-    --library-cutoff 100 \
-    --cpus 5
-
-```
-**Parameter Definitions:**
-
-* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed
-* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table
-* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. 
ASV taxonomy table -* `--group` – specifies the group column in the metadata to be analyzed -* `--samples-column` – specifies the column in the metadata containing the sample names in the feature table -* `--target-region` – specifies the amplicon target region. Options are either 16S, 18S or ITS -* `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed -* `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1. - Taxa with prevalences(the proportion of samples in which the taxon is present) less than --prevalence-cutoff will be excluded in the analysis. Default is 0.15, i.e. exclude taxa / features that are not present in at least 15% of the samples. -* `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes. - Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100. - if you do not want to discard any sample then set to 0. -* `--cpus ` - specifies the number of cpus to use for parallel processing. - -Type `Rscript pairwise_ancombc1.R --help` at the commandline for a full list of available parameters - -Content of `pairwise_ancombc1.R` ```R # Create output directory if it doesn't already exist diff_abund_out_dir <- "differential_abundance/ancombc1/" if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE) +metadata <- {DATAFRAME} +feature_table <- {DATAFRAME} +taxonomy_table <- {DATAFRAME} +publication_format <- {GGPLOT_THEME} +feature <- "ASV" +groups_colname <- "groups" +samples_column <- "Sample Name" +assay_suffix <- "_GLAmpSeq" +target_region <- "16S" # "16S", "18S" or "ITS" +output_prefix <- "" +prevalence_cutoff <- 0 +library_cutoff <- 0 +threads <- 5 # Create phyloseq object from feature, taxonomy and sample metadata tables @@ -1935,7 +1968,7 @@ tse <- mia::makeTreeSummarizedExperimentFromPhyloseq(ps) # Get unique group comparison as a matrix -pairwise_comp.m <- utils::combn(metadata[,group] %>% unique, 2) +pairwise_comp.m <- utils::combn(metadata[, groups_colname] %>% unique, 2) pairwise_comp_df <- pairwise_comp.m %>% as.data.frame # Name the columns in the pairwise matrix as group1vgroup2 colnames(pairwise_comp_df) <- map_chr(pairwise_comp_df, @@ -1953,12 +1986,12 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ # Subset the treeSummarizedExperiment object to contain only samples # in group1 and group2 - tse_sub <- tse[, tse[[group]] %in% c(group1, group2)] + tse_sub <- tse[, tse[[groups_colname]] %in% c(group1, group2)] # Note that by default, levels of a categorical variable in R are sorted # alphabetically. # Changing the reference group by reordering the factor levels - tse_sub[[group]] <- factor(tse_sub[[group]] , levels = c(group1, group2)) + tse_sub[[groups_colname]] <- factor(tse_sub[[groups_colname]] , levels = c(group1, group2)) # data - input data. TreeSummarizedExperiment or Phyloseq object # assay_name - name of count table in the input data object. 
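# For orientation, a hypothetical illustration (not from the dataset) of the
# pairwise design built above with utils::combn(): with three groups "A", "B"
# and "C", utils::combn(c("A", "B", "C"), 2) returns the matrix
#      [,1] [,2] [,3]
# [1,] "A"  "A"  "B"
# [2,] "B"  "C"  "C"
# i.e. one column per two-group comparison (AvB, AvC, BvC). Each column is then
# handled by the map() above, so ancombc() runs once per pair with the first
# group of the pair set as the reference level.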
@@ -1980,10 +2013,10 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ out <- ancombc(data = tse_sub, assay_name = "counts", tax_level = NULL, phyloseq = NULL, - formula = group, + formula = groups_colname, p_adj_method = "fdr", prv_cut = prevalence_cutoff, lib_cut = library_cutoff, - group = group, struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, + group = groups_colname , struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, max_iter = 100, conserve = TRUE, alpha = 0.05, global = FALSE, n_cl = threads, verbose = TRUE) @@ -2044,7 +2077,7 @@ final_results_bc1 <- map(pairwise_comp_df, function(col){ ) # Merge the dataframes to one results dataframe - res <-lfc %>% + res <- lfc %>% left_join(se) %>% left_join(W) %>% left_join(p_val) %>% @@ -2129,8 +2162,8 @@ volcano_plots <- map(comp_names, function(comparison){ # Set dimensions for saving faceted plot number_of_columns <- 2 -number_of_rows = ceiling(length(comp_names) / number_of_columns) -fig_height = 7.5 * number_of_rows +number_of_rows <- ceiling(length(comp_names) / number_of_columns) +fig_height <- 7.5 * number_of_rows p <- wrap_plots(volcano_plots, ncol = 2) # Try to combine all the volcano plots in one figure @@ -2178,7 +2211,7 @@ colnames(missing_df) <- c(feature,samplesdropped) # Create mean and standard deviation table -group_levels <- metadata[, group] %>% unique() %>% sort() +group_levels <- metadata[, groups_colname] %>% unique() %>% sort() group_means_df <- normalized_table[feature] walk(group_levels, function(group_level){ @@ -2188,7 +2221,7 @@ walk(group_levels, function(group_level){ # Samples that belong to the current group Samples <- metadata %>% - filter(!!sym(group) == group_level) %>% + filter(!!sym(groups_colname) == group_level) %>% pull(!!sym(samples_column)) # Samples that belong to the current group that are in the normalized table Samples <- intersect(colnames(normalized_table), Samples) @@ -2230,13 +2263,13 @@ merged_df <- merged_df %>% # Writing out results of differential abundance using ANCOMBC 1 output_file <- glue("{diff_abund_out_dir}/{output_prefix}ancombc1_differential_abundance{assay_suffix}.csv") -write_csv(merged_df,output_file) +write_csv(merged_df, output_file) # --------------- Make log abundance box plots ------------------ # # Merge the metadata with the feature table -df2 <- (metadata %>% select(!!samples_column, !!group)) %>% +df2 <- (metadata %>% select(!!samples_column, !!groups_colname)) %>% left_join(feature_table %>% t %>% as.data.frame %>% @@ -2245,9 +2278,9 @@ df2 <- (metadata %>% select(!!samples_column, !!group)) %>% # Making abundance box plots boxplots <- map( merged_stats_df[[feature]], function(feature){ - p <- ggplot(df2, aes(x=!!sym(group), y=log(!!sym(feature)+1), fill=!!sym(group) )) + + p <- ggplot(df2, aes(x=!!sym(groups_colname), y=log(!!sym(feature)+1), fill=!!sym(groups_colname) )) + geom_boxplot() + - labs(x=NULL, y="Log Abundance", fill=tools::toTitleCase(group), title = feature) + + labs(x=NULL, y="Log Abundance", fill=tools::toTitleCase(groups_colname), title = feature) + theme_light() + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.y = element_text(face = "bold", size=12), @@ -2273,16 +2306,29 @@ fig_height = 5 * number_of_rows try( ggsave(filename = glue("{output_prefix}{feature}_boxplots{assay_suffix}.png"), plot = p, device = "png", width = 14, height = fig_height, units = "in", dpi = 300, - limitsize = FALSE, path = diff_abund_out_dir) # There too many things to plot + limitsize = FALSE, path = diff_abund_out_dir) # There may be 
too many things to plot
 )
 ```
 
-**Input Data:**
 
-* **amplicon_metdata.csv** (metadata table)
-* **counts_GLAmpSeq.tsv** (count table)
-* **taxonomy_GLAmpSeq.tsv** (taxonomy table)
+
+**Input Data and Parameter Definitions:**
+
+* `metadata` - samples metadata dataframe with the group/treatment to be analyzed from [Preprocessing](#preprocessing)
+* `feature_table` - ASV counts table dataframe from [Preprocessing](#preprocessing)
+* `feature` - feature type i.e. ASV or OTU.
+* `taxonomy_table` - taxonomy dataframe from [Preprocessing](#preprocessing)
+* `groups_colname` - group column in metadata to be analyzed
+* `samples_column` – specifies the column in metadata containing the sample names in the feature table
+* `assay_suffix` - GeneLab assay suffix. Default: "_GLAmpSeq"
+* `output_prefix` - additional prefix to be added to output files. Default: ""
+* `threads` - specifies the number of cpus to use for parallel processing.
+* `prevalence_cutoff` - If `remove_rare` is true, a numerical fraction between 0 and 1.
+  Taxa with prevalences (the proportion of samples in which the taxon is present) less than this will be excluded from the analysis. Default is 0, i.e. do not exclude any taxa / features.
+* `library_cutoff` - If `remove_rare` is true, a numerical threshold for filtering samples based on library sizes.
+  Samples with library sizes less than this will be excluded from the analysis. Default is 0, i.e. no sample will be dropped. For example, to discard samples with library sizes less than 100, set it to 100.
+* `publication_format` - ggplot theme defined in [Set Variables](#set-variables)
+* `target_region` - amplicon target region. Options are 16S, 18S, or ITS
 
 **Output Data:**
 
@@ -2297,44 +2343,24 @@ ggsave(filename = glue("{output_prefix}{feature}_boxplots{assay_suffix}.png"), p
 
 ---
 
 ### 10b. ANCOMBC 2
 
-```bash
-Rscript pairwise_ancombc2.R \
-    --metadata-table amplicon_runsheet.csv \
-    --feature-table counts_GLAmpSeq.tsv \
-    --taxonomy-table taxonomy_GLAmpSeq.tsv \
-    --group groups \
-    --samples-column 'Sample Name' \
-    --target-region 16S \
-    --remove-rare FALSE \
-    --prevalence-cutoff 0.15 \
-    --library-cutoff 100 \
-    --cpus 5
-
-```
-**Parameter Definitions:**
-
-* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed
-* `--feature-table` – specifies the path to a tab separated samples feature table i.e. ASV or OTU table
-* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table
-* `--group` – specifies the group column in metadata to be analyzed
-* `--samples-column` – specifies the column in metadata containing the sample names in the feature table
-* `--target-region` – specifies the amplicon target region. Options are either 16S, 18S or ITS
-* `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed
-* `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1.
-  Taxa with prevalences(the proportion of samples in which the taxon is present) less than --prevalence-cutoff will be excluded in the analysis. Default is 0.15, i.e. exclude taxa / features that are not present in at least 15% of the samples.
-* `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes.
-  Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100. 
- -* `--cpus ` - Specifies the number of cpus to use for parallel processing. - -Type `Rscript pairwise_ancombc2.R --help` at the commandline for a full list of available parameters - -Content of `pairwise_ancombc2.R` ```R diff_abund_out_dir <- "differential_abundance/ancombc2/" if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE) +metadata <- {DATAFRAME} +feature_table <- {DATAFRAME} +taxonomy_table <- {DATAFRAME} +publication_format <- {GGPLOT_THEME} +feature <- "ASV" +target_region <- "16S" # "16S" , "18S" or "ITS" +groups_colname <- "groups" +samples_column <- "Sample Name" +assay_suffix <- "_GLAmpSeq" +output_prefix <- "" +prevalence_cutoff <- 0 +library_cutoff <- 0 +threads <- 5 + # Create phyloseq object ps <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE), @@ -2346,11 +2372,11 @@ tse <- mia::makeTreeSummarizedExperimentFromPhyloseq(ps) # Getting the reference group and making sure that it is the reference # used in the analysis -group_levels <- metadata[, group] %>% unique() %>% sort() +group_levels <- metadata[, groups_colname] %>% unique() %>% sort() ref_group <- group_levels[1] -tse[[group]] <- factor(tse[[group]] , levels = group_levels) +tse[[groups_colname]] <- factor(tse[[groups_colname]] , levels = group_levels) -message("Running ANCOMBC2....") +# Running ANCOMBC2.... # Run acombc2 # data - input data. TreeSummarizedExperiment or Phyloseq object @@ -2372,11 +2398,11 @@ message("Running ANCOMBC2....") # it is recommended to set to TRUE if your sample size is small and the number of expected differentially abundant taxa is large. output <- ancombc2(data = tse, assay_name = "counts", tax_level = NULL, - fix_formula = group, rand_formula = NULL, + fix_formula = groups_colname, rand_formula = NULL, p_adj_method = "fdr", pseudo_sens = TRUE, prv_cut = prevalence_cutoff, lib_cut = library_cutoff, s0_perc = 0.05, - group = group, struc_zero = TRUE, neg_lb = FALSE, + group = groups_colname, struc_zero = TRUE, neg_lb = FALSE, alpha = 0.05, n_cl = threads, verbose = TRUE, global = TRUE, pairwise = TRUE, dunnet = TRUE, trend = FALSE, @@ -2393,9 +2419,9 @@ output <- ancombc2(data = tse, assay_name = "counts", tax_level = NULL, new_colnames <- map_chr(output$res_pair %>% colnames, function(colname) { # Columns comparing a group to the reference group - if(str_count(colname,group) == 1){ + if(str_count(colname, groups_colname) == 1){ str_replace_all(string=colname, - pattern=glue("(.+)_{group}(.+)"), + pattern=glue("(.+)_{groups_colname}(.+)"), replacement=glue("\\1_(\\2)v({ref_group})")) %>% str_replace(pattern = "^lfc_", replacement = "logFC_") %>% str_replace(pattern = "^se_", replacement = "lfcSE_") %>% @@ -2404,10 +2430,10 @@ new_colnames <- map_chr(output$res_pair %>% colnames, str_replace(pattern = "^q_", replacement = "qvalue_") # Columns with normal two groups comparison - } else if(str_count(colname,group) == 2){ + } else if(str_count(colname, groups_colname) == 2){ str_replace_all(string=colname, - pattern=glue("(.+)_{group}(.+)_{group}(.+)"), + pattern=glue("(.+)_{groups_colname}(.+)_{groups_colname}(.+)"), replacement=glue("\\1_(\\2)v(\\3)")) %>% str_replace(pattern = "^lfc_", replacement = "logFC_") %>% str_replace(pattern = "^se_", replacement = "lfcSE_") %>% @@ -2449,8 +2475,6 @@ walk(uniq_comps, function(comp){ res_df <<- res_df %>% left_join(temp_df) }) - - # --------- Add NCBI id to feature ---------------# # Get the best taxonomy assigned to each ASV @@ -2460,7 +2484,7 @@ tax_names <- 
map_chr(str_replace_all(taxonomy_table$species, ";_","") %>% df <- data.frame(ASV=rownames(taxonomy_table), best_taxonomy=tax_names) -message("Querying NCBI...") +# Querying NCBI... # Pull NCBI IDS for unique taxonomy names df2 <- data.frame(best_taxonomy = df$best_taxonomy %>% unique()) %>% @@ -2486,7 +2510,7 @@ missing_df <- data.frame(ASV=normalized_table[[feature]], ncol = length(samplesdropped) ) ) -colnames(missing_df) <- c(feature,samplesdropped) +colnames(missing_df) <- c(feature, samplesdropped) group_means_df <- normalized_table[feature] @@ -2498,7 +2522,7 @@ walk(group_levels, function(group_level){ # Samples that belong to the current group Samples <- metadata %>% - filter(!!sym(group) == group_level) %>% + filter(!!sym(groups_colname) == group_level) %>% pull(!!sym(samples_column)) # Samples that belong to the current group that are in the normalized table Samples <- intersect(colnames(normalized_table), Samples) @@ -2537,13 +2561,13 @@ merged_df <- merged_df %>% left_join(group_means_df, by = feature) %>% mutate(across(where(is.numeric), ~round(.x, digits=3))) -message("Writing out results of differential abundance using ANCOMBC2...") +# Writing out results of differential abundance using ANCOMBC2... output_file <- glue("{diff_abund_out_dir}/{output_prefix}ancombc2_differential_abundance{assay_suffix}.csv") -write_csv(merged_df,output_file) +write_csv(merged_df, output_file) # ---------------------- Visualization --------------------------------------- # -message("Making volcano plots...") +# Making volcano plots... # ------------ Make volcano ---------------- # volcano_plots <- map(uniq_comps, function(comparison){ @@ -2584,8 +2608,8 @@ p <- wrap_plots(volcano_plots, ncol = 2) number_of_columns <- 2 -number_of_rows = ceiling(length(uniq_comps) / number_of_columns) -fig_height = 7.5 * number_of_rows +number_of_rows <- ceiling(length(uniq_comps) / number_of_columns) +fig_height <- 7.5 * number_of_rows # Try to combine all the volcano plots in one figure try( @@ -2596,18 +2620,18 @@ ggsave(filename = glue("{output_prefix}{feature}_volcano{assay_suffix}.png"), pl # ------------- Box plots ---------------- # -df2 <- (metadata %>% select(!!samples_column, !!group)) %>% +df2 <- (metadata %>% select(!!samples_column, !!groups_colname)) %>% left_join(feature_table %>% t %>% as.data.frame %>% rownames_to_column(samples_column)) -message("Making abundance box plots...") +# Making abundance box plots... 
boxplots <- map(res_df[[feature]], function(feature){
 
-  p <- ggplot(df2, aes(x=!!sym(group), y=log(!!sym(feature)+1), fill=!!sym(group) )) +
+  p <- ggplot(df2, aes(x=!!sym(groups_colname), y=log(!!sym(feature)+1), fill=!!sym(groups_colname) )) +
     geom_boxplot() +
-    labs(x=NULL, y="Log Abundance", fill=tools::toTitleCase(group), title = feature) +
+    labs(x=NULL, y="Log Abundance", fill=tools::toTitleCase(groups_colname), title = feature) +
     theme_light() +
     theme(axis.text.x = element_blank(),
           axis.ticks.x = element_blank(),
           axis.title.y = element_text(face = "bold", size=12),
@@ -2625,8 +2649,8 @@ p <- wrap_plots(boxplots, ncol = 2, guides = 'collect')
 
 number_of_features <- res_df[[feature]] %>% length
 number_of_columns <- 2
-number_of_rows = ceiling(number_of_features / number_of_columns)
-fig_height = 5 * number_of_rows
+number_of_rows <- ceiling(number_of_features / number_of_columns)
+fig_height <- 5 * number_of_rows
 
 # Try to plot all features / ASVs in one figure
 try(
@@ -2636,11 +2660,24 @@ ggsave(filename = glue("{output_prefix}{feature}_boxplots{assay_suffix}.png"), p
 )
 ```
 
-**Input Data:**
+**Input Data and Parameter Definitions:**
+
+* `metadata` - samples metadata dataframe with the group/treatment to be analyzed from [Preprocessing](#preprocessing)
+* `feature_table` - ASV counts table dataframe from [Preprocessing](#preprocessing)
+* `feature` - feature type, i.e. ASV or OTU.
+* `taxonomy_table` - taxonomy dataframe from [Preprocessing](#preprocessing)
+* `samples_column` - column in metadata containing the sample names in the feature table
+* `groups_colname` - group column in metadata to be analyzed
+* `assay_suffix` - GeneLab assay suffix. Default: "_GLAmpSeq"
+* `output_prefix` - additional prefix to be added to output files. Default: ""
+* `threads` - specifies the number of CPUs to use for parallel processing.
+* `prevalence_cutoff` - If `remove_rare` is true, a numerical fraction between 0 and 1.
+   Taxa with prevalences (the proportion of samples in which the taxon is present) less than this will be excluded from the analysis. Default is 0, i.e. do not exclude any taxa / features.
+* `library_cutoff` - If `remove_rare` is true, a numerical threshold for filtering samples based on library sizes.
+   Samples with library sizes less than lib_cut will be excluded from the analysis. Default is 0, i.e. no sample will be dropped. If you want to discard samples with read counts less than or equal to 100 then set to 100.
+* `publication_format` - ggplot theme defined in [Set Variables](#set-variables)
+* `target_region` - amplicon target region. Options are either 16S, 18S or ITS
 
-* **amplicon_metdata.csv** (metadata table)
-* **counts_GLAmpSeq.tsv** (count table)
-* **taxonomy_GLAmpSeq.tsv** (taxonomy table)
 
 **Output Data:**
 
@@ -2657,42 +2694,23 @@ ggsave(filename = glue("{output_prefix}{feature}_boxplots{assay_suffix}.png"), p
 
 ### 10c. DESeq2
 
-```bash
-Rscript run_deseq2.R \
-  --metadata-table amplicon_runsheet.csv \
-  --feature-table counts_GLAmpSeq.tsv \
-  --taxonomy-table taxonomy_GLAmpSeq.tsv \
-  --group groups \
-  --samples-column 'Sample Name' \
-  --target-region 16S \
-  --remove-rare FALSE \
-  --prevalence-cutoff 0.15 \
-  --library-cutoff 100
-
-```
-**Parameter Definitions:**
-
-* `--metadata-table` – specifies the path to a comma separated samples metadata file with the group/treatment to be analyzed
-* `--feature-table` – specifies the path to a tab separated samples feature table i.e. 
ASV or OTU table
-* `--taxonomy-table` – specifies the path to a feature taxonomy table i.e. ASV taxonomy table
-* `--group` – specifies the group column in the metadata to be analyzed
-* `--samples-column` – specifies the column in the metadata containing the sample names in the feature table
-* `--target-region` – specifies the amplicon target region. Options are either 16S, 18S or ITS
-* `--remove-rare` - should rare features be filtered out prior to analysis? If set, rare feature will be removed
-* `--prevalence-cutoff` - If --remove-rare, a numerical fraction between 0 and 1.
-   Taxa with prevalences(the proportion of samples in which the taxon is present) less than --prevalence-cutoff will be excluded in the analysis. Default is 0.15, i.e. exclude taxa / features that are not present in at least 15% of the samples.
-* `--library-cutoff` - If --remove-rare, a numerical threshold for filtering samples based on library sizes.
-   Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 100.
-   if you do not want to discard any sample then set to 0.
-
-type `Rscript run_deseq2.R --help` at the commandline for a full list of available parameters
-
-Content of `run_deseq2.R`
-
 ```R
 # Create output directory if it doesn't already exist
 diff_abund_out_dir <- "differential_abundance/deseq2/"
 if(!dir.exists(diff_abund_out_dir)) dir.create(diff_abund_out_dir, recursive = TRUE)
 
+metadata <- {DATAFRAME}
+feature_table <- {DATAFRAME}
+taxonomy_table <- {DATAFRAME}
+publication_format <- {GGPLOT_THEME}
+feature <- "ASV"
+samples_column <- "Sample Name"
+groups_colname <- "groups"
+assay_suffix <- "_GLAmpSeq"
+target_region <- "16S" # "16S", "18S" or "ITS"
+output_prefix <- ""
+prevalence_cutoff <- 0
+library_cutoff <- 0
+
 # Create phyloseq object from the feature, taxonomy and metadata tables
 ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE),
@@ -2700,7 +2718,7 @@ ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE),
                        sample_data(metadata))
 # Convert the phyloseq object to a deseq object
 deseq_obj <- phyloseq_to_deseq2(physeq = ASV_physeq,
-                                design = reformulate(group))
+                                design = reformulate(groups_colname))
 
 # Add pseudocount if any 0 count samples are present
 if (sum(colSums(counts(deseq_obj)) == 0) > 0) {
@@ -2734,7 +2752,7 @@ deseq_modeled <- tryCatch({
 
 
 # Get unique group comparisons as a matrix
-pairwise_comp.m <- utils::combn(metadata[,group] %>% unique, 2)
+pairwise_comp.m <- utils::combn(metadata[,groups_colname] %>% unique, 2)
 pairwise_comp_df <- pairwise_comp.m %>% as.data.frame
 # Set the colnames as group1vgroup2
 colnames(pairwise_comp_df) <- map_chr(pairwise_comp_df,
@@ -2752,7 +2770,7 @@ walk(pairwise_comp_df, function(col){
   group2 <- col[2]
 
 # Retrieve the statistics table for the current pair and rename the columns
-df <- results(deseq_modeled, contrast = c(group, group1, group2)) %>% # Get stats
+df <- results(deseq_modeled, contrast = c(groups_colname, group1, group2)) %>% # Get stats
   data.frame() %>%
   rownames_to_column(feature) %>%
   set_names(c(feature ,
@@ -2807,7 +2825,7 @@ colnames(missing_df) <- c(feature,samplesdropped)
 
 # Calculate mean and standard deviation of all ASVs for each group in
 # a dataframe called group_means_df
-group_levels <- metadata[, group] %>% unique() %>% sort()
+group_levels <- metadata[, groups_colname] %>% unique() %>% sort()
 group_means_df <- normalized_table[feature]
 
 walk(group_levels, function(group_level){
@@ -2817,7 +2835,7 @@ walk(group_levels, function(group_level){
 
   # Get a vector of 
samples that belong to the current group
   Samples <- metadata %>%
-    filter(!!sym(group) == group_level) %>%
+    filter(!!sym(groups_colname) == group_level) %>%
     pull(!!sym(samples_column))
   # Retain only samples that belong to the current group that are in the normalized table
   Samples <- intersect(colnames(normalized_table), Samples)
@@ -2862,9 +2880,7 @@ merged_df <- merged_df %>%
 
 # Defining the output file
 output_file <- glue("{diff_abund_out_dir}/{output_prefix}deseq2_differential_abundance{assay_suffix}.csv")
 # Writing out results of differential abundance using DESeq2
-write_csv(merged_df,output_file)
-
-
+write_csv(merged_df, output_file)
 
 # ------------------------- Make volcano plots ------------------------ #
 # Loop over group pairs and make a volcano comparing the pair
@@ -2879,7 +2895,7 @@ walk(pairwise_comp_df, function(col){
 
   p_val <- 0.1 # adjusted p-value cutoff
 
   # Retrieve data for plotting
-  deseq_res <- results(deseq_modeled, contrast = c(group, group1, group2))
+  deseq_res <- results(deseq_modeled, contrast = c(groups_colname, group1, group2))
   volcano_data <- as.data.frame(deseq_res)
   volcano_data <- volcano_data[!is.na(volcano_data$padj), ]
   volcano_data$significant <- volcano_data$padj <= p_val
@@ -2929,11 +2945,22 @@ walk(pairwise_comp_df, function(col){
 })
 ```
 
-**Input Data:**
-
-* **amplicon_runsheet.csv** (metadata table)
-* **counts_GLAmpSeq.tsv** (count table)
-* **taxonomy_GLAmpSeq.tsv** (taxonomy table)
+**Input Data and Parameter Definitions:**
+
+* `metadata` - samples metadata dataframe with the group/treatment to be analyzed from [Preprocessing](#preprocessing)
+* `feature_table` - ASV counts table dataframe from [Preprocessing](#preprocessing)
+* `feature` - feature type, i.e. ASV or OTU.
+* `taxonomy_table` - taxonomy dataframe from [Preprocessing](#preprocessing)
+* `samples_column` - column in metadata containing the sample names in the feature table
+* `groups_colname` - group column in metadata to be analyzed
+* `assay_suffix` - GeneLab assay suffix. Default: "_GLAmpSeq"
+* `output_prefix` - additional prefix to be added to output files. Default: ""
+* `prevalence_cutoff` - If `remove_rare` is true, a numerical fraction between 0 and 1.
+   Taxa with prevalences (the proportion of samples in which the taxon is present) less than this will be excluded from the analysis. Default is 0, i.e. do not exclude any taxa / features.
+* `library_cutoff` - If `remove_rare` is true, a numerical threshold for filtering samples based on library sizes.
+   Samples with library sizes less than lib_cut will be excluded from the analysis. Default is 0, i.e. no sample will be dropped. If you want to discard samples with read counts less than or equal to 100 then set to 100.
+* `publication_format` - ggplot theme defined in [Set Variables](#set-variables)
+* `target_region` - amplicon target region. 
Options are either 16S, 18S or ITS **Output Data:** From 5b746912f6b10f6adaf4dee2e3d81dfa4d937c98 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 15 Jan 2025 14:54:01 -0800 Subject: [PATCH 21/24] updated pipeline document --- .../GL-DPPD-7104-B.md | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md index f68b86c7..a5002168 100644 --- a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md +++ b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md @@ -1333,10 +1333,10 @@ taxonomy_table <- taxonomy_table[common_ids,] Alpha diversity examines the variety and abundance of taxa within individual samples. Rarefaction curves are utilized to visually represent this diversity, plotting the number of unique sequences (ASVs) identified against the total number of sequences sampled, offering a perspective on the saturation and completeness of sampling. Metrics like Chao1 richness estimates and Shannon diversity indices are employed to quantify the richness (total number of unique sequences) and diversity (combination of richness and evenness) within these samples. > Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**: -> [Load Libraries](#load-libraries) -> [Load Functions](#load-functions) -> [Set Variables](#set-variables) -> [Read-in Input Tables](#read-in-input-tables) +> [Load Libraries](#load-libraries), +> [Load Functions](#load-functions), +> [Set Variables](#set-variables), +> [Read-in Input Tables](#read-in-input-tables), and > [Preprocessing](#preprocessing) ```R @@ -1628,10 +1628,10 @@ ggsave(filename = glue("{alpha_diversity_out_dir}/{output_prefix}richness_and_di Beta diversity measures the variation in species composition between different samples or environments. A common practice in working with a new dataset is to generate some exploratory visualizations like ordinations and hierarchical clusterings. These give us a quick overview of how our samples relate to each other and can be a way to check for problems like batch effects. > Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**: -> [Load Libraries](#load-libraries) -> [Load Functions](#load-functions) -> [Set Variables](#set-variables) -> [Read-in Input Tables](#read-in-input-tables) +> [Load Libraries](#load-libraries), +> [Load Functions](#load-functions), +> [Set Variables](#set-variables), +> [Read-in Input Tables](#read-in-input-tables), and > [Preprocessing](#preprocessing) ```R @@ -1734,10 +1734,10 @@ Taxonomic summaries provide insights into the composition of microbial communiti > Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**: -> [Load Libraries](#load-libraries) -> [Load Functions](#load-functions) -> [Set Variables](#set-variables) -> [Read-in Input Tables](#read-in-input-tables) +> [Load Libraries](#load-libraries), +> [Load Functions](#load-functions), +> [Set Variables](#set-variables), +> [Read-in Input Tables](#read-in-input-tables), and > [Preprocessing](#preprocessing) ```R @@ -1931,10 +1931,10 @@ Where taxon_level is all of phylum, class, order, family, genus and species. 
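The taxonomy summaries referenced above are generated once per rank. As an illustration only, a minimal, hypothetical sketch of such a per-rank loop using the `make_feature_table` helper documented in [Load Functions](#load-functions) might look like this (it assumes the `feature_table` and `taxonomy_table` objects from [Read-in Input Tables](#read-in-input-tables); it is not the pipeline's exact code):

```R
# Hypothetical sketch: build one collapsed count table per taxonomic rank
# with the make_feature_table() helper defined in Load Functions.
taxon_levels <- c("Phylum", "Class", "Order", "Family", "Genus", "Species")
names(taxon_levels) <- taxon_levels

taxon_tables <- purrr::map(taxon_levels, function(taxon_level) {
  make_feature_table(count_matrix = as.matrix(feature_table),
                     taxonomy = taxonomy_table,
                     taxon_level = taxon_level)
})
```

Each element of `taxon_tables` can then be fed to the plotting code for that rank.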
Using ANCOMBC 1, ANCOMBC 2, and DESeq2, we aim to uncover specific taxa that exhibit notable variations across different conditions, complemented by visualizations like volcano plots to illustrate these disparities and their implications on ASV expression and overall microbial community dynamics.
 
 > Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**:
-> [Load Libraries](#load-libraries)
-> [Load Functions](#load-functions)
-> [Set Variables](#set-variables)
-> [Read-in Input Tables](#read-in-input-tables)
+> [Load Libraries](#load-libraries),
+> [Load Functions](#load-functions),
+> [Set Variables](#set-variables),
+> [Read-in Input Tables](#read-in-input-tables), and
 > [Preprocessing](#preprocessing)
 
 ### 10a. ANCOMBC 1

From 3657ba5a7282e1e8320d4a19e44f217eadd87b49 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Fri, 17 Jan 2025 16:43:24 -0800
Subject: [PATCH 22/24] Documented functions

---
 .../GL-DPPD-7104-B.md | 212 +++++++++++-------
 1 file changed, 133 insertions(+), 79 deletions(-)

diff --git a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
index a5002168..c948702e 100644
--- a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
+++ b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
@@ -584,7 +584,10 @@ write.table(tax_and_count_tab, "taxonomy-and-counts_GLAmpSeq.tsv", sep="\t", quo
 
 ### 6a. Create Sample Runsheet
 
-> Note: Rather than running the command below to create the runsheet needed for processing, the runsheet may also be created manually by following the examples for [Paired-end](../Workflow_Documentation/NF_AmpIllumina-B/workflow_code/PE_file.csv) and [Single-end](../Workflow_Documentation/NF_AmpIllumina-B/workflow_code/SE_file.csv) samples.
+> Note: Rather than running the command below to create the runsheet needed for processing, the runsheet may also be created manually by following the examples for [Paired-end](../Workflow_Documentation/NF_AmpIllumina-B/workflow_code/PE_file.csv) and [Single-end](../Workflow_Documentation/NF_AmpIllumina-B/workflow_code/SE_file.csv) samples. When creating this table manually, the most important columns for the analyses below are:
+
+* `sample_id` - column with unique sample names.
+* `groups` - column with the group/treatment that each sample belongs to. This column is used for comparisons.
 
 ```bash
 ### Download the *ISA.zip file from the OSDR ###
@@ -657,8 +660,11 @@ library(tidyverse)
 ```R
 # Function to calculate text size for plotting
 calculate_text_size <- function(num_samples, start_samples = 25, min_size = 3) {
-  max_size = 11 # Maximum size for up to start_samples
-  slope = -0.15
+  # num_samples [INT] - the number of samples to plot
+  # start_samples [INT] - number of samples at which the text size starts to shrink
+  # min_size [INT] - minimum text size for plotting
+  max_size <- 11 # Maximum size for up to start_samples
+  slope <- -0.15
 
   if (num_samples <= start_samples) {
     return(max_size)
@@ -675,23 +681,27 @@ calculate_text_size <- function(num_samples, start_samples = 25, min_size = 3) {
 # sample count transformation depending on the supplied transformation method
 # i.e. 
either 'rarefy' or 'vst'
 transform_phyloseq <- function( feature_table, metadata, method, rarefaction_depth=500){
-  # feature_table [DATAFRAME] ~ Feature / ASV count table with samples as columns and features as rows
-  # metadata [DATAFRAME] ~ Samples metadata with samples as row names
-  # method [STRING] ~ Distance transformation method to use.
+  # feature_table [DATAFRAME] - Feature / ASV count table with samples as columns and features as rows
+  # metadata [DATAFRAME] - Samples metadata with samples as row names
+  # method [STRING] - Distance transformation method to use.
   #                   Either 'rarefy' or 'vst' for rarefaction and variance
   #                   stabilizing transformation, respectively.
-  # rarefaction_depth [INT] ~ Sample rarefaction to even depth when method is 'bray'
+  # rarefaction_depth [INT] - Sample rarefaction to even depth when method is 'rarefy'
+
+  # Rarefaction
   if(method == 'rarefy'){
 
     # Create phyloseq object
     ASV_physeq <- phyloseq(otu_table(feature_table, taxa_are_rows = TRUE),
                            sample_data(metadata))
-
+    # Get the count for every sample sorted in ascending order
     seq_per_sample <- colSums(feature_table) %>% sort()
-    # Minimum value
+    # Minimum sequences/count value
     depth <- min(seq_per_sample)
 
+    # Loop through the sequences per sample and return the count
+    # nearest to the minimum required rarefaction depth
    for (count in seq_per_sample) {
      # Get the count equal to rarefaction_depth or nearest to it
      if(count >= rarefaction_depth) {
@@ -707,7 +717,8 @@ transform_phyloseq <- function( feature_table, metadata, method, rarefaction_dep
                   rngseed = 1,
                   replace = FALSE,
                   verbose = FALSE)
-
+
+  # Variance Stabilizing Transformation
   }else if(method == "vst"){
 
     # Using deseq
@@ -740,25 +751,31 @@ transform_phyloseq <- function( feature_table, metadata, method, rarefaction_dep
   return(ps)
 }
 
-# ----------- Hierarchical Clustering and dendogram plotting
+# ----------- A function for hierarchical clustering and dendrogram plotting
 make_dendogram <- function(dist_obj, metadata, groups_colname,
                            group_colors, legend_title){
-
-
+  # dist_obj [DIST] - a distance object holding the calculated distances (Euclidean, Bray-Curtis, etc.) between samples
+  # metadata [DATAFRAME] - sample metadata with samples as rownames and sample info as columns
+  # groups_colname [STRING] - name of column in metadata to group samples by.
+  # group_colors [VECTOR] - named character vector of colors for each group in groups_colname
+  # legend_title [STRING] - legend title to use for plotting
+
+  # Hierarchical Clustering
   sample_clust <- hclust(d = dist_obj, method = "ward.D2")
 
-  # Extract clustering data
+  # Extract clustering data for plotting
   hcdata <- dendro_data(sample_clust, type = "rectangle")
-  segment_data <- segment(hcdata)
+  segment_data <- segment(hcdata) # specifications for tree structure
   label_data <- label(hcdata) %>%
     left_join(metadata %>%
-                rownames_to_column("label"))
+                rownames_to_column("label")) # Labels are sample names
 
  # Plot dendrogram
  dendogram <- ggplot() +
+    # Plot tree
    geom_segment(data = segment_data,
-                 aes(x = x, y = y, xend = xend, yend = yend)
+                 aes(x = x, y = y, xend = xend, yend = yend)
                 ) +
+    # Add sample text labels to tree
    geom_text(data = label_data ,
              aes(x = x, y = y, label = label,
                  color = !!sym(groups_colname) , hjust = 0),
@@ -779,23 +796,34 @@ make_dendogram <- function(dist_obj, metadata, groups_colname,
 }
 
-# Run variance test and adonis test
+# A function to run variance and adonis tests
 run_stats <- function(dist_obj, metadata, groups_colname){
+
+  # dist_obj [DIST] - a distance object holding the calculated distances (Euclidean, Bray-Curtis, etc.) 
between samples
+  # metadata [DATAFRAME] - sample metadata with samples as rownames and sample info as columns
+  # groups_colname [STRING] - name of column in metadata to group samples by.
+
+  # Retrieve sample names from the dist object
   samples <- attr(dist_obj, "Label")
+  # Subset metadata to contain only samples in the dist_obj
   metadata <- metadata[samples,]
+
+  # Run variance test and present the result in a nicely formatted table / dataframe
   variance_test <- betadisper(d = dist_obj,
                               group = metadata[[groups_colname]]) %>%
-    anova() %>%
-    broom::tidy() %>%
-    mutate(across(where(is.numeric), ~round(.x, digits = 2)))
+    anova() %>% # Run an ANOVA on the betadisper results
+    broom::tidy() %>% # make the table 'tidy'
+    mutate(across(where(is.numeric), ~round(.x, digits = 2))) # round numeric columns
 
+  # Run Adonis test
   adonis_res <- adonis2(formula = dist_obj ~ metadata[[groups_colname]])
+
   adonis_test <- adonis_res %>%
-    broom::tidy() %>%
-    mutate(across(where(is.numeric), ~round(.x, digits = 2)))
+    broom::tidy() %>% # Make tidy table
+    mutate(across(where(is.numeric), ~round(.x, digits = 2))) # round numeric columns
 
+  # Return a named list with the variance and adonis test results
   return(list(variance = variance_test, adonis = adonis_test))
 }
 
@@ -803,6 +831,16 @@ run_stats <- function(dist_obj, metadata, groups_colname){
 plot_pcoa <- function(ps, stats_res, distance_method,
                       groups_colname, group_colors, legend_title,
                       addtext=FALSE) {
+
+  # ps [PHYLOSEQ] - Phyloseq object constructed from the feature, taxonomy and metadata tables
+  # stats_res [LIST] - named list generated after running the function `run_stats`.
+  #                    The list should contain the variance and adonis test dataframes
+  # distance_method [STRING] - Method used to calculate the distance between samples.
+  #                            "euclidean" or "bray" for Euclidean and Bray-Curtis distance, respectively.
+  # groups_colname [STRING] - name of column in metadata to group samples by.
+  # group_colors [VECTOR] - named character vector of colors for each group in `groups_colname`
+  # legend_title [STRING] - legend title to use for plotting
+  # addtext [BOOLEAN] - should text labels be added to your pcoa plot? Default: FALSE.
 
   # Generating a PCoA with phyloseq
   pcoa <- ordinate(physeq = ps, method = "PCoA", distance = distance_method)
@@ -817,29 +855,30 @@ plot_pcoa <- function(ps, stats_res, distance_method,
   label_PC1 <- sprintf("PC1 [%.1f%%]", percent_variance[1])
   label_PC2 <- sprintf("PC2 [%.1f%%]", percent_variance[2])
 
+  # Retrieving pcoa vectors
   vectors_df <- pcoa$vectors %>%
     as.data.frame() %>%
     rownames_to_column("samples")
 
-
+  # Creating a dataframe for plotting
   plot_df <- sample_data(ps) %>%
     as.matrix() %>%
     as.data.frame() %>%
     rownames_to_column("samples") %>%
     select(samples, !!groups_colname) %>%
     right_join(vectors_df, join_by("samples"))
 
-
+  # Plot pcoa
   p <- ggplot(plot_df, aes(x=Axis.1, y=Axis.2,
                            color=!!sym(groups_colname), label=samples)) +
     geom_point(size=1)
 
+  # Add text
   if(addtext){
     p <- p + geom_text(show.legend = FALSE,
                        hjust = 0.3, vjust = -0.4, size = 4)
   }
 
+  # Add annotations to pcoa plot
   p <- p + labs(x = label_PC1, y = label_PC2, color = legend_title) +
     coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) +
     scale_color_manual(values = group_colors) +
@@ -862,10 +901,10 @@ plot_pcoa <- function(ps, stats_res, distance_method,
 
 # on the supplied cut off. 
remove_rare_features <- function(feature_table, cut_off_percent=3/4){
 
-  # feature_table [MATRIX] feature table matrix with samples as columns and
-  #                        features as rows
-  # cut_off_percent [NUMERIC] cut-off fraction or decimal between 0.001 to 1
-  #                           of the total number of samples to determine the
+  # feature_table [MATRIX] - feature table matrix with samples as columns and
+  #                          features as rows
+  # cut_off_percent [NUMERIC] - cut-off fraction or decimal between 0.001 to 1
+  #                             of the total number of samples to determine the
   #                           most abundant features. By default it removes
   #                           features that are not present in 3/4 of the total
   #                           number of samples
@@ -882,25 +921,28 @@ remove_rare_features <- function(feature_table, cut_off_percent=3/4){
   return(abun_features.m)
 }
 
+# Function to process a taxonomy assignment table
 process_taxonomy <- function(taxonomy, prefix='\\w__') {
-  # Function to process a taxonopmy assignment table
-  #1. ~ taxonomy is a string specifying the taxonomic assignment file name
-  #2 prefix ~ is a regular expression specifying the characters to remove
-  # from the taxon names '\\w__' for greengenes and 'D_\\d__' for SILVA
-
+
+  # taxonomy [DATAFRAME] - taxonomy dataframe to process
+  # prefix [STRING] - regular expression specifying the characters to remove
+  #                   from taxon names. Use '\\w__' for greengenes and 'D_\\d__' for SILVA
+
+  # Ensure that all columns are of character data type
   taxonomy <- apply(X = taxonomy, MARGIN = 2, FUN = as.character)
 
+  # Loop over every column (rank i.e. domain to species) and make the necessary edits
   for (rank in colnames(taxonomy)) {
-    #delete the taxonomy prefix
+    # Delete the taxonomy prefix
     taxonomy[,rank] <- gsub(pattern = prefix, x = taxonomy[, rank],
                             replacement = '')
     indices <- which(is.na(taxonomy[,rank]))
     taxonomy[indices, rank] <- rep(x = "Other", times=length(indices))
-    #replace empty cell
+    # Replace empty cells with the string 'Other'
    indices <- which(taxonomy[,rank] == "")
    taxonomy[indices,rank] <- rep(x = "Other", times=length(indices))
   }
+  # Replace _ with space
   taxonomy <- apply(X = taxonomy,MARGIN = 2,
                     FUN = gsub,pattern = "_",replacement = " ") %>%
     as.data.frame(stringAsfactor=F)
@@ -909,9 +951,12 @@ process_taxonomy <- function(taxonomy, prefix='\\w__') {
 
 # Function to format a taxonomy assignment table by appending a suffix
 # to a known name
-format_taxonomy_table <- function(taxonomy=taxonomy.m,stringToReplace="Other",
-                                  suffix=";Other") {
-
+format_taxonomy_table <- function(taxonomy, stringToReplace="Other", suffix=";Other") {
+
+  # taxonomy [DATAFRAME] - taxonomy dataframe to process
+  # stringToReplace [STRING] - string to replace
+  # suffix [STRING] - replacement string
+
   for (taxa_index in seq_along(taxonomy)) {
 
     indices <- grep(x = taxonomy[,taxa_index], pattern = stringToReplace)
@@ -921,13 +966,14 @@ format_taxonomy_table <- function(taxonomy=taxonomy.m,stringToReplace="Other",
                                          rep(x = suffix, times=length(indices)))
 
   }
+
   return(taxonomy)
 }
 
 fix_names<- function(taxonomy,stringToReplace,suffix){
-  #1~ taxonomy is a taxonomy dataframe with taxonomy ranks as column names
-  #2~ stringToReplace is a vector of regex strings specifying what to replace
-  #3~ suffix is a string specifying the replacement value
+  # taxonomy [DATAFRAME] - taxonomy dataframe with taxonomy ranks as column names
+  # stringToReplace [STRING VECTOR] - a vector of regex strings specifying what to replace
+  # suffix [STRING VECTOR] - a vector of strings specifying the replacement values
 
   for(index in seq_along(stringToReplace)){
 
@@ -947,6 +993,10 @@ make_feature_table 
<- function(count_matrix,taxonomy,
 #  make_feature_table(count_matrix = feature_counts_matrix,
 #  taxonomy = taxonomy_table, taxon_level = "Phylum")
 
+  # count_matrix [MATRIX] - ASV or OTU table
+  # taxonomy [MATRIX] - Taxonomy table
+  # taxon_level [STRING] - taxonomic level string, i.e. domain to species
+
   feature_counts_df <- data.frame(taxon_level=taxonomy[,taxon_level],
                                   count_matrix, check.names = FALSE,
                                   stringsAsFactors = FALSE)
@@ -973,11 +1023,14 @@ make_feature_table <- function(count_matrix,taxonomy,
 
 # Function to group rare taxa or return a table with the rare taxa
 group_low_abund_taxa <- function(abund_table, threshold=0.05,
                                  rare_taxa=FALSE) {
-  # abund_table is a relative abundance matrix with taxa as columns and samples as rows
-  #rare_taxa is a boolean specifying if only rare taxa should be returned
-  #If set to TRU then a table with only the rare taxa will be returned
-  #intialize an empty vector that will contain the indices for the
-  #low abundance columns/ taxa to group
+
+  # abund_table [MATRIX] - relative abundance matrix with taxa as columns and samples as rows
+  # threshold [NUMERIC] - threshold for filtering out rare taxa.
+  # rare_taxa [BOOLEAN] - boolean specifying if only rare taxa should be returned
+  #                       If set to TRUE then a table with only the rare taxa will be returned
+
+  # Initialize an empty vector that will contain the indices for the
+  # low abundance columns/ taxa to group
   taxa_to_group <- c()
   #initialize the index variable of species with low abundance (taxa/columns)
   index <- 1
@@ -1027,12 +1080,14 @@ group_low_abund_taxa <- function(abund_table, threshold=0.05,
 
 collapse_samples <- function(taxon_table,metadata,group,fun=sum,
                              convertToRelativeAbundance=FALSE){
   # function to collapse the samples in an oTU table with a defined function(fun) based on a group in metadata
-  # taxon_table - a matrix count table with samples as rows and features/OTUs as columns
-  # metadata - a dataframe to containing the group to collapse samples by. Sample names must be the rownames of the metadata
-  # group - an independent factor variable within the metadata to collapse the samples by
-  # fun - a function without brackets to apply in order to collapse the samples
-  # convertToRelativeAbundance - a boolean set to TRUE OR FALSE if the taxon_table shout be converted to relative abundance
-  # default is FALSE
+  # taxon_table [MATRIX] - a matrix count table with samples as rows and features/OTUs as columns
+  # metadata [DATAFRAME] - a dataframe containing the group to collapse samples by.
+  #                        Sample names must be the rownames of the metadata
+  # group [STRING] - an independent factor variable within the metadata to collapse the samples by
+  # fun [FUNCTION] - a function without brackets to apply in order to collapse the samples
+  # convertToRelativeAbundance [BOOLEAN] - a boolean set to TRUE OR FALSE if the taxon_table should
+  #                                        be converted to relative abundance. Default is FALSE.
+
   common.ids <- intersect(rownames(taxon_table),rownames(metadata))
   metadata <- droplevels(metadata[common.ids,,drop=FALSE])
   taxon_table <- taxon_table[common.ids,,drop=FALSE]
@@ -1054,6 +1109,9 @@ collapse_samples <- function(taxon_table,metadata,group,fun=sum,
 taxize_options(ncbi_sleep = 0.8)
 # A function to retrieve NCBI taxonomy id for a given taxonomy name
 get_ncbi_ids <- function(taxonomy, target_region){
+
+  # taxonomy [STRING] - taxonomy name to search for its NCBI ID
+  # target_region [STRING] - amplicon target region analyzed. 
Options are "16S", "18S" or "ITS"
 
   if(target_region == "ITS"){
     search_string <- "fungi"
@@ -1072,6 +1130,8 @@ get_ncbi_ids <- function(taxonomy, target_region){
 
 # Error handling function when running ANCOMBC2
 find_bad_taxa <- function(cnd){
+  # cnd [TRY ERROR] - error condition to catch when running the ancombc2 function below
+
   if(split_res == "replacement has 0 rows, data has 1" ||
      split_res == "All taxa contain structural zeros") {
@@ -1088,6 +1148,10 @@ find_bad_taxa <- function(cnd){
 
 # A function to run ANCOMBC2 while handling common errors
 ancombc2 <- function(data, ...) {
+
+  # data [TreeSummarizedExperiment] - a TreeSummarizedExperiment containing the feature,
+  #                                   taxonomy and metadata to be analyzed using ancombc2.
+
   tryCatch(
     ANCOMBC::ancombc2(data = data, ...),
     error = function(cnd) {
@@ -1116,6 +1180,8 @@ ancombc2 <- function(data, ...) {
 
 # Geometric mean function used when running DESeq2
 gm_mean <- function(x, na.rm=TRUE) {
+  # x [NUMERIC] - a numeric vector to calculate the geometric mean on
+  # na.rm [BOOLEAN] - should NAs be removed prior to calculation?
   exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
 }
@@ -1332,12 +1398,9 @@ taxonomy_table <- taxonomy_table[common_ids,]
 
 Alpha diversity examines the variety and abundance of taxa within individual samples. Rarefaction curves are utilized to visually represent this diversity, plotting the number of unique sequences (ASVs) identified against the total number of sequences sampled, offering a perspective on the saturation and completeness of sampling. Metrics like Chao1 richness estimates and Shannon diversity indices are employed to quantify the richness (total number of unique sequences) and diversity (combination of richness and evenness) within these samples.
 
-> Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**:
-> [Load Libraries](#load-libraries),
-> [Load Functions](#load-functions),
-> [Set Variables](#set-variables),
-> [Read-in Input Tables](#read-in-input-tables), and
-> [Preprocessing](#preprocessing)
+> Please note that if you'd like to run the code in this section, make sure that you [load the libraries](#load-libraries)
+and [functions](#load-functions), [read-in input tables](#read-in-input-tables) and [preprocess](#preprocessing) them in R
+by running the lines of code in [section 6](#6-amplicon-seq-data-analysis-set-up) sequentially, particularly those in [section 6b](#6b-r-environment-set-up).
 
 ```R
 # Create output directory if it doesn't already exist
@@ -1627,12 +1690,9 @@ ggsave(filename = glue("{alpha_diversity_out_dir}/{output_prefix}richness_and_di
 
 Beta diversity measures the variation in species composition between different samples or environments. A common practice in working with a new dataset is to generate some exploratory visualizations like ordinations and hierarchical clusterings. These give us a quick overview of how our samples relate to each other and can be a way to check for problems like batch effects. 
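To see how the beta diversity helpers documented in [Load Functions](#load-functions) fit together, here is a minimal, hypothetical sketch of one Bray-Curtis run. It assumes `feature_table`, `metadata`, `groups_colname`, and `group_colors` from the sections above, plus a `legend_title` string of your choosing; the pipeline's actual code follows in this section.

```R
# Hypothetical sketch, not the pipeline's exact code:
# rarefy the counts, compute Bray-Curtis distances, test for group
# differences, then draw a dendrogram and a PCoA with the helpers above.
legend_title <- "Groups"  # assumed label for the plot legends

ps <- transform_phyloseq(feature_table, metadata,
                         method = "rarefy", rarefaction_depth = 500)
dist_obj <- phyloseq::distance(ps, method = "bray")
stats_res <- run_stats(dist_obj, metadata, groups_colname)
dendo <- make_dendogram(dist_obj, metadata, groups_colname,
                        group_colors, legend_title)
pcoa_plot <- plot_pcoa(ps, stats_res, distance_method = "bray",
                       groups_colname, group_colors, legend_title)
```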
-> Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**: -> [Load Libraries](#load-libraries), -> [Load Functions](#load-functions), -> [Set Variables](#set-variables), -> [Read-in Input Tables](#read-in-input-tables), and -> [Preprocessing](#preprocessing) +> Please note that if you'd like to run the code in this section, make sure that you [load the libraries](#load-libraries) +and [functions](#load-functions), [read-in input tables](#read-in-input-tables) and [preprocess](#preprocessing) them in R +by running the lines of code in [section 6](#6-amplicon-seq-data-analysis-set-up) sequentially, particularly those in [section 6b](#6b-r-environment-set-up). ```R beta_diversity_out_dir <- "beta_diversity/" @@ -1733,12 +1793,9 @@ Were distance_method is either bray or euclidean for Bray Curtis and Euclidean d Taxonomic summaries provide insights into the composition of microbial communities at various taxonomic levels. -> Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**: -> [Load Libraries](#load-libraries), -> [Load Functions](#load-functions), -> [Set Variables](#set-variables), -> [Read-in Input Tables](#read-in-input-tables), and -> [Preprocessing](#preprocessing) +> Please note that if you'd like to run the code in this section, make sure that you [load the libraries](#load-libraries) +and [functions](#load-functions), [read-in input tables](#read-in-input-tables) and [preprocess](#preprocessing) them in R +by running the lines of code in [section 6](#6-amplicon-seq-data-analysis-set-up) sequentially, particularly those in [section 6b](#6b-r-environment-set-up). ```R taxonomy_plots_out_dir <- "taxonomy_plots/" @@ -1930,12 +1987,9 @@ Where taxon_level is all of phylum, class, order, family, genus and species. Using ANCOMBC 1, ANCOMBC 2, and DESeq2, we aim to uncover specific taxa that exhibit notable variations across different conditions, complemented by visualizations like volcano plots to illustrate these disparities and their implications on ASV expression and overall microbial community dynamics. -> Please note that if you'd like **to run this section in R, make sure that you run the code in the following sections above sequentially first**: -> [Load Libraries](#load-libraries), -> [Load Functions](#load-functions), -> [Set Variables](#set-variables), -> [Read-in Input Tables](#read-in-input-tables), and -> [Preprocessing](#preprocessing) +> Please note that if you'd like to run the code in this section, make sure that you [load the libraries](#load-libraries) +and [functions](#load-functions), [read-in input tables](#read-in-input-tables) and [preprocess](#preprocessing) them in R +by running the lines of code in [section 6](#6-amplicon-seq-data-analysis-set-up) sequentially, particularly those in [section 6b](#6b-r-environment-set-up). ### 10a. 
ANCOMBC 1

From 7cc24dc97237a7d44cc06edada1ba39c0f2edb60 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Tue, 21 Jan 2025 14:28:05 -0600
Subject: [PATCH 23/24] assigned a default value to params.help

---
 .../NF_AmpIllumina-B/workflow_code/main.nf | 2 ++
 .../NF_AmpIllumina-B/workflow_code/nextflow.config | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
index 01347530..54c55dd4 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/main.nf
@@ -7,6 +7,8 @@ c_bright_green = "\u001b[32;1m";
 c_blue = "\033[0;34m";
 c_reset = "\033[0m";
 
+params.help = false
+
 /**************************************************
 * HELP MENU **************************************
 **************************************************/
diff --git a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
index af60e555..c4844019 100644
--- a/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
+++ b/Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/workflow_code/nextflow.config
@@ -113,7 +113,6 @@ profiles {
     docker {
         docker.enabled = true
         docker.runOptions = '-u $(id -u):$(id -g)'
-        docker.userEmulation = true
         params.containerEngine = "docker"
     }

From 6923766f34108195ae179b7d6a1a1d9991ef0520 Mon Sep 17 00:00:00 2001
From: olabiyi
Date: Wed, 22 Jan 2025 16:29:35 -0800
Subject: [PATCH 24/24] Added some more comments to preprocessing

---
 .../GL-DPPD-7104-B.md | 50 +++++++++++--------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
index c948702e..a3fcf132 100644
--- a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
+++ b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md
@@ -1075,18 +1075,17 @@ group_low_abund_taxa <- function(abund_table, threshold=0.05,
 }
 
-# Function to collapse the samples in an oTU table with a defined function(fun)
+# Function to collapse samples in a feature table with a defined function(fun)
 # based on a group in metadata
-collapse_samples <- function(taxon_table,metadata,group,fun=sum,
-                             convertToRelativeAbundance=FALSE){
-  # function to collapse the samples in an oTU table with a defined function(fun) based on a group in metadata
+collapse_samples <- function(taxon_table, metadata, group, fun=sum, convertToRelativeAbundance=FALSE){
+
   # taxon_table [MATRIX] - a matrix count table with samples as rows and features/OTUs as columns
   # metadata [DATAFRAME] - a dataframe containing the group to collapse samples by.
   #                        Sample names must be the rownames of the metadata
-  # group [STRING] - an independent factor variable within the metadata to collapse the samples by
-  # fun [FUNCTION] - a function without brackets to apply in order to collapse the samples
-  # convertToRelativeAbundance [BOOLEAN] - a boolean set to TRUE OR FALSE if the taxon_table shout
-  #                                        be converted to relative abundance default is FALSE. 
+ # group [STRING] - variable / column within the metadata to collapse samples by
+  # fun [FUNCTION] - function (without brackets) to apply in order to collapse samples
+  # convertToRelativeAbundance [BOOLEAN] - should the values in the taxon table be converted
+  #                                        to per-sample relative abundance values? Default: FALSE.
 
   common.ids <- intersect(rownames(taxon_table),rownames(metadata))
   metadata <- droplevels(metadata[common.ids,,drop=FALSE])
   taxon_table <- taxon_table[common.ids,,drop=FALSE]
@@ -1180,8 +1179,8 @@ ancombc2 <- function(data, ...) {
 
 # Geometric mean function used when running DESeq2
 gm_mean <- function(x, na.rm=TRUE) {
-  # x [NUMERIC] - a numeric vector to calculate geometric mean on
-  # na.rm [BOOLEAN] - should NAs be remove prior to calculation.
+  # x [NUMERIC] - numeric vector to calculate geometric mean on
+  # na.rm [BOOLEAN] - should NAs be removed prior to the calculation?
   exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
 }
@@ -1224,22 +1223,24 @@ publication_format <- theme_bw() +
 
 #### Read-in Input Tables
 
 ```R
-custom_palette <-{COLOR_VECTOR}
+custom_palette <- {COLOR_VECTOR}
 groups_colname <- "groups"
 sample_colname <- "Sample Name"
 metadata_file <- file.path("amplicon_runsheet.csv")
 features_file <- file.path("counts_GLAmpSeq.tsv")
 taxonomy_file <- file.path("taxonomy_GLAmpSeq.tsv")
-# Read-in metadata
+# Read-in metadata and convert from tibble to dataframe
 metadata <- read_csv(file = metadata_file) %>% as.data.frame()
+# Set row names
 row.names(metadata) <- metadata[[sample_colname]]
+# Delete the sample column since the rownames now contain sample names
 metadata[,sample_colname] <- NULL
+# Get unique group names
 group_column_values <- metadata %>% pull(!!sym(groups_colname))
 group_levels <- unique(group_column_values)
 
-# Add colors to metadata equals to the number of levels
-# in the factor groups column
+# Add colors to the metadata equal to the number of groups
 num_colors <- length(group_levels)
 palette <- 'Set1'
 number_of_colors_in_palette <- 9
 if(num_colors <= number_of_colors_in_palette){
 
   colors <- custom_palette[1:num_colors]
 }
 
-# Metadata
+# ------ Metadata ----- #
+# Assign color names to each group
 group_colors <- setNames(colors, group_levels)
 metadata <- metadata %>%
   mutate(color = map_chr(!!sym(groups_colname),
                          function(group) { group_colors[group] }
                          )
-         )
+         ) # assign group specific colors to each row in metadata
+
+# Retrieve sample names
 sample_names <- rownames(metadata)
 deseq2_sample_names <- make.names(sample_names, unique = TRUE)
 
+# Subset metadata to contain only the groups and color columns
 sample_info_tab <- metadata %>%
-  select(!!groups_colname, color) %>%
-  arrange(!!sym(groups_colname))
+  select(!!groups_colname, color) %>% # select groups and color columns
+  arrange(!!sym(groups_colname)) # arrange metadata by the groups column
 
+# Retrieve unique colors
 values <- sample_info_tab %>% pull(color) %>% unique()
 
-# Feature or ASV table
+# ---- Import Feature or ASV table ---- #
 feature_table <- read.table(file = features_file, header = TRUE,
                             row.names = 1, sep = "\t")
 
-# Taxonomy table
+# ---- Import Taxonomy table ---- #
 taxonomy_table <- read.table(file = taxonomy_file, header = TRUE,
                              row.names = 1, sep = "\t")
 ```
@@ -1373,10 +1379,10 @@ taxonomy_table <- taxonomy_table[common_ids,]
 ```
 **Parameter Definitions:**
 
-* `remove_rare` - should rare features and samples be filtered out prior to analysis? 
If true, rare features and samples will be removed
+according to the cutoffs set below.
 * `prevalence_cutoff` - If `remove_rare` is true, a numerical fraction between 0 and 1.
-  Taxa with prevalences(the proportion of samples in which the taxon is present) less than this will be excluded from the analysis. Default is 0, i.e. do not exclude any taxa / features.
+  Taxa with prevalences (the proportion of samples in which the taxon is present) less than this will be excluded from the analysis. Default is 0, i.e. do not exclude any taxon / feature.
 * `library_cutoff` - If `remove_rare` is true, a numerical threshold for filtering samples based on library sizes.
-  Samples with library sizes less than lib_cut will be excluded in the analysis. Default is 0 i.e. no sample will be dropped. if you want to discard samples with read counts less than or equal to 100 then set to 100.
+  Samples with library sizes less than lib_cut will be excluded from the analysis. Default is 0, i.e. no sample will be dropped. If you want to discard samples with read counts less than or equal to 100 then set to 100.
 * `target_region` - amplicon target region. Options are either 16S, 18S or ITS
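For orientation, the sketch below shows what these two cutoffs amount to. This is hypothetical base-R code, not the pipeline's implementation; in the pipeline the filtering is applied during preprocessing and, for ANCOMBC, through the `prv_cut` and `lib_cut` arguments shown earlier.

```R
# Hypothetical illustration of the prevalence and library-size cutoffs,
# assuming feature_table has features as rows and samples as columns.
remove_rare <- TRUE
prevalence_cutoff <- 0.15  # drop taxa present in < 15% of samples
library_cutoff <- 100      # drop samples with library sizes < 100 reads

if (remove_rare) {
  # Drop samples whose library size falls below the cutoff
  keep_samples <- colSums(feature_table) >= library_cutoff
  feature_table <- feature_table[, keep_samples, drop = FALSE]

  # Drop taxa whose prevalence (fraction of samples with a non-zero count)
  # falls below the cutoff
  prevalence <- rowSums(feature_table > 0) / ncol(feature_table)
  feature_table <- feature_table[prevalence >= prevalence_cutoff, , drop = FALSE]
}
```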