Amplicon Illumina nextflow conversion: Added Python wrapper script #133

Open · wants to merge 24 commits into `DEV_Amplicon_Illumina_NF_conversion`
2,645 changes: 2,291 additions & 354 deletions Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md

Large diffs are not rendered by default.

70 changes: 59 additions & 11 deletions Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina-B/README.md
@@ -16,17 +16,23 @@ The current GeneLab Illumina amplicon sequencing data processing pipeline (AmpIl

3. [Fetch Singularity Images](#3-fetch-singularity-images)

4. [Run the workflow](#4-run-the-workflow)
4. [Run the workflow directly with nextflow](#4-run-the-workflow-directly-with-nextflow)
4a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input)
4b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input)
4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments)
4d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file)

5. [Workflow outputs](#5-workflow-outputs)
5a. [Main outputs](#5a-main-outputs)
5b. [Resource logs](#5b-resource-logs)
5. [Run the workflow indirectly using the Python wrapper script](#5-run-the-workflow-indirectly-using-the-python-wrapper-script)
5a. [Approach 1: Use an OSD or GeneLab accession as input](#5a-approach-1-use-an-osd-or-genelab-accession-as-input)
5b. [Approach 2: Use a csv file as input to the workflow](#5b-approach-2-use-a-csv-file-as-input-to-the-workflow)
5c. [Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run](#5c-approach-3-use-a-csv-file-as-input-to-the-workflow-and-supply-extra-arguments-to-nextflow-run)
5d. [Approach 4: Just create an edited nextflow config file but don't run the workflow](#5d-approach-4-just-create-an-edited-nextflow-config-file-but-dont-run-the-workflow)

6. [Post Processing](#6-post-processing)
6. [Workflow outputs](#6-workflow-outputs)
6a. [Main outputs](#6a-main-outputs)
6b. [Resource logs](#6b-resource-logs)

7. [Post Processing](#7-post-processing)

<br>

@@ -96,7 +102,7 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity

---

### 4. Run the Workflow
### 4. Run the workflow directly with nextflow

For options and detailed help on how to run the workflow, run the following command:

@@ -105,7 +111,7 @@ nextflow run main.nf --help
```

> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --input_file) that denote workflow-specific parameters. Take care to use the proper number of hyphens for each argument.
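For instance, both styles often appear in the same command. The line below is only an illustrative sketch combining the general Nextflow flags `-resume` and `-profile` with the workflow-specific `--input_file` parameter; the values are placeholders taken from elsewhere in this README:

```bash
# -resume and -profile use a single hyphen (general Nextflow arguments);
# --input_file uses a double hyphen (workflow-specific parameter)
nextflow run main.nf -resume -profile slurm,singularity --input_file PE_file.csv
```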

> Please Note: This workflow assumes that all your raw reads end with the same suffix. If they don't, please modify your input read filenames to have the same suffix as shown in [SE_file.csv](workflow_code/SE_file.csv) and [PE_file.csv](workflow_code/PE_file.csv).
<br>

#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD or GLDS accession as input
@@ -165,13 +171,55 @@ Once you've downloaded the workflow template, you can modify the parameters in t

---

### 5. Workflow outputs
### 5. Run the workflow indirectly using the Python wrapper script

For options and detailed help on how to run the workflow using the script, run the following command:

```bash
python run_workflow.py
```

#### 5a. Approach 1: Use an OSD or GeneLab accession as input

```bash
python run_workflow.py --run --target-region 16S --accession GLDS-487 --profile slurm,singularity
```

#### 5b. Approach 2: Use a csv file as input to the workflow

```bash
python run_workflow.py --run --target-region 16S --input-file PE_file.csv --F-primer AGAGTTTGATCCTGGCTCAG --R-primer CTGCCTCCCGTAGGAGT --profile singularity
```

#### 5c. Approach 3: Use a csv file as input to the workflow and supply extra arguments to nextflow run

Here we want to monitor our jobs with Nextflow Tower.

```bash
export TOWER_ACCESS_TOKEN=<ACCESS TOKEN>
export TOWER_WORKSPACE_ID=<WORKSPACE ID>
python run_workflow.py --run --target-region 16S --input-file PE_file.csv --F-primer AGAGTTTGATCCTGGCTCAG --R-primer CTGCCTCCCGTAGGAGT --profile slurm,conda --extra 'with-tower'
```

#### 5d. Approach 4: Just create an edited nextflow config file but don't run the workflow

```bash
python run_workflow.py --target-region 16S --accession GLDS-487 --profile slurm,singularity
```
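Since this approach only writes the config file, the workflow can be launched by hand afterwards. Below is a minimal sketch assuming the edited file is the workflow's `nextflow.config` (the file name and location are assumptions; check the wrapper's help output for where it is actually written):

```bash
# -C tells nextflow to use exactly this config file (file name assumed, see note above)
nextflow -C nextflow.config run main.nf -profile slurm,singularity
```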

> Note: When using the wrapper script, all outputs generated by the workflow are written to the directory specified by the `--output-dir` parameter. By default this is the parent directory (`..`).
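For example, to collect the outputs somewhere other than the parent directory, the Approach 1 command could be extended as follows (the output path is just a placeholder):

```bash
python run_workflow.py --run --target-region 16S --accession GLDS-487 \
                       --profile slurm,singularity --output-dir ./amplicon_outputs
```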

<br>

---

### 6. Workflow outputs

#### 5a. Main outputs
#### 6a. Main outputs

The outputs from this pipeline are documented in the [GL-DPPD-7104-B](../../Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md) processing protocol.

#### 5b. Resource logs
#### 6b. Resource logs

Standard nextflow resource usage logs are also produced as follows:

@@ -186,7 +234,7 @@ Standard nextflow resource usage logs are also produced as follows:

---

### 6. Post Processing
### 7. Post Processing

For options and detailed help on how to run the post-processing workflow, run the following command:

@@ -253,7 +253,9 @@ def get_read_counts_from_raw_multiqc(mapping_tab, raw_multiqc_stats_file_path,

# Reading in
zip_file = zipfile.ZipFile(curr_file_path)
curr_df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,5])
#curr_df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6])
curr_df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t")
curr_df = curr_df.iloc[:,[0,-1]] # keep the sample name column [0] and the last column [-1], which is the read counts column
curr_df.columns = ["sample", "counts"]
curr_df.set_index("sample", inplace = True)

@@ -268,7 +270,9 @@ def get_read_counts_from_raw_multiqc(mapping_tab, raw_multiqc_stats_file_path,
else:
input_zip = os.path.join(fastqc_dir, output_prefix + raw_multiqc_zip)
zip_file = zipfile.ZipFile(input_zip)
df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,5])
#df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6])
df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t")
df = df.iloc[:,[0,-1]] # keep the sample name column [0] and the last column [-1], which is the read counts column
df.columns = ["sample", "counts"]
df.set_index("sample", inplace = True)

@@ -531,4 +535,4 @@ def main():


if __name__ == "__main__":
main()
main()
@@ -134,6 +134,10 @@ def write_amplicon_body(output, output_file, assay_suffix, processing_zip_file,

# Outputs
output.write(" {:<41} {:>0}".format("- " + str(final_outputs_dir), "- primary output files (may or may not have additional prefix)\n"))
output.write(" {:<37} {:>0}".format(f"- alpha_diversity/", "- directory containing alpha diversity plots and statistics tables\n"))
output.write(" {:<37} {:>0}".format(f"- beta_diversity/", "- directory containing beta diversity plots and statistics tables\n"))
output.write(" {:<37} {:>0}".format(f"- differential_abundance/", "- directory containing the results (tables and plots) of differential abundance testing using one or all of ANCOMBC1, ANCOMBC2, and DESeq2\n"))
output.write(" {:<37} {:>0}".format(f"- taxonomy_plots/", "- directory containing sample-wise and group-wise taxonomy relative abundance stacked bar plots from phylum to species level\n"))
output.write(" {:<37} {:>0}".format(f"- *{assay_suffix}.fasta", "- fasta file of recovered sequences\n"))
output.write(" {:<37} {:>0}".format(f"- *counts{assay_suffix}.tsv", "- count table of sequences across samples\n"))
output.write(" {:<37} {:>0}".format(f"- *taxonomy{assay_suffix}.tsv", "- assigned taxonomy of recovered sequences\n"))
@@ -312,9 +312,10 @@ def get_read_count_stats(validation_log, prefix, multiqc_zip, multiqc_stats_file
""" Grabs read counts and summarizes """

zip_file = zipfile.ZipFile(multiqc_zip)

df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = [5])

#read_count_column = 6
#df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = [read_count_column])
df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t")
df = df.iloc[:,[-1]] # keep the last column, which is the read counts column
df.columns = ["counts"]
counts = df.counts.tolist()

@@ -486,4 +487,4 @@ def main():
get_read_count_stats(validation_log, filtered_prefix, filtered_multiqc_zip, filtered_multiqc_stats_file_path)

if __name__ == "__main__":
main()
main()
@@ -173,11 +173,11 @@ if ( target_region == "16S" ) {

} else if (target_region == "ITS" ) {

download.file("https://figshare.com/ndownloader/files/46245586", "UNITE_v2020_February2020.RData")
download.file("https://figshare.com/ndownloader/files/49181545", "UNITE_v2023_July2023.RData")
# loading reference taxonomy object
load("UNITE_v2020_February2020.RData")
load("UNITE_v2023_July2023.RData")
# removing downloaded file
#file.remove("UNITE_v2020_February2020.RData")
#file.remove("UNITE_v2023_July2023.RData")

ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")

@@ -142,12 +142,11 @@ if ( target_region == "16S" ) {

} else if (target_region == "ITS" ) {

download.file("https://figshare.com/ndownloader/files/46245586", "UNITE_v2020_February2020.RData")
download.file("https://figshare.com/ndownloader/files/49181545", "UNITE_v2023_July2023.RData")
# loading reference taxonomy object
load("UNITE_v2020_February2020.RData")
load("UNITE_v2023_July2023.RData")
# removing downloaded file
#file.remove("UNITE_v2020_February2020.RData")

#file.remove("UNITE_v2023_July2023.RData")
ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")

} else if (target_region == "18S" ) {
@@ -49,26 +49,36 @@ option_list <- list(
Default: 'Sample Name' ",
metavar="Sample Name"),

make_option(c("-p", "--output-prefix"), type="character", default="",
make_option(c("-o", "--output-prefix"), type="character", default="",
help="Unique name to tag onto output files. Default: empty string.",
metavar=""),

make_option(c("-d", "--rarefaction-depth"), type="numeric", default=500,
help="Minimum rarefaction depth for alpha diversity estimation. \
Default: 500. ",
metavar="500"),

make_option(c("-c", "--abundance-cutoff"), type="numeric", default=0.2,
help="A fraction defining how abundant features most be to be \
analyzes. Default: 1/5. ",
metavar="0.2"),


make_option(c("-r", "--remove-rare"), type="logical", default=FALSE,
help="Should rare features be filtered out?. \
Default: FALSE. ", action= "store_true",
metavar="FALSE"),
metavar="FALSE"),

make_option(c("-p", "--prevalence-cutoff"), type="numeric", default=0.15,
help="If --remove-rare, a numerical fraction between 0 and 1. Taxa with prevalences
(the proportion of samples in which the taxon is present) less
than --prevalence-cutoff will be excluded from the analysis.
Default is 0.15, i.e. exclude taxa / features that are not present
in at least 15% of the samples.",
metavar="0.15"),

make_option(c("-l", "--library-cutoff"), type="numeric", default=100,
help="If --remove-rare, a numerical threshold for filtering samples based on library
sizes. Samples with library sizes less than this cutoff will be
excluded from the analysis. Default is 100.
If you do not want to discard any sample, set this to 0.",
metavar="100"),

make_option(c("-l", "--legend-title"), type="character", default="Groups",
make_option(c("-e", "--legend-title"), type="character", default="Groups",
help="Legend title for alpha diversity plots.",
metavar="Groups"),

@@ -213,8 +223,6 @@ custom_palette <- custom_palette[-c(21:23, grep(pattern = pattern_to_filter,
metadata_file <- opt[["metadata-table"]]
features_file <- opt[["feature-table"]]
taxonomy_file <- opt[["taxonomy-table"]]
alpha_diversity_out_dir <-"alpha_diversity/"
if(!dir.exists(alpha_diversity_out_dir)) dir.create(alpha_diversity_out_dir)
# Metadata group column name to compare
groups_colname <- opt[["group"]]
sample_colname <- opt[["samples-column"]]
@@ -223,8 +231,12 @@ assay_suffix <- opt[["assay-suffix"]]
legend_title <- opt[["legend-title"]]
rarefaction_depth <- opt[["rarefaction-depth"]]
remove_rare <- opt[["remove-rare"]]
abundance_cutoff <- opt[["abundance-cutoff"]]

# taxon / ASV prevalence cutoff
prevalence_cutoff <- opt[["prevalence-cutoff"]] # 0.15 (15%)
# sample / library read count cutoff
library_cutoff <- opt[["library-cutoff"]] # 100
alpha_diversity_out_dir <-"alpha_diversity/"
if(!dir.exists(alpha_diversity_out_dir)) dir.create(alpha_diversity_out_dir)



@@ -273,11 +285,17 @@ feature_table <- read.table(file = features_file, header = TRUE,
row.names = 1, sep = "\t")

if(remove_rare){
# Remove rare ASVs
feature_table <- remove_rare_features(feature_table,
cut_off_percent=abundance_cutoff)

# Remove samples with less than library-cutoff
message(glue("Dropping samples with less than {library_cutoff} read counts"))
feature_table <- feature_table[,colSums(feature_table) >= library_cutoff]
# Remove rare ASVs
message(glue("Dropping features with prevalence less than {prevalence_cutoff * 100}%"))
feature_table <- remove_rare_features(feature_table,
cut_off_percent = prevalence_cutoff)
}


# Taxonomy
taxonomy_table <- read.table(file = taxonomy_file, header = TRUE,
row.names = 1, sep = "\t")