Merge pull request #117 from olabiyi/DEV_Metagenomics_Illumina_NF_conversion

Metagenomics Nextflow workflow: Added post-processing workflow
asaravia-butler authored Oct 22, 2024
2 parents 5adb13f + 34b63a7 commit ecb72e8
Showing 11 changed files with 2,507 additions and 5 deletions.
@@ -29,6 +29,8 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M
5a. [Main outputs](#5a-main-outputs)
5b. [Resource logs](#5b-resource-logs)

6. [Post Processing](#6-post-processing)

<br>

---
@@ -140,7 +142,8 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc <p
* `-resume` - Resumes workflow execution using previously cached results
* `-profile` – Specifies the configuration profile(s) to load; `singularity` instructs Nextflow to set up and use Singularity for all software called in the workflow
*Required only if you would like to pull and process data directly from OSDR*
* `--GLDS_accession` – A GeneLab / OSD accession number, e.g., OSD-574.
@@ -178,3 +181,23 @@ Standard nextflow resource usage logs are also produced as follows:
- Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, CPU and memory used, machine-readable output)
> Further details about these logs can also be found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report).
<br>
---
### 6. Post Processing
For options and detailed help on how to run the post-processing workflow, run the following command:
```bash
nextflow run post_processing.nf --help
```
To generate a README file, a protocol file, an md5sums table, and a file association table after successfully running the processing workflow, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config), then run the following command:
```bash
nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity
```
The outputs of the run will be in a directory called `Post_Processing` by default.
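The parameters in `post_processing.config` supply the study metadata and the locations of the main workflow's outputs. As a rough orientation, a minimal sketch is shown below; only parameter names that appear in `post_processing.nf` (e.g. `params.assay_suffix`, `params.baseDir`, `params.directories.FastQC_Outputs`, `params.files.assay_table`) are taken from the workflow itself, while the metadata entries and the exact structure are assumptions — the shipped config file is the authoritative reference.
```groovy
// Illustrative sketch only -- not the shipped post_processing.config.

// Metadata used when generating the README and protocol (key names assumed)
params.name           = "First M. Last"
params.email          = "name@example.com"
params.GLDS_accession = "OSD-574"            // example accession
params.protocol_id    = "GL-DPPD-XXXX"       // placeholder protocol document ID
params.assay_suffix   = "_GLmetagenomics"    // example; referenced as ${params.assay_suffix} in post_processing.nf
params.baseDir        = "${baseDir}"         // root path scrubbed from reports by clean-paths.sh (assumption)

// Locations of the main workflow's outputs (structure assumed)
params.directories.FastQC_Outputs         = "../FastQC_Outputs/"
params.directories.Filtered_Sequence_Data = "../Filtered_Sequence_Data/"

// Input tables (structure assumed); leave assay_table empty to supply an ISA zip instead
params.files.assay_table = ""
```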

@@ -21,4 +21,4 @@ sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${1} \
| sed -E 's|.+/miniconda.+/envs/[^/]*/||g' \
| sed -E 's|/[^ ]*/GLDS-|GLDS-|g' \
| sed -E 's|/[a-z]{6}/[^ ]*|<path-removed-for-security-purposes>|g' \
| sed -E "s|${ROOT_DIR}||g" > t && mv t ${1}
Expand Up @@ -80,7 +80,7 @@ if (params.help) {
println(" --genes_dir [PATH] Specifies where predicted genes from the assemblies will be published. Default: ../Assembly-based_Processing/predicted-genes/.")
println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. Default: ../Assembly-based_Processing/annotations-and-taxonomy/.")
println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: ../Assembly-based_Processing/read-mapping/.")
println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.")
println(" --combined_output_dir [PATH] Assembly summaries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.")
println(" --bins_dir [PATH] Assembly bins directory. Default: ../Assembly-based_Processing/bins/.")
println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: ../Assembly-based_Processing/MAGs/.")
println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: ../Read-based_Processing/.")
@@ -0,0 +1,283 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2

process CLEAN_FASTQC_PATHS {
tag "Purging genelab paths from MultiQC zip files in ${params.directories.FastQC_Outputs}"
input:
path(FastQC_Outputs_dir)
output:
path("${OUT_DIR}"), emit: clean_dir
script:
OUT_DIR = "${FastQC_Outputs_dir.baseName}"
"""
WORKDIR=`pwd`
mv ${FastQC_Outputs_dir} FastQC_Outputs_dir
[ -d ${OUT_DIR}/ ] || mkdir ${OUT_DIR}/ && \\
cp -r FastQC_Outputs_dir/* ${OUT_DIR}/
[ -f ${OUT_DIR}/versions.txt ] && rm -rf ${OUT_DIR}/versions.txt
cat `which clean-paths.sh` > \${WORKDIR}/clean-paths.sh
chmod +x \${WORKDIR}/clean-paths.sh
echo "Purging paths from multiqc outputs"
cd \${WORKDIR}/${OUT_DIR}/
echo "Cleaning raw multiqc files with path info"
unzip raw_multiqc${params.assay_suffix}_report.zip && rm raw_multiqc${params.assay_suffix}_report.zip
cd raw_multiqc_report/raw_multiqc_data/
# No reason not to just run it on all
echo "Purging paths in all raw QC files..."
find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\;
cd \${WORKDIR}/${OUT_DIR}/
echo "Re-zipping up raw multiqc"
zip -r raw_multiqc${params.assay_suffix}_report.zip raw_multiqc_report/ && rm -rf raw_multiqc_report/
echo "Cleaning filtered multiqc files with path info..."
unzip filtered_multiqc${params.assay_suffix}_report.zip && rm filtered_multiqc${params.assay_suffix}_report.zip
cd filtered_multiqc_report/filtered_multiqc_data/
# No reason not to just run it on all
echo "Purging paths in all filtered QC files..."
find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\;
cd \${WORKDIR}/${OUT_DIR}/
echo "Re-zipping up filtered multiqc..."
zip -r filtered_multiqc${params.assay_suffix}_report.zip filtered_multiqc_report/ && rm -rf filtered_multiqc_report/
cd \${WORKDIR}
echo "Purging paths from multiqc outputs completed successfully..."
echo "Done! Paths purged successfully."
"""

}
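How this process is invoked is defined in the workflow block of `post_processing.nf`, which is not shown in this diff. A plausible call, staging the FastQC output directory referenced by `params.directories.FastQC_Outputs`, might look like the sketch below.
```groovy
// Sketch only: the actual workflow block is not part of this diff.
workflow {
    // Stage the FastQC/MultiQC output directory produced by the main workflow
    fastqc_dir_ch = Channel.fromPath( params.directories.FastQC_Outputs,
                                      type: 'dir', checkIfExists: true )

    CLEAN_FASTQC_PATHS( fastqc_dir_ch )

    // The scrubbed copy is then available as CLEAN_FASTQC_PATHS.out.clean_dir
}
```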

process PACKAGE_PROCESSING_INFO {

tag "Purging file paths and zipping processing info"

input:
val(files_and_dirs)
output:
path("processing_info${params.assay_suffix}.zip"), emit: zip

script:
"""
cat `which clean-paths.sh` > clean-paths.sh
chmod +x ./clean-paths.sh
[ -d processing_info/ ] || mkdir processing_info/ && \\
cp -r ${files_and_dirs.join(" ")} processing_info/
echo "Purging file paths"
find processing_info/ -type f -exec bash ./clean-paths.sh '{}' ${params.baseDir} \\;
# Purge file paths and then zip
zip -r processing_info${params.assay_suffix}.zip processing_info/
"""
}
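Since the process above takes `val(files_and_dirs)` and joins the entries with spaces, it expects a single list of files and directories rather than one path per task. A hedged example of feeding it, with placeholder items standing in for whatever the real workflow collects:
```groovy
// Sketch only: the listed items are placeholders, not the workflow's actual selection.
workflow {
    processing_items_ch = Channel.value( [ "Logs/", "unique-sample-IDs.txt", "nextflow.config" ] )

    PACKAGE_PROCESSING_INFO( processing_items_ch )
    // -> emits processing_info${params.assay_suffix}.zip on the `zip` channel
}
```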


process GENERATE_README {

beforeScript "chmod +x ${baseDir}/bin/*"
tag "Generating README for ${OSD_accession}"
input:
tuple val(name), val(email), val(output_prefix),
val(OSD_accession), val(protocol_id),
val(FastQC_Outputs), val(Filtered_Sequence_Data),
val(Read_Based_Processing), val(Assembly_Based_Processing),
val(Assemblies), val(Genes), val(Annotations_And_Tax),
val(Mapping), val(Combined_Output)
path(processing_info)
path(Bins)
path(MAGS)
output:
path("README${params.assay_suffix}.txt"), emit: readme

script:
"""
GL-gen-processed-metagenomics-readme \\
--output 'README${params.assay_suffix}.txt' \\
--GLDS-ID '${OSD_accession}' \\
--output-prefix '${output_prefix}' \\
--name '${name}' \\
--email '${email}' \\
--protocol_ID '${protocol_id}' \\
--assay_suffix '${params.assay_suffix}' \\
--processing_zip_file '${processing_info}' \\
--fastqc_dir '${FastQC_Outputs}' \\
--filtered_reads_dir '${Filtered_Sequence_Data}' \\
--read_based_dir '${Read_Based_Processing}' \\
--assembly_based_dir '${Assembly_Based_Processing}' \\
--assemblies_dir '${Assemblies}' \\
--genes_dir '${Genes}' \\
--annotations_and_tax_dir '${Annotations_And_Tax}' \\
--mapping_dir '${Mapping}' \\
--bins_dir '${Bins}' \\
--MAGs_dir '${MAGS}' \\
--combined_output_dir '${Combined_Output}' ${params.readme_extra}
"""

}
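The first input above is a single 14-field tuple of metadata and directory labels. A hedged sketch of how it might be assembled from config parameters (every `params.*` name below is an assumption, chosen to mirror the field names in the process):
```groovy
// Sketch only: all params.* entries here are assumed config keys.
workflow {
    readme_meta_ch = Channel.value(
        [ params.name, params.email, params.output_prefix,
          params.GLDS_accession, params.protocol_id,
          params.directories.FastQC_Outputs,
          params.directories.Filtered_Sequence_Data,
          params.directories.Read_Based_Processing,
          params.directories.Assembly_Based_Processing,
          params.directories.Assemblies, params.directories.Genes,
          params.directories.Annotations_And_Tax,
          params.directories.Mapping, params.directories.Combined_Output ]
    )

    // GENERATE_README( readme_meta_ch, processing_zip_ch, bins_ch, mags_ch )
}
```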


process VALIDATE_PROCESSING {

tag "Running automated validation and verification...."

input:
// Labels
tuple val(GLDS_accession), val(V_V_guidelines_link), val(output_prefix),
val(target_files), val(assay_suffix), val(log_dir_basename),
val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix),
val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix)
// Directory paths
tuple path(Filtered_Sequence_Data), path(Read_Based),
path(Assembly_Based), path(Assemblies), path(Mapping),
path(Genes), path(Annotation_And_Tax), path(Bins),
path(MAGS), path(Combined_Output), path(FastQC_Outputs)
// File paths
path(sample_ids_file)
path(README)
path(processing_info)

output:
path("${GLDS_accession}_${output_prefix}metagenomics-validation.log"), emit: log

script:
"""
GL-validate-processed-metagenomics-data \\
--output '${GLDS_accession}_${output_prefix}metagenomics-validation.log' \\
--GLDS-ID '${GLDS_accession}' \\
--readme '${README}' \\
--sample-IDs-file '${sample_ids_file}' \\
--V_V_guidelines_link '${V_V_guidelines_link}' \\
--processing_zip_file '${processing_info}' \\
--output-prefix '${output_prefix}' \\
--zip_targets '${target_files}' \\
--assay_suffix '${assay_suffix}' \\
--raw_suffix '${raw_suffix}' \\
--raw_R1_suffix '${raw_R1_suffix}' \\
--raw_R2_suffix '${raw_R2_suffix}' \\
--filtered_suffix '${filtered_suffix}' \\
--filtered_R1_suffix '${filtered_R1_suffix}' \\
--filtered_R2_suffix '${filtered_R2_suffix}' \\
--logs_dir_basename '${log_dir_basename}' \\
--fastqc_dir ${FastQC_Outputs} \\
--filtered_reads_dir ${Filtered_Sequence_Data} \\
--read_based_dir ${Read_Based} \\
--assembly_based_dir ${Assembly_Based} \\
--assemblies_dir ${Assemblies} \\
--genes_dir ${Genes} \\
--annotations_and_tax_dir ${Annotation_And_Tax} \\
--mapping_dir ${Mapping} \\
--bins_dir ${Bins} \\
--MAGs_dir ${MAGS} \\
--combined_output_dir ${Combined_Output} ${params.validation_extra}
"""
}


process GENERATE_CURATION_TABLE {

beforeScript "chmod +x ${baseDir}/bin/*"
tag "Generating a file association table for curation..."

input:
// GeneLab accession and Suffixes
tuple val(GLDS_accession), val(output_prefix), val(assay_suffix),
val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix),
val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix)
// File labels
tuple val(processing_zip_file), val(readme)
// Directory labels as paths - these paths are utilized as mere labels by the script
tuple path(raw_reads_dir), path(filtered_reads_dir), path(read_based_dir),
path(assembly_based_dir), path(annotation_and_tax_dir), path(combined_output_dir)
// Directory paths
tuple path(Assemblies), path(Genes), path(Mapping),
path(Bins), path(MAGS), path(FastQC_Outputs)
path(assay_table)
path(runsheet)

output:
path("${GLDS_accession}_${output_prefix}-associated-file-names.tsv"), emit: curation_table

script:
def INPUT_TABLE = "${params.files.assay_table}" == "" ? "--isa-zip ${assay_table}" : "--assay-table ${assay_table}"
"""
GL-gen-metagenomics-file-associations-table ${INPUT_TABLE} \\
--runsheet '${runsheet}' \\
--output '${GLDS_accession}_${output_prefix}-associated-file-names.tsv' \\
--GLDS-ID '${GLDS_accession}' \\
--output-prefix '${output_prefix}' \\
--assay_suffix '${assay_suffix}' \\
--raw_suffix '${raw_suffix}' \\
--raw_R1_suffix '${raw_R1_suffix}' \\
--raw_R2_suffix '${raw_R2_suffix}' \\
--filtered_suffix '${filtered_suffix}' \\
--filtered_R1_suffix '${filtered_R1_suffix}' \\
--filtered_R2_suffix '${filtered_R2_suffix}' \\
--processing_zip_file '${processing_zip_file}' \\
--readme '${readme}' \\
--fastqc_dir '${FastQC_Outputs}' \\
--assemblies_dir '${Assemblies}' \\
--genes_dir '${Genes}' \\
--mapping_dir '${Mapping}' \\
--bins_dir '${Bins}' \\
--MAGs_dir '${MAGS}' \\
--raw_reads_dir '${raw_reads_dir}' \\
--filtered_reads_dir '${filtered_reads_dir}' \\
--read_based_dir '${read_based_dir}' \\
--assembly_based_dir '${assembly_based_dir}' \\
--annotations_and_tax_dir '${annotation_and_tax_dir}' \\
--combined_output_dir '${combined_output_dir}' ${params.file_association_extra}
"""
}
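Note the `INPUT_TABLE` switch above: the same staged `assay_table` file is passed as `--isa-zip` when `params.files.assay_table` is empty, and as `--assay-table` otherwise. A hedged sketch of how the corresponding input channel could be selected (assuming a `params.files.isa_zip` entry exists alongside `params.files.assay_table`):
```groovy
// Sketch only: params.files.isa_zip is an assumed config key.
workflow {
    assay_table_ch = params.files.assay_table ?
        Channel.fromPath( params.files.assay_table, checkIfExists: true ) :   // curation assay table
        Channel.fromPath( params.files.isa_zip, checkIfExists: true )         // full ISA archive

    // GENERATE_CURATION_TABLE( ..., assay_table_ch, runsheet_ch )
}
```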


process GENERATE_MD5SUMS {

tag "Generating md5sums for the files to be released on OSDR..."

input:
path(processing_info)
path(README)
val(dirs)

output:
path("processed_md5sum${params.assay_suffix}.tsv"), emit: md5sum
script:
"""
mkdir processing/ && \\
cp -r ${dirs.join(" ")} ${processing_info} ${README} \\
processing/
# Generate md5sums
find -L processing/ -type f -exec md5sum '{}' \\; |
awk -v OFS='\\t' 'BEGIN{OFS="\\t"; printf "File Path\\tFile Name\\tmd5\\n"} \\
{N=split(\$2,a,"/"); sub(/processing\\//, "", \$2); print \$2,a[N],\$1}' \\
| grep -v "versions.txt" > processed_md5sum${params.assay_suffix}.tsv
"""
}


process GENERATE_PROTOCOL {

beforeScript "chmod +x ${baseDir}/bin/*"
tag "Generating your analysis protocol..."

input:
path(software_versions)
val(protocol_id)
output:
path("protocol.txt")
script:
"""
generate_protocol.sh ${software_versions} ${protocol_id} > protocol.txt
"""
}
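The workflow block that connects these processes is part of `post_processing.nf` but is not included in this diff. A hedged sketch of how the tail end could be wired, with placeholder channels standing in where the real workflow would use upstream outputs:
```groovy
// Sketch only: placeholder channels stand in for the real upstream outputs.
workflow {
    // Files/directories to archive before release (placeholders)
    processing_items_ch = Channel.value( [ "Logs/", "nextflow.config" ] )
    PACKAGE_PROCESSING_INFO( processing_items_ch )

    // In the real workflow the README would come from GENERATE_README.out.readme
    readme_ch = Channel.fromPath( "README${params.assay_suffix}.txt" )

    // Directories whose released files get md5summed (placeholders)
    release_dirs_ch = Channel.value( [ "../Filtered_Sequence_Data/", "../Read-based_Processing/" ] )
    GENERATE_MD5SUMS( PACKAGE_PROCESSING_INFO.out.zip, readme_ch, release_dirs_ch )

    // Protocol text from the pipeline's software versions file (location assumed)
    GENERATE_PROTOCOL( Channel.fromPath( "../Metadata/software_versions.txt" ), params.protocol_id )
}
```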
@@ -17,7 +17,7 @@ leave empty if wanting to use memory, the default, put in quotes the path to a d
already exists if wanting to use disk space
*/

params.gtdb_tk_scratch_location = ""
//params.gtdb_tk_scratch_location = ""

/* Retrieve MAGS.
Filters checkm results based on estimate completion, redundancy, and
@@ -167,7 +167,7 @@ executor.queueSize = 20
Note that relative paths such as '~/' and '../' are not expanded
by nextflow's evaluation of files, so don't use that.
*/
params.DB_ROOT = ("${baseDir}".split("/")[0..-2]).join('/') + "/Reference_DBs"
params.DB_ROOT = "${baseDir.getParent()}/Reference_DBs"

// Mount Humann databases to their predefined locations in the Biobakery container being used
if(params.database.chocophlan_dir == null ||