Merge pull request #117 from olabiyi/DEV_Metagenomics_Illumina_NF_conversion

Metagenomics Nextflow workflow: Added post-processing workflow
asaravia-butler authored Oct 22, 2024
2 parents 5adb13f + 34b63a7 commit ecb72e8
Showing 11 changed files with 2,507 additions and 5 deletions.
@@ -29,6 +29,8 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M
5a. [Main outputs](#5a-main-outputs)
5b. [Resource logs](#5b-resource-logs)

6. [Post Processing](#6-post-processing)

<br>

---
@@ -140,7 +142,8 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc <p
* `-resume` - Resumes workflow execution using previously cached results
* `-profile` – Specifies the configuration profile(s) to load; `singularity` instructs Nextflow to set up and use Singularity for all software called in the workflow
*Required only if you would like to pull and process data directly from OSDR*
* `--GLDS_accession` – A GeneLab / OSD accession number, e.g., OSD-574.
@@ -178,3 +181,23 @@ Standard nextflow resource usage logs are also produced as follows:
- Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, CPU and memory used, machine-readable output)
> Further details about these logs can also be found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report).
<br>
---
### 6. Post Processing
For options and detailed help on how to run the post-processing workflow, run the following command:
```bash
nextflow run post_processing.nf --help
```
To generate a README file, a protocol file, an md5sums table, and a file association table after successfully running the processing workflow, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config), then run the following command:
```bash
nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity
```
The outputs of the run will be in a directory called `Post_Processing` by default.
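The parameters in `post_processing.config` supply the study metadata and the locations of the main workflow's outputs. As a rough orientation, a minimal sketch is shown below; only parameter names that appear in `post_processing.nf` (e.g. `params.assay_suffix`, `params.baseDir`, `params.directories.FastQC_Outputs`, `params.files.assay_table`) are taken from the workflow itself, while the metadata entries and the exact structure are assumptions — the shipped config file is the authoritative reference.
```groovy
// Illustrative sketch only -- not the shipped post_processing.config.

// Metadata used when generating the README and protocol (key names assumed)
params.name           = "First M. Last"
params.email          = "name@example.com"
params.GLDS_accession = "OSD-574"            // example accession
params.protocol_id    = "GL-DPPD-XXXX"       // placeholder protocol document ID
params.assay_suffix   = "_GLmetagenomics"    // example; referenced as ${params.assay_suffix} in post_processing.nf
params.baseDir        = "${baseDir}"         // root path scrubbed from reports by clean-paths.sh (assumption)

// Locations of the main workflow's outputs (structure assumed)
params.directories.FastQC_Outputs         = "../FastQC_Outputs/"
params.directories.Filtered_Sequence_Data = "../Filtered_Sequence_Data/"

// Input tables (structure assumed); leave assay_table empty to supply an ISA zip instead
params.files.assay_table = ""
```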

@@ -21,4 +21,4 @@ sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${1} \
| sed -E 's|.+/miniconda.+/envs/[^/]*/||g' \
| sed -E 's|/[^ ]*/GLDS-|GLDS-|g' \
| sed -E 's|/[a-z]{6}/[^ ]*|<path-removed-for-security-purposes>|g' \
| sed -E "s|${ROOT_DIR}||g" > t && mv t ${1}
Expand Up @@ -80,7 +80,7 @@ if (params.help) {
println(" --genes_dir [PATH] Specifies where predicted genes from the assemblies will be published. Default: ../Assembly-based_Processing/predicted-genes/.")
println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. Default: ../Assembly-based_Processing/annotations-and-taxonomy/.")
println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: ../Assembly-based_Processing/read-mapping/.")
println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.")
println(" --combined_output_dir [PATH] Assembly summaries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.")
println(" --bins_dir [PATH] Assembly bins directory. Default: ../Assembly-based_Processing/bins/.")
println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: ../Assembly-based_Processing/MAGs/.")
println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: ../Read-based_Processing/.")
@@ -0,0 +1,283 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2

process CLEAN_FASTQC_PATHS {
tag "Purging genelab paths from MultiQC zip files in ${params.directories.FastQC_Outputs}"
input:
path(FastQC_Outputs_dir)
output:
path("${OUT_DIR}"), emit: clean_dir
script:
OUT_DIR = "${FastQC_Outputs_dir.baseName}"
"""
WORKDIR=`pwd`
mv ${FastQC_Outputs_dir} FastQC_Outputs_dir
[ -d ${OUT_DIR}/ ] || mkdir ${OUT_DIR}/ && \\
cp -r FastQC_Outputs_dir/* ${OUT_DIR}/
[ -f ${OUT_DIR}/versions.txt ] && rm -rf ${OUT_DIR}/versions.txt
cat `which clean-paths.sh` > \${WORKDIR}/clean-paths.sh
chmod +x \${WORKDIR}/clean-paths.sh
echo "Purging paths from multiqc outputs"
cd \${WORKDIR}/${OUT_DIR}/
echo "Cleaning raw multiqc files with path info"
unzip raw_multiqc${params.assay_suffix}_report.zip && rm raw_multiqc${params.assay_suffix}_report.zip
cd raw_multiqc_report/raw_multiqc_data/
# No reason not to just run it on all
echo "Purging paths in all raw QC files..."
find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\;
cd \${WORKDIR}/${OUT_DIR}/
echo "Re-zipping up raw multiqc"
zip -r raw_multiqc${params.assay_suffix}_report.zip raw_multiqc_report/ && rm -rf raw_multiqc_report/
echo "Cleaning filtered multiqc files with path info..."
unzip filtered_multiqc${params.assay_suffix}_report.zip && rm filtered_multiqc${params.assay_suffix}_report.zip
cd filtered_multiqc_report/filtered_multiqc_data/
# No reason not to just run it on all
echo "Purging paths in all filtered QC files..."
find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\;
cd \${WORKDIR}/${OUT_DIR}/
echo "Re-zipping up filtered multiqc..."
zip -r filtered_multiqc${params.assay_suffix}_report.zip filtered_multiqc_report/ && rm -rf filtered_multiqc_report/
cd \${WORKDIR}
echo "Purging paths from multiqc outputs completed successfully..."
echo "Done! Paths purged successfully."
"""

}
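How this process is invoked is defined in the workflow block of `post_processing.nf`, which is not shown in this diff. A plausible call, staging the FastQC output directory referenced by `params.directories.FastQC_Outputs`, might look like the sketch below.
```groovy
// Sketch only: the actual workflow block is not part of this diff.
workflow {
    // Stage the FastQC/MultiQC output directory produced by the main workflow
    fastqc_dir_ch = Channel.fromPath( params.directories.FastQC_Outputs,
                                      type: 'dir', checkIfExists: true )

    CLEAN_FASTQC_PATHS( fastqc_dir_ch )

    // The scrubbed copy is then available as CLEAN_FASTQC_PATHS.out.clean_dir
}
```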

process PACKAGE_PROCESSING_INFO {

tag "Purging file paths and zipping processing info"

input:
val(files_and_dirs)
output:
path("processing_info${params.assay_suffix}.zip"), emit: zip

script:
"""
cat `which clean-paths.sh` > clean-paths.sh
chmod +x ./clean-paths.sh
[ -d processing_info/ ] || mkdir processing_info/ && \\
cp -r ${files_and_dirs.join(" ")} processing_info/
echo "Purging file paths"
find processing_info/ -type f -exec bash ./clean-paths.sh '{}' ${params.baseDir} \\;
# Purge file paths and then zip
zip -r processing_info${params.assay_suffix}.zip processing_info/
"""
}
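Since the process above takes `val(files_and_dirs)` and joins the entries with spaces, it expects a single list of files and directories rather than one path per task. A hedged example of feeding it, with placeholder items standing in for whatever the real workflow collects:
```groovy
// Sketch only: the listed items are placeholders, not the workflow's actual selection.
workflow {
    processing_items_ch = Channel.value( [ "Logs/", "unique-sample-IDs.txt", "nextflow.config" ] )

    PACKAGE_PROCESSING_INFO( processing_items_ch )
    // -> emits processing_info${params.assay_suffix}.zip on the `zip` channel
}
```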


process GENERATE_README {

beforeScript "chmod +x ${baseDir}/bin/*"
tag "Generating README for ${OSD_accession}"
input:
tuple val(name), val(email), val(output_prefix),
val(OSD_accession), val(protocol_id),
val(FastQC_Outputs), val(Filtered_Sequence_Data),
val(Read_Based_Processing), val(Assembly_Based_Processing),
val(Assemblies), val(Genes), val(Annotations_And_Tax),
val(Mapping), val(Combined_Output)
path(processing_info)
path(Bins)
path(MAGS)
output:
path("README${params.assay_suffix}.txt"), emit: readme

script:
"""
GL-gen-processed-metagenomics-readme \\
--output 'README${params.assay_suffix}.txt' \\
--GLDS-ID '${OSD_accession}' \\
--output-prefix '${output_prefix}' \\
--name '${name}' \\
--email '${email}' \\
--protocol_ID '${protocol_id}' \\
--assay_suffix '${params.assay_suffix}' \\
--processing_zip_file '${processing_info}' \\
--fastqc_dir '${FastQC_Outputs}' \\
--filtered_reads_dir '${Filtered_Sequence_Data}' \\
--read_based_dir '${Read_Based_Processing}' \\
--assembly_based_dir '${Assembly_Based_Processing}' \\
--assemblies_dir '${Assemblies}' \\
--genes_dir '${Genes}' \\
--annotations_and_tax_dir '${Annotations_And_Tax}' \\
--mapping_dir '${Mapping}' \\
--bins_dir '${Bins}' \\
--MAGs_dir '${MAGS}' \\
--combined_output_dir '${Combined_Output}' ${params.readme_extra}
"""

}
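The first input above is a single 14-field tuple of metadata and directory labels. A hedged sketch of how it might be assembled from config parameters (every `params.*` name below is an assumption, chosen to mirror the field names in the process):
```groovy
// Sketch only: all params.* entries here are assumed config keys.
workflow {
    readme_meta_ch = Channel.value(
        [ params.name, params.email, params.output_prefix,
          params.GLDS_accession, params.protocol_id,
          params.directories.FastQC_Outputs,
          params.directories.Filtered_Sequence_Data,
          params.directories.Read_Based_Processing,
          params.directories.Assembly_Based_Processing,
          params.directories.Assemblies, params.directories.Genes,
          params.directories.Annotations_And_Tax,
          params.directories.Mapping, params.directories.Combined_Output ]
    )

    // GENERATE_README( readme_meta_ch, processing_zip_ch, bins_ch, mags_ch )
}
```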


process VALIDATE_PROCESSING {

tag "Running automated validation and verification...."

input:
// Labels
tuple val(GLDS_accession), val(V_V_guidelines_link), val(output_prefix),
val(target_files), val(assay_suffix), val(log_dir_basename),
val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix),
val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix)
// Directory paths
tuple path(Filtered_Sequence_Data), path(Read_Based),
path(Assembly_Based), path(Assemblies), path(Mapping),
path(Genes), path(Annotation_And_Tax), path(Bins),
path(MAGS), path(Combined_Output), path(FastQC_Outputs)
// File paths
path(sample_ids_file)
path(README)
path(processing_info)

output:
path("${GLDS_accession}_${output_prefix}metagenomics-validation.log"), emit: log

script:
"""
GL-validate-processed-metagenomics-data \\
--output '${GLDS_accession}_${output_prefix}metagenomics-validation.log' \\
--GLDS-ID '${GLDS_accession}' \\
--readme '${README}' \\
--sample-IDs-file '${sample_ids_file}' \\
--V_V_guidelines_link '${V_V_guidelines_link}' \\
--processing_zip_file '${processing_info}' \\
--output-prefix '${output_prefix}' \\
--zip_targets '${target_files}' \\
--assay_suffix '${assay_suffix}' \\
--raw_suffix '${raw_suffix}' \\
--raw_R1_suffix '${raw_R1_suffix}' \\
--raw_R2_suffix '${raw_R2_suffix}' \\
--filtered_suffix '${filtered_suffix}' \\
--filtered_R1_suffix '${filtered_R1_suffix}' \\
--filtered_R2_suffix '${filtered_R2_suffix}' \\
--logs_dir_basename '${log_dir_basename}' \\
--fastqc_dir ${FastQC_Outputs} \\
--filtered_reads_dir ${Filtered_Sequence_Data} \\
--read_based_dir ${Read_Based} \\
--assembly_based_dir ${Assembly_Based} \\
--assemblies_dir ${Assemblies} \\
--genes_dir ${Genes} \\
--annotations_and_tax_dir ${Annotation_And_Tax} \\
--mapping_dir ${Mapping} \\
--bins_dir ${Bins} \\
--MAGs_dir ${MAGS} \\
--combined_output_dir ${Combined_Output} ${params.validation_extra}
"""
}


process GENERATE_CURATION_TABLE {

beforeScript "chmod +x ${baseDir}/bin/*"
tag "Generating a file association table for curation..."

input:
// GeneLab accession and Suffixes
tuple val(GLDS_accession), val(output_prefix), val(assay_suffix),
val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix),
val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix)
// File labels
tuple val(processing_zip_file), val(readme)
// Directory labels as paths - these paths are utilized as mere labels by the script
tuple path(raw_reads_dir), path(filtered_reads_dir), path(read_based_dir),
path(assembly_based_dir), path(annotation_and_tax_dir), path(combined_output_dir)
// Directory paths
tuple path(Assemblies), path(Genes), path(Mapping),
path(Bins), path(MAGS), path(FastQC_Outputs)
path(assay_table)
path(runsheet)

output:
path("${GLDS_accession}_${output_prefix}-associated-file-names.tsv"), emit: curation_table

script:
def INPUT_TABLE = "${params.files.assay_table}" == "" ? "--isa-zip ${assay_table}" : "--assay-table ${assay_table}"
"""
GL-gen-metagenomics-file-associations-table ${INPUT_TABLE} \\
--runsheet '${runsheet}' \\
--output '${GLDS_accession}_${output_prefix}-associated-file-names.tsv' \\
--GLDS-ID '${GLDS_accession}' \\
--output-prefix '${output_prefix}' \\
--assay_suffix '${assay_suffix}' \\
--raw_suffix '${raw_suffix}' \\
--raw_R1_suffix '${raw_R1_suffix}' \\
--raw_R2_suffix '${raw_R2_suffix}' \\
--filtered_suffix '${filtered_suffix}' \\
--filtered_R1_suffix '${filtered_R1_suffix}' \\
--filtered_R2_suffix '${filtered_R2_suffix}' \\
--processing_zip_file '${processing_zip_file}' \\
--readme '${readme}' \\
--fastqc_dir '${FastQC_Outputs}' \\
--assemblies_dir '${Assemblies}' \\
--genes_dir '${Genes}' \\
--mapping_dir '${Mapping}' \\
--bins_dir '${Bins}' \\
--MAGs_dir '${MAGS}' \\
--raw_reads_dir '${raw_reads_dir}' \\
--filtered_reads_dir '${filtered_reads_dir}' \\
--read_based_dir '${read_based_dir}' \\
--assembly_based_dir '${assembly_based_dir}' \\
--annotations_and_tax_dir '${annotation_and_tax_dir}' \\
--combined_output_dir '${combined_output_dir}' ${params.file_association_extra}
"""
}
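Note the `INPUT_TABLE` switch above: the same staged `assay_table` file is passed as `--isa-zip` when `params.files.assay_table` is empty, and as `--assay-table` otherwise. A hedged sketch of how the corresponding input channel could be selected (assuming a `params.files.isa_zip` entry exists alongside `params.files.assay_table`):
```groovy
// Sketch only: params.files.isa_zip is an assumed config key.
workflow {
    assay_table_ch = params.files.assay_table ?
        Channel.fromPath( params.files.assay_table, checkIfExists: true ) :   // curation assay table
        Channel.fromPath( params.files.isa_zip, checkIfExists: true )         // full ISA archive

    // GENERATE_CURATION_TABLE( ..., assay_table_ch, runsheet_ch )
}
```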


process GENERATE_MD5SUMS {

tag "Generating md5sums for the files to be released on OSDR..."

input:
path(processing_info)
path(README)
val(dirs)

output:
path("processed_md5sum${params.assay_suffix}.tsv"), emit: md5sum
script:
"""
mkdir processing/ && \\
cp -r ${dirs.join(" ")} ${processing_info} ${README} \\
processing/
# Generate md5sums
find -L processing/ -type f -exec md5sum '{}' \\; |
awk -v OFS='\\t' 'BEGIN{OFS="\\t"; printf "File Path\\tFile Name\\tmd5\\n"} \\
{N=split(\$2,a,"/"); sub(/processing\\//, "", \$2); print \$2,a[N],\$1}' \\
| grep -v "versions.txt" > processed_md5sum${params.assay_suffix}.tsv
"""
}


process GENERATE_PROTOCOL {

beforeScript "chmod +x ${baseDir}/bin/*"
tag "Generating your analysis protocol..."

input:
path(software_versions)
val(protocol_id)
output:
path("protocol.txt")
script:
"""
generate_protocol.sh ${software_versions} ${protocol_id} > protocol.txt
"""
}
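The workflow block that connects these processes is part of `post_processing.nf` but is not included in this diff. A hedged sketch of how the tail end could be wired, with placeholder channels standing in where the real workflow would use upstream outputs:
```groovy
// Sketch only: placeholder channels stand in for the real upstream outputs.
workflow {
    // Files/directories to archive before release (placeholders)
    processing_items_ch = Channel.value( [ "Logs/", "nextflow.config" ] )
    PACKAGE_PROCESSING_INFO( processing_items_ch )

    // In the real workflow the README would come from GENERATE_README.out.readme
    readme_ch = Channel.fromPath( "README${params.assay_suffix}.txt" )

    // Directories whose released files get md5summed (placeholders)
    release_dirs_ch = Channel.value( [ "../Filtered_Sequence_Data/", "../Read-based_Processing/" ] )
    GENERATE_MD5SUMS( PACKAGE_PROCESSING_INFO.out.zip, readme_ch, release_dirs_ch )

    // Protocol text from the pipeline's software versions file (location assumed)
    GENERATE_PROTOCOL( Channel.fromPath( "../Metadata/software_versions.txt" ), params.protocol_id )
}
```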
@@ -17,7 +17,7 @@ leave empty if wanting to use memory, the default, put in quotes the path to a d
already exists if wanting to use disk space
*/

params.gtdb_tk_scratch_location = ""
//params.gtdb_tk_scratch_location = ""

/* Retrieve MAGS.
Filters checkm results based on estimate completion, redundancy, and
@@ -167,7 +167,7 @@ executor.queueSize = 20
Note that relative paths such as '~/' and '../' are not expanded
by nextflow's evaluation of files, so don't use that.
*/
params.DB_ROOT = ("${baseDir}".split("/")[0..-2]).join('/') + "/Reference_DBs"
params.DB_ROOT = "${baseDir.getParent()}/Reference_DBs"

// Mount Humann databases to their predefined locations in the Biobakery container being used
if(params.database.chocophlan_dir == null ||