diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..74a24da --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: false +contact_links: + - name: "Join the poseidon Slack space" + url: https://poseidon-8el7276.slack.com/ + about: Discussions about everything Poseidon-Framework. diff --git a/.github/ISSUE_TEMPLATE/request_a_package.md b/.github/ISSUE_TEMPLATE/request_a_package.md new file mode 100644 index 0000000..1124eec --- /dev/null +++ b/.github/ISSUE_TEMPLATE/request_a_package.md @@ -0,0 +1,45 @@ +--- +name: "Please add: [PACKAGE_NAME]" +about: Request to add data from a publication to the Poseidon Package Directory +labels: "new package" +--- + + + +## Check for existing package +I have checked the following places for the requested package and could not find it: + +- [ ] Poseidon Package Directory +- [ ] The `packages/` directory in this repository +- [ ] Open issues in this repository +- [ ] Open pull requests in this repository + + + +## Required information for the package + +- DOI of publication: https://doi.org/10.1038/s41467-018-07483-5 <- Please replace this example DOI with the one for the relevant publication +- Project accession number: [PRJEB29360](https://www.ebi.ac.uk/ena/data/view/PRJEB29360) <- Please replace this example ENA project link with the one for the relevant publication + + + +## Will you work on this? + +If you are planning to work on compiling the `.ssf` file required to add this package, please assign yourself to this issue. +This helps the community collaborate effectively and avoid duplicating work. 
+ + diff --git a/.github/ISSUE_TEMPLATE/update_a_package.md b/.github/ISSUE_TEMPLATE/update_a_package.md new file mode 100644 index 0000000..eaecd55 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/update_a_package.md @@ -0,0 +1,52 @@ +--- +name: "Please update: [PACKAGE_NAME]" +about: Suggest an update to an already-existing poseidon package. +labels: "package update" +--- + + + +## Which package needs changes? + +Package name: `2018_Lamnidis_Fennoscandia` <- Replace the package name here with the name of the relevant package. + + + +## Check for upcoming changes + +I confirm there are no changes already suggested for this package that address my issue in: + +- [ ] [Open Pull Requests](https://github.com/poseidon-framework/poseidon-eager/pulls) +- [ ] [Open Issues](https://github.com/poseidon-framework/poseidon-eager/issues) + + + +## What needs changing, and why? + + + +## (optional) What are the changes you'd like? + + + +## (optional) How did you realise something needs updating? + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..d314ba2 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,39 @@ +--- +name: "Add: [PACKAGE_NAME]" +about: Add a new package for processing +labels: "new package" +--- + + + +Closes #XXX + +## PR Checklist +- [ ] The PR contains a sequencingSourceFile (`.ssf`) for the package. +- [ ] The name of the `.ssf` file(s) matches the package name (i.e. `packages/2023_my_package/2023_my_package.ssf`). +- [ ] Comment `@delphis-bot create backbone` to this pull request to awaken Poseidon's trusty helper. (This should be repeated whenever changes are made to the SSF file contents). +This will add a number of files to the PR. Check that they are all there. +- [ ] `packages/{package_name}/{package_name}.tsv` was added to the PR. +- [ ] `packages/{package_name}/{package_name}.tsv_patch.sh` was added to the PR from template. 
+- [ ] `packages/{package_name}/script_versions.txt` was added to the PR. +- [ ] `packages/{package_name}/{package_name}.config` was added to the PR from template. + + +## Human validation + + +### Package TSV file (`*.tsv`) + - [ ] I confirm that the `udg`, `library_built` columns are correct for each library. + - [ ] I confirm that the `R1_target_file` and `R2_target_file` columns point to the correct FastQ files for the library (i.e. consistent with SSF file). + +### Package config file (`*config`) +The template config file includes a few TODO statements, and information about them. Please ensure that you: + - [ ] I have selected the appropriate config for the CaptureType of the package. + - [ ] If any nf-core/eager parameters need to be altered from their defaults, I have added them within the `params` section at the end of the package config file. diff --git a/.github/workflows/create_package_skeleton.yml b/.github/workflows/create_package_skeleton.yml index 7794b94..151dcd5 100644 --- a/.github/workflows/create_package_skeleton.yml +++ b/.github/workflows/create_package_skeleton.yml @@ -1,14 +1,14 @@ name: Create additional PR files on: issue_comment: - types: [created, edited] + types: [created] jobs: deploy: # Only run if comment is on a PR with the main repo, and if it contains the magic keywords if: > contains(github.event.comment.html_url, '/pull/') && - contains(github.event.comment.body, '@delphis-bot create backbone') && + startsWith(github.event.comment.body, '@delphis-bot create backbone') && github.repository == 'poseidon-framework/poseidon-eager' runs-on: ubuntu-latest steps: @@ -71,7 +71,6 @@ jobs: run: | for package_name in ${{ steps.get_package_names.outputs.package_names }}; do ./scripts/create_eager_input.sh ${package_name} - echo -e "create_eager_input.sh:\t$(./scripts/create_eager_input.sh -v)" >./packages/${package_name}/script_versions.txt done - name: Add nextflow config from template @@ -88,6 +87,7 @@ jobs: run: | for package_name in 
${{ steps.get_package_names.outputs.package_names }}; do cp ./assets/template.tsv_patch.sh ./packages/${package_name}/${package_name}.tsv_patch.sh + chmod +x ./packages/${package_name}/${package_name}.tsv_patch.sh done - name: Commit & push changes diff --git a/.gitignore b/.gitignore index ec24126..fe310aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +raw_sequencing_data/ eager/ .nextflow* *.finalised.tsv +array_Logs/ +array_tempfiles/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..6b42804 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,29 @@ +# poseidon-framework/poseidon-eager: Changelog + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## v0.2.0dev - 25/04/2023 + +### `Added` + +- `scripts/source_me.sh`: Includes various helper functions for all other scripts. +- `scripts/create_eager_input.sh`: Script to create a preliminary nf-core/eager input TSV from a SSF file. +- `scripts/download_ena_data.py`: Python script to read SSF file, download FastQs from the ENA, and create a file of expected md5sums. +- `scripts/run_download.sh`: Wrapper script to submit the FastQ download job to the local MPI-EVA cluster. +- `scripts/validate_downloaded_data.sh`: Script to validate the downloaded data, and create the intended symlinks for running nf-core/eager. +- Added a Changelog: `CHANGELOG.md` +- `scripts/run_eager.sh`: Script to run nf-core/eager for all packages that need (re-)running. +- `scripts/submit_as_array.sh`: Helper script for submitting eager jobs as an SGE array on the MPI_EVA cluster. +- Github Issue templates +- Github pull request template +- @delphis-bot makes the template tsv_patch for a package executable. +- @delphis-bot now only triggered on new PR comment, not edits. +- Propagate versions of all config files used in nf-core/eager runs to `config_profile_description`. 
+- `scripts/download_and_localise_package_files.sh` is a wrapper script that does all the steps of getting a package backbone ready for eagering. Quality of life script for testing, but will be superseded soon. + +### `Fixed` + +### `Dependencies` + +### `Deprecated` diff --git a/assets/template.config b/assets/template.config index 010675a..229a3c2 100644 --- a/assets/template.config +++ b/assets/template.config @@ -1,4 +1,39 @@ -// Config template version: 0.1.0dev +// Keep track of config versions +config_template_version='0.2.0dev' +package_config_version='0.2.0dev' -// Load in main poseidon configuration file with default params and which loads other poseidon-specific profiles +includeConfig '../../conf/Poseidon.config' — wait
+includeConfig '../../conf/CaptureType_profiles/1240K.config' + +params { + // Keep track of config file versions used when processing + config_profile_description = "${config_profile_description}\nconfig_template_version: ${config_template_version}\npackage_config_version: ${package_config_version}" + config_profile_contact = "Thiseas C. Lamnidis (@TCLamnidis)" + + /* + TODO: If you need to change any of the default processing parameters for this package + you can specify these parameters below. + Any parameters not specified in any of the config files default to their nf-core/eager default values. + + For information on all available parameters and their default values see: + https://nf-co.re/eager/2.4.6/parameters + + You can see the default values for parameters within poseidon-eager at: + https://github.com/poseidon-framework/poseidon-eager/blob/main/conf/Poseidon.config + */ +} diff --git a/assets/template.tsv_patch.sh b/assets/template.tsv_patch.sh index 2fbb539..422e836 100644 --- a/assets/template.tsv_patch.sh +++ b/assets/template.tsv_patch.sh @@ -1,7 +1,8 @@ #!/usr/bin/env bash +set -uo pipefail ## Pipefail, complain on new unassigned variables. 
## Track the version of the TSV_patch template used -VERSION='0.1.0dev' +VERSION='0.2.0dev' ## This script is applied to the eager input TSV file locally to edit the dummy ## path to the fastQ files added by `create_eager_input.sh` to a real local ## path provided on runtime. @@ -14,8 +15,31 @@ VERSION='0.1.0dev' local_data_dir="$(readlink -f ${1})" input_tsv="$(readlink -f ${2})" output_tsv="$(dirname ${local_data_dir})/$(basename -s ".tsv" ${input_tsv}).finalised.tsv" +columns_to_keep=("Sample_Name" "Library_ID" "Lane" "Colour_Chemistry" "SeqType" "Organism" "Strandedness" "UDG_Treatment" "R1" "R2" "BAM") +source $(dirname ${2})/../../scripts/source_me.sh ## Load helper functions -sed -e "s||${local_data_dir}|g" ${input_tsv} > ${output_tsv} +## Index non-proliferated columns and exclude them from the finalised TSV +cut_selector='' +tsv_header=($(head -n1 ${input_tsv})) +for col_name in ${columns_to_keep[@]}; do + let idx=$(get_index_of ${col_name} "${tsv_header[@]}")+1 ## cut uses 1-based indexing; idx is 0 if the column is missing (get_index_of returned -1) + if [[ ! ${idx} -eq 0 ]]; then + cut_selector+="${idx}," + fi +done + +## Remove added columns, and put columns in right order +cut -f ${cut_selector%,} ${input_tsv} > ${output_tsv} +sed -i -e "s||${local_data_dir}|g" ${output_tsv} ## Any further commands to edit the file before finalisation should be added below as shown # sed -ie 's/replace_this/with_this/g' ${output_tsv} + +## Keep track of versions +version_file="$(dirname ${input_tsv})/script_versions.txt" +## Remove versions from older run if there +grep -v -F -e "$(basename ${0})" -e "source_me.sh for final TSV" ${version_file} >${version_file}.new +## Then add new versions +echo -e "$(basename ${0}):\t${VERSION}" >> ${version_file}.new +echo -e "source_me.sh for final TSV:\t${HELPER_FUNCTION_VERSION}" >>${version_file}.new +mv ${version_file}.new ${version_file} diff --git a/conf/CaptureType_profiles/1240K.config b/conf/CaptureType_profiles/1240K.config new file mode 100644 index 0000000..8625115 --- /dev/null +++ 
b/conf/CaptureType_profiles/1240K.config @@ -0,0 +1,20 @@ +// Keep track of config versions +capturetype_config_version='0.2.0dev' + +//TODO Backup all bed/snp files in the repo somewhere and use from central location instead of hard-coding paths. +// Profile specifying capture-specific parameters for packages of the '1240K' CaptureType. +params{ + // Keep track of config file versions used when processing + config_profile_name = "${config_profile_name}, on 1240K sites" + config_profile_description="${config_profile_description}\n1240K.config: ${capturetype_config_version}" + + // Qualimap bedfile for on-target coverage calculation + snpcapture_bed = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' + + // Genotyping + pileupcaller_bedfile = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' + pileupcaller_snpfile = '/mnt/archgen/public_data/Datashare_Boston_Jena_June2018.backup/1240K.snp' + + // 1240k depth calculation + anno_file = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' +} diff --git a/conf/EVA_cluster.config b/conf/EVA_cluster.config new file mode 100644 index 0000000..610910e --- /dev/null +++ b/conf/EVA_cluster.config @@ -0,0 +1,34 @@ +// This configuration file changes nf-core/eager defaults specified in the eva archgen profile +// that specifically apply to the execution of jobs, not their content. Therefore, it does not +// have a version number, as these parameters do not affect the reproducibility of the results. + +// Increase number of concurrent jobs to 24 +executor { + queueSize = 24 +} + +// Change amount of resources provided to MarkD. +process { + maxRetries = 2 + + withName:markduplicates { + memory = { task.attempt == 3 ? 16.GB : task.attempt == 2 ? 
8.GB : 4.GB } + } + + // More cores for bwa to reduce runtime + withName:bwa { + cpus = 8 + } +} + +profiles { + eva_local_paths { + params{ + // Mapping reference and reference indexes + fasta = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.fa' + fasta_index = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.fa.fai' + bwa_index = '/mnt/archgen/Reference_Genomes/Human/hs37d5/' + seq_dict = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.dict' + } + } +} diff --git a/conf/Minotaur.config b/conf/Minotaur.config new file mode 100644 index 0000000..d11accd --- /dev/null +++ b/conf/Minotaur.config @@ -0,0 +1,68 @@ +// Keep track of config versions +minotaur_config_version='0.2.0dev' + +// Default parameters for processing of data through Minotaur workflow. +params{ + // Keep track of config file versions used when processing + config_profile_name = "Minotaur pipeline config" + config_profile_description = "Minotaur.config: ${minotaur_config_version}" + config_profile_url = 'https://github.com/poseidon-framework/poseidon-eager' + + + // Mapping + bwaalnn = 0.01 + + // BAM filtering + run_bam_filtering = true // Filter out unmapped reads, so barplots in MultiQC are not completely overtaken by unmapped reads. + bam_unmapped_type = 'fastq' // Keep unmapped reads as a separate fastq file. Preferred format for possible future pathogen screening. + bam_mapping_quality_threshold = 30 // Keep MapQ 30+ (together with snpcapture_bed is needed for poseidon "coverage on target SNPs" field) + // The above also means that reads that are mapped with MapQ below 30 are lost after filtering, not present in the fastq OR the filtered bam! + + // mtDNA to nuclear ratio + run_mtnucratio = true + mtnucratio_header = "MT" + + // Bam Trimming + run_trim_bam = true + + // Double-stranded library clipping parameters + bamutils_clip_double_stranded_half_udg_left = 2 // Trim 2 bp of either side for dsDNA half-UDG libraries. 
+ bamutils_clip_double_stranded_half_udg_right = 2 // Trim 2 bp of either side for dsDNA half-UDG libraries. + bamutils_clip_double_stranded_none_udg_left = 7 // Trim 7 bp of either side for dsDNA non-UDG libraries. + bamutils_clip_double_stranded_none_udg_right = 7 // Trim 7 bp of either side for dsDNA non-UDG libraries. + + // Single-stranded library clipping paramaters + bamutils_clip_single_stranded_half_udg_left = 0 // No trimming for ssDNA libraries, since --singelStrandMode removes damage artefacts. + bamutils_clip_single_stranded_half_udg_right = 0 // No trimming for ssDNA libraries, since --singelStrandMode removes damage artefacts. + bamutils_clip_single_stranded_none_udg_left = 0 // No trimming for ssDNA libraries, since --singelStrandMode removes damage artefacts. + bamutils_clip_single_stranded_none_udg_right = 0 // No trimming for ssDNA libraries, since --singelStrandMode removes damage artefacts. + + // Genotyping + genotyping_source = 'trimmed' // Use trimmed bams for genotyping + run_genotyping = true + genotyping_tool = 'pileupcaller' + pileupcaller_min_map_quality = 30 + pileupcaller_min_base_quality = 30 + + //Sex determination + run_sexdeterrmine = true + + // Nuclear contamination + run_nuclear_contamination = true + contamination_chrom_name = 'X' + + //1240k Coverage/Depth calculation + run_bedtools_coverage = true + + // Custom MQC config file with increased max_table_rows value + multiqc_config = '../../conf/minotaur_multiqc_config.yaml' +} + +/* + A profile defining the local paths for reference genome and index on the EVA cluster. + These values are only loaded if the profile is activated on runtime with `-profile` + The values will overwrite any values in config files, but not those in other profiles + or those provided directly on the command line. 
+ For details on parameter inheritance across profiles and config files see: + https://nf-co.re/eager/2.4.6/usage#tutorial---what-are-profiles-and-how-to-use-them +*/ diff --git a/conf/Poseidon.config b/conf/Poseidon.config deleted file mode 100644 index 4196943..0000000 --- a/conf/Poseidon.config +++ /dev/null @@ -1,88 +0,0 @@ -profiles { - // Profile for microscope eagering. Includes parameters for the runs. - poseidon { - // Increase number of concurrent jobs to 24 - executor { - queueSize = 24 - } - - params{ - // Mapping reference and reference indexes - fasta = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.fa' - fasta_index = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.fa.fai' - bwa_index = '/mnt/archgen/Reference_Genomes/Human/hs37d5/' - seq_dict = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.dict' - - // Mapping - bwaalnn = 0.01 - - // BAM filtering - run_bam_filtering = true // Filter out unmapped reads, so barplots in MultiQC are not completely overtaken by unmapped reads. - bam_unmapped_type = 'fastq' // Keep unmapped reads as a separate fastq file. Preferred format for possible future pathogen screening. - bam_mapping_quality_threshold = 30 // Keep MapQ 30+ (together with snpcapture_bed is needed for poseidon "coverage on target SNPs" field) - // The above also means that reads that are mapped with MapQ below 30 are lost after filtering, not present in the fastq OR the filtered bam! - - // Calculate on-target coverage and capture efficiency metrics - snpcapture_bed = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' - - // mtDNA to nuclear ratio - run_mtnucratio = true - mtnucratio_header = "MT" - - // Bam Trimming - run_trim_bam = true - bamutils_clip_half_udg_left = 2 // Trim 2 bp of either side for half-UDG libraries. - bamutils_clip_half_udg_right = 2 // Trim 2 bp of either side for half-UDG libraries. - bamutils_clip_none_udg_left = 0 // Set to 0 so ssDNA non-UDG do not get trimmed. 
10/12/2020 - bamutils_clip_none_udg_right = 0 // Set to 0 so ssDNA non-UDG do not get trimmed. 10/12/2020 - - // [18/01/2022] New bam trimming params for version 2.4.1 onwards - bamutils_clip_double_stranded_half_udg_left = 2 // Trim 2 bp of either side for dsDNA half-UDG libraries. - bamutils_clip_double_stranded_half_udg_right = 2 // Trim 2 bp of either side for dsDNA half-UDG libraries. - bamutils_clip_double_stranded_none_udg_left = 7 // Trim 7 bp of either side for dsDNA non-UDG libraries. - bamutils_clip_double_stranded_none_udg_right = 7 // Trim 7 bp of either side for dsDNA non-UDG libraries. - - bamutils_clip_single_stranded_half_udg_left = 0 // No trimming for ssDNA libraries, since --singelStrandMode removes damage artefacts. - bamutils_clip_single_stranded_half_udg_right = 0 // No trimming for ssDNA libraries, since --singelStrandMode removes damage artefacts. - bamutils_clip_single_stranded_none_udg_left = 0 // No trimming for ssDNA libraries, since --singelStrandMode removes damage artefacts. - bamutils_clip_single_stranded_none_udg_right = 0 // No trimming for ssDNA libraries, since --singelStrandMode removes damage artefacts. 
- - // Genotyping - genotyping_source = 'trimmed' // Use trimmed bams for genotyping - run_genotyping = true - genotyping_tool = 'pileupcaller' - pileupcaller_min_map_quality = 30 - pileupcaller_min_base_quality = 30 - pileupcaller_bedfile = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' - pileupcaller_snpfile = '/mnt/archgen/public_data/Datashare_Boston_Jena_June2018.backup/1240K.snp' - - //Sex determination - run_sexdeterrmine = true - sexdeterrmine_bedfile = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' - - // Nuclear contamination - run_nuclear_contamination = true - contamination_chrom_name = 'X' - - //1240k Coverage/Depth calculation - run_bedtools_coverage = true - anno_file = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' - - // Custom MQC config file with increased max_table_rows value - multiqc_config = '/mnt/archgen/poseidon/poseidon-eager/conf/poseidon_multiqc_config.yaml' - } - // Change amount of resources provided to MarkD. - process { - maxRetries = 2 - - withName:markduplicates { - memory = { task.attempt == 3 ? 16.GB : task.attempt == 2 ? 8.GB : 4.GB } - } - - // More cores for bwa to reduce runtime - withName:bwa { - cpus = 8 - } - } - } -} diff --git a/conf/poseidon_multiqc_config.yaml b/conf/minotaur_multiqc_config.yaml similarity index 100% rename from conf/poseidon_multiqc_config.yaml rename to conf/minotaur_multiqc_config.yaml diff --git a/scripts/create_eager_input.sh b/scripts/create_eager_input.sh index 8426709..b4b0763 100755 --- a/scripts/create_eager_input.sh +++ b/scripts/create_eager_input.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash -VERSION='0.1.0dev' +VERSION='0.2.0dev' +set -o pipefail ## Pipefail, complain on new unassigned variables. ## Helptext function function Helptext() { @@ -15,7 +16,7 @@ repo_dir=$(dirname $(readlink -f ${0}))/.. 
## The repository will be the origin source ${repo_dir}/scripts/source_me.sh ## Source helper functions ## Parse CLI args. -TEMP=`getopt -q -o hvs --long help,version,skip_checksum -n "${0}" -- "$@"` +TEMP=`getopt -q -o hv --long help,version -n "${0}" -- "$@"` eval set -- "${TEMP}" ## Parameter defaults @@ -63,6 +64,7 @@ fi ## Read required info from yml file # package_rawdata_dir="${root_download_dir}/${package_name}" out_file="${package_dir}/${package_name}.tsv" +version_file="${package_dir}/script_versions.txt" ## This will all break down if the headers contain whitespace. ssf_header=($(head -n1 ${ena_table})) @@ -96,7 +98,7 @@ library_ids=() ## Paste together stuff to make a TSV. Header will flush older tsv if it exists. errecho -y "[${package_name}] Creating TSV input for nf-core/eager (v2.*)." -echo -e "Sample_Name\tLibrary_ID\tLane\tColour_Chemistry\tSeqType\tOrganism\tStrandedness\tUDG_Treatment\tR1\tR2\tBAM" > ${out_file} +echo -e "Sample_Name\tLibrary_ID\tLane\tColour_Chemistry\tSeqType\tOrganism\tStrandedness\tUDG_Treatment\tR1\tR2\tBAM\tR1_target_file\tR2_target_file" > ${out_file} organism="Homo sapiens (modern human)" while read line; do poseidon_id=$(echo "${line}" | awk -F "\t" -v X=${pid_col} '{print $X}') @@ -125,8 +127,11 @@ while read line; do row_lib_id="${row_pid}_${lib_name}${strandedness_suffix}" ## paste poseidon ID with Library ID to ensure unique naming of library results (both with suffix) let lane=$(count_instances ${row_lib_id} "${library_ids[@]}")+1 + ## Get intended input file names on local system (R1, R2) read -r seq_type r1 r2 < <(dummy_r1_r2_from_ena_fastq ${raw_data_dummy_path} ${row_lib_id}_L${lane} ${fastq_fn}) - echo -e "${row_pid}\t${row_lib_id}\t${lane}\t${colour_chemistry}\t${seq_type}\t${organism}\t${library_built}\t${udg_treatment}\t${r1}\t${r2}\tNA" >> ${out_file} + ## Also add column with the file that those will symlink to, for transparency during PR review. 
+ read -r seq_type2 r1_target r2_target < <(r1_r2_from_ena_fastq ${fastq_fn}) + echo -e "${row_pid}\t${row_lib_id}\t${lane}\t${colour_chemistry}\t${seq_type}\t${organism}\t${library_built}\t${udg_treatment}\t${r1}\t${r2}\tNA\t${r1_target}\t${r2_target}" >> ${out_file} ## Keep track of observed values poseidon_ids+=(${row_pid}) @@ -135,3 +140,8 @@ while read line; do done < <(tail -n +2 ${ena_table}) errecho -y "[${package_name}] TSV creation completed" + +## Keep track of versions +## This is the first part of the pipeline, so always flush any older versions, since everything needs rerunning. +echo -e "$(basename ${0}):\t${VERSION}" > ${version_file} +echo -e "source_me.sh for initial TSV:\t${HELPER_FUNCTION_VERSION}" >>${version_file} diff --git a/scripts/download_and_localise_package_files.sh b/scripts/download_and_localise_package_files.sh new file mode 100755 index 0000000..479a3eb --- /dev/null +++ b/scripts/download_and_localise_package_files.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +set -o pipefail ## Pipefail, complain on new unassigned variables. + +VERSION='0.2.0dev' + +## Helptext function +function Helptext() { + echo -ne "\t usage: ${0} [options] package_name\n\n" + echo -ne "This script takes in the name of a package as it appears in './packages', and carries out the localisation operation necessary to prepare for nf-core/eager processing.\n\n" + echo -ne "Options:\n" + echo -ne "-h, --help\t\tPrint this text and exit.\n" + echo -ne "-v, --version \t\tPrint version and exit.\n" +} + +## Parse CLI args. 
+TEMP=`getopt -q -o hv --long help,version -n "${0}" -- "$@"` +eval set -- "${TEMP}" + +##Parameter defaults +package_name='' +script_debug_string="[localise_package_files.sh]:" + +## Read in CLI arguments +while true ; do + case "$1" in + -h|--help) Helptext; exit 0 ;; + -v|--version) echo ${VERSION}; exit 0;; + --) package_name="${2}"; break ;; + *) echo -e "invalid option provided.\n"; Helptext; exit 1;; + esac +done + +## Source helper functions +repo_dir=$(dirname $(readlink -f ${0}))/.. ## The repository will be the original position of this script. If a user copies instead of symlink, this will fail. +source ${repo_dir}/scripts/source_me.sh ## Source helper functions + +if [[ -z ${package_name} ]]; then + errecho -r "${script_debug_string} No package name provided." + Helptext + exit 1 +fi + +## Other variables inferred from repo dir and package name. +package_dir="${repo_dir}/packages/${package_name}/" +ssf_file="${package_dir}/${package_name}.ssf" +download_dir="${repo_dir}/raw_sequencing_data/" +download_log_dir="${download_dir}/download_logs" +local_data_dir="${repo_dir}/raw_sequencing_data/${package_name}" +symlink_dir="${repo_dir}/eager/${package_name}/data" +tsv_patch_fn="${package_dir}/${package_name}.tsv_patch.sh" +original_tsv="${package_dir}/${package_name}.tsv" + +## STEP 1: Download data +## Add a header to the log to keep track of when each part was ran and what version was used. +echo "[download_ena_data.py]: $(date +'%y%m%d_%H%M') ${package_name}" >> ${download_log_dir}/download.${package_name}.out +echo "[download_ena_data.py]: version $(${repo_dir}/scripts/download_ena_data.py --version)" >> ${download_log_dir}/download.${package_name}.out +${repo_dir}/scripts/download_ena_data.py -d ${package_dir} -o ${download_dir} 2>> ${download_log_dir}/download.${package_name}.out +check_fail $? "${script_debug_string} Downloads did not finish completely. Try again." + +## STEP 2: Validate downloaded files. 
+${repo_dir}/scripts/validate_downloaded_data.sh ${ssf_file} ${local_data_dir} ${symlink_dir} +check_fail $? "${script_debug_string} Validation and symlink creation failed." + +## STEP 3: Localise TSV file. +errecho -y "${script_debug_string} Localising TSV for nf-core/eager." +${tsv_patch_fn} ${local_data_dir} ${original_tsv} +check_fail $? "${script_debug_string} TSV localisation failed." diff --git a/scripts/download_ena_data.py b/scripts/download_ena_data.py index 5dfa085..2706f30 100755 --- a/scripts/download_ena_data.py +++ b/scripts/download_ena_data.py @@ -7,6 +7,8 @@ import os import wget +VERSION = "0.2.0dev" + parser = argparse.ArgumentParser( prog = 'download_ena_data', description = 'This script downloads raw FASTQ data from the ENA, using ' @@ -16,12 +18,18 @@ parser.add_argument('-d', '--ssf_dir', metavar="", required=True, help="The directory to scan for poseidon-formatted sequencingSourceFiles, to download the described data.") parser.add_argument('-o', '--output_dir', metavar="", required=True, help="The output directory for the FASTQ files.") parser.add_argument('--dry_run', action='store_true', help="Only list the download commands, but don't do anything.") +parser.add_argument('-v', '--version', action='version', version=VERSION) def read_ena_table(file_name): l = file_name.readlines() headers = l[0].split() return map(lambda row: dict(zip(headers, row.split('\t'))), l[1:]) +def read_versions_fn(file_name): + l = file_name.readlines() + headers = ['tool', 'version'] + return map(lambda row: dict(zip(headers, row.strip().split('\t'))), l[0:]) + args = parser.parse_args() # os.path.abspath(args.sequencingSourceFile) ## Absolute path to ssf file. 
@@ -33,7 +41,7 @@ def read_ena_table(file_name): source_file = os.path.join(root, file_name) print("[download_ena_data.py]: Found Sequencing Source File: ", source_file, file=sys.stderr) package_name = os.path.splitext(file_name)[0] ## The SSF name and desired package name must match - odir = os.path.join(args.output_dir, package_name) + odir = os.path.abspath(os.path.join(args.output_dir, package_name)) os.makedirs(odir, exist_ok=True) line_count=1 with open(source_file, 'r') as f: @@ -54,7 +62,24 @@ def read_ena_table(file_name): else: print(f"[download_ena_data.py]: Downloading {fastq_url} into {target_file}", file=sys.stderr) if not args.dry_run: + ## TODO Swap to aspera for faster downloads wget.download("https://" + fastq_url, out=target_file) print(f"{fastq_md5} {target_file}", file=md5_fn) - - + + ## Keep track of version information + version_file=os.path.join(os.path.dirname(source_file), "script_versions.txt") + new_version_file=version_file+".tmp" + version_exists=False + with open(version_file, 'r') as versions_in: + versions_out=open(version_file+".tmp", 'w') + for version_entry in read_versions_fn(versions_in): + if version_entry['tool'] != 'download_ena_data.py:': + print("{}\t{}".format(version_entry['tool'], version_entry['version']), sep='\t', file = versions_out) + else: + ## If version for download exist, update it + version_exists=True + print("{}\t{}".format("download_ena_data.py:", VERSION) , sep='\t', file = versions_out) + ## If version for download did not exist, add it + if not version_exists: + print("{}\t{}".format("download_ena_data.py:", VERSION) , sep='\t', file = versions_out) + os.replace(src = new_version_file, dst = version_file) diff --git a/scripts/run_download.sh b/scripts/run_download.sh index 0e53998..c42c670 100755 --- a/scripts/run_download.sh +++ b/scripts/run_download.sh @@ -4,7 +4,7 @@ ## found in poseidon-formatted sequencingSourceFiles. ## !! 
This is a localised script, including hard-coded paths for processing in MPI-EVA. !!
 package_name=$1
-OUTDIR="/mnt/archgen/poseidon/raw_sequencing_data"
+OUTDIR="/mnt/archgen/poseidon/poseidon-eager/raw_sequencing_data"
 INDIR="/mnt/archgen/poseidon/poseidon-eager/packages/${package_name}"
 LOGDIR="${OUTDIR}/download_logs"
 SCRIPT="/mnt/archgen/poseidon/poseidon-eager/scripts/download_ena_data.py"
diff --git a/scripts/run_eager.sh b/scripts/run_eager.sh
new file mode 100755
index 0000000..6e2ea49
--- /dev/null
+++ b/scripts/run_eager.sh
@@ -0,0 +1,166 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+VERSION='0.2.0dev'
+
+TEMP=`getopt -q -o hvadD --long help,version,test,array,dry_run,debug -n 'run_eager.sh' -- "$@"`
+eval set -- "$TEMP"
+
+## DEBUG
+# echo $TEMP
+
+## Helptext function
+function Helptext() {
+  echo -ne "\t usage: ${0} [options]\n\n"
+  echo -ne "This script will submit nf-core/eager runs to create a poseidon package from published sequencing data. Only runs that need (re)processing will be submitted.\n\n"
+  echo -ne "Options:\n"
+  echo -ne "-h, --help \t\tPrint this text and exit.\n"
+  echo -ne "-a, --array \t\tWhen provided, the nf-core/eager jobs will be submitted as an array job, using 'submit_as_array.sh'. 10 jobs will run concurrently.\n"
+  echo -ne "-d, --dry_run \t\tPrint the commands to be run, but run nothing. Array files will still be created.\n"
+  echo -ne "-v, --version \t\tPrint version and exit.\n"
+}
+
+## Parameter defaults
+array=''
+temp_file=''
+with_tower=''
+dry_run="FALSE"
+debug="FALSE"
+
+## Read in CLI arguments
+while true ; do
+  case "$1" in
+    -h|--help) Helptext; exit 0 ;;
+    -d|--dry_run) dry_run="TRUE"; shift 1;;
+    -v|--version) echo ${VERSION}; exit 0;;
+    -a|--array) array="TRUE"; shift 1;;
+    -D|--debug) debug="TRUE"; shift 1;;
+    --) break ;;
+    *) echo -e "invalid option provided.\n"; Helptext; exit 1;;
+  esac
+done
+
+if [[ ${debug} == "TRUE" ]]; then
+  echo -e "[run_eager.sh]: DEBUG activated.
CLI argument parsing is not included in debug output."
+  set -x
+fi
+
+## Hard-coded params
+repo_dir='/mnt/archgen/poseidon/poseidon-eager'
+nxf_path="/mnt/archgen/tools/nextflow/22.04.5.5708/" ## Use centrally installed version
+eager_version='2.4.6'
+root_eager_dir="${repo_dir}/eager/"
+root_package_dir="${repo_dir}/packages/"
+nextflow_profiles="eva,archgen,medium_data,eva_local_paths"
+tower_config="${repo_dir}/.nextflow_tower.conf" ## Optional tower.nf configuration
+array_temp_fn_dir="${repo_dir}/array_tempfiles"
+array_logs_dir="${repo_dir}/array_Logs"
+submit_as_array_script="${repo_dir}/scripts/submit_as_array.sh"
+
+## Flood execution. Useful for testing/fast processing of small batches.
+if [[ ${array} == 'TRUE' ]]; then
+  ## Create the required array dirs if not existing
+  mkdir -p ${array_temp_fn_dir}
+
+  temp_file="/mnt/archgen/poseidon/poseidon-eager/array_tempfiles/$(date +'%y%m%d_%H%M')_Minotaur_eager_queue.txt"
+  ## Create new empty file with the correct naming, or flush contents of file if somehow it exists.
+  echo -n '' > ${temp_file}
+fi
+
+for eager_input in ${root_eager_dir}/*/*.finalised.tsv; do
+  ## Infer package name from finalised TSV name
+  package_name=$(basename -s '.finalised.tsv' ${eager_input})
+
+  ## Set paths needed for processing
+  eager_work_dir="${root_eager_dir}/${package_name}/work"
+  eager_output_dir="${root_eager_dir}/${package_name}/results"
+  package_config="${root_package_dir}/${package_name}/${package_name}.config"
+
+  ## Create necessary directories
+  mkdir -p ${eager_work_dir} ${eager_output_dir}
+
+  ## Load nf-tower configuration if it exists
+  ## This loads variables needed for monitoring execution of jobs remotely.
+  if [[ -f ${tower_config} ]]; then
+    source ${tower_config}
+    with_tower='-with-tower'
+  fi
+
+  ## Only try to run eager if the input is newer than the latest MultiQC report, or the parameter config is newer than the latest report.
+  if [[ ${eager_input} -nt ${eager_output_dir}/multiqc/multiqc_report.html ]] || [[ ${package_config} -nt ${eager_output_dir}/multiqc/multiqc_report.html ]]; then
+    ## Build nextflow command
+    CMD="${nxf_path}/nextflow run nf-core/eager \
+      -r ${eager_version} \
+      -profile ${nextflow_profiles} \
+      -c ${package_config} \
+      --input ${eager_input} \
+      --outdir ${eager_output_dir} \
+      -w ${eager_work_dir} \
+      ${with_tower} \
+      -ansi-log false \
+      -resume"
+
+    ## Array setup
+    if [[ ${array} == 'TRUE' ]]; then
+      ## For array submissions, the commands to be run will be added one by one to the temp_file
+      ## Then once all jobs have been added, submit that to qsub with each line being its own job.
+      ## Use `continue` to avoid running eager interactively for arrayed jobs.
+      echo "cd $(dirname ${eager_input}) ; ${CMD}" | tr -s " " >> ${temp_file}
+      continue ## Skip running eager interactively if arrays are requested.
+    fi
+
+    ## NON-ARRAY RUNS
+    ## Change to input directory to run from, to keep one cwd per run.
+    cd $(dirname ${eager_input})
+
+    ## Debugging info.
+    echo "Running eager on ${eager_input}:"
+    echo "${CMD}"
+
+
+    ## Don't run commands if dry run specified.
+    if [[ ${dry_run} == "FALSE" ]]; then
+      $CMD
+    fi
+
+    cd ${root_eager_dir} ## Then back to root dir
+  fi
+done
+
+## If array is requested submit the created array file to qsub below
+if [[ ${array} == 'TRUE' ]]; then
+  mkdir -p ${array_logs_dir}/$(basename -s '.txt' ${temp_file}) ## Create new directory for the logs for more traversable structure
+  jn=$(wc -l ${temp_file} | cut -f 1 -d " ") ## number of jobs equals number of lines
+  export NXF_OPTS='-Xms4G -Xmx4G' ## Set 4GB limit to Nextflow VM
+  export JAVA_OPTS='-Xms8G -Xmx8G' ## Set 8GB limit to Java VM
+  ## -V  Pass environment to job (includes nxf/java opts)
+  ## -S /bin/bash  Use bash
+  ## -l h_vmem=40G ## 40GB memory limit (8 for java + the rest for garbage collector)
+  ## -pe smp 2 ## Use two cores.
one for nextflow, one for garbage collector + ## -N Minotaur_spawner_$(basename ${temp_file}) ## Name the job + ## -cwd Run in currect run directory (ran commands include a cd anyway, but to find the files at least) + ## -j y ## join stderr and stdout into one output log file + ## -b y ## Provided command is a binary already (i.e. executable) + ## -o /mnt/archgen/Autorun_eager/array_Logs/ ## Keep all log files in one directory. + ## -tc 10 ## Number of concurrent spawner jobs (10) + ## -t 1-${jn} ## The number of array jobs (from 1 to $jn) + array_cmd="qsub \ + -V \ + -S /bin/bash \ + -l h_vmem=40G \ + -pe smp 2 \ + -N Minotaur_spawner_$(basename ${temp_file}) \ + -cwd \ + -j y \ + -b y \ + -o /mnt/archgen/Autorun_eager/array_Logs/$(basename ${temp_file}) \ + -tc 10 \ + -t 1-${jn} \ + $submit_as_array_script ${temp_file}" + + echo ${array_cmd} + ## Don't run comands if dry run specified. + if [[ ${dry_run} == "FALSE" ]]; then + $array_cmd + fi +fi diff --git a/scripts/source_me.sh b/scripts/source_me.sh index 2b1b0b9..fef6cbf 100644 --- a/scripts/source_me.sh +++ b/scripts/source_me.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +HELPER_FUNCTION_VERSION='0.2.0dev' ## Print coloured messages to stderr # errecho -r will print in red @@ -226,23 +227,21 @@ function infer_library_strandedness() { } ## Function to create R1 and R2 columns from ena_table fastq_fn entries -# usage: r1_r2_from_ena_fastq +# usage: r1_r2_from_ena_fastq # Returns: a thrupple of: seq_type R1 R2 function r1_r2_from_ena_fastq() { - local prefix local value local r1 local r2 local seq_type - prefix="${1}" - value="${2}" - r1="${prefix}$(pull_by_index ';' ${value} 0)" - r2="${prefix}$(pull_by_index ';' ${value} 1)" + value="${1}" + r1=$(basename "$(pull_by_index ';' ${value} 0)") + r2=$(basename "$(pull_by_index ';' ${value} 1)") seq_type="PE" ## If no R2, then SE - if [[ "${r2}" == "${prefix}" ]]; then + if [[ "${r2}" == '' ]]; then r2="NA" seq_type="SE" fi @@ -251,7 +250,7 @@ function 
r1_r2_from_ena_fastq() { } ## Function to create R1 and R2 columns from ena_table fastq_fn entries -# usage: r1_r2_from_ena_fastq +# usage: dummy_r1_r2_from_ena_fastq # Returns: a thrupple of: seq_type R1 R2 function dummy_r1_r2_from_ena_fastq() { local prefix @@ -319,7 +318,7 @@ function symlink_names_from_ena_fastq() { } ## Function to create R1 and R2 columns from ena_table fastq_fn entries -# usage: r1_r2_from_ena_fastq ${path_to_ena_data} ${fastq_fn} +# usage: local_r1_r2_from_ena_fastq ${path_to_ena_data} ${fastq_fn} # Returns: a thrupple of: seq_type R1 R2 function local_r1_r2_from_ena_fastq() { local data_path diff --git a/scripts/submit_as_array.sh b/scripts/submit_as_array.sh new file mode 100755 index 0000000..6ec4eb2 --- /dev/null +++ b/scripts/submit_as_array.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +if [[ ${1} = '' ]]; then + echo "No input file provided." + exit 0 +else + job_list_fn="${1}" +fi + +command=$(sed -n ${SGE_TASK_ID}p ${job_list_fn}) + +# echo "$command" +bash -c "${command}" diff --git a/scripts/validate_downloaded_data.sh b/scripts/validate_downloaded_data.sh index 820cbf2..556f804 100755 --- a/scripts/validate_downloaded_data.sh +++ b/scripts/validate_downloaded_data.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash set -uo pipefail ## Pipefail, complain on new unassigned variables. 
- +VERSION='0.2.0dev' ## Load helper bash functions source $(dirname ${0})/source_me.sh @@ -34,6 +34,7 @@ ssf_header=($(head -n1 ${ssf_file})) let pid_col=$(get_index_of 'poseidon_IDs' "${ssf_header[@]}")+1 let lib_name_col=$(get_index_of 'library_name' "${ssf_header[@]}")+1 let fastq_col=$(get_index_of 'fastq_ftp' "${ssf_header[@]}")+1 +let lib_built_col=$(get_index_of 'library_built' "${ssf_header[@]}")+1 poseidon_ids=() library_ids=() @@ -42,16 +43,27 @@ while read line; do poseidon_id=$(echo "${line}" | awk -F "\t" -v X=${pid_col} '{print $X}') lib_name=$(echo "${line}" | awk -F "\t" -v X=${lib_name_col} '{print $X}') fastq_fn=$(echo "${line}" | awk -F "\t" -v X=${fastq_col} '{print $X}') + library_built_field=$(echo "${line}" | awk -F "\t" -v X=${lib_built_col} '{print $X}') + library_built=$(infer_library_strandedness ${library_built_field} 0) ## One set of sequencing data can correspond to multiple poseidon_ids for index in $(seq 1 1 $(number_of_entries ';' ${poseidon_id})); do row_pid=$(pull_by_index ';' ${poseidon_id} "${index}-1") - row_lib_id="${row_pid}_${lib_name}" ## paste poseidon ID with Library ID to ensure unique naming of library results + + ## Add _ss suffix to sample_name (and later library_id) if single stranded (data never gets merged with double stranded data in eager). + if [[ "${library_built}" == "single" ]]; then + strandedness_suffix='_ss' + row_pid+=${strandedness_suffix} + else + strandedness_suffix='' + fi + + row_lib_id="${row_pid}_${lib_name}${strandedness_suffix}" ## paste poseidon ID with Library ID to ensure unique naming of library results (both with suffix) let lane=$(count_instances ${row_lib_id} "${library_ids[@]}")+1 + read -r seq_type r1 r1_target r2 r2_target < <(symlink_names_from_ena_fastq ${download_dir} ${symlink_dir} ${row_lib_id}_L${lane} ${fastq_fn}) - echo ${row_pid} ${seq_type} ${r1} ## Symink downloaded data to new naming to allow for multiple poseidon IDs per fastq. 
## All symlinks are recreated if already existing if [[ ${seq_type} == 'SE' ]]; then @@ -67,3 +79,12 @@ while read line; do done done < <(tail -n +2 ${ssf_file}) + +## Keep track of versions +version_file="$(dirname ${ssf_file})/script_versions.txt" +## Remove versions from older run if there +grep -v -F -e "$(basename ${0})" -e "source_me.sh for data validation" ${version_file} >${version_file}.new +## Then add new versions +echo -e "$(basename ${0}):\t${VERSION}" >> ${version_file}.new +echo -e "source_me.sh for data validation:\t${HELPER_FUNCTION_VERSION}" >>${version_file}.new +mv ${version_file}.new ${version_file}