From 5380c6bf3502a551fb86b4b68da6a2ce02562e00 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 11:21:08 +0200 Subject: [PATCH 01/22] style: formatting with isort & black --- ViroConstrictor/workflow/scripts/match_ref/group_refs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ViroConstrictor/workflow/scripts/match_ref/group_refs.py b/ViroConstrictor/workflow/scripts/match_ref/group_refs.py index 911ee11..7c4bc9e 100644 --- a/ViroConstrictor/workflow/scripts/match_ref/group_refs.py +++ b/ViroConstrictor/workflow/scripts/match_ref/group_refs.py @@ -37,9 +37,9 @@ # add the seqrecord information to the dataframe for record in renamed_seqrecords: df.loc[df["Reference"].str.contains(record.id), "seqrecord_id"] = record.id - df.loc[ - df["Reference"].str.contains(record.id), "seqrecord_description" - ] = record.description + df.loc[df["Reference"].str.contains(record.id), "seqrecord_description"] = ( + record.description + ) df.loc[df["Reference"].str.contains(record.id), "seqrecord_name"] = record.name df.loc[df["Reference"].str.contains(record.id), "seqrecord_seq"] = str(record.seq) From 836daa38bc90a801fb5277291fba64eb4d2e13eb Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 11:23:26 +0200 Subject: [PATCH 02/22] build: add base container definitions --- containers/Alignment.dockerfile | 29 +++++++++++++++++++++++++++++ containers/Clean.dockerfile | 29 +++++++++++++++++++++++++++++ containers/Consensus.dockerfile | 29 +++++++++++++++++++++++++++++ containers/ORF_analysis.dockerfile | 29 +++++++++++++++++++++++++++++ containers/Scripts.dockerfile | 30 ++++++++++++++++++++++++++++++ 5 files changed, 146 insertions(+) create mode 100644 containers/Alignment.dockerfile create mode 100644 containers/Clean.dockerfile create mode 100644 containers/Consensus.dockerfile create mode 100644 containers/ORF_analysis.dockerfile create mode 100644 containers/Scripts.dockerfile diff --git a/containers/Alignment.dockerfile b/containers/Alignment.dockerfile new file mode 100644 index 0000000..e5fd7fb --- /dev/null +++ b/containers/Alignment.dockerfile @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/Alignment.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ + +LABEL org.opencontainers.image.description="Sequence alignment processes and tools for the ViroConstrictor workflow." + +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/Clean.dockerfile b/containers/Clean.dockerfile new file mode 100644 index 0000000..1d41945 --- /dev/null +++ b/containers/Clean.dockerfile @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/Clean.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ + +LABEL org.opencontainers.image.description="Data cleaning processes and tools for the ViroConstrictor workflow." 
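+# root is needed only temporarily below: to create a fixed-UID service user and install the pinned environment before privileges are dropped again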
+ +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/Consensus.dockerfile b/containers/Consensus.dockerfile new file mode 100644 index 0000000..45a0380 --- /dev/null +++ b/containers/Consensus.dockerfile @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/Consensus.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ + +LABEL org.opencontainers.image.description="Consensus sequence generation processes and tools for the ViroConstrictor workflow." + +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/ORF_analysis.dockerfile b/containers/ORF_analysis.dockerfile new file mode 100644 index 0000000..8d2486c --- /dev/null +++ b/containers/ORF_analysis.dockerfile @@ -0,0 +1,29 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/ORF_analysis.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ + +LABEL org.opencontainers.image.description="ORF analysis processes and tools for the ViroConstrictor workflow." + +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] diff --git a/containers/Scripts.dockerfile b/containers/Scripts.dockerfile new file mode 100644 index 0000000..53e7e92 --- /dev/null +++ b/containers/Scripts.dockerfile @@ -0,0 +1,30 @@ +FROM mambaorg/micromamba:latest + +COPY ./ViroConstrictor/workflow/envs/Scripts.yaml /install.yml +COPY ./ViroConstrictor/workflow/files/ /files/ +COPY ./ViroConstrictor/workflow/wrappers/ /wrappers/ +COPY ./ViroConstrictor/workflow/scripts/ /scripts/ + +LABEL org.opencontainers.image.description="Supplementary scripts for the ViroConstrictor workflow." 
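+# /scripts, /files and /wrappers mirror the workflow's srcdir() layout, so containerized rules can reference e.g. /scripts/filter_bed_input.py directly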
+ +USER root + +ARG UID=10001 +RUN adduser \ + --disabled-password \ + --gecos "" \ + --home "/nonexistent" \ + --shell "/sbin/nologin" \ + --no-create-home \ + --uid "${UID}" \ + appuser + +RUN micromamba install -q -y -n base git -c conda-forge && \ + micromamba install -q -y -n base -f /install.yml && \ + micromamba clean -q --all --yes + +USER appuser + +ENV PATH=/opt/conda/bin:$PATH + +CMD ["@"] From 347317bcd69fec8982072f63bcc4924f6e2324a7 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 20:13:42 +0200 Subject: [PATCH 03/22] build(dependencies): Add a separate conda recipe and container for supplementary workflow scripts --- ViroConstrictor/workflow/envs/Scripts.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 ViroConstrictor/workflow/envs/Scripts.yaml diff --git a/ViroConstrictor/workflow/envs/Scripts.yaml b/ViroConstrictor/workflow/envs/Scripts.yaml new file mode 100644 index 0000000..4eb9aa2 --- /dev/null +++ b/ViroConstrictor/workflow/envs/Scripts.yaml @@ -0,0 +1,13 @@ +name: Custom_scripts +channels: + - bioconda + - conda-forge + - intel + - nodefaults +dependencies: + - python>=3.10 + - pandas + - pysam + - biopython + - fastqc + - aminoextract==0.3.1 \ No newline at end of file From f455445fb0ead3880253028dc82277fbe694c86b Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 20:38:11 +0200 Subject: [PATCH 04/22] feat: add container support to main workflow fix: split rules to separate dependencies --- ViroConstrictor/workflow/workflow.smk | 119 +++++++++++++++++++------- 1 file changed, 88 insertions(+), 31 deletions(-) diff --git a/ViroConstrictor/workflow/workflow.smk b/ViroConstrictor/workflow/workflow.smk index 744e055..72e0bcb 100644 --- a/ViroConstrictor/workflow/workflow.smk +++ b/ViroConstrictor/workflow/workflow.smk @@ -9,6 +9,7 @@ import numpy as np from directories import * from presets import get_preset_parameter +from containers import get_hash from Bio import SeqIO import AminoExtract from snakemake.utils import Paramspace, min_version @@ -172,7 +173,7 @@ rule prepare_refs: rule prepare_primers: input: - prm=lambda wc: SAMPLES[wc.sample]["PRIMERS"], + prm=lambda wc: "" if SAMPLES[wc.sample]["PRIMERS"].endswith(".bed") else SAMPLES[wc.sample]["PRIMERS"], ref=rules.prepare_refs.output, output: bed=f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed", @@ -184,22 +185,42 @@ rule prepare_primers: f"{logdir}{bench}prepare_primers_" "{Virus}.{RefID}.{sample}.txt" params: pr_mm_rate=lambda wc: SAMPLES[wc.sample]["PRIMER-MISMATCH-RATE"], - script=srcdir("scripts/filter_bed_input.py"), conda: f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" shell: """ - if [[ {input.prm} == *.bed ]]; then - python {params.script} {input.prm} {output.bed} {wildcards.RefID} - else - python -m AmpliGone.fasta2bed \ - --primers {input.prm} \ - --reference {input.ref} \ - --output {output.bed} \ - --primer-mismatch-rate {params.pr_mm_rate} > {log} - fi + python -m AmpliGone.fasta2bed \ + --primers {input.prm} \ + --reference {input.ref} \ + --output {output.bed} \ + --primer-mismatch-rate {params.pr_mm_rate} > {log} """ +ruleorder: prepare_primers > filter_primer_bed + +rule filter_primer_bed: + input: + prm=lambda wc: SAMPLES[wc.sample]["PRIMERS"], + output: + bed=f"{datadir}{wc_folder}{prim}" "{sample}_primers.bed", + resources: + mem_mb=low_memory_job, + log: + f"{logdir}prepare_primers_" "{Virus}.{RefID}.{sample}.log", + benchmark: + 
f"{logdir}{bench}prepare_primers_" "{Virus}.{RefID}.{sample}.txt" + params: + script=srcdir("scripts/filter_bed_input.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/filter_bed_input.py", + conda: + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" + shell: + """ + python {params.script} {input.prm} {output.bed} {wildcards.RefID} + """ rule prepare_gffs: input: @@ -212,11 +233,13 @@ rule prepare_gffs: benchmark: f"{logdir}{bench}prepare_gffs_" "{Virus}.{RefID}.{sample}.txt" conda: - f"{conda_envs}ORF_analysis.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" resources: mem_mb=low_memory_job, params: - script=srcdir("scripts/extract_gff.py"), + script=srcdir("scripts/extract_gff.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/extract_gff.py", shell: """ python {params.script} {input.feats} {output.gff} {wildcards.RefID} @@ -240,6 +263,8 @@ rule prodigal: threads: config["threads"]["Index"] conda: f"{conda_envs}ORF_analysis.yaml" + container: + f"{config['container_cache']}/viroconstrictor_orf_analysis_{get_hash('ORF_analysis')}.sif" resources: mem_mb=medium_memory_job, params: @@ -268,7 +293,9 @@ if config["platform"] in ["nanopore", "iontorrent"]: html=f"{datadir}{qc_pre}" "{sample}_fastqc.html", zip=f"{datadir}{qc_pre}" "{sample}_fastqc.zip", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" log: f"{logdir}QC_raw_data_" "{sample}.log", benchmark: @@ -278,7 +305,7 @@ if config["platform"] in ["nanopore", "iontorrent"]: mem_mb=low_memory_job, params: output_dir=f"{datadir}{qc_pre}", - script=srcdir("scripts/fastqc_wrapper.sh"), + script=srcdir("wrappers/fastqc_wrapper.sh") if config["use-conda"] is True and config["use-singularity"] is False else "/wrappers/fastqc_wrapper.sh", shell: """ bash {params.script} {input} {params.output_dir} {output.html} {output.zip} {log} @@ -293,6 +320,8 @@ if config["platform"] in ["nanopore", "iontorrent"]: index=f"{datadir}{wc_folder}{cln}{raln}" "{sample}.bam.bai", conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}RemoveAdapters_p1_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -329,7 +358,9 @@ if config["platform"] == "illumina": html=f"{datadir}{qc_pre}" "{sample}_{read}_fastqc.html", zip=f"{datadir}{qc_pre}" "{sample}_{read}_fastqc.zip", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" log: f"{logdir}" "QC_raw_data_{sample}_{read}.log", benchmark: @@ -339,7 +370,7 @@ if config["platform"] == "illumina": mem_mb=low_memory_job, params: output_dir=f"{datadir}{qc_pre}", - script=srcdir("scripts/fastqc_wrapper.sh"), + script=srcdir("wrappers/fastqc_wrapper.sh") if config["use-conda"] is True and config["use-singularity"] is False else "/wrappers/fastqc_wrapper.sh", shell: """ bash {params.script} {input} {params.output_dir} {output.html} {output.zip} {log} @@ -355,6 +386,8 @@ if config["platform"] == "illumina": index=f"{datadir}{wc_folder}{cln}{raln}" "{sample}.bam.bai", conda: f"{conda_envs}Alignment.yaml" + container: + 
f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}" "RemoveAdapters_p1_{Virus}.{RefID}.{sample}.log", benchmark: @@ -387,12 +420,14 @@ rule remove_adapters_p2: output: f"{datadir}{wc_folder}{cln}{noad}" "{sample}.fastq", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: config["threads"]["AdapterRemoval"] resources: mem_mb=low_memory_job, params: - script=srcdir("scripts/clipper.py"), + script=srcdir("scripts/clipper.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/clipper.py", clipper_settings=lambda wc: get_preset_parameter( preset_name=SAMPLES[wc.sample]["PRESET"], parameter_name=f"ClipperSettings_{config['platform']}", @@ -412,6 +447,8 @@ rule qc_filter: json=f"{datadir}{wc_folder}{cln}{qcfilt}{json}" "{sample}_fastp.json", conda: f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" log: f"{logdir}QC_filter_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -453,6 +490,8 @@ rule ampligone: ep=f"{datadir}{wc_folder}{prim}" "{sample}_removedprimers.bed", conda: f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" log: f"{logdir}" "AmpliGone_{Virus}.{RefID}.{sample}.log", benchmark: @@ -503,7 +542,9 @@ rule qc_clean: html=f"{datadir}{wc_folder}{qc_post}" "{sample}_fastqc.html", zip=f"{datadir}{wc_folder}{qc_post}" "{sample}_fastqc.zip", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" log: f"{logdir}QC_clean_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -513,7 +554,7 @@ rule qc_clean: mem_mb=low_memory_job, params: outdir=f"{datadir}{wc_folder}{qc_post}", - script=srcdir("scripts/fastqc_wrapper.sh"), + script=srcdir("wrappers/fastqc_wrapper.sh") if config["use-conda"] is True and config["use-singularity"] is False else "/wrappers/fastqc_wrapper.sh", shell: """ bash {params.script} {input} {params.outdir} {output.html} {output.zip} {log} @@ -538,6 +579,8 @@ rule align_before_trueconsense: index=f"{datadir}{wc_folder}{aln}{bf}" "{sample}.bam.bai", conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}Alignment_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -579,6 +622,8 @@ rule trueconsense: mincov=lambda wc: SAMPLES[wc.sample]["MIN-COVERAGE"], conda: f"{conda_envs}Consensus.yaml" + container: + f"{config['container_cache']}/viroconstrictor_consensus_{get_hash('Consensus')}.sif" log: f"{logdir}Consensus_" "{Virus}.{RefID}.{sample}.log", benchmark: @@ -632,6 +677,8 @@ rule Translate_AminoAcids: f"{datadir}{wc_folder}{amino}" "{sample}/aa.faa", conda: f"{conda_envs}ORF_analysis.yaml" + container: + f"{config['container_cache']}/viroconstrictor_orf_analysis_{get_hash('ORF_analysis')}.sif" resources: mem_mb=low_memory_job, log: @@ -694,10 +741,12 @@ rule concat_aminoacids: resources: mem_mb=low_memory_job, conda: - f"{conda_envs}ORF_analysis.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: 1 params: - script=srcdir("scripts/group_aminoacids.py"), + script=srcdir("scripts/group_aminoacids.py") if config["use-conda"] is True and config["use-singularity"] is False 
else "/scripts/group_aminoacids.py", shell: 'python {params.script} "{input.files}" "{output}" {input.sampleinfo}' @@ -708,14 +757,16 @@ rule vcf_to_tsv: output: tsv=temp(f"{datadir}{wc_folder}{aln}{vf}" "{sample}.tsv"), conda: - f"{conda_envs}Consensus.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: config["threads"]["Index"] resources: mem_mb=low_memory_job, log: f"{logdir}" "vcf_to_tsv_{Virus}.{RefID}.{sample}.log", params: - script=srcdir("scripts/vcf_to_tsv.py"), + script=srcdir("scripts/vcf_to_tsv.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/vcf_to_tsv.py", shell: """ python {params.script} {input.vcf} {output.tsv} {wildcards.sample} >> {log} 2>&1 @@ -737,7 +788,7 @@ rule concat_tsv_coverages: cat {input} >> {output} """ - +#todo: this should be in an environment, why isn't it? rule get_breadth_of_coverage: input: reference=rules.prepare_refs.output, @@ -783,9 +834,11 @@ rule calculate_amplicon_cov: resources: mem_mb=low_memory_job, conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" params: - script=srcdir("scripts/amplicon_covs.py"), + script=srcdir("scripts/amplicon_covs.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/amplicon_covs.py", shell: """ python {params.script} \ @@ -808,9 +861,11 @@ rule concat_amplicon_cov: resources: mem_mb=low_memory_job, conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" params: - script=srcdir("scripts/concat_amplicon_covs.py"), + script=srcdir("scripts/concat_amplicon_covs.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/concat_amplicon_covs.py", shell: """ python {params.script} --output {output} --input {input} @@ -861,6 +916,8 @@ rule multiqc_report: expand(f"{res}{mqc_data}multiqc_" "{program}.txt", program="fastqc"), conda: f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" log: f"{logdir}MultiQC_report.log", benchmark: @@ -868,7 +925,7 @@ rule multiqc_report: resources: mem_mb=high_memory_job, params: - conffile=srcdir("files/multiqc_config.yaml"), + conffile=srcdir("files/multiqc_config.yaml") if config["use-conda"] is True and config["use-singularity"] is False else "/files/multiqc_config.yaml", outdir=res, shell: """ From c29495f1d7af13de7a1dc414a3f1d85e42545ef8 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 20:40:11 +0200 Subject: [PATCH 05/22] feat: add containerization support to workflow fix: split combined job to separate script-based job for better dependency management --- ViroConstrictor/workflow/match_ref.smk | 84 +++++++++++++++++++------- 1 file changed, 62 insertions(+), 22 deletions(-) diff --git a/ViroConstrictor/workflow/match_ref.smk b/ViroConstrictor/workflow/match_ref.smk index 902d52e..dcbcbb5 100644 --- a/ViroConstrictor/workflow/match_ref.smk +++ b/ViroConstrictor/workflow/match_ref.smk @@ -6,14 +6,18 @@ import logging import pandas as pd from presets import get_preset_parameter +from containers import get_hash from Bio import SeqIO, SeqRecord from snakemake.utils import Paramspace, min_version +import snakemake from directories import * +from rich import print min_version("7.15") +# print(config) 
logger = logging.getLogger() logger.handlers.clear() @@ -188,6 +192,8 @@ if config["platform"] in ["nanopore", "iontorrent"]: index=temp(f"{datadir}{matchref}{wc_folder}" "{sample}.bam.bai"), conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}AlignMR_" "{Virus}.{segment}.{sample}.log", benchmark: @@ -236,6 +242,8 @@ if config["platform"] == "illumina": index=temp(f"{datadir}{matchref}{wc_folder}" "{sample}.bam.bai"), conda: f"{conda_envs}Alignment.yaml" + container: + f"{config['container_cache']}/viroconstrictor_alignment_{get_hash('Alignment')}.sif" log: f"{logdir}" "AlignMR_{Virus}.{segment}.{sample}.log", benchmark: @@ -278,14 +286,16 @@ rule count_mapped_reads: output: temp(f"{datadir}{matchref}{wc_folder}" "{sample}_count.csv"), conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: 1 resources: mem=low_memory_job, log: f"{logdir}CountMR_" "{Virus}.{segment}.{sample}.log", params: - script=srcdir("scripts/match_ref/count_mapped_reads.py"), + script=srcdir("scripts/match_ref/count_mapped_reads.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/count_mapped_reads.py", shell: """ python {params.script} {input.bam} {output} >> {log} 2>&1 @@ -300,14 +310,16 @@ rule filter_best_matching_ref: filtref=temp(f"{datadir}{matchref}{wc_folder}" "{sample}_best_ref.fasta"), filtcount=temp(f"{datadir}{matchref}{wc_folder}" "{sample}_best_ref.csv"), conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: 1 resources: mem=low_memory_job, log: f"{logdir}FilterBR_" "{Virus}.{segment}.{sample}.log", params: - script=srcdir("scripts/match_ref/filter_best_matching_ref.py"), + script=srcdir("scripts/match_ref/filter_best_matching_ref.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/filter_best_matching_ref.py", shell: """ python {params.script} {input.stats} {input.ref} {output.filtref} {output.filtcount} >> {log} 2>&1 @@ -342,14 +354,16 @@ rule group_and_rename_refs: groupedrefs=f"{datadir}{matchref}" "{sample}_refs.fasta", groupedstats=temp(f"{datadir}{matchref}" "{sample}_refs.csv"), conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" threads: 1 resources: mem=low_memory_job, log: f"{logdir}GroupRefs_" "{sample}.log", params: - script=srcdir("scripts/match_ref/group_refs.py"), + script=srcdir("scripts/match_ref/group_refs.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/group_refs.py", shell: """ python {params.script} "{input.ref}" "{input.stats}" {output.groupedrefs} {output.groupedstats} {wildcards.sample} >> {log} 2>&1 @@ -368,8 +382,12 @@ rule filter_gff: mem=low_memory_job, log: f"{logdir}FilterGFF_" "{sample}.log", + conda: + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" params: - script=srcdir("scripts/match_ref/filter_gff.py"), + script=srcdir("scripts/match_ref/filter_gff.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/filter_gff.py", shell: """ python {params.script} {input.refdata} 
{input.gff} {output.gff} {output.groupedstats} >> {log} 2>&1 @@ -392,11 +410,42 @@ rule touch_gff: touch {output.gff} """ +rule filter_fasta2bed: + input: + ref=rules.group_and_rename_refs.output.groupedrefs, + refdata=rules.filter_gff.output.groupedstats, + prm=lambda wc: "" if SAMPLES[wc.sample]["PRIMERS"].endswith(".bed") else SAMPLES[wc.sample]["PRIMERS"], + output: + bed=f"{datadir}{matchref}" "{sample}_primers.bed", + groupedstats=temp(f"{datadir}{matchref}" "{sample}_data2.csv"), + conda: + f"{conda_envs}Clean.yaml" + container: + f"{config['container_cache']}/viroconstrictor_clean_{get_hash('Clean')}.sif" + threads: 1 + resources: + mem_mb=low_memory_job, + log: + f"{logdir}Fasta2Bed_" "{sample}.log", + params: + pr_mm_rate=lambda wc: SAMPLES[wc.sample]["PRIMER-MISMATCH-RATE"], + shell: + """ + python -m AmpliGone.fasta2bed \ + --primers {input.prm} \ + --reference {input.ref} \ + --output {output.bed} \ + --primer-mismatch-rate {params.pr_mm_rate} > {log} + awk -F ',' -v OFS=',' '{{ $(NF+1) = (NR==1 ? "Primer_file" : "{output.bed}"); print }}' {input.refdata} > {output.groupedstats} + """ + +ruleorder: filter_fasta2bed > filter_bed > touch_primers + + rule filter_bed: input: prm=lambda wc: SAMPLES[wc.sample]["PRIMERS"], refdata=rules.filter_gff.output.groupedstats, - ref=rules.group_and_rename_refs.output.groupedrefs, output: bed=f"{datadir}{matchref}" "{sample}_primers.bed", groupedstats=temp(f"{datadir}{matchref}" "{sample}_data2.csv"), @@ -406,25 +455,16 @@ rule filter_bed: log: f"{logdir}FilterBed_" "{sample}.log", params: - pr_mm_rate=lambda wc: SAMPLES[wc.sample]["PRIMER-MISMATCH-RATE"], - script=srcdir("scripts/match_ref/filter_bed.py"), + script=srcdir("scripts/match_ref/filter_bed.py") if config["use-conda"] is True and config["use-singularity"] is False else "/scripts/match_ref/filter_bed.py", conda: - f"{conda_envs}Clean.yaml" + f"{conda_envs}Scripts.yaml" + container: + f"{config['container_cache']}/viroconstrictor_scripts_{get_hash('Scripts')}.sif" shell: """ - if [[ {input.prm} == *.bed ]]; then - python {params.script} {input.prm} {input.refdata} {output.bed} {output.groupedstats} - else - python -m AmpliGone.fasta2bed \ - --primers {input.prm} \ - --reference {input.ref} \ - --output {output.bed} \ - --primer-mismatch-rate {params.pr_mm_rate} > {log} - awk -F"," 'BEGIN {{ OFS = "," }} {{$6="{output.bed}"; print}}' {input.refdata} > {output.groupedstats} - fi + python {params.script} {input.prm} {input.refdata} {output.bed} {output.groupedstats} """ -ruleorder: filter_bed > touch_primers rule touch_primers: input: From d4e1fc97d96093d5bcd2afeda67bfe829fd170f5 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 20:40:54 +0200 Subject: [PATCH 06/22] feat: add primary container management functions to viroconstrictor wrapper --- ViroConstrictor/workflow/containers.py | 173 +++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 ViroConstrictor/workflow/containers.py diff --git a/ViroConstrictor/workflow/containers.py b/ViroConstrictor/workflow/containers.py new file mode 100644 index 0000000..76b6a89 --- /dev/null +++ b/ViroConstrictor/workflow/containers.py @@ -0,0 +1,173 @@ +import hashlib +import os +import subprocess +from typing import Any, Dict, List, Tuple + +from rich import print + +from ViroConstrictor import __prog__ + +upstream_registry = "ghcr.io/rivm-bioinformatics" + + +def fetch_recipes(recipe_folder: str) -> List[str]: + return [ + os.path.abspath(os.path.join(recipe_folder, file)) + for file in 
os.listdir(recipe_folder) + if file.endswith(".yaml") + ] + + +def fetch_scripts(script_folder: str) -> List[str]: + script_files = [] + for root, dirs, files in os.walk(script_folder): + script_files.extend( + os.path.abspath(os.path.join(root, file)) + for file in files + if file.endswith(".py") + ) + return script_files + + +def fetch_files(file_folder: str) -> List[str]: + return [ + os.path.abspath(os.path.join(file_folder, file)) + for file in os.listdir(file_folder) + ] + + +def fetch_hashes() -> dict[str, str]: + recipe_files = sorted( + fetch_recipes(f"{os.path.dirname(os.path.realpath(__file__))}/envs/") + ) + script_files = sorted( + fetch_scripts(f"{os.path.dirname(os.path.realpath(__file__))}/scripts/") + ) + config_files = sorted( + fetch_files(f"{os.path.dirname(os.path.realpath(__file__))}/files/") + ) + + script_hashes = {} + for script_file in script_files: + with open(script_file, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest()[:6] + script_hashes[script_file] = file_hash + + config_hashes = {} + for config_file in config_files: + with open(config_file, "rb") as f: + file_hash = hashlib.sha256(f.read()).hexdigest()[:6] + config_hashes[config_file] = file_hash + + # sort the hashes of the scripts and the configs + script_hashes = dict(sorted(script_hashes.items())) + config_hashes = dict(sorted(config_hashes.items())) + + # join the hashes of the scripts and the configs (the values of the dictionaries), make a new hash of the joined hashes + merged_hashes = hashlib.sha256( + "".join(list(script_hashes.values()) + list(config_hashes.values())).encode() + ).hexdigest()[:6] + + hashes = {} + for recipe_file in recipe_files: + with open(recipe_file, "rb") as f: + recipe_hash = hashlib.sha256(f.read()).hexdigest()[:6] + if os.path.basename(recipe_file).split(".")[0] != "Scripts": + hashes[recipe_file] = recipe_hash + continue + # add the merged hash to the recipe hash and make a new hash of the joined hashes + file_hash = hashlib.sha256( + (recipe_hash + merged_hashes).encode() + ).hexdigest()[:6] + hashes[recipe_file] = file_hash + return hashes + + +def get_hash(target_container: str) -> str | None: + hashes = fetch_hashes() + return next( + (hash for recipe, hash in hashes.items() if target_container in recipe), + None, + ) + + +def containerization_installed() -> bool: + if os.system("which apptainer") == 0: + return True + return os.system("which singularity") == 0 + + +def containers_to_download(config: Dict[str, Any]) -> List[str]: + recipes = fetch_hashes() + + # check the recipes dict and create a list of required containers for the workflow. the list must contain strings that look like "viroconstrictor_alignment_{hash}" "viroconstrictor_clean_{hash}" etc. 
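+    # e.g. envs/Clean.yaml with recipe hash "1a2b3c" yields "viroconstrictor_clean_1a2b3c",
+    # matching the "viroconstrictor_clean_{get_hash('Clean')}.sif" filenames referenced by the workflow rules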
+ required_containers = [] + for key, val in recipes.items(): + recipe_basename = os.path.basename(key).replace(".yaml", "") + required_containers.append(f"{__prog__}_{recipe_basename}_{val}".lower()) + + # check if the required containers are already present in the directory listed in config["container_cache"] + # if they are, remove them from the list + containers_present = os.listdir(config["container_cache"]) + + # remove the .sif extension from the container names + containers_present = [item.split(".")[0] for item in containers_present] + + containers_to_download = [] + for container in required_containers: + if container in containers_present: + continue + containers_to_download.append(container) + return containers_to_download + + +def containerization_executable() -> str: + return ( + "apptainer" + if subprocess.call( + "which apptainer", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + == 0 + else "singularity" + ) + + +def download_containers(config: Dict[str, Any]) -> int: + to_download = containers_to_download(config) + to_download = [x.rsplit("_", 1)[0] + ":" + x.rsplit("_", 1)[1] for x in to_download] + + # I thought this would be a great place to use concurrent.futures to download the containers in parallel + # however this has unintended consequences for the various OCI layers resulting in at least one container not being downloaded correctly. + # Thus it's better for now to download the containers sequentially. + for container in to_download: + print(f"Downloading {container}") + executable = containerization_executable() + status = subprocess.call( + f"{executable} pull --dir {config['container_cache']} docker://{upstream_registry}/{container}", + shell=True, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + if status != 0: + print(f"Failed to download {container}") + return 1 + print(f"Successfully downloaded {container}") + + return 0 + + +def construct_container_bind_args(samples_dict: Dict) -> str: + paths = [] + for nested_dict in samples_dict.items(): + paths.extend( + f"{os.path.dirname(value)}" + for value in nested_dict.values() + if isinstance(value, str) and os.path.exists(value) + ) + # remove all duplicates from the paths list by making it a set + # for every item in the set, add '--bind ' + # join all the items together to make a long string + return " ".join([f"--bind {path}" for path in set(paths)]) From 41bd864062d1310a9a754f5acbdbe6a880a424c7 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 20:49:28 +0200 Subject: [PATCH 07/22] ci: add github actions workflow for automatic building of containers ci: add supplementary scripts for container build process --- .github/workflows/build_and_test.yml | 95 +++++++++++++ containers/add_OCI_to_docker_engine.py | 21 +++ containers/build_containers.py | 128 ++++++++++++++++++ ...nvert_artifact_containers_for_apptainer.py | 30 ++++ containers/tag_and_push_containers.py | 21 +++ 5 files changed, 295 insertions(+) create mode 100644 .github/workflows/build_and_test.yml create mode 100644 containers/add_OCI_to_docker_engine.py create mode 100644 containers/build_containers.py create mode 100644 containers/convert_artifact_containers_for_apptainer.py create mode 100644 containers/tag_and_push_containers.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml new file mode 100644 index 0000000..e8f6a02 --- /dev/null +++ b/.github/workflows/build_and_test.yml @@ -0,0 +1,95 @@ +name: build containers and run tests + +on: + 
pull_request: + branches: + - 'main' + +jobs: + Setup_and_build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Setup Mamba + uses: mamba-org/setup-micromamba@v1 + with: + cache-environment: true + post-cleanup: 'all' + environment-file: env.yml + init-shell: bash + + - name: Install local python package + run: | + pip install . --no-deps + shell: micromamba-shell {0} + + - name: build containers + run: | + python containers/build_containers.py + env: + TOKEN: ${{ secrets.GITHUB_TOKEN }} + shell: micromamba-shell {0} + + - name: zip built containers + run: | + cd ./containers/ + tar -czvf containers.tar.gz builtcontainers.json $(find . -type f -name "*.tar" -printf '%f ') + + + - name: Upload container artifacts + uses: actions/upload-artifact@v3 + with: + name: built_containers + path: ./containers/containers.tar.gz + + Test: + runs-on: ubuntu-latest + needs: Setup_and_build + steps: + - uses: actions/checkout@v3 + + - uses: actions/download-artifact@v3 + with: + name: built_containers + + - name: move artifact + run: | + mv ./containers.tar.gz ./containers/containers.tar.gz + + - name: list directory contents + run: | + ls -R ./ + + - name: unzip built containers + run: | + cd ./containers/ + tar -xzvf containers.tar.gz + cd .. + + - name: Setup Apptainer + uses: eWaterCycle/setup-apptainer@v2 + + - name: Setup Mamba + uses: mamba-org/setup-micromamba@v1 + with: + cache-environment: true + post-cleanup: 'all' + environment-file: env.yml + init-shell: bash + + - name: Install local python package + run: | + pip install . --no-deps + shell: micromamba-shell {0} + + - name: list directory contents + run: | + ls -R ./ + + - name: convert containers + run: | + python containers/convert_artifact_containers_for_apptainer.py + + # download already built containers + ## rest of the testing suite here \ No newline at end of file diff --git a/containers/add_OCI_to_docker_engine.py b/containers/add_OCI_to_docker_engine.py new file mode 100644 index 0000000..3eac792 --- /dev/null +++ b/containers/add_OCI_to_docker_engine.py @@ -0,0 +1,21 @@ +import json +import subprocess +from typing import List + +base_path_to_container_defs = "./containers" + +if __name__ == "__main__": + print("Adding OCI containers to Docker Engine from Artifact") + + builtcontainers: List = [] + with open(f"{base_path_to_container_defs}/builtcontainers.json", "r") as f: + builtcontainers: List = json.load(f) + + for container in builtcontainers: + print(f"Adding {container} to local Docker Engine") + subprocess.run( + f"docker image load -i {base_path_to_container_defs}/{container}.tar", + shell=True, + ) + + print("Done adding OCI containers to Docker Engine") diff --git a/containers/build_containers.py b/containers/build_containers.py new file mode 100644 index 0000000..7036bbe --- /dev/null +++ b/containers/build_containers.py @@ -0,0 +1,128 @@ +import json +import os +import subprocess +import tempfile + +import requests + +from ViroConstrictor import __prog__ +from ViroConstrictor.workflow.containers import fetch_hashes, upstream_registry + +base_path_to_container_defs = "./containers" +upstream_api_endpoint = ( + "https://api.github.com/orgs/RIVM-bioinformatics/packages/container/" +) +upstream_api_authtoken = os.environ.get("TOKEN") +upstream_api_responsetype = "application/vnd.github+json" +upstream_api_version = "2022-11-28" +upstream_api_headers = { + "Accept": f"{upstream_api_responsetype}", + "X-GitHub-Api-Version": f"{upstream_api_version}", + "Authorization": f"Bearer 
{upstream_api_authtoken}", +} + + +if __name__ == "__main__": + print("Start of container building process for ViroConstrictor") + recipe_hashes = fetch_hashes() + + builtcontainers = [] + for recipe, VersionHash in recipe_hashes.items(): + # strip the name of the recipe to only get the name of the environment + recipe_basename = os.path.basename(recipe).replace(".yaml", "") + container_basename = f"{__prog__}_{recipe_basename}".lower() + + associated_container_def_file = os.path.join( + base_path_to_container_defs, f"{recipe_basename}.dockerfile" + ) + upstream_registry_url = f"{upstream_registry}/{recipe_basename}:{VersionHash}" + upstream_existing_containers = ( + f"{upstream_api_endpoint}{__prog__}_{recipe_basename}/versions" + ) + print( + f"Checking if container '{container_basename}' with hash '{VersionHash}' exists in the upstream registry" + ) + json_response = requests.get( + upstream_existing_containers, headers=upstream_api_headers + ).json() + + tags = [] + + # if the container exists at all in the upstream registry, the json response will be a list. + # If the container does not exist, the json response will be a dict with a message that the container does not exist. + # You can therefore check if the json response is a list or a dict to see if the container exists or not. + if isinstance(json_response, list): + # json_response = json.loads(json_response) + tags = [ + version["metadata"]["container"]["tags"] for version in json_response + ] + # flatten the list of tags + tags = [tag for sublist in tags for tag in sublist] + print(tags) + + if VersionHash in tags: + print( + f"Container '{container_basename}' with hash '{VersionHash}' already exists in the upstream registry" + ) + continue + + print( + f"Container '{container_basename}' with hash '{VersionHash}' does not exist in the upstream registry" + ) + print( + f"Starting Apptainer build process for container '{container_basename}:{VersionHash}'" + ) + + # create a temporary file to write the container definition to, copy the contents of {recipe_basename}.def to it and then append the labels section to it including the version hash + # then use the temporary file as the container definition file for the apptainer build process + # the apptainer build process will build the .sif container file also in a temporary directory + # after the container is built, the built container file will be moved to the current working directory and the temporary directory will be deleted. + # the container file will not be pushed to the upstream registry yet, this will be done in a separate script after all containers have been built and tested. 
+ with tempfile.NamedTemporaryFile( + mode="w", delete=False + ) as tmp, tempfile.TemporaryDirectory() as tmpdir: + with open(associated_container_def_file, "r") as f: + tmp.write(f.read()) + tmp.write( + f""" + +LABEL Author="RIVM-bioinformatics team" +LABEL Maintainer="RIVM-bioinformatics team" +LABEL Associated_pipeline="{__prog__}" +LABEL version="{VersionHash}" +LABEL org.opencontainers.image.authors="ids-bioinformatics@rivm.nl" +LABEL org.opencontainers.image.source=https://github.com/RIVM-bioinformatics/{__prog__} + + """ + ) + tmp.flush() # flush the temporary file to make sure the contents are written to disk + subprocess.run( + [ + "docker", + "build", + "-t", + f"{container_basename}:{VersionHash}", + "-f", + f"{tmp.name}", + ".", + "--network", + "host", + "--no-cache", + ], + check=True, + ) + # move the container file to the current working directory + subprocess.run( + [ + "docker", + "save", + "-o", + f"{base_path_to_container_defs}/{container_basename}:{VersionHash}.tar", + f"{container_basename}:{VersionHash}", + ] + ) + + builtcontainers.append(f"{container_basename}:{VersionHash}") + + with open(f"{base_path_to_container_defs}/builtcontainers.json", "w") as f: + json.dump(builtcontainers, f, indent=4) diff --git a/containers/convert_artifact_containers_for_apptainer.py b/containers/convert_artifact_containers_for_apptainer.py new file mode 100644 index 0000000..0af5568 --- /dev/null +++ b/containers/convert_artifact_containers_for_apptainer.py @@ -0,0 +1,30 @@ +import json +import shutil +import subprocess +from typing import List + +base_path_to_container_defs = "./containers" + +if __name__ == "__main__": + print("Renaming OCI containers and converting to Apptainer .sif format") + + builtcontainers: List = [] + with open(f"{base_path_to_container_defs}/builtcontainers.json", "r") as f: + builtcontainers: List = json.load(f) + + builtcontainers_trimmed = [ + container.replace(":", "_") for container in builtcontainers + ] + + for original_name, trimmed_name in zip(builtcontainers, builtcontainers_trimmed): + print(f"Renaming {original_name} to {trimmed_name}") + shutil.move( + f"{base_path_to_container_defs}/{original_name}.tar", + f"{base_path_to_container_defs}/{trimmed_name}.tar", + ) + trimmed_name_sif = trimmed_name.replace(":", "_") + print(f"Converting {original_name} to Apptainer .sif format") + subprocess.run( + f"apptainer build {base_path_to_container_defs}/{trimmed_name_sif}.sif docker-archive://{base_path_to_container_defs}/{trimmed_name}.tar", + shell=True, + ) diff --git a/containers/tag_and_push_containers.py b/containers/tag_and_push_containers.py new file mode 100644 index 0000000..6adbc2a --- /dev/null +++ b/containers/tag_and_push_containers.py @@ -0,0 +1,21 @@ +import json +import subprocess +from typing import List + +# main_upstream_registry = "ghcr.io/rivm-bioinformatics" +main_upstream_registry = "ghcr.io/florianzwagemaker" + +base_path_to_container_defs = "./containers" + +if __name__ == "__main__": + + built_containers: List[str] = [] + with open(f"{base_path_to_container_defs}/builtcontainers.json", "r") as f: + built_containers: List[str] = json.load(f) + + for container in built_containers: + print(f"Tagging and pushing {container}") + subprocess.run( + f"docker tag {container} {main_upstream_registry}/{container}", shell=True + ) + subprocess.run(f"docker push {main_upstream_registry}/{container}", shell=True) From aa5aec6613d4724277dbe94f9ad584570b98dab5 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 20:51:29 
+0200 Subject: [PATCH 08/22] fix: change the pathcompleter to a list comprehension instead of using list() to fix compatibility of root-paths --- ViroConstrictor/functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ViroConstrictor/functions.py b/ViroConstrictor/functions.py index 5fd05fc..4c23887 100644 --- a/ViroConstrictor/functions.py +++ b/ViroConstrictor/functions.py @@ -157,7 +157,8 @@ def pathCompleter(self, text: str, state: int) -> str: if os.path.isdir(text): text += "/" - return list(glob.glob(f"{text}*"))[state] + # we explicitly to a list comprehension here instead of a call to the constructor as the this would otherwise break the autocompletion functionality of paths. + return [x for x in glob.glob(f"{text}*")][state] def createListCompleter(self, ll: list[str]) -> None: """ From e67bb273004b735346114cabd234b923d1f05c34 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 21:06:52 +0200 Subject: [PATCH 09/22] feat: add new global userprofile section for reproducibility method (conda or containers) fix: properly set the readline tabcompleter --- ViroConstrictor/userprofile.py | 49 ++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/ViroConstrictor/userprofile.py b/ViroConstrictor/userprofile.py index df1885a..f3ca826 100644 --- a/ViroConstrictor/userprofile.py +++ b/ViroConstrictor/userprofile.py @@ -16,6 +16,7 @@ from ViroConstrictor.functions import tabCompleter from ViroConstrictor.logging import log +from ViroConstrictor.workflow.containers import containerization_installed def FileExists(file: pathlib.Path) -> bool: @@ -43,7 +44,11 @@ def FileIsPopulated(file: pathlib.Path) -> bool: def AskPrompts( - intro: str, prompt: str, options: list, fixedchoices: bool = False + intro: str, + prompt: str, + options: list, + fixedchoices: bool = False, + default: str = None, ) -> str | None: """This function is used to ask the user a question and provide a list of options to choose from. A free-text user reply is also possible. @@ -72,13 +77,15 @@ def AskPrompts( the reply variable. """ + completer = tabCompleter() if fixedchoices: - completer = tabCompleter() completer.createListCompleter(options) - readline.set_completer_delims("\t") readline.parse_and_bind("tab: complete") readline.set_completer(completer.listCompleter) + else: + readline.parse_and_bind("tab: complete") + readline.set_completer(completer.pathCompleter) subprocess.call("/bin/clear", shell=False) @@ -101,8 +108,8 @@ def AskPrompts( reply = input(prompt).strip() if reply == "quit": sys.exit(-1) - else: - return reply + # if reply is empty and a default value is given, return the default value + return default if not reply and default else reply def BuildConfig(file: pathlib.Path) -> None: @@ -133,7 +140,7 @@ def BuildConfig(file: pathlib.Path) -> None: if conf_object["COMPUTING"]["compmode"] == "grid": conf_object["COMPUTING"]["queuename"] = AskPrompts( # type: ignore - f"""Grid mode has been chosen. Please enter the name of computing-queue that you wish to use on your grid/HPC cluster.\nThis is necessary so ViroConstrictor will send all the various tasks to the correct (remote) computers.\n\n[bold underline yellow]Please note that this is case-sensitive[/bold underline yellow]\n""", + """Grid mode has been chosen. 
Please enter the name of computing-queue that you wish to use on your grid/HPC cluster.\nThis is necessary so ViroConstrictor will send all the various tasks to the correct (remote) computers.\n\n[bold underline yellow]Please note that this is case-sensitive[/bold underline yellow]\n""", "Please specify the name of the Queue on your grid/HPC cluster that you wish to use. [free text] ", [], fixedchoices=False, @@ -143,7 +150,7 @@ def BuildConfig(file: pathlib.Path) -> None: "auto_update": AskPrompts( """ViroConstrictor can check and update itself everytime you run it. Please specify whether you wish to enable the auto-update feature.""", - f"""Do you wish to enable the auto-update feature? \[yes/no] """, + """Do you wish to enable the auto-update feature? \[yes/no] """, ["yes", "no"], fixedchoices=True, ) @@ -157,7 +164,22 @@ def BuildConfig(file: pathlib.Path) -> None: fixedchoices=True, ) - subprocess.call("/bin/clear", shell=False) + if containerization_installed is False: + conf_object["REPRODUCTION"] = { + "repro_method": "conda", + "container_cache_path": None, + } + else: + conf_object["REPRODUCTION"] = {"repro_method": "containers"} + conf_object["REPRODUCTION"]["container_cache_path"] = AskPrompts( + f"""ViroConstrictor will use containers to run the various analysis steps in a reproducible manner.\nHowever, to speed up the workflow and to allow offline-use, ViroConstrictor will cache the containers on your local machine.\nThis directory will be used to locally store the containers.\nIf you do not provide a path, the default path will be used. ({str(pathlib.Path.home())}/.viroconstrictor/containers)""", + """Please specify the path to the container cache directory. [free text] """, + [], + fixedchoices=False, + default=f"{str(pathlib.Path.home())}/.viroconstrictor/containers", + ) + + # subprocess.call("/bin/clear", shell=False) with open(file, "w") as conffile: conf_object.write(conffile) @@ -202,6 +224,17 @@ def AllOptionsGiven(config: configparser.ConfigParser) -> bool: else: all_present = False + if config.has_section("REPRODUCTION") is True: + if ( + config.has_option("REPRODUCTION", "repro_method") is True + and config["REPRODUCTION"]["repro_method"] == "containers" + and config.has_option("REPRODUCTION", "container_cache_path") is False + or config.has_option("REPRODUCTION", "repro_method") is not True + ): + all_present = False + else: + all_present = False + return all_present From 7a23f9256976dada06cc3db1945090d6a4204e4b Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 21:10:10 +0200 Subject: [PATCH 10/22] refactor: capture all custom snakemake configuration settings in a the snakemake_run_con object to properly access it from within workflows --- ViroConstrictor/runconfigs.py | 45 ++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/ViroConstrictor/runconfigs.py b/ViroConstrictor/runconfigs.py index 2e510e0..e6c06db 100644 --- a/ViroConstrictor/runconfigs.py +++ b/ViroConstrictor/runconfigs.py @@ -52,28 +52,39 @@ def _snakemake_run_config(self) -> dict[str, Any]: cores = self._set_cores(self.inputs.flags.threads) configuration = self.inputs.user_config compmode = configuration["COMPUTING"]["compmode"] + reproduction_mode = configuration["REPRODUCTION"]["repro_method"] + reproduction_cache_path = configuration["REPRODUCTION"]["container_cache_path"] - if compmode == "grid": - queuename = configuration["COMPUTING"]["queuename"] - # threads = "{threads}" - # mem = "{resources.mem_mb}" - 
self.snakemake_run_conf = { - "cores": 300, - "latency-wait": 60, - "use-conda": True, - "dryrun": self.inputs.flags.dryrun, - "jobname": "ViroConstrictor_{name}.jobid{jobid}", - "drmaa": f' -q {queuename} -n {{threads}} -R "span[hosts=1]" -M {{resources.mem_mb}}', - "drmaa-log-dir": "logs/drmaa", - } - return self.snakemake_run_conf - self.snakemake_run_conf = { - "cores": cores, + base_config = { "latency-wait": 60, - "use-conda": True, "dryrun": self.inputs.flags.dryrun, "jobname": "ViroConstrictor_{name}.jobid{jobid}", + "use-conda": reproduction_mode == "conda", + "use-singularity": reproduction_mode == "containers", + "container_cache": reproduction_cache_path, + "restart-times": 3, + "keep-going": True, + "printshellcmds": False, + "scheduler": "greedy", } + + if compmode == "grid": + queuename = configuration["COMPUTING"]["queuename"] + base_config.update( + { + "cores": 300, + "drmaa": f' -q {queuename} -n {{threads}} -R "span[hosts=1]" -M {{resources.mem_mb}}', + "drmaa-log-dir": "logs/drmaa", + } + ) + self.snakemake_run_conf = base_config + return self.snakemake_run_conf + base_config.update( + { + "cores": cores, + } + ) + self.snakemake_run_conf = base_config return self.snakemake_run_conf def _snakemake_run_params(self) -> dict[str, Any]: From 46ff9066e9d260f9d774ce74a3c22b3de8e0c325 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 21:18:48 +0200 Subject: [PATCH 11/22] refactor: pass all the snakemake configuration settings through the snakemake_run_conf object feat: add singularity arguments to snakemake config fix: pass the snakemak configuration settings in a dictionary to the snakemake workflow itself style: formatting with isort & black feat: handle the downloading of containers in the viroconstrictor wrapper --- ViroConstrictor/match_ref.py | 49 ++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/ViroConstrictor/match_ref.py b/ViroConstrictor/match_ref.py index fcffc02..c46b399 100644 --- a/ViroConstrictor/match_ref.py +++ b/ViroConstrictor/match_ref.py @@ -9,6 +9,10 @@ from ViroConstrictor.parser import CLIparser from ViroConstrictor.runconfigs import GetSnakemakeRunDetails, WriteYaml from ViroConstrictor.runreport import WriteReport +from ViroConstrictor.workflow.containers import ( + construct_container_bind_args, + download_containers, +) def run_snakemake( @@ -36,6 +40,8 @@ def run_snakemake( cores=snakemakedetails.snakemake_run_conf["cores"], use_conda=snakemakedetails.snakemake_run_conf["use-conda"], conda_frontend="mamba", + use_singularity=snakemakedetails.snakemake_run_conf["use-singularity"], + singularity_args=construct_container_bind_args(inputs_obj.samples_dict), jobname=snakemakedetails.snakemake_run_conf["jobname"], latency_wait=snakemakedetails.snakemake_run_conf["latency-wait"], dryrun=snakemakedetails.snakemake_run_conf["dryrun"], @@ -43,16 +49,20 @@ def run_snakemake( WriteYaml( snakemakedetails.snakemake_run_parameters, f"{inputs_obj.workdir}/config/run_params_MR.yaml", - ) + ), + WriteYaml( + snakemakedetails.snakemake_run_conf, + f"{inputs_obj.workdir}/config/run_configs_MR.yaml", + ), ], - restart_times=3, - keepgoing=True, + restart_times=snakemakedetails.snakemake_run_conf["restart-times"], + keepgoing=snakemakedetails.snakemake_run_conf["keep-going"], quiet=["all"], # type: ignore log_handler=[ ViroConstrictor.logging.snakemake_logger(logfile=inputs_obj.logfile), ], - printshellcmds=False, - scheduler="greedy", + 
printshellcmds=snakemakedetails.snakemake_run_conf["printshellcmds"], + scheduler=snakemakedetails.snakemake_run_conf["scheduler"], ) return snakemake( @@ -61,6 +71,8 @@ def run_snakemake( cores=snakemakedetails.snakemake_run_conf["cores"], use_conda=snakemakedetails.snakemake_run_conf["use-conda"], conda_frontend="mamba", + use_singularity=snakemakedetails.snakemake_run_conf["use-singularity"], + singularity_args=construct_container_bind_args(inputs_obj.samples_dict), jobname=snakemakedetails.snakemake_run_conf["jobname"], latency_wait=snakemakedetails.snakemake_run_conf["latency-wait"], drmaa=snakemakedetails.snakemake_run_conf["drmaa"], @@ -70,16 +82,20 @@ def run_snakemake( WriteYaml( snakemakedetails.snakemake_run_parameters, f"{inputs_obj.workdir}/config/run_params_MR.yaml", - ) + ), + WriteYaml( + snakemakedetails.snakemake_run_conf, + f"{inputs_obj.workdir}/config/run_configs_MR.yaml", + ), ], - restart_times=3, - keepgoing=True, + restart_times=snakemakedetails.snakemake_run_conf["restart-times"], + keepgoing=snakemakedetails.snakemake_run_conf["keep-going"], quiet=["all"], # type: ignore log_handler=[ ViroConstrictor.logging.snakemake_logger(logfile=inputs_obj.logfile), ], - printshellcmds=False, - scheduler="greedy", + printshellcmds=snakemakedetails.snakemake_run_conf["printshellcmds"], + scheduler=snakemakedetails.snakemake_run_conf["scheduler"], ) @@ -156,9 +172,11 @@ def replacement_merge_dataframe_on_cols( """ for i in zip(cols_left, cols_right): original_df[i[0]] = original_df.apply( - lambda x: override_df[i[1]][override_df["sample"] == x["SAMPLE"]].values[0] - if x["SAMPLE"] in override_df["sample"].values and x[i[0]] != "NONE" - else x[i[0]], + lambda x: ( + override_df[i[1]][override_df["sample"] == x["SAMPLE"]].values[0] + if x["SAMPLE"] in override_df["sample"].values and x[i[0]] != "NONE" + else x[i[0]] + ), axis=1, ) return original_df @@ -185,6 +203,11 @@ def process_match_ref(parsed_inputs: CLIparser) -> CLIparser: log.info( f"{'='*20} [bold orange_red1] Starting Match-reference process [/bold orange_red1] {'='*20}" ) + + # todo: add a proper exception here to catch the case where the containers are not downloaded + # should exit with status code 1 and a proper log.error message + status = download_containers(snakemakedetails.snakemake_run_conf) + status = run_snakemake(inputs_obj_match_ref, snakemakedetails) workflow_state: Literal["Failed", "Success"] = ( From bd9ac916f99f9a6e10d8a85cf3f71c64b70b0ef1 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 21:26:21 +0200 Subject: [PATCH 12/22] refactor: add a singularity container activation log message to the logger --- ViroConstrictor/logging.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ViroConstrictor/logging.py b/ViroConstrictor/logging.py index d3b3fcf..c9d3bc2 100644 --- a/ViroConstrictor/logging.py +++ b/ViroConstrictor/logging.py @@ -221,6 +221,7 @@ def print_jobstatistics_logmessage(msg: dict) -> None: logmessage_strings_info: dict[str, Any] = { "Activating conda environment": ColorizeLogMessagePath, + "Activating singularity image": ColorizeLogMessagePath, "Building DAG of jobs": BaseLogMessage, "Creating conda environment": ColorizeLogMessagePath, "Removing incomplete Conda environment": ColorizeLogMessagePath, From e7907b17c049bf059979156dbe78b2d8cf722491 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Fri, 3 May 2024 21:27:41 +0200 Subject: [PATCH 13/22] refactor: pass all the snakemake configuration settings through the snakemake_run_conf object feat: add singularity 
arguments to snakemake config for main workflow
fix: pass the snakemake configuration settings in a dictionary to the main snakemake workflow itself
feat: handle the downloading of containers necessary for main workflow in the viroconstrictor wrapper

---
 ViroConstrictor/__main__.py | 38 ++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/ViroConstrictor/__main__.py b/ViroConstrictor/__main__.py
index 91d6be4..122d676 100644
--- a/ViroConstrictor/__main__.py
+++ b/ViroConstrictor/__main__.py
@@ -23,6 +23,10 @@
 from ViroConstrictor.runconfigs import GetSnakemakeRunDetails, WriteYaml
 from ViroConstrictor.runreport import WriteReport
 from ViroConstrictor.update import update
+from ViroConstrictor.workflow.containers import (
+    construct_container_bind_args,
+    download_containers,
+)


 def get_preset_warning_list(
@@ -151,7 +155,14 @@ def main() -> NoReturn:
         inputs_obj=parsed_input, samplesheetfilename="samples_main"
     )

+    # todo: add a proper exception here to catch the case where the containers are not downloaded
+    # should exit with status code 1 and a proper log.error message
+    container_download_status = download_containers(
+        snakemake_run_details.snakemake_run_conf
+    )
+
     log.info(f"{'='*20} [bold yellow] Starting Main Workflow [/bold yellow] {'='*20}")
+
     status: bool = False
     if parsed_input.user_config["COMPUTING"]["compmode"] == "local":
         status = snakemake.snakemake(
@@ -160,6 +171,8 @@ def main() -> NoReturn:
             cores=snakemake_run_details.snakemake_run_conf["cores"],
             use_conda=snakemake_run_details.snakemake_run_conf["use-conda"],
             conda_frontend="mamba",
+            use_singularity=snakemake_run_details.snakemake_run_conf["use-singularity"],
+            singularity_args=construct_container_bind_args(parsed_input.samples_dict),
             jobname=snakemake_run_details.snakemake_run_conf["jobname"],
             latency_wait=snakemake_run_details.snakemake_run_conf["latency-wait"],
             dryrun=snakemake_run_details.snakemake_run_conf["dryrun"],
@@ -167,15 +180,19 @@ def main() -> NoReturn:
                 WriteYaml(
                     snakemake_run_details.snakemake_run_parameters,
                     f"{parsed_input.workdir}/config/run_params.yaml",
-                )
+                ),
+                WriteYaml(
+                    snakemake_run_details.snakemake_run_conf,
+                    f"{parsed_input.workdir}/config/run_configs.yaml",
+                ),
             ],
-            restart_times=3,
-            keepgoing=True,
+            restart_times=snakemake_run_details.snakemake_run_conf["restart-times"],
+            keepgoing=snakemake_run_details.snakemake_run_conf["keep-going"],
             quiet=["all"],  # type: ignore
             log_handler=[
                 ViroConstrictor.logging.snakemake_logger(logfile=parsed_input.logfile)
             ],
-            printshellcmds=False,
+            printshellcmds=snakemake_run_details.snakemake_run_conf["printshellcmds"],
         )
     if parsed_input.user_config["COMPUTING"]["compmode"] == "grid":
         status = snakemake.snakemake(
@@ -185,6 +202,8 @@ def main() -> NoReturn:
             nodes=snakemake_run_details.snakemake_run_conf["cores"],
             use_conda=snakemake_run_details.snakemake_run_conf["use-conda"],
             conda_frontend="mamba",
+            use_singularity=snakemake_run_details.snakemake_run_conf["use-singularity"],
+            singularity_args=construct_container_bind_args(parsed_input.samples_dict),
             jobname=snakemake_run_details.snakemake_run_conf["jobname"],
             latency_wait=snakemake_run_details.snakemake_run_conf["latency-wait"],
             drmaa=snakemake_run_details.snakemake_run_conf["drmaa"],
@@ -194,14 +213,19 @@ def main() -> NoReturn:
                 WriteYaml(
                     snakemake_run_details.snakemake_run_parameters,
                     f"{parsed_input.workdir}/config/run_params.yaml",
-                )
+                ),
+                WriteYaml(
+                    snakemake_run_details.snakemake_run_conf,
f"{parsed_input.workdir}/config/run_configs.yaml", + ), ], - restart_times=3, - keepgoing=True, + restart_times=snakemake_run_details.snakemake_run_conf["restart-times"], + keepgoing=snakemake_run_details.snakemake_run_conf["keep-going"], quiet=["all"], # type: ignore log_handler=[ ViroConstrictor.logging.snakemake_logger(logfile=parsed_input.logfile) ], + printshellcmds=snakemake_run_details.snakemake_run_conf["printshellcmds"], ) if snakemake_run_details.snakemake_run_conf["dryrun"] is False and status is True: From 3a7d02bf250f9295b8d7145f66a9fd600ac9f411 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Mon, 6 May 2024 09:22:29 +0200 Subject: [PATCH 14/22] ci: add step in container build/test workflow to download already existing containers --- .github/workflows/build_and_test.yml | 7 ++++++- containers/pull_published_containers.py | 26 +++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 containers/pull_published_containers.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e8f6a02..ac16701 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -91,5 +91,10 @@ jobs: run: | python containers/convert_artifact_containers_for_apptainer.py - # download already built containers + + - name: download existing containers + run: | + python containers/pull_published_containers.py + + ## rest of the testing suite here \ No newline at end of file diff --git a/containers/pull_published_containers.py b/containers/pull_published_containers.py new file mode 100644 index 0000000..fce2978 --- /dev/null +++ b/containers/pull_published_containers.py @@ -0,0 +1,26 @@ +import os +import shutil +import sys + +from ViroConstrictor.workflow.containers import download_containers + +base_path_to_container_defs = "./containers" + +if __name__ == "__main__": + config = {"container_cache": f"{os.getcwd()}/test_containers"} + os.makedirs(config["container_cache"], exist_ok=True) + + # move .sif files from ./containers to ./test_containers + for file in os.listdir(base_path_to_container_defs): + if file.endswith(".sif"): + shutil.move( + f"{base_path_to_container_defs}/{file}", + f"{config['container_cache']}/{file}", + ) + + download_status = download_containers(config) + if download_status == 1: + print( + "Failed to download all necessary containers. Please check the logs for more information." 
From 28e3333cc41a335f85aaaa9233bb2cd999d30c65 Mon Sep 17 00:00:00 2001
From: Florian Zwagemaker
Date: Mon, 6 May 2024 09:23:53 +0200
Subject: [PATCH 15/22] ci: temporarily add listing of directory contents for debugging purposes

---
 .github/workflows/build_and_test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ac16701..12ebcda 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -96,5 +96,8 @@
         run: |
           python containers/pull_published_containers.py

+      - name: list directory contents
+        run: |
+          ls -R ./

       ## rest of the testing suite here
\ No newline at end of file

From ac8a3ac40f77bbb51aba10f9596b8586eee5a3ef Mon Sep 17 00:00:00 2001
From: Florian Zwagemaker
Date: Mon, 6 May 2024 09:28:15 +0200
Subject: [PATCH 16/22] ci: change upstream registry

---
 containers/tag_and_push_containers.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/containers/tag_and_push_containers.py b/containers/tag_and_push_containers.py
index 6adbc2a..5ffb5e0 100644
--- a/containers/tag_and_push_containers.py
+++ b/containers/tag_and_push_containers.py
@@ -2,8 +2,7 @@
 import subprocess
 from typing import List

-# main_upstream_registry = "ghcr.io/rivm-bioinformatics"
-main_upstream_registry = "ghcr.io/florianzwagemaker"
+from ViroConstrictor.workflow.containers import upstream_registry

 base_path_to_container_defs = "./containers"

@@ -16,6 +15,6 @@
 for container in built_containers:
     print(f"Tagging and pushing {container}")
     subprocess.run(
-        f"docker tag {container} {main_upstream_registry}/{container}", shell=True
+        f"docker tag {container} {upstream_registry}/{container}", shell=True
     )
-    subprocess.run(f"docker push {main_upstream_registry}/{container}", shell=True)
+    subprocess.run(f"docker push {upstream_registry}/{container}", shell=True)

From 9341f95a4df4ca369e5398f25f05c10e0ada55f5 Mon Sep 17 00:00:00 2001
From: Florian Zwagemaker
Date: Mon, 6 May 2024 09:49:25 +0200
Subject: [PATCH 17/22] ci: Update GitHub Actions workflows for container publishing

Refactor the GitHub Actions workflows to update the triggers for container
publishing. Change the trigger for the "build_and_test.yml" workflow to
temporarily run on pull requests to any branch, with a TODO to restrict it
back to pull requests targeting main. Additionally, add a new workflow
"publish_containers.yaml" to handle the publishing of containers. This
workflow will be triggered on pushes to the dev branch.
Related recent commits: - ci: change upstream registry - ci: temporarily add listing of directory contents for debugging purposes - ci: add step in container build/test workflow to download already existing containers --- .github/workflows/build_and_test.yml | 3 +- .github/workflows/publish_containers.yaml | 65 +++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/publish_containers.yaml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 12ebcda..e2611ec 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,9 +1,10 @@ name: build containers and run tests +#todo: change this to an on_pullrequest trigger only to main branch on: pull_request: branches: - - 'main' + - '*' jobs: Setup_and_build: diff --git a/.github/workflows/publish_containers.yaml b/.github/workflows/publish_containers.yaml new file mode 100644 index 0000000..6d9f92c --- /dev/null +++ b/.github/workflows/publish_containers.yaml @@ -0,0 +1,65 @@ +name: Publish containers + +#todo: change this to an onrelease trigger +on: + push: + branches: + - dev + + +jobs: + Upload: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Download artifact + id: download-artifact + uses: dawidd6/action-download-artifact@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + workflow: build_and_test.yml + name: built_containers + skip_unpack: true + + - name: move artifact + run: | + mv ./containers.tar.gz ./containers/containers.tar.gz + + - name: list directory contents + run: | + ls -R ./ + + - name: unzip built containers + run: | + cd ./containers/ + tar -xzvf containers.tar.gz + cd .. + + - name: Setup Mamba + uses: mamba-org/setup-micromamba@v1 + with: + cache-environment: true + post-cleanup: 'all' + environment-file: env.yml + init-shell: bash + + - name: Install local python package + run: | + pip install . 
--no-deps + shell: micromamba-shell {0} + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Add artifacted containers to docker daemon + run: | + python containers/add_OCI_to_docker_engine.py + + - name: tag and push containers + run: | + python containers/tag_and_push_containers.py \ No newline at end of file From 0992600adfc05c2ea6d50a4279507f94b37f12fb Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Mon, 6 May 2024 10:10:28 +0200 Subject: [PATCH 18/22] fix: add an __init__.py file to the viroconstrictor.workflow dir --- ViroConstrictor/workflow/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 ViroConstrictor/workflow/__init__.py diff --git a/ViroConstrictor/workflow/__init__.py b/ViroConstrictor/workflow/__init__.py new file mode 100644 index 0000000..e69de29 From 2e4f18341ed7fb298dc81f92185c1ca41e4db2a0 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Mon, 6 May 2024 10:13:30 +0200 Subject: [PATCH 19/22] fix: change the fastqc wrapper script to its own dedicated dir --- ViroConstrictor/workflow/{scripts => wrappers}/fastqc_wrapper.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ViroConstrictor/workflow/{scripts => wrappers}/fastqc_wrapper.sh (100%) diff --git a/ViroConstrictor/workflow/scripts/fastqc_wrapper.sh b/ViroConstrictor/workflow/wrappers/fastqc_wrapper.sh similarity index 100% rename from ViroConstrictor/workflow/scripts/fastqc_wrapper.sh rename to ViroConstrictor/workflow/wrappers/fastqc_wrapper.sh From 66ec41c993c375377f859c3ab0fe704503f71866 Mon Sep 17 00:00:00 2001 From: Florian Zwagemaker Date: Mon, 6 May 2024 10:47:29 +0200 Subject: [PATCH 20/22] refactor: add verbose option to `download_containers` function --- ViroConstrictor/workflow/containers.py | 6 +++--- containers/pull_published_containers.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ViroConstrictor/workflow/containers.py b/ViroConstrictor/workflow/containers.py index 76b6a89..5485867 100644 --- a/ViroConstrictor/workflow/containers.py +++ b/ViroConstrictor/workflow/containers.py @@ -135,7 +135,7 @@ def containerization_executable() -> str: ) -def download_containers(config: Dict[str, Any]) -> int: +def download_containers(config: Dict[str, Any], verbose=False) -> int: to_download = containers_to_download(config) to_download = [x.rsplit("_", 1)[0] + ":" + x.rsplit("_", 1)[1] for x in to_download] @@ -148,8 +148,8 @@ def download_containers(config: Dict[str, Any]) -> int: status = subprocess.call( f"{executable} pull --dir {config['container_cache']} docker://{upstream_registry}/{container}", shell=True, - stderr=subprocess.PIPE, - stdout=subprocess.PIPE, + stderr=subprocess.PIPE if verbose is False else None, + stdout=subprocess.PIPE if verbose is False else None, ) if status != 0: print(f"Failed to download {container}") diff --git a/containers/pull_published_containers.py b/containers/pull_published_containers.py index fce2978..3a0dcce 100644 --- a/containers/pull_published_containers.py +++ b/containers/pull_published_containers.py @@ -18,7 +18,7 @@ f"{config['container_cache']}/{file}", ) - download_status = download_containers(config) + download_status = download_containers(config, verbose=True) if download_status == 1: print( "Failed to download all necessary containers. Please check the logs for more information." 
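PATCH 20 above toggles verbosity by passing `subprocess.PIPE` (quiet) or `None` (inherit the terminal) as the child's output streams. Worth noting: the Python documentation advises against `stdout=PIPE`/`stderr=PIPE` with `subprocess.call`, because a child that fills the unread pipe buffer will block. A self-contained sketch of the same toggle using the safer `subprocess.DEVNULL` sink — `run_quietly` is a hypothetical helper for illustration, not project code:

```python
import subprocess


def run_quietly(cmd: str, verbose: bool = False) -> int:
    # DEVNULL discards the child's output without buffering it; None lets
    # the child inherit the parent's stdout/stderr and print directly.
    sink = None if verbose else subprocess.DEVNULL
    return subprocess.call(cmd, shell=True, stdout=sink, stderr=sink)


if __name__ == "__main__":
    print(run_quietly("echo hello"))        # output suppressed; prints 0
    print(run_quietly("echo hello", True))  # prints "hello", then 0
```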
From b5fed9ff823ad6335f1a8977073505cfbd6f315e Mon Sep 17 00:00:00 2001
From: Florian Zwagemaker
Date: Mon, 6 May 2024 11:21:34 +0200
Subject: [PATCH 21/22] ci: add listing of pip modules as a test. force the usage of the micromamba shell in GH actions

---
 .github/workflows/build_and_test.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index e2611ec..17c1e1d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -92,10 +92,16 @@
         run: |
           python containers/convert_artifact_containers_for_apptainer.py

+      - name: list installed modules
+        run: |
+          pip list
+          mamba list
+        shell: micromamba-shell {0}

       - name: download existing containers
         run: |
           python containers/pull_published_containers.py
+        shell: micromamba-shell {0}

       - name: list directory contents
         run: |

From ea17e1913e10e530c7e2cb95fe310a8888a23417 Mon Sep 17 00:00:00 2001
From: Florian Zwagemaker
Date: Mon, 6 May 2024 12:14:29 +0200
Subject: [PATCH 22/22] refactor: Improve container download error handling and logging

The `download_containers` function in the `containers.py` module has been
refactored to include better error handling and logging. Previously, download
progress and failures were reported with plain `print` calls that bypassed the
application's logger. Now, when a container fails to download, the function
logs an error message indicating the name of the container that failed. This
change improves the clarity of error reporting and makes it easier to
troubleshoot container download issues.

Related recent commits:
- refactor: add verbose option to `download_containers` function
- ci: temporarily add listing of directory contents for debugging purposes
- ci: add step in container build/test workflow to download already existing containers

---
 .github/workflows/build_and_test.yml      | 18 ------------------
 .github/workflows/publish_containers.yaml |  4 +++-
 ViroConstrictor/__main__.py               |  8 +++-----
 ViroConstrictor/match_ref.py              |  7 ++++---
 ViroConstrictor/workflow/containers.py    | 12 ++++++------
 5 files changed, 16 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 17c1e1d..dd8b563 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -58,10 +58,6 @@ jobs:
         run: |
           mv ./containers.tar.gz ./containers/containers.tar.gz

-      - name: list directory contents
-        run: |
-          ls -R ./
-
       - name: unzip built containers
         run: |
           cd ./containers/
           tar -xzvf containers.tar.gz
           cd ..
@@ -84,27 +80,13 @@
           pip install .
--no-deps shell: micromamba-shell {0} - - name: list directory contents - run: | - ls -R ./ - - name: convert containers run: | python containers/convert_artifact_containers_for_apptainer.py - - name: list installed modules - run: | - pip list - mamba list - shell: micromamba-shell {0} - - name: download existing containers run: | python containers/pull_published_containers.py shell: micromamba-shell {0} - - - name: list directory contents - run: | - ls -R ./ ## rest of the testing suite here \ No newline at end of file diff --git a/.github/workflows/publish_containers.yaml b/.github/workflows/publish_containers.yaml index 6d9f92c..30af39a 100644 --- a/.github/workflows/publish_containers.yaml +++ b/.github/workflows/publish_containers.yaml @@ -59,7 +59,9 @@ jobs: - name: Add artifacted containers to docker daemon run: | python containers/add_OCI_to_docker_engine.py + shell: micromamba-shell {0} - name: tag and push containers run: | - python containers/tag_and_push_containers.py \ No newline at end of file + python containers/tag_and_push_containers.py + shell: micromamba-shell {0} \ No newline at end of file diff --git a/ViroConstrictor/__main__.py b/ViroConstrictor/__main__.py index 122d676..b31f080 100644 --- a/ViroConstrictor/__main__.py +++ b/ViroConstrictor/__main__.py @@ -155,11 +155,9 @@ def main() -> NoReturn: inputs_obj=parsed_input, samplesheetfilename="samples_main" ) - # todo: add a proper exception here to catch the case where the containers are not downloaded - # should exit with status code 1 and a proper log.error message - container_download_status = download_containers( - snakemake_run_details.snakemake_run_conf - ) + if download_containers(snakemake_run_details.snakemake_run_conf) != 0: + log.error("Failed to download containers required for workflow.\nPlease check the logs and your settings for more information and try again later.") + sys.exit(1) log.info(f"{'='*20} [bold yellow] Starting Main Workflow [/bold yellow] {'='*20}") diff --git a/ViroConstrictor/match_ref.py b/ViroConstrictor/match_ref.py index c46b399..035f288 100644 --- a/ViroConstrictor/match_ref.py +++ b/ViroConstrictor/match_ref.py @@ -1,4 +1,5 @@ import copy +import sys from typing import Literal import pandas as pd @@ -204,9 +205,9 @@ def process_match_ref(parsed_inputs: CLIparser) -> CLIparser: f"{'='*20} [bold orange_red1] Starting Match-reference process [/bold orange_red1] {'='*20}" ) - # todo: add a proper exception here to catch the case where the containers are not downloaded - # should exit with status code 1 and a proper log.error message - status = download_containers(snakemakedetails.snakemake_run_conf) + if download_containers(snakemakedetails.snakemake_run_conf) != 0: + log.error("Failed to download containers required for workflow.\nPlease check the logs and your settings for more information and try again later.") + sys.exit(1) status = run_snakemake(inputs_obj_match_ref, snakemakedetails) diff --git a/ViroConstrictor/workflow/containers.py b/ViroConstrictor/workflow/containers.py index 5485867..e4181e6 100644 --- a/ViroConstrictor/workflow/containers.py +++ b/ViroConstrictor/workflow/containers.py @@ -3,9 +3,8 @@ import subprocess from typing import Any, Dict, List, Tuple -from rich import print - from ViroConstrictor import __prog__ +from ViroConstrictor.logging import log upstream_registry = "ghcr.io/rivm-bioinformatics" @@ -143,7 +142,7 @@ def download_containers(config: Dict[str, Any], verbose=False) -> int: # however this has unintended consequences for the various OCI layers 
resulting in at least one container not being downloaded correctly.
     # Thus it's better for now to download the containers sequentially.
     for container in to_download:
-        print(f"Downloading {container}")
+        log.info(f"Downloading container: [magenta]'{container}'[/magenta] to local cache")
         executable = containerization_executable()
         status = subprocess.call(
             f"{executable} pull --dir {config['container_cache']} docker://{upstream_registry}/{container}",
@@ -152,16 +151,17 @@
             stdout=subprocess.PIPE if verbose is False else None,
         )
         if status != 0:
-            print(f"Failed to download {container}")
+            log.error(f"Failed to download container: [magenta]'{container}'[/magenta]")
             return 1
-        print(f"Successfully downloaded {container}")
+        log.info(f"Successfully downloaded container: [magenta]'{container}'[/magenta] to local cache")
     return 0


 def construct_container_bind_args(samples_dict: Dict) -> str:
     paths = []
-    for nested_dict in samples_dict.items():
+    for keys, nested_dict in samples_dict.items():
         paths.extend(
             f"{os.path.dirname(value)}"
             for value in nested_dict.values()
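The patch text is truncated above, midway through `construct_container_bind_args`. For orientation only, a hedged sketch of how such a bind-argument builder could plausibly conclude — the deduplication and `--bind` formatting are assumptions for illustration, not the project's verified code:

```python
import os
from typing import Dict


def construct_container_bind_args(samples_dict: Dict) -> str:
    # Collect the parent directory of every per-sample input file so the
    # container runtime can mount it inside the container.
    paths = []
    for _sample, files in samples_dict.items():
        paths.extend(os.path.dirname(str(value)) for value in files.values())
    # Apptainer/Singularity accepts repeated --bind flags; deduplicate the
    # directories and join the flags into the single string handed to
    # snakemake's singularity_args parameter.
    return " ".join(f"--bind {path}" for path in sorted(set(paths)))


# Example: reads in two directories produce two bind mounts.
samples = {
    "sample_1": {"fastq": "/data/run1/s1.fastq"},
    "sample_2": {"fastq": "/data/run2/s2.fastq"},
}
print(construct_container_bind_args(samples))
# --bind /data/run1 --bind /data/run2
```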