From 004b1568ad5a1de86d7f767af516382bee27f567 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:08:00 +0200 Subject: [PATCH 1/4] Add a dry run option for faster debugging/ submission review/ updates --- ena-submission/README.md | 6 + ena-submission/scripts/create_assembly.py | 4 +- ena-submission/scripts/deposition_dry_run.py | 172 ++++++++++++++++++ .../scripts/ena_submission_helper.py | 64 ++++--- 4 files changed, 221 insertions(+), 25 deletions(-) create mode 100644 ena-submission/scripts/deposition_dry_run.py diff --git a/ena-submission/README.md b/ena-submission/README.md index 191add661..7839e0501 100644 --- a/ena-submission/README.md +++ b/ena-submission/README.md @@ -162,6 +162,12 @@ micromamba activate loculus-ena-submission python3 scripts/test_ena_submission.py ``` +You can also use the `deposition_dry_run.py` script to produce the same output files/XMLs that the pipeline would produce in order to submit to ENA. This is a good test if you would like to first verify what your submission to ENA will look like. Make sure that you have the same config.yaml that will be used in production (use deploy.py to generate this). Also note that the generator can only produce output for one submission at a time. + +``` +python scripts/deposition_dry_run.py --log-level=DEBUG --data-to-submit=results/approved_ena_submission_list.json --mode=assembly --center-name="Yale" +``` + ### Testing submission locally 1. Run loculus locally (need prepro, backend and ena-submission pod), e.g. 
diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py index eb31f786d..a5998b2f7 100644 --- a/ena-submission/scripts/create_assembly.py +++ b/ena-submission/scripts/create_assembly.py @@ -117,6 +117,7 @@ def create_manifest_object( seq_key: dict[str, str], group_key: dict[str, str], test=False, + dir: str | None = None, ) -> AssemblyManifest: """ Create an AssemblyManifest object for an entry in the assembly table using: @@ -136,10 +137,11 @@ def create_manifest_object( unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"] organism_metadata = config.organisms[group_key["organism"]]["ingest"] chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key) - chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object) + chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir) fasta_file = create_fasta( unaligned_sequences=unaligned_nucleotide_sequences, chromosome_list=chromosome_list_object, + dir = dir ) program = ( metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown" diff --git a/ena-submission/scripts/deposition_dry_run.py b/ena-submission/scripts/deposition_dry_run.py new file mode 100644 index 000000000..ae0869ed3 --- /dev/null +++ b/ena-submission/scripts/deposition_dry_run.py @@ -0,0 +1,172 @@ +# This file offers command line options to generate the submission files +# for local ena submission from input data, it uses the configs specified in the config folder +# It requires an input file in the same format as required to trigger ena submission and should +# produce the same output as would be sent to ENA by the pipeline. + +# WARNING: Please still review submission files manually before using them!! 
+import json +import logging +import os +from dataclasses import dataclass +from typing import Any + +import click +import yaml +from create_assembly import create_manifest_object +from create_project import construct_project_set_object +from create_sample import construct_sample_set_object +from ena_submission_helper import create_manifest, get_project_xml, get_sample_xml + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.INFO, + format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + +with open("config/config.yaml", encoding="utf-8") as f: + config = yaml.safe_load(f) + +with open("config/defaults.yaml", encoding="utf-8") as f: + defaults = yaml.safe_load(f) + +# Merge configs, using defaults only as fallback +# The merged config is kept in memory only; nothing is written to disk +for key, value in defaults.items(): + if key not in config: + config[key] = value + + +@dataclass +class Config: + organisms: dict[str, dict[str, str]] + db_name: str + unique_project_suffix: str + metadata_mapping: dict[str, dict[str, str]] + metadata_mapping_mandatory_field_defaults: dict[str, str] + ena_checklist: str + use_ena_checklist: bool + + +@click.command() +@click.option( + "--data-to-submit", + required=True, + type=click.Path(exists=True), +) +@click.option( + "--mode", + required=True, + type=click.Choice(["project", "sample", "assembly"]), +) +@click.option("--center-name", required=False, type=str, default="CENTER_NAME") +@click.option( + "--log-level", + default="INFO", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), +) +def local_ena_submission_generator( + data_to_submit, + center_name, + mode, + log_level, + config_file="config/config.yaml", +): + """ + Produce output of submission pipeline locally + """ + logger.setLevel(log_level) + logging.getLogger("requests").setLevel(logging.INFO) + + with open(config_file, encoding="utf-8") as file: + full_config = yaml.safe_load(file) + relevant_config = {key: 
full_config.get(key, []) for key in Config.__annotations__} + config = Config(**relevant_config) + + logger.debug(f"Config: {config}") + + with open(data_to_submit, encoding="utf-8") as json_file: + sequences_to_upload: dict[str, Any] = json.load(json_file) + + if len(sequences_to_upload) != 1: + logger.error("Script requires exactly one entry in --data-to-submit") + return + + for full_accession, data in sequences_to_upload.items(): + accession, version = full_accession.split(".") + entry = { + "accession": accession, + "version": version, + "group_id": data["metadata"]["groupId"], + "organism": data["organism"], + "metadata": data["metadata"], + "unaligned_nucleotide_sequences": data["unalignedNucleotideSequences"], + } + + group_info = {"institution": center_name} + + if mode == "project": + project_set = construct_project_set_object(group_info, config, entry) + project_xml = get_project_xml(project_set) + + directory = "project" + os.makedirs(directory, exist_ok=True) + logger.info(f"Writing results to {directory}") + + with open(os.path.join(directory, "submission.xml"), "w") as file: + file.write(project_xml["SUBMISSION"]) + with open(os.path.join(directory, "project.xml"), "w") as file: + file.write(project_xml["PROJECT"]) + + logger.info( + "You can submit the project to ENA using the command: \n" + "curl -u {params.ena_submission_username}:{params.ena_submission_password}" + "-F 'SUBMISSION=@{project/submission.xml}' -F 'PROJECT=@{project/project.xml}'" + " {params.ena_submission_url} > {output}" + ) + + if mode == "sample": + entry["center_name"] = center_name + sample_set = construct_sample_set_object(config, entry, entry) + sample_xml = get_sample_xml(sample_set) + + directory = "sample" + os.makedirs(directory, exist_ok=True) + logger.info(f"Writing results to {directory}") + + with open(os.path.join(directory, "submission.xml"), "w") as file: + file.write(sample_xml["SUBMISSION"]) + with open(os.path.join(directory, "sample.xml"), "w") as file: + 
file.write(sample_xml["SAMPLE"]) + + logger.info( + "You can submit the sample to ENA using the command: \n" + "curl -u {params.ena_submission_username}:{params.ena_submission_password}" + "-F 'SUBMISSION=@{sample/submission.xml}' -F 'SAMPLE=@{sample/project.xml}'" + " {params.ena_submission_url} > {output}" + ) + + if mode == "assembly": + dummy_sample_dict = {"result": {"ena_sample_accession": "BIOSAMPLE_ACCESSION"}} + dummy_project_dict = {"result": {"bioproject_accession": "BIOPROJECT_ACCESSION"}} + + directory = "assembly" + os.makedirs(directory, exist_ok=True) + logger.info(f"Writing results to {directory}") + + manifest_object = create_manifest_object( + config, dummy_sample_dict, dummy_project_dict, entry, entry, entry, dir=directory + ) + create_manifest(manifest_object, dir=directory) + logger.info( + "You can submit the assembly to ENA using the command: \n" + "java -jar webin-cli.jar -username {ena_submission_username} " + "-password {ena_submission_password} -context genome " + "-manifest {assembly/manifest.tsv} -submit " + f"-centername {center_name}" + ) + + +if __name__ == "__main__": + local_ena_submission_generator() diff --git a/ena-submission/scripts/ena_submission_helper.py b/ena-submission/scripts/ena_submission_helper.py index 418a5e1c7..ce95aeb9b 100644 --- a/ena-submission/scripts/ena_submission_helper.py +++ b/ena-submission/scripts/ena_submission_helper.py @@ -116,6 +116,14 @@ def get_submission_dict(hold_until_date: str | None = None): ) + +def get_project_xml(project_set): + submission_set = get_submission_dict() + return { + "SUBMISSION": dataclass_to_xml(submission_set, root_name="SUBMISSION"), + "PROJECT": dataclass_to_xml(project_set, root_name="PROJECT_SET"), + } + + def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult: """ The project creation request should be equivalent to @@ -128,13 +136,6 @@ def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationRe errors = [] warnings = [] 
- def get_project_xml(project_set): - submission_set = get_submission_dict() - return { - "SUBMISSION": dataclass_to_xml(submission_set, root_name="SUBMISSION"), - "PROJECT": dataclass_to_xml(project_set, root_name="PROJECT_SET"), - } - xml = get_project_xml(project_set) response = post_webin(config, xml) if not response.ok: @@ -165,6 +166,15 @@ def get_project_xml(project_set): return CreationResult(result=project_results, errors=errors, warnings=warnings) +def get_sample_xml(sample_set): + submission_set = get_submission_dict() + files = { + "SUBMISSION": dataclass_to_xml(submission_set, root_name="SUBMISSION"), + "SAMPLE": dataclass_to_xml(sample_set, root_name="SAMPLE_SET"), + } + return files + + def create_ena_sample(config: ENAConfig, sample_set: SampleSetType) -> CreationResult: """ The sample creation request should be equivalent to @@ -177,14 +187,6 @@ def create_ena_sample(config: ENAConfig, sample_set: SampleSetType) -> CreationR errors = [] warnings = [] - def get_sample_xml(sample_set): - submission_set = get_submission_dict() - files = { - "SUBMISSION": dataclass_to_xml(submission_set, root_name="SUBMISSION"), - "SAMPLE": dataclass_to_xml(sample_set, root_name="SAMPLE_SET"), - } - return files - xml = get_sample_xml(sample_set) response = post_webin(config, xml) if not response.ok: @@ -231,13 +233,17 @@ def post_webin(config: ENAConfig, xml: dict[str, Any]) -> requests.Response: ) -def create_chromosome_list(list_object: AssemblyChromosomeListFile) -> str: +def create_chromosome_list(list_object: AssemblyChromosomeListFile, dir: str | None = None) -> str: """ Creates a temp file chromosome list: https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html#chromosome-list-file """ - with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as temp: - filename = temp.name + if dir: + os.makedirs(dir, exist_ok=True) + filename = os.path.join(dir, "chromosome_list.gz") + else: + with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") 
as temp: + filename = temp.name with gzip.GzipFile(filename, "wb") as gz: for entry in list_object.chromosomes: @@ -249,14 +255,20 @@ def create_chromosome_list(list_object: AssemblyChromosomeListFile) -> str: def create_fasta( - unaligned_sequences: dict[str, str], chromosome_list: AssemblyChromosomeListFile + unaligned_sequences: dict[str, str], + chromosome_list: AssemblyChromosomeListFile, + dir: str | None = None, ) -> str: """ Creates a temp fasta file: https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html#fasta-file """ - with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta.gz") as temp: - filename = temp.name + if dir: + os.makedirs(dir, exist_ok=True) + filename = os.path.join(dir, "fasta.gz") + else: + with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta.gz") as temp: + filename = temp.name with gzip.GzipFile(filename, "wb") as gz: if len(unaligned_sequences.keys()) == 1: @@ -271,13 +283,17 @@ def create_fasta( return filename -def create_manifest(manifest: AssemblyManifest) -> str: +def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str: """ Creates a temp manifest file: https://ena-docs.readthedocs.io/en/latest/submit/assembly/genome.html#manifest-files """ - with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp: - filename = temp.name + if dir: + os.makedirs(dir, exist_ok=True) + filename = os.path.join(dir, "manifest.tsv") + else: + with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp: + filename = temp.name with open(filename, "w") as f: f.write(f"STUDY\t{manifest.study}\n") f.write(f"SAMPLE\t{manifest.sample}\n") From dcbdf1175de46d95b857f61c868d6bdc1f2076ec Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:19:26 +0200 Subject: [PATCH 2/4] add little warning --- ena-submission/scripts/deposition_dry_run.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/ena-submission/scripts/deposition_dry_run.py b/ena-submission/scripts/deposition_dry_run.py index ae0869ed3..1ef80b0bd 100644 --- a/ena-submission/scripts/deposition_dry_run.py +++ b/ena-submission/scripts/deposition_dry_run.py @@ -124,6 +124,7 @@ def local_ena_submission_generator( "curl -u {params.ena_submission_username}:{params.ena_submission_password}" "-F 'SUBMISSION=@{project/submission.xml}' -F 'PROJECT=@{project/project.xml}'" " {params.ena_submission_url} > {output}" + "\n Remember to submit to wwwdev. if you do not want to submit to production" ) if mode == "sample": @@ -145,6 +146,7 @@ def local_ena_submission_generator( "curl -u {params.ena_submission_username}:{params.ena_submission_password}" "-F 'SUBMISSION=@{sample/submission.xml}' -F 'SAMPLE=@{sample/project.xml}'" " {params.ena_submission_url} > {output}" + "\n Remember to submit to wwwdev. if you do not want to submit to production" ) if mode == "assembly": @@ -165,6 +167,7 @@ def local_ena_submission_generator( "-password {ena_submission_password} -context genome " "-manifest {assembly/manifest.tsv} -submit " f"-centername {center_name}" + "\n Remember to submit with -test if you do not want to submit to production" ) From 08b9f9dddb38eecfdeee75bafc821511bfdba6ea Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 4 Oct 2024 09:30:22 +0200 Subject: [PATCH 3/4] add test --- ena-submission/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ena-submission/Snakefile b/ena-submission/Snakefile index 0c621d1ce..586da8c81 100644 --- a/ena-submission/Snakefile +++ b/ena-submission/Snakefile @@ -28,7 +28,7 @@ if SUBMIT_TO_ENA_DEV: print("Submitting to ENA dev environment") config["ena_submission_url"] = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit" config["github_url"] = ( - "https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json" + 
"https://raw.githubusercontent.com/pathoplexus/ena-submission/loculus_test/test/approved_ena_submission_list.json" ) config["ena_reports_service_url"] = "https://wwwdev.ebi.ac.uk/ena/submit/report" From efc84c153cf13c298a42e76bfc7a203d2cd20965 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 4 Oct 2024 10:09:27 +0200 Subject: [PATCH 4/4] Update ena-submission/Snakefile --- ena-submission/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ena-submission/Snakefile b/ena-submission/Snakefile index 586da8c81..0c621d1ce 100644 --- a/ena-submission/Snakefile +++ b/ena-submission/Snakefile @@ -28,7 +28,7 @@ if SUBMIT_TO_ENA_DEV: print("Submitting to ENA dev environment") config["ena_submission_url"] = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit" config["github_url"] = ( - "https://raw.githubusercontent.com/pathoplexus/ena-submission/loculus_test/test/approved_ena_submission_list.json" + "https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json" ) config["ena_reports_service_url"] = "https://wwwdev.ebi.ac.uk/ena/submit/report"