Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ena-submission): Add a dry run option for faster debugging/ submission review/ updates #2931

Merged
merged 5 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ena-submission/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,12 @@ micromamba activate loculus-ena-submission
python3 scripts/test_ena_submission.py
```

You can also use the `deposition_dry_run.py` script to produce the same output files/XMLs that the pipeline would produce in order to submit to ENA. This is a good test if you would like to first verify what your submission to ENA will look like. Make sure that you have the same config.yaml that will be used in production (use deploy.py to generate this). Also note that the generator can only produce output for one submission at a time.

```
python scripts/deposition_dry_run.py --log-level=DEBUG --data-to-submit=results/approved_ena_submission_list.json --mode=assembly --center-name="Yale"
```

### Testing submission locally

1. Run loculus locally (need prepro, backend and ena-submission pod), e.g.
Expand Down
4 changes: 3 additions & 1 deletion ena-submission/scripts/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def create_manifest_object(
seq_key: dict[str, str],
group_key: dict[str, str],
test=False,
dir: str | None = None,
) -> AssemblyManifest:
"""
Create an AssemblyManifest object for an entry in the assembly table using:
Expand All @@ -136,10 +137,11 @@ def create_manifest_object(
unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"]
organism_metadata = config.organisms[group_key["organism"]]["ingest"]
chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object)
chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
fasta_file = create_fasta(
unaligned_sequences=unaligned_nucleotide_sequences,
chromosome_list=chromosome_list_object,
dir = dir
)
program = (
metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown"
Expand Down
175 changes: 175 additions & 0 deletions ena-submission/scripts/deposition_dry_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
# This script offers command-line options to generate the files for an ENA
# submission locally from input data. It uses the configs in the config folder.
# It requires an input file in the same format as the one used to trigger ENA
# submission and should produce the same output as the pipeline would send to ENA.

# WARNING: Please still review submission files manually before using them!!
import json
import logging
import os
from dataclasses import dataclass
from typing import Any

import click
import yaml
from create_assembly import create_manifest_object
from create_project import construct_project_set_object
from create_sample import construct_sample_set_object
from ena_submission_helper import create_manifest, get_project_xml, get_sample_xml

# Configure the root logger once for the whole script, then grab this
# module's own logger (it propagates to the root handler configured here).
logging.basicConfig(
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
    encoding="utf-8",
)
logger = logging.getLogger(__name__)

# Load the deployment config and the repository defaults at import time.
# `yaml.safe_load` returns None for an empty document, so fall back to an
# empty mapping to keep the merge below from crashing.
with open("config/config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f) or {}

with open("config/defaults.yaml", encoding="utf-8") as f:
    defaults = yaml.safe_load(f) or {}

# Merge configs, using defaults only as fallback: a key that is already set
# in config.yaml always wins. The merged mapping is only kept in memory.
for key, value in defaults.items():
    config.setdefault(key, value)


@dataclass
class Config:
    """
    Subset of the ENA submission config needed to build submission files.

    The field names double as the top-level config keys that get read:
    `local_ena_submission_generator` selects keys by iterating over
    `Config.__annotations__`, so renaming a field changes which config key
    is looked up.
    """

    # Keyed by organism name; create_assembly reads
    # organisms[<organism>]["ingest"], so the values are nested mappings.
    organisms: dict[str, dict[str, Any]]
    db_name: str
    unique_project_suffix: str
    metadata_mapping: dict[str, dict[str, str]]
    metadata_mapping_mandatory_field_defaults: dict[str, str]
    ena_checklist: str
    use_ena_checklist: bool


def _write_xml_files(directory, files):
    """Write each ``{filename: content}`` pair of *files* into *directory*, creating it if needed."""
    os.makedirs(directory, exist_ok=True)
    logger.info("Writing results to %s", directory)
    for filename, content in files.items():
        with open(os.path.join(directory, filename), "w", encoding="utf-8") as file:
            file.write(content)


@click.command()
@click.option(
    "--data-to-submit",
    # Required: the script cannot do anything without an input file, and
    # leaving it optional only deferred the failure to open(None).
    required=True,
    type=click.Path(exists=True),
    help="JSON file with exactly one entry, keyed by '<accession>.<version>'.",
)
@click.option(
    "--mode",
    required=True,
    type=click.Choice(["project", "sample", "assembly"]),
    help="Which kind of submission files to generate.",
)
@click.option(
    "--center-name",
    required=False,
    type=str,
    default="CENTER_NAME",
    help="Center name to put into the generated files.",
)
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
@click.option(
    "--config-file",
    required=False,
    default=None,
    type=click.Path(exists=True),
    help="YAML config to use instead of config/config.yaml merged with config/defaults.yaml.",
)
def local_ena_submission_generator(
    data_to_submit,
    center_name,
    mode,
    log_level,
    config_file,
):
    """
    Produce output of submission pipeline locally.

    Reads the single entry in --data-to-submit and writes the files for the
    chosen --mode into a directory named after the mode ("project", "sample"
    or "assembly"), then logs a hint on how to submit them. Nothing is sent
    to ENA by this command.
    """
    logger.setLevel(log_level)
    logging.getLogger("requests").setLevel(logging.INFO)

    if config_file is None:
        # Fall back to the module-level config (config.yaml merged with the
        # defaults) so the command also works without --config-file.
        full_config = config or {}
    else:
        with open(config_file, encoding="utf-8") as file:
            full_config = yaml.safe_load(file) or {}
    # Only the keys declared on Config are forwarded; missing keys become [].
    relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
    submission_config = Config(**relevant_config)

    logger.debug("Config: %s", submission_config)

    with open(data_to_submit, encoding="utf-8") as json_file:
        sequences_to_upload: dict[str, Any] = json.load(json_file)

    if not sequences_to_upload:
        logger.error("No entries found in %s", data_to_submit)
        return
    if len(sequences_to_upload) > 1:
        logger.error("Script can only handle one entry at a time")
        return

    # Exactly one entry is left at this point.
    full_accession, data = next(iter(sequences_to_upload.items()))
    accession, version = full_accession.split(".")
    entry = {
        "accession": accession,
        "version": version,
        "group_id": data["metadata"]["groupId"],
        "organism": data["organism"],
        "metadata": data["metadata"],
        "unaligned_nucleotide_sequences": data["unalignedNucleotideSequences"],
    }

    if mode == "project":
        group_info = {"institution": center_name}
        project_set = construct_project_set_object(group_info, submission_config, entry)
        project_xml = get_project_xml(project_set)

        _write_xml_files(
            "project",
            {
                "submission.xml": project_xml["SUBMISSION"],
                "project.xml": project_xml["PROJECT"],
            },
        )

        # The {...} parts are placeholders for the user to fill in.
        logger.info(
            "You can submit the project to ENA using the command: \n"
            "curl -u {params.ena_submission_username}:{params.ena_submission_password} "
            "-F 'SUBMISSION=@{project/submission.xml}' -F 'PROJECT=@{project/project.xml}'"
            " {params.ena_submission_url} > {output}"
            "\n Remember to submit to wwwdev. if you do not want to submit to production"
        )

    elif mode == "sample":
        entry["center_name"] = center_name
        # The same dict serves as both table entries expected by the helper.
        sample_set = construct_sample_set_object(submission_config, entry, entry)
        sample_xml = get_sample_xml(sample_set)

        _write_xml_files(
            "sample",
            {
                "submission.xml": sample_xml["SUBMISSION"],
                "sample.xml": sample_xml["SAMPLE"],
            },
        )

        logger.info(
            "You can submit the sample to ENA using the command: \n"
            "curl -u {params.ena_submission_username}:{params.ena_submission_password} "
            "-F 'SUBMISSION=@{sample/submission.xml}' -F 'SAMPLE=@{sample/sample.xml}'"
            " {params.ena_submission_url} > {output}"
            "\n Remember to submit to wwwdev. if you do not want to submit to production"
        )

    elif mode == "assembly":
        # No real sample/project exists in a dry run, so placeholder
        # accessions stand in for them in the manifest.
        dummy_sample_dict = {"result": {"ena_sample_accession": "BIOSAMPLE_ACCESSION"}}
        dummy_project_dict = {"result": {"bioproject_accession": "BIOPROJECT_ACCESSION"}}

        directory = "assembly"
        os.makedirs(directory, exist_ok=True)
        logger.info("Writing results to %s", directory)

        manifest_object = create_manifest_object(
            submission_config,
            dummy_sample_dict,
            dummy_project_dict,
            entry,
            entry,
            entry,
            dir=directory,
        )
        create_manifest(manifest_object, dir=directory)
        logger.info(
            "You can submit the assembly to ENA using the command: \n"
            "java -jar webin-cli.jar -username {ena_submission_username} "
            "-password {ena_submission_password} -context genome "
            "-manifest {assembly/manifest.tsv} -submit "
            f"-centername {center_name}"
            "\n Remember to submit with -test if you do not want to submit to production"
        )


# Run the click command only when executed as a script, not on import.
if __name__ == "__main__":
    local_ena_submission_generator()
65 changes: 42 additions & 23 deletions ena-submission/scripts/ena_submission_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,14 @@ def get_submission_dict(hold_until_date: str | None = None):
)


def get_project_xml(project_set):
    """
    Serialize a project submission into its two XML documents.

    Returns a dict with the keys "SUBMISSION" (built from a fresh
    get_submission_dict()) and "PROJECT" (built from *project_set*).
    """
    submission_xml = dataclass_to_xml(get_submission_dict(), root_name="SUBMISSION")
    project_xml = dataclass_to_xml(project_set, root_name="PROJECT_SET")
    return {"SUBMISSION": submission_xml, "PROJECT": project_xml}


def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult:
"""
The project creation request should be equivalent to
Expand All @@ -128,13 +136,6 @@ def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationRe
errors = []
warnings = []

def get_project_xml(project_set):
submission_set = get_submission_dict()
return {
"SUBMISSION": dataclass_to_xml(submission_set, root_name="SUBMISSION"),
"PROJECT": dataclass_to_xml(project_set, root_name="PROJECT_SET"),
}

try:
xml = get_project_xml(project_set)
response = post_webin(config, xml)
Expand All @@ -143,6 +144,7 @@ def get_project_xml(project_set):
logger.error(error_message)
errors.append(error_message)
return CreationResult(results=None, errors=errors, warnings=warnings)

if not response.ok:
error_message = (
f"Request failed with status:{response.status_code}. " f"Response: {response.text}."
Expand Down Expand Up @@ -171,6 +173,15 @@ def get_project_xml(project_set):
return CreationResult(result=project_results, errors=errors, warnings=warnings)


def get_sample_xml(sample_set):
    """
    Serialize a sample submission into its two XML documents.

    Returns a dict with the keys "SUBMISSION" (built from a fresh
    get_submission_dict()) and "SAMPLE" (built from *sample_set*).
    """
    submission_xml = dataclass_to_xml(get_submission_dict(), root_name="SUBMISSION")
    sample_xml = dataclass_to_xml(sample_set, root_name="SAMPLE_SET")
    return {"SUBMISSION": submission_xml, "SAMPLE": sample_xml}


def create_ena_sample(config: ENAConfig, sample_set: SampleSetType) -> CreationResult:
"""
The sample creation request should be equivalent to
Expand All @@ -183,13 +194,6 @@ def create_ena_sample(config: ENAConfig, sample_set: SampleSetType) -> CreationR
errors = []
warnings = []

def get_sample_xml(sample_set):
submission_set = get_submission_dict()
files = {
"SUBMISSION": dataclass_to_xml(submission_set, root_name="SUBMISSION"),
"SAMPLE": dataclass_to_xml(sample_set, root_name="SAMPLE_SET"),
}
return files
try:
xml = get_sample_xml(sample_set)
response = post_webin(config, xml)
Expand All @@ -198,6 +202,7 @@ def get_sample_xml(sample_set):
logger.error(error_message)
errors.append(error_message)
return CreationResult(results=None, errors=errors, warnings=warnings)

if not response.ok:
error_message = (
f"Request failed with status:{response.status_code}. "
Expand Down Expand Up @@ -242,13 +247,17 @@ def post_webin(config: ENAConfig, xml: dict[str, Any]) -> requests.Response:
)


def create_chromosome_list(list_object: AssemblyChromosomeListFile) -> str:
def create_chromosome_list(list_object: AssemblyChromosomeListFile, dir: str | None = None) -> str:
"""
Creates a temp file chromosome list:
https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html#chromosome-list-file
"""
with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as temp:
filename = temp.name
if dir:
os.makedirs(dir, exist_ok=True)
filename = os.path.join(dir, "chromosome_list.gz")
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as temp:
filename = temp.name

with gzip.GzipFile(filename, "wb") as gz:
for entry in list_object.chromosomes:
Expand All @@ -260,14 +269,20 @@ def create_chromosome_list(list_object: AssemblyChromosomeListFile) -> str:


def create_fasta(
unaligned_sequences: dict[str, str], chromosome_list: AssemblyChromosomeListFile
unaligned_sequences: dict[str, str],
chromosome_list: AssemblyChromosomeListFile,
dir: str | None = None,
) -> str:
"""
Creates a temp fasta file:
https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html#fasta-file
"""
with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta.gz") as temp:
filename = temp.name
if dir:
os.makedirs(dir, exist_ok=True)
filename = os.path.join(dir, "fasta.gz")
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta.gz") as temp:
filename = temp.name

with gzip.GzipFile(filename, "wb") as gz:
if len(unaligned_sequences.keys()) == 1:
Expand All @@ -282,13 +297,17 @@ def create_fasta(
return filename


def create_manifest(manifest: AssemblyManifest) -> str:
def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
"""
Creates a temp manifest file:
https://ena-docs.readthedocs.io/en/latest/submit/assembly/genome.html#manifest-files
"""
with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp:
filename = temp.name
if dir:
os.makedirs(dir, exist_ok=True)
filename = os.path.join(dir, "manifest.tsv")
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp:
filename = temp.name
with open(filename, "w") as f:
f.write(f"STUDY\t{manifest.study}\n")
f.write(f"SAMPLE\t{manifest.sample}\n")
Expand Down
Loading