diff --git a/ena-submission/ENA_submission.md b/ena-submission/ENA_submission.md index d12501922..1c1606093 100644 --- a/ena-submission/ENA_submission.md +++ b/ena-submission/ENA_submission.md @@ -267,6 +267,10 @@ The following could be implement as post-MVP features: ACGT ``` +Potentially a better option is to create flatfiles: https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html, which can include annotations: https://www.ebi.ac.uk/ena/WebFeat/. As we are submitting as a broker it is important that we add the AUTHORS to the assembly: https://ena-docs.readthedocs.io/en/latest/faq/data_brokering.html#authorship. + +https://github.com/NBISweden/EMBLmyGFF3 will generate an embl file given a gff3 file, however it needs a gff3 file for the specific sequence that is being submitted. NIH has gff3 for each reference sequence, but we need to create one for each sequence. + 4. Submit the files using the webin-cli: ```bash diff --git a/ena-submission/environment.yml b/ena-submission/environment.yml index e439d9da0..16f919e9c 100644 --- a/ena-submission/environment.yml +++ b/ena-submission/environment.yml @@ -16,3 +16,4 @@ dependencies: - psycopg2 - slack_sdk - xmltodict + - biopython diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py index bf1f28afe..33de3278b 100644 --- a/ena-submission/scripts/create_assembly.py +++ b/ena-submission/scripts/create_assembly.py @@ -11,7 +11,7 @@ CreationResult, create_chromosome_list, create_ena_assembly, - create_fasta, + create_flatfile, create_manifest, get_ena_analysis_process, get_ena_config, @@ -84,14 +84,14 @@ def create_chromosome_list_object( for segment_name in segment_order: if segment_name != "main": entry = AssemblyChromosomeListFileObject( - object_name=f"{seq_key["accession"]}.{seq_key["version"]}_{segment_name}", + object_name=f"{seq_key["accession"]}_{segment_name}", chromosome_name=segment_name, chromosome_type=chromosome_type, ) entries.append(entry) continue entry = AssemblyChromosomeListFileObject( - object_name=f"{seq_key["accession"]}.{seq_key["version"]}", + object_name=f"{seq_key["accession"]}", chromosome_name="main", chromosome_type=chromosome_type, ) @@ -138,10 +138,35 @@ def create_manifest_object( organism_metadata = config.organisms[group_key["organism"]]["ingest"] chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key) chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir) - fasta_file = create_fasta( - unaligned_sequences=unaligned_nucleotide_sequences, - chromosome_list=chromosome_list_object, - dir = dir + authors = ( + metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown") + ) + collection_date = metadata.get("collectionDate", "Unknown") + country = metadata.get("geoLocCountry", "Unknown") + admin1 = metadata.get("geoLocAdmin1", "") + admin2 = metadata.get("geoLocAdmin2", "") + country = f"{country}:{admin1}, {admin2}" + try: + moleculetype = MoleculeType(organism_metadata.get("molecule_type")) + except ValueError as err: + msg = f"Invalid molecule type: {organism_metadata.get('molecule_type')}" + logger.error(msg) + raise ValueError(msg) from err + organism = organism_metadata.get("scientific_name", "Unknown") + description = ( + f"Original sequence submitted to {config.db_name} with accession: " + f"{seq_key["accession"]}, version: {seq_key["version"]}" + ) + flat_file = create_flatfile( + unaligned_nucleotide_sequences, + seq_key["accession"], + country=country, + collection_date=collection_date, + description=description, + authors=authors, + moleculetype=moleculetype, + organism=organism, + dir=dir, ) program = ( metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown" @@ -159,18 +184,6 @@ def create_manifest_object( ) except ValueError: coverage = 1 - try: - moleculetype = ( - MoleculeType(metadata["moleculeType"]) - if organism_metadata.get("moleculeType") - else None - ) - except ValueError: - moleculetype = None - description = ( - f"Original sequence submitted to {config.db_name} with accession: " - f"{seq_key["accession"]}, version: {seq_key["version"]}" - ) assembly_name = ( seq_key["accession"] + f"{datetime.now(tz=pytz.utc)}".replace(" ", "_").replace("+", "_").replace(":", "_") @@ -186,7 +199,7 @@ def create_manifest_object( coverage=coverage, program=program, platform=platform, - fasta=fasta_file, + flatfile=flat_file, chromosome_list=chromosome_list_file, description=description, moleculetype=moleculetype, @@ -342,16 +355,22 @@ def assembly_table_create( error_msg = f"Entry {row["accession"]} not found in project_table" raise RuntimeError(error_msg) - manifest_object = create_manifest_object( - config, - results_in_sample_table[0], - results_in_project_table[0], - sample_data_in_submission_table[0], - seq_key, - group_key, - test, - ) - manifest_file = create_manifest(manifest_object) + try: + manifest_object = create_manifest_object( + config, + results_in_sample_table[0], + results_in_project_table[0], + sample_data_in_submission_table[0], + seq_key, + group_key, + test, + ) + manifest_file = create_manifest(manifest_object) + except Exception as e: + logger.error( + f"Manifest creation failed for accession {row["accession"]} with error {e}" + ) + continue update_values = {"status": Status.SUBMITTING} number_rows_updated = update_db_where_conditions( @@ -372,7 +391,11 @@ def assembly_table_create( sample_data_in_submission_table[0]["unaligned_nucleotide_sequences"] ) assembly_creation_results: CreationResult = create_ena_assembly( - ena_config, manifest_file, center_name=center_name, test=test + ena_config, + manifest_file, + accession=seq_key["accession"], + center_name=center_name, + test=test, ) if assembly_creation_results.result: assembly_creation_results.result["segment_order"] = segment_order diff --git a/ena-submission/scripts/ena_submission_helper.py b/ena-submission/scripts/ena_submission_helper.py index 35283e4b6..b0a3ebae7 100644 --- a/ena-submission/scripts/ena_submission_helper.py +++ b/ena-submission/scripts/ena_submission_helper.py @@ -1,4 +1,5 @@ import datetime +import glob import gzip import json import logging @@ -13,12 +14,17 @@ import pytz import requests import xmltodict +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqFeature import FeatureLocation, Reference, SeqFeature +from Bio.SeqRecord import SeqRecord from ena_types import ( Action, Actions, AssemblyChromosomeListFile, AssemblyManifest, Hold, + MoleculeType, ProjectSet, SampleSetType, Submission, @@ -144,7 +150,7 @@ def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationRe logger.error(error_message) errors.append(error_message) return CreationResult(results=None, errors=errors, warnings=warnings) - + if not response.ok: error_message = ( f"Request failed with status:{response.status_code}. " f"Response: {response.text}." @@ -268,6 +274,81 @@ def create_chromosome_list(list_object: AssemblyChromosomeListFile, dir: str | N return filename +def create_flatfile( + unaligned_sequences: dict[str, str], + accession: str, + description: str, + authors: str, + moleculetype: MoleculeType, + country: str, + collection_date: str, + organism: str, + dir: str | None = None, +) -> str: + if dir: + os.makedirs(dir, exist_ok=True) + filename = os.path.join(dir, "sequences.embl") + else: + with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp: + filename = temp.name + + seqIO_moleculetype = { + MoleculeType.GENOMIC_DNA: "DNA", + MoleculeType.GENOMIC_RNA: "RNA", + MoleculeType.VIRAL_CRNA: "cRNA", + } + + embl_content = [] + + multi_segment = True + if set(unaligned_sequences.keys()) == {"main"}: + multi_segment = False + + for seq_name, sequence_str in unaligned_sequences.items(): + if not sequence_str: + continue + reference = Reference() + reference.authors = authors + sequence = SeqRecord( + Seq(sequence_str), + id=f"{accession}_{seq_name}" if multi_segment else accession, + annotations={ + "molecule_type": seqIO_moleculetype[moleculetype], + "organism": organism, + "topology": "linear", + "references": [reference], + }, + description=description, + ) + + source_feature = SeqFeature( + FeatureLocation(start=0, end=len(sequence.seq)), + type="source", + qualifiers={ + "molecule_type": str(moleculetype), + "organism": organism, + "country": country, + "collection_date": collection_date, + }, + ) + sequence.features.append(source_feature) + + with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp_seq_file: + SeqIO.write(sequence, temp_seq_file.name, "embl") + + with open(temp_seq_file.name, encoding="utf-8") as temp_seq_file: + embl_content.append(temp_seq_file.read()) + + final_content = "\n".join(embl_content) + + gzip_filename = filename + ".gz" + + with gzip.open(gzip_filename, "wt", encoding="utf-8") as file: + file.write(final_content) + + return gzip_filename + + def create_fasta( unaligned_sequences: dict[str, str], chromosome_list: AssemblyChromosomeListFile, @@ -308,6 +389,8 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str: else: with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp: filename = temp.name + if not manifest.fasta and not manifest.flatfile: + raise ValueError("Either fasta or flatfile must be provided") with open(filename, "w") as f: f.write(f"STUDY\t{manifest.study}\n") f.write(f"SAMPLE\t{manifest.sample}\n") @@ -318,7 +401,10 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str: f.write(f"COVERAGE\t{manifest.coverage}\n") f.write(f"PROGRAM\t{manifest.program}\n") f.write(f"PLATFORM\t{manifest.platform}\n") - f.write(f"FASTA\t{manifest.fasta}\n") + if manifest.flatfile: + f.write(f"FLATFILE\t{manifest.flatfile}\n") + if manifest.fasta: + f.write(f"FASTA\t{manifest.fasta}\n") f.write(f"CHROMOSOME_LIST\t{manifest.chromosome_list}\n") if manifest.description: f.write(f"DESCRIPTION\t{manifest.description}\n") @@ -357,7 +443,7 @@ def post_webin_cli( def create_ena_assembly( - config: ENAConfig, manifest_filename: str, center_name=None, test=True + config: ENAConfig, manifest_filename: str, accession: str = "", center_name=None, test=True ) -> CreationResult: """ This is equivalent to running: @@ -374,7 +460,21 @@ def create_ena_assembly( f"Request failed with status:{response.returncode}. " f"Stdout: {response.stdout}, Stderr: {response.stderr}" ) - logger.warning(error_message) + validate_log_path = f"../tmp/genome/{accession}*/validate/*.report" + matching_files = glob.glob(validate_log_path) + + if not matching_files: + logging.error("No .report files found.") + else: + file_path = matching_files[0] + print(f"Matching file found: {file_path}") + + try: + with open(file_path, "r") as file: + contents = file.read() + print(f"Contents of the file:\n{contents}") + except Exception as e: + logging.error(f"Error reading file {file_path}: {e}") errors.append(error_message) return CreationResult(result=None, errors=errors, warnings=warnings) diff --git a/ena-submission/scripts/ena_types.py b/ena-submission/scripts/ena_types.py index e08ae5e7f..fbb163d99 100644 --- a/ena-submission/scripts/ena_types.py +++ b/ena-submission/scripts/ena_types.py @@ -181,8 +181,9 @@ class AssemblyManifest: coverage: str program: str platform: str - fasta: str chromosome_list: str + fasta: str | None = None + flatfile: str | None = None mingaplength: int | None = None moleculetype: MoleculeType | None = None description: str | None = None diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py index 8b277ac87..1bee66a27 100644 --- a/ena-submission/scripts/test_ena_submission.py +++ b/ena-submission/scripts/test_ena_submission.py @@ -43,7 +43,11 @@ def mock_config(): config = mock.Mock() config.db_name = "Loculus" config.unique_project_suffix = "Test suffix" - metadata_dict = {"taxon_id": "Test taxon", "scientific_name": "Test scientific name"} + metadata_dict = { + "taxon_id": "Test taxon", + "scientific_name": "Test scientific name", + "molecule_type": "genomic RNA", + } config.organisms = {"Test organism": {"ingest": metadata_dict}} config.metadata_mapping = defaults["metadata_mapping"] config.metadata_mapping_mandatory_field_defaults = defaults[ @@ -191,7 +195,7 @@ def test_create_chromosome_list_multi_segment(self): self.assertEqual( content, - b"test_accession.test_version_seg2\tseg2\tlinear-segmented\ntest_accession.test_version_seg3\tseg3\tlinear-segmented\n", + b"test_accession_seg2\tseg2\tlinear-segmented\ntest_accession_seg3\tseg3\tlinear-segmented\n", ) def test_create_chromosome_list(self): @@ -203,7 +207,7 @@ def test_create_chromosome_list(self): self.assertEqual( content, - b"test_accession.test_version\tmain\tlinear-segmented\n", + b"test_accession\tmain\tlinear-segmented\n", ) def test_create_fasta_multi(self): @@ -216,7 +220,7 @@ def test_create_fasta_multi(self): content = gz.read() self.assertEqual( content, - b">test_accession.test_version_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>test_accession.test_version_seg3\nCTTAACTTTGAGAGAGTGAATT\n", + b">test_accession_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>test_accession_seg3\nCTTAACTTTGAGAGAGTGAATT\n", ) def test_create_fasta(self): @@ -227,7 +231,7 @@ def test_create_fasta(self): content = gz.read() self.assertEqual( content, - b">test_accession.test_version\nCTTAACTTTGAGAGAGTGAATT\n", + b">test_accession\nCTTAACTTTGAGAGAGTGAATT\n", ) def test_create_manifest(self): @@ -256,7 +260,7 @@ def test_create_manifest(self): data[key] = value # Temp file names are different data.pop("CHROMOSOME_LIST") - data.pop("FASTA") + data.pop("FLATFILE") expected_data = { "STUDY": study_accession, "SAMPLE": sample_accession, @@ -266,6 +270,7 @@ def test_create_manifest(self): "PROGRAM": "Unknown", "PLATFORM": "Illumina", "DESCRIPTION": "Original sequence submitted to Loculus with accession: test_accession, version: test_version", + "MOLECULETYPE": "genomic RNA", } self.assertEqual(data, expected_data) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 195cc9067..b38c10e24 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1096,6 +1096,7 @@ defaultOrganismConfig: &defaultOrganismConfig configFile: &ingestConfigFile taxon_id: 186538 scientific_name: "Zaire ebolavirus" + molecule_type: "genomic RNA" referenceGenomes: nucleotideSequences: - name: "main" @@ -1180,6 +1181,7 @@ defaultOrganisms: configFile: taxon_id: 3048448 scientific_name: "West Nile virus" + molecule_type: "genomic RNA" referenceGenomes: nucleotideSequences: - name: main @@ -1408,6 +1410,7 @@ defaultOrganisms: <<: *ingestConfigFile taxon_id: 3052518 scientific_name: "Orthonairovirus haemorrhagiae" + molecule_type: "genomic RNA" nucleotide_sequences: - L - M