feat(deposition): submit EMBL flatfile instead of fasta for better nu…

…cleotide metadata (#2947) --------- Co-authored-by: Cornelius Roemer <[email protected]>
loculus-project · Oct 4, 2024 · 60aa2e4 · 60aa2e4
1 parent 8bca886
commit 60aa2e4
Show file tree

Hide file tree

Showing 7 changed files with 179 additions and 42 deletions.
diff --git a/ena-submission/ENA_submission.md b/ena-submission/ENA_submission.md
@@ -267,6 +267,10 @@ The following could be implement as post-MVP features:
    ACGT
    ```
 
+Potentially a better option is to create flatfiles: https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html, which can include annotations: https://www.ebi.ac.uk/ena/WebFeat/. As we are submitting as a broker it is important that we add the AUTHORS to the assembly: https://ena-docs.readthedocs.io/en/latest/faq/data_brokering.html#authorship. 
+
+https://github.com/NBISweden/EMBLmyGFF3 will generate an embl file given a gff3 file, however it needs a gff3 file for the specific sequence that is being submitted. NIH has gff3 for each reference sequence, but we need to create one for each sequence.
+
 4. Submit the files using the webin-cli:
 
    ```bash

diff --git a/ena-submission/environment.yml b/ena-submission/environment.yml
@@ -16,3 +16,4 @@ dependencies:
   - psycopg2
   - slack_sdk
   - xmltodict
+  - biopython
diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py
@@ -11,7 +11,7 @@
     CreationResult,
     create_chromosome_list,
     create_ena_assembly,
-    create_fasta,
+    create_flatfile,
     create_manifest,
     get_ena_analysis_process,
     get_ena_config,
@@ -84,14 +84,14 @@ def create_chromosome_list_object(
     for segment_name in segment_order:
         if segment_name != "main":
             entry = AssemblyChromosomeListFileObject(
-                object_name=f"{seq_key["accession"]}.{seq_key["version"]}_{segment_name}",
+                object_name=f"{seq_key["accession"]}_{segment_name}",
                 chromosome_name=segment_name,
                 chromosome_type=chromosome_type,
             )
             entries.append(entry)
             continue
         entry = AssemblyChromosomeListFileObject(
-            object_name=f"{seq_key["accession"]}.{seq_key["version"]}",
+            object_name=f"{seq_key["accession"]}",
             chromosome_name="main",
             chromosome_type=chromosome_type,
         )
@@ -138,10 +138,35 @@ def create_manifest_object(
     organism_metadata = config.organisms[group_key["organism"]]["ingest"]
     chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
     chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
-    fasta_file = create_fasta(
-        unaligned_sequences=unaligned_nucleotide_sequences,
-        chromosome_list=chromosome_list_object,
-        dir = dir
+    authors = (
+        metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
+    )
+    collection_date = metadata.get("collectionDate", "Unknown")
+    country = metadata.get("geoLocCountry", "Unknown")
+    admin1 = metadata.get("geoLocAdmin1", "")
+    admin2 = metadata.get("geoLocAdmin2", "")
+    country = f"{country}:{admin1}, {admin2}"
+    try:
+        moleculetype = MoleculeType(organism_metadata.get("molecule_type"))
+    except ValueError as err:
+        msg = f"Invalid molecule type: {organism_metadata.get('molecule_type')}"
+        logger.error(msg)
+        raise ValueError(msg) from err
+    organism = organism_metadata.get("scientific_name", "Unknown")
+    description = (
+        f"Original sequence submitted to {config.db_name} with accession: "
+        f"{seq_key["accession"]}, version: {seq_key["version"]}"
+    )
+    flat_file = create_flatfile(
+        unaligned_nucleotide_sequences,
+        seq_key["accession"],
+        country=country,
+        collection_date=collection_date,
+        description=description,
+        authors=authors,
+        moleculetype=moleculetype,
+        organism=organism,
+        dir=dir,
     )
     program = (
         metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown"
@@ -159,18 +184,6 @@ def create_manifest_object(
         )
     except ValueError:
         coverage = 1
-    try:
-        moleculetype = (
-            MoleculeType(metadata["moleculeType"])
-            if organism_metadata.get("moleculeType")
-            else None
-        )
-    except ValueError:
-        moleculetype = None
-    description = (
-        f"Original sequence submitted to {config.db_name} with accession: "
-        f"{seq_key["accession"]}, version: {seq_key["version"]}"
-    )
     assembly_name = (
         seq_key["accession"]
         + f"{datetime.now(tz=pytz.utc)}".replace(" ", "_").replace("+", "_").replace(":", "_")
@@ -186,7 +199,7 @@ def create_manifest_object(
         coverage=coverage,
         program=program,
         platform=platform,
-        fasta=fasta_file,
+        flatfile=flat_file,
         chromosome_list=chromosome_list_file,
         description=description,
         moleculetype=moleculetype,
@@ -342,16 +355,22 @@ def assembly_table_create(
             error_msg = f"Entry {row["accession"]} not found in project_table"
             raise RuntimeError(error_msg)
 
-        manifest_object = create_manifest_object(
-            config,
-            results_in_sample_table[0],
-            results_in_project_table[0],
-            sample_data_in_submission_table[0],
-            seq_key,
-            group_key,
-            test,
-        )
-        manifest_file = create_manifest(manifest_object)
+        try:
+            manifest_object = create_manifest_object(
+                config,
+                results_in_sample_table[0],
+                results_in_project_table[0],
+                sample_data_in_submission_table[0],
+                seq_key,
+                group_key,
+                test,
+            )
+            manifest_file = create_manifest(manifest_object)
+        except Exception as e:
+            logger.error(
+                f"Manifest creation failed for accession {row["accession"]} with error {e}"
+            )
+            continue
 
         update_values = {"status": Status.SUBMITTING}
         number_rows_updated = update_db_where_conditions(
@@ -372,7 +391,11 @@ def assembly_table_create(
             sample_data_in_submission_table[0]["unaligned_nucleotide_sequences"]
         )
         assembly_creation_results: CreationResult = create_ena_assembly(
-            ena_config, manifest_file, center_name=center_name, test=test
+            ena_config,
+            manifest_file,
+            accession=seq_key["accession"],
+            center_name=center_name,
+            test=test,
         )
         if assembly_creation_results.result:
             assembly_creation_results.result["segment_order"] = segment_order

diff --git a/ena-submission/scripts/ena_submission_helper.py b/ena-submission/scripts/ena_submission_helper.py
@@ -1,4 +1,5 @@
 import datetime
+import glob
 import gzip
 import json
 import logging
@@ -13,12 +14,17 @@
 import pytz
 import requests
 import xmltodict
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqFeature import FeatureLocation, Reference, SeqFeature
+from Bio.SeqRecord import SeqRecord
 from ena_types import (
     Action,
     Actions,
     AssemblyChromosomeListFile,
     AssemblyManifest,
     Hold,
+    MoleculeType,
     ProjectSet,
     SampleSetType,
     Submission,
@@ -144,7 +150,7 @@ def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationRe
         logger.error(error_message)
         errors.append(error_message)
         return CreationResult(results=None, errors=errors, warnings=warnings)
- 
+
     if not response.ok:
         error_message = (
             f"Request failed with status:{response.status_code}. " f"Response: {response.text}."
@@ -268,6 +274,81 @@ def create_chromosome_list(list_object: AssemblyChromosomeListFile, dir: str | N
     return filename
 
 
+def create_flatfile(
+    unaligned_sequences: dict[str, str],
+    accession: str,
+    description: str,
+    authors: str,
+    moleculetype: MoleculeType,
+    country: str,
+    collection_date: str,
+    organism: str,
+    dir: str | None = None,
+) -> str:
+    if dir:
+        os.makedirs(dir, exist_ok=True)
+        filename = os.path.join(dir, "sequences.embl")
+    else:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp:
+            filename = temp.name
+
+    seqIO_moleculetype = {
+        MoleculeType.GENOMIC_DNA: "DNA",
+        MoleculeType.GENOMIC_RNA: "RNA",
+        MoleculeType.VIRAL_CRNA: "cRNA",
+    }
+
+    embl_content = []
+
+    multi_segment = True
+    if set(unaligned_sequences.keys()) == {"main"}:
+        multi_segment = False
+
+    for seq_name, sequence_str in unaligned_sequences.items():
+        if not sequence_str:
+            continue
+        reference = Reference()
+        reference.authors = authors
+        sequence = SeqRecord(
+            Seq(sequence_str),
+            id=f"{accession}_{seq_name}" if multi_segment else accession,
+            annotations={
+                "molecule_type": seqIO_moleculetype[moleculetype],
+                "organism": organism,
+                "topology": "linear",
+                "references": [reference],
+            },
+            description=description,
+        )
+
+        source_feature = SeqFeature(
+            FeatureLocation(start=0, end=len(sequence.seq)),
+            type="source",
+            qualifiers={
+                "molecule_type": str(moleculetype),
+                "organism": organism,
+                "country": country,
+                "collection_date": collection_date,
+            },
+        )
+        sequence.features.append(source_feature)
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp_seq_file:
+            SeqIO.write(sequence, temp_seq_file.name, "embl")
+
+        with open(temp_seq_file.name, encoding="utf-8") as temp_seq_file:
+            embl_content.append(temp_seq_file.read())
+
+    final_content = "\n".join(embl_content)
+
+    gzip_filename = filename + ".gz"
+
+    with gzip.open(gzip_filename, "wt", encoding="utf-8") as file:
+        file.write(final_content)
+
+    return gzip_filename
+
+
 def create_fasta(
     unaligned_sequences: dict[str, str],
     chromosome_list: AssemblyChromosomeListFile,
@@ -308,6 +389,8 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
     else:
         with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp:
             filename = temp.name
+    if not manifest.fasta and not manifest.flatfile:
+        raise ValueError("Either fasta or flatfile must be provided")
     with open(filename, "w") as f:
         f.write(f"STUDY\t{manifest.study}\n")
         f.write(f"SAMPLE\t{manifest.sample}\n")
@@ -318,7 +401,10 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
         f.write(f"COVERAGE\t{manifest.coverage}\n")
         f.write(f"PROGRAM\t{manifest.program}\n")
         f.write(f"PLATFORM\t{manifest.platform}\n")
-        f.write(f"FASTA\t{manifest.fasta}\n")
+        if manifest.flatfile:
+            f.write(f"FLATFILE\t{manifest.flatfile}\n")
+        if manifest.fasta:
+            f.write(f"FASTA\t{manifest.fasta}\n")
         f.write(f"CHROMOSOME_LIST\t{manifest.chromosome_list}\n")
         if manifest.description:
             f.write(f"DESCRIPTION\t{manifest.description}\n")
@@ -357,7 +443,7 @@ def post_webin_cli(
 
 
 def create_ena_assembly(
-    config: ENAConfig, manifest_filename: str, center_name=None, test=True
+    config: ENAConfig, manifest_filename: str, accession: str = "", center_name=None, test=True
 ) -> CreationResult:
     """
     This is equivalent to running:
@@ -374,7 +460,21 @@ def create_ena_assembly(
             f"Request failed with status:{response.returncode}. "
             f"Stdout: {response.stdout}, Stderr: {response.stderr}"
         )
-        logger.warning(error_message)
+        validate_log_path = f"../tmp/genome/{accession}*/validate/*.report"
+        matching_files = glob.glob(validate_log_path)
+
+        if not matching_files:
+            logging.error("No .report files found.")
+        else:
+            file_path = matching_files[0]
+            print(f"Matching file found: {file_path}")
+
+            try:
+                with open(file_path, "r") as file:
+                    contents = file.read()
+                    print(f"Contents of the file:\n{contents}")
+            except Exception as e:
+                logging.error(f"Error reading file {file_path}: {e}")
         errors.append(error_message)
         return CreationResult(result=None, errors=errors, warnings=warnings)
 

diff --git a/ena-submission/scripts/ena_types.py b/ena-submission/scripts/ena_types.py
@@ -181,8 +181,9 @@ class AssemblyManifest:
     coverage: str
     program: str
     platform: str
-    fasta: str
     chromosome_list: str
+    fasta: str | None = None
+    flatfile: str | None = None
     mingaplength: int | None = None
     moleculetype: MoleculeType | None = None
     description: str | None = None