Skip to content

Commit

Permalink
feat(deposition): submit EMBL flatfile instead of fasta for better nu…
Browse files Browse the repository at this point in the history
…cleotide metadata (#2947)



---------

Co-authored-by: Cornelius Roemer <[email protected]>
  • Loading branch information
anna-parker and corneliusroemer authored Oct 4, 2024
1 parent 8bca886 commit 60aa2e4
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 42 deletions.
4 changes: 4 additions & 0 deletions ena-submission/ENA_submission.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,10 @@ The following could be implement as post-MVP features:
ACGT
```

Potentially a better option is to create flatfiles: https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html, which can include annotations: https://www.ebi.ac.uk/ena/WebFeat/. As we are submitting as a broker it is important that we add the AUTHORS to the assembly: https://ena-docs.readthedocs.io/en/latest/faq/data_brokering.html#authorship.

https://github.com/NBISweden/EMBLmyGFF3 will generate an embl file given a gff3 file, however it needs a gff3 file for the specific sequence that is being submitted. NIH has gff3 for each reference sequence, but we need to create one for each sequence.

4. Submit the files using the webin-cli:

```bash
Expand Down
1 change: 1 addition & 0 deletions ena-submission/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ dependencies:
- psycopg2
- slack_sdk
- xmltodict
- biopython
85 changes: 54 additions & 31 deletions ena-submission/scripts/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
CreationResult,
create_chromosome_list,
create_ena_assembly,
create_fasta,
create_flatfile,
create_manifest,
get_ena_analysis_process,
get_ena_config,
Expand Down Expand Up @@ -84,14 +84,14 @@ def create_chromosome_list_object(
for segment_name in segment_order:
if segment_name != "main":
entry = AssemblyChromosomeListFileObject(
object_name=f"{seq_key["accession"]}.{seq_key["version"]}_{segment_name}",
object_name=f"{seq_key["accession"]}_{segment_name}",
chromosome_name=segment_name,
chromosome_type=chromosome_type,
)
entries.append(entry)
continue
entry = AssemblyChromosomeListFileObject(
object_name=f"{seq_key["accession"]}.{seq_key["version"]}",
object_name=f"{seq_key["accession"]}",
chromosome_name="main",
chromosome_type=chromosome_type,
)
Expand Down Expand Up @@ -138,10 +138,35 @@ def create_manifest_object(
organism_metadata = config.organisms[group_key["organism"]]["ingest"]
chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
fasta_file = create_fasta(
unaligned_sequences=unaligned_nucleotide_sequences,
chromosome_list=chromosome_list_object,
dir = dir
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
collection_date = metadata.get("collectionDate", "Unknown")
country = metadata.get("geoLocCountry", "Unknown")
admin1 = metadata.get("geoLocAdmin1", "")
admin2 = metadata.get("geoLocAdmin2", "")
country = f"{country}:{admin1}, {admin2}"
try:
moleculetype = MoleculeType(organism_metadata.get("molecule_type"))
except ValueError as err:
msg = f"Invalid molecule type: {organism_metadata.get('molecule_type')}"
logger.error(msg)
raise ValueError(msg) from err
organism = organism_metadata.get("scientific_name", "Unknown")
description = (
f"Original sequence submitted to {config.db_name} with accession: "
f"{seq_key["accession"]}, version: {seq_key["version"]}"
)
flat_file = create_flatfile(
unaligned_nucleotide_sequences,
seq_key["accession"],
country=country,
collection_date=collection_date,
description=description,
authors=authors,
moleculetype=moleculetype,
organism=organism,
dir=dir,
)
program = (
metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown"
Expand All @@ -159,18 +184,6 @@ def create_manifest_object(
)
except ValueError:
coverage = 1
try:
moleculetype = (
MoleculeType(metadata["moleculeType"])
if organism_metadata.get("moleculeType")
else None
)
except ValueError:
moleculetype = None
description = (
f"Original sequence submitted to {config.db_name} with accession: "
f"{seq_key["accession"]}, version: {seq_key["version"]}"
)
assembly_name = (
seq_key["accession"]
+ f"{datetime.now(tz=pytz.utc)}".replace(" ", "_").replace("+", "_").replace(":", "_")
Expand All @@ -186,7 +199,7 @@ def create_manifest_object(
coverage=coverage,
program=program,
platform=platform,
fasta=fasta_file,
flatfile=flat_file,
chromosome_list=chromosome_list_file,
description=description,
moleculetype=moleculetype,
Expand Down Expand Up @@ -342,16 +355,22 @@ def assembly_table_create(
error_msg = f"Entry {row["accession"]} not found in project_table"
raise RuntimeError(error_msg)

manifest_object = create_manifest_object(
config,
results_in_sample_table[0],
results_in_project_table[0],
sample_data_in_submission_table[0],
seq_key,
group_key,
test,
)
manifest_file = create_manifest(manifest_object)
try:
manifest_object = create_manifest_object(
config,
results_in_sample_table[0],
results_in_project_table[0],
sample_data_in_submission_table[0],
seq_key,
group_key,
test,
)
manifest_file = create_manifest(manifest_object)
except Exception as e:
logger.error(
f"Manifest creation failed for accession {row["accession"]} with error {e}"
)
continue

update_values = {"status": Status.SUBMITTING}
number_rows_updated = update_db_where_conditions(
Expand All @@ -372,7 +391,11 @@ def assembly_table_create(
sample_data_in_submission_table[0]["unaligned_nucleotide_sequences"]
)
assembly_creation_results: CreationResult = create_ena_assembly(
ena_config, manifest_file, center_name=center_name, test=test
ena_config,
manifest_file,
accession=seq_key["accession"],
center_name=center_name,
test=test,
)
if assembly_creation_results.result:
assembly_creation_results.result["segment_order"] = segment_order
Expand Down
108 changes: 104 additions & 4 deletions ena-submission/scripts/ena_submission_helper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import glob
import gzip
import json
import logging
Expand All @@ -13,12 +14,17 @@
import pytz
import requests
import xmltodict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import FeatureLocation, Reference, SeqFeature
from Bio.SeqRecord import SeqRecord
from ena_types import (
Action,
Actions,
AssemblyChromosomeListFile,
AssemblyManifest,
Hold,
MoleculeType,
ProjectSet,
SampleSetType,
Submission,
Expand Down Expand Up @@ -144,7 +150,7 @@ def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationRe
logger.error(error_message)
errors.append(error_message)
return CreationResult(results=None, errors=errors, warnings=warnings)

if not response.ok:
error_message = (
f"Request failed with status:{response.status_code}. " f"Response: {response.text}."
Expand Down Expand Up @@ -268,6 +274,81 @@ def create_chromosome_list(list_object: AssemblyChromosomeListFile, dir: str | N
return filename


def create_flatfile(
unaligned_sequences: dict[str, str],
accession: str,
description: str,
authors: str,
moleculetype: MoleculeType,
country: str,
collection_date: str,
organism: str,
dir: str | None = None,
) -> str:
if dir:
os.makedirs(dir, exist_ok=True)
filename = os.path.join(dir, "sequences.embl")
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp:
filename = temp.name

seqIO_moleculetype = {
MoleculeType.GENOMIC_DNA: "DNA",
MoleculeType.GENOMIC_RNA: "RNA",
MoleculeType.VIRAL_CRNA: "cRNA",
}

embl_content = []

multi_segment = True
if set(unaligned_sequences.keys()) == {"main"}:
multi_segment = False

for seq_name, sequence_str in unaligned_sequences.items():
if not sequence_str:
continue
reference = Reference()
reference.authors = authors
sequence = SeqRecord(
Seq(sequence_str),
id=f"{accession}_{seq_name}" if multi_segment else accession,
annotations={
"molecule_type": seqIO_moleculetype[moleculetype],
"organism": organism,
"topology": "linear",
"references": [reference],
},
description=description,
)

source_feature = SeqFeature(
FeatureLocation(start=0, end=len(sequence.seq)),
type="source",
qualifiers={
"molecule_type": str(moleculetype),
"organism": organism,
"country": country,
"collection_date": collection_date,
},
)
sequence.features.append(source_feature)

with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp_seq_file:
SeqIO.write(sequence, temp_seq_file.name, "embl")

with open(temp_seq_file.name, encoding="utf-8") as temp_seq_file:
embl_content.append(temp_seq_file.read())

final_content = "\n".join(embl_content)

gzip_filename = filename + ".gz"

with gzip.open(gzip_filename, "wt", encoding="utf-8") as file:
file.write(final_content)

return gzip_filename


def create_fasta(
unaligned_sequences: dict[str, str],
chromosome_list: AssemblyChromosomeListFile,
Expand Down Expand Up @@ -308,6 +389,8 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp:
filename = temp.name
if not manifest.fasta and not manifest.flatfile:
raise ValueError("Either fasta or flatfile must be provided")
with open(filename, "w") as f:
f.write(f"STUDY\t{manifest.study}\n")
f.write(f"SAMPLE\t{manifest.sample}\n")
Expand All @@ -318,7 +401,10 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
f.write(f"COVERAGE\t{manifest.coverage}\n")
f.write(f"PROGRAM\t{manifest.program}\n")
f.write(f"PLATFORM\t{manifest.platform}\n")
f.write(f"FASTA\t{manifest.fasta}\n")
if manifest.flatfile:
f.write(f"FLATFILE\t{manifest.flatfile}\n")
if manifest.fasta:
f.write(f"FASTA\t{manifest.fasta}\n")
f.write(f"CHROMOSOME_LIST\t{manifest.chromosome_list}\n")
if manifest.description:
f.write(f"DESCRIPTION\t{manifest.description}\n")
Expand Down Expand Up @@ -357,7 +443,7 @@ def post_webin_cli(


def create_ena_assembly(
config: ENAConfig, manifest_filename: str, center_name=None, test=True
config: ENAConfig, manifest_filename: str, accession: str = "", center_name=None, test=True
) -> CreationResult:
"""
This is equivalent to running:
Expand All @@ -374,7 +460,21 @@ def create_ena_assembly(
f"Request failed with status:{response.returncode}. "
f"Stdout: {response.stdout}, Stderr: {response.stderr}"
)
logger.warning(error_message)
validate_log_path = f"../tmp/genome/{accession}*/validate/*.report"
matching_files = glob.glob(validate_log_path)

if not matching_files:
logging.error("No .report files found.")
else:
file_path = matching_files[0]
print(f"Matching file found: {file_path}")

try:
with open(file_path, "r") as file:
contents = file.read()
print(f"Contents of the file:\n{contents}")
except Exception as e:
logging.error(f"Error reading file {file_path}: {e}")
errors.append(error_message)
return CreationResult(result=None, errors=errors, warnings=warnings)

Expand Down
3 changes: 2 additions & 1 deletion ena-submission/scripts/ena_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ class AssemblyManifest:
coverage: str
program: str
platform: str
fasta: str
chromosome_list: str
fasta: str | None = None
flatfile: str | None = None
mingaplength: int | None = None
moleculetype: MoleculeType | None = None
description: str | None = None
Expand Down
Loading

0 comments on commit 60aa2e4

Please sign in to comment.