Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(deposition): submit EMBL flatfile instead of fasta for better nucleotide metadata #2947

Merged
merged 21 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ena-submission/ENA_submission.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,10 @@ The following could be implement as post-MVP features:
ACGT
```

Potentially a better option is to create flatfiles: https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html, which can include annotations: https://www.ebi.ac.uk/ena/WebFeat/. As we are submitting as a broker it is important that we add the AUTHORS to the assembly: https://ena-docs.readthedocs.io/en/latest/faq/data_brokering.html#authorship.

https://github.com/NBISweden/EMBLmyGFF3 will generate an embl file given a gff3 file, however it needs a gff3 file for the specific sequence that is being submitted. NIH has gff3 for each reference sequence, but we need to create one for each sequence.

4. Submit the files using the webin-cli:

```bash
Expand Down
1 change: 1 addition & 0 deletions ena-submission/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ dependencies:
- psycopg2
- slack_sdk
- xmltodict
- biopython
85 changes: 54 additions & 31 deletions ena-submission/scripts/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
CreationResult,
create_chromosome_list,
create_ena_assembly,
create_fasta,
create_flatfile,
create_manifest,
get_ena_analysis_process,
get_ena_config,
Expand Down Expand Up @@ -84,14 +84,14 @@ def create_chromosome_list_object(
for segment_name in segment_order:
if segment_name != "main":
entry = AssemblyChromosomeListFileObject(
object_name=f"{seq_key["accession"]}.{seq_key["version"]}_{segment_name}",
object_name=f"{seq_key["accession"]}_{segment_name}",
chromosome_name=segment_name,
chromosome_type=chromosome_type,
)
entries.append(entry)
continue
entry = AssemblyChromosomeListFileObject(
object_name=f"{seq_key["accession"]}.{seq_key["version"]}",
object_name=f"{seq_key["accession"]}",
chromosome_name="main",
chromosome_type=chromosome_type,
)
Expand Down Expand Up @@ -138,10 +138,35 @@ def create_manifest_object(
organism_metadata = config.organisms[group_key["organism"]]["ingest"]
chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
fasta_file = create_fasta(
unaligned_sequences=unaligned_nucleotide_sequences,
chromosome_list=chromosome_list_object,
dir = dir
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
collection_date = metadata.get("collectionDate", "Unknown")
country = metadata.get("geoLocCountry", "Unknown")
admin1 = metadata.get("geoLocAdmin1", "")
admin2 = metadata.get("geoLocAdmin2", "")
country = f"{country}:{admin1}, {admin2}"
try:
moleculetype = MoleculeType(organism_metadata.get("molecule_type"))
except ValueError as err:
msg = f"Invalid molecule type: {organism_metadata.get('molecule_type')}"
logger.error(msg)
raise ValueError(msg) from err
organism = organism_metadata.get("scientific_name", "Unknown")
description = (
f"Original sequence submitted to {config.db_name} with accession: "
f"{seq_key["accession"]}, version: {seq_key["version"]}"
)
flat_file = create_flatfile(
unaligned_nucleotide_sequences,
seq_key["accession"],
country=country,
collection_date=collection_date,
description=description,
authors=authors,
moleculetype=moleculetype,
organism=organism,
dir=dir,
)
program = (
metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown"
Expand All @@ -159,18 +184,6 @@ def create_manifest_object(
)
except ValueError:
coverage = 1
try:
moleculetype = (
MoleculeType(metadata["moleculeType"])
if organism_metadata.get("moleculeType")
else None
)
except ValueError:
moleculetype = None
description = (
f"Original sequence submitted to {config.db_name} with accession: "
f"{seq_key["accession"]}, version: {seq_key["version"]}"
)
assembly_name = (
seq_key["accession"]
+ f"{datetime.now(tz=pytz.utc)}".replace(" ", "_").replace("+", "_").replace(":", "_")
Expand All @@ -186,7 +199,7 @@ def create_manifest_object(
coverage=coverage,
program=program,
platform=platform,
fasta=fasta_file,
flatfile=flat_file,
chromosome_list=chromosome_list_file,
description=description,
moleculetype=moleculetype,
Expand Down Expand Up @@ -342,16 +355,22 @@ def assembly_table_create(
error_msg = f"Entry {row["accession"]} not found in project_table"
raise RuntimeError(error_msg)

manifest_object = create_manifest_object(
config,
results_in_sample_table[0],
results_in_project_table[0],
sample_data_in_submission_table[0],
seq_key,
group_key,
test,
)
manifest_file = create_manifest(manifest_object)
try:
manifest_object = create_manifest_object(
config,
results_in_sample_table[0],
results_in_project_table[0],
sample_data_in_submission_table[0],
seq_key,
group_key,
test,
)
manifest_file = create_manifest(manifest_object)
except Exception as e:
logger.error(
f"Manifest creation failed for accession {row["accession"]} with error {e}"
)
continue

update_values = {"status": Status.SUBMITTING}
number_rows_updated = update_db_where_conditions(
Expand All @@ -372,7 +391,11 @@ def assembly_table_create(
sample_data_in_submission_table[0]["unaligned_nucleotide_sequences"]
)
assembly_creation_results: CreationResult = create_ena_assembly(
ena_config, manifest_file, center_name=center_name, test=test
ena_config,
manifest_file,
accession=seq_key["accession"],
center_name=center_name,
test=test,
)
if assembly_creation_results.result:
assembly_creation_results.result["segment_order"] = segment_order
Expand Down
108 changes: 104 additions & 4 deletions ena-submission/scripts/ena_submission_helper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import glob
import gzip
import json
import logging
Expand All @@ -13,12 +14,17 @@
import pytz
import requests
import xmltodict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import FeatureLocation, Reference, SeqFeature
from Bio.SeqRecord import SeqRecord
from ena_types import (
Action,
Actions,
AssemblyChromosomeListFile,
AssemblyManifest,
Hold,
MoleculeType,
ProjectSet,
SampleSetType,
Submission,
Expand Down Expand Up @@ -144,7 +150,7 @@ def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationRe
logger.error(error_message)
errors.append(error_message)
return CreationResult(results=None, errors=errors, warnings=warnings)

if not response.ok:
error_message = (
f"Request failed with status:{response.status_code}. " f"Response: {response.text}."
Expand Down Expand Up @@ -268,6 +274,81 @@ def create_chromosome_list(list_object: AssemblyChromosomeListFile, dir: str | N
return filename


def create_flatfile(
unaligned_sequences: dict[str, str],
accession: str,
description: str,
authors: str,
moleculetype: MoleculeType,
country: str,
collection_date: str,
organism: str,
dir: str | None = None,
) -> str:
if dir:
os.makedirs(dir, exist_ok=True)
filename = os.path.join(dir, "sequences.embl")
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp:
filename = temp.name

seqIO_moleculetype = {
MoleculeType.GENOMIC_DNA: "DNA",
MoleculeType.GENOMIC_RNA: "RNA",
MoleculeType.VIRAL_CRNA: "cRNA",
}

embl_content = []

multi_segment = True
if set(unaligned_sequences.keys()) == {"main"}:
multi_segment = False

for seq_name, sequence_str in unaligned_sequences.items():
if not sequence_str:
continue
reference = Reference()
reference.authors = authors
sequence = SeqRecord(
Seq(sequence_str),
id=f"{accession}_{seq_name}" if multi_segment else accession,
annotations={
"molecule_type": seqIO_moleculetype[moleculetype],
"organism": organism,
"topology": "linear",
"references": [reference],
},
description=description,
)

source_feature = SeqFeature(
FeatureLocation(start=0, end=len(sequence.seq)),
type="source",
qualifiers={
"molecule_type": str(moleculetype),
"organism": organism,
"country": country,
"collection_date": collection_date,
},
)
sequence.features.append(source_feature)

with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp_seq_file:
SeqIO.write(sequence, temp_seq_file.name, "embl")

with open(temp_seq_file.name, encoding="utf-8") as temp_seq_file:
embl_content.append(temp_seq_file.read())

final_content = "\n".join(embl_content)

gzip_filename = filename + ".gz"

with gzip.open(gzip_filename, "wt", encoding="utf-8") as file:
file.write(final_content)

return gzip_filename


def create_fasta(
unaligned_sequences: dict[str, str],
chromosome_list: AssemblyChromosomeListFile,
Expand Down Expand Up @@ -308,6 +389,8 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp:
filename = temp.name
if not manifest.fasta and not manifest.flatfile:
raise ValueError("Either fasta or flatfile must be provided")
with open(filename, "w") as f:
f.write(f"STUDY\t{manifest.study}\n")
f.write(f"SAMPLE\t{manifest.sample}\n")
Expand All @@ -318,7 +401,10 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
f.write(f"COVERAGE\t{manifest.coverage}\n")
f.write(f"PROGRAM\t{manifest.program}\n")
f.write(f"PLATFORM\t{manifest.platform}\n")
f.write(f"FASTA\t{manifest.fasta}\n")
if manifest.flatfile:
f.write(f"FLATFILE\t{manifest.flatfile}\n")
if manifest.fasta:
f.write(f"FASTA\t{manifest.fasta}\n")
f.write(f"CHROMOSOME_LIST\t{manifest.chromosome_list}\n")
if manifest.description:
f.write(f"DESCRIPTION\t{manifest.description}\n")
Expand Down Expand Up @@ -357,7 +443,7 @@ def post_webin_cli(


def create_ena_assembly(
config: ENAConfig, manifest_filename: str, center_name=None, test=True
config: ENAConfig, manifest_filename: str, accession: str = "", center_name=None, test=True
) -> CreationResult:
"""
This is equivalent to running:
Expand All @@ -374,7 +460,21 @@ def create_ena_assembly(
f"Request failed with status:{response.returncode}. "
f"Stdout: {response.stdout}, Stderr: {response.stderr}"
)
logger.warning(error_message)
validate_log_path = f"../tmp/genome/{accession}*/validate/*.report"
matching_files = glob.glob(validate_log_path)

if not matching_files:
logging.error("No .report files found.")
else:
file_path = matching_files[0]
print(f"Matching file found: {file_path}")

try:
with open(file_path, "r") as file:
contents = file.read()
print(f"Contents of the file:\n{contents}")
except Exception as e:
logging.error(f"Error reading file {file_path}: {e}")
errors.append(error_message)
return CreationResult(result=None, errors=errors, warnings=warnings)

Expand Down
3 changes: 2 additions & 1 deletion ena-submission/scripts/ena_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ class AssemblyManifest:
coverage: str
program: str
platform: str
fasta: str
chromosome_list: str
fasta: str | None = None
flatfile: str | None = None
mingaplength: int | None = None
moleculetype: MoleculeType | None = None
description: str | None = None
Expand Down
Loading
Loading