Skip to content

Commit

Permalink
Switch from a fasta to a flatfile with authors
Browse files Browse the repository at this point in the history
  • Loading branch information
anna-parker committed Oct 4, 2024
1 parent 154ca3c commit 3d078fa
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 21 deletions.
4 changes: 4 additions & 0 deletions ena-submission/ENA_submission.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,10 @@ The following could be implement as post-MVP features:
ACGT
```

Potentially a better option is to create flatfiles: https://ena-docs.readthedocs.io/en/latest/submit/fileprep/flat-file-example.html, which can include annotations: https://www.ebi.ac.uk/ena/WebFeat/. As we are submitting as a broker it is important that we add the AUTHORS to the assembly: https://ena-docs.readthedocs.io/en/latest/faq/data_brokering.html#authorship.

https://github.com/NBISweden/EMBLmyGFF3 will generate an embl file given a gff3 file, however it needs a gff3 file for the specific sequence that is being submitted. NIH has gff3 for each reference sequence, but we need to create one for each sequence.

4. Submit the files using the webin-cli:

```bash
Expand Down
1 change: 1 addition & 0 deletions ena-submission/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ dependencies:
- psycopg2
- slack_sdk
- xmltodict
- biopython
35 changes: 17 additions & 18 deletions ena-submission/scripts/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
CreationResult,
create_chromosome_list,
create_ena_assembly,
create_fasta,
create_flatfile,
create_manifest,
get_ena_analysis_process,
get_ena_config,
Expand Down Expand Up @@ -138,18 +138,26 @@ def create_manifest_object(
organism_metadata = config.organisms[group_key["organism"]]["ingest"]
chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
fasta_file = create_fasta(
unaligned_sequences=unaligned_nucleotide_sequences,
chromosome_list=chromosome_list_object,
dir = dir
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
try:
moleculetype = MoleculeType(organism_metadata.get("moleculeType", "genomic RNA"))
except ValueError:
moleculetype = None
organism = organism_metadata.get("scientific_name", "Unknown")
flat_file = create_flatfile(
unaligned_nucleotide_sequences,
seq_key["accession"],
authors=authors,
moleculetype=moleculetype,
organism=organism,
dir=dir,
)
program = (
metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown"
)
platform = metadata["sequencingProtocol"] if metadata.get("sequencingProtocol") else "Unknown"
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
try:
coverage = (
(
Expand All @@ -162,14 +170,6 @@ def create_manifest_object(
)
except ValueError:
coverage = 1
try:
moleculetype = (
MoleculeType(metadata["moleculeType"])
if organism_metadata.get("moleculeType")
else None
)
except ValueError:
moleculetype = None
description = (
f"Original sequence submitted to {config.db_name} with accession: "
f"{seq_key["accession"]}, version: {seq_key["version"]}"
Expand All @@ -189,9 +189,8 @@ def create_manifest_object(
coverage=coverage,
program=program,
platform=platform,
fasta=fasta_file,
flatfile=flat_file,
chromosome_list=chromosome_list_file,
authors=authors,
description=description,
moleculetype=moleculetype,
)
Expand Down
60 changes: 58 additions & 2 deletions ena-submission/scripts/ena_submission_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,16 @@
import pytz
import requests
import xmltodict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from ena_types import (
Action,
Actions,
AssemblyChromosomeListFile,
AssemblyManifest,
Hold,
MoleculeType,
ProjectSet,
SampleSetType,
Submission,
Expand Down Expand Up @@ -144,7 +148,7 @@ def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationRe
logger.error(error_message)
errors.append(error_message)
return CreationResult(results=None, errors=errors, warnings=warnings)

if not response.ok:
error_message = (
f"Request failed with status:{response.status_code}. " f"Response: {response.text}."
Expand Down Expand Up @@ -268,6 +272,53 @@ def create_chromosome_list(list_object: AssemblyChromosomeListFile, dir: str | N
return filename


def create_flatfile(
unaligned_sequences: dict[str, str],
accession: str,
authors: str,
moleculetype: MoleculeType,
organism: str,
dir: str | None = None,
) -> str:
if dir:
os.makedirs(dir, exist_ok=True)
filename = os.path.join(dir, "sequence.embl")
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".embl") as temp:
filename = temp.name

seqIO_moleculetype = {
MoleculeType.GENOMIC_DNA: "DNA",
MoleculeType.GENOMIC_RNA: "RNA",
MoleculeType.VIRAL_CRNA: "cRNA",
}

if set(unaligned_sequences.keys()) != {"main"}:
msg = "Only one unaligned sequence is supported"
raise ValueError(msg)
sequence = SeqRecord(
Seq(unaligned_sequences["main"]),
id=accession,
annotations={"molecule_type": seqIO_moleculetype[moleculetype], "organism": organism, "reference_authors": authors},
)

SeqIO.write(sequence, filename, "embl")

reference_authors = f"RA {authors};\n"

with open(filename, encoding="utf-8") as file:
lines = file.readlines()

for i, line in enumerate(lines):
if line.startswith("XX"): # Insert after XX separator following the description
lines.insert(i + 1, reference_authors)
break
with open(filename, "w") as file:
file.writelines(lines)

return filename


def create_fasta(
unaligned_sequences: dict[str, str],
chromosome_list: AssemblyChromosomeListFile,
Expand Down Expand Up @@ -308,6 +359,8 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
else:
with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp:
filename = temp.name
if not manifest.fasta and not manifest.flatfile:
raise ValueError("Either fasta or flatfile must be provided")
with open(filename, "w") as f:
f.write(f"STUDY\t{manifest.study}\n")
f.write(f"SAMPLE\t{manifest.sample}\n")
Expand All @@ -318,7 +371,10 @@ def create_manifest(manifest: AssemblyManifest, dir: str | None = None) -> str:
f.write(f"COVERAGE\t{manifest.coverage}\n")
f.write(f"PROGRAM\t{manifest.program}\n")
f.write(f"PLATFORM\t{manifest.platform}\n")
f.write(f"FASTA\t{manifest.fasta}\n")
if manifest.flatfile:
f.write(f"FLATFILE\t{manifest.flatfile}\n")
if manifest.fasta:
f.write(f"FASTA\t{manifest.fasta}\n")
f.write(f"CHROMOSOME_LIST\t{manifest.chromosome_list}\n")
if manifest.description:
f.write(f"DESCRIPTION\t{manifest.description}\n")
Expand Down
3 changes: 2 additions & 1 deletion ena-submission/scripts/ena_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ class AssemblyManifest:
coverage: str
program: str
platform: str
fasta: str
chromosome_list: str
fasta: str | None = None
flatfile: str | None = None
authors: str | None = None
mingaplength: int | None = None
moleculetype: MoleculeType | None = None
Expand Down

0 comments on commit 3d078fa

Please sign in to comment.