Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: minimal full translation run #101

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
File renamed without changes.
6 changes: 3 additions & 3 deletions scripts/run_vp_transformer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
SAMPLE_DIR="./tests/data/samples_large/A1_05_2024_10_08/20241024_2411515907/alignments"
SAMPLE_ID="A1_05_2024_10_08"
BATCH_ID="20241024_2411515907"
RESULTS_DIR="./tests/data/samples_original/A1_05_2024_10_08/20241024_2411515907/timeline_A1_05_2024_10_08.tsv"
TIMELINE_FILE="./tests/data/samples_original/A1_05_2024_10_08/20241024_2411515907/timeline_A1_05_2024_10_08.tsv"
RESULTS_DIR="./results_neo"
TIMELINE_FILE="./tests/data/samples/timeline_A1_05_2024_10_08.tsv"
PRIMERS_FILE="./tests/data/samples_large/primers.yaml"
NUC_REFERENCE="sars-cov2"
NUC_REFERENCE="sars-cov-2"
DATABASE_CONFIG="./scripts/database_config.yaml"

# Run the Python script with the arguments
Expand Down
59 changes: 42 additions & 17 deletions scripts/vp_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
import yaml

from sr2silo.config import is_ci_environment
from sr2silo.process import parse_translate_align
from sr2silo.process import enrich_AlignedReads_with_metadata, parse_translate_align
from sr2silo.s3 import compress_bz2, upload_file_to_s3
from sr2silo.silo import LapisClient, Submission
from sr2silo.vpipe import Sample

logging.basicConfig(
level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


Expand Down Expand Up @@ -89,6 +89,9 @@ def process_directory(

# TODO: absorb all these intermediary files into a temporary directory

## print PWD
logging.info(f"Current working directory: {os.getcwd()}")

# check that one was given a directory and not a file and it exists
if not input_dir.is_dir():
logging.error(f"Input directory not found, is it a directory?: {input_dir}")
Expand All @@ -107,12 +110,14 @@ def process_directory(
sample_to_process.enrich_metadata(timeline_file, primers_file)
metadata = sample_to_process.get_metadata()
# add nextclade reference to metadata
resource_fp = Path("resources") / nuc_reference
resource_fp = Path("./resources") / nuc_reference
nuc_reference_fp = resource_fp / "nuc_reference_genomes.fasta"
aa_reference_fp = resource_fp / "aa_reference_genomes.fasta"

metadata["nuc_reference"] = nuc_reference
metadata["aa_reference"] = aa_reference
# TODO: get reference from nextclade or loculus
metadata["nextclade_reference"] = nuc_reference
# metadata["nuc_reference"] = nuc_reference
# metadata["aa_reference"] = aa_reference
metadata_file = result_dir / "metadata.json"
result_dir.mkdir(parents=True, exist_ok=True)
with metadata_file.open("w") as f:
Expand All @@ -126,18 +131,25 @@ def process_directory(
##### Translate / Align / Normalize to JSON #####

aligned_reads = parse_translate_align(nuc_reference_fp, aa_reference_fp, sample_fp)
aligned_reads = enrich_AlignedReads_with_metadata(aligned_reads, metadata_file)

# TODO wrangle the aligned reads to aligned_reads_with_metadata and write to a file

# write the aligned reads to a file
aligned_reads_fp = result_dir / "silo_input.ndjson"

with aligned_reads_fp.open("w") as f:
for read in aligned_reads:
f.write(read.to_str() + "\n")
for read in aligned_reads.values():
try:
f.write(read.to_silo_json(indent=False) + "\n")
except Exception as e:
logging.error(f"Error writing read to file SILO JSON {e}")
logging.error(f"Read ID: {read.read_id}")
logging.error(f"Read: {read}")

##### Compress & Upload to S3 #####
file_to_upload = aligned_reads_fp
compressed_file = result_dir_transformed / "silo_input.ndjson.bz2"
compressed_file = result_dir / "silo_input.ndjson.bz2"
logging.info(f"Compressing file: {file_to_upload}")
compress_bz2(file_to_upload, compressed_file)

Expand All @@ -160,17 +172,30 @@ def process_directory(
KEYCLOAK_TOKEN_URL = "https://authentication-wise-seqs.loculus.org/realms/loculus/protocol/openid-connect/token"
SUBMISSION_URL = "https://backend-wise-seqs.loculus.org/test/submit?groupId={group_id}&dataUseTermsType=OPEN"
else:
# get the real environment variables
KEYCLOAK_TOKEN_URL = os.getenv("KEYCLOAK_TOKEN_URL")
SUBMISSION_URL = os.getenv("SUBMISSION_URL")
if os.getenv("KEYCLOAK_TOKEN_URL") or os.getenv("SUBMISSION_URL"):
KEYCLOAK_TOKEN_URL = os.getenv("KEYCLOAK_TOKEN_URL")
SUBMISSION_URL = os.getenv("SUBMISSION_URL")
else:
logging.warning("KEYCLOAK_TOKEN_URL and SUBMISSION_URL not set.")
logging.warning("Using default values.")
KEYCLOAK_TOKEN_URL = "https://authentication-wise-seqs.loculus.org/realms/loculus/protocol/openid-connect/token"
SUBMISSION_URL = "https://backend-wise-seqs.loculus.org/test/submit?groupId={group_id}&dataUseTermsType=OPEN"

client = LapisClient(KEYCLOAK_TOKEN_URL, SUBMISSION_URL) # type: ignore
client.authenticate(username="testuser", password="testuser")
submission_ids = Submission.get_submission_ids_from_tsv(input_fp)
fasta_str = Submission.generate_placeholder_fasta(submission_ids)
submission = Submission(fasta_str, input_fp)
response = client.submit(group_id=1, data=submission)
logging.info(f"Submission response: {response}")
if response.status_code == 200:
logging.info("Submission successful.")
logging.info(
"You can approve the upload for release at:\n\n"
"https://wise-seqs.loculus.org/salmonella/submission/1/review"
)
else:
logging.error(f"Error submitting data to Lapis: {response}")
logging.error(f"Response: {response.text}")


@click.command()
Expand Down Expand Up @@ -222,15 +247,15 @@ def main(
logging.info(f"Running in CI environment: {ci_env}")

process_directory(
input_dir=Path("sample"),
input_dir=Path(sample_dir),
sample_id=sample_id,
batch_id=batch_id,
result_dir=Path("results"),
timeline_file=Path("timeline.tsv"),
primers_file=Path("primers.yaml"),
result_dir=Path(result_dir),
timeline_file=Path(timeline_file),
primers_file=Path(primer_file),
nuc_reference=nuc_reference,
aa_reference=aa_reference,
database_config=Path("scripts/database_config.yaml"),
database_config=Path(database_config),
)


Expand Down
7 changes: 6 additions & 1 deletion src/sr2silo/process/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
GeneSet,
NucInsertion,
)
from sr2silo.process.translate_align import nuc_to_aa_alignment, parse_translate_align
from sr2silo.process.translate_align import (
enrich_AlignedReads_with_metadata,
nuc_to_aa_alignment,
parse_translate_align,
)

__all__ = [
"bam_to_sam",
Expand All @@ -34,4 +38,5 @@
"GeneSet",
"get_gene_set_from_ref",
"parse_translate_align",
"enrich_AlignedReads_with_metadata",
]
1 change: 1 addition & 0 deletions src/sr2silo/process/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ def pad_alignment(
return padded_alignment


# TODO: identify whether used for only AA or Nuc as well, adjust types // description
def sam_to_seq_and_indels(
seq: str, cigar: str
) -> Tuple[str, List[AAInsertion], List[Tuple[int, int]]]:
Expand Down
12 changes: 8 additions & 4 deletions src/sr2silo/process/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,13 @@ def _validate_types(self):
f"aligned_amino_acid_sequences must be a dict, got "
f"{type(self.aligned_amino_acid_sequences).__name__}"
)
# TODO: rework what metadata is allowed to be
if self.metadata is not None and not isinstance(
self.metadata, (dict, ReadMetadata)
):
raise TypeError(
"metadata must be a dict or ReadMetadata, "
"got {type(self.metadata).__name__}"
f"got {type(self.metadata).__name__}"
)

def set_nuc_insertion(self, nuc_insertion: NucInsertion):
Expand Down Expand Up @@ -139,17 +140,21 @@ def to_dict(self) -> Dict[str, Any]:
"alignedAminoAcidSequences": self.aligned_amino_acid_sequences.to_dict(),
}
if self.metadata:
if isinstance(self.metadata, dict):
self.metadata["read_id"] = self.read_id
json_representation["metadata"] = self.metadata
return json_representation

def to_silo_json(self) -> None:
def to_silo_json(self, indent: bool = True) -> str:
"""
Validate the aligned read dict using a pydantic schema and print a
nicely formatted JSON string conforming to the DB requirements.
"""
try:
schema = AlignedReadSchema(**self.to_dict())
print(schema.model_dump_json(indent=2, exclude_none=True))
return schema.model_dump_json(
indent=2 if indent else None, exclude_none=True
)
except ValidationError as e:
raise e

Expand Down Expand Up @@ -288,7 +293,6 @@ def to_dict(self) -> dict:
return {
str(gene): [f"{ins.position} : {ins.sequence}" for ins in ins_per_gene]
for gene, ins_per_gene in self.aa_insertions.items()
if isinstance(ins_per_gene, list)
}

def __str__(self) -> str:
Expand Down
41 changes: 33 additions & 8 deletions src/sr2silo/process/translate_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import json
import logging
import os
import subprocess
Expand All @@ -13,7 +14,6 @@

import sr2silo.process.convert as convert
from sr2silo.process.interface import (
AAInsertion,
AAInsertionSet,
AASequenceSet,
AlignedRead,
Expand Down Expand Up @@ -133,10 +133,11 @@ def nuc_to_aa_alignment(
result = os.system(f"diamond makedb --in {in_aa_reference_fp} -d {db_ref_fp}")
if result != 0:
raise RuntimeError(
"Error occurred while making sequence DB with diamond makedb"
f"Error occurred while making sequence DB with diamond makedb "
f"- Error Code: {result}"
)
except Exception as e:
logging.error(f"An error occurred while making sequence DB: {e}")
logging.error(f"An error occurred while making sequence DB - Error Code: {e}")
raise

try:
Expand Down Expand Up @@ -269,12 +270,8 @@ def read_in_AlignedReads_aa_seq_and_ins(
aa_aligned, pos, gene_set.get_gene_length(gene_name)
)

aa_ins = [
AAInsertion(position=ins_pos, sequence=ins_seq)
for ins_pos, ins_seq in aa_insertions # pyright: ignore
]
aligned_reads[read_id].amino_acid_insertions.set_insertions_for_gene(
gene_name, aa_ins
gene_name, aa_insertions
)
aligned_reads[read_id].aligned_amino_acid_sequences.set_sequence(
gene_name, padded_aa_alignment
Expand Down Expand Up @@ -342,3 +339,31 @@ def parse_translate_align(
)

return aligned_reads


def enrich_AlignedReads_with_metadata(
    aligned_reads: Dict[str, AlignedRead],
    metadata_fp: Path,
) -> Dict[str, AlignedRead]:
    """Attach metadata loaded from a JSON file to every aligned read.

    Args:
        aligned_reads: Mapping of read_id -> AlignedRead, enriched in place.
        metadata_fp: Path to a JSON file holding the sample-level metadata.

    Returns:
        The same ``aligned_reads`` mapping, with each read's ``metadata``
        attribute set to its own copy of the loaded metadata.

    Raises:
        FileNotFoundError: If ``metadata_fp`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        ValueError: If the file parses to an empty/falsy object.
    """
    try:
        with open(metadata_fp, "r") as file:
            metadata = json.load(file)
    except FileNotFoundError:
        logging.error("Error: File not found")
        # Bare re-raise keeps the original exception (with filename) and
        # its traceback instead of raising a blank FileNotFoundError.
        raise
    except json.JSONDecodeError:
        logging.error("Error: Invalid JSON format")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        raise

    if not metadata:
        logging.error("No metadata found in the file")
        raise ValueError("No metadata found in the file")

    for read in aligned_reads.values():
        # Give each read its OWN copy: AlignedRead.to_dict() injects
        # "read_id" into this dict, so sharing a single dict would leak
        # the last-serialized read's id into every read's metadata.
        read.metadata = dict(metadata)
    return aligned_reads
Loading
Loading