Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: minimal full translation run #101

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
File renamed without changes.
6 changes: 3 additions & 3 deletions scripts/run_vp_transformer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
SAMPLE_DIR="./tests/data/samples_large/A1_05_2024_10_08/20241024_2411515907/alignments"
SAMPLE_ID="A1_05_2024_10_08"
BATCH_ID="20241024_2411515907"
RESULTS_DIR="./tests/data/samples_original/A1_05_2024_10_08/20241024_2411515907/timeline_A1_05_2024_10_08.tsv"
TIMELINE_FILE="./tests/data/samples_original/A1_05_2024_10_08/20241024_2411515907/timeline_A1_05_2024_10_08.tsv"
RESULTS_DIR="./results_neo"
TIMELINE_FILE="./tests/data/samples/timeline_A1_05_2024_10_08.tsv"
PRIMERS_FILE="./tests/data/samples_large/primers.yaml"
NUC_REFERENCE="sars-cov2"
NUC_REFERENCE="sars-cov-2"
DATABASE_CONFIG="./scripts/database_config.yaml"

# Run the Python script with the arguments
Expand Down
59 changes: 42 additions & 17 deletions scripts/vp_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
import yaml

from sr2silo.config import is_ci_environment
from sr2silo.process import parse_translate_align
from sr2silo.process import enrich_AlignedReads_with_metadata, parse_translate_align
from sr2silo.s3 import compress_bz2, upload_file_to_s3
from sr2silo.silo import LapisClient, Submission
from sr2silo.vpipe import Sample

logging.basicConfig(
level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


Expand Down Expand Up @@ -89,6 +89,9 @@ def process_directory(

# TODO: absorb all these intermediary files into a temporary directory

## print PWD
logging.info(f"Current working directory: {os.getcwd()}")

# check that one was given a directory and not a file and it exists
if not input_dir.is_dir():
logging.error(f"Input directory not found, is it a directory?: {input_dir}")
Expand All @@ -107,12 +110,14 @@ def process_directory(
sample_to_process.enrich_metadata(timeline_file, primers_file)
metadata = sample_to_process.get_metadata()
# add nextclade reference to metadata
resource_fp = Path("resources") / nuc_reference
resource_fp = Path("./resources") / nuc_reference
nuc_reference_fp = resource_fp / "nuc_reference_genomes.fasta"
aa_reference_fp = resource_fp / "aa_reference_genomes.fasta"

metadata["nuc_reference"] = nuc_reference
metadata["aa_reference"] = aa_reference
# TODO: get reference from nextclade or loculus
metadata["nextclade_reference"] = nuc_reference
# metadata["nuc_reference"] = nuc_reference
# metadata["aa_reference"] = aa_reference
metadata_file = result_dir / "metadata.json"
result_dir.mkdir(parents=True, exist_ok=True)
with metadata_file.open("w") as f:
Expand All @@ -126,18 +131,25 @@ def process_directory(
##### Translate / Align / Normalize to JSON #####

aligned_reads = parse_translate_align(nuc_reference_fp, aa_reference_fp, sample_fp)
aligned_reads = enrich_AlignedReads_with_metadata(aligned_reads, metadata_file)

# TODO wrangle the aligned reads to aligned_reads_with_metadata and write to a file

# write the aligned reads to a file
aligned_reads_fp = result_dir / "silo_input.ndjson"

with aligned_reads_fp.open("w") as f:
for read in aligned_reads:
f.write(read.to_str() + "\n")
for read in aligned_reads.values():
try:
f.write(read.to_silo_json(indent=False) + "\n")
except Exception as e:
logging.error(f"Error writing read to file SILO JSON {e}")
logging.error(f"Read ID: {read.read_id}")
logging.error(f"Read: {read}")

##### Compress & Upload to S3 #####
file_to_upload = aligned_reads_fp
compressed_file = result_dir_transformed / "silo_input.ndjson.bz2"
compressed_file = result_dir / "silo_input.ndjson.bz2"
logging.info(f"Compressing file: {file_to_upload}")
compress_bz2(file_to_upload, compressed_file)

Expand All @@ -160,17 +172,30 @@ def process_directory(
KEYCLOAK_TOKEN_URL = "https://authentication-wise-seqs.loculus.org/realms/loculus/protocol/openid-connect/token"
SUBMISSION_URL = "https://backend-wise-seqs.loculus.org/test/submit?groupId={group_id}&dataUseTermsType=OPEN"
else:
# get the real environment variables
KEYCLOAK_TOKEN_URL = os.getenv("KEYCLOAK_TOKEN_URL")
SUBMISSION_URL = os.getenv("SUBMISSION_URL")
if os.getenv("KEYCLOAK_TOKEN_URL") or os.getenv("SUBMISSION_URL"):
KEYCLOAK_TOKEN_URL = os.getenv("KEYCLOAK_TOKEN_URL")
SUBMISSION_URL = os.getenv("SUBMISSION_URL")
else:
logging.warning("KEYCLOAK_TOKEN_URL and SUBMISSION_URL not set.")
logging.warning("Using default values.")
KEYCLOAK_TOKEN_URL = "https://authentication-wise-seqs.loculus.org/realms/loculus/protocol/openid-connect/token"
SUBMISSION_URL = "https://backend-wise-seqs.loculus.org/test/submit?groupId={group_id}&dataUseTermsType=OPEN"

client = LapisClient(KEYCLOAK_TOKEN_URL, SUBMISSION_URL) # type: ignore
client.authenticate(username="testuser", password="testuser")
submission_ids = Submission.get_submission_ids_from_tsv(input_fp)
fasta_str = Submission.generate_placeholder_fasta(submission_ids)
submission = Submission(fasta_str, input_fp)
response = client.submit(group_id=1, data=submission)
logging.info(f"Submission response: {response}")
if response.status_code == 200:
logging.info("Submission successful.")
logging.info(
"You can approve the upload for release at:\n\n"
"https://wise-seqs.loculus.org/salmonella/submission/1/review"
)
else:
logging.error(f"Error submitting data to Lapis: {response}")
logging.error(f"Response: {response.text}")


@click.command()
Expand Down Expand Up @@ -222,15 +247,15 @@ def main(
logging.info(f"Running in CI environment: {ci_env}")

process_directory(
input_dir=Path("sample"),
input_dir=Path(sample_dir),
sample_id=sample_id,
batch_id=batch_id,
result_dir=Path("results"),
timeline_file=Path("timeline.tsv"),
primers_file=Path("primers.yaml"),
result_dir=Path(result_dir),
timeline_file=Path(timeline_file),
primers_file=Path(primer_file),
nuc_reference=nuc_reference,
aa_reference=aa_reference,
database_config=Path("scripts/database_config.yaml"),
database_config=Path(database_config),
)


Expand Down
7 changes: 6 additions & 1 deletion src/sr2silo/process/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
GeneSet,
NucInsertion,
)
from sr2silo.process.translate_align import nuc_to_aa_alignment, parse_translate_align
from sr2silo.process.translate_align import (
enrich_AlignedReads_with_metadata,
nuc_to_aa_alignment,
parse_translate_align,
)

__all__ = [
"bam_to_sam",
Expand All @@ -34,4 +38,5 @@
"GeneSet",
"get_gene_set_from_ref",
"parse_translate_align",
"enrich_AlignedReads_with_metadata",
]
1 change: 1 addition & 0 deletions src/sr2silo/process/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ def pad_alignment(
return padded_alignment


# TODO: identify whether used for only AA or Nuc as well, adjust types // description
def sam_to_seq_and_indels(
seq: str, cigar: str
) -> Tuple[str, List[AAInsertion], List[Tuple[int, int]]]:
Expand Down
12 changes: 8 additions & 4 deletions src/sr2silo/process/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,13 @@ def _validate_types(self):
f"aligned_amino_acid_sequences must be a dict, got "
f"{type(self.aligned_amino_acid_sequences).__name__}"
)
# TODO: rework what metadata is allowed to be
if self.metadata is not None and not isinstance(
self.metadata, (dict, ReadMetadata)
):
raise TypeError(
"metadata must be a dict or ReadMetadata, "
"got {type(self.metadata).__name__}"
f"got {type(self.metadata).__name__}"
)

def set_nuc_insertion(self, nuc_insertion: NucInsertion):
Expand Down Expand Up @@ -139,17 +140,21 @@ def to_dict(self) -> Dict[str, Any]:
"alignedAminoAcidSequences": self.aligned_amino_acid_sequences.to_dict(),
}
if self.metadata:
if isinstance(self.metadata, dict):
self.metadata["read_id"] = self.read_id
json_representation["metadata"] = self.metadata
return json_representation

def to_silo_json(self) -> None:
def to_silo_json(self, indent: bool = True) -> str:
"""
Validate the aligned read dict using a pydantic schema and print a
nicely formatted JSON string conforming to the DB requirements.
"""
try:
schema = AlignedReadSchema(**self.to_dict())
print(schema.model_dump_json(indent=2, exclude_none=True))
return schema.model_dump_json(
indent=2 if indent else None, exclude_none=True
)
except ValidationError as e:
raise e

Expand Down Expand Up @@ -288,7 +293,6 @@ def to_dict(self) -> dict:
return {
str(gene): [f"{ins.position} : {ins.sequence}" for ins in ins_per_gene]
for gene, ins_per_gene in self.aa_insertions.items()
if isinstance(ins_per_gene, list)
}

def __str__(self) -> str:
Expand Down
41 changes: 33 additions & 8 deletions src/sr2silo/process/translate_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import json
import logging
import os
import subprocess
Expand All @@ -13,7 +14,6 @@

import sr2silo.process.convert as convert
from sr2silo.process.interface import (
AAInsertion,
AAInsertionSet,
AASequenceSet,
AlignedRead,
Expand Down Expand Up @@ -133,10 +133,11 @@ def nuc_to_aa_alignment(
result = os.system(f"diamond makedb --in {in_aa_reference_fp} -d {db_ref_fp}")
if result != 0:
raise RuntimeError(
"Error occurred while making sequence DB with diamond makedb"
f"Error occurred while making sequence DB with diamond makedb "
f"- Error Code: {result}"
)
except Exception as e:
logging.error(f"An error occurred while making sequence DB: {e}")
logging.error(f"An error occurred while making sequence DB - Error Code: {e}")
raise

try:
Expand Down Expand Up @@ -269,12 +270,8 @@ def read_in_AlignedReads_aa_seq_and_ins(
aa_aligned, pos, gene_set.get_gene_length(gene_name)
)

aa_ins = [
AAInsertion(position=ins_pos, sequence=ins_seq)
for ins_pos, ins_seq in aa_insertions # pyright: ignore
]
aligned_reads[read_id].amino_acid_insertions.set_insertions_for_gene(
gene_name, aa_ins
gene_name, aa_insertions
)
aligned_reads[read_id].aligned_amino_acid_sequences.set_sequence(
gene_name, padded_aa_alignment
Expand Down Expand Up @@ -342,3 +339,31 @@ def parse_translate_align(
)

return aligned_reads


def enrich_AlignedReads_with_metadata(
    aligned_reads: Dict[str, AlignedRead],
    metadata_fp: Path,
) -> Dict[str, AlignedRead]:
    """Attach metadata loaded from a JSON file to every aligned read.

    Args:
        aligned_reads: Mapping of read_id -> AlignedRead, enriched in place.
        metadata_fp: Path to a JSON file holding the sample-level metadata.

    Returns:
        The same ``aligned_reads`` mapping, with each read's ``metadata``
        attribute set to its own copy of the loaded metadata.

    Raises:
        FileNotFoundError: If ``metadata_fp`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        ValueError: If the file parses to an empty/falsy object.
    """
    try:
        with open(metadata_fp, "r") as file:
            metadata = json.load(file)
    except FileNotFoundError:
        logging.error("Error: File not found")
        # Bare re-raise keeps the original exception (with filename) and
        # its traceback instead of raising a blank FileNotFoundError.
        raise
    except json.JSONDecodeError:
        logging.error("Error: Invalid JSON format")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        raise

    if not metadata:
        logging.error("No metadata found in the file")
        raise ValueError("No metadata found in the file")

    for read in aligned_reads.values():
        # Give each read its OWN copy: AlignedRead.to_dict() injects
        # "read_id" into this dict, so sharing a single dict would leak
        # the last-serialized read's id into every read's metadata.
        read.metadata = dict(metadata)
    return aligned_reads
Loading
Loading