Skip to content

Commit

Permalink
Add cchfv to yaml
Browse files Browse the repository at this point in the history
Update format_segmented_viruses to also update metadata.

Add cchfv nextclade_dataset (still need to figure out how to add clade_memberships to the auspice trees) and start to modify the preprocessing pipeline to allow for multiple segments.

Add correct trees to nextclade_datasets and update preprocessing pipelines to take multiple segments.
  • Loading branch information
anna-parker committed May 15, 2024
1 parent 990a2e2 commit 836f8b3
Show file tree
Hide file tree
Showing 27 changed files with 97,657 additions and 219 deletions.

Large diffs are not rendered by default.

41 changes: 37 additions & 4 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,52 @@ rule fetch_ncbi_dataset_package:
"""


rule extract_ncbi_dataset_sequences:
# rule extract_ncbi_dataset_sequences:
# input:
# dataset_package="results/ncbi_dataset.zip",
# output:
# ncbi_dataset_sequences="results/sequences.fasta",
# shell:
# """
# unzip -jp {input.dataset_package} \
# ncbi_dataset/data/genomic.fna \
# | seqkit seq -i -w0 \
# > {output.ncbi_dataset_sequences}
# """

rule extract_ncbi_dataset_sequences_full:
input:
dataset_package="results/ncbi_dataset.zip",
output:
ncbi_dataset_sequences="results/sequences.fasta",
ncbi_dataset_sequences="results/sequences_full.fasta",
shell:
"""
unzip -jp {input.dataset_package} \
ncbi_dataset/data/genomic.fna \
| seqkit seq -i -w0 \
| seqkit seq -w0 \
> {output.ncbi_dataset_sequences}
"""

rule format_segmented_viruses:
"""Add segment to INSDC label"""
input:
sequences="results/sequences_full.fasta",
script="scripts/segmented_viruses_format.py",
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
config="results/config.yaml",
output:
sequences_processed="results/sequences.fasta",
ncbi_dataset_tsv="results/metadata_post_format.tsv",
shell:
"""
python {input.script} \
--config-file {input.config} \
--input-seq {input.sequences} \
--input-metadata {input.ncbi_dataset_tsv} \
--output-seq {output.sequences_processed} \
--output-metadata {output.ncbi_dataset_tsv}
"""


rule calculate_sequence_hashes:
"""Output JSON: {insdc_accession: md5_sequence_hash, ...}"""
Expand Down Expand Up @@ -108,7 +141,7 @@ rule rename_columns:

rule prepare_metadata:
input:
metadata="results/metadata_post_rename.tsv",
metadata="results/metadata_post_format.tsv",
sequence_hashes="results/sequence_hashes.json",
config="results/config.yaml",
script="scripts/prepare_metadata.py",
Expand Down
12 changes: 8 additions & 4 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
taxon_id: 186538
backend_url: https://backend-main.loculus.org/
keycloak_token_url: https://authentication-main.loculus.org/realms/loculus/protocol/openid-connect/token
organism: ebola-zaire
taxon_id: 3052518
backend_url: http://localhost:8079/
keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token
organism: cchf
nucleotideSequences:
- M
- L
- S
6 changes: 5 additions & 1 deletion ingest/scripts/prepare_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ def main(

for fasta_id in to_submit:
metadata_submit.append(metadata[fasta_id])
sequences_submit[fasta_id] = sequences[fasta_id]
nucleotideSequences = ["S", "M", "L"]
for nucleotideSequence in nucleotideSequences:
segmented_fasta_id = fasta_id + '_' + nucleotideSequence
if segmented_fasta_id in sequences:
sequences_submit[segmented_fasta_id] = sequences[segmented_fasta_id]

for fasta_id, loculus_accession in to_revise.items():
revise_record = metadata[fasta_id]
Expand Down
4 changes: 2 additions & 2 deletions ingest/scripts/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ def main(config_file: str, input: str, sequence_hashes: str, output: str, log_le
# Calculate overall hash of metadata + sequence
for record in metadata:
sequence_hash = sequence_hashes.get(record[config.rename[config.fasta_id_field]], "")
if sequence_hash == "":
raise ValueError(f"No hash found for {record[config.fasta_id_field]}")
# if sequence_hash == "":
# raise ValueError(f"No hash found for {record[config.fasta_id_field]}")

metadata_dump = json.dumps(record, sort_keys=True)
prehash = metadata_dump + sequence_hash
Expand Down
83 changes: 83 additions & 0 deletions ingest/scripts/segmented_viruses_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""For each downloaded sequences calculate md5 hash and put into JSON"""

from pathlib import Path
import re
import logging
import pandas as pd
import csv

import click
from Bio import SeqIO
import yaml

logger = logging.getLogger(__name__)
logging.basicConfig(
encoding="utf-8",
level=logging.DEBUG,
format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
datefmt="%H:%M:%S",
)

def _matching_segments(description: str, segments: list[str]) -> list[str]:
    """Return the configured segments named as 'segment <name>' in a FASTA description.

    Matching is case-insensitive. More than one hit means the annotation is
    ambiguous; zero hits means the sequence carries no usable annotation.
    """
    return [
        segment
        for segment in segments
        if re.search(f"segment {re.escape(segment)}", description, re.IGNORECASE)
    ]


def _write_fasta(records, filename: str) -> None:
    """Write records as two-line FASTA entries.

    Opens in 'w' mode (the previous 'a' mode duplicated sequences on reruns);
    an empty record list simply produces an empty file.
    """
    with open(filename, "w") as out:
        for record in records:
            out.write(f">{record.id}\n{record.seq}\n")


def _write_tsv(rows, filename: str) -> None:
    """Write a list of equal-keyed dicts as a TSV; touch an empty file when rows is empty."""
    if not rows:
        Path(filename).touch()
        return
    with open(filename, "w", newline="") as out:
        writer = csv.DictWriter(out, rows[0].keys(), delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)


@click.command()
@click.option("--config-file", required=True, type=click.Path(exists=True))
@click.option("--input-seq", required=True, type=click.Path(exists=True))
@click.option("--input-metadata", required=True, type=click.Path(exists=True))
@click.option("--output-seq", required=True, type=click.Path())
@click.option("--output-metadata", required=True, type=click.Path())
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
def main(
    config_file: str,
    input_seq: str,
    input_metadata: str,
    output_seq: str,
    output_metadata: str,
    log_level: str,
) -> None:
    """Append the segment name to each INSDC accession and filter metadata to match.

    Sequences whose description does not name exactly one of the configured
    segments are discarded (unclear annotation), as are sequences without a
    corresponding metadata row.
    """
    logger.setLevel(log_level)

    with open(config_file) as file:
        config = yaml.safe_load(file)
    segments = config["nucleotideSequences"]

    df = pd.read_csv(input_metadata, sep="\t", dtype=str, keep_default_na=False)
    # NOTE(review): assumes one row per accession in a 'genbank_accession'
    # column - confirm against the rename_columns rule's output.
    # (The previous index='genbank_accession' argument to to_dict is invalid
    # for orient='records' and was silently treated as a truthy flag.)
    metadata_by_accession = {
        row["genbank_accession"]: row for row in df.to_dict(orient="records")
    }

    processed_seq = []
    processed_metadata = []

    with open(input_seq) as f:
        for record in SeqIO.parse(f, "fasta"):
            matches = _matching_segments(record.description, segments)
            if len(matches) != 1:
                # Zero matches: no segment annotation. Several matches:
                # ambiguous annotation (previously this compounded the id,
                # e.g. "X_M_L", and duplicated the record). Both are dropped.
                logger.debug("Discarding %s: segment annotation %s", record.id, matches)
                continue
            metadata_row = metadata_by_accession.get(record.id)
            if metadata_row is None:
                # Previously a bare KeyError; skip with a warning instead.
                logger.warning("No metadata row for %s - discarding sequence", record.id)
                continue
            # Append segment to end of NCBI accession ID to conform with
            # LAPIS formatting, e.g. "AB123456" -> "AB123456_S".
            record.id += "_" + matches[0]
            processed_metadata.append(metadata_row)
            processed_seq.append(record)

    _write_fasta(processed_seq, output_seq)
    _write_tsv(processed_metadata, output_metadata)


if __name__ == "__main__":
    main()
Loading

0 comments on commit 836f8b3

Please sign in to comment.