Skip to content

Commit

Permalink
Add cchfv to yaml
Browse files Browse the repository at this point in the history
Update format_segmented_viruses to also update metadata.

Add cchfv nextclade_dataset (still need to figure out how to add clade_memberships to the auspice trees) and start to modify the preprocessing pipeline to allow for multiple segments.

Add correct trees to nextclade_datasets and update preprocessing pipelines to take multiple segments.
  • Loading branch information
anna-parker committed May 15, 2024
1 parent 990a2e2 commit 836f8b3
Show file tree
Hide file tree
Showing 27 changed files with 97,657 additions and 219 deletions.

Large diffs are not rendered by default.

41 changes: 37 additions & 4 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,52 @@ rule fetch_ncbi_dataset_package:
"""


rule extract_ncbi_dataset_sequences:
# rule extract_ncbi_dataset_sequences:
# input:
# dataset_package="results/ncbi_dataset.zip",
# output:
# ncbi_dataset_sequences="results/sequences.fasta",
# shell:
# """
# unzip -jp {input.dataset_package} \
# ncbi_dataset/data/genomic.fna \
# | seqkit seq -i -w0 \
# > {output.ncbi_dataset_sequences}
# """

rule extract_ncbi_dataset_sequences_full:
input:
dataset_package="results/ncbi_dataset.zip",
output:
ncbi_dataset_sequences="results/sequences.fasta",
ncbi_dataset_sequences="results/sequences_full.fasta",
shell:
"""
unzip -jp {input.dataset_package} \
ncbi_dataset/data/genomic.fna \
| seqkit seq -i -w0 \
| seqkit seq -w0 \
> {output.ncbi_dataset_sequences}
"""

rule format_segmented_viruses:
"""Add segment to INSDC label"""
input:
sequences="results/sequences_full.fasta",
script="scripts/segmented_viruses_format.py",
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
config="results/config.yaml",
output:
sequences_processed="results/sequences.fasta",
ncbi_dataset_tsv="results/metadata_post_format.tsv",
shell:
"""
python {input.script} \
--config-file {input.config} \
--input-seq {input.sequences} \
--input-metadata {input.ncbi_dataset_tsv} \
--output-seq {output.sequences_processed} \
--output-metadata {output.ncbi_dataset_tsv}
"""


rule calculate_sequence_hashes:
"""Output JSON: {insdc_accession: md5_sequence_hash, ...}"""
Expand Down Expand Up @@ -108,7 +141,7 @@ rule rename_columns:

rule prepare_metadata:
input:
metadata="results/metadata_post_rename.tsv",
metadata="results/metadata_post_format.tsv",
sequence_hashes="results/sequence_hashes.json",
config="results/config.yaml",
script="scripts/prepare_metadata.py",
Expand Down
12 changes: 8 additions & 4 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
taxon_id: 186538
backend_url: https://backend-main.loculus.org/
keycloak_token_url: https://authentication-main.loculus.org/realms/loculus/protocol/openid-connect/token
organism: ebola-zaire
taxon_id: 3052518
backend_url: http://localhost:8079/
keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token
organism: cchf
nucleotideSequences:
- M
- L
- S
6 changes: 5 additions & 1 deletion ingest/scripts/prepare_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ def main(

for fasta_id in to_submit:
metadata_submit.append(metadata[fasta_id])
sequences_submit[fasta_id] = sequences[fasta_id]
nucleotideSequences = ["S", "M", "L"]
for nucleotideSequence in nucleotideSequences:
segmented_fasta_id = fasta_id + '_' + nucleotideSequence
if segmented_fasta_id in sequences:
sequences_submit[segmented_fasta_id] = sequences[segmented_fasta_id]

for fasta_id, loculus_accession in to_revise.items():
revise_record = metadata[fasta_id]
Expand Down
4 changes: 2 additions & 2 deletions ingest/scripts/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ def main(config_file: str, input: str, sequence_hashes: str, output: str, log_le
# Calculate overall hash of metadata + sequence
for record in metadata:
sequence_hash = sequence_hashes.get(record[config.rename[config.fasta_id_field]], "")
if sequence_hash == "":
raise ValueError(f"No hash found for {record[config.fasta_id_field]}")
# if sequence_hash == "":
# raise ValueError(f"No hash found for {record[config.fasta_id_field]}")

metadata_dump = json.dumps(record, sort_keys=True)
prehash = metadata_dump + sequence_hash
Expand Down
83 changes: 83 additions & 0 deletions ingest/scripts/segmented_viruses_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""For each downloaded sequences calculate md5 hash and put into JSON"""

from pathlib import Path
import re
import logging
import pandas as pd
import csv

import click
from Bio import SeqIO
import yaml

logger = logging.getLogger(__name__)
logging.basicConfig(
encoding="utf-8",
level=logging.DEBUG,
format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
datefmt="%H:%M:%S",
)

def _matching_segments(description: str, segments: list[str]) -> list[str]:
    """Return the configured segments named as 'segment <name>' in a FASTA description.

    Matching is case-insensitive. More than one hit means the annotation is
    ambiguous; zero hits means the sequence carries no usable annotation.
    """
    return [
        segment
        for segment in segments
        if re.search(f"segment {re.escape(segment)}", description, re.IGNORECASE)
    ]


def _write_fasta(records, filename: str) -> None:
    """Write records as two-line FASTA entries.

    Opens in 'w' mode (the previous 'a' mode duplicated sequences on reruns);
    an empty record list simply produces an empty file.
    """
    with open(filename, "w") as out:
        for record in records:
            out.write(f">{record.id}\n{record.seq}\n")


def _write_tsv(rows, filename: str) -> None:
    """Write a list of equal-keyed dicts as a TSV; touch an empty file when rows is empty."""
    if not rows:
        Path(filename).touch()
        return
    with open(filename, "w", newline="") as out:
        writer = csv.DictWriter(out, rows[0].keys(), delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)


@click.command()
@click.option("--config-file", required=True, type=click.Path(exists=True))
@click.option("--input-seq", required=True, type=click.Path(exists=True))
@click.option("--input-metadata", required=True, type=click.Path(exists=True))
@click.option("--output-seq", required=True, type=click.Path())
@click.option("--output-metadata", required=True, type=click.Path())
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
def main(
    config_file: str,
    input_seq: str,
    input_metadata: str,
    output_seq: str,
    output_metadata: str,
    log_level: str,
) -> None:
    """Append the segment name to each INSDC accession and filter metadata to match.

    Sequences whose description does not name exactly one of the configured
    segments are discarded (unclear annotation), as are sequences without a
    corresponding metadata row.
    """
    logger.setLevel(log_level)

    with open(config_file) as file:
        config = yaml.safe_load(file)
    segments = config["nucleotideSequences"]

    df = pd.read_csv(input_metadata, sep="\t", dtype=str, keep_default_na=False)
    # NOTE(review): assumes one row per accession in a 'genbank_accession'
    # column - confirm against the rename_columns rule's output.
    # (The previous index='genbank_accession' argument to to_dict is invalid
    # for orient='records' and was silently treated as a truthy flag.)
    metadata_by_accession = {
        row["genbank_accession"]: row for row in df.to_dict(orient="records")
    }

    processed_seq = []
    processed_metadata = []

    with open(input_seq) as f:
        for record in SeqIO.parse(f, "fasta"):
            matches = _matching_segments(record.description, segments)
            if len(matches) != 1:
                # Zero matches: no segment annotation. Several matches:
                # ambiguous annotation (previously this compounded the id,
                # e.g. "X_M_L", and duplicated the record). Both are dropped.
                logger.debug("Discarding %s: segment annotation %s", record.id, matches)
                continue
            metadata_row = metadata_by_accession.get(record.id)
            if metadata_row is None:
                # Previously a bare KeyError; skip with a warning instead.
                logger.warning("No metadata row for %s - discarding sequence", record.id)
                continue
            # Append segment to end of NCBI accession ID to conform with
            # LAPIS formatting, e.g. "AB123456" -> "AB123456_S".
            record.id += "_" + matches[0]
            processed_metadata.append(metadata_row)
            processed_seq.append(record)

    _write_fasta(processed_seq, output_seq)
    _write_tsv(processed_metadata, output_metadata)


if __name__ == "__main__":
    main()
Loading

0 comments on commit 836f8b3

Please sign in to comment.