-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update format_segmented_viruses to also update metadata. Add cchfv nextclade_dataset (still need to figure out how to add clade_memberships to the auspice trees) and start to modify the preprocessing pipeline to allow for multiple segments. Add correct trees to nextclade_datasets and update preprocessing pipelines to take multiple segments.
- Loading branch information
1 parent
990a2e2
commit 836f8b3
Showing
27 changed files
with
97,657 additions
and
219 deletions.
There are no files selected for viewing
231 changes: 131 additions & 100 deletions
231
...src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,8 @@ | ||
taxon_id: 186538 | ||
backend_url: https://backend-main.loculus.org/ | ||
keycloak_token_url: https://authentication-main.loculus.org/realms/loculus/protocol/openid-connect/token | ||
organism: ebola-zaire | ||
taxon_id: 3052518 | ||
backend_url: http://localhost:8079/ | ||
keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token | ||
organism: cchf | ||
nucleotideSequences: | ||
- M | ||
- L | ||
- S |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
"""Keep sequences whose description names a configured segment, append the segment to each sequence ID, and write the filtered FASTA and metadata TSV""" | ||
|
||
from pathlib import Path | ||
import re | ||
import logging | ||
import pandas as pd | ||
import csv | ||
|
||
import click | ||
from Bio import SeqIO | ||
import yaml | ||
|
||
# Module-level logger; its level is adjusted later from the --log-level option.
logger = logging.getLogger(__name__)

# Root logging defaults: DEBUG threshold, UTF-8 output, and a format that
# shows time, level, source file and line number ahead of the message.
logging.basicConfig(
    format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
    datefmt="%H:%M:%S",
    level=logging.DEBUG,
    encoding="utf-8",
)
|
||
def _compile_segment_patterns(segments):
    """Return a ``{segment: compiled regex}`` mapping for the configured segments.

    Each pattern matches ``segment <name>`` anywhere in a FASTA description,
    ignoring case. Compiling once up front avoids rebuilding the same regex
    for every record.
    """
    return {
        segment: re.compile(".*segment {0}.*".format(segment), re.IGNORECASE)
        for segment in segments
    }


def _write_fasta(records, filename):
    """Write *records* to *filename* as FASTA with one sequence line per record.

    The file is always (re)created, so an empty *records* list yields an empty
    file and a re-run never appends to output from a previous run.
    """
    with open(filename, "w") as handle:
        handle.writelines(f">{record.id}\n{record.seq}\n" for record in records)


def _write_tsv(rows, filename):
    """Write *rows* (a list of dicts) to *filename* as a tab-separated table.

    The header is taken from the keys of the first row. With no rows there is
    no header to derive, so an empty file is written instead.
    """
    with open(filename, "w", newline="") as handle:
        if not rows:
            return
        writer = csv.DictWriter(handle, rows[0].keys(), delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)


@click.command()
@click.option("--config-file", required=True, type=click.Path(exists=True))
@click.option("--input-seq", required=True, type=click.Path(exists=True))
@click.option("--input-metadata", required=True, type=click.Path(exists=True))
@click.option("--output-seq", required=True, type=click.Path())
@click.option("--output-metadata", required=True, type=click.Path())
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
def main(config_file: str, input_seq: str, input_metadata: str, output_seq: str, output_metadata: str, log_level: str) -> None:
    """Filter segmented-virus sequences and their metadata by segment annotation.

    Reads the segment names from the ``nucleotideSequences`` list of the YAML
    config, keeps every FASTA record whose description mentions exactly one
    ``segment <name>``, appends ``_<name>`` to the record ID, and writes the
    kept sequences and their metadata rows to the two output files.

    A ``KeyError`` is raised when a kept record's ID has no row in the
    metadata table (matched on the ``genbank_accession`` column).
    """
    logger.setLevel(log_level)

    with open(config_file) as file:
        config = yaml.safe_load(file)
    patterns = _compile_segment_patterns(config["nucleotideSequences"])

    # Index metadata rows by accession so each record lookup is O(1).
    # `index=` is not a valid selector for orient="records", so it is omitted.
    df = pd.read_csv(input_metadata, sep="\t", dtype=str, keep_default_na=False)
    metadata_by_accession = {
        row["genbank_accession"]: row for row in df.to_dict(orient="records")
    }

    # Discard all sequences with unclear segment annotations
    # Append segment to end of NCBI accession ID to conform with LAPIS formatting
    processed_seq = []
    processed_metadata = []

    with open(input_seq) as f:
        for record in SeqIO.parse(f, "fasta"):
            matched = [
                segment
                for segment, pattern in patterns.items()
                if pattern.search(record.description)
            ]
            # Zero matches means no usable annotation; several matches mean an
            # ambiguous one. Either way the segment is unclear, so skip it.
            if len(matched) != 1:
                logger.debug(
                    "Discarding %s: %d segment annotations matched",
                    record.id,
                    len(matched),
                )
                continue
            # Look up metadata with the original ID before it is rewritten.
            processed_metadata.append(metadata_by_accession[record.id])
            record.id = f"{record.id}_{matched[0]}"
            processed_seq.append(record)

    _write_fasta(processed_seq, output_seq)
    _write_tsv(processed_metadata, output_metadata)
|
||
|
||
# Invoke the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()
Oops, something went wrong.