diff --git a/ingest/Snakefile b/ingest/Snakefile
index 15198c7e0..0a0623600 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -22,6 +22,7 @@ SEGMENTED = config["segmented"]
 ALL_FIELDS = ",".join(config["all_fields"])
 COLUMN_MAPPING = config["column_mapping"]
 LOG_LEVEL = config.get("log_level", "INFO")
+FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
 
 
 def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
@@ -96,20 +97,22 @@ rule rename_columns:
 rule extract_ncbi_dataset_sequences:
     """
     For unsegmented sequences, we only keep the sequence ID in the header.
-    For segmented sequences, we keep full header as it contains segment information.
+    For sequences where we filter, we keep the full header so the filter regex can match it.
     """
     input:
         dataset_package="results/ncbi_dataset.zip",
     output:
         ncbi_dataset_sequences="results/sequences.fasta",
+    params:
+        header_arg="" if FILTER_FASTA_HEADERS else "-i",  # keep full header when a filter regex is configured
     shell:
         """
         unzip -jp {input.dataset_package} \
         ncbi_dataset/data/genomic.fna \
-        | seqkit seq -w0 -i \
+        | seqkit seq -w0 {params.header_arg} \
         > {output.ncbi_dataset_sequences}
         """
 
 rule calculate_sequence_hashes:
     """Output JSON: {insdc_accession: md5_sequence_hash, ...}"""
     input:
@@ -127,9 +130,29 @@ rule calculate_sequence_hashes:
         --output-sequences {output.sequence_json}
         """
 
+if FILTER_FASTA_HEADERS:
+    rule filter_fasta_headers:
+        input:
+            sequences="results/sequences.fasta",
+            script="scripts/filter_sequences.py",
+            config="results/config.yaml",
+        output:
+            results="results/sequences_filtered.fasta",
+        params:
+            filter_fasta_headers=FILTER_FASTA_HEADERS,
+            log_level=LOG_LEVEL,
+        shell:
+            """
+            python {input.script} \
+                --input-seq {input.sequences} \
+                --output-seq {output.results} \
+                --log-level {params.log_level} \
+                --config-file {input.config}
+            """
+
 rule align:
     input:
-        sequences="results/sequences.fasta",
+        sequences="results/sequences_filtered.fasta" if FILTER_FASTA_HEADERS else "results/sequences.fasta",
     output:
         results="results/nextclade_{segment}.tsv",
     params:
diff --git a/ingest/scripts/filter_sequences.py b/ingest/scripts/filter_sequences.py
new file mode 100644
index 000000000..463712050
--- /dev/null
+++ b/ingest/scripts/filter_sequences.py
@@ -0,0 +1,68 @@
+"""Filters sequences by fasta header."""
+
+import logging
+import re
+from dataclasses import dataclass
+
+import click
+import yaml
+from Bio import SeqIO
+
+
+@dataclass
+class Config:
+    filter_fasta_headers: str
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(
+    encoding="utf-8",
+    level=logging.DEBUG,
+    format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
+    datefmt="%H:%M:%S",
+)
+
+
+@click.command(help="Parse fasta header, only keep if fits regex filter_fasta_headers")
+@click.option("--config-file", required=True, type=click.Path(exists=True))
+@click.option("--input-seq", required=True, type=click.Path(exists=True))
+@click.option("--output-seq", required=True, type=click.Path())
+@click.option(
+    "--log-level",
+    default="INFO",
+    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
+)
+def main(
+    config_file: str,
+    input_seq: str,
+    output_seq: str,
+    log_level: str,
+) -> None:
+    logger.setLevel(log_level)
+
+    with open(config_file) as file:
+        full_config = yaml.safe_load(file)
+    relevant_config = {key: full_config[key] for key in Config.__annotations__}
+    config = Config(**relevant_config)
+
+    # Discard all sequences without filter_fasta_headers in their header
+    discarded = 0
+    found = 0
+    with open(output_seq, "w") as output_file:  # "w", not "a": a rerun must not append onto stale output
+        with open(input_seq) as f:
+            records = SeqIO.parse(f, "fasta")
+            for record in records:
+                found_segment = re.search(config.filter_fasta_headers, record.description)
+                if found_segment:
+                    found += 1
+                    output_file.write(f">{record.id}\n{record.seq}\n")
+                else:
+                    discarded += 1
+
+    logger.info(
+        f"Discarded {discarded} out of {discarded + found} sequences, as they did not contain the filter_fasta_headers: {config.filter_fasta_headers}."
+    )
+
+
+if __name__ == "__main__":
+    main()