Skip to content

Commit

Permalink
feat(ingest): Add option to filter sequences based on header content (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
anna-parker authored Jul 30, 2024
1 parent 9724dcf commit 96b6124
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 4 deletions.
31 changes: 27 additions & 4 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ SEGMENTED = config["segmented"]
ALL_FIELDS = ",".join(config["all_fields"])
COLUMN_MAPPING = config["column_mapping"]
LOG_LEVEL = config.get("log_level", "INFO")
FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)


def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
Expand Down Expand Up @@ -96,21 +97,23 @@ rule rename_columns:
rule extract_ncbi_dataset_sequences:
"""
For unsegmented sequences, we only keep the sequence ID in the header.
For segmented sequences, we keep full header as it contains segment information.
For sequences where we filter, we keep full header.
"""
input:
dataset_package="results/ncbi_dataset.zip",
output:
ncbi_dataset_sequences="results/sequences.fasta",
params:
header_arg="" if FILTER_FASTA_HEADERS else "-i", # Keep full header if segmented
shell:
"""
unzip -jp {input.dataset_package} \
ncbi_dataset/data/genomic.fna \
| seqkit seq -w0 -i \
| seqkit seq -w0 {params.header_arg} \
> {output.ncbi_dataset_sequences}
"""


rule calculate_sequence_hashes:
"""Output JSON: {insdc_accession: md5_sequence_hash, ...}"""
input:
Expand All @@ -127,10 +130,30 @@ rule calculate_sequence_hashes:
--output-sequences {output.sequence_json}
"""

if FILTER_FASTA_HEADERS:
rule filter_fasta_headers:
input:
sequences="results/sequences.fasta",
script="scripts/filter_sequences.py",
config="results/config.yaml",
output:
results="results/sequences_filtered.fasta",
params:
filter_fasta_headers=FILTER_FASTA_HEADERS,
log_level=LOG_LEVEL,
shell:
"""
python {input.script} \
--input-seq {input.sequences} \
--output-seq {output.results} \
--log-level {params.log_level} \
--config-file {input.config} \
"""


rule align:
input:
sequences="results/sequences.fasta",
sequences="results/sequences_filtered.fasta" if FILTER_FASTA_HEADERS else "results/sequences.fasta",
output:
results="results/nextclade_{segment}.tsv",
params:
Expand Down
68 changes: 68 additions & 0 deletions ingest/scripts/filter_sequences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""filters sequences by fasta header."""

import logging
import re
from dataclasses import dataclass

import click
import yaml
from Bio import SeqIO


@dataclass
class Config:
filter_fasta_headers: str


logger = logging.getLogger(__name__)
logging.basicConfig(
encoding="utf-8",
level=logging.DEBUG,
format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
datefmt="%H:%M:%S",
)


@click.command(help="Parse fasta header, only keep if fits regex filter_fasta_headers")
@click.option("--config-file", required=True, type=click.Path(exists=True))
@click.option("--input-seq", required=True, type=click.Path(exists=True))
@click.option("--output-seq", required=True, type=click.Path())
@click.option(
"--log-level",
default="INFO",
type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
def main(
config_file: str,
input_seq: str,
output_seq: str,
log_level: str,
) -> None:
logger.setLevel(log_level)

with open(config_file) as file:
full_config = yaml.safe_load(file)
relevant_config = {key: full_config[key] for key in Config.__annotations__}
config = Config(**relevant_config)

# Discard all sequences without filter_fasta_headers in their header
discarded = 0
found = 0
with open(output_seq, "a") as output_file:
with open(input_seq) as f:
records = SeqIO.parse(f, "fasta")
for record in records:
found_segment = re.search(config.filter_fasta_headers, record.description)
if found_segment:
found += 1
output_file.write(f">{record.id}\n{record.seq}\n")
else:
discarded += 1

logging.info(
f"Discarded {discarded} out of {discarded + found} sequences, as they did not contain the filter_fasta_headers: {config.filter_fasta_headers}."
)


if __name__ == "__main__":
main()

0 comments on commit 96b6124

Please sign in to comment.