feat(ingest): Add option to filter sequences based on header content (#…

…2353)
loculus-project · Jul 30, 2024 · 96b6124 · 96b6124
1 parent 9724dcf
commit 96b6124
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 4 deletions.
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -22,6 +22,7 @@ SEGMENTED = config["segmented"]
 ALL_FIELDS = ",".join(config["all_fields"])
 COLUMN_MAPPING = config["column_mapping"]
 LOG_LEVEL = config.get("log_level", "INFO")
+FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
 
 
 def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
@@ -96,21 +97,23 @@ rule rename_columns:
 rule extract_ncbi_dataset_sequences:
     """
     For unsegmented sequences, we only keep the sequence ID in the header.
-    For segmented sequences, we keep full header as it contains segment information.
+    For sequences where we filter, we keep full header.
     """
     input:
         dataset_package="results/ncbi_dataset.zip",
     output:
         ncbi_dataset_sequences="results/sequences.fasta",
+    params:
+        header_arg="" if FILTER_FASTA_HEADERS else "-i",  # Keep full header if segmented
     shell:
         """
         unzip -jp {input.dataset_package} \
         ncbi_dataset/data/genomic.fna \
-        | seqkit seq -w0 -i \
+        | seqkit seq -w0 {params.header_arg} \
         > {output.ncbi_dataset_sequences}
         """
 
-
+  
 rule calculate_sequence_hashes:
     """Output JSON: {insdc_accession: md5_sequence_hash, ...}"""
     input:
@@ -127,10 +130,30 @@ rule calculate_sequence_hashes:
             --output-sequences {output.sequence_json}
         """
 
+if FILTER_FASTA_HEADERS:
+    rule filter_fasta_headers:
+        input:
+            sequences="results/sequences.fasta",
+            script="scripts/filter_sequences.py",
+            config="results/config.yaml",
+        output:
+            results="results/sequences_filtered.fasta",
+        params:
+            filter_fasta_headers=FILTER_FASTA_HEADERS,
+            log_level=LOG_LEVEL,
+        shell:
+            """
+            python {input.script} \
+            --input-seq {input.sequences} \
+            --output-seq {output.results} \
+            --log-level {params.log_level} \
+            --config-file {input.config} \
+            """
+
 
 rule align:
     input:
-        sequences="results/sequences.fasta",
+        sequences="results/sequences_filtered.fasta" if FILTER_FASTA_HEADERS else "results/sequences.fasta",
     output:
         results="results/nextclade_{segment}.tsv",
     params:

diff --git a/ingest/scripts/filter_sequences.py b/ingest/scripts/filter_sequences.py
@@ -0,0 +1,68 @@
+"""filters sequences by fasta header."""
+
+import logging
+import re
+from dataclasses import dataclass
+
+import click
+import yaml
+from Bio import SeqIO
+
+
+@dataclass
+class Config:
+    filter_fasta_headers: str
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(
+    encoding="utf-8",
+    level=logging.DEBUG,
+    format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
+    datefmt="%H:%M:%S",
+)
+
+
+@click.command(help="Parse fasta header, only keep if fits regex filter_fasta_headers")
+@click.option("--config-file", required=True, type=click.Path(exists=True))
+@click.option("--input-seq", required=True, type=click.Path(exists=True))
+@click.option("--output-seq", required=True, type=click.Path())
+@click.option(
+    "--log-level",
+    default="INFO",
+    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
+)
+def main(
+    config_file: str,
+    input_seq: str,
+    output_seq: str,
+    log_level: str,
+) -> None:
+    logger.setLevel(log_level)
+
+    with open(config_file) as file:
+        full_config = yaml.safe_load(file)
+        relevant_config = {key: full_config[key] for key in Config.__annotations__}
+        config = Config(**relevant_config)
+
+    # Discard all sequences without filter_fasta_headers in their header
+    discarded = 0
+    found = 0
+    with open(output_seq, "a") as output_file:
+        with open(input_seq) as f:
+            records = SeqIO.parse(f, "fasta")
+            for record in records:
+                found_segment = re.search(config.filter_fasta_headers, record.description)
+                if found_segment:
+                    found += 1
+                    output_file.write(f">{record.id}\n{record.seq}\n")
+                else:
+                    discarded += 1
+
+        logging.info(
+            f"Discarded {discarded} out of {discarded + found} sequences, as they did not contain the filter_fasta_headers: {config.filter_fasta_headers}."
+        )
+
+
+if __name__ == "__main__":
+    main()