From fdd17932e211895e725064f734afc7e8eb4185e5 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Wed, 3 Jul 2024 20:31:28 +0200 Subject: [PATCH] Revert "fix(ingest): Remove tsv-utils so that image can build on main (#2185)" (#2240) This reverts commit f076cc1e28a9fcb534a6d0d183253f401533fe57. We can add tsv-utils back now that I've made it available for aarch64 in conda-forge: https://github.com/conda-forge/tsv-utils-feedstock This reduces the amount of code to maintain: fewer Python scripts. --- ingest/Snakefile | 11 ++--- ingest/environment.yml | 1 + ingest/scripts/process_alignments.py | 66 ---------------------------- 3 files changed, 5 insertions(+), 73 deletions(-) delete mode 100644 ingest/scripts/process_alignments.py diff --git a/ingest/Snakefile b/ingest/Snakefile index 8d080681e..57f00e6fc 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -148,21 +148,18 @@ rule align: rule process_alignments: input: - script="scripts/process_alignments.py", results=expand( "results/nextclade_{segment}.tsv", segment=config["nucleotide_sequences"], ), output: merged="results/nextclade_merged.tsv", - params: - log_level=LOG_LEVEL, shell: """ - python {input.script} \ - --input "{input.results}" \ - --output {output.merged} \ - --log-level {params.log_level} \ + tsv-append --header {input.results} \ + | tsv-select --header --fields seqName,clade \ + | tsv-filter --header --not-empty clade \ + > {output.merged} """ diff --git a/ingest/environment.yml b/ingest/environment.yml index aee3e5939..2a778b6cb 100644 --- a/ingest/environment.yml +++ b/ingest/environment.yml @@ -18,4 +18,5 @@ dependencies: - requests - seqkit - snakemake + - tsv-utils - unzip diff --git a/ingest/scripts/process_alignments.py b/ingest/scripts/process_alignments.py deleted file mode 100644 index 541184d5e..000000000 --- a/ingest/scripts/process_alignments.py +++ /dev/null @@ -1,66 +0,0 @@ -import csv -import os -import pandas as pd -import logging -import sys - -import click - - -logger = 
logging.getLogger(__name__) -logging.basicConfig( - encoding="utf-8", - level=logging.DEBUG, - format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", - datefmt="%H:%M:%S", -) - -# https://stackoverflow.com/questions/15063936 -csv.field_size_limit(sys.maxsize) - - -def validate_paths(ctx, param, value): - """Custom validation function to check if all provided paths exist.""" - paths = value.split(" ") - for path in paths: - if not os.path.exists(path): - msg = f"Path does not exist: {path}" - raise click.BadParameter(msg) - return paths - - -@click.command() -@click.option( - "--input", - required=True, - callback=validate_paths, - help="List of paths to alignment files.", -) -@click.option("--output", required=True, type=click.Path()) -@click.option( - "--log-level", - default="INFO", - type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), -) -def main( - input: str, - output: str, - log_level: str, -) -> None: - logger.setLevel(log_level) - - appended_df = pd.DataFrame({"seqName": [], "clade": []}) - - for alignment_path in input: - df = pd.read_csv(alignment_path, sep="\t", dtype=str) - seq_clade = df[["seqName", "clade"]] - # drop all rows that do not contain a clade - i.e. did not align to a segment - seq_clade = seq_clade.dropna(subset=["clade"]) - appended_df = appended_df._append(seq_clade, ignore_index=True) - - # saving as tsv file - appended_df.to_csv(output, sep="\t", index=False) - - -if __name__ == "__main__": - main()