First changes to the new refactorization

BU-ISCIII · Dec 19, 2023 · 5c7e3e3 · 5c7e3e3
1 parent 6022168
commit 5c7e3e3
Show file tree

Hide file tree

Showing 9 changed files with 2,998 additions and 58 deletions.
diff --git a/environment.yml b/environment.yml
@@ -7,10 +7,10 @@ dependencies:
   - conda-forge::python>=3.6
   - conda-forge::biopython==1.72
   - conda-forge::pandas==1.2.4
-  - conda-forge::progressbar==2.5
   - conda-forge::openpyxl==3.0.7
   - conda-forge::plotly==5.0.0
   - conda-forge::numpy==1.20.3
+  - conda-forge::rich==13.7.0
   - bioconda::prokka>=1.14
   - bioconda::blast>=2.9
   - bioconda::mash>=2

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 from setuptools import setup, find_packages
 
-version = "2.1.0"
+version = "2.2.0"
 
 with open("README.md") as f:
     readme = f.read()

diff --git a/taranis/__main__.py b/taranis/__main__.py
@@ -1,14 +1,17 @@
 import logging
 
 import click
+import glob
 import os
 import rich.console
 import rich.logging
 import rich.traceback
 import sys
 
+import taranis.prediction
 import taranis.utils
 import taranis.reference_alleles
+import taranis.allele_calling
 
 log = logging.getLogger()
 
@@ -86,7 +89,7 @@ def decorator(f):
     help="Print verbose output to the console.",
 )
 @click.option(
-    "-l", "--log-file", help="Save a verbose log to a file.", metavar="<filename>"
+    "-l", "--log-file", help="Save a verbose log to a file.", metavar="filename"
 )
 def taranis_cli(verbose, log_file):
     # Set the base logger to output DEBUG
@@ -111,6 +114,7 @@ def reference_alleles(
     schema,
     output,
 ):
+    # taranis reference-alleles -s ../../documentos_antiguos/datos_prueba/schema_1_locus/ -o ../../new_taranis_result_code
     # taranis reference-alleles -s ../../documentos_antiguos/datos_prueba/schema_test/ -o ../../new_taranis_result_code
     if not taranis.utils.folder_exists(schema):
         log.error("schema folder %s does not exists", schema)
@@ -140,4 +144,70 @@ def reference_alleles(
     """Create the reference alleles from the schema """
     for f_file in schema_files:
         ref_alleles = taranis.reference_alleles.ReferenceAlleles(f_file, output)
-    _ = ref_alleles.create_ref_alleles()
+    _ = ref_alleles.create_ref_alleles()
+
+
+# Allele calling
+#  taranis -l ../../test/taranis.log  allele-calling -s ../../documentos_antiguos/datos_prueba/schema_test/ -r ../../documentos_antiguos/datos_prueba/reference_alleles/ -g ../../taranis_data/listeria_genoma_referencia/listeria.fasta -a ../../taranis_data/listeria_sampled/RA-L2073_R1.fasta -o ../../test/
+# taranis allele-calling -s ../../documentos_antiguos/datos_prueba/schema_test/ -r ../../documentos_antiguos/datos_prueba/reference_alleles/ -g ../../taranis_data/listeria_genoma_referencia/listeria.fasta -a ../../taranis_data/listeria_sampled/RA-L2073_R1.fasta -o ../../test/
+# taranis allele-calling -s ../../documentos_antiguos/datos_prueba/schema_test/ -r ../../documentos_antiguos/datos_prueba/reference_alleles/ -g ../../taranis_data/listeria_genoma_referencia/listeria.fasta -a ../../taranis_data/muestras_listeria_servicio_fasta/3789/assembly.fasta -o ../../test/
+
+@taranis_cli.command(help_priority=3)
+@click.option("-s", "--schema", required=True, multiple=False, type=click.Path(), help="Directory where the schema with the core gene files are located. ")
+@click.option("-r", "--reference", required=True, multiple=False, type=click.Path(), help="Directory where the schema reference allele files are located. ")
+@click.option("-g", "--genome", required=True, multiple=False, type=click.Path(), help="Genome reference file")
+@click.option("-a", "--sample", required=True, multiple=False, type=click.Path(), help="Sample location file in fasta format. ")
+@click.option("-o", "--output", required=True, multiple=False, type=click.Path(), help="Output folder to save reference alleles")
+def allele_calling(
+    schema,
+    reference,
+    genome,
+    sample,
+    output,
+):
+    folder_to_check = [schema, reference]
+    for folder in folder_to_check:
+        if not taranis.utils.folder_exists(folder):
+            log.error("folder %s does not exists", folder)
+            stderr.print(
+                "[red] Folder does not exist. " + folder + "!"
+            )
+            sys.exit(1)
+    if not taranis.utils.file_exists(sample):
+        log.error("file %s does not exists", sample)
+        stderr.print(
+            "[red] File does not exist. " + sample + "!"
+        )
+        sys.exit(1)
+    schema_files = taranis.utils.get_files_in_folder(schema, "fasta")
+    if len(schema_files) == 0:
+        log.error("Schema folder %s does not have any fasta file", schema)
+        stderr.print("[red] Schema folder does not have any fasta file")
+        sys.exit(1)
+
+    # Check if output folder exists
+    if taranis.utils.folder_exists(output):
+        q_question = "Folder " + output + " already exists. Files will be overwritten. Do you want to continue?"
+        if "no" in taranis.utils.query_user_yes_no(q_question, "no"):
+            log.info("Aborting code by user request")
+            stderr.print("[red] Exiting code. ")
+            sys.exit(1)
+    else:
+        try:
+            os.makedirs(output)
+        except OSError as e:
+            log.info("Unable to create folder at %s", output)
+            stderr.print("[red] ERROR. Unable to create folder  " + output)
+            sys.exit(1)
+    # Filter fasta files from reference folder
+    ref_alleles = glob.glob(os.path.join(reference, "*.fasta"))
+    # Create predictions
+    pred_out = os.path.join(output, "prediction" )
+    pred_sample = taranis.prediction.Prediction(genome, sample, pred_out)
+    pred_sample.training()
+    pred_sample.prediction()
+
+    """Analyze the sample file against schema to identify outbreakers
+    """
+    sample_allele = taranis.allele_calling.Sample(pred_sample, sample, schema, ref_alleles ,output)
+    sample_allele.analyze_sample()
diff --git a/taranis/allele_calling.py b/taranis/allele_calling.py
@@ -0,0 +1,93 @@
+import logging
+import os
+import rich.console
+
+
+import taranis.utils
+import taranis.blast
+import numpy
+from pathlib import Path
+
+
+import pdb
+log = logging.getLogger(__name__)
+stderr = rich.console.Console(
+    stderr=True,
+    style="dim",
+    highlight=False,
+    force_terminal=taranis.utils.rich_force_colors(),
+)
+
+class Sample:
+    def __init__(self, prediction, sample_file, schema, reference_alleles, out_folder):
+        self.prediction = prediction
+        self.sample_file = sample_file
+        self.schema = schema
+        self.ref_alleles = reference_alleles
+        self.out_folder = out_folder
+        self.s_name = Path(sample_file).stem
+        self.blast_dir = os.path.join(out_folder,"blastdb")
+        self.blast_sample = os.path.join(self.blast_dir, self.s_name)
+
+    def assign_abbreviation(self, query_seq, allele_name, sample_contig, schema_gene):
+        s_alleles_blast = taranis.blast.Blast("nucl")
+        ref_allele_blast_dir = os.path.join(self.blast_dir, "ref_alleles")
+        query_path = os.path.join(self.out_folder, "tmp", allele_name)
+        # Write to file the sequence to find out the loci name that fully match 
+        f_name = taranis.utils.write_fasta_file(query_path, query_seq, allele_name)
+        query_file = os.path.join(query_path, f_name)
+        _ = s_alleles_blast.create_blastdb(schema_gene, ref_allele_blast_dir)
+        # Blast with sample sequence to find the allele in the schema 
+        seq_blast_match = s_alleles_blast.run_blast(query_file, perc_identity=100)
+        pdb.set_trace()
+        if len(seq_blast_match) == 1:
+
+            # Hacer un blast con la query esta secuencia y la database del alelo
+            # Create  blast db with sample file
+
+
+            pass
+
+
+    def catalog_alleles (self, ref_allele):
+        allele_name = Path(ref_allele).stem
+        schema_gene = os.path.join(self.schema, allele_name + ".fasta")  
+        allele_name = Path(ref_allele).stem
+        # run blast with sample as db and reference allele as query
+        sample_blast_match = self.sample_blast.run_blast(ref_allele)
+        if len(sample_blast_match) > 0 :
+            s_lines = []
+            for out_line in sample_blast_match:
+                s_lines.append(out_line.split("\t"))
+            np_lines = numpy.array(s_lines)
+            # convert to float the perc_identity to find out the max value
+            max_val = numpy.max(np_lines[:,2].astype(float))
+            mask = np_lines[:, 2] ==str(max_val)
+            # Select rows that match the percent identity. Index 2 in blast results
+            sel_row = np_lines[mask, :] = np_lines[mask, :]
+            query_seq = sel_row[0,14]
+            sample_contig = sel_row[0,1]
+            abbr = self.assign_abbreviation(query_seq, allele_name, sample_contig, schema_gene)
+        else:
+            # Sample does not have a reference allele to be matched
+            # Keep LNF info
+            # ver el codigo de espe
+            #lnf_tpr_tag()
+            pass
+        pdb.set_trace()
+
+
+    def analyze_sample(self):
+        # Create  blast db with sample file
+        self.sample_blast = taranis.blast.Blast("nucl")
+        _ = self.sample_blast.create_blastdb(self.sample_file, self.blast_dir)
+        result = {}
+        pdb.set_trace()
+        for ref_allele in self.ref_alleles:
+            # schema_alleles = os.path.join(self.schema, ref_allele)
+            # parallel in all CPUs in cluster node
+            result[ref_allele] = self.catalog_alleles(ref_allele)
+
+        pdb.set_trace()
+        return
+