Skip to content

Commit

Permalink
Implemented Analyze schema
Browse files Browse the repository at this point in the history
  • Loading branch information
luissian committed Jan 4, 2024
1 parent beb0413 commit a25365c
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 34 deletions.
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies:
- conda-forge::plotly==5.0.0
- conda-forge::numpy==1.20.3
- conda-forge::rich==13.7.0
- conda-forge::python-kaleido
- bioconda::prokka>=1.14
- bioconda::blast>=2.9
- bioconda::mash>=2
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
click
questionary
bio
scikit-learn
scikit-learn
plotly
9 changes: 6 additions & 3 deletions taranis/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import rich.logging
import rich.traceback
import sys
import time

import taranis.prediction
import taranis.utils
Expand Down Expand Up @@ -204,10 +205,11 @@ def analyze_schema(
"""
# for schema_file in schema_files:
results = []
start = time.perf_counter()
with concurrent.futures.ProcessPoolExecutor() as executor:
futures = [
executor.submit(
taranis.analyze_schema.prueba_paralelizacion,
taranis.analyze_schema.parallel_execution,
schema_file,
output,
remove_subset,
Expand All @@ -222,8 +224,9 @@ def analyze_schema(
# Collect results as they complete
for future in concurrent.futures.as_completed(futures):
results.append(future.result())
_ = taranis.analyze_schema.collect_statistics(results)

_ = taranis.analyze_schema.collect_statistics(results, output)
finish = time.perf_counter()
print(f"Schema analyze finish in {round((finish-start)/60, 2)} minutes")

# Reference alleles
@taranis_cli.command(help_priority=2)
Expand Down
92 changes: 67 additions & 25 deletions taranis/analyze_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,30 +56,27 @@ def check_allele_quality(self):
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Can not be converted to protein"
a_quality[record.id]["order"] = "-"
continue
sequence_order = taranis.utils.check_sequence_order(str(record.seq))
if sequence_order == "Error":
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Start or end codon not found"
a_quality[record.id]["order"] = "-"
continue
elif sequence_order == "reverse":
record_sequence = str(record.seq.reverse_complement())
else:
record_sequence = str(record.seq)
a_quality[record.id]["order"] = sequence_order
if record_sequence[0:3] not in taranis.utils.START_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Start codon not found"
continue
if record_sequence[-3:] not in taranis.utils.STOP_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Stop codon not found"
continue
if taranis.utils.find_multiple_stop_codons(record_sequence):
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Multiple stop codons found"
continue
sequence_order = taranis.utils.check_sequence_order(str(record.seq))
if sequence_order == "Error":
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Start or end codon not found"
a_quality[record.id]["order"] = "-"
elif sequence_order == "reverse":
record_sequence = str(record.seq.reverse_complement())
else:
record_sequence = str(record.seq)
a_quality[record.id]["order"] = sequence_order
if record_sequence[0:3] not in taranis.utils.START_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Start codon not found"
elif record_sequence[-3:] not in taranis.utils.STOP_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Stop codon not found"

elif taranis.utils.find_multiple_stop_codons(record_sequence):
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Multiple stop codons found"
if (
self.remove_no_cds
and a_quality[record.id]["quality"] == "Bad quality"
Expand All @@ -96,12 +93,16 @@ def check_allele_quality(self):
tmp_dict[seq_value] = 0
else:
bad_quality_record.append(rec_id)
a_quality[rec_id]["quality"] ="Bad quality"
a_quality[rec_id]["reason"] ="Duplicate allele"
if self.remove_subset:
unique_seq = list(set(list(allele_seq.values())))
for rec_id, seq_value in allele_seq.items():
unique_seq.remove(seq_value)
if seq_value in unique_seq:
bad_quality_record.append(rec_id)
a_quality[rec_id]["quality"] ="Bad quality"
a_quality[rec_id]["reason"] ="Sub set allele"
new_schema_folder = os.path.join(self.output, "new_schema")
_ = taranis.utils.create_new_folder(new_schema_folder)
new_schema_file = os.path.join(new_schema_folder, self.allele_name + ".fasta")
Expand All @@ -118,6 +119,7 @@ def check_allele_quality(self):
return a_quality

def fetch_statistics_from_alleles(self, a_quality):
possible_bad_quality = ["Can not be converted to protein", "Start codon not found", "Stop codon not found", "Multiple stop codons found" ,"Duplicate allele", "Sub set allele"]
record_data = {}
bad_quality_reason = {}
a_length = []
Expand All @@ -138,6 +140,9 @@ def fetch_statistics_from_alleles(self, a_quality):
record_data["good_percent"] = round(
100 * (total_alleles - bad_quality_counter) / total_alleles, 2
)
for item in possible_bad_quality:
record_data[item] = bad_quality_reason[item] if item in bad_quality_reason else 0
# record_data["bad_quality_reason"] = bad_quality_reason
return record_data

def analyze_allele_in_schema(self):
Expand All @@ -156,7 +161,7 @@ def analyze_allele_in_schema(self):
return allele_data


def prueba_paralelizacion(
def parallel_execution(
schema_allele,
output,
remove_subset,
Expand All @@ -179,6 +184,43 @@ def prueba_paralelizacion(
return schema_obj.analyze_allele_in_schema()


def collect_statistics(stat_data):

def collect_statistics(stat_data, out_folder):
    """Aggregate per-locus analysis results and create summary outputs.

    Builds a pandas DataFrame from the per-schema-file statistics, writes it
    to ``statistics/statistics.csv`` under *out_folder*, and renders summary
    graphics (PNG via plotly/kaleido) in ``statistics/graphics``.

    :param stat_data: list of per-locus statistics dicts collected from the
        parallel schema analysis
    :param out_folder: base output folder; a ``statistics`` sub-folder is
        created inside it
    :return: None
    """

    def stats_graphics(stats_folder):
        # All plotting lives in this helper so the outer flow stays linear.
        graphic_folder = os.path.join(stats_folder, "graphics")
        _ = taranis.utils.create_new_folder(graphic_folder)
        # Line graphic: how many genes have each allele count.
        genes_alleles_df = (
            stats_df["num_alleles"]
            .value_counts()
            .rename_axis("alleles")
            .sort_index()
            .reset_index(name="genes")
        )
        _ = taranis.utils.create_graphic(
            graphic_folder,
            "num_genes_per_allele.png",
            "lines",
            genes_alleles_df["alleles"].to_list(),
            genes_alleles_df["genes"].to_list(),
            ["Allele", "number of genes"],
            "title",
        )
        # Pie graphic: overall good/bad quality split (mean of per-locus
        # good_percent; the complement is treated as bad quality).
        good_percent = [round(stats_df["good_percent"].mean(), 2)]
        good_percent.append(100 - good_percent[0])
        labels = ["Good quality", "Bad quality"]
        _ = taranis.utils.create_graphic(
            graphic_folder,
            "quality_of_locus.png",
            "pie",
            good_percent,
            "",
            labels,
            "Quality of locus",
        )
        # Pie graphic for bad quality reasons; only drawn when at least one
        # allele was flagged bad.
        labels = []
        values = []
        for item in taranis.utils.POSIBLE_BAD_QUALITY:
            labels.append(item)
            values.append(stats_df[item].sum())
        if sum(values) > 0:
            _ = taranis.utils.create_graphic(
                graphic_folder,
                "bad_quality_reason.png",
                "pie",
                values,
                "",
                labels,
                "Bad quality reason",
            )
        # Pie graphic for loci whose gene name was not annotated by Prokka;
        # only drawn when at least one such locus exists.
        times_not_found_gene = len(
            stats_df[stats_df["annotation_gene"] == "Not found by Prokka"]
        )
        if times_not_found_gene > 0:
            gene_not_found = [times_not_found_gene, len(stat_data)]
            labels = ["Not found gene name", "Number of alleles"]
            _ = taranis.utils.create_graphic(
                graphic_folder,
                "gene_not_found.png",
                "pie",
                gene_not_found,
                "",
                labels,
                "Quality of locus",
            )

    stats_df = pd.DataFrame(stat_data)
    stats_folder = os.path.join(out_folder, "statistics")
    _ = taranis.utils.create_new_folder(stats_folder)
    _ = taranis.utils.write_data_to_file(stats_folder, "statistics.csv", stats_df)
    stats_graphics(stats_folder)
11 changes: 10 additions & 1 deletion taranis/pruebas.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@
import pdb
import random

"""
Para hacer las pruebas con alfaclust activo el entorno de conda alfatclust_env
despues me voy a la carpeta donde me he descargado, de git, alfatclust y
ejecuto :
./alfatclust.py -i /media/lchapado/Reference_data/proyectos_isciii/taranis/taranis_testing_data/listeria_testing_schema/lmo0003.fasta -o /media/lchapado/Reference_data/proyectos_isciii/taranis/test/alfacluster_test/resultado_alfaclust_lmo003 -l 0.9
despues ejecuto este programa de prueba cambiando los ficheros de resultados
"""

# read result of alfatclust

alfa_clust_file = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/resultado_alfatclust-090"
Expand All @@ -16,7 +25,7 @@
locus_list = []
for line in lines:
line = line.strip()
if line == "#Cluster 9" :
if line == "#Cluster 5" :
if alleles_found == False:
alleles_found = True
continue
Expand Down
28 changes: 24 additions & 4 deletions taranis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np
import questionary
import os
import plotly.graph_objects as go
import rich.console
import shutil
import subprocess
Expand Down Expand Up @@ -47,6 +48,8 @@ def rich_force_colors():
STOP_CODON_FORWARD = ['TAA', 'TAG','TGA']
stop_codon_reverse = ['TTA', 'CTA','TCA']

POSIBLE_BAD_QUALITY = ["Can not be converted to protein", "Start codon not found", "Stop codon not found", "Multiple stop codons found" ,"Duplicate allele", "Sub set allele"]

def check_sequence_order(allele_sequence):
# check direction
if allele_sequence[0:3] in START_CODON_FORWARD or allele_sequence[-3:] in STOP_CODON_FORWARD:
Expand All @@ -57,7 +60,7 @@ def check_sequence_order(allele_sequence):

def create_annotation_files(fasta_file, annotation_dir, prefix, genus="Genus", species="species", usegenus=False):
try:
_ = subprocess.run (['prokka', fasta_file, '--force', annotation_dir, '--genus', genus, '--species', species, '--usegenus', str(usegenus), '--gcode', '11', '--prefix', prefix, '--quiet'])
_ = subprocess.run (['prokka', fasta_file, '--force', '--outdir', annotation_dir, '--genus', genus, '--species', species, '--usegenus', str(usegenus), '--gcode', '11', '--prefix', prefix, '--quiet'])
except Exception as e:
log.error("Unable to run prokka. Error message: %s ", e )
stderr.print("[red] Unable to run prokka. Given error; " + e)
Expand All @@ -83,7 +86,19 @@ def create_new_folder(folder_name):
stderr.print("[red] Folder does not have any file which match your request")
sys.exit(1)
return



def create_graphic(out_folder, f_name, mode, x_data, y_data, labels, title):
    """Create a plotly figure and save it as an image file.

    :param out_folder: folder where the image file is written
    :param f_name: image file name (extension selects the format, e.g. .png)
    :param mode: "lines" for a scatter line plot, "pie" for a pie chart;
        any other value produces an empty figure with just the title
    :param x_data: x values for "lines" mode, or slice values for "pie" mode
    :param y_data: y values for "lines" mode (ignored for "pie")
    :param labels: slice labels for "pie" mode (ignored for "lines")
    :param title: figure title
    :return: None; the image is written via fig.write_image (requires kaleido)
    """
    fig = go.Figure()
    if mode == "lines":
        fig.add_trace(go.Scatter(x=x_data, y=y_data, mode=mode, name=title))
    elif mode == "pie":
        fig.add_trace(go.Pie(labels=labels, values=x_data))
    fig.update_layout(title_text=title)
    fig.write_image(os.path.join(out_folder, f_name))


def get_files_in_folder(folder, extension=None):
"""get the list of files, filtered by extension in the input folder. If
extension is not set, then all files in folder are returned
Expand Down Expand Up @@ -205,13 +220,13 @@ def read_annotation_file(ann_file, allele_name, only_first_line=True):
gene_idx = heading.index("gene")
if only_first_line:
first_line = lines[1].split("\t")
ann_data[allele_name] = first_line[gene_idx] if first_line[gene_idx] != "" else "Not found by Prokka'"
ann_data[allele_name] = first_line[gene_idx] if first_line[gene_idx] != "" else "Not found by Prokka"
else:
# Return all annotation lines
for line in lines[1:]:
s_line = line.strip().split("\t")
allele_key = allele_name + "_" + s_line[locus_tag_idx].split("_")[1]
ann_data[allele_key] = s_line[gene_idx] if s_line[gene_idx] != "" else "Not found by Prokka'"
ann_data[allele_key] = s_line[gene_idx] if s_line[gene_idx] != "" else "Not found by Prokka"
return ann_data


Expand Down Expand Up @@ -243,5 +258,10 @@ def write_fasta_file(out_folder, seq_data, allele_name=None, f_name=None):
fo.write(seq_data)
return f_name

def write_data_to_file(out_folder, f_name, data, include_header=True, data_type="pandas", extension="csv"):
    """Write tabular data into a file under *out_folder*.

    :param out_folder: destination folder (must already exist)
    :param f_name: output file name
    :param data: the data to write; a pandas DataFrame when data_type is "pandas"
    :param include_header: whether to write the column header row
    :param data_type: kind of data container; only "pandas" is implemented
    :param extension: reserved for future formats; currently unused
    :return: None
    """
    destination = os.path.join(out_folder, f_name)
    if data_type != "pandas":
        # NOTE(review): only the pandas writer is implemented; any other
        # data_type value is silently ignored — confirm this is intended.
        return
    data.to_csv(destination, sep=",", header=include_header)
    return


0 comments on commit a25365c

Please sign in to comment.