Skip to content

Commit

Permalink
Implemented Analyze schema
Browse files Browse the repository at this point in the history
  • Loading branch information
luissian committed Jan 4, 2024
1 parent beb0413 commit a25365c
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 34 deletions.
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies:
- conda-forge::plotly==5.0.0
- conda-forge::numpy==1.20.3
- conda-forge::rich==13.7.0
- conda-forge::python-kaleido
- bioconda::prokka>=1.14
- bioconda::blast>=2.9
- bioconda::mash>=2
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
click
questionary
bio
scikit-learn
scikit-learn
plotly
9 changes: 6 additions & 3 deletions taranis/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import rich.logging
import rich.traceback
import sys
import time

import taranis.prediction
import taranis.utils
Expand Down Expand Up @@ -204,10 +205,11 @@ def analyze_schema(
"""
# for schema_file in schema_files:
results = []
start = time.perf_counter()
with concurrent.futures.ProcessPoolExecutor() as executor:
futures = [
executor.submit(
taranis.analyze_schema.prueba_paralelizacion,
taranis.analyze_schema.parallel_execution,
schema_file,
output,
remove_subset,
Expand All @@ -222,8 +224,9 @@ def analyze_schema(
# Collect results as they complete
for future in concurrent.futures.as_completed(futures):
results.append(future.result())
_ = taranis.analyze_schema.collect_statistics(results)

_ = taranis.analyze_schema.collect_statistics(results, output)
finish = time.perf_counter()
print(f"Schema analyze finish in {round((finish-start)/60, 2)} minutes")

# Reference alleles
@taranis_cli.command(help_priority=2)
Expand Down
92 changes: 67 additions & 25 deletions taranis/analyze_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,30 +56,27 @@ def check_allele_quality(self):
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Can not be converted to protein"
a_quality[record.id]["order"] = "-"
continue
sequence_order = taranis.utils.check_sequence_order(str(record.seq))
if sequence_order == "Error":
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Start or end codon not found"
a_quality[record.id]["order"] = "-"
continue
elif sequence_order == "reverse":
record_sequence = str(record.seq.reverse_complement())
else:
record_sequence = str(record.seq)
a_quality[record.id]["order"] = sequence_order
if record_sequence[0:3] not in taranis.utils.START_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Start codon not found"
continue
if record_sequence[-3:] not in taranis.utils.STOP_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Stop codon not found"
continue
if taranis.utils.find_multiple_stop_codons(record_sequence):
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Multiple stop codons found"
continue
sequence_order = taranis.utils.check_sequence_order(str(record.seq))
if sequence_order == "Error":
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Start or end codon not found"
a_quality[record.id]["order"] = "-"
elif sequence_order == "reverse":
record_sequence = str(record.seq.reverse_complement())
else:
record_sequence = str(record.seq)
a_quality[record.id]["order"] = sequence_order
if record_sequence[0:3] not in taranis.utils.START_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Start codon not found"
elif record_sequence[-3:] not in taranis.utils.STOP_CODON_FORWARD:
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Stop codon not found"

elif taranis.utils.find_multiple_stop_codons(record_sequence):
a_quality[record.id]["quality"] = "Bad quality"
a_quality[record.id]["reason"] = "Multiple stop codons found"
if (
self.remove_no_cds
and a_quality[record.id]["quality"] == "Bad quality"
Expand All @@ -96,12 +93,16 @@ def check_allele_quality(self):
tmp_dict[seq_value] = 0
else:
bad_quality_record.append(rec_id)
a_quality[rec_id]["quality"] ="Bad quality"
a_quality[rec_id]["reason"] ="Duplicate allele"
if self.remove_subset:
unique_seq = list(set(list(allele_seq.values())))
for rec_id, seq_value in allele_seq.items():
unique_seq.remove(seq_value)
if seq_value in unique_seq:
bad_quality_record.append(rec_id)
a_quality[rec_id]["quality"] ="Bad quality"
a_quality[rec_id]["reason"] ="Sub set allele"
new_schema_folder = os.path.join(self.output, "new_schema")
_ = taranis.utils.create_new_folder(new_schema_folder)
new_schema_file = os.path.join(new_schema_folder, self.allele_name + ".fasta")
Expand All @@ -118,6 +119,7 @@ def check_allele_quality(self):
return a_quality

def fetch_statistics_from_alleles(self, a_quality):
possible_bad_quality = ["Can not be converted to protein", "Start codon not found", "Stop codon not found", "Multiple stop codons found" ,"Duplicate allele", "Sub set allele"]
record_data = {}
bad_quality_reason = {}
a_length = []
Expand All @@ -138,6 +140,9 @@ def fetch_statistics_from_alleles(self, a_quality):
record_data["good_percent"] = round(
100 * (total_alleles - bad_quality_counter) / total_alleles, 2
)
for item in possible_bad_quality:
record_data[item] = bad_quality_reason[item] if item in bad_quality_reason else 0
# record_data["bad_quality_reason"] = bad_quality_reason
return record_data

def analyze_allele_in_schema(self):
Expand All @@ -156,7 +161,7 @@ def analyze_allele_in_schema(self):
return allele_data


def prueba_paralelizacion(
def parallel_execution(
schema_allele,
output,
remove_subset,
Expand All @@ -179,6 +184,43 @@ def prueba_paralelizacion(
return schema_obj.analyze_allele_in_schema()


def collect_statistics(stat_data):

def collect_statistics(stat_data, out_folder):
    """Aggregate per-locus analysis results and create summary outputs.

    Builds a pandas DataFrame from the per-schema-file statistics, writes it
    to ``statistics/statistics.csv`` under *out_folder*, and renders summary
    graphics (PNG via plotly/kaleido) in ``statistics/graphics``.

    :param stat_data: list of per-locus statistics dicts collected from the
        parallel schema analysis
    :param out_folder: base output folder; a ``statistics`` sub-folder is
        created inside it
    :return: None
    """

    def stats_graphics(stats_folder):
        # All plotting lives in this helper so the outer flow stays linear.
        graphic_folder = os.path.join(stats_folder, "graphics")
        _ = taranis.utils.create_new_folder(graphic_folder)
        # Line graphic: how many genes have each allele count.
        genes_alleles_df = (
            stats_df["num_alleles"]
            .value_counts()
            .rename_axis("alleles")
            .sort_index()
            .reset_index(name="genes")
        )
        _ = taranis.utils.create_graphic(
            graphic_folder,
            "num_genes_per_allele.png",
            "lines",
            genes_alleles_df["alleles"].to_list(),
            genes_alleles_df["genes"].to_list(),
            ["Allele", "number of genes"],
            "title",
        )
        # Pie graphic: overall good/bad quality split (mean of per-locus
        # good_percent; the complement is treated as bad quality).
        good_percent = [round(stats_df["good_percent"].mean(), 2)]
        good_percent.append(100 - good_percent[0])
        labels = ["Good quality", "Bad quality"]
        _ = taranis.utils.create_graphic(
            graphic_folder,
            "quality_of_locus.png",
            "pie",
            good_percent,
            "",
            labels,
            "Quality of locus",
        )
        # Pie graphic for bad quality reasons; only drawn when at least one
        # allele was flagged bad.
        labels = []
        values = []
        for item in taranis.utils.POSIBLE_BAD_QUALITY:
            labels.append(item)
            values.append(stats_df[item].sum())
        if sum(values) > 0:
            _ = taranis.utils.create_graphic(
                graphic_folder,
                "bad_quality_reason.png",
                "pie",
                values,
                "",
                labels,
                "Bad quality reason",
            )
        # Pie graphic for loci whose gene name was not annotated by Prokka;
        # only drawn when at least one such locus exists.
        times_not_found_gene = len(
            stats_df[stats_df["annotation_gene"] == "Not found by Prokka"]
        )
        if times_not_found_gene > 0:
            gene_not_found = [times_not_found_gene, len(stat_data)]
            labels = ["Not found gene name", "Number of alleles"]
            _ = taranis.utils.create_graphic(
                graphic_folder,
                "gene_not_found.png",
                "pie",
                gene_not_found,
                "",
                labels,
                "Quality of locus",
            )

    stats_df = pd.DataFrame(stat_data)
    stats_folder = os.path.join(out_folder, "statistics")
    _ = taranis.utils.create_new_folder(stats_folder)
    _ = taranis.utils.write_data_to_file(stats_folder, "statistics.csv", stats_df)
    stats_graphics(stats_folder)
11 changes: 10 additions & 1 deletion taranis/pruebas.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@
import pdb
import random

"""
Para hacer las pruebas con alfaclust activo el entorno de conda alfatclust_env
despues me voy a la carpeta donde me he descargado, de git, alfatclust y
ejecuto :
./alfatclust.py -i /media/lchapado/Reference_data/proyectos_isciii/taranis/taranis_testing_data/listeria_testing_schema/lmo0003.fasta -o /media/lchapado/Reference_data/proyectos_isciii/taranis/test/alfacluster_test/resultado_alfaclust_lmo003 -l 0.9
despues ejecuto este programa de prueba cambiando los ficheros de resultados
"""

# read result of alfatclust

alfa_clust_file = "/media/lchapado/Reference_data/proyectos_isciii/taranis/test/resultado_alfatclust-090"
Expand All @@ -16,7 +25,7 @@
locus_list = []
for line in lines:
line = line.strip()
if line == "#Cluster 9" :
if line == "#Cluster 5" :
if alleles_found == False:
alleles_found = True
continue
Expand Down
28 changes: 24 additions & 4 deletions taranis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np
import questionary
import os
import plotly.graph_objects as go
import rich.console
import shutil
import subprocess
Expand Down Expand Up @@ -47,6 +48,8 @@ def rich_force_colors():
STOP_CODON_FORWARD = ['TAA', 'TAG','TGA']
stop_codon_reverse = ['TTA', 'CTA','TCA']

POSIBLE_BAD_QUALITY = ["Can not be converted to protein", "Start codon not found", "Stop codon not found", "Multiple stop codons found" ,"Duplicate allele", "Sub set allele"]

def check_sequence_order(allele_sequence):
# check direction
if allele_sequence[0:3] in START_CODON_FORWARD or allele_sequence[-3:] in STOP_CODON_FORWARD:
Expand All @@ -57,7 +60,7 @@ def check_sequence_order(allele_sequence):

def create_annotation_files(fasta_file, annotation_dir, prefix, genus="Genus", species="species", usegenus=False):
try:
_ = subprocess.run (['prokka', fasta_file, '--force', annotation_dir, '--genus', genus, '--species', species, '--usegenus', str(usegenus), '--gcode', '11', '--prefix', prefix, '--quiet'])
_ = subprocess.run (['prokka', fasta_file, '--force', '--outdir', annotation_dir, '--genus', genus, '--species', species, '--usegenus', str(usegenus), '--gcode', '11', '--prefix', prefix, '--quiet'])
except Exception as e:
log.error("Unable to run prokka. Error message: %s ", e )
stderr.print("[red] Unable to run prokka. Given error; " + e)
Expand All @@ -83,7 +86,19 @@ def create_new_folder(folder_name):
stderr.print("[red] Folder does not have any file which match your request")
sys.exit(1)
return



def create_graphic(out_folder, f_name, mode, x_data, y_data, labels, title):
    """Create a plotly figure and save it as an image file.

    :param out_folder: folder where the image file is written
    :param f_name: image file name (extension selects the format, e.g. .png)
    :param mode: "lines" for a scatter line plot, "pie" for a pie chart;
        any other value produces an empty figure with just the title
    :param x_data: x values for "lines" mode, or slice values for "pie" mode
    :param y_data: y values for "lines" mode (ignored for "pie")
    :param labels: slice labels for "pie" mode (ignored for "lines")
    :param title: figure title
    :return: None; the image is written via fig.write_image (requires kaleido)
    """
    fig = go.Figure()
    if mode == "lines":
        fig.add_trace(go.Scatter(x=x_data, y=y_data, mode=mode, name=title))
    elif mode == "pie":
        fig.add_trace(go.Pie(labels=labels, values=x_data))
    fig.update_layout(title_text=title)
    fig.write_image(os.path.join(out_folder, f_name))


def get_files_in_folder(folder, extension=None):
"""get the list of files, filtered by extension in the input folder. If
extension is not set, then all files in folder are returned
Expand Down Expand Up @@ -205,13 +220,13 @@ def read_annotation_file(ann_file, allele_name, only_first_line=True):
gene_idx = heading.index("gene")
if only_first_line:
first_line = lines[1].split("\t")
ann_data[allele_name] = first_line[gene_idx] if first_line[gene_idx] != "" else "Not found by Prokka'"
ann_data[allele_name] = first_line[gene_idx] if first_line[gene_idx] != "" else "Not found by Prokka"
else:
# Return all annotation lines
for line in lines[1:]:
s_line = line.strip().split("\t")
allele_key = allele_name + "_" + s_line[locus_tag_idx].split("_")[1]
ann_data[allele_key] = s_line[gene_idx] if s_line[gene_idx] != "" else "Not found by Prokka'"
ann_data[allele_key] = s_line[gene_idx] if s_line[gene_idx] != "" else "Not found by Prokka"
return ann_data


Expand Down Expand Up @@ -243,5 +258,10 @@ def write_fasta_file(out_folder, seq_data, allele_name=None, f_name=None):
fo.write(seq_data)
return f_name

def write_data_to_file(out_folder, f_name, data, include_header=True, data_type="pandas", extension="csv"):
    """Write tabular data into a file under *out_folder*.

    :param out_folder: destination folder (must already exist)
    :param f_name: output file name
    :param data: the data to write; a pandas DataFrame when data_type is "pandas"
    :param include_header: whether to write the column header row
    :param data_type: kind of data container; only "pandas" is implemented
    :param extension: reserved for future formats; currently unused
    :return: None
    """
    destination = os.path.join(out_folder, f_name)
    if data_type != "pandas":
        # NOTE(review): only the pandas writer is implemented; any other
        # data_type value is silently ignored — confirm this is intended.
        return
    data.to_csv(destination, sep=",", header=include_header)
    return


0 comments on commit a25365c

Please sign in to comment.