Skip to content

Commit

Permalink
First changes to the new refactorization
Browse files Browse the repository at this point in the history
  • Loading branch information
luissian committed Dec 19, 2023
1 parent 6022168 commit 5c7e3e3
Show file tree
Hide file tree
Showing 9 changed files with 2,998 additions and 58 deletions.
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ dependencies:
- conda-forge::python>=3.6
- conda-forge::biopython==1.72
- conda-forge::pandas==1.2.4
- conda-forge::progressbar==2.5
- conda-forge::openpyxl==3.0.7
- conda-forge::plotly==5.0.0
- conda-forge::numpy==1.20.3
- conda-forge::rich==13.7.0
- bioconda::prokka>=1.14
- bioconda::blast>=2.9
- bioconda::mash>=2
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from setuptools import setup, find_packages

version = "2.1.0"
version = "2.2.0"

with open("README.md") as f:
readme = f.read()
Expand Down
74 changes: 72 additions & 2 deletions taranis/__main__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import logging

import click
import glob
import os
import rich.console
import rich.logging
import rich.traceback
import sys

import taranis.prediction
import taranis.utils
import taranis.reference_alleles
import taranis.allele_calling

log = logging.getLogger()

Expand Down Expand Up @@ -86,7 +89,7 @@ def decorator(f):
help="Print verbose output to the console.",
)
@click.option(
"-l", "--log-file", help="Save a verbose log to a file.", metavar="<filename>"
"-l", "--log-file", help="Save a verbose log to a file.", metavar="filename"
)
def taranis_cli(verbose, log_file):
# Set the base logger to output DEBUG
Expand All @@ -111,6 +114,7 @@ def reference_alleles(
schema,
output,
):
# taranis reference-alleles -s ../../documentos_antiguos/datos_prueba/schema_1_locus/ -o ../../new_taranis_result_code
# taranis reference-alleles -s ../../documentos_antiguos/datos_prueba/schema_test/ -o ../../new_taranis_result_code
if not taranis.utils.folder_exists(schema):
log.error("schema folder %s does not exists", schema)
Expand Down Expand Up @@ -140,4 +144,70 @@ def reference_alleles(
"""Create the reference alleles from the schema """
for f_file in schema_files:
ref_alleles = taranis.reference_alleles.ReferenceAlleles(f_file, output)
_ = ref_alleles.create_ref_alleles()
_ = ref_alleles.create_ref_alleles()


# Allele calling
# taranis -l ../../test/taranis.log allele-calling -s ../../documentos_antiguos/datos_prueba/schema_test/ -r ../../documentos_antiguos/datos_prueba/reference_alleles/ -g ../../taranis_data/listeria_genoma_referencia/listeria.fasta -a ../../taranis_data/listeria_sampled/RA-L2073_R1.fasta -o ../../test/
# taranis allele-calling -s ../../documentos_antiguos/datos_prueba/schema_test/ -r ../../documentos_antiguos/datos_prueba/reference_alleles/ -g ../../taranis_data/listeria_genoma_referencia/listeria.fasta -a ../../taranis_data/listeria_sampled/RA-L2073_R1.fasta -o ../../test/
# taranis allele-calling -s ../../documentos_antiguos/datos_prueba/schema_test/ -r ../../documentos_antiguos/datos_prueba/reference_alleles/ -g ../../taranis_data/listeria_genoma_referencia/listeria.fasta -a ../../taranis_data/muestras_listeria_servicio_fasta/3789/assembly.fasta -o ../../test/

@taranis_cli.command(help_priority=3)
@click.option("-s", "--schema", required=True, multiple=False, type=click.Path(), help="Directory where the schema with the core gene files are located. ")
@click.option("-r", "--reference", required=True, multiple=False, type=click.Path(), help="Directory where the schema reference allele files are located. ")
@click.option("-g", "--genome", required=True, multiple=False, type=click.Path(), help="Genome reference file")
@click.option("-a", "--sample", required=True, multiple=False, type=click.Path(), help="Sample location file in fasta format. ")
@click.option("-o", "--output", required=True, multiple=False, type=click.Path(), help="Output folder to save reference alleles")
def allele_calling(
schema,
reference,
genome,
sample,
output,
):
folder_to_check = [schema, reference]
for folder in folder_to_check:
if not taranis.utils.folder_exists(folder):
log.error("folder %s does not exists", folder)
stderr.print(
"[red] Folder does not exist. " + folder + "!"
)
sys.exit(1)
if not taranis.utils.file_exists(sample):
log.error("file %s does not exists", sample)
stderr.print(
"[red] File does not exist. " + sample + "!"
)
sys.exit(1)
schema_files = taranis.utils.get_files_in_folder(schema, "fasta")
if len(schema_files) == 0:
log.error("Schema folder %s does not have any fasta file", schema)
stderr.print("[red] Schema folder does not have any fasta file")
sys.exit(1)

# Check if output folder exists
if taranis.utils.folder_exists(output):
q_question = "Folder " + output + " already exists. Files will be overwritten. Do you want to continue?"
if "no" in taranis.utils.query_user_yes_no(q_question, "no"):
log.info("Aborting code by user request")
stderr.print("[red] Exiting code. ")
sys.exit(1)
else:
try:
os.makedirs(output)
except OSError as e:
log.info("Unable to create folder at %s", output)
stderr.print("[red] ERROR. Unable to create folder " + output)
sys.exit(1)
# Filter fasta files from reference folder
ref_alleles = glob.glob(os.path.join(reference, "*.fasta"))
# Create predictions
pred_out = os.path.join(output, "prediction" )
pred_sample = taranis.prediction.Prediction(genome, sample, pred_out)
pred_sample.training()
pred_sample.prediction()

"""Analyze the sample file against schema to identify outbreakers
"""
sample_allele = taranis.allele_calling.Sample(pred_sample, sample, schema, ref_alleles ,output)
sample_allele.analyze_sample()
93 changes: 93 additions & 0 deletions taranis/allele_calling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import logging
import os
import rich.console


import taranis.utils
import taranis.blast
import numpy
from pathlib import Path


import pdb
log = logging.getLogger(__name__)
stderr = rich.console.Console(
stderr=True,
style="dim",
highlight=False,
force_terminal=taranis.utils.rich_force_colors(),
)

class Sample:
def __init__(self, prediction, sample_file, schema, reference_alleles, out_folder):
self.prediction = prediction
self.sample_file = sample_file
self.schema = schema
self.ref_alleles = reference_alleles
self.out_folder = out_folder
self.s_name = Path(sample_file).stem
self.blast_dir = os.path.join(out_folder,"blastdb")
self.blast_sample = os.path.join(self.blast_dir, self.s_name)

def assign_abbreviation(self, query_seq, allele_name, sample_contig, schema_gene):
s_alleles_blast = taranis.blast.Blast("nucl")
ref_allele_blast_dir = os.path.join(self.blast_dir, "ref_alleles")
query_path = os.path.join(self.out_folder, "tmp", allele_name)
# Write to file the sequence to find out the loci name that fully match
f_name = taranis.utils.write_fasta_file(query_path, query_seq, allele_name)
query_file = os.path.join(query_path, f_name)
_ = s_alleles_blast.create_blastdb(schema_gene, ref_allele_blast_dir)
# Blast with sample sequence to find the allele in the schema
seq_blast_match = s_alleles_blast.run_blast(query_file, perc_identity=100)
pdb.set_trace()
if len(seq_blast_match) == 1:

# Hacer un blast con la query esta secuencia y la database del alelo
# Create blast db with sample file


pass


def catalog_alleles (self, ref_allele):
allele_name = Path(ref_allele).stem
schema_gene = os.path.join(self.schema, allele_name + ".fasta")
allele_name = Path(ref_allele).stem
# run blast with sample as db and reference allele as query
sample_blast_match = self.sample_blast.run_blast(ref_allele)
if len(sample_blast_match) > 0 :
s_lines = []
for out_line in sample_blast_match:
s_lines.append(out_line.split("\t"))
np_lines = numpy.array(s_lines)
# convert to float the perc_identity to find out the max value
max_val = numpy.max(np_lines[:,2].astype(float))
mask = np_lines[:, 2] ==str(max_val)
# Select rows that match the percent identity. Index 2 in blast results
sel_row = np_lines[mask, :] = np_lines[mask, :]
query_seq = sel_row[0,14]
sample_contig = sel_row[0,1]
abbr = self.assign_abbreviation(query_seq, allele_name, sample_contig, schema_gene)
else:
# Sample does not have a reference allele to be matched
# Keep LNF info
# ver el codigo de espe
#lnf_tpr_tag()
pass
pdb.set_trace()


def analyze_sample(self):
# Create blast db with sample file
self.sample_blast = taranis.blast.Blast("nucl")
_ = self.sample_blast.create_blastdb(self.sample_file, self.blast_dir)
result = {}
pdb.set_trace()
for ref_allele in self.ref_alleles:
# schema_alleles = os.path.join(self.schema, ref_allele)
# parallel in all CPUs in cluster node
result[ref_allele] = self.catalog_alleles(ref_allele)

pdb.set_trace()
return

Loading

0 comments on commit 5c7e3e3

Please sign in to comment.