Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/allele calling #17

Merged
merged 45 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
1f24d45
create commit for testing
luissian Mar 12, 2024
c6caadb
added file for checking allele type
luissian Mar 12, 2024
3e96c7b
creating collect_data function
luissian Mar 12, 2024
4cafc72
linting
luissian Mar 12, 2024
23de3d5
change to use node.js 20
luissian Mar 12, 2024
fd9d2c8
Adding product annotation
luissian Mar 13, 2024
cea9b44
Include product in annotation file missing in previous commit
luissian Mar 13, 2024
38ddb44
added annotation information to output files
luissian Mar 13, 2024
356ca9f
created inferred class to track inferred alleles
luissian Mar 16, 2024
30c0c6c
linting
luissian Mar 16, 2024
06b69c0
solving linting
luissian Mar 16, 2024
4638fc9
create finally at try when searching for distance
luissian Mar 17, 2024
c209381
fixed bug in saving annotation per allele
luissian Mar 17, 2024
d89a647
Check req programs before starting
luissian Mar 18, 2024
01d2269
save code before using valid result from blast
luissian Mar 18, 2024
c3c8bd5
implemented NIPH and NIPHEM classification
luissian Mar 18, 2024
b5f20e2
fixing linting and error in program parameter
luissian Mar 18, 2024
2d34414
Implemented SNP file
luissian Mar 19, 2024
30ec831
added graphics per allele classification
luissian Mar 19, 2024
e572242
implemented alignment and parallel
luissian Mar 21, 2024
e27b7ef
correcting linting
luissian Mar 22, 2024
f5af817
removing comma at the end of line in allele_calling_match file
luissian Mar 23, 2024
a31f46a
added comment changes at PR 17
luissian Mar 24, 2024
7fff56a
adding docstring and include threshold parameter
luissian Mar 24, 2024
f217b9d
remove the fix value and assign it to threshold parameter
luissian Mar 24, 2024
e5a365a
included reference allele sequence
luissian Mar 25, 2024
cd22171
prevent that 2 instances call the method at the same time
luissian Mar 25, 2024
3a99b42
add threshold parameter
luissian Mar 25, 2024
28812c5
re-writing the classification alleles
luissian Mar 27, 2024
079a37c
implemented percentage identity as parameter, default is 90
luissian Mar 27, 2024
29b0331
update the snp implementation including new fields in the snp output …
luissian Mar 27, 2024
e16b8f6
solving linting
luissian Mar 27, 2024
826997b
Partial implementation of multi alignment feature
luissian Mar 29, 2024
8a0ad4c
add function to extend sequence to find stop codon
luissian Apr 7, 2024
dbd8b32
Added sequence extension when start codon is not found because the sequence is truncated
luissian Apr 7, 2024
928e40b
fixing linting
luissian Apr 7, 2024
8f02d6c
fixing linting on allele_calling
luissian Apr 7, 2024
55aaecb
Fixed issue on not start codon
luissian Apr 9, 2024
ed88c4b
fixed output data when LNF
luissian Apr 9, 2024
4a03494
linting
luissian Apr 9, 2024
7d12e0e
linting 2
luissian Apr 9, 2024
8e656e8
Implementing parallel at multi alignment
luissian Apr 10, 2024
7e403d7
linting
luissian Apr 10, 2024
79220f0
check if mafft is installed
luissian Apr 11, 2024
2821301
linting
luissian Apr 11, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
uses: actions/checkout@v4

- name: Set up Miniconda
uses: conda-incubator/setup-miniconda@v2
uses: conda-incubator/setup-miniconda@v3
with:
activate-environment: taranis_env
environment-file: environment.yml
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies:
- bioconda::blast>=2.9
- bioconda::mash>=2
- bioconda::prodigal=2.6.3
- bioconda::mafft=7.525
- pip
- pip :
- -r requirements.txt
222 changes: 166 additions & 56 deletions taranis/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import click
import concurrent.futures
import glob
import os
import rich.console
import rich.logging
import rich.traceback
Expand All @@ -15,7 +14,7 @@
import taranis.reference_alleles
import taranis.allele_calling

from pathlib import Path
import taranis.inferred_alleles

log = logging.getLogger()

Expand Down Expand Up @@ -217,6 +216,7 @@ def analyze_schema(
usegenus: str,
cpus: int,
):
_ = taranis.utils.check_additional_programs_installed([["prokka", "--version"]])
schema_files = taranis.utils.get_files_in_folder(inputdir, "fasta")

results = []
Expand Down Expand Up @@ -316,6 +316,12 @@ def analyze_schema(
default=1,
help="Number of cpus used for execution",
)
@click.option(
"--force/--no-force",
required=False,
default=False,
help="Overwrite the output folder if it exists",
)
def reference_alleles(
schema: str,
output: str,
Expand All @@ -325,7 +331,11 @@ def reference_alleles(
cluster_resolution: float,
seed: int,
cpus: int,
force: bool,
):
_ = taranis.utils.check_additional_programs_installed(
[["mash", "--version"], ["makeblastdb", "-version"], ["blastn", "-version"]]
)
start = time.perf_counter()
max_cpus = taranis.utils.cpus_available()
if cpus > max_cpus:
Expand All @@ -335,23 +345,8 @@ def reference_alleles(
schema_files = taranis.utils.get_files_in_folder(schema, "fasta")

# Check if output folder exists
if taranis.utils.folder_exists(output):
q_question = (
"Folder "
+ output
+ " already exists. Files will be overwritten. Do you want to continue?"
)
if "no" in taranis.utils.query_user_yes_no(q_question, "no"):
log.info("Aborting code by user request")
stderr.print("[red] Exiting code. ")
sys.exit(1)
else:
try:
os.makedirs(output)
except OSError as e:
log.info("Unable to create folder at %s with error %s", output, e)
stderr.print("[red] ERROR. Unable to create folder " + output)
sys.exit(1)
if not force:
_ = taranis.utils.prompt_user_if_folder_exists(output)
"""Create the reference alleles from the schema """
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
Expand Down Expand Up @@ -396,6 +391,32 @@ def reference_alleles(
type=click.Path(exists=True),
help="Directory where the schema reference allele files are located. ",
)
@click.option(
"-a",
"--annotation",
required=True,
multiple=False,
type=click.Path(exists=True),
help="Annotation file. ",
)
@click.option(
"-t",
"--threshold",
required=False,
nargs=1,
default=0.8,
type=float,
help="Threshold value to consider in blast. Values from 0 to 1. default 0.8",
)
@click.option(
"-p",
"--perc-identity",
required=False,
nargs=1,
default=90,
type=int,
help="Percentage of identity to consider in blast. default 90",
)
@click.option(
"-o",
"--output",
Expand All @@ -404,65 +425,154 @@ def reference_alleles(
type=click.Path(),
help="Output folder to save reference alleles",
)
@click.option(
"--force/--no-force",
required=False,
default=False,
help="Overwrite the output folder if it exists",
)
@click.argument(
"assemblies",
callback=expand_wildcards,
nargs=-1,
required=True,
type=click.Path(exists=True),
)
@click.option(
"--snp/--no-snp",
required=False,
default=False,
help="Create SNP file for alleles in assembly in relation with reference allele",
)
@click.option(
"--alignment/--no-alignment",
required=False,
default=False,
help="Create alignment files",
)
@click.option(
"-q",
"--proteine-threshold",
required=False,
nargs=1,
default=80,
type=int,
help="Threshold of protein coverage to consider as TPR. default 90",
)
@click.option(
"-i",
"--increase-sequence",
required=False,
nargs=1,
default=20,
type=int,
help="Increase the number of triplet sequences to find the stop codon. default 20",
)
@click.option(
"--cpus",
required=False,
multiple=False,
type=int,
default=1,
help="Number of cpus used for execution",
)
def allele_calling(
schema,
reference,
assemblies,
output,
schema: str,
reference: str,
annotation: str,
assemblies: list,
threshold: float,
perc_identity: int,
output: str,
force: bool,
snp: bool,
alignment: bool,
proteine_threshold: int,
increase_sequence: int,
cpus: int,
):
_ = taranis.utils.check_additional_programs_installed(
[["blastn", "-version"], ["makeblastdb", "-version"], ["mafft", "--version"]]
)
schema_ref_files = taranis.utils.get_files_in_folder(reference, "fasta")
if len(schema_ref_files) == 0:
log.error("Referenc allele folder %s does not have any fasta file", schema)
stderr.print("[red] reference allele folder does not have any fasta file")
sys.exit(1)

# Check if output folder exists
if taranis.utils.folder_exists(output):
q_question = (
"Folder "
+ output
+ " already exists. Files will be overwritten. Do you want to continue?"
)
if "no" in taranis.utils.query_user_yes_no(q_question, "no"):
log.info("Aborting code by user request")
stderr.print("[red] Exiting code. ")
sys.exit(1)
else:
try:
os.makedirs(output)
except OSError as e:
log.info("Unable to create folder at %s with error %s", output, e)
stderr.print("[red] ERROR. Unable to create {output} folder")
sys.exit(1)
if not force:
_ = taranis.utils.prompt_user_if_folder_exists(output)
# Filter fasta files from reference folder
# ref_alleles = glob.glob(os.path.join(reference, "*.fasta"))
# Create predictions

"""
pred_out = os.path.join(output, "prediction")
pred_sample = taranis.prediction.Prediction(genome, sample, pred_out)
pred_sample.training()
pred_sample.prediction()
max_cpus = taranis.utils.cpus_available()
if cpus > max_cpus:
stderr.print("[red] Number of CPUs bigger than the CPUs available")
stderr.print("Running code with ", max_cpus)
cpus = max_cpus
# Read the annotation file
stderr.print("[green] Reading annotation file")
log.info("Reading annotation file")
map_pred = [["gene", 7], ["product", 8], ["allele_quality", 9]]
prediction_data = taranis.utils.read_compressed_file(
annotation, separator=",", index_key=1, mapping=map_pred
)
# Create the instance for inferred alleles
inf_allele_obj = taranis.inferred_alleles.InferredAllele()
"""Analyze the sample file against schema to identify alleles
"""

"""Analyze the sample file against schema to identify outbreakers
"""
start = time.perf_counter()
results = []

with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
futures = [
executor.submit(
taranis.allele_calling.parallel_execution,
assembly_file,
schema,
prediction_data,
schema_ref_files,
threshold,
perc_identity,
output,
inf_allele_obj,
snp,
alignment,
proteine_threshold,
increase_sequence,
)
for assembly_file in assemblies
]
for future in concurrent.futures.as_completed(futures):
try:
results.append(future.result())
except Exception as e:
print(e)
continue
"""
for assembly_file in assemblies:
assembly_name = Path(assembly_file).stem
results.append(
{
assembly_name: taranis.allele_calling.parallel_execution(
assembly_file, schema, schema_ref_files, output
)
}
taranis.allele_calling.parallel_execution(
assembly_file,
schema,
prediction_data,
schema_ref_files,
threshold,
perc_identity,
output,
inf_allele_obj,
snp,
alignment,
proteine_threshold,
increase_sequence,
)
)

"""
_ = taranis.allele_calling.collect_data(
results, output, snp, alignment, schema_ref_files, cpus
)
finish = time.perf_counter()
print(f"Allele calling finish in {round((finish-start)/60, 2)} minutes")
log.info("Allele calling finish in %s minutes", round((finish - start) / 60, 2))
# sample_allele_obj.analyze_sample()
Loading
Loading