From 3cfdd85690762df0c23aa9a93ce2b28738e0856b Mon Sep 17 00:00:00 2001 From: luissian Date: Sat, 13 Apr 2024 15:39:16 +0200 Subject: [PATCH 1/3] implemented distance matrix --- taranis/__main__.py | 116 ++++++++++++++++++++++++++++++++++++++++++++ taranis/distance.py | 49 +++++++++++++++++++ taranis/utils.py | 34 +++++++++++++ 3 files changed, 199 insertions(+) diff --git a/taranis/__main__.py b/taranis/__main__.py index cb17c20..e521785 100644 --- a/taranis/__main__.py +++ b/taranis/__main__.py @@ -3,12 +3,14 @@ import click import concurrent.futures import glob +import pandas as pd import rich.console import rich.logging import rich.traceback import sys import time +import taranis.distance import taranis.utils import taranis.analyze_schema import taranis.reference_alleles @@ -16,6 +18,7 @@ import taranis.inferred_alleles +# import pdb log = logging.getLogger() # Set up rich stderr console @@ -576,3 +579,116 @@ def allele_calling( print(f"Allele calling finish in {round((finish-start)/60, 2)} minutes") log.info("Allele calling finish in %s minutes", round((finish - start) / 60, 2)) # sample_allele_obj.analyze_sample() + + +@taranis_cli.command(help_priority=3) +@click.option( + "-a", + "--alleles", + required=True, + multiple=False, + type=click.Path(exists=True), + help="Alleles matrix file from which to obtain distances between samples", +) +@click.option( + "-o", + "--output", + required=True, + multiple=False, + type=click.Path(), + help="Output folder to save distance matrix", +) +@click.option( + "--force/--no-force", + required=False, + default=False, + help="Overwrite the output folder if it exists", +) +@click.option( + "-l", + "--locus-missing-threshold", + required=False, + multiple=False, + type=int, + default=100, + help="Threshold for missing alleles in locus, which loci is excluded from distance matrix", +) +@click.option( + "-s", + "--sample-missing-threshold", + required=False, + multiple=False, + type=int, + default=20, + help="Threshold for 
missing samples, which sample is excluded from distance matrix", +) +@click.option( + "--paralog-filter/--no-paralog-filter", + required=False, + multiple=False, + type=bool, + default=True, + help="Consider paralog tags (NIPH, NIPHEM) as missing values. Default is True", +) +@click.option( + "--lnf-filter/--no-lnf-filter", + required=False, + multiple=False, + type=bool, + default=True, + help="Consider LNF as missing values. Default is True", +) +@click.option( + "--plot-filter/--no-plot-filter", + required=False, + multiple=False, + type=bool, + default=True, + help="Consider PLOT as missing values. Default is True", +) +def distance_matrix( + alleles: str, + output: str, + force: bool, + locus_missing_threshold: int, + sample_missing_threshold: int, + paralog_filter: bool, + lnf_filter: bool, + plot_filter: bool, +): + # Check if file exists + if not taranis.utils.file_exists(alleles): + log.error("Alleles matrix file %s does not exist", alleles) + stderr.print("[red] Alleles matrix file does not exist") + sys.exit(1) + # Check if output folder exists + if not force: + _ = taranis.utils.prompt_user_if_folder_exists(output) + start = time.perf_counter() + # filter the alleles matrix according to the thresholds and filters + allele_matrix = pd.read_csv(alleles, sep=",", index_col=0, header=0) + filtering_string = ["ASM", "ALM"] + if paralog_filter: + filtering_string.append("NIPH") + filtering_string.append("NIPHEM") + if lnf_filter: + filtering_string.append("LNF") + if plot_filter: + filtering_string.append("PLOT") + # pdb.set_trace() + filtered_allele = taranis.utils.filter_data_frame_by_parameters( + allele_matrix, + locus_missing_threshold, + sample_missing_threshold, + filtering_string, + replaced_by_zero=False, + ) + # Create the distance matrix + # pdb.set_trace() + d_matrix_obj = taranis.distance.HammingDistance(filtered_allele) + distance_matrix = d_matrix_obj.create_matrix() + # pdb.set_trace() + print(distance_matrix) + finish = time.perf_counter() + 
print(f"Distance matrix finish in {round((finish-start)/60, 2)} minutes") + log.info("Distance matrix finish in %s minutes", round((finish - start) / 60, 2)) diff --git a/taranis/distance.py b/taranis/distance.py index 496fac0..67283b1 100644 --- a/taranis/distance.py +++ b/taranis/distance.py @@ -83,3 +83,52 @@ def create_matrix(self) -> pd.DataFrame: dist_matrix.close() log.debug(f"create distance for {allele_name}") return matrix_pd + + +class HammingDistance: + def __init__(self, dist_matrix: pd.DataFrame) -> "HammingDistance": + """HammingDistance instance creation + + Args: + dist_matrix (pd.DataFrame): Allele matrix used to compute the distances + + Returns: + HammingDistance: created hamming distance + """ + self.dist_matrix = dist_matrix + + def create_matrix(self) -> pd.DataFrame: + """Create hamming distance matrix by counting the columns that differ between each pair of rows + + Returns: + pd.DataFrame: Hamming distance matrix as pandas DataFrame + """ + + unique_values = pd.unique( + self.dist_matrix[list(self.dist_matrix.keys())].values.ravel("K") + ) + # Create binary matrix ('1' or '0' ) matching the input matrix vs the unique_values[0] + # astype(int) is used to transform the boolean matrix into integer + U = self.dist_matrix.eq(unique_values[0]).astype(int) + # multiply the matrix with the transpose + H = U.dot(U.T) + + # Repeat for each unique value + for unique_val in range(1, len(unique_values)): + U = self.dist_matrix.eq(unique_values[unique_val]).astype(int) + # Add the value of the binary matrix with the previous stored values + H = H.add(U.dot(U.T)) + + return len(self.dist_matrix.columns) - H + + """ + dist_matrix = self.dist_matrix + allele_names = dist_matrix.index + hamming_matrix = pd.DataFrame(index=allele_names, columns=allele_names) + for i in allele_names: + for j in allele_names: + hamming_matrix.at[i, j] = sum( + dist_matrix.loc[i] != dist_matrix.loc[j] + ) + return hamming_matrix + """ diff --git a/taranis/utils.py b/taranis/utils.py index d83228d..d362211 100644 --- 
a/taranis/utils.py +++ b/taranis/utils.py @@ -28,6 +28,8 @@ import warnings from Bio import BiopythonWarning +# import pdb + log = logging.getLogger(__name__) @@ -319,6 +321,38 @@ def find_nearest_numpy_value(array, value): """ +def filter_data_frame_by_parameters( + data_frame: pd.DataFrame, + column_thr: int, + row_thr: int, + filter_str: list[str], + replaced_by_zero: bool, +) -> pd.DataFrame: + # get the number of columns and rows + num_rows, num_columns = data_frame.shape + # remove the columns which the filter strings are higher than the threshold + column_threshold = column_thr * num_rows / 100 + # Condition: Check if any string in the filter list is present in each cell of the DataFrame + f_condition = data_frame.apply( + lambda column: column.astype(str).str.contains("|".join(filter_str), na=False) + ) + if replaced_by_zero: + new_data_frame = data_frame.mask(f_condition, 0) + else: + # Count the number of hits per column + hits_per_column = f_condition.sum() + # pdb.set_trace() + # Filter for removing columns where the count of hits is higher than the threshold + to_be_removed_columns = hits_per_column[ + hits_per_column > column_threshold + ].index + new_data_frame = data_frame.drop(columns=to_be_removed_columns) + # pdb.set_trace() + # remove the rows which the filter strings are higher than the threshold + row_threshold = row_thr * num_columns + return new_data_frame + + def folder_exists(folder_to_check): """Checks if input folder exists From c771cb46accdfbb72c04dd1159a43fa6a085090e Mon Sep 17 00:00:00 2001 From: luissian Date: Sat, 13 Apr 2024 18:50:38 +0200 Subject: [PATCH 2/3] liting --- taranis/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taranis/utils.py b/taranis/utils.py index d362211..598ef41 100644 --- a/taranis/utils.py +++ b/taranis/utils.py @@ -349,7 +349,7 @@ def filter_data_frame_by_parameters( new_data_frame = data_frame.drop(columns=to_be_removed_columns) # pdb.set_trace() # remove the rows which the 
filter strings are higher than the threshold - row_threshold = row_thr * num_columns + # row_threshold = row_thr * num_columns return new_data_frame From 1e7aaf46bc9247073c708911813fde8e55d84b8b Mon Sep 17 00:00:00 2001 From: luissian Date: Mon, 15 Apr 2024 14:51:36 +0200 Subject: [PATCH 3/3] Update information with input parameters and outfiles description --- README.md | 475 +++++++++++++++++++++++++++++------------------------- 1 file changed, 258 insertions(+), 217 deletions(-) diff --git a/README.md b/README.md index 23ca67f..4ddebf7 100644 --- a/README.md +++ b/README.md @@ -10,31 +10,23 @@ - [Output](#output) - [Illustrated pipeline](#illustrated-pipeline) - - ## Introduction **Taranis** is a computational stand-alone pipeline for **gene-by-gene allele calling analysis** based on BLASTn using whole genome (wg) and core genome (cg) multilocus sequence typing (MLST) schemas on complete or draft genomes resulting from de novo assemblers, while tracking helpful and informative data among the process. Taranis includes four main functionalities: MLST **schema analysis**, gene-by-gene **allele calling**, **reference alleles** obtainment for allele calling analysis and the final **distance matrix** construction. - - ## Dependencies -* Python >=3.6 -* NCBI_blast >= v2.9 -* prokka >=1.14 -* prodigal v2.6.3 -* mash >=2 -* biopython v1.72 -* pandas v1.2.4 -* progressbar v2.5 -* openpyxl v3.0.7 -* plotly v5.0.0 -* numpy v1.20.3 - - +- Python >=3.8 +- NCBI_blast >= v2.9 +- prokka >=1.14.6 +- mafft = 7.520 +- mash >=2 +- biopython v1.81 +- pandas v2.1.1 +- plotly v5.17.0 +- numpy v1.26.0 ## Installation @@ -46,7 +38,6 @@ Install all dependencies and add them to $PATH. Add taranis and ./bin to $PATH. - #### Install using conda This option is recomended. @@ -58,8 +49,6 @@ Install Anaconda3. Wait for the environment to solve.
Ignore warnings/errors. - - ## Quick usage - **analyze_schema mode:** @@ -69,20 +58,26 @@ Ignore warnings/errors. ``` taranis analyze_schema \ -inputdir schema_dir \ --outputdir YYYY-MM-DD_taranis_analyze_schema_dir +-output output_analyze_schema_dir \ +--output-allele-annotation annotation_dir ``` - Schema analysis and duplicated alleles, alleles subsequences and no CDS alleles filtering: + Schema analysis removing duplicated alleles, subsequences and no CDS alleles: ``` taranis analyze_schema \ -inputdir schema_dir \ --outputdir YYYY-MM-DD_taranis_analyze_schema_dir \ --removesubsets True \ --removeduplicates True \ --removenocds True -``` +-output output_analyze_schema_dir \ +--remove-subsets \ +--remove-duplicated \ +--remove-no-cds \ +--output-allele-annotation annotation_dir \ +--genus prokka_genus_name \ +--usegenus prokka genus-specific BLAST database \ +--species prokka_species_name \ +--cpus number_of_cpus +``` - **reference_alleles mode:** @@ -90,10 +85,26 @@ taranis analyze_schema \ ``` taranis reference_alleles \ --coregenedir schema_dir \ --outputdir YYYY-MM-DD_taranis_reference_alleles_dir +-s schema_dir \ +-o output_reference_alleles_dir \ +--eval-cluster \ +--cpus number_of_cpus \ +--force overwrite output dir ``` + Reference alleles with clustering settings: + +``` +taranis reference_alleles \ +-s schema_dir \ +-o output_reference_alleles_dir \ +--eval-cluster \ +-k k-mer size for mash \ +-S Sketch size for mash \ +-r resolution used for clustering \ +--cpus number_of_cpus \ +--force overwrite output dir +``` - **allele_calling mode:** @@ -101,23 +112,36 @@ taranis reference_alleles \ ``` taranis allele_calling \ --coregenedir schema_dir \ --refalleles YYYY-MM-DD_taranis_reference_alleles_dir \ --inputdir samples_dir \ --refgenome reference_genome.fasta \ --outputdir YYYY-MM-DD_taranis_allele_calling_dir +-s schema_dir \ +-a annotation_file \ +-r reference_alleles_dir \ +-o output_allele_calling_dir \ +-t threshold to consider in blast \ +-p percentage of 
identity to consider in blast \ +-q threshold to consider as TPR \ +-i increase number of nucleotides to find stop codon \ +--snp Create SNP file \ +--cpus number_of_cpus \ +--alignment Create alignment files \ +samples_dir ``` - Run allele calling getting ST profile: + Allele calling for blast and threshold settings: ``` taranis allele_calling \ --coregenedir schema_dir \ --refalleles YYYY-MM-DD_taranis_reference_alleles_dir \ --inputdir samples_dir \ --refgenome reference_genome.fasta \ --profile profile.csv \ --outputdir YYYY-MM-DD_taranis_allele_calling_dir +-s schema_dir \ +-a annotation_file \ +-r reference_alleles_dir \ +-o output_allele_calling_dir \ +-t threshold to consider in blast \ +-p percentage of identity to consider in blast \ +-q threshold to consider as TPR \ +-i increase number of nucleotides to find stop codon \ +--snp Create SNP file \ +--cpus number_of_cpus \ +--alignment Create alignment files \ +samples_dir ``` - **distance_matrix mode:** @@ -126,223 +150,240 @@ taranis allele_calling \ ``` taranis distance_matrix \ --alleles_matrix YYYY-MM-DD_taranis_allele_calling_dir/result.tsv -outputdir YYYY-MM-DD_taranis_distance_matrix_dir +-a allele_calling_match.csv file \ +-o distance_matrix_dir \ +--force overwrite output folder ``` -

Get distance matrix filtering loci and samples which missing values percentage is above specified threshold: +Distance matrix with threshold settings: ``` -taranis distance_matrix\ --alleles_matrix YYYY-MM-DD_taranis_allele_calling_dir/result.tsv\ --locus_missing_threshold 20 \ --sample_missing_threshold 50 \ --outputdir YYYY-MM-DD_taranis_distance_matrix_dir +taranis distance_matrix \ +-a allele_calling_match.csv file \ +-o distance_matrix_dir \ +-l threshold for missing locus \ +-s threshold for missing samples \ +--paralog-filter \ +--lnf-filter \ +--plot-filter \ +--force overwrite output folder ``` - - ## Usage - **analyze_schema mode:** ``` -usage: taranis.py analyze_schema [-h] -inputdir INPUTDIR -outputdir OUTPUTDIR [-removesubsets REMOVESUBSETS] [-removeduplicates REMOVEDUPLICATES] [-removenocds REMOVENOCDS] [-newschema NEWSCHEMA] - [-genus GENUS] [-species SPECIES] [-usegenus USEGENUS] [-cpus CPUS] - -optional arguments: - -h, --help show this help message and exit - -inputdir INPUTDIR Directory where are the schema files. - -outputdir OUTPUTDIR Directory where the result files will be stored. - -removesubsets REMOVESUBSETS - Remove allele subsequences from the schema.True: Remove subsets.False: Do not remove subsets.Default is False. - -removeduplicates REMOVEDUPLICATES - Remove duplicated alleles from the schema.True: Remove duplicates.False: Do not remove duplicates.Default is False. - -removenocds REMOVENOCDS - Remove no CDS alleles from the schema.True: Remove no CDS alleles.False: Do not remove no CDS alleles.Default is False. - -newschema NEWSCHEMA Filter a copy of the core genes schema preserving the analysis core genes schema.True: Create a copy of the core genes schema for filtering.False: Do not create a copy of the - core genes schema for filtering.Default is False. - -genus GENUS Genus name for Prokka schema genes annotation. Default is Genus. - -species SPECIES Species name for Prokka schema genes annotation. Default is species. 
- -usegenus USEGENUS Use genus-specific BLAST databases for Prokka schema genes annotation (needs --genus). Default is False. - -cpus CPUS Number of CPUS to be used in the program. Default is 1. +Usage: taranis analyze-schema [OPTIONS] + +Options: + -i, --inputdir PATH Directory where the schema with the core + gene files are located. [required] + -o, --output PATH Output folder to save analyze schema + [required] + --remove-subset / --no-remove-subset + Remove allele subsequences from the schema. + --remove-duplicated / --no-remove-duplicated + Remove duplicated subsequences from the + schema. + --remove-no-cds / --no-remove-no-cds + Remove no CDS alleles from the schema. + --output-allele-annot / --no-output-allele-annot + output prokka/allele annotation for all + alleles in locus + --genus TEXT Genus name for Prokka schema genes + annotation. Default is Genus. + --species TEXT Species name for Prokka schema genes + annotation. Default is species + --usegenus TEXT Use genus-specific BLAST databases for + Prokka schema genes annotation (needs + --genus). Default is False. + --cpus INTEGER Number of cpus used for execution + --help Show this message and exit. ``` - - **reference_alleles mode:** ``` -usage: taranis.py reference_alleles [-h] -coregenedir COREGENEDIR -outputdir OUTPUTDIR - [-evalue EVALUE] [-perc_identity PERC_IDENTITY] - [-reward REWARD] [-penalty PENALTY] [-gapopen GAPOPEN] - [-gapextend GAPEXTEND] [-num_threads NUM_THREADS] [-cpus CPUS] - -optional arguments: - -h, --help show this help message and exit - -coregenedir COREGENEDIR - Directory where the core gene files are located. - -outputdir OUTPUTDIR Directory where the result files will be stored. - -evalue EVALUE E-value in BLAST searches. Default is 0.001. - -perc_identity PERC_IDENTITY - Identity percent in BLAST searches. Default is 90. - -reward REWARD Match reward in BLAST searches. Default is 1. - -penalty PENALTY Mismatch penalty in BLAST searches. Default is -2. 
- -gapopen GAPOPEN Gap open penalty in BLAST searches. Default is 1. - -gapextend GAPEXTEND Gap extension penalty in BLAST searches. Default is 1. - -num_threads NUM_THREADS - num_threads in BLAST searches. Default is 1. - -cpus CPUS Number of CPUS to be used in the program. Default is 1. +Usage: taranis reference-alleles [OPTIONS] + +Options: + -s, --schema PATH Directory where the schema with the core + gene files are located. [required] + -o, --output PATH Output folder to save reference alleles + [required] + --eval-cluster / --no-eval-cluster + Evaluate if the reference alleles match + against blast with a 90% identity + -k, --kmer-size INTEGER Mash parameter for K-mer size. + -S, --sketch-size INTEGER Mash parameter for Sketch size + -r, --cluster-resolution FLOAT Resolution value used for clustering. + --seed INTEGER Seed value for clustering + --cpus INTEGER Number of cpus used for execution + --force / --no-force Overwrite the output folder if it exists + --help Show this message and exit. 
``` - - **allele_calling mode:** ``` -usage: taranis.py allele_calling [-h] -coregenedir COREGENEDIR -refalleles REFALLELES -inputdir - INPUTDIR -refgenome REFGENOME -outputdir OUTPUTDIR - [-percentlength PERCENTLENGTH] [-coverage COVERAGE] - [-evalue EVALUE] [-perc_identity_ref PERC_IDENTITY_REF] - [-perc_identity_loc PERC_IDENTITY_LOC] [-reward REWARD] - [-penalty PENALTY] [-gapopen GAPOPEN] [-gapextend GAPEXTEND] - [-max_target_seqs MAX_TARGET_SEQS] [-max_hsps MAX_HSPS] - [-num_threads NUM_THREADS] [-flankingnts FLANKINGNTS] - [-updateschema UPDATESCHEMA] [-profile PROFILE] - [-updateprofile UPDATEPROFILE] [-cpus CPUS] [-genus GENUS] - [-species SPECIES] [-usegenus USEGENUS] - -optional arguments: - -h, --help show this help message and exit - -coregenedir COREGENEDIR - Directory where the core gene files are located - -refalleles REFALLELES - Directory where the core gene references files are located - -inputdir INPUTDIR Directory where are located the sample fasta files - -refgenome REFGENOME Reference genome file for genes prediction - -outputdir OUTPUTDIR Directory where the result files will be stored - -percentlength PERCENTLENGTH - Allowed length percentage to be considered as INF. Outside of this limit it - is considered as ASM or ALM. Default is SD. - -coverage COVERAGE Coverage threshold to exclude found sequences. Outside of this limit it is - considered LNF. Default is 50. - -evalue EVALUE E-value in BLAST searches. Default is 0.001. - -perc_identity_ref PERC_IDENTITY_REF - Identity percentage in BLAST searches using reference alleles for each - locus detection in samples. Default is 90. - -perc_identity_loc PERC_IDENTITY_LOC - Identity percentage in BLAST searches using all alleles in each locus for - allele identification in samples. Default is 90. - -reward REWARD Match reward in BLAST searches. Default is 1. - -penalty PENALTY Mismatch penalty in BLAST searches. Default is -2. - -gapopen GAPOPEN Gap open penalty in BLAST searches. Default is 1. 
- -gapextend GAPEXTEND Gap extension penalty in BLAST searches. Default is 1. - -max_target_seqs MAX_TARGET_SEQS - max_target_seqs in BLAST searches. Default is 10. - -max_hsps MAX_HSPS max_hsps in BLAST searches. Default is 10. - -num_threads NUM_THREADS - num_threads in BLAST searches. Default is 1. - -flankingnts FLANKINGNTS - Number of flanking nucleotides to add to each BLAST result obtained after - locus detection in sample using reference allele for correct allele - identification. Default is 100. - -updateschema UPDATESCHEMA - Add INF alleles found for each locus to the core genes schema. True: Add - INF alleles to the analysis core genes schema. New: Add INF alleles to a - copy of the core genes schema preserving the analysis core genes schema. - False: Do not update the core gene schema adding new INF alleles found. - Default is True. - -profile PROFILE ST profile file based on core genes schema file to get ST for each sample. - Default is empty and Taranis does not calculate samples ST. - -updateprofile UPDATEPROFILE - Add new ST profiles found to the ST profile file. True: Add new ST profiles - to the analysis ST profile file. New: Add Add new ST profiles to a copy of - the ST profile file preserving the analysis ST file. False: Do not update - the ST profile file adding new ST profiles found. Default is True. - -cpus CPUS Number of CPUS to be used in the program. Default is 1. - -genus GENUS Genus name for Prokka schema genes annotation. Default is Genus. - -species SPECIES Species name for Prokka schema genes annotation. Default is species. - -usegenus USEGENUS Use genus-specific BLAST databases for Prokka schema genes annotation - (needs --genus). Default is False. +Usage: taranis allele-calling [OPTIONS] ASSEMBLIES... + +Options: + -s, --schema PATH Directory where the schema with the core + gene files are located. [required] + -r, --reference PATH Directory where the schema reference allele + files are located. 
[required] + -a, --annotation PATH Annotation file. [required] + -t, --threshold FLOAT Threshold value to consider in blast. Values + from 0 to 1. default 0.8 + -p, --perc-identity INTEGER Percentage of identity to consider in blast. + default 90 + -o, --output PATH Output folder to save reference alleles + [required] + --force / --no-force Overwrite the output folder if it exists + --snp / --no-snp Create SNP file for alleles in assembly in + relation with reference allele + --alignment / --no-alignment Create alignment files + -q, --proteine-threshold INTEGER + Threshold of protein coverage to consider as + TPR. default 90 + -i, --increase-sequence INTEGER + Increase the number of triplet sequences to + find the stop codon. default 20 + --cpus INTEGER Number of cpus used for execution + --help Show this message and exit. ``` - - **distance_matrix mode:** ``` -usage: taranis.py distance_matrix [-h] -alleles_matrix ALLELES_MATRIX [-locus_missing_threshold LOCUS_MISSING_THRESHOLD] [-sample_missing_threshold SAMPLE_MISSING_THRESHOLD] - [-paralog_filter PARALOG_FILTER] [-lnf_filter LNF_FILTER] [-plot_filter PLOT_FILTER] -outputdir OUTPUTDIR - -optional arguments: - -h, --help show this help message and exit - -alleles_matrix ALLELES_MATRIX - Alleles matrix file from which to obtain distances between samples - -locus_missing_threshold LOCUS_MISSING_THRESHOLD - Missing values percentage threshold above which loci are excluded for distance matrix creation. Default is 100. - -sample_missing_threshold SAMPLE_MISSING_THRESHOLD - Missing values percentage threshold above which samples are excluded for distance matrix creation. Default is 100. - -paralog_filter PARALOG_FILTER - Consider paralog tags (NIPH, NIPHEM) as missing values. Default is True - -lnf_filter LNF_FILTER - Consider locus not found tag (LNF) as missing value. Default is True - -plot_filter PLOT_FILTER - Consider incomplete alleles found on the tip of a contig tag (PLOT) as missing value. 
Default is True - -outputdir OUTPUTDIR Directory where the result files will be stored +Usage: taranis distance-matrix [OPTIONS] + +Options: + -a, --alleles PATH Alleles matrix file from which to obtain + distances between samples [required] + -o, --output PATH Output folder to save distance matrix + [required] + --force / --no-force Overwrite the output folder if it exists + -l, --locus-missing-threshold INTEGER + Threshold for missing alleles in locus, + which loci is excluded from distance matrix + -s, --sample-missing-threshold INTEGER + Threshold for missing samples, which sample + is excluded from distance matrix + --paralog-filter / --no-paralog-filter + Consider paralog tags (NIPH, NIPHEM) as + missing values. Default is True + --lnf-filter / --no-lnf-filter Consider LNF as missing values. Default is + True + --plot-filter / --no-plot-filter + Consider PLOT as missing values. Default is + True + --help Show this message and exit. ``` - - ## Output - **analyze_schema mode:** - * **FOLDERS:** + - **FOLDERS and FILES structure:** - * **raw_schema_information:** General information about each allele of each locus - - * **FILES:** - - * **alleles_subsets.tsv:** Report of alleles that are subsequences of other alleles of the same locus - * **duplicated_alleles.tsv:** Report of duplicate alleles within the same locus - * **length_statistics.tsv:** Allele length statistics report for each locus - * **schema_quality.tsv:** Quality report of alleles of each locus - + - **new_schema** Contains the new schema. 
+ - **prokka** Contains the prokka results + - **statistics** Statistics data + - **graphics** Plot graphics folder + - **statistics.csv** Quality statistics showing the following data: + + - allele_name, + - min_length, + - max_length, + - num_alleles, + - mean_length, + - good_percent, + - not a start codon, + - not a stop codon, + - Extra in frame stop codon, + - is not a multiple of three, + - Duplicate allele, + - Sub set allele + + - **allele_annotation.tar.gz** Annotation schema file - **reference_alleles mode:** - * **FILES:** + - **FOLDERS and FILES structure:** - * **[refalleles_locusX].fasta:** One fasta file for each schema locus containing reference alleles for that locus - + - **Clusters** Contains the cluster allele files + - **[cluster_alleles].txt** cluster allele file + - **evaluate_cluster** + - **cluster_evaluuation.csv** Evaluation result with the following info: + - Locus name + - cluster number + - result + - alleles not match in blast + - alleles not found in cluster + + - **cluster_per_locus.csv** Number of cluster per locus + - number of clusters + - number of locus + + - **cluster_summary.csv** summary data with the following info: + - Locus name + - cluster number + - average + - center allele + - number of sequences + + - **graphics** Plot graphics folder + - **num_genes_per_allele.png** Bar graphic to show the number of clusters per gene + + - **[ref_alleles_locusX].fasta:** One fasta file for each schema locus containing reference alleles for that locus - **allele_calling mode:** - * **FOLDERS:** - * **alignments:** Nucleotide alignment between sequence found in the sample and allele - * **proteins:** Protein alignment between sequence found in sample and allele - * **plots:** Interactive pie charts of allele call results for each sample - - * **FILES:** - * **alm.tsv:** Sample sequences found x% larger than the locus alleles mean length report - * **asm.tsv:** Sample sequences found x% shorter than the locus alleles mean length 
report - * **exact.tsv:** Exact matches report - * **inferred_alleles.tsv:** New inferred alleles report - * **lnf_tpr.tsv:** Locus not found (LNF) and truncated protein (TPR) report - * **paralog.tsv:** Possible paralogs (NIPHEM (100% ID paralogs) and NIPH (<=100% ID paralogs)) report - * **plot.tsv:** Possible loci on the tip of the sample contig (PLOT) report - * **snp.tsv:** SNPs report - * **matching_contigs.tsv:** Summary report of loci found in samples - * **result.tsv:** Allele calling main results - * **summary_result.tsv:** Allele calling results summary. Count of each tag type found for each sample is indicated - * **stprofile.tsv:** Sequence type report - + - **FOLDERS and FILES structure:** + - **alignments:** Nucleotide alignment between sequence found in the sample and allele + - **[locus_name].txt** One file per locus + - **[locus_name]_multiple_alignment.aln** One file per locus + - **graphics** Graphics per type of allele classification + - **ALM_graphic.pnd** Number of ALM in samples. + - **ASM_graphic.pnd** Number of ASM in samples. + - **EXEC_graphic.pnd** Number of EXEC in samples. + - **INF_graphic.pnd** Number of INF in samples. + - **LNF_graphic.pnd** Number of LNF in samples. + - **NIPHEM_graphic.pnd** Number of NIPHEM in samples. + - **NIPH_graphic.pnd** Number of NIPH in samples. + - **PLOT_graphic.pnd** Number of PLOT in samples. + - **TPR_graphic.pnd** Number of TPR in samples. 
+ - **[locus_name]_snp_data** One file per sample + - **allele_calling_match.csv** Contains the classification for each locus and for all samples + - **allele_calling_summary.csv** Contains the number of each classification per sample + - **matching_contig.csv** Summary for each locus in sample with the following data: + - sample + - contig + - core gene + - reference allele name + - codification + - query length + - match length + - contig length + - contig start + - contig stop + - direction + - gene notation + - product notation + - reference allele quality + - protein conversion result + - match sequence reference + - allele sequence + - predicted protein sequence - **distance_matrix mode:** - * **FILES:** - * **filtered_result.tsv:** Filtered allele calling matrix filtered - * **matrix_distance.tsv:** Samples matrix distance - * **matrix_distance_filter_report.tsv:** Allele calling matrix filtering report - - - -## Illustrated pipeline + - **FILES:** + - **filtered_result.tsv:** Filtered allele calling matrix + - **matrix_distance.tsv:** Samples matrix distance + - **matrix_distance_filter_report.tsv:** Allele calling matrix filtering report -Under construction +## Illustrated pipeline