From a295e9273480b6af3fb48a6703ebf325541a2cd6 Mon Sep 17 00:00:00 2001 From: trinezac Date: Fri, 12 Apr 2024 10:49:43 +0200 Subject: [PATCH] v.0.1.20 added feature for detecting strain-mixtures using allele frequencies --- maginator/main.py | 1 + maginator/workflow/pileup_parse.Snakefile | 19 ++++++ .../workflow/scripts/allele_frequency_mat.R | 59 +++++++++++++++++++ setup.py | 2 +- 4 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 maginator/workflow/scripts/allele_frequency_mat.R diff --git a/maginator/main.py b/maginator/main.py index 912c042c..6929c316 100755 --- a/maginator/main.py +++ b/maginator/main.py @@ -78,6 +78,7 @@ def cli(): app.add_argument('--min_nonN', help='Minimum fraction of non-N characters of a sample to be included in a phylogeny [%(default)s]', default=0.5, type=float) app.add_argument('--min_marker_genes', help='Minimum marker genes to be detected for inclusion of a sample in a phylogeny [%(default)s]', default=2, type=int) app.add_argument('--min_signature_genes', help='Minimum signature genes to be detected for inclusion of a sample in a phylogeny [%(default)s]', default=50, type=int) + app.add_argument('--af_cutoff', help='Cutoff for average median allele frequency of SG alignment for determining mixed/pure population of a MAG in a sample [%(default)s]', default=0, type=float) app.add_argument('--phylo', help='Software for phylogeny inference. Either fast (fasttree) or slow and more accurate (iqtree) [%(default)s]', default='fasttree', type=str, choices=['fasttree', 'iqtree']) app.add_argument('--tax_scope_threshold', help='Threshold for assigning the taxonomic scope of a gene cluster [%(default)s]', default=0.9, type=float) app.add_argument('--synteny_adj_cutoff', help='Minimum number of times gene clusters should be adjacent to be included in synteny graph [%(default)s]', default=1, type=int) diff --git a/maginator/workflow/pileup_parse.Snakefile b/maginator/workflow/pileup_parse.Snakefile index 2e19b4d1..6e89c328 100644 --- a/maginator/workflow/pileup_parse.Snakefile +++ b/maginator/workflow/pileup_parse.Snakefile @@ -42,3 +42,22 @@ rule parse: script: "scripts/mpileup.py" +rule allele_frequencies: + input: + stat=os.path.join(WD, 'phylo', 'stats.tab'), + gene=os.path.join(WD, 'phylo', 'stats_genes.tab') + output: + af_matrix=os.path.join(WD,'tabs','allele_frequencies.tab') + params: + af_cutoff = param_dict['af_cutoff'], + min_signature_genes = param_dict['min_signature_genes'], + min_nonN = param_dict['min_nonN'], + min_marker_genes = param_dict['min_marker_genes'] + conda: + "envs/signature_genes.yaml" + resources: + cores=2, + mem_gb=190, + runtime='172800' + script: + "scripts/allele_frequency_mat.R" diff --git a/maginator/workflow/scripts/allele_frequency_mat.R b/maginator/workflow/scripts/allele_frequency_mat.R new file mode 100644 index 00000000..e67d4418 --- /dev/null +++ b/maginator/workflow/scripts/allele_frequency_mat.R @@ -0,0 +1,59 @@ +library(dplyr) +gene_stats_file <- snakemake@input[["gene"]] +gene_stats <- read.table(file=gene_stats_file, header=TRUE) + +stats_file <- snakemake@input[["stat"]] +stats <- read.table(file=stats_file, header=TRUE) + +af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]]) +min_signature_genes <- as.integer(snakemake@params[["min_signature_genes"]]) +min_nonN <- as.numeric(snakemake@params[["min_nonN"]]) +min_marker_genes <- as.integer(snakemake@params[["min_marker_genes"]]) + +# Identifying the samples in each tree fulfill the default tree-criteria +#(<50 detected SG, min 2 marker genes, min 50% non-n coverage) +filtered_data <- stats %>% + filter(Marker_N >= min_marker_genes, Signature_N>=min_signature_genes, NonN > min_nonN) + +# Count the number of different samples for each cluster +sample_counts <- filtered_data %>% + group_by(Cluster) %>% + summarize(Count = n_distinct(Sample)) + +# Get unique clusters +unique_clusters <- unique(gene_stats$Cluster) +unique_samples <- unique(gene_stats$Sample) + +# Create an empty matrix +empty_matrix <- matrix(NA, nrow = length(unique_samples), ncol = length(unique_clusters), + dimnames = list(unique_samples, unique_clusters)) + +# Loop through each cluster +for (cluster_id in unique_clusters) { + if ((dim(gene_stats[gene_stats$Cluster == cluster_id,]))[1]>af_cutoff){ + # Subset data for the current cluster + cluster_data <- gene_stats[gene_stats$Cluster == cluster_id, ] + samples <- (filtered_data[filtered_data$Cluster==cluster_id, "Sample"]) + absent_samples <- unique_samples[!unique_samples %in% samples] + + # Calculate the median Signature_AF per sample for the current cluster + median_signature_af_per_sample <- cluster_data %>% + group_by(Sample) %>% + summarise(median_signature_af = median(Signature_AF, na.rm = TRUE)) + + # Convert median_signature_af_per_sample to a data frame + median_signature_af_per_sample_df <- as.data.frame(median_signature_af_per_sample) + # Replace values based on conditions + median_signature_af_per_sample_df$median_signature_af[median_signature_af_per_sample_df$median_signature_af > af_cutoff] <- 2 + median_signature_af_per_sample_df$median_signature_af[median_signature_af_per_sample_df$median_signature_af == af_cutoff] <- 1 + + # Filling in the numbers in the matrix + empty_matrix[,cluster_id] = median_signature_af_per_sample_df$median_signature_af + empty_matrix[absent_samples,cluster_id] = 0 # if the sample is not in the tree - set NA as AF + } +} + +AF_matrix <- empty_matrix + +# Save the data frame to a CSV file +write.csv(AF_matrix, file = snakemake@output[["af_matrix"]], row.names = TRUE) diff --git a/setup.py b/setup.py index 9a75b8dd..080fcad1 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="maginator", - version="0.1.19", + version="0.1.20", author="Jakob Russel & Trine Zachariasen", author_email="russel2620@gmail.com,trine_zachariasen@hotmail.com", description="MAGinator: Abundance, strain, and functional profiling of MAGs",