Skip to content

Commit

Permalink
Merge pull request #12 from Russel88/dev
Browse files Browse the repository at this point in the history
v.0.1.20 added feature for detecting strain-mixtures using allele frequeicies
  • Loading branch information
trinezac committed Apr 12, 2024
2 parents 6cca729 + a295e92 commit b9ceae7
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 1 deletion.
1 change: 1 addition & 0 deletions maginator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def cli():
app.add_argument('--min_nonN', help='Minimum fraction of non-N characters of a sample to be included in a phylogeny [%(default)s]', default=0.5, type=float)
app.add_argument('--min_marker_genes', help='Minimum marker genes to be detected for inclusion of a sample in a phylogeny [%(default)s]', default=2, type=int)
app.add_argument('--min_signature_genes', help='Minimum signature genes to be detected for inclusion of a sample in a phylogeny [%(default)s]', default=50, type=int)
app.add_argument('--af_cutoff', help='Cutoff for average median allele frequency of SG alignment for determining mixed/pure population of a MAG in a sample [%(default)s]', default=0, type=float)
app.add_argument('--phylo', help='Software for phylogeny inference. Either fast (fasttree) or slow and more accurate (iqtree) [%(default)s]', default='fasttree', type=str, choices=['fasttree', 'iqtree'])
app.add_argument('--tax_scope_threshold', help='Threshold for assigning the taxonomic scope of a gene cluster [%(default)s]', default=0.9, type=float)
app.add_argument('--synteny_adj_cutoff', help='Minimum number of times gene clusters should be adjacent to be included in synteny graph [%(default)s]', default=1, type=int)
Expand Down
19 changes: 19 additions & 0 deletions maginator/workflow/pileup_parse.Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,22 @@ rule parse:
script:
"scripts/mpileup.py"

rule allele_frequencies:
input:
stat=os.path.join(WD, 'phylo', 'stats.tab'),
gene=os.path.join(WD, 'phylo', 'stats_genes.tab')
output:
af_matrix=os.path.join(WD,'tabs','allele_frequencies.tab')
params:
af_cutoff = param_dict['af_cutoff'],
min_signature_genes = param_dict['min_signature_genes'],
min_nonN = param_dict['min_nonN'],
min_marker_genes = param_dict['min_marker_genes']
conda:
"envs/signature_genes.yaml"
resources:
cores=2,
mem_gb=190,
runtime='172800'
script:
"scripts/allele_frequency_mat.R"
59 changes: 59 additions & 0 deletions maginator/workflow/scripts/allele_frequency_mat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
library(dplyr)
gene_stats_file <- snakemake@input[["gene"]]
gene_stats <- read.table(file=gene_stats_file, header=TRUE)

stats_file <- snakemake@input[["stat"]]
stats <- read.table(file=stats_file, header=TRUE)

af_cutoff <- as.numeric(snakemake@params[["af_cutoff"]])
min_signature_genes <- as.integer(snakemake@params[["min_signature_genes"]])
min_nonN <- as.numeric(snakemake@params[["min_nonN"]])
min_marker_genes <- as.integer(snakemake@params[["min_marker_genes"]])

# Identifying the samples in each tree fulfill the default tree-criteria
#(<50 detected SG, min 2 marker genes, min 50% non-n coverage)
filtered_data <- stats %>%
filter(Marker_N >= min_marker_genes, Signature_N>=min_signature_genes, NonN > min_nonN)

# Count the number of different samples for each cluster
sample_counts <- filtered_data %>%
group_by(Cluster) %>%
summarize(Count = n_distinct(Sample))

# Get unique clusters
unique_clusters <- unique(gene_stats$Cluster)
unique_samples <- unique(gene_stats$Sample)

# Create an empty matrix
empty_matrix <- matrix(NA, nrow = length(unique_samples), ncol = length(unique_clusters),
dimnames = list(unique_samples, unique_clusters))

# Loop through each cluster
for (cluster_id in unique_clusters) {
if ((dim(gene_stats[gene_stats$Cluster == cluster_id,]))[1]>af_cutoff){
# Subset data for the current cluster
cluster_data <- gene_stats[gene_stats$Cluster == cluster_id, ]
samples <- (filtered_data[filtered_data$Cluster==cluster_id, "Sample"])
absent_samples <- unique_samples[!unique_samples %in% samples]

# Calculate the median Signature_AF per sample for the current cluster
median_signature_af_per_sample <- cluster_data %>%
group_by(Sample) %>%
summarise(median_signature_af = median(Signature_AF, na.rm = TRUE))

# Convert median_signature_af_per_sample to a data frame
median_signature_af_per_sample_df <- as.data.frame(median_signature_af_per_sample)
# Replace values based on conditions
median_signature_af_per_sample_df$median_signature_af[median_signature_af_per_sample_df$median_signature_af > af_cutoff] <- 2
median_signature_af_per_sample_df$median_signature_af[median_signature_af_per_sample_df$median_signature_af == af_cutoff] <- 1

# Filling in the numbers in the matrix
empty_matrix[,cluster_id] = median_signature_af_per_sample_df$median_signature_af
empty_matrix[absent_samples,cluster_id] = 0 # if the sample is not in the tree - set NA as AF
}
}

AF_matrix <- empty_matrix

# Save the data frame to a CSV file
write.csv(AF_matrix, file = snakemake@output[["af_matrix"]], row.names = TRUE)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="maginator",
version="0.1.19",
version="0.1.20",
author="Jakob Russel & Trine Zachariasen",
author_email="[email protected],[email protected]",
description="MAGinator: Abundance, strain, and functional profiling of MAGs",
Expand Down

0 comments on commit b9ceae7

Please sign in to comment.