Commit 6cca729

Merge pull request #11 from Russel88/dev

v.0.1.19

trinezac committed Jan 4, 2024
2 parents fed31ec + 6951bf3 commit 6cca729
Showing 20 changed files with 91 additions and 63 deletions.
46 changes: 28 additions & 18 deletions README.md
@@ -23,10 +23,10 @@ conda activate maginator
pip install maginator
```

Furthermore, MAGinator also needs the GTDB-tk database version R207_v2 downloaded. If you don't already have it, you can run the following:
Furthermore, MAGinator needs the GTDB-tk database downloaded. Here we download release 214. If you don't already have it, you can run the following:
```sh
wget https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_v2_data.tar.gz
tar xvzf gtdbtk_v2_data.tar.gz
wget https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz
tar xvzf *.tar.gz
```

## Usage
@@ -39,7 +39,7 @@ MAGinator needs 3 input files:

Run MAGinator:
```sh
maginator -v vamb_clusters.tsv -r reads.csv -c contigs.fasta -o my_output -g "/path/to/GTDB-Tk/database/release207_v2/"
maginator -v vamb_clusters.tsv -r reads.csv -c contigs.fasta -o my_output -g "/path/to/GTDB-Tk/database/release214/"
```
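For orientation, a hypothetical reads.csv could look like the sketch below (sample names and paths are invented; check the repository documentation for the exact column layout):
```sh
# Hypothetical reads.csv: one sample per line, listing name, forward reads, reverse reads
cat reads.csv
# sampleA,/data/sampleA_R1.fastq.gz,/data/sampleA_R2.fastq.gz
# sampleB,/data/sampleB_R1.fastq.gz,/data/sampleB_R2.fastq.gz
```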

A testset can be found in the test_data directory.
@@ -65,11 +65,18 @@ A test set can be found in the maginator/test_data directory.
4. Unzip the contigs.fasta.gz
5. Run MAGinator

MAGinator has been run on the test data on a slurm server with the following command:
MAGinator can be run on the test data on a slurm server with the following command:
```sh
maginator --vamb_clusters clusters.tsv --reads reads.csv --contigs contigs.fasta --gtdb_db data/release207_v2/ --output test_out --cluster slurm --cluster_info "-n {cores} --mem {mem_gb}gb -t {runtime}" --max_mem 180
maginator --vamb_clusters clusters.tsv --reads reads.csv --contigs contigs.fasta --gtdb_db data/release214/ --output test_out --cluster slurm --cluster_info "-n {cores} --mem {mem_gb}gb -t {runtime}" --max_mem 180
```
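In the --cluster_info template, the {cores}, {mem_gb} and {runtime} placeholders are presumably substituted per job from each rule's declared resources before submission, so a single template covers rules with very different requirements.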
The expected output can be found as a zipped file on Zenodo: https://doi.org/10.5281/zenodo.8279036
The expected output can be found as a zipped file on Zenodo: https://doi.org/10.5281/zenodo.8279036. MAGinator has been run on the test data (using GTDB-tk db release207_v2) on a slurm server.

On the compute cluster each job had access to 180 GB RAM, with the following time consumption:
real 72m27.379s
user 0m18.830s
sys 1m0.454s

If you run on a smaller server you can set the parameters --max_cores and --max_mem.
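As a sketch (the core and memory values below are placeholders, not recommendations), the same run constrained to a smaller machine could look like:
```sh
maginator -v vamb_clusters.tsv -r reads.csv -c contigs.fasta -o my_output \
    -g "/path/to/GTDB-Tk/database/release214/" --max_cores 40 --max_mem 180
```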

## Recommended workflow

@@ -120,11 +120,13 @@ This is what MAGinator does with your input (if you want to see all parameters r
* Use --clustering_coverage to toggle the clustering coverage
* Use --clustering_type to toggle whether to cluster on amino acid or nucleotide level
* Map reads to the non-redundant gene catalogue and create a matrix with gene counts for each sample
* Pick non-redundant genes that are only found in one MGS each
* Fit signature gene model and use the resulting signature genes to get the abundance of each MGS
* Prepare for generation of phylogenies for each MGS by finding outgroups and marker genes which will be used for rooting the phylogenies
* Pick non-redundant genes that are only found in one MAG cluster each
* Fit signature gene model and use the resulting signature genes to get the abundance of each MAG cluster
* Use --min_mapped_signature_genes to change the minimum number of signature genes that must be detected in a sample for that sample to be included in the analysis
* Use --min_samples to alter the minimum number of samples in which the MAG cluster must be present for signature gene refinement to be performed
* Prepare for generation of phylogenies for each MAG cluster by finding outgroups and marker genes which will be used for rooting the phylogenies
* Use the read mappings to collect SNV information for each signature gene and marker gene for each sample
* Align signature and marker genes, concatenate alignments and infer phylogenetic trees for each MGS
* Align signature and marker genes, concatenate alignments and infer phylogenetic trees for each MAG cluster
* Use --phylo to toggle whether use fasttree (fast, approximate) or iqtree (slow, precise) to infer phylogenies
* Infer the taxonomic scope of each gene cluster, that is, the taxonomic level at which genes from a given gene cluster are found
* Use --tax_scope_threshold to toggle the threshold for how to find the taxonomic scope consensus
@@ -144,6 +153,7 @@ This is what MAGinator does with your input (if you want to see all parameters r
* all_genes_cluster.tsv - Gene clusters
* matrix/
* gene_count_matrix.tsv - Read count for each gene cluster for each sample
* small_gene_count_matrix.tsv - Read count matrix containing only the genes that do not cluster across MAG clusters
* synteny/ - Intermediate files for synteny clustering of gene clusters
* gtdbtk/
* <cluster>/ - GTDB-tk taxonomic annotation for each VAMB cluster
@@ -152,19 +162,19 @@ This is what MAGinator does with your input (if you want to see all parameters r
* bams/ - Bam files for mapping reads to gene clusters
* phylo/
* alignments/ - Alignments for each signature gene
* cluster_alignments/ - Concatenated alignments for each MGS
* pileup/ - SNV information for each MGS and each sample
* trees/ - Phylogenetic trees for each MGS
* cluster_alignments/ - Concatenated alignments for each MAG cluster
* pileup/ - SNV information for each MAG cluster and each sample
* trees/ - Phylogenetic trees for each MAG cluster
* stats.tab - Mapping information such as non-N fraction, number of signature genes and marker genes, read depth, and number of bases not reaching allele frequency cutoff
* stats_genes.tab - Same as above but the information is split per gene
* signature_genes/
* \- R data files with signature gene optimization
* read-count_detected-genes.pdf - Figure for each MGS/cluster displaying number of identified SG's in each sample along with the number of reads mapped.
* read-count_detected-genes.pdf - Figure for each MAG cluster displaying the number of identified SGs in each sample along with the number of reads mapped.
* tabs/
* gene_cluster_bins.tab - Table listing which bins each gene cluster was found in
* gene_cluster_tax_scope.tab - Table listing the taxonomic scope of each gene cluster
* metagenomicspecies.tab - Table listing which, if any, clusters where merged in MGS and the taxonomy of those
* signature_genes_cluster.tsv - Table with the signature genes for each MGS/cluster
* metagenomicspecies.tab - Table listing which clusters, if any, were merged into a MAG cluster, and the taxonomy of those
* signature_genes_cluster.tsv - Table with the signature genes for each MAG cluster
* synteny_clusters.tab - Table listing the synteny cluster association for the gene clusters. Gene clusters from the same synteny cluster are genomically adjacent.
* tax_matrix.tsv - Table with taxonomy information for MGS
* tax_matrix.tsv - Table with taxonomy information for each MAG cluster

4 changes: 4 additions & 0 deletions maginator/main.py
@@ -69,6 +69,10 @@ def cli():
app.add_argument('--clustering_type', help='Sequence type for gene clustering with MMseqs2. Nucleotide- or protein-level [%(default)s]', default='protein', type=str, choices=['nucleotide', 'protein'])
app.add_argument('--min_gtdb_markers', help='Minimum GTDBtk marker genes shared between MGS and outgroup for rooting trees [%(default)s]', default=10, type=int)
app.add_argument('--marker_gene_cluster_prevalence', help='Minimum prevalence of marker genes to be selected for rooting of MGS trees [%(default)s]', default=0.5, type=float)

app.add_argument('--min_mapped_signature_genes', help='Minimum number of signature genes with reads mapped for the sample to be included in the refinement [%(default)s]', default=3, type=int)
app.add_argument('--min_samples', help='Minimum number of samples containing the MAG cluster (more than "min_mapped_signature_genes" present) for the MAG cluster to identify SGs [%(default)s]', default=3, type=int)

app.add_argument('--min_af', help='Minimum allele frequency for calling a base when creating phylogenies [%(default)s]', default=0.8, type=float)
app.add_argument('--min_depth', help='Minimum read depth for calling a base when creating phylogenies [%(default)s]', default=2, type=int)
app.add_argument('--min_nonN', help='Minimum fraction of non-N characters of a sample to be included in a phylogeny [%(default)s]', default=0.5, type=float)
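As a sketch (the threshold values below are illustrative only), the two new arguments slot into a normal run like so:
```sh
maginator -v vamb_clusters.tsv -r reads.csv -c contigs.fasta -o my_output \
    -g "/path/to/GTDB-Tk/database/release214/" \
    --min_mapped_signature_genes 5 --min_samples 4
```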
22 changes: 11 additions & 11 deletions maginator/recommended_workflow/reads_to_bins.Snakefile
@@ -39,7 +39,7 @@ rule unzip:
memory = 45,
runtime = '1:00:00'
shell:
"zcat {input.fastq1} > {output.unzip1}; zcat {input.fastq2} > {output.unzip2}"
"zcat {input.fastq1} > {output.unzip1}; zcat {input.fastq2} > {output.unzip2}"

# Removing adapters
rule adapter_removal:
@@ -176,7 +176,7 @@ rule stats_fig:
run:
import numpy as np
from matplotlib import pyplot as plt

a = np.loadtxt(input.stats, delimiter='\t')
A = a[::, a[3,].argsort()[::-1]] # sorting the np array according to amount of reads in the sample
reads_left = A[3]
@@ -229,7 +229,7 @@ rule contig_rename:
input:
"assembly/{sample}/contigs.fasta"
output:
"assembly/{sample}/renamed_contigs.fasta"
"assembly/{sample}/renamed_contigs.fasta"
resources:
cores = 1,
memory = 45,
@@ -305,8 +305,8 @@ rule map:
input:
index="assembly/all_assemblies",
gene_cat="assembly/all_assemblies.fasta",
R1 = lambda wildcards: sample_dict[wildcards.sample][0],
R2 = lambda wildcards: sample_dict[wildcards.sample][1]
R1 = lambda wildcards: sample_dict[wildcards.sample][0],
R2 = lambda wildcards: sample_dict[wildcards.sample][1]
output:
"mapped/{sample}.bam"
params:
@@ -336,7 +336,7 @@ rule sort:
resources:
cores = 1,
memory = 15,
runtime = '10:00:00:00'
runtime = '10:00:00:00'
params:
prefix="mapped/tmp.{sample}"
log:
@@ -373,7 +373,7 @@ rule cut_column1to3:
runtime = '1:00:00:00'
log:
"log/jgi/column1to3"
shell:
shell:
"cut -f1-3 {input} > {output} 2>{log}"

rule cut_column4to5:
@@ -387,23 +387,23 @@ rule cut_column4to5:
runtime = '1:00:00:00'
log:
"log/jgi/{sample}.cut.log"
shell:
shell:
"cut -f1-3 --complement {input} > {output} 2>{log}"

rule paste_abundances:
input:
column1to3="jgi/jgi.column1to3",
data=expand("jgi/{sample}.cut.jgi", sample=SAMPLES)
output:
"jgi_matrix/jgi.abundance.dat"
"jgi_matrix/jgi.abundance.dat"
resources:
cores = 1,
memory = 1,
runtime = '1:00:00:00'
log:
"log/jgi/paste_abundances.log"
shell:
"paste {input.column1to3} {input.data} > {output} 2>{log}"
shell:
"paste {input.column1to3} {input.data} > {output} 2>{log}"


rule vamb:
2 changes: 1 addition & 1 deletion maginator/workflow.py
@@ -31,7 +31,7 @@ def run(self, snakefile):
# Define core snakemake command
cmd = ['snakemake',
'--use-conda',
'--latency-wait', '80',
'--latency-wait', '150',
'-s', snakefile,
'--config',
'wd='+self.output,
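Raising Snakemake's --latency-wait from 80 to 150 seconds gives slow shared filesystems more time for output files to appear before a job is considered failed.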
2 changes: 1 addition & 1 deletion maginator/workflow/envs/filter_geneclusters.yaml
@@ -4,5 +4,5 @@ channels:
dependencies:
- bbmap=38.96
- bwa-mem2=2.2.1
- samtools=1.10
- samtools=1.18

4 changes: 2 additions & 2 deletions maginator/workflow/envs/filter_gtdbtk.yaml
@@ -2,11 +2,11 @@ channels:
- bioconda
- conda-forge
dependencies:
- gtdbtk=2.1
- gtdbtk=2.3.2
- biopython=1.79
- pandas=1.4
- numpy=1.23.1
- mmseqs2=13.45111
- bbmap=38.96
- bwa-mem2=2.2.1
- samtools=1.10
- samtools=1.18
2 changes: 1 addition & 1 deletion maginator/workflow/envs/phylo.yaml
@@ -5,7 +5,7 @@ channels:
dependencies:
- biopython=1.79
- pandas=1.4
- samtools=1.10
- samtools=1.18
- fasttree=2
- perl=5
- mafft=7
2 changes: 1 addition & 1 deletion maginator/workflow/filter.Snakefile
@@ -22,7 +22,7 @@ mem = math.ceil(n_contigs/1000000)*30
if mem > int(param_dict['max_mem']):
mem = int(param_dict['max_mem'])
## time is 1 hour per million
tim = str(math.ceil(n_contigs/1000000)*60*60) # runtime in seconds
tim = str(max(1800, min(math.ceil(n_contigs/100000)*60*60, 72000))) # runtime in seconds, max 20h, min 30min

rule all:
input:
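The new estimate scales at one hour per 100,000 contigs and is clamped to a sensible window: for example, 250,000 contigs yield ceil(250000/100000)*3600 = 10800 s (3 h), and the result never drops below 1800 s (30 min) or exceeds 72000 s (20 h).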
4 changes: 2 additions & 2 deletions maginator/workflow/gene_count_mat.Snakefile
@@ -40,7 +40,7 @@ rule gene_names:
wildcard_constraints:
sample=SAMPLES[0]
conda:
"envs/filter_geneclusters.yaml"
"envs/filter_geneclusters.yaml"
resources:
cores = 1,
mem_gb = 20,
@@ -78,7 +78,7 @@ rule gene_count_matrix:
output:
os.path.join(WD, 'genes', 'matrix', 'gene_count_matrix.tsv')
conda:
"envs/filter_geneclusters.yaml"
"envs/filter_geneclusters.yaml"
resources:
cores = 1,
mem_gb = 188,
11 changes: 7 additions & 4 deletions maginator/workflow/gtdbtk.Snakefile
@@ -26,14 +26,14 @@ wildcard_constraints:

rule all:
input:
expand(os.path.join(WD, 'gtdbtk', '{cluster}/classify/gtdbtk.bac120.summary.tsv'), cluster=CLUSTERS)
expand(os.path.join(WD, 'gtdbtk', '{cluster}/gtdbtk.done'), cluster=CLUSTERS)

rule gtdbtk:
input:
os.path.join(F_DIR, '{cluster}/')
ancient(os.path.join(F_DIR, '{cluster}/'))
output:
directory(os.path.join(WD, 'gtdbtk', '{cluster}')),
os.path.join(WD, 'gtdbtk', '{cluster}/classify/gtdbtk.bac120.summary.tsv')
os.path.join(WD, 'gtdbtk', '{cluster}/gtdbtk.done')
conda:
"envs/filter_gtdbtk.yaml"
params:
@@ -45,5 +45,8 @@ rule gtdbtk:
shell:
'''
export GTDBTK_DATA_PATH={params.gtdbtk};
gtdbtk classify_wf --genome_dir {input} --out_dir {output[0]} --cpus {resources.cores} --extension fa --keep_intermediates || true
gtdbtk classify_wf --genome_dir {input} --out_dir {output[0]} --cpus {resources.cores} --extension fa --keep_intermediates --skip_ani_screen || true
if [[ -f {output[0]}/classify/gtdbtk.bac120.summary.tsv || -f {output[0]}/classify/gtdbtk.ar53.summary.tsv ]]; then
touch {output[1]}
fi
'''
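The gtdbtk.done sentinel makes the rule robust to GTDB-Tk's variable output: classify_wf may emit a bacterial summary (gtdbtk.bac120.summary.tsv), an archaeal one (gtdbtk.ar53.summary.tsv), or both, and the sentinel is touched whenever either exists.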
2 changes: 1 addition & 1 deletion maginator/workflow/pileup.Snakefile
@@ -36,7 +36,7 @@ rule pileup:
output:
os.path.join(WD, 'phylo', 'pileup', '{sample}.mp')
conda:
"envs/phylo.yaml"
"envs/phylo.yaml"
resources:
cores = 1,
mem_gb = 20,
6 changes: 4 additions & 2 deletions maginator/workflow/prescreening_genes.Snakefile
@@ -47,7 +47,7 @@ rule sort_genes_across_MGS:
output:
os.path.join(WD, 'genes', 'matrix', 'small_gene_count_matrix.tsv')
conda:
"envs/signature_genes.yaml"
"envs/signature_genes.yaml"
resources:
cores = 1,
mem_gb = 188,
@@ -67,7 +67,7 @@ rule format_conversion:
R_gene_lengths = os.path.join(WD, 'signature_genes', 'gene_lengths.RDS'),
clusters_dir = (directory(cluster_DIR))
conda:
"envs/signature_genes.yaml"
"envs/signature_genes.yaml"
resources:
cores = 1,
mem_gb = 188,
@@ -83,6 +83,8 @@ rule prescreening_genes:
gene_lengths = os.path.join(WD, 'signature_genes', 'gene_lengths.RDS')
output:
clusters_sorted = os.path.join(WD, 'signature_genes', 'clusters_sorted.RDS'),
params:
min_mapped_signature_genes = param_dict['min_mapped_signature_genes']
conda:
"envs/signature_genes.yaml"
resources:
4 changes: 2 additions & 2 deletions maginator/workflow/reads_to_bins.Snakefile
@@ -213,7 +213,7 @@ rule contig_rename:
input:
"assembly/{sample}/contigs.fasta"
output:
"assembly/{sample}/renamed_contigs.fasta"
"assembly/{sample}/renamed_contigs.fasta"
resources:
cores = 1,
mem_gb = 45,
@@ -289,7 +289,7 @@ rule map:
input:
index="assembly/all_assemblies",
gene_cat="assembly/all_assemblies.fasta",
R1 = lambda wildcards: sample_dict[wildcards.sample][0],
R1 = lambda wildcards: sample_dict[wildcards.sample][0],
R2 = lambda wildcards: sample_dict[wildcards.sample][1]
output:
"mapped/{sample}.bam"
9 changes: 5 additions & 4 deletions maginator/workflow/scripts/SG_refinement.R
@@ -20,7 +20,8 @@ snakemake@source(snakemake@params[["functions"]])
n.genes <- 100

#minimum number of mapped genes required
n.mapped.minimum <- 3
n.mapped.minimum <- as.integer(snakemake@params[["min_mapped_signature_genes"]])
n.minimum.samples <- as.integer(snakemake@params[["min_samples"]])

ids <- names(Clusterlist) #the ids of the MGS
id <- tail(strsplit(strsplit(snakemake@input[["clusters_dir"]], ".RDS")[[1]], "/")[[1]], n=1)
@@ -59,12 +60,12 @@ run_one_id <- function(id){

#colsum is the amount mapped genes in any given sample
colsum <- colSums(Clusterlist[[id]][1:n.genes, ])
# extracting only the samples which contains above 3 mapped reads -- if there are 3 reads we believe this is a true detection
# extracting only the samples that contain at least n.mapped.minimum mapped reads -- if that many reads map we believe this is a true detection
mapped <- colsum[colsum >= n.mapped.minimum]

n_samples <- setNames(c(n_samples, sum(colsum>=n.mapped.minimum)), c(names(n_samples), id))
#if less than 3 samples are mapped to the MGS, the MGS should be skipped
if (length(mapped) < 3){
#if fewer than n.minimum.samples samples are mapped to the MGS, the MGS should be skipped
if (length(mapped) < n.minimum.samples){
# print("Not enough samples are mapped to the cluster")
return()}
genes <- names(Clusterlist[[id]][1:n.genes, 1])
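With this change the two refinement thresholds, previously hard-coded to 3, are read from snakemake@params, so the new --min_mapped_signature_genes and --min_samples flags propagate from main.py through the Snakefiles into the R scripts.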
2 changes: 1 addition & 1 deletion maginator/workflow/scripts/abundance_profiles.R
@@ -18,7 +18,7 @@ colnames(taxonomy) <- c("Cluster","Taxonomy")
#setting important variables
gene_index <- seq(1,length(GeneLengths))
gene_names <- names(GeneLengths)
n.mapped.minimum <- 3 #The number of genes that needs reads that map to count the cluster as present
n.mapped.minimum <- as.integer(snakemake@params[["min_mapped_signature_genes"]]) #The number of genes that need mapped reads for the cluster to count as present
n.genes <- 100 # number of signature genes

# inserting NA for the Clusters that do not have a annotation
2 changes: 1 addition & 1 deletion maginator/workflow/scripts/gene_refinement_plots.R
@@ -9,7 +9,7 @@ taxmat <- read.csv(snakemake@input[["tax_matrix"]], sep="\t", header=FALSE)

# Initializing relevant parameters
n_signature_genes_expected <- '95'
minimum_sampels <- 3
minimum_sampels <- as.integer(snakemake@params[["min_samples"]])
n.genes <- 100

#loading colors