diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88c9473..8409b0a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,23 @@
# Bismark Changelog
+## Changelog for Bismark v0.24.2 (release on 27 Sep 2023)
+
+### Bismark
+
+- removed an `exit 0` that would terminate runs after processing a single (set of) input file(s).
+
+### deduplicate_bismark
+
+- Changed the path to Samtools to custom variable ([#609](https://github.com/FelixKrueger/Bismark/issues/609))
+
+### coverage2cytosine
+
+- set threshold reads to 1 (if it was 0) for `--gc_context` as intended and mentioned in the help text. Fixes [#621](https://github.com/FelixKrueger/Bismark/issues/621)
+
+
+Added scripts for merging coverage files (e.g. for when R1 and R2 had been run in single-end mode)
+
+
## Changelog for Bismark v0.24.1 (release on 29 May 2023)
- Added new [documentation website](http://felixkrueger.github.io/Bismark/), built using [Material for Mkdocs](https://squidfunk.github.io/mkdocs-material/). Thanks to @ewels for a great (late-night) effort to break up and restructure what had become a fairly unwieldy monolithic beast
diff --git a/NOMe_filtering b/NOMe_filtering
index 4010f3d..0fc326e 100755
--- a/NOMe_filtering
+++ b/NOMe_filtering
@@ -23,7 +23,7 @@ use Carp;
my %chromosomes; # storing sequence information of all chromosomes/scaffolds
my %processed; # keeping a record of which chromosomes have been processed
-my $nome_version = 'v0.24.1';
+my $nome_version = 'v0.24.2';
my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$nome) = process_commandline();
diff --git a/bam2nuc b/bam2nuc
index 986feaf..6490893 100755
--- a/bam2nuc
+++ b/bam2nuc
@@ -31,7 +31,7 @@ my %freqs; # keeping a record of which chromosomes have been processed
my %genomic_freqs;
my %processed;
-my $bam2nuc_version = 'v0.24.1';
+my $bam2nuc_version = 'v0.24.2';
my ($output_dir,$genome_folder,$parent_dir,$samtools_path,$genome_freq_only) = process_commandline();
diff --git a/bismark b/bismark
index 8026a15..b3e574d 100755
--- a/bismark
+++ b/bismark
@@ -25,7 +25,7 @@ use lib "$RealBin/../lib";
my $parent_dir = getcwd();
-my $bismark_version = 'v0.24.1';
+my $bismark_version = 'v0.24.2';
my $copyright_dates = "2010-23";
my $start_run = time();
@@ -921,7 +921,7 @@ foreach my $filename (@filenames){
warn "Nucleotide coverage statistics are currently only available for BAM or CRAM files\n\n";
}
}
- exit 0;
+ # exit 0; # will terminate after a single supplied file....
}
else{
# If multiple files were supplied as the command line, like so:
@@ -7963,9 +7963,9 @@ VERSION
### BOWTIE 2/HISAT2 PARALLELIZATION OPTIONS
if ($parallel){
- die "Please select a value for -p of 2 or more (ideally not higher than 4 though...)!\n" unless ($parallel > 1);
+ die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
if ($parallel > 4){
- warn "Attention: using more than 4 cores per alignment thread has been reported to have diminishing returns. If possible try to limit -p to a value of 4\n"; sleep(2);
+ warn "Attention: early reports suggested that high values of -p to have diminishing returns. Please test different values using a small subset of data for your hardware setting.\n"; sleep(1);
}
push @aligner_options,"-p $parallel";
push @aligner_options,'--reorder'; ## re-orders the Bowtie 2 or HISAT2 output so that it does match the input files. This is abolutely required for parallelization to work.
@@ -9916,7 +9916,7 @@ Bowtie 2/ HISAT2 parallelization options:
and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
- by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
+ by a few hundred megabytes. This option is only available if Bowtie 2 is linked with the pthreads
library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
automatically use the option '--reorder', which guarantees that output SAM records are printed in
an order corresponding to the order of the reads in the original input file, even when -p is set
@@ -9994,6 +9994,6 @@ Bismark BAM/SAM OUTPUT (default):
Each read of paired-end alignments is written out in a separate line in the above format.
-Last modified on 28 May 2023
+Last modified on 23 August 2023
HOW_TO
}
diff --git a/bismark2bedGraph b/bismark2bedGraph
index 2dcec77..a7872b9 100755
--- a/bismark2bedGraph
+++ b/bismark2bedGraph
@@ -21,7 +21,7 @@ use Carp;
## You should have received a copy of the GNU General Public License
## along with this program. If not, see .
-my $bismark2bedGraph_version = 'v0.24.1';
+my $bismark2bedGraph_version = 'v0.24.2';
my @bedfiles;
my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
diff --git a/bismark2report b/bismark2report
index a94e8ac..3ce2a03 100755
--- a/bismark2report
+++ b/bismark2report
@@ -20,7 +20,7 @@ use lib "$RealBin/../lib";
## You should have received a copy of the GNU General Public License
## along with this program. If not, see .
-my $bismark2report_version = 'v0.24.1';
+my $bismark2report_version = 'v0.24.2';
my (@alignment_reports,@dedup_reports,@splitting_reports,@mbias_reports,@nuc_reports);
my ($output_dir,$verbose,$manual_output_file) = process_commandline();
diff --git a/bismark2summary b/bismark2summary
index f0dbe5d..dd2cfd3 100755
--- a/bismark2summary
+++ b/bismark2summary
@@ -22,7 +22,7 @@ use lib "$RealBin/../lib";
## You should have received a copy of the GNU General Public License
## along with this program. If not, see .
-my $bismark_version = '0.24.1';
+my $bismark_version = '0.24.2';
# Last modified 09 11 2020
diff --git a/bismark_genome_preparation b/bismark_genome_preparation
index 2857e50..b504938 100755
--- a/bismark_genome_preparation
+++ b/bismark_genome_preparation
@@ -42,7 +42,7 @@ my %genomic_freqs; # storing the genomic sequence composition
my %freqs;
-my $bismark_version = 'v0.24.1';
+my $bismark_version = 'v0.24.2';
my $copyright_date = '2010-23';
my $last_modified = "19 May 2022";
diff --git a/bismark_methylation_extractor b/bismark_methylation_extractor
index b9ed086..1139157 100755
--- a/bismark_methylation_extractor
+++ b/bismark_methylation_extractor
@@ -29,7 +29,7 @@ my $parent_dir = getcwd();
my %fhs;
-my $version = 'v0.24.1';
+my $version = 'v0.24.2';
my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore,$yacht,$ucsc) = process_commandline();
### only needed for bedGraph output
diff --git a/coverage2cytosine b/coverage2cytosine
index f86ac4c..e8d531a 100755
--- a/coverage2cytosine
+++ b/coverage2cytosine
@@ -23,7 +23,7 @@ use Carp;
my %chromosomes; # storing sequence information of all chromosomes/scaffolds
my %processed; # keeping a record of which chromosomes have been processed
my %context_summary; # storing methylation values for all contexts for NOMe-seq or scNMT-experiments
-my $coverage2cytosine_version = 'v0.24.1';
+my $coverage2cytosine_version = 'v0.24.2';
my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra,$nome,$disco,$threshold,$drach) = process_commandline();
@@ -708,6 +708,12 @@ sub generate_GC_context_report {
warn "="x82,"\n";
warn "Methylation information for GC context will now be written to a GpC-context report\n";
warn "="x82,"\n\n";
+
+ warn "For the GC report, positions need to have coverage of at least 1 call. The threshold currently is: $threshold\n";
+ if ($threshold == 0){
+ warn "Setting threshold to 1\n";
+ $threshold = 1;
+ }
sleep (2);
my $number_processed = 0;
@@ -981,6 +987,8 @@ sub generate_GC_context_report {
# if Cs were not covered at all they are not written out
if ($meth_bottom + $nonmeth_bottom >= $threshold){
+
+ # warn ("methylation bottom strand: $meth_bottom\nnon-methylation bottom strand: $nonmeth_bottom\nThreshold: $threshold\n\n"); sleep (1);
my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100);
if ($nome and ($context_bottom eq 'CG') ){
# not reporting these (see point 2. above)
@@ -1965,12 +1973,12 @@ sub process_commandline{
print << "VERSION";
- Bismark Methylation Extractor Module -
- coverage2cytosine
+ Bismark Methylation Extractor Module -
+ coverage2cytosine
- Bismark coverage2cytosine Version: $coverage2cytosine_version
- Copyright 2010-22 Felix Krueger, Altos Bioinformatics
- https://github.com/FelixKrueger/Bismark
+ Version: $coverage2cytosine_version
+ Copyright 2010-23 Felix Krueger, Altos Bioinformatics
+ https://github.com/FelixKrueger/Bismark
VERSION
@@ -2231,7 +2239,7 @@ The genome-wide cytosine methylation output file is tab-delimited in the followi
- Script last modified: 06 Oct 2020
+ Script last modified: 09 Sep 2023
EOF
;
diff --git a/deduplicate_bismark b/deduplicate_bismark
index bd682b0..850eaeb 100755
--- a/deduplicate_bismark
+++ b/deduplicate_bismark
@@ -7,7 +7,7 @@ use Cwd;
### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification
### Paired-end alignments are considered a duplicate if both partner reads start and end at the exact same position
-my $dedup_version = 'v0.24.1';
+my $dedup_version = 'v0.24.2';
my @filenames;
my ($single,$paired,$global_single,$global_paired,$samtools_path,$bam,$rrbs,$multiple,$output_dir,$outfile,$parallel) = process_commandline();
@@ -154,7 +154,7 @@ foreach my $file (@filenames){
if ($multiple){
# we are assuming that all files are either in BAM format
if ($file =~ /\.bam$/){
- open (IN,"$samtools_path cat -h $file @filenames | samtools view -h |") or die "Unable to read from BAM files in '@filenames': $!\n";
+ open (IN,"$samtools_path cat -h $file @filenames | $samtools_path view -h |") or die "Unable to read from BAM files in '@filenames': $!\n";
}
# or all in SAM format
else{ # if the reads are in SAM format we simply concatenate them
diff --git a/filter_non_conversion b/filter_non_conversion
index ba5dabf..73d1f98 100755
--- a/filter_non_conversion
+++ b/filter_non_conversion
@@ -22,7 +22,7 @@ $|++;
## along with this program. If not, see .
my $parent_dir = getcwd();
-my $filter_version = 'v0.24.1';
+my $filter_version = 'v0.24.2';
my ($global_single,$global_paired,$samtools_path,$threshold,$consecutive,$percentage_cutoff,$minimum_count) = process_commandline();
diff --git a/merge_arbitrary_coverage_files.py b/merge_arbitrary_coverage_files.py
new file mode 100755
index 0000000..c879559
--- /dev/null
+++ b/merge_arbitrary_coverage_files.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+import os
+import glob
+import time
+import gzip
+import argparse
+import sys
+
+def merge_coverage_files(basename):
+ print(f"Merging Bismark coverage files into a file called '{basename}.cov.gz'")
+ cov_files = glob.glob("*.cov.gz")
+
+ if not cov_files:
+ print("Error: No files ending in '.cov.gz' found in the current folder.")
+ sys.exit(1)
+
+ allcov = {} # overarching dictionary
+
+ for file in cov_files:
+ print(f"Reading methylation calls from file: {file}")
+
+ isGzip = False
+ if file.endswith("gz"):
+ infile = gzip.open(file, 'rt') # mode is 'rt' for text mode
+ isGzip = True
+ else:
+ infile = open(file)
+
+ for line in infile:
+
+ if isGzip:
+ line = line.rstrip() # no need to decode if using 'rt' mode
+ else:
+ line = line.rstrip()
+
+ chrom, pos, m, u = [line.split(sep="\t")[i] for i in (0, 1, 4, 5)] # list comprehension
+
+ if chrom in allcov.keys():
+ pass
+ else:
+ allcov[chrom] = {}
+
+ pos = int(pos)
+
+ if pos in allcov[chrom].keys():
+ pass
+ else:
+ allcov[chrom][pos] = {}
+ allcov[chrom][pos]["meth"] = 0
+ allcov[chrom][pos]["unmeth"] = 0
+
+ allcov[chrom][pos]["meth"] += int(m)
+ allcov[chrom][pos]["unmeth"] += int(u)
+
+ infile.close()
+
+ print("Now printing out a new, merged coverage file")
+
+ with gzip.open(f"{basename}.cov.gz", "wt") as out:
+ for chrom in sorted(allcov.keys()):
+ for pos in sorted(allcov[chrom].keys()):
+ perc = ''
+ if (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth'] == 0):
+ print("Both methylated and unmethylated positions were 0. Exiting...")
+ sys.exit()
+ else:
+ perc = allcov[chrom][pos]['meth'] / (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth']) * 100
+
+ out.write(f"{chrom}\t{pos}\t{pos}\t{perc:.2f}\t{allcov[chrom][pos]['meth']}\t{allcov[chrom][pos]['unmeth']}\n")
+
+ print(f"All done! The merged coverage file '{basename}.cov.gz' has been created.\n")
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Merge Bismark coverage files into a file called "basename.cov.gz".')
+ parser.add_argument('-b', '--basename', default='merged_coverage_file', help='The basename for the merged coverage file.')
+ args = parser.parse_args()
+ merge_coverage_files(args.basename)
diff --git a/merge_coverage_files_ARGV.py b/merge_coverage_files_ARGV.py
new file mode 100755
index 0000000..2accaa4
--- /dev/null
+++ b/merge_coverage_files_ARGV.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+
+import os
+import glob
+import time
+import gzip
+import sys
+import argparse
+
+print ("Merging Bismark coverage files given as command line arguments")
+# cov_files = glob.glob("*.cov.gz")
+
+# if (options.verbose):
+# print_options(options)
+
+
+def parse_options():
+ parser = argparse.ArgumentParser(description="Set base-name to write merged Bismark coverage files to")
+ parser.add_argument("-b","--basename", type=str, help="Basename of file to write merged coverage output to (default 'merged_coverage_file')", default="merged_coverage_file.cov.gz")
+ parser.add_argument("filenames", help="The Bismark coverage files to merge", nargs='+')
+
+ options = parser.parse_args()
+ return options
+
+def main():
+
+ options = parse_options()
+ print (f"Using the following basename: {options.basename}")
+ cov_files = options.filenames
+ print (f"Meging the following files: {cov_files}")
+ allcov = {} # overarching dictionary
+
+ for filename in cov_files:
+ print (f"Reading methylation calls from file: {filename}")
+
+ isGzip = False
+ if filename.endswith("gz"):
+ infile = gzip.open(filename) # mode is 'rb'
+ isGzip = True
+ else:
+ infile = open(filename)
+
+ for line in infile:
+
+ if isGzip:
+ line = line.decode().rstrip() # need to decode from a Binary to Str object
+ else:
+ line = line.rstrip()
+
+ # print(line)
+ # chrom, pos, m, u = line.split(sep="\t")[0,1,4,5]
+
+ chrom, pos, m, u = [line.split(sep="\t")[i] for i in (0,1,4,5)] # list comprehension
+
+ if chrom in allcov.keys():
+ pass
+ # print (f"already present: {chrom}")
+ else:
+ allcov[chrom] = {}
+
+ # print (f"Key not present yet for chromosome: {chrom}. Creating now")
+ # converting the position to int() right here
+ pos = int(pos)
+
+ if pos in allcov[chrom].keys():
+ # print (f"Positions was already present: chr: {chrom}, pos: {pos}")
+ pass
+ else:
+ allcov[chrom][pos] = {}
+ allcov[chrom][pos]["meth"] = 0
+ allcov[chrom][pos]["unmeth"] = 0
+
+ allcov[chrom][pos]["meth"] += int(m)
+ allcov[chrom][pos]["unmeth"] += int(u)
+
+ # print (f"Chrom: {chrom}")
+ # print (f"Chrom: {chrom}, Pos: {pos}, Meth: {m}, Unmeth: {u}")
+ # time.sleep(0.1)
+
+ infile.close()
+
+
+
+
+
+ # resetting
+ del chrom
+ del pos
+ outfile = f"{options.basename}.merged.cov.gz"
+
+ print (f"Now printing out a new, merged coverage file to >>{outfile}<<")
+
+ with gzip.open(outfile,"wt") as out:
+ # Just sorting by key will sort lexicographically, which is unintended.
+ # At least not for the positions
+ for chrom in sorted(allcov.keys()):
+
+ # Option I: This is a solution using {} dictionary comprehension. Seems to work.
+ # print (f"Converting position keys to int first for chromosome {chrom}")
+ # allcov[chrom] = {int(k) : v for k, v in allcov[chrom].items()}
+
+ # Option II: Another option could be to use a Lambda function to make the keys integers on the fly
+ # print (f"Attempting solution using a Lambda function on the positions for chrom:{chrom}")
+ # for pos in sorted(allcov[chrom].keys(), key = lambda x: int(x)):
+ # This also works
+
+ # Option III: Since I changed the values going into the positions dictionary, sorted()
+ # will now automatically perform a numerical sort. So no further action is required.
+ print (f"Now sorting positions on chromosome: {chrom}")
+ for pos in sorted(allcov[chrom].keys()):
+ perc = ''
+ if (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth'] == 0):
+ # This should not happen in real data, as coverage files by definition only show covered positions
+ print ("Both methylated and unmethylated positions were 0. Dying now...")
+ sys.exit()
+ else:
+ perc = allcov[chrom][pos]['meth'] / (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth']) * 100
+
+ # percentage is displayed with 2 decimals with f-string formatting
+ out.write(f"{chrom}\t{pos}\t{pos}\t{perc:.2f}\t{allcov[chrom][pos]['meth']}\t{allcov[chrom][pos]['unmeth']}\n")
+
+ print ("All done, enjoy the merged coverage file!\n")
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/methylation_consistency b/methylation_consistency
index e3bc08a..60fdff7 100755
--- a/methylation_consistency
+++ b/methylation_consistency
@@ -3,7 +3,7 @@ use warnings;
use strict;
use Getopt::Long;
-my $VERSION = "0.24.1";
+my $VERSION = "0.24.2";
my ($min_count,$global_single,$global_paired,$lower_threshold,$upper_threshold,$samtools_path,$chh) = process_commandline();
if ($chh){