diff --git a/CHANGELOG.md b/CHANGELOG.md index 88c9473..8409b0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Bismark Changelog +## Changelog for Bismark v0.24.2 (release on 27 Sep 2023) + +### Bismark + +- removed an `exit 0` that would terminate runs after processing a single (set of) input file(s). + +### deduplicate_bismark + +- Changed the path to Samtools to custom variable ([#609](https://github.com/FelixKrueger/Bismark/issues/609)) + +### coverage2cytosine + +- set threshold reads to 1 (if it was 0) for `--gc_context` as intended and mentioned in the help text. Fixes [#621](https://github.com/FelixKrueger/Bismark/issues/621) + + +Added scripts for merging coverage files (e.g. for when R1 and R2 had been run in single-end mode) + + ## Changelog for Bismark v0.24.1 (release on 29 May 2023) - Added new [documentation website](http://felixkrueger.github.io/Bismark/), built using [Material for Mkdocs](https://squidfunk.github.io/mkdocs-material/). Thanks to @ewels for a great (late-night) effort to break up and restructure what had become a fairly unwieldy monolithic beast diff --git a/NOMe_filtering b/NOMe_filtering index 4010f3d..0fc326e 100755 --- a/NOMe_filtering +++ b/NOMe_filtering @@ -23,7 +23,7 @@ use Carp; my %chromosomes; # storing sequence information of all chromosomes/scaffolds my %processed; # keeping a record of which chromosomes have been processed -my $nome_version = 'v0.24.1'; +my $nome_version = 'v0.24.2'; my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$nome) = process_commandline(); diff --git a/bam2nuc b/bam2nuc index 986feaf..6490893 100755 --- a/bam2nuc +++ b/bam2nuc @@ -31,7 +31,7 @@ my %freqs; # keeping a record of which chromosomes have been processed my %genomic_freqs; my %processed; -my $bam2nuc_version = 'v0.24.1'; +my $bam2nuc_version = 'v0.24.2'; my ($output_dir,$genome_folder,$parent_dir,$samtools_path,$genome_freq_only) = process_commandline(); diff --git a/bismark b/bismark index 8026a15..b3e574d 100755 --- a/bismark +++ b/bismark @@ -25,7 +25,7 @@ use lib "$RealBin/../lib"; my $parent_dir = getcwd(); -my $bismark_version = 'v0.24.1'; +my $bismark_version = 'v0.24.2'; my $copyright_dates = "2010-23"; my $start_run = time(); @@ -921,7 +921,7 @@ foreach my $filename (@filenames){ warn "Nucleotide coverage statistics are currently only available for BAM or CRAM files\n\n"; } } - exit 0; + # exit 0; # will terminate after a single supplied file.... } else{ # If multiple files were supplied as the command line, like so: @@ -7963,9 +7963,9 @@ VERSION ### BOWTIE 2/HISAT2 PARALLELIZATION OPTIONS if ($parallel){ - die "Please select a value for -p of 2 or more (ideally not higher than 4 though...)!\n" unless ($parallel > 1); + die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); if ($parallel > 4){ - warn "Attention: using more than 4 cores per alignment thread has been reported to have diminishing returns. If possible try to limit -p to a value of 4\n"; sleep(2); + warn "Attention: early reports suggested that high values of -p to have diminishing returns. Please test different values using a small subset of data for your hardware setting.\n"; sleep(1); } push @aligner_options,"-p $parallel"; push @aligner_options,'--reorder'; ## re-orders the Bowtie 2 or HISAT2 output so that it does match the input files. This is abolutely required for parallelization to work. @@ -9916,7 +9916,7 @@ Bowtie 2/ HISAT2 parallelization options: and synchronize when parsing reads and outputting alignments. Searching for alignments is highly parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint - by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads + by a few hundred megabytes. This option is only available if Bowtie 2 is linked with the pthreads library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will automatically use the option '--reorder', which guarantees that output SAM records are printed in an order corresponding to the order of the reads in the original input file, even when -p is set @@ -9994,6 +9994,6 @@ Bismark BAM/SAM OUTPUT (default): Each read of paired-end alignments is written out in a separate line in the above format. -Last modified on 28 May 2023 +Last modified on 23 August 2023 HOW_TO } diff --git a/bismark2bedGraph b/bismark2bedGraph index 2dcec77..a7872b9 100755 --- a/bismark2bedGraph +++ b/bismark2bedGraph @@ -21,7 +21,7 @@ use Carp; ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . -my $bismark2bedGraph_version = 'v0.24.1'; +my $bismark2bedGraph_version = 'v0.24.2'; my @bedfiles; my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total diff --git a/bismark2report b/bismark2report index a94e8ac..3ce2a03 100755 --- a/bismark2report +++ b/bismark2report @@ -20,7 +20,7 @@ use lib "$RealBin/../lib"; ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . -my $bismark2report_version = 'v0.24.1'; +my $bismark2report_version = 'v0.24.2'; my (@alignment_reports,@dedup_reports,@splitting_reports,@mbias_reports,@nuc_reports); my ($output_dir,$verbose,$manual_output_file) = process_commandline(); diff --git a/bismark2summary b/bismark2summary index f0dbe5d..dd2cfd3 100755 --- a/bismark2summary +++ b/bismark2summary @@ -22,7 +22,7 @@ use lib "$RealBin/../lib"; ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . -my $bismark_version = '0.24.1'; +my $bismark_version = '0.24.2'; # Last modified 09 11 2020 diff --git a/bismark_genome_preparation b/bismark_genome_preparation index 2857e50..b504938 100755 --- a/bismark_genome_preparation +++ b/bismark_genome_preparation @@ -42,7 +42,7 @@ my %genomic_freqs; # storing the genomic sequence composition my %freqs; -my $bismark_version = 'v0.24.1'; +my $bismark_version = 'v0.24.2'; my $copyright_date = '2010-23'; my $last_modified = "19 May 2022"; diff --git a/bismark_methylation_extractor b/bismark_methylation_extractor index b9ed086..1139157 100755 --- a/bismark_methylation_extractor +++ b/bismark_methylation_extractor @@ -29,7 +29,7 @@ my $parent_dir = getcwd(); my %fhs; -my $version = 'v0.24.1'; +my $version = 'v0.24.2'; my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore,$yacht,$ucsc) = process_commandline(); ### only needed for bedGraph output diff --git a/coverage2cytosine b/coverage2cytosine index f86ac4c..e8d531a 100755 --- a/coverage2cytosine +++ b/coverage2cytosine @@ -23,7 +23,7 @@ use Carp; my %chromosomes; # storing sequence information of all chromosomes/scaffolds my %processed; # keeping a record of which chromosomes have been processed my %context_summary; # storing methylation values for all contexts for NOMe-seq or scNMT-experiments -my $coverage2cytosine_version = 'v0.24.1'; +my $coverage2cytosine_version = 'v0.24.2'; my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra,$nome,$disco,$threshold,$drach) = process_commandline(); @@ -708,6 +708,12 @@ sub generate_GC_context_report { warn "="x82,"\n"; warn "Methylation information for GC context will now be written to a GpC-context report\n"; warn "="x82,"\n\n"; + + warn "For the GC report, positions need to have coverage of at least 1 call. The threshold currently is: $threshold\n"; + if ($threshold == 0){ + warn "Setting threshold to 1\n"; + $threshold = 1; + } sleep (2); my $number_processed = 0; @@ -981,6 +987,8 @@ sub generate_GC_context_report { # if Cs were not covered at all they are not written out if ($meth_bottom + $nonmeth_bottom >= $threshold){ + + # warn ("methylation bottom strand: $meth_bottom\nnon-methylation bottom strand: $nonmeth_bottom\nThreshold: $threshold\n\n"); sleep (1); my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); if ($nome and ($context_bottom eq 'CG') ){ # not reporting these (see point 2. above) @@ -1965,12 +1973,12 @@ sub process_commandline{ print << "VERSION"; - Bismark Methylation Extractor Module - - coverage2cytosine + Bismark Methylation Extractor Module - + coverage2cytosine - Bismark coverage2cytosine Version: $coverage2cytosine_version - Copyright 2010-22 Felix Krueger, Altos Bioinformatics - https://github.com/FelixKrueger/Bismark + Version: $coverage2cytosine_version + Copyright 2010-23 Felix Krueger, Altos Bioinformatics + https://github.com/FelixKrueger/Bismark VERSION @@ -2231,7 +2239,7 @@ The genome-wide cytosine methylation output file is tab-delimited in the followi - Script last modified: 06 Oct 2020 + Script last modified: 09 Sep 2023 EOF ; diff --git a/deduplicate_bismark b/deduplicate_bismark index bd682b0..850eaeb 100755 --- a/deduplicate_bismark +++ b/deduplicate_bismark @@ -7,7 +7,7 @@ use Cwd; ### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification ### Paired-end alignments are considered a duplicate if both partner reads start and end at the exact same position -my $dedup_version = 'v0.24.1'; +my $dedup_version = 'v0.24.2'; my @filenames; my ($single,$paired,$global_single,$global_paired,$samtools_path,$bam,$rrbs,$multiple,$output_dir,$outfile,$parallel) = process_commandline(); @@ -154,7 +154,7 @@ foreach my $file (@filenames){ if ($multiple){ # we are assuming that all files are either in BAM format if ($file =~ /\.bam$/){ - open (IN,"$samtools_path cat -h $file @filenames | samtools view -h |") or die "Unable to read from BAM files in '@filenames': $!\n"; + open (IN,"$samtools_path cat -h $file @filenames | $samtools_path view -h |") or die "Unable to read from BAM files in '@filenames': $!\n"; } # or all in SAM format else{ # if the reads are in SAM format we simply concatenate them diff --git a/filter_non_conversion b/filter_non_conversion index ba5dabf..73d1f98 100755 --- a/filter_non_conversion +++ b/filter_non_conversion @@ -22,7 +22,7 @@ $|++; ## along with this program. If not, see . my $parent_dir = getcwd(); -my $filter_version = 'v0.24.1'; +my $filter_version = 'v0.24.2'; my ($global_single,$global_paired,$samtools_path,$threshold,$consecutive,$percentage_cutoff,$minimum_count) = process_commandline(); diff --git a/merge_arbitrary_coverage_files.py b/merge_arbitrary_coverage_files.py new file mode 100755 index 0000000..c879559 --- /dev/null +++ b/merge_arbitrary_coverage_files.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python + +import os +import glob +import time +import gzip +import argparse +import sys + +def merge_coverage_files(basename): + print(f"Merging Bismark coverage files into a file called '{basename}.cov.gz'") + cov_files = glob.glob("*.cov.gz") + + if not cov_files: + print("Error: No files ending in '.cov.gz' found in the current folder.") + sys.exit(1) + + allcov = {} # overarching dictionary + + for file in cov_files: + print(f"Reading methylation calls from file: {file}") + + isGzip = False + if file.endswith("gz"): + infile = gzip.open(file, 'rt') # mode is 'rt' for text mode + isGzip = True + else: + infile = open(file) + + for line in infile: + + if isGzip: + line = line.rstrip() # no need to decode if using 'rt' mode + else: + line = line.rstrip() + + chrom, pos, m, u = [line.split(sep="\t")[i] for i in (0, 1, 4, 5)] # list comprehension + + if chrom in allcov.keys(): + pass + else: + allcov[chrom] = {} + + pos = int(pos) + + if pos in allcov[chrom].keys(): + pass + else: + allcov[chrom][pos] = {} + allcov[chrom][pos]["meth"] = 0 + allcov[chrom][pos]["unmeth"] = 0 + + allcov[chrom][pos]["meth"] += int(m) + allcov[chrom][pos]["unmeth"] += int(u) + + infile.close() + + print("Now printing out a new, merged coverage file") + + with gzip.open(f"{basename}.cov.gz", "wt") as out: + for chrom in sorted(allcov.keys()): + for pos in sorted(allcov[chrom].keys()): + perc = '' + if (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth'] == 0): + print("Both methylated and unmethylated positions were 0. Exiting...") + sys.exit() + else: + perc = allcov[chrom][pos]['meth'] / (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth']) * 100 + + out.write(f"{chrom}\t{pos}\t{pos}\t{perc:.2f}\t{allcov[chrom][pos]['meth']}\t{allcov[chrom][pos]['unmeth']}\n") + + print(f"All done! The merged coverage file '{basename}.cov.gz' has been created.\n") + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Merge Bismark coverage files into a file called "basename.cov.gz".') + parser.add_argument('-b', '--basename', default='merged_coverage_file', help='The basename for the merged coverage file.') + args = parser.parse_args() + merge_coverage_files(args.basename) diff --git a/merge_coverage_files_ARGV.py b/merge_coverage_files_ARGV.py new file mode 100755 index 0000000..2accaa4 --- /dev/null +++ b/merge_coverage_files_ARGV.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python + +import os +import glob +import time +import gzip +import sys +import argparse + +print ("Merging Bismark coverage files given as command line arguments") +# cov_files = glob.glob("*.cov.gz") + +# if (options.verbose): +# print_options(options) + + +def parse_options(): + parser = argparse.ArgumentParser(description="Set base-name to write merged Bismark coverage files to") + parser.add_argument("-b","--basename", type=str, help="Basename of file to write merged coverage output to (default 'merged_coverage_file')", default="merged_coverage_file.cov.gz") + parser.add_argument("filenames", help="The Bismark coverage files to merge", nargs='+') + + options = parser.parse_args() + return options + +def main(): + + options = parse_options() + print (f"Using the following basename: {options.basename}") + cov_files = options.filenames + print (f"Meging the following files: {cov_files}") + allcov = {} # overarching dictionary + + for filename in cov_files: + print (f"Reading methylation calls from file: {filename}") + + isGzip = False + if filename.endswith("gz"): + infile = gzip.open(filename) # mode is 'rb' + isGzip = True + else: + infile = open(filename) + + for line in infile: + + if isGzip: + line = line.decode().rstrip() # need to decode from a Binary to Str object + else: + line = line.rstrip() + + # print(line) + # chrom, pos, m, u = line.split(sep="\t")[0,1,4,5] + + chrom, pos, m, u = [line.split(sep="\t")[i] for i in (0,1,4,5)] # list comprehension + + if chrom in allcov.keys(): + pass + # print (f"already present: {chrom}") + else: + allcov[chrom] = {} + + # print (f"Key not present yet for chromosome: {chrom}. Creating now") + # converting the position to int() right here + pos = int(pos) + + if pos in allcov[chrom].keys(): + # print (f"Positions was already present: chr: {chrom}, pos: {pos}") + pass + else: + allcov[chrom][pos] = {} + allcov[chrom][pos]["meth"] = 0 + allcov[chrom][pos]["unmeth"] = 0 + + allcov[chrom][pos]["meth"] += int(m) + allcov[chrom][pos]["unmeth"] += int(u) + + # print (f"Chrom: {chrom}") + # print (f"Chrom: {chrom}, Pos: {pos}, Meth: {m}, Unmeth: {u}") + # time.sleep(0.1) + + infile.close() + + + + + + # resetting + del chrom + del pos + outfile = f"{options.basename}.merged.cov.gz" + + print (f"Now printing out a new, merged coverage file to >>{outfile}<<") + + with gzip.open(outfile,"wt") as out: + # Just sorting by key will sort lexicographically, which is unintended. + # At least not for the positions + for chrom in sorted(allcov.keys()): + + # Option I: This is a solution using {} dictionary comprehension. Seems to work. + # print (f"Converting position keys to int first for chromosome {chrom}") + # allcov[chrom] = {int(k) : v for k, v in allcov[chrom].items()} + + # Option II: Another option could be to use a Lambda function to make the keys integers on the fly + # print (f"Attempting solution using a Lambda function on the positions for chrom:{chrom}") + # for pos in sorted(allcov[chrom].keys(), key = lambda x: int(x)): + # This also works + + # Option III: Since I changed the values going into the positions dictionary, sorted() + # will now automatically perform a numerical sort. So no further action is required. + print (f"Now sorting positions on chromosome: {chrom}") + for pos in sorted(allcov[chrom].keys()): + perc = '' + if (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth'] == 0): + # This should not happen in real data, as coverage files by definition only show covered positions + print ("Both methylated and unmethylated positions were 0. Dying now...") + sys.exit() + else: + perc = allcov[chrom][pos]['meth'] / (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth']) * 100 + + # percentage is displayed with 2 decimals with f-string formatting + out.write(f"{chrom}\t{pos}\t{pos}\t{perc:.2f}\t{allcov[chrom][pos]['meth']}\t{allcov[chrom][pos]['unmeth']}\n") + + print ("All done, enjoy the merged coverage file!\n") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/methylation_consistency b/methylation_consistency index e3bc08a..60fdff7 100755 --- a/methylation_consistency +++ b/methylation_consistency @@ -3,7 +3,7 @@ use warnings; use strict; use Getopt::Long; -my $VERSION = "0.24.1"; +my $VERSION = "0.24.2"; my ($min_count,$global_single,$global_paired,$lower_threshold,$upper_threshold,$samtools_path,$chh) = process_commandline(); if ($chh){