diff --git a/CGAT/IndexedFasta.py b/CGAT/IndexedFasta.py index 647cd7b0..33648f7a 100644 --- a/CGAT/IndexedFasta.py +++ b/CGAT/IndexedFasta.py @@ -1051,7 +1051,6 @@ def getSequence(self, last_pos, lsequence - first_pos sequence = self.mDatabaseFile.fetch(contig, first_pos, last_pos) - if str(strand) in ("-", "0", "-1"): try: # works in py2 only diff --git a/CGAT/scripts/bam_vs_bed.py b/CGAT/scripts/bam_vs_bed.py index ce4d2e43..1b1753e7 100644 --- a/CGAT/scripts/bam_vs_bed.py +++ b/CGAT/scripts/bam_vs_bed.py @@ -1,5 +1,4 @@ -''' -bam_vs_bed.py - count context that reads map to +'''bam_vs_bed.py - count context that reads map to ====================================================== :Tags: Genomics NGS Intervals BAM BED Counting @@ -17,6 +16,19 @@ in the :term:`bed` file can be overlapping - they are counted independently. +Note that duplicate intervals will be counted multiple times. This +situation can easily arise when building a set of genomic annotations +based on a geneset with alternative transcripts. For example:: + + chr1 10000 20000 protein_coding # gene1, transrcipt1 + chr1 10000 20000 protein_coding # gene1, transcript2 + +Any reads overlapping the interval chr1:10000-20000 will be counted +twice into the protein_coding bin by bedtools. To avoid this, remove any +duplicates from the :term:`bed` file:: + + zcat input_with_duplicates.bed.gz | cgat bed2bed --merge-by-name | bgzip > input_without_duplicates.bed.gz + This scripts requires bedtools_ to be installed. Options