evolve repenrich to repenrich bowtie2 (#679)

* clone repenrich for repenrich2 * seed repenrich2 * Update repenrich2.xml * adjust bowtie to bowtie2 * Update RepEnrich.py * Update RepEnrich_setup.py * fix bowtie2 b_opt and update tests * polish help and rename scripts
ARTbio · Apr 20, 2024 · 73721d9 · 73721d9
1 parent df6b949
commit 73721d9
Show file tree

Hide file tree

Showing 29 changed files with 13,284 additions and 0 deletions.
diff --git a/tools/repenrich2/.shed.yml b/tools/repenrich2/.shed.yml
@@ -0,0 +1,13 @@
+# .shed.yml supporting automatic pushes.
+owner: artbio
+name: repenrich2
+description: Repeat element profiling using bowtie2 aligner
+long_description: |
+  RepEnrich is a method to estimate repetitive element enrichment
+  using high-throughput sequencing data
+categories:
+  - Transcriptomics
+homepage_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich
+remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2
+toolshed:
+  - toolshed
diff --git a/tools/repenrich2/RepEnrich2.py b/tools/repenrich2/RepEnrich2.py
@@ -0,0 +1,204 @@
+import argparse
+import csv
+import os
+import shlex
+import subprocess
+import sys
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
+
+
+parser = argparse.ArgumentParser(description='''
+             Repenrich aligns reads to Repeat Elements pseudogenomes\
+             and counts aligned reads. RepEnrich_setup must be run\
+             before its use''')
+parser.add_argument('--annotation_file', action='store',
+                    metavar='annotation_file',
+                    help='RepeatMasker.org annotation file for your\
+                          organism. The file may be downloaded from\
+                          RepeatMasker.org. E.g. hg19_repeatmasker.txt')
+parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
+                    help='Bam alignments of unique mapper reads.')
+parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
+                    help='File of fastq reads mapping to multiple\
+                          locations. Example: /data/multimap.fastq')
+parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
+                    metavar='fastqfile2', default='',
+                    help='fastqfile #2 when using paired-end option.\
+                          Default none')
+parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
+                    default="1", type=int,
+                    help='Number of CPUs. The more cpus the\
+                          faster RepEnrich performs. Default: "1"')
+
+args = parser.parse_args()
+
+# parameters
+annotation_file = args.annotation_file
+unique_mapper_bam = args.alignment_bam
+fastqfile_1 = args.fastqfile
+fastqfile_2 = args.fastqfile2
+cpus = args.cpus
+# Change if simple repeats are differently annotated in your organism
+simple_repeat = "Simple_repeat"
+if args.fastqfile2:
+    paired_end = True
+else:
+    paired_end = False
+
+# check that the programs we need are available
+try:
+    subprocess.call(shlex.split("coverageBed -h"),
+                    stdout=open(os.devnull, 'wb'),
+                    stderr=open(os.devnull, 'wb'))
+    subprocess.call(shlex.split("bowtie2 --version"),
+                    stdout=open(os.devnull, 'wb'),
+                    stderr=open(os.devnull, 'wb'))
+except OSError:
+    print("Error: Bowtie2 or bedtools not loaded")
+    raise
+
+
+def starts_with_numerical(list):
+    try:
+        if len(list) == 0:
+            return False
+        int(list[0])
+        return True
+    except ValueError:
+        return False
+
+
+# define a text importer for .out/.txt format of repbase
+def import_text(filename, separator):
+    csv.field_size_limit(sys.maxsize)
+    file = csv.reader(open(filename), delimiter=separator,
+                      skipinitialspace=True)
+    return [line for line in file if starts_with_numerical(line)]
+
+
+def run_bowtie(args):
+    metagenome, fastqfile = args
+    b_opt = "-k 1 -p 1 --quiet --no-hd"
+    command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}")
+    bowtie_align = subprocess.run(command, check=True,
+                                  capture_output=True, text=True).stdout
+    bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
+    readlist = [metagenome]
+    if paired_end:
+        for line in bowtie_align:
+            readlist.append(line.split("/")[0])
+    else:
+        for line in bowtie_align:
+            readlist.append(line.split("\t")[0])
+    return readlist
+
+
+# set a reference repeat list for the script
+repeat_list = [listline[9].translate(
+    str.maketrans(
+        '()/', '___')) for listline in import_text(annotation_file, ' ')]
+repeat_list = sorted(list(set(repeat_list)))
+
+# unique mapper counting
+cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \
+        bedtools coverage -b stdin -a  repnames.bed"
+p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n')
+
+# parse bedtools output
+counts = defaultdict(int)  # key: repeat names, value: unique mapper counts
+sumofrepeatreads = 0
+for line in bedtools_counts:
+    line = line.split('\t')
+    counts[line[3]] += int(line[4])
+    sumofrepeatreads += int(line[4])
+print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.")
+
+# multimapper parsing
+if not paired_end:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+else:
+    args_list = [(metagenome, fastqfile_1) for
+                 metagenome in repeat_list]
+    args_list.extend([(metagenome, fastqfile_2) for
+                     metagenome in repeat_list])
+    with ProcessPoolExecutor(max_workers=cpus) as executor:
+        results = executor.map(run_bowtie, args_list)
+
+# Aggregate results (avoiding race conditions)
+metagenome_reads = defaultdict(list)  # repeat_name: list of multimap reads
+for result in results:
+    metagenome_reads[result[0]] += result[1:]
+
+for name in metagenome_reads:
+    #  read are only once in list
+    metagenome_reads[name] = list(set(metagenome_reads[name]))
+    #  remove "no read" instances
+    metagenome_reads[name] = [read for read in metagenome_reads[name]
+                              if read != ""]
+
+# implement repeats_by_reads from the inverse dictionnary metagenome_reads
+repeats_by_reads = defaultdict(list)  # readids: list of repeats names
+for repname in metagenome_reads:
+    for read in metagenome_reads[repname]:
+        repeats_by_reads[read].append(repname)
+for repname in repeats_by_reads:
+    repeats_by_reads[repname] = list(set(repeats_by_reads[repname]))
+
+# 3 dictionnaries and 1 pointer variable to be populated
+fractionalcounts = defaultdict(float)
+familyfractionalcounts = defaultdict(float)
+classfractionalcounts = defaultdict(float)
+sumofrepeatreads = 0
+
+# Update counts dictionnary with sets of repeats (was "subfamilies")
+# matched by multimappers
+for repeat_set in repeats_by_reads.values():
+    repeat_set_string = ','.join(repeat_set)
+    counts[repeat_set_string] += 1
+    sumofrepeatreads += 1
+
+print(f'Identified more {sumofrepeatreads} mutimapper repeat reads')
+
+# Populate fractionalcounts
+for key, count in counts.items():
+    key_list = key.split(',')
+    for i in key_list:
+        fractionalcounts[i] += count / len(key_list)
+
+# build repeat_ref for easy access to rep class and rep families
+repeat_ref = defaultdict(dict)
+repeats = import_text(annotation_file, ' ')
+for repeat in repeats:
+    repeat_name = repeat[9].translate(str.maketrans('()/', '___'))
+    try:
+        repclass = repeat[10].split('/')[0]
+        repfamily = repeat[10].split('/')[1]
+    except IndexError:
+        repclass, repfamily = repeat[10], repeat[10]
+    repeat_ref[repeat_name]['class'] = repclass
+    repeat_ref[repeat_name]['family'] = repfamily
+
+# Populate classfractionalcounts and familyfractionalcounts
+for key, value in fractionalcounts.items():
+    classfractionalcounts[repeat_ref[key]['class']] += value
+    familyfractionalcounts[repeat_ref[key]['family']] += value
+
+# print class-, family- and fraction-repeats counts to files
+with open("class_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(classfractionalcounts):
+        fout.write(f"{key}\t{classfractionalcounts[key]}\n")
+
+with open("family_fraction_counts.tsv", 'w') as fout:
+    for key in sorted(familyfractionalcounts):
+        fout.write(f"{key}\t{familyfractionalcounts[key]}\n")
+
+with open("fraction_counts.tsv", 'w') as fout:
+    for key in sorted(fractionalcounts):
+        fout.write(f"{key}\t{repeat_ref[key]['class']}\t"
+                   f"{repeat_ref[key]['family']}\t"
+                   f"{fractionalcounts[key]}\n")
diff --git a/tools/repenrich2/RepEnrich2_setup.py b/tools/repenrich2/RepEnrich2_setup.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+import argparse
+import csv
+import os
+import shlex
+import subprocess
+import sys
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor
+
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+parser = argparse.ArgumentParser(description='''
+             Prepartion of repetive element pseudogenomes bowtie\
+             indexes and annotation files used by RepEnrich.py enrichment.''',
+                                 prog='getargs_genome_maker.py')
+parser.add_argument('--annotation_file', action='store',
+                    metavar='annotation_file',
+                    help='''Repeat masker annotation of the genome of\
+                         interest. Download from RepeatMasker.org\
+                         Example: mm9.fa.out''')
+parser.add_argument('--genomefasta', action='store', metavar='genomefasta',
+                    help='''Genome of interest in fasta format.\
+                         Example: mm9.fa''')
+parser.add_argument('--gaplength', action='store', dest='gaplength',
+                    metavar='gaplength', default='200', type=int,
+                    help='''Length of the N-spacer in the\
+                         repeat pseudogenomes.  Default 200''')
+parser.add_argument('--flankinglength', action='store', dest='flankinglength',
+                    metavar='flankinglength', default='25', type=int,
+                    help='''Length of the flanking regions used to build\
+                         repeat pseudogenomes. Flanking length should be set\
+                         according to the length of your reads.\
+                         Default 25, for 50 nt reads''')
+parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
+                    default="1", type=int,
+                    help='Number of CPUs. The more cpus the\
+                          faster RepEnrich performs. Default: "1"')
+args = parser.parse_args()
+
+# parameters from argsparse
+gapl = args.gaplength
+flankingl = args.flankinglength
+annotation_file = args.annotation_file
+genomefasta = args.genomefasta
+cpus = args.cpus
+
+# check that the programs we need are available
+try:
+    subprocess.call(shlex.split("bowtie2 --version"),
+                    stdout=open(os.devnull, 'wb'),
+                    stderr=open(os.devnull, 'wb'))
+except OSError:
+    print("Error: Bowtie2 not available in the path")
+    raise
+
+
+def starts_with_numerical(list):
+    try:
+        if len(list) == 0:
+            return False
+        int(list[0])
+        return True
+    except ValueError:
+        return False
+
+
+# define a text importer for .out/.txt format of repbase
+def import_text(filename, separator):
+    csv.field_size_limit(sys.maxsize)
+    file = csv.reader(open(filename), delimiter=separator,
+                      skipinitialspace=True)
+    return [line for line in file if starts_with_numerical(line)]
+
+
+# load genome into dictionary and compute length
+g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
+genome = defaultdict(dict)
+
+for chr in g.keys():
+    genome[chr]['sequence'] = g[chr].seq
+    genome[chr]['length'] = len(g[chr].seq)
+
+# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
+repeat_elements = set()
+rep_coords = defaultdict(list)  # Merged dictionary for coordinates
+
+with open('repnames.bed', 'w') as fout:
+    f_in = import_text(annotation_file, ' ')
+    for line in f_in:
+        repname = line[9].translate(str.maketrans('()/', '___'))
+        repeat_elements.add(repname)
+        repchr, repstart, repend = line[4], line[5], line[6]
+        fout.write(f"{repchr}\t{repstart}\t{repend}\t{repname}\n")
+        rep_coords[repname].extend([repchr, repstart, repend])
+# repeat_elements now contains the unique repeat names
+# rep_coords is a dictionary where keys are repeat names and values are lists
+# containing chromosome, start, and end coordinates for each repeat instance
+
+# sort repeat_elements and print them in repeatIDs.txt
+with open('repeatIDs.txt', 'w') as fout:
+    for i, repeat in enumerate(sorted(repeat_elements)):
+        fout.write('\t'.join([repeat, str(i)]) + '\n')
+
+# generate spacer for pseudogenomes
+spacer = ''.join(['N' for i in range(gapl)])
+
+# generate metagenomes and save them to FASTA files for bowtie build
+for repname in rep_coords:
+    metagenome = ''
+    # iterating coordinate list by block of 3 (chr, start, end)
+    block = 3
+    for i in range(0, len(rep_coords[repname]) - block + 1, block):
+        batch = rep_coords[repname][i:i+block]
+        chromosome = batch[0]
+        start = max(int(batch[1]) - flankingl, 0)
+        end = min(int(batch[2]) + flankingl,
+                  int(genome[chromosome]['length'])-1) + 1
+        metagenome = (
+            f"{metagenome}{spacer}"
+            f"{genome[chromosome]['sequence'][start:end]}"
+            )
+
+    # Create Fasta of repeat pseudogenome
+    fastafilename = f"{repname}.fa"
+    record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
+    SeqIO.write(record, fastafilename, "fasta")
+
+
+def bowtie_build(args):
+    """
+    Function to be executed in parallel by ProcessPoolExecutor.
+    """
+    try:
+        bowtie_base, fasta = args
+        command = shlex.split(f"bowtie2-build -f {fasta} {bowtie_base}")
+        squash = subprocess.run(command, capture_output=True, text=True)
+        return squash.stdout
+    except Exception as e:
+        return str(e)
+
+
+args_list = [(name, f"{name}.fa") for name in rep_coords]
+with ProcessPoolExecutor(max_workers=cpus) as executor:
+    executor.map(bowtie_build, args_list)