Skip to content

Commit

Permalink
evolve repenrich to repenrich bowtie2 (#679)
Browse files Browse the repository at this point in the history
* clone repenrich for repenrich2

* seed repenrich2

* Update repenrich2.xml

* adjust bowtie to bowtie2

* Update RepEnrich.py

* Update RepEnrich_setup.py

* fix bowtie2 b_opt and update tests

* polish help and rename scripts
  • Loading branch information
drosofff authored Apr 20, 2024
1 parent df6b949 commit 73721d9
Show file tree
Hide file tree
Showing 29 changed files with 13,284 additions and 0 deletions.
13 changes: 13 additions & 0 deletions tools/repenrich2/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# .shed.yml supporting automatic pushes.
owner: artbio
name: repenrich2
description: Repeat element profiling using bowtie2 aligner
long_description: |
RepEnrich is a method to estimate repetitive element enrichment
using high-throughput sequencing data
categories:
- Transcriptomics
homepage_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich
remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2
toolshed:
- toolshed
204 changes: 204 additions & 0 deletions tools/repenrich2/RepEnrich2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import argparse
import csv
import os
import shlex
import subprocess
import sys
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor


parser = argparse.ArgumentParser(description='''
Repenrich aligns reads to Repeat Elements pseudogenomes\
and counts aligned reads. RepEnrich_setup must be run\
before its use''')
parser.add_argument('--annotation_file', action='store',
metavar='annotation_file',
help='RepeatMasker.org annotation file for your\
organism. The file may be downloaded from\
RepeatMasker.org. E.g. hg19_repeatmasker.txt')
parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam',
help='Bam alignments of unique mapper reads.')
parser.add_argument('--fastqfile', action='store', metavar='fastqfile',
help='File of fastq reads mapping to multiple\
locations. Example: /data/multimap.fastq')
parser.add_argument('--fastqfile2', action='store', dest='fastqfile2',
metavar='fastqfile2', default='',
help='fastqfile #2 when using paired-end option.\
Default none')
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
default="1", type=int,
help='Number of CPUs. The more cpus the\
faster RepEnrich performs. Default: "1"')

args = parser.parse_args()

# parameters
annotation_file = args.annotation_file
unique_mapper_bam = args.alignment_bam
fastqfile_1 = args.fastqfile
fastqfile_2 = args.fastqfile2
cpus = args.cpus
# Change if simple repeats are differently annotated in your organism
simple_repeat = "Simple_repeat"
if args.fastqfile2:
paired_end = True
else:
paired_end = False

# check that the programs we need are available
try:
subprocess.call(shlex.split("coverageBed -h"),
stdout=open(os.devnull, 'wb'),
stderr=open(os.devnull, 'wb'))
subprocess.call(shlex.split("bowtie2 --version"),
stdout=open(os.devnull, 'wb'),
stderr=open(os.devnull, 'wb'))
except OSError:
print("Error: Bowtie2 or bedtools not loaded")
raise


def starts_with_numerical(list):
try:
if len(list) == 0:
return False
int(list[0])
return True
except ValueError:
return False


# define a text importer for .out/.txt format of repbase
def import_text(filename, separator):
csv.field_size_limit(sys.maxsize)
file = csv.reader(open(filename), delimiter=separator,
skipinitialspace=True)
return [line for line in file if starts_with_numerical(line)]


def run_bowtie(args):
metagenome, fastqfile = args
b_opt = "-k 1 -p 1 --quiet --no-hd"
command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}")
bowtie_align = subprocess.run(command, check=True,
capture_output=True, text=True).stdout
bowtie_align = bowtie_align.rstrip('\r\n').split('\n')
readlist = [metagenome]
if paired_end:
for line in bowtie_align:
readlist.append(line.split("/")[0])
else:
for line in bowtie_align:
readlist.append(line.split("\t")[0])
return readlist


# set a reference repeat list for the script
repeat_list = [listline[9].translate(
str.maketrans(
'()/', '___')) for listline in import_text(annotation_file, ' ')]
repeat_list = sorted(list(set(repeat_list)))

# unique mapper counting
cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \
bedtools coverage -b stdin -a repnames.bed"
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n')

# parse bedtools output
counts = defaultdict(int) # key: repeat names, value: unique mapper counts
sumofrepeatreads = 0
for line in bedtools_counts:
line = line.split('\t')
counts[line[3]] += int(line[4])
sumofrepeatreads += int(line[4])
print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.")

# multimapper parsing
if not paired_end:
args_list = [(metagenome, fastqfile_1) for
metagenome in repeat_list]
with ProcessPoolExecutor(max_workers=cpus) as executor:
results = executor.map(run_bowtie, args_list)
else:
args_list = [(metagenome, fastqfile_1) for
metagenome in repeat_list]
args_list.extend([(metagenome, fastqfile_2) for
metagenome in repeat_list])
with ProcessPoolExecutor(max_workers=cpus) as executor:
results = executor.map(run_bowtie, args_list)

# Aggregate results (avoiding race conditions)
metagenome_reads = defaultdict(list) # repeat_name: list of multimap reads
for result in results:
metagenome_reads[result[0]] += result[1:]

for name in metagenome_reads:
# read are only once in list
metagenome_reads[name] = list(set(metagenome_reads[name]))
# remove "no read" instances
metagenome_reads[name] = [read for read in metagenome_reads[name]
if read != ""]

# implement repeats_by_reads from the inverse dictionnary metagenome_reads
repeats_by_reads = defaultdict(list) # readids: list of repeats names
for repname in metagenome_reads:
for read in metagenome_reads[repname]:
repeats_by_reads[read].append(repname)
for repname in repeats_by_reads:
repeats_by_reads[repname] = list(set(repeats_by_reads[repname]))

# 3 dictionnaries and 1 pointer variable to be populated
fractionalcounts = defaultdict(float)
familyfractionalcounts = defaultdict(float)
classfractionalcounts = defaultdict(float)
sumofrepeatreads = 0

# Update counts dictionnary with sets of repeats (was "subfamilies")
# matched by multimappers
for repeat_set in repeats_by_reads.values():
repeat_set_string = ','.join(repeat_set)
counts[repeat_set_string] += 1
sumofrepeatreads += 1

print(f'Identified more {sumofrepeatreads} mutimapper repeat reads')

# Populate fractionalcounts
for key, count in counts.items():
key_list = key.split(',')
for i in key_list:
fractionalcounts[i] += count / len(key_list)

# build repeat_ref for easy access to rep class and rep families
repeat_ref = defaultdict(dict)
repeats = import_text(annotation_file, ' ')
for repeat in repeats:
repeat_name = repeat[9].translate(str.maketrans('()/', '___'))
try:
repclass = repeat[10].split('/')[0]
repfamily = repeat[10].split('/')[1]
except IndexError:
repclass, repfamily = repeat[10], repeat[10]
repeat_ref[repeat_name]['class'] = repclass
repeat_ref[repeat_name]['family'] = repfamily

# Populate classfractionalcounts and familyfractionalcounts
for key, value in fractionalcounts.items():
classfractionalcounts[repeat_ref[key]['class']] += value
familyfractionalcounts[repeat_ref[key]['family']] += value

# print class-, family- and fraction-repeats counts to files
with open("class_fraction_counts.tsv", 'w') as fout:
for key in sorted(classfractionalcounts):
fout.write(f"{key}\t{classfractionalcounts[key]}\n")

with open("family_fraction_counts.tsv", 'w') as fout:
for key in sorted(familyfractionalcounts):
fout.write(f"{key}\t{familyfractionalcounts[key]}\n")

with open("fraction_counts.tsv", 'w') as fout:
for key in sorted(fractionalcounts):
fout.write(f"{key}\t{repeat_ref[key]['class']}\t"
f"{repeat_ref[key]['family']}\t"
f"{fractionalcounts[key]}\n")
148 changes: 148 additions & 0 deletions tools/repenrich2/RepEnrich2_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env python
import argparse
import csv
import os
import shlex
import subprocess
import sys
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor


from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

parser = argparse.ArgumentParser(description='''
Prepartion of repetive element pseudogenomes bowtie\
indexes and annotation files used by RepEnrich.py enrichment.''',
prog='getargs_genome_maker.py')
parser.add_argument('--annotation_file', action='store',
metavar='annotation_file',
help='''Repeat masker annotation of the genome of\
interest. Download from RepeatMasker.org\
Example: mm9.fa.out''')
parser.add_argument('--genomefasta', action='store', metavar='genomefasta',
help='''Genome of interest in fasta format.\
Example: mm9.fa''')
parser.add_argument('--gaplength', action='store', dest='gaplength',
metavar='gaplength', default='200', type=int,
help='''Length of the N-spacer in the\
repeat pseudogenomes. Default 200''')
parser.add_argument('--flankinglength', action='store', dest='flankinglength',
metavar='flankinglength', default='25', type=int,
help='''Length of the flanking regions used to build\
repeat pseudogenomes. Flanking length should be set\
according to the length of your reads.\
Default 25, for 50 nt reads''')
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus',
default="1", type=int,
help='Number of CPUs. The more cpus the\
faster RepEnrich performs. Default: "1"')
args = parser.parse_args()

# parameters from argsparse
gapl = args.gaplength
flankingl = args.flankinglength
annotation_file = args.annotation_file
genomefasta = args.genomefasta
cpus = args.cpus

# check that the programs we need are available
try:
subprocess.call(shlex.split("bowtie2 --version"),
stdout=open(os.devnull, 'wb'),
stderr=open(os.devnull, 'wb'))
except OSError:
print("Error: Bowtie2 not available in the path")
raise


def starts_with_numerical(list):
try:
if len(list) == 0:
return False
int(list[0])
return True
except ValueError:
return False


# define a text importer for .out/.txt format of repbase
def import_text(filename, separator):
csv.field_size_limit(sys.maxsize)
file = csv.reader(open(filename), delimiter=separator,
skipinitialspace=True)
return [line for line in file if starts_with_numerical(line)]


# load genome into dictionary and compute length
g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta"))
genome = defaultdict(dict)

for chr in g.keys():
genome[chr]['sequence'] = g[chr].seq
genome[chr]['length'] = len(g[chr].seq)

# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter
repeat_elements = set()
rep_coords = defaultdict(list) # Merged dictionary for coordinates

with open('repnames.bed', 'w') as fout:
f_in = import_text(annotation_file, ' ')
for line in f_in:
repname = line[9].translate(str.maketrans('()/', '___'))
repeat_elements.add(repname)
repchr, repstart, repend = line[4], line[5], line[6]
fout.write(f"{repchr}\t{repstart}\t{repend}\t{repname}\n")
rep_coords[repname].extend([repchr, repstart, repend])
# repeat_elements now contains the unique repeat names
# rep_coords is a dictionary where keys are repeat names and values are lists
# containing chromosome, start, and end coordinates for each repeat instance

# sort repeat_elements and print them in repeatIDs.txt
with open('repeatIDs.txt', 'w') as fout:
for i, repeat in enumerate(sorted(repeat_elements)):
fout.write('\t'.join([repeat, str(i)]) + '\n')

# generate spacer for pseudogenomes
spacer = ''.join(['N' for i in range(gapl)])

# generate metagenomes and save them to FASTA files for bowtie build
for repname in rep_coords:
metagenome = ''
# iterating coordinate list by block of 3 (chr, start, end)
block = 3
for i in range(0, len(rep_coords[repname]) - block + 1, block):
batch = rep_coords[repname][i:i+block]
chromosome = batch[0]
start = max(int(batch[1]) - flankingl, 0)
end = min(int(batch[2]) + flankingl,
int(genome[chromosome]['length'])-1) + 1
metagenome = (
f"{metagenome}{spacer}"
f"{genome[chromosome]['sequence'][start:end]}"
)

# Create Fasta of repeat pseudogenome
fastafilename = f"{repname}.fa"
record = SeqRecord(Seq(metagenome), id=repname, name='', description='')
SeqIO.write(record, fastafilename, "fasta")


def bowtie_build(args):
"""
Function to be executed in parallel by ProcessPoolExecutor.
"""
try:
bowtie_base, fasta = args
command = shlex.split(f"bowtie2-build -f {fasta} {bowtie_base}")
squash = subprocess.run(command, capture_output=True, text=True)
return squash.stdout
except Exception as e:
return str(e)


args_list = [(name, f"{name}.fa") for name in rep_coords]
with ProcessPoolExecutor(max_workers=cpus) as executor:
executor.map(bowtie_build, args_list)
Loading

0 comments on commit 73721d9

Please sign in to comment.