-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
evolve repenrich to repenrich bowtie2 (#679)
* clone repenrich for repenrich2 * seed repenrich2 * Update repenrich2.xml * adjust bowtie to bowtie2 * Update RepEnrich.py * Update RepEnrich_setup.py * fix bowtie2 b_opt and update tests * polish help and rename scripts
- Loading branch information
Showing
29 changed files
with
13,284 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# .shed.yml supporting automatic pushes. | ||
owner: artbio | ||
name: repenrich2 | ||
description: Repeat element profiling using bowtie2 aligner | ||
long_description: | | ||
RepEnrich is a method to estimate repetitive element enrichment | ||
using high-throughput sequencing data | ||
categories: | ||
- Transcriptomics | ||
homepage_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich | ||
remote_repository_url: https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich2 | ||
toolshed: | ||
- toolshed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
import argparse | ||
import csv | ||
import os | ||
import shlex | ||
import subprocess | ||
import sys | ||
from collections import defaultdict | ||
from concurrent.futures import ProcessPoolExecutor | ||
|
||
|
||
parser = argparse.ArgumentParser(description=''' | ||
Repenrich aligns reads to Repeat Elements pseudogenomes\ | ||
and counts aligned reads. RepEnrich_setup must be run\ | ||
before its use''') | ||
parser.add_argument('--annotation_file', action='store', | ||
metavar='annotation_file', | ||
help='RepeatMasker.org annotation file for your\ | ||
organism. The file may be downloaded from\ | ||
RepeatMasker.org. E.g. hg19_repeatmasker.txt') | ||
parser.add_argument('--alignment_bam', action='store', metavar='alignment_bam', | ||
help='Bam alignments of unique mapper reads.') | ||
parser.add_argument('--fastqfile', action='store', metavar='fastqfile', | ||
help='File of fastq reads mapping to multiple\ | ||
locations. Example: /data/multimap.fastq') | ||
parser.add_argument('--fastqfile2', action='store', dest='fastqfile2', | ||
metavar='fastqfile2', default='', | ||
help='fastqfile #2 when using paired-end option.\ | ||
Default none') | ||
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', | ||
default="1", type=int, | ||
help='Number of CPUs. The more cpus the\ | ||
faster RepEnrich performs. Default: "1"') | ||
|
||
args = parser.parse_args() | ||
|
||
# parameters | ||
annotation_file = args.annotation_file | ||
unique_mapper_bam = args.alignment_bam | ||
fastqfile_1 = args.fastqfile | ||
fastqfile_2 = args.fastqfile2 | ||
cpus = args.cpus | ||
# Change if simple repeats are differently annotated in your organism | ||
simple_repeat = "Simple_repeat" | ||
if args.fastqfile2: | ||
paired_end = True | ||
else: | ||
paired_end = False | ||
|
||
# check that the programs we need are available | ||
try: | ||
subprocess.call(shlex.split("coverageBed -h"), | ||
stdout=open(os.devnull, 'wb'), | ||
stderr=open(os.devnull, 'wb')) | ||
subprocess.call(shlex.split("bowtie2 --version"), | ||
stdout=open(os.devnull, 'wb'), | ||
stderr=open(os.devnull, 'wb')) | ||
except OSError: | ||
print("Error: Bowtie2 or bedtools not loaded") | ||
raise | ||
|
||
|
||
def starts_with_numerical(list): | ||
try: | ||
if len(list) == 0: | ||
return False | ||
int(list[0]) | ||
return True | ||
except ValueError: | ||
return False | ||
|
||
|
||
# define a text importer for .out/.txt format of repbase | ||
def import_text(filename, separator): | ||
csv.field_size_limit(sys.maxsize) | ||
file = csv.reader(open(filename), delimiter=separator, | ||
skipinitialspace=True) | ||
return [line for line in file if starts_with_numerical(line)] | ||
|
||
|
||
def run_bowtie(args): | ||
metagenome, fastqfile = args | ||
b_opt = "-k 1 -p 1 --quiet --no-hd" | ||
command = shlex.split(f"bowtie2 {b_opt} -x {metagenome} {fastqfile}") | ||
bowtie_align = subprocess.run(command, check=True, | ||
capture_output=True, text=True).stdout | ||
bowtie_align = bowtie_align.rstrip('\r\n').split('\n') | ||
readlist = [metagenome] | ||
if paired_end: | ||
for line in bowtie_align: | ||
readlist.append(line.split("/")[0]) | ||
else: | ||
for line in bowtie_align: | ||
readlist.append(line.split("\t")[0]) | ||
return readlist | ||
|
||
|
||
# set a reference repeat list for the script | ||
repeat_list = [listline[9].translate( | ||
str.maketrans( | ||
'()/', '___')) for listline in import_text(annotation_file, ' ')] | ||
repeat_list = sorted(list(set(repeat_list))) | ||
|
||
# unique mapper counting | ||
cmd = f"bedtools bamtobed -i {unique_mapper_bam} | \ | ||
bedtools coverage -b stdin -a repnames.bed" | ||
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) | ||
bedtools_counts = p.communicate()[0].decode().rstrip('\r\n').split('\n') | ||
|
||
# parse bedtools output | ||
counts = defaultdict(int) # key: repeat names, value: unique mapper counts | ||
sumofrepeatreads = 0 | ||
for line in bedtools_counts: | ||
line = line.split('\t') | ||
counts[line[3]] += int(line[4]) | ||
sumofrepeatreads += int(line[4]) | ||
print(f"Identified {sumofrepeatreads} unique reads that mapped to repeats.") | ||
|
||
# multimapper parsing | ||
if not paired_end: | ||
args_list = [(metagenome, fastqfile_1) for | ||
metagenome in repeat_list] | ||
with ProcessPoolExecutor(max_workers=cpus) as executor: | ||
results = executor.map(run_bowtie, args_list) | ||
else: | ||
args_list = [(metagenome, fastqfile_1) for | ||
metagenome in repeat_list] | ||
args_list.extend([(metagenome, fastqfile_2) for | ||
metagenome in repeat_list]) | ||
with ProcessPoolExecutor(max_workers=cpus) as executor: | ||
results = executor.map(run_bowtie, args_list) | ||
|
||
# Aggregate results (avoiding race conditions) | ||
metagenome_reads = defaultdict(list) # repeat_name: list of multimap reads | ||
for result in results: | ||
metagenome_reads[result[0]] += result[1:] | ||
|
||
for name in metagenome_reads: | ||
# read are only once in list | ||
metagenome_reads[name] = list(set(metagenome_reads[name])) | ||
# remove "no read" instances | ||
metagenome_reads[name] = [read for read in metagenome_reads[name] | ||
if read != ""] | ||
|
||
# implement repeats_by_reads from the inverse dictionnary metagenome_reads | ||
repeats_by_reads = defaultdict(list) # readids: list of repeats names | ||
for repname in metagenome_reads: | ||
for read in metagenome_reads[repname]: | ||
repeats_by_reads[read].append(repname) | ||
for repname in repeats_by_reads: | ||
repeats_by_reads[repname] = list(set(repeats_by_reads[repname])) | ||
|
||
# 3 dictionnaries and 1 pointer variable to be populated | ||
fractionalcounts = defaultdict(float) | ||
familyfractionalcounts = defaultdict(float) | ||
classfractionalcounts = defaultdict(float) | ||
sumofrepeatreads = 0 | ||
|
||
# Update counts dictionnary with sets of repeats (was "subfamilies") | ||
# matched by multimappers | ||
for repeat_set in repeats_by_reads.values(): | ||
repeat_set_string = ','.join(repeat_set) | ||
counts[repeat_set_string] += 1 | ||
sumofrepeatreads += 1 | ||
|
||
print(f'Identified more {sumofrepeatreads} mutimapper repeat reads') | ||
|
||
# Populate fractionalcounts | ||
for key, count in counts.items(): | ||
key_list = key.split(',') | ||
for i in key_list: | ||
fractionalcounts[i] += count / len(key_list) | ||
|
||
# build repeat_ref for easy access to rep class and rep families | ||
repeat_ref = defaultdict(dict) | ||
repeats = import_text(annotation_file, ' ') | ||
for repeat in repeats: | ||
repeat_name = repeat[9].translate(str.maketrans('()/', '___')) | ||
try: | ||
repclass = repeat[10].split('/')[0] | ||
repfamily = repeat[10].split('/')[1] | ||
except IndexError: | ||
repclass, repfamily = repeat[10], repeat[10] | ||
repeat_ref[repeat_name]['class'] = repclass | ||
repeat_ref[repeat_name]['family'] = repfamily | ||
|
||
# Populate classfractionalcounts and familyfractionalcounts | ||
for key, value in fractionalcounts.items(): | ||
classfractionalcounts[repeat_ref[key]['class']] += value | ||
familyfractionalcounts[repeat_ref[key]['family']] += value | ||
|
||
# print class-, family- and fraction-repeats counts to files | ||
with open("class_fraction_counts.tsv", 'w') as fout: | ||
for key in sorted(classfractionalcounts): | ||
fout.write(f"{key}\t{classfractionalcounts[key]}\n") | ||
|
||
with open("family_fraction_counts.tsv", 'w') as fout: | ||
for key in sorted(familyfractionalcounts): | ||
fout.write(f"{key}\t{familyfractionalcounts[key]}\n") | ||
|
||
with open("fraction_counts.tsv", 'w') as fout: | ||
for key in sorted(fractionalcounts): | ||
fout.write(f"{key}\t{repeat_ref[key]['class']}\t" | ||
f"{repeat_ref[key]['family']}\t" | ||
f"{fractionalcounts[key]}\n") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
#!/usr/bin/env python | ||
import argparse | ||
import csv | ||
import os | ||
import shlex | ||
import subprocess | ||
import sys | ||
from collections import defaultdict | ||
from concurrent.futures import ProcessPoolExecutor | ||
|
||
|
||
from Bio import SeqIO | ||
from Bio.Seq import Seq | ||
from Bio.SeqRecord import SeqRecord | ||
|
||
parser = argparse.ArgumentParser(description=''' | ||
Prepartion of repetive element pseudogenomes bowtie\ | ||
indexes and annotation files used by RepEnrich.py enrichment.''', | ||
prog='getargs_genome_maker.py') | ||
parser.add_argument('--annotation_file', action='store', | ||
metavar='annotation_file', | ||
help='''Repeat masker annotation of the genome of\ | ||
interest. Download from RepeatMasker.org\ | ||
Example: mm9.fa.out''') | ||
parser.add_argument('--genomefasta', action='store', metavar='genomefasta', | ||
help='''Genome of interest in fasta format.\ | ||
Example: mm9.fa''') | ||
parser.add_argument('--gaplength', action='store', dest='gaplength', | ||
metavar='gaplength', default='200', type=int, | ||
help='''Length of the N-spacer in the\ | ||
repeat pseudogenomes. Default 200''') | ||
parser.add_argument('--flankinglength', action='store', dest='flankinglength', | ||
metavar='flankinglength', default='25', type=int, | ||
help='''Length of the flanking regions used to build\ | ||
repeat pseudogenomes. Flanking length should be set\ | ||
according to the length of your reads.\ | ||
Default 25, for 50 nt reads''') | ||
parser.add_argument('--cpus', action='store', dest='cpus', metavar='cpus', | ||
default="1", type=int, | ||
help='Number of CPUs. The more cpus the\ | ||
faster RepEnrich performs. Default: "1"') | ||
args = parser.parse_args() | ||
|
||
# parameters from argsparse | ||
gapl = args.gaplength | ||
flankingl = args.flankinglength | ||
annotation_file = args.annotation_file | ||
genomefasta = args.genomefasta | ||
cpus = args.cpus | ||
|
||
# check that the programs we need are available | ||
try: | ||
subprocess.call(shlex.split("bowtie2 --version"), | ||
stdout=open(os.devnull, 'wb'), | ||
stderr=open(os.devnull, 'wb')) | ||
except OSError: | ||
print("Error: Bowtie2 not available in the path") | ||
raise | ||
|
||
|
||
def starts_with_numerical(list): | ||
try: | ||
if len(list) == 0: | ||
return False | ||
int(list[0]) | ||
return True | ||
except ValueError: | ||
return False | ||
|
||
|
||
# define a text importer for .out/.txt format of repbase | ||
def import_text(filename, separator): | ||
csv.field_size_limit(sys.maxsize) | ||
file = csv.reader(open(filename), delimiter=separator, | ||
skipinitialspace=True) | ||
return [line for line in file if starts_with_numerical(line)] | ||
|
||
|
||
# load genome into dictionary and compute length | ||
g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) | ||
genome = defaultdict(dict) | ||
|
||
for chr in g.keys(): | ||
genome[chr]['sequence'] = g[chr].seq | ||
genome[chr]['length'] = len(g[chr].seq) | ||
|
||
# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter | ||
repeat_elements = set() | ||
rep_coords = defaultdict(list) # Merged dictionary for coordinates | ||
|
||
with open('repnames.bed', 'w') as fout: | ||
f_in = import_text(annotation_file, ' ') | ||
for line in f_in: | ||
repname = line[9].translate(str.maketrans('()/', '___')) | ||
repeat_elements.add(repname) | ||
repchr, repstart, repend = line[4], line[5], line[6] | ||
fout.write(f"{repchr}\t{repstart}\t{repend}\t{repname}\n") | ||
rep_coords[repname].extend([repchr, repstart, repend]) | ||
# repeat_elements now contains the unique repeat names | ||
# rep_coords is a dictionary where keys are repeat names and values are lists | ||
# containing chromosome, start, and end coordinates for each repeat instance | ||
|
||
# sort repeat_elements and print them in repeatIDs.txt | ||
with open('repeatIDs.txt', 'w') as fout: | ||
for i, repeat in enumerate(sorted(repeat_elements)): | ||
fout.write('\t'.join([repeat, str(i)]) + '\n') | ||
|
||
# generate spacer for pseudogenomes | ||
spacer = ''.join(['N' for i in range(gapl)]) | ||
|
||
# generate metagenomes and save them to FASTA files for bowtie build | ||
for repname in rep_coords: | ||
metagenome = '' | ||
# iterating coordinate list by block of 3 (chr, start, end) | ||
block = 3 | ||
for i in range(0, len(rep_coords[repname]) - block + 1, block): | ||
batch = rep_coords[repname][i:i+block] | ||
chromosome = batch[0] | ||
start = max(int(batch[1]) - flankingl, 0) | ||
end = min(int(batch[2]) + flankingl, | ||
int(genome[chromosome]['length'])-1) + 1 | ||
metagenome = ( | ||
f"{metagenome}{spacer}" | ||
f"{genome[chromosome]['sequence'][start:end]}" | ||
) | ||
|
||
# Create Fasta of repeat pseudogenome | ||
fastafilename = f"{repname}.fa" | ||
record = SeqRecord(Seq(metagenome), id=repname, name='', description='') | ||
SeqIO.write(record, fastafilename, "fasta") | ||
|
||
|
||
def bowtie_build(args): | ||
""" | ||
Function to be executed in parallel by ProcessPoolExecutor. | ||
""" | ||
try: | ||
bowtie_base, fasta = args | ||
command = shlex.split(f"bowtie2-build -f {fasta} {bowtie_base}") | ||
squash = subprocess.run(command, capture_output=True, text=True) | ||
return squash.stdout | ||
except Exception as e: | ||
return str(e) | ||
|
||
|
||
args_list = [(name, f"{name}.fa") for name in rep_coords] | ||
with ProcessPoolExecutor(max_workers=cpus) as executor: | ||
executor.map(bowtie_build, args_list) |
Oops, something went wrong.