diff --git a/mimseq/mimseq.py b/mimseq/mimseq.py index fed7d8f..38470f0 100755 --- a/mimseq/mimseq.py +++ b/mimseq/mimseq.py @@ -1,13 +1,13 @@ #! /usr/bin/env python3 # +-------------+ -# | mim-tRNAseq | +# | mim-tRNAseq | # +-------------+ #################################### # Main backbone and wrapper script # #################################### -# +# # author: Drew Behrens # contact: aberens@biochem.mpg.de # github: https://github.com/nedialkova-lab/mim-tRNAseq @@ -18,7 +18,7 @@ from .tRNAmap import mainAlign from .getCoverage import getCoverage, plotCoverage from .mmQuant import generateModsTable, plotCCA -from .ssAlign import structureParser, modContext +from .ssAlign import structureParser, modContext from .splitClusters import splitIsodecoder, unsplitClustersCov, getIsodecoderSizes, getDeconvSizes, writeDeconvTranscripts, writeIsodecoderTranscripts, writeSplitInfo, writeIsodecoderInfo import sys, os, subprocess, logging, datetime, copy import argparse @@ -49,8 +49,8 @@ def restrictedFloat2(x): raise argparse.ArgumentTypeError('{} not a real number'.format(x)) def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, posttrans, control_cond, threads, max_multi, snp_tolerance, \ - keep_temp, cca, double_cca, min_cov, mismatches, remap, remap_mismatches, misinc_thresh, mito_trnas, pretrnas, local_mod, p_adj, sample_data): - + keep_temp, cca, double_cca, min_cov, mismatches, remap, remap_mismatches, misinc_thresh, mito_trnas, plastid_trnas, pretrnas, local_mod, p_adj, sample_data): + # Main wrapper # Integrity check for output folder argument... try: @@ -98,7 +98,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po modifications = os.path.dirname(os.path.realpath(__file__)) modifications += "/modifications" coverage_bed, snp_tolerance, mismatch_dict, insert_dict, del_dict, mod_lists, Inosine_lists, Inosine_clusters, tRNA_dict, cluster_dict, cluster_perPos_mismatchMembers \ - = modsToSNPIndex(trnas, trnaout, mito_trnas, modifications, name, out, double_cca, threads, snp_tolerance, cluster, cluster_id, posttrans, pretrnas, local_mod) + = modsToSNPIndex(trnas, trnaout, mito_trnas, plastid_trnas, modifications, name, out, double_cca, threads, snp_tolerance, cluster, cluster_id, posttrans, pretrnas, local_mod) structureParser() # Generate GSNAP indices genome_index_path, genome_index_name, snp_index_path, snp_index_name = generateGSNAPIndices(species, name, out, map_round, snp_tolerance, cluster) @@ -139,7 +139,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po #else: # log.info("\n*** New modifications not discovered as remap is not enabled ***\n") - # redo checks for unsplit isodecoders based on coverage + # redo checks for unsplit isodecoders based on coverage # use original splitBool and unique_isodecoderMMs in case coverage changes in 2nd alignment round, regenerate splitBool_new and unique_isodecoderMMs_new # rewrite deconv transcripts if map_round == 2 and cluster and cluster_id != 1: @@ -166,7 +166,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po mod_sites, cons_pos_list = modContext(out, unsplitCluster_lookup) script_path = os.path.dirname(os.path.realpath(__file__)) - + if snp_tolerance or not mismatches == 0.0: # plot mods and stops, catch exception with command call and print log error if many clusters are filtered (known to cause issues with R code handling mods table) log.info("Plotting modification and RT stop data...") @@ -217,25 +217,25 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po def main(): - ################### + ################### # Parse arguments # - ################### - + ################### + parser = argparse.ArgumentParser(description = 'Custom high-throughput tRNA sequencing alignment and quantification pipeline\ based on modification induced misincorporation cDNA synthesis.', add_help = True, usage = "%(prog)s [options] sample data") inputs = parser.add_argument_group("Input files") - inputs.add_argument('-s','--species', metavar='species', required = not ('-t' in sys.argv), dest = 'species', help = \ + inputs.add_argument('-s','--species', metavar='species', required = not ('-t' in sys.argv or '--trnas' in sys.argv), dest = 'species', help = \ 'Species being analyzed for which to load pre-packaged data files (prioritized over -t, -o and -m). Options are: Hsap, Hsap19, Mmus, Rnor, Scer, Spom, Dmel, Drer, Ecol, Atha', \ choices = ['Hsap', 'Hsap19','Ggor','Mmus','Rnor','Scer', 'Spom','Dmel', 'Drer', 'Ecol', 'Atha', 'HsapTCC', 'ScerMut']) inputs.add_argument('-t', '--trnas', metavar='genomic tRNAs', required = False, dest = 'trnas', help = \ 'Genomic tRNA fasta file, e.g. from gtRNAdb or tRNAscan-SE. Already avalable in data folder for a few model organisms.') - inputs.add_argument('-o', '--trnaout', metavar = 'tRNA out file', required = (not '--species' or '-s' in sys.argv) or ('-t' in sys.argv), + inputs.add_argument('-o', '--trnaout', metavar = 'tRNA out file', required = (not '--species' or '-s' in sys.argv) or ('-t' in sys.argv), dest = 'trnaout', help = 'tRNA.out file generated by tRNAscan-SE (also may be available on gtRNAdb). Contains information about tRNA features, including introns.') - inputs.add_argument('-m', '--mito-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, nargs = "*", dest = 'mito', \ - help = 'Mitochondrial/plastid tRNA fasta file(s). Can be multiple space-separated file names to specify both mitochondrial and plastid seqences.\ - Ensure "plastid" and "mito" are in the file names in this case. Should be downloaded from mitotRNAdb or PtRNAdb for species of interest. Already available in data folder for a few model organisms.') - + inputs.add_argument('-m', '--mito-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, dest = 'mito', \ + help = 'Mitochondrial tRNA fasta file(s). Should be downloaded from mitotRNAdb for species of interest. Already available in data folder for a few model organisms.') + inputs.add_argument('-p', '--plastid-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, dest = 'plastid', \ + help = 'Plastid tRNA fasta file(s). Should be downloaded from PtRNAdb for species of interest. Already available in data folder for a few model organisms.') options = parser.add_argument_group("Program options") options.add_argument('--pretRNAs', required = False, dest = 'pretrnas', action = 'store_true',\ help = "Input reference sequences are pretRNAs. Enabling this option will disable the removal of intron sequences and addition of 3'-CCA to generate \ @@ -308,7 +308,7 @@ def main(): parser.add_argument('--version', action='version', version='%(prog)s {}'.format(version.__version__), help = 'Show version number and exit') parser.add_argument('sampledata', help = 'Sample data sheet in text format, tab-separated. Column 1: full path to fastq (or fastq.gz). Column 2: condition/group.') - + parser.set_defaults(threads=1, out="./", max_multi = 3, mito = '', cov_diff = 0.5) ######################################### @@ -346,7 +346,7 @@ def main(): if args.control_cond not in conditions: raise argparse.ArgumentTypeError('{} not a valid condition in {}'.format(args.control_cond, args.sampledata)) if not args.species and not (args.trnas or args.trnaout): - parser.error('Must specify valid --species argument or supply -t (tRNA sequences) and -o (tRNAscan out file)!') + parser.error('Must specify valid --species argument or supply -t (tRNA sequences) and -o (tRNAscan out file)!') else: if args.species: if args.species == 'Ggor': @@ -380,7 +380,7 @@ def main(): if args.species == 'Rnor': args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-tRNAs.fa" args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7_eColitK-tRNAs-confidence-set.out" - args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-mitotRNAs.fa" + args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-mitotRNAs.fa" if args.species == 'Spom': args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/schiPomb-eColitK/schiPomb_972H-tRNAs.fa" args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/schiPomb-eColitK/schiPomb_eschColi-tRNAs.out" @@ -400,15 +400,15 @@ def main(): if args.species == 'Atha': args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-tRNAs.fa" args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-tRNAs-detailed.out" - # two mito files here for plastid and mito references - args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-plastidtRNAs.fa " + os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-mitotRNAs.fa" + args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-mitotRNAs.fa" + # plastid reference needed for A.thaliana + args.plastid = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-plastidtRNAs.fa " else: args.species = args.trnas.split("/")[-1].split(".")[0] mimseq(args.trnas, args.trnaout, args.name, args.species, args.out, args.cluster, args.cluster_id, args.cov_diff, \ args.posttrans, args.control_cond, args.threads, args.max_multi, args.snp_tolerance, \ args.keep_temp, args.cca, args.double_cca, args.min_cov, args.mismatches, args.remap, args.remap_mismatches, \ - args.misinc_thresh, args.mito, args.pretrnas, args.local_mod, args.p_adj, args.sampledata) + args.misinc_thresh, args.mito, args.plastid, args.pretrnas, args.local_mod, args.p_adj, args.sampledata) if __name__ == '__main__': main() - diff --git a/mimseq/ssAlign.py b/mimseq/ssAlign.py index c4b1db0..5c22a2b 100755 --- a/mimseq/ssAlign.py +++ b/mimseq/ssAlign.py @@ -10,12 +10,20 @@ stkname = '' -def aligntRNA(tRNAseqs, out): +def aligntRNA(tRNAseqs, out, threads=1): # run cmalign to generate Stockholm file for tRNA sequences global stkname stkname = tRNAseqs.split(".fa")[0] + '_align.stk' cmfile = os.path.dirname(os.path.realpath(__file__)) + '/data/tRNAmatureseq.cm' - cmcommand = ['cmalign', '-o', stkname, '--nonbanded', '-g', cmfile, tRNAseqs] + + if threads > 1: + cpus = threads + else: + # For single thread, set --cpu=0 + # http://eddylab.org/infernal/Userguide.pdf + cpus = 0 + + cmcommand = ['cmalign', '-o', stkname, '--nonbanded', '--cpu', str(cpus), '-g', cmfile, tRNAseqs] subprocess.check_call(cmcommand, stdout = open(out + 'cm.log', 'w')) def extraCCA(): @@ -64,7 +72,7 @@ def tRNAclassifier(ungapped = False): cons_pos_list.append('17a') cons_pos += 1 - elif cons_pos == 20: + elif cons_pos == 20: if not '20' in cons_pos_list: cons_pos_dict[pos+1] = '20' cons_pos_list.append('20') @@ -181,7 +189,7 @@ def getAnticodon(): for pos, char in enumerate(rf_cons): if char == "*": anticodon.append(pos) - + return(anticodon) def clusterAnticodon(cons_anticodon, cluster): @@ -210,13 +218,13 @@ def modContext(out, unsplitCluster_lookup): # Define positions of conserved mod sites in gapped alignment for each tRNA sites_dict = defaultdict() mod_sites = ['9', '20', '26', '32', '34', '37', '58'] - + for mod in mod_sites: sites_dict[mod] = list(cons_pos_dict.keys())[list(cons_pos_dict.values()).index(mod)] upstream_dict = defaultdict(lambda: defaultdict(list)) - stk = AlignIO.read(stkname, "stockholm") + stk = AlignIO.read(stkname, "stockholm") for record in stk: gene = record.id seq = record.seq @@ -230,7 +238,7 @@ def modContext(out, unsplitCluster_lookup): while seq[down].upper() not in ['A','C','G','U','T']: down += 1 canon_pos = cons_pos_dict[pos] - upstream_dict[gene][canon_pos].append(identity) + upstream_dict[gene][canon_pos].append(identity) upstream_dict[gene][canon_pos].append(seq[up]) # upstream base upstream_dict[gene][canon_pos].append(seq[down]) # downstream base @@ -256,7 +264,7 @@ def modContext(out, unsplitCluster_lookup): def structureParser(): # read in stk file generated above and define structural regions for each tRNA input - + struct_dict = dict() # get conserved tRNA structure from alignment ss_cons = "".join([line.split()[-1] for line in open(stkname) if line.startswith("#=GC SS_cons")]) diff --git a/mimseq/tRNAtools.py b/mimseq/tRNAtools.py index fbc8965..fb65fb8 100755 --- a/mimseq/tRNAtools.py +++ b/mimseq/tRNAtools.py @@ -28,17 +28,17 @@ def dd(): def dd_list(): return(defaultdict(list)) -def tRNAparser (gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, posttrans_mod_off, double_cca, pretrnas, local_mod): +def tRNAparser (gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, modifications_table, posttrans_mod_off, double_cca, pretrnas, local_mod): # tRNA sequence files parser and dictionary building # Generate modification reference table modifications = modificationParser(modifications_table) temp_name = gtRNAdb.split("/")[-1] - + log.info("\n+" + ("-" * (len(temp_name)+24)) + "+\ \n| Starting analysis for {} |\ \n+".format(temp_name) + ("-" * (len(temp_name)+24)) + "+") - + log.info("Processing tRNA sequences...") # Build dictionary of sequences from gtRNAdb fasta @@ -63,33 +63,35 @@ def tRNAparser (gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, posttrans tRNA_dict[seq]['species'] = ' '.join(seq.split('_')[0:2]) tRNA_dict[seq]['type'] = "cytosolic" - # add mitochondrial tRNAs if given - if mitotRNAs: - mitotRNAs = mitotRNAs.split(" ") - - for fn in mitotRNAs: - org = "mito" if re.search("mito", fn) else "plastid" - temp_dict = SeqIO.to_dict(SeqIO.parse(fn,"fasta")) - mito_count = defaultdict(int) + def addExtraNucleartRNAs(trna_fasta, type, tRNA_dict, double_cca): + temp_dict = SeqIO.to_dict(SeqIO.parse(trna_fasta,"fasta")) + non_nuc_trna_count = defaultdict(int) org_count = 0 - # read each mito tRNA, edit sequence header to match nuclear genes as above and add to tRNA_dict + # read each tRNA, edit sequence header to match nuclear genes as above and add to tRNA_dict for seq in temp_dict: org_count += 1 seq_parts = seq.split("|") anticodon = seq_parts[4] amino = re.search("[a-zA-z]+", seq_parts[3]).group(0) - mito_count[anticodon] += 1 - if org == "mito": - new_seq = seq_parts[1] + "_mito_tRNA-" + amino + "-" + seq_parts[4] + "-" + str(mito_count[anticodon]) + "-1" + non_nuc_trna_count[anticodon] += 1 + if type == 'mitochondrial': + new_seq = seq_parts[1] + "_mito_tRNA-" + amino + "-" + seq_parts[4] + "-" + str(non_nuc_trna_count[anticodon]) + "-1" else: - new_seq = seq_parts[1] + "_plastid_tRNA-" + amino + "-" + seq_parts[4] + "-" + str(mito_count[anticodon]) + "-1" + new_seq = seq_parts[1] + "_plastid_tRNA-" + amino + "-" + seq_parts[4] + "-" + str(non_nuc_trna_count[anticodon]) + "-1" tRNAseq = str(temp_dict[seq].seq) + "CCA" if not double_cca else str(temp_dict[seq].seq) + "CCACCA" tRNA_dict[new_seq]['sequence'] = tRNAseq.upper() - tRNA_dict[new_seq]['type'] = 'mitochondrial' + tRNA_dict[new_seq]['type'] = type tRNA_dict[new_seq]['species'] = ' '.join(seq.split('_')[0:2]) - log.info("{} ".format(org_count) + org + " tRNA sequences imported") + log.info("{} ".format(org_count) + type + " tRNA sequences imported") + return(tRNA_dict) + + if mitotRNAs: + tRNA_dict = addExtraNucleartRNAs(mitotRNAs, 'mitochondrial', tRNA_dict, double_cca) + if plastidtRNAs: + tRNA_dict = addExtraNucleartRNAs(plastidtRNAs, 'plastid', tRNA_dict, double_cca) + if mitotRNAs or plastidtRNAs: num_cytosilic = len([k for k in tRNA_dict.keys() if tRNA_dict[k]['type'] == "cytosolic"]) log.info("{} cytosolic tRNA sequences imported".format(num_cytosilic)) @@ -103,7 +105,7 @@ def tRNAparser (gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, posttrans for s in species: log.info('Number of Modomics entries for {}: {}'.format(s, perSpecies_count[s])) - return(tRNA_dict,modomics_dict, species) + return(tRNA_dict, modomics_dict, species) def processModomics(modomics_file, fetch, species, modifications): @@ -126,7 +128,7 @@ def processModomics(modomics_file, fetch, species, modifications): new_anticodon = getUnmodSeq(anticodon, modifications) if "N" in new_anticodon: continue - + # Check amino acid name in modomics - set to iMet if equal to Ini to match gtRNAdb amino = data['subtype'] if amino == 'Ini': @@ -140,7 +142,7 @@ def processModomics(modomics_file, fetch, species, modifications): tRNA_type = data['organellum'] tRNA_type = "cytosolic" if re.search("cytosol",tRNA_type) else tRNA_type - + sequence = data['seq'].replace('U','T').replace('-','') unmod_sequence = getUnmodSeq(sequence, modifications) # Return list of modified nucl and inosines indices and add to modomics_dict @@ -148,7 +150,7 @@ def processModomics(modomics_file, fetch, species, modifications): Mods = ['"', 'K', 'R', "'", 'O', 'Y', 'W', '⊆', 'X', '*', '['] modPos = [i for i, x in enumerate(sequence) if x in Mods] inosinePos = [i for i, x in enumerate(sequence) if x == 'I'] - + modomics_dict[curr_id] = {'sequence':sequence,'type':tRNA_type, 'anticodon':new_anticodon, 'modified':modPos, 'unmod_sequence':unmod_sequence, 'InosinePos':inosinePos} # build modomics_dict if data was not fetched from API - i.e. using local txt version @@ -188,7 +190,7 @@ def processModomics(modomics_file, fetch, species, modifications): tRNA_type = str(line.split('|')[4]) tRNA_type = "cytosolic" if re.search("cytosol",tRNA_type) else tRNA_type modomics_dict[curr_id] = {'sequence':'','type':tRNA_type, 'anticodon':new_anticodon} - + else: if not mod_species in species: continue @@ -208,7 +210,7 @@ def processModomics(modomics_file, fetch, species, modifications): modomics_dict[curr_id]['modified'] = modPos modomics_dict[curr_id]['unmod_sequence'] = unmod_sequence modomics_dict[curr_id]["InosinePos"] = inosinePos - + return(modomics_dict, perSpecies_count) def getModomics(local_mod): @@ -236,7 +238,7 @@ def getModomics(local_mod): return modomics, fetch -def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experiment_name, out_dir, double_cca, threads, snp_tolerance = False, cluster = False, cluster_id = 0.95, posttrans_mod_off = False, pretrnas = False, local_mod = False): +def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, modifications_table, experiment_name, out_dir, double_cca, threads, snp_tolerance = False, cluster = False, cluster_id = 0.95, posttrans_mod_off = False, pretrnas = False, local_mod = False, search='usearch'): # Builds SNP index needed for GSNAP based on modificaiton data for each tRNA and clusters tRNAs nomatch_count = 0 @@ -248,7 +250,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi anticodon_list = list() tRNAbed = open(out_dir + experiment_name + "_maturetRNA.bed","w") # generate modomics_dict and tRNA_dict - tRNA_dict, modomics_dict, species = tRNAparser(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, posttrans_mod_off, double_cca, pretrnas, local_mod) + tRNA_dict, modomics_dict, species = tRNAparser(gtRNAdb, tRNAscan_out, mitotRNAs, plastidtRNAs, modifications_table, posttrans_mod_off, double_cca, pretrnas, local_mod) temp_dir = out_dir + "/tmp/" try: @@ -267,7 +269,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi for seq in tRNA_dict: tempSeqs.write(">" + seq + "\n" + tRNA_dict[seq]['sequence'] + "\n") - aligntRNA(tempSeqs.name, out_dir) + aligntRNA(tempSeqs.name, out_dir, threads) extra_cca = extraCCA() for record in extra_cca: @@ -279,7 +281,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi log.info("\n+------------------------+ \ \n| Beginning SNP indexing |\ - \n+------------------------+") + \n+------------------------+") for seq in tRNA_dict: # Initialise list of modified sites for each tRNA @@ -295,7 +297,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi match = {k:v for k,v in modomics_dict.items() if re.match("^" + v['anticodon'] + "+$", anticodon) and tRNA_dict[seq]['type'] == v['type']} if len(match) >= 1: temp_matchFasta = open(temp_dir + "modomicsMatch.fasta","w") - for i in match: + for i in match: temp_matchFasta.write(">" + i + "\n" + match[i]['unmod_sequence'] + "\n") temp_matchFasta.close() @@ -303,7 +305,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi blastn_cmd = ["blastn", "-query", temp_tRNAFasta.name, "-subject", temp_matchFasta.name, "-task", "blastn-short", "-out", temp_dir + "blast_temp.xml", "-outfmt", "5", "-num_threads", str(threads)] subprocess.check_call(blastn_cmd, stdout = subprocess.DEVNULL, stderr=subprocess.DEVNULL) - #parse XML result and store hit with highest bitscore + #parse XML result and store hit with highest bitscore blast_record = SearchIO.read(temp_dir + "blast_temp.xml", "blast-xml") maxbit = 0 tophit = '' @@ -320,7 +322,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi match_count += 1 # some Modomics sequences are weird and have additional 5' nucleotides # if the hit match starts at pos 0 then log all mods and inosines as normal - # if not, then the start position of the modomics hit relative to query needs to be accounted for + # if not, then the start position of the modomics hit relative to query needs to be accounted for # wrt mods and inosine positions in the tRNA_dict sequence tRNA_dict[seq]['modified'] = match[tophit]['modified'] if hit_start == 0 else [x - hit_start for x in match[tophit]['modified']] tRNA_dict[seq]['InosinePos'] = match[tophit]['InosinePos'] if hit_start == 0 else [x - hit_start for x in match[tophit]['InosinePos']] @@ -342,12 +344,12 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi with open(str(out_dir + experiment_name + '_tRNATranscripts.fa'), "w") as temptRNATranscripts: SeqIO.write(seq_records.values(), temptRNATranscripts, "fasta") - # if clustering is not activated then write full gff and report on total SNPs written + # if clustering is not activated then write full gff and report on total SNPs written if not cluster: coverage_bed = tRNAbed.name mod_lists = defaultdict(list) Inosine_lists = defaultdict(list) - with open(out_dir + experiment_name + "_tRNA.gff","w") as tRNAgff, open(out_dir + experiment_name + "isoacceptorInfo.txt","w") as isoacceptorInfo: + with open(out_dir + experiment_name + "_tRNA.gff","w") as tRNAgff, open(out_dir + experiment_name + "isoacceptorInfo.txt","w") as isoacceptorInfo: isoacceptor_dict = defaultdict(int) isoacceptorInfo.write("Isoacceptor\tsize\n") for seq in tRNA_dict: @@ -359,7 +361,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi for key, value in isoacceptor_dict.items(): isoacceptorInfo.write(key + "\t" + str(value) + "\n") # generate Stockholm alignment file for all tRNA transcripts and parse additional mods file - aligntRNA(temptRNATranscripts.name, out_dir) + aligntRNA(temptRNATranscripts.name, out_dir, threads) additionalMods, additionalInosines = additionalModsParser(species, out_dir) # add additional SNPs from extra file to list of modified positions, and ensure non-redundancy with set() # index SNPs @@ -457,7 +459,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi centroids = SeqIO.parse(temp_dir + "all_centroids.fa", "fasta") for centroid in centroids: centroid.id = centroid.id.split(";")[0] - final_centroids[centroid.id] = SeqRecord(Seq(str(centroid.seq).upper()), id = centroid.id) + final_centroids[centroid.id] = SeqRecord(Seq(str(centroid.seq).upper()), id = centroid.id) # read cluster files, get nonredudant set of mod positions of all members of a cluster, create snp_records for writing SNP index cluster_pathlist = Path(temp_dir).glob("**/*_clusters.uc") @@ -489,7 +491,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi clusterbed.write(cluster_name + "\t0\t" + str(len(tRNA_dict[cluster_name]['sequence'])) + "\t" + cluster_name + "\t1000\t+\n" ) clustergff.write(cluster_name + "\ttRNAseq\texon\t1\t" + str(len(tRNA_dict[cluster_name]['sequence'])) + "\t.\t+\t0\tgene_id '" + cluster_name + "'\n") cluster_dict[cluster_name].append(cluster_name) - + # Handle members of clusters elif line.split("\t")[0] == "H": member_name = line.split("\t")[8].split(";")[0] @@ -500,7 +502,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi mod_lists[cluster_name] = list(set(mod_lists[cluster_name] + tRNA_dict[member_name]["modified"])) Inosine_lists[cluster_name] = list(set(Inosine_lists[cluster_name] + tRNA_dict[member_name]["InosinePos"])) cluster_dict[cluster_name].append(member_name) - + # if there are insertions or deletions in the centroid, edit member or centroid sequences to ignore these positions # and edit modified positions list in order to make non-redundant positions list, similar to next else statement elif re.search("[ID]", compr_aln): @@ -515,7 +517,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi aln_list = list(filter(None, aln_list)) for phrase in aln_list: - + if ("M" in phrase) and (phrase.split("M")[0] != ""): pos += int(phrase.split("M")[0]) elif ("M" in phrase) and (phrase.split("M")[0] == ""): @@ -584,7 +586,7 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi mod_lists[cluster_name] = list(set(mod_lists[cluster_name] + member_mods)) Inosine_lists[cluster_name] = list(set(Inosine_lists[cluster_name] + member_Inosines)) cluster_dict[cluster_name].append(member_name) - + clusterbed.close() # Write cluster information to tsv and get number of unique sequences per cluster (i.e. isodecoders) for read count splitting @@ -654,18 +656,18 @@ def modsToSNPIndex(gtRNAdb, tRNAscan_out, mitotRNAs, modifications_table, experi SeqIO.write(final_centroids.values(), clusterTranscripts, "fasta") if total_snps == 0: - snp_tolerance = False + snp_tolerance = False log.info("{:,} modifications/cluster mismatches written to SNP index".format(total_snps)) - log.info("{:,} A to G replacements in reference sequences for inosine modifications".format(total_inosines)) - - # write outputs for indexing + log.info("{:,} A to G replacements in reference sequences for inosine modifications".format(total_inosines)) + + # write outputs for indexing with open(out_dir + experiment_name + "_modificationSNPs.txt", "w") as snp_file: for item in snp_records: snp_file.write('{}\n'.format(item)) - + shutil.rmtree(temp_dir) - + # Return coverage_bed (either tRNAbed or clusterbed depending on --cluster) for coverage calculation method return(coverage_bed, snp_tolerance, mismatch_dict, insert_dict, del_dict, mod_lists, Inosine_lists, Inosine_clusters, tRNA_dict, cluster_dict, cluster_perPos_mismatchMembers) @@ -674,7 +676,7 @@ def newModsParser(out_dir, experiment_name, new_mods_list, new_Inosines, mod_lis log.info("\n+------------------+ \ \n| Parsing new mods |\ - \n+------------------+") + \n+------------------+") new_snps_num = 0 new_inosines = 0 @@ -726,14 +728,14 @@ def newModsParser(out_dir, experiment_name, new_mods_list, new_Inosines, mod_lis tRNA_ref = out_dir + experiment_name + '_clusterTranscripts.fa' else: tRNA_ref = out_dir + experiment_name + '_tRNATranscripts.fa' - - tRNA_seqs = SeqIO.to_dict(SeqIO.parse(tRNA_ref, 'fasta')) + + tRNA_seqs = SeqIO.to_dict(SeqIO.parse(tRNA_ref, 'fasta')) # rewrite tRNA transcript reference with open(tRNA_ref, "w") as transcript_fasta: SeqIO.write(tRNA_seqs.values(), transcript_fasta, "fasta") - - log.info("{:,} modifications written to SNP index".format(total_snps)) + + log.info("{:,} modifications written to SNP index".format(total_snps)) return(Inosine_clusters, snp_tolerance, tRNA_dict, mod_lists, Inosine_lists) @@ -776,7 +778,7 @@ def additionalModsParser(input_species, out_dir): for cluster in clusters: no_gap_struct = [value for key, value in tRNA_struct_nogap.items() if key == cluster and 'nmt' not in key] if not no_gap_struct: # test if struct is empty, i.e. if isodecoder from additional mods does not exist in tRNA dictionary - continue + continue anticodon = clusterAnticodon(cons_anticodon, cluster) for mod in data['mods']: @@ -810,7 +812,7 @@ def getModSite(cluster, cons_pos, cons_pos_dict, tRNA_struct, tRNA_struct_nogap) mod_site = all_struct_element_list_nogap[index_struct_element] else: mod_site = 'NA' - + return(mod_site) def generateGSNAPIndices(species, name, out_dir, map_round, snp_tolerance = False, cluster = False): @@ -828,19 +830,19 @@ def generateGSNAPIndices(species, name, out_dir, map_round, snp_tolerance = Fals genome_index_path = out_dir + species + "_tRNAgenome" genome_index_name = genome_index_path.split("/")[-1] - + try: os.mkdir(genome_index_path) except FileExistsError: log.warning("Genome index folder found! Rebuilding index anyway...") - + if cluster: genome_file = out_dir + name + "_clusterTranscripts.fa" else: genome_file = out_dir + name + "_tRNATranscripts.fa" index_cmd = ["gmap_build", "-q", "1", "-D", out_dir, "-d", species + "_tRNAgenome", genome_file] - subprocess.check_call(index_cmd, stderr = open(out_dir + "genomeindex.log", "w"), stdout = subprocess.DEVNULL) + subprocess.check_call(index_cmd, stderr = open(out_dir + "genomeindex.log", "w"), stdout = subprocess.DEVNULL) log.info("Genome indices done...") snp_index_path = out_dir + species + "snp_index" @@ -940,7 +942,7 @@ def initIntronDict(tRNAscan_out): # Build dictionary of intron locations Intron_dict = {} - tRNAscan = open(tRNAscan_out, 'r') + tRNAscan = open(tRNAscan_out, 'r') intron_count = 0 for line in tRNAscan: if line.startswith("chr"): @@ -1002,7 +1004,7 @@ def countsAnticodon(input_counts, out_dir): log.info("** Read counts per anticodon saved to " + out_dir + "counts/Anticodon_counts_raw.txt **") def tidyFiles (out_dir, cca): - + os.mkdir(out_dir + "annotation/") os.mkdir(out_dir + "align/") os.mkdir(out_dir + "indices/")