Merge pull request #36 from MRCToxBioinformatics/ts_split_mt_plastid

Ts split mt plastid
nedialkova-lab · Sep 7, 2022 · 9f81252 · 9f81252
2 parents 19820b4 + cdc3c2a
commit 9f81252
Show file tree

Hide file tree

Showing 3 changed files with 99 additions and 89 deletions.
diff --git a/mimseq/mimseq.py b/mimseq/mimseq.py
@@ -1,13 +1,13 @@
 #! /usr/bin/env python3
 
 #   +-------------+
-#   | mim-tRNAseq | 
+#   | mim-tRNAseq |
 #   +-------------+
 
 ####################################
 # Main backbone and wrapper script #
 ####################################
-# 
+#
 # author: Drew Behrens
 # contact: [email protected]
 # github: https://github.com/nedialkova-lab/mim-tRNAseq
@@ -18,7 +18,7 @@
 from .tRNAmap import mainAlign
 from .getCoverage import getCoverage, plotCoverage
 from .mmQuant import generateModsTable, plotCCA
-from .ssAlign import structureParser, modContext 
+from .ssAlign import structureParser, modContext
 from .splitClusters import splitIsodecoder, unsplitClustersCov, getIsodecoderSizes, getDeconvSizes, writeDeconvTranscripts, writeIsodecoderTranscripts, writeSplitInfo, writeIsodecoderInfo
 import sys, os, subprocess, logging, datetime, copy
 import argparse
@@ -49,8 +49,8 @@ def restrictedFloat2(x):
 		raise argparse.ArgumentTypeError('{} not a real number'.format(x))
 
 def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, posttrans, control_cond, threads, max_multi, snp_tolerance, \
-	keep_temp, cca, double_cca, min_cov, mismatches, remap, remap_mismatches, misinc_thresh, mito_trnas, pretrnas, local_mod, p_adj, sample_data):
-	
+	keep_temp, cca, double_cca, min_cov, mismatches, remap, remap_mismatches, misinc_thresh, mito_trnas, plastid_trnas, pretrnas, local_mod, p_adj, sample_data):
+
 # Main wrapper
 	# Integrity check for output folder argument...
 	try:
@@ -98,7 +98,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
 	modifications = os.path.dirname(os.path.realpath(__file__))
 	modifications += "/modifications"
 	coverage_bed, snp_tolerance, mismatch_dict, insert_dict, del_dict, mod_lists, Inosine_lists, Inosine_clusters, tRNA_dict, cluster_dict, cluster_perPos_mismatchMembers \
-	= modsToSNPIndex(trnas, trnaout, mito_trnas, modifications, name, out, double_cca, threads, snp_tolerance, cluster, cluster_id, posttrans, pretrnas, local_mod)
+	= modsToSNPIndex(trnas, trnaout, mito_trnas, plastid_trnas, modifications, name, out, double_cca, threads, snp_tolerance, cluster, cluster_id, posttrans, pretrnas, local_mod)
 	structureParser()
 	# Generate GSNAP indices
 	genome_index_path, genome_index_name, snp_index_path, snp_index_name = generateGSNAPIndices(species, name, out, map_round, snp_tolerance, cluster)
@@ -139,7 +139,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
 	#else:
 	#	log.info("\n*** New modifications not discovered as remap is not enabled ***\n")
 
-	# redo checks for unsplit isodecoders based on coverage 
+	# redo checks for unsplit isodecoders based on coverage
 	# use original splitBool and unique_isodecoderMMs in case coverage changes in 2nd alignment round, regenerate splitBool_new and unique_isodecoderMMs_new
 	# rewrite deconv transcripts
 	if map_round == 2 and cluster and cluster_id != 1:
@@ -166,7 +166,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
 	mod_sites, cons_pos_list = modContext(out, unsplitCluster_lookup)
 
 	script_path = os.path.dirname(os.path.realpath(__file__))
-	
+
 	if snp_tolerance or not mismatches == 0.0:
 					# plot mods and stops, catch exception with command call and print log error if many clusters are filtered (known to cause issues with R code handling mods table)
 		log.info("Plotting modification and RT stop data...")
@@ -217,25 +217,25 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
 
 def main():
 
-	################### 
+	###################
 	# Parse arguments #
-	################### 
-	
+	###################
+
 	parser = argparse.ArgumentParser(description = 'Custom high-throughput tRNA sequencing alignment and quantification pipeline\
 		based on modification induced misincorporation cDNA synthesis.', add_help = True, usage = "%(prog)s [options] sample data")
 
 	inputs = parser.add_argument_group("Input files")
-	inputs.add_argument('-s','--species', metavar='species', required = not ('-t' in sys.argv), dest = 'species', help = \
+	inputs.add_argument('-s','--species', metavar='species', required = not ('-t' in sys.argv or '--trnas' in sys.argv), dest = 'species', help = \
 		'Species being analyzed for which to load pre-packaged data files (prioritized over -t, -o and -m). Options are: Hsap, Hsap19, Mmus, Rnor, Scer, Spom, Dmel, Drer, Ecol, Atha', \
 		choices = ['Hsap', 'Hsap19','Ggor','Mmus','Rnor','Scer', 'Spom','Dmel', 'Drer', 'Ecol', 'Atha', 'HsapTCC', 'ScerMut'])
 	inputs.add_argument('-t', '--trnas', metavar='genomic tRNAs', required = False, dest = 'trnas', help = \
 		'Genomic tRNA fasta file, e.g. from gtRNAdb or tRNAscan-SE. Already avalable in data folder for a few model organisms.')
-	inputs.add_argument('-o', '--trnaout', metavar = 'tRNA out file', required = (not '--species' or '-s' in sys.argv) or ('-t' in sys.argv), 
+	inputs.add_argument('-o', '--trnaout', metavar = 'tRNA out file', required = (not '--species' or '-s' in sys.argv) or ('-t' in sys.argv),
 		dest = 'trnaout', help = 'tRNA.out file generated by tRNAscan-SE (also may be available on gtRNAdb). Contains information about tRNA features, including introns.')
-	inputs.add_argument('-m', '--mito-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, nargs = "*", dest = 'mito', \
-		help = 'Mitochondrial/plastid tRNA fasta file(s). Can be multiple space-separated file names to specify both mitochondrial and plastid seqences.\
-			Ensure "plastid" and "mito" are in the file names in this case. Should be downloaded from mitotRNAdb or PtRNAdb for species of interest. Already available in data folder for a few model organisms.')
-
+	inputs.add_argument('-m', '--mito-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, dest = 'mito', \
+		help = 'Mitochondrial tRNA fasta file(s). Should be downloaded from mitotRNAdb for species of interest. Already available in data folder for a few model organisms.')
+	inputs.add_argument('-p', '--plastid-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, dest = 'plastid', \
+		help = 'Plastid tRNA fasta file(s). Should be downloaded from PtRNAdb for species of interest. Already available in data folder for a few model organisms.')
 	options = parser.add_argument_group("Program options")
 	options.add_argument('--pretRNAs', required = False, dest = 'pretrnas', action = 'store_true',\
 		help = "Input reference sequences are pretRNAs. Enabling this option will disable the removal of intron sequences and addition of 3'-CCA to generate \
@@ -308,7 +308,7 @@ def main():
 
 	parser.add_argument('--version', action='version', version='%(prog)s {}'.format(version.__version__), help = 'Show version number and exit')
 	parser.add_argument('sampledata', help = 'Sample data sheet in text format, tab-separated. Column 1: full path to fastq (or fastq.gz). Column 2: condition/group.')
-	
+
 	parser.set_defaults(threads=1, out="./", max_multi = 3, mito = '', cov_diff = 0.5)
 
 	#########################################
@@ -346,7 +346,7 @@ def main():
 		if args.control_cond not in conditions:
 			raise argparse.ArgumentTypeError('{} not a valid condition in {}'.format(args.control_cond, args.sampledata))
 		if not args.species and not (args.trnas or args.trnaout):
-			parser.error('Must specify valid --species argument or supply -t (tRNA sequences) and -o (tRNAscan out file)!')						
+			parser.error('Must specify valid --species argument or supply -t (tRNA sequences) and -o (tRNAscan out file)!')
 		else:
 			if args.species:
 				if args.species == 'Ggor':
@@ -380,7 +380,7 @@ def main():
 				if args.species == 'Rnor':
 					args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-tRNAs.fa"
 					args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7_eColitK-tRNAs-confidence-set.out"
-					args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-mitotRNAs.fa"				
+					args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-mitotRNAs.fa"
 				if args.species == 'Spom':
 					args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/schiPomb-eColitK/schiPomb_972H-tRNAs.fa"
 					args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/schiPomb-eColitK/schiPomb_eschColi-tRNAs.out"
@@ -400,15 +400,15 @@ def main():
 				if args.species == 'Atha':
 					args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-tRNAs.fa"
 					args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-tRNAs-detailed.out"
-					# two mito files here for plastid and mito references
-					args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-plastidtRNAs.fa " + os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-mitotRNAs.fa"
+					args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-mitotRNAs.fa"
+					# plastid reference needed for A.thaliana
+					args.plastid = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-plastidtRNAs.fa "
 			else:
 				args.species = args.trnas.split("/")[-1].split(".")[0]
 			mimseq(args.trnas, args.trnaout, args.name, args.species, args.out, args.cluster, args.cluster_id, args.cov_diff, \
 				args.posttrans, args.control_cond, args.threads, args.max_multi, args.snp_tolerance, \
 				args.keep_temp, args.cca, args.double_cca, args.min_cov, args.mismatches, args.remap, args.remap_mismatches, \
-				args.misinc_thresh, args.mito, args.pretrnas, args.local_mod, args.p_adj, args.sampledata)
+				args.misinc_thresh, args.mito, args.plastid, args.pretrnas, args.local_mod, args.p_adj, args.sampledata)
 
 if __name__ == '__main__':
 	main()
-
diff --git a/mimseq/ssAlign.py b/mimseq/ssAlign.py
@@ -10,12 +10,20 @@
 
 stkname = ''
 
-def aligntRNA(tRNAseqs, out):
+def aligntRNA(tRNAseqs, out, threads=1):
 # run cmalign to generate Stockholm file for tRNA sequences
 	global stkname
 	stkname = tRNAseqs.split(".fa")[0] + '_align.stk'
 	cmfile = os.path.dirname(os.path.realpath(__file__)) + '/data/tRNAmatureseq.cm'
-	cmcommand = ['cmalign', '-o', stkname, '--nonbanded', '-g', cmfile, tRNAseqs]
+
+	if threads > 1:
+		cpus = threads
+	else:
+		# For single thread, set --cpu=0
+		# http://eddylab.org/infernal/Userguide.pdf
+		cpus = 0
+
+	cmcommand = ['cmalign', '-o', stkname, '--nonbanded', '--cpu', str(cpus), '-g', cmfile, tRNAseqs]
 	subprocess.check_call(cmcommand, stdout = open(out + 'cm.log', 'w'))
 
 def extraCCA():
@@ -64,7 +72,7 @@ def tRNAclassifier(ungapped = False):
 					cons_pos_list.append('17a')
 					cons_pos += 1
 
-				elif cons_pos == 20: 
+				elif cons_pos == 20:
 					if not '20' in cons_pos_list:
 						cons_pos_dict[pos+1] = '20'
 						cons_pos_list.append('20')
@@ -181,7 +189,7 @@ def getAnticodon():
 	for pos, char in enumerate(rf_cons):
 		if char == "*":
 			anticodon.append(pos)
-	
+
 	return(anticodon)
 
 def clusterAnticodon(cons_anticodon, cluster):
@@ -210,13 +218,13 @@ def modContext(out, unsplitCluster_lookup):
 	# Define positions of conserved mod sites in gapped alignment for each tRNA
 	sites_dict = defaultdict()
 	mod_sites = ['9', '20', '26', '32', '34', '37', '58']
-	
+
 	for mod in mod_sites:
 		sites_dict[mod] = list(cons_pos_dict.keys())[list(cons_pos_dict.values()).index(mod)]
 
 	upstream_dict = defaultdict(lambda: defaultdict(list))
 
-	stk = AlignIO.read(stkname, "stockholm") 
+	stk = AlignIO.read(stkname, "stockholm")
 	for record in stk:
 		gene = record.id
 		seq = record.seq
@@ -230,7 +238,7 @@ def modContext(out, unsplitCluster_lookup):
 				while seq[down].upper() not in ['A','C','G','U','T']:
 					down += 1
 				canon_pos = cons_pos_dict[pos]
-				upstream_dict[gene][canon_pos].append(identity) 
+				upstream_dict[gene][canon_pos].append(identity)
 				upstream_dict[gene][canon_pos].append(seq[up]) # upstream base
 				upstream_dict[gene][canon_pos].append(seq[down]) # downstream base
 
@@ -256,7 +264,7 @@ def modContext(out, unsplitCluster_lookup):
 
 def structureParser():
 # read in stk file generated above and define structural regions for each tRNA input
-	
+
 	struct_dict = dict()
 	# get conserved tRNA structure from alignment
 	ss_cons = "".join([line.split()[-1] for line in open(stkname) if line.startswith("#=GC SS_cons")])