version bump

nedialkova-lab · Sep 7, 2022 · 4d56f38 · 4d56f38
1 parent 9f81252
commit 4d56f38
Show file tree

Hide file tree

Showing 13 changed files with 153 additions and 130 deletions.
diff --git a/build/lib/mimseq/data/hg38-eColitK/README.txt b/build/lib/mimseq/data/hg38-eColitK/README.txt
@@ -22,6 +22,7 @@ Homo_sapiens_tRNA-Asn-GTT-16-5 does not share exact sequence with other GTT-16 g
 Homo_sapiens_tRNA-Asn-GTT-9-2 does not share sequence with GTT-9-1. Rename to Homo_sapiens_tRNA-Asn-GTT-14-1
 Homo_sapiens_tRX-Asn-GTT-3-1 shares sequence with Homo_sapiens_tRX-Asn-GTT-2-1. Rename to Homo_sapiens_tRX-Asn-GTT-2-2
 Homo_sapiens_tRX-Phe-GAA-1-1 shares sequence with Homo_sapiens_tRNA-Phe-GAA-10-1. Rename to Homo_sapiens_tRNA-Phe-GAA-10-2
+Homo_sapiens_tRNA-Ala-AGC-7-1 shares sequence with Homo_sapiens_tRNA-Ala-AGC-2. Renamed to Homo_sapiens_tRNA-Ala-AGC-2-3
 
 ### Manually removed sequences due to strange sequence, poor score and issues with infernal alignment with mimseq
 ### See hg38-tRNAs-filtered.fa

diff --git a/build/lib/mimseq/deseq.R b/build/lib/mimseq/deseq.R
@@ -433,29 +433,31 @@ for (type in c("cyto", "mito")) {
 
         # Annotation for baseMean counts
         if (nrow(comb_isodecoder) != 0) {
-          baseMeanAnno_iso = rowAnnotation(base_mean = anno_lines(comb_isodecoder$baseMean, 
-                                                                  gp = gpar(lwd = 1.5, col = "#084081")), 
-                                          annotation_name_rot = 90,
-                                          width = unit("0.8", "cm"),
-                                          height = unit(20, "cm"))
+          anno_type = ifelse(nrow(comb_isodecoder) > 1, anno_lines, anno_barplot)
+          baseMeanAnno_iso = rowAnnotation(base_mean = anno_type(comb_isodecoder$baseMean,
+                                                                  gp = gpar(lwd = 1.5, col = "#084081", fill = "#084081")), 
+                                           annotation_name_rot = 90,
+                                           width = unit("0.8", "cm"),
+                                           height = unit(20, "cm"))
 
           hm_iso = Heatmap(scaled_counts_isodecoder,
-                          col = col_fun,
-                          row_title = paste('n = ', nrow(scaled_counts_isodecoder), sep = ""),
-                          show_row_names = FALSE,
-                          width = unit((length(ordered_levels)*0.7) + 2, "cm"),
-                          height = unit(20, "cm"),
-                          border = "gray20",
-                          cluster_columns = TRUE,
-                          heatmap_legend_param = list(
-                              title = paste("Scaled expression\nlog2 fold-change (adj-p <= ", p_adj,")", sep = "")
-                            )
+                           col = col_fun,
+                           row_title = paste('n = ', nrow(scaled_counts_isodecoder), sep = ""),
+                           show_row_names = FALSE,
+                           width = unit((length(ordered_levels)*0.7) + 2, "cm"),
+                           height = unit(20, "cm"),
+                           border = "gray20",
+                           cluster_columns = TRUE,
+                           heatmap_legend_param = list(
+                             title = paste("Scaled expression\nlog2 fold-change (adj-p <= ", p_adj,")", sep = "")
+                           )
           )
         }
 
         if (nrow(comb_anticodon) != 0) {
-          baseMeanAnno_anti = rowAnnotation(base_mean = anno_lines(comb_anticodon$baseMean, 
-                                                                  gp = gpar(lwd = 1.5, col = "#084081")), 
+          anno_type = ifelse(nrow(comb_isodecoder) > 1, anno_lines, anno_barplot)
+          baseMeanAnno_anti = rowAnnotation(base_mean = anno_type(comb_anticodon$baseMean, 
+                                                                   gp = gpar(lwd = 1.5, col = "#084081", fill = "#084081")), 
                                             annotation_name_rot = 90,
                                             width = unit("0.8", "cm"),
                                             height = unit(20, "cm"))
@@ -490,17 +492,17 @@ for (type in c("cyto", "mito")) {
             }
             if (nrow(comb_anticodon) != 0) {
               lfc_anti = Heatmap(comb_anticodon[lfc],
-                                col = col_fun,
-                                width = unit(0.5, "cm"), 
-                                height = unit(20, "cm"),
-                                na_col = "white",
-                                border = "gray20",
-                                show_heatmap_legend = FALSE)
-            hm_anti = hm_anti + lfc_anti
+                                 col = col_fun,
+                                 width = unit(0.5, "cm"), 
+                                 height = unit(20, "cm"),
+                                 na_col = "white",
+                                 border = "gray20",
+                                 show_heatmap_legend = FALSE)
+              hm_anti = hm_anti + lfc_anti
             }
           }
         }
-
+        
         # calculate plot width based on sample numbers
         plot_width = (length(ordered_levels)*0.8 + 4) + (length(ordered_levels)-1)*1.3 + 0.8
         plot_width = plot_width/2.54

diff --git a/build/lib/mimseq/mimseq.py b/build/lib/mimseq/mimseq.py
@@ -1,13 +1,13 @@
 #! /usr/bin/env python3
 
 #   +-------------+
-#   | mim-tRNAseq | 
+#   | mim-tRNAseq |
 #   +-------------+
 
 ####################################
 # Main backbone and wrapper script #
 ####################################
-# 
+#
 # author: Drew Behrens
 # contact: [email protected]
 # github: https://github.com/nedialkova-lab/mim-tRNAseq
@@ -18,7 +18,7 @@
 from .tRNAmap import mainAlign
 from .getCoverage import getCoverage, plotCoverage
 from .mmQuant import generateModsTable, plotCCA
-from .ssAlign import structureParser, modContext 
+from .ssAlign import structureParser, modContext
 from .splitClusters import splitIsodecoder, unsplitClustersCov, getIsodecoderSizes, getDeconvSizes, writeDeconvTranscripts, writeIsodecoderTranscripts, writeSplitInfo, writeIsodecoderInfo
 import sys, os, subprocess, logging, datetime, copy
 import argparse
@@ -49,8 +49,8 @@ def restrictedFloat2(x):
 		raise argparse.ArgumentTypeError('{} not a real number'.format(x))
 
 def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, posttrans, control_cond, threads, max_multi, snp_tolerance, \
-	keep_temp, cca, double_cca, min_cov, mismatches, remap, remap_mismatches, misinc_thresh, mito_trnas, pretrnas, local_mod, p_adj, sample_data):
-	
+	keep_temp, cca, double_cca, min_cov, mismatches, remap, remap_mismatches, misinc_thresh, mito_trnas, plastid_trnas, pretrnas, local_mod, p_adj, sample_data):
+
 # Main wrapper
 	# Integrity check for output folder argument...
 	try:
@@ -98,7 +98,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
 	modifications = os.path.dirname(os.path.realpath(__file__))
 	modifications += "/modifications"
 	coverage_bed, snp_tolerance, mismatch_dict, insert_dict, del_dict, mod_lists, Inosine_lists, Inosine_clusters, tRNA_dict, cluster_dict, cluster_perPos_mismatchMembers \
-	= modsToSNPIndex(trnas, trnaout, mito_trnas, modifications, name, out, double_cca, threads, snp_tolerance, cluster, cluster_id, posttrans, pretrnas, local_mod)
+	= modsToSNPIndex(trnas, trnaout, mito_trnas, plastid_trnas, modifications, name, out, double_cca, threads, snp_tolerance, cluster, cluster_id, posttrans, pretrnas, local_mod)
 	structureParser()
 	# Generate GSNAP indices
 	genome_index_path, genome_index_name, snp_index_path, snp_index_name = generateGSNAPIndices(species, name, out, map_round, snp_tolerance, cluster)
@@ -139,7 +139,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
 	#else:
 	#	log.info("\n*** New modifications not discovered as remap is not enabled ***\n")
 
-	# redo checks for unsplit isodecoders based on coverage 
+	# redo checks for unsplit isodecoders based on coverage
 	# use original splitBool and unique_isodecoderMMs in case coverage changes in 2nd alignment round, regenerate splitBool_new and unique_isodecoderMMs_new
 	# rewrite deconv transcripts
 	if map_round == 2 and cluster and cluster_id != 1:
@@ -166,7 +166,7 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
 	mod_sites, cons_pos_list = modContext(out, unsplitCluster_lookup)
 
 	script_path = os.path.dirname(os.path.realpath(__file__))
-	
+
 	if snp_tolerance or not mismatches == 0.0:
 					# plot mods and stops, catch exception with command call and print log error if many clusters are filtered (known to cause issues with R code handling mods table)
 		log.info("Plotting modification and RT stop data...")
@@ -217,25 +217,25 @@ def mimseq(trnas, trnaout, name, species, out, cluster, cluster_id, cov_diff, po
 
 def main():
 
-	################### 
+	###################
 	# Parse arguments #
-	################### 
-	
+	###################
+
 	parser = argparse.ArgumentParser(description = 'Custom high-throughput tRNA sequencing alignment and quantification pipeline\
 		based on modification induced misincorporation cDNA synthesis.', add_help = True, usage = "%(prog)s [options] sample data")
 
 	inputs = parser.add_argument_group("Input files")
-	inputs.add_argument('-s','--species', metavar='species', required = not ('-t' in sys.argv), dest = 'species', help = \
+	inputs.add_argument('-s','--species', metavar='species', required = not ('-t' in sys.argv or '--trnas' in sys.argv), dest = 'species', help = \
 		'Species being analyzed for which to load pre-packaged data files (prioritized over -t, -o and -m). Options are: Hsap, Hsap19, Mmus, Rnor, Scer, Spom, Dmel, Drer, Ecol, Atha', \
 		choices = ['Hsap', 'Hsap19','Ggor','Mmus','Rnor','Scer', 'Spom','Dmel', 'Drer', 'Ecol', 'Atha', 'HsapTCC', 'ScerMut'])
 	inputs.add_argument('-t', '--trnas', metavar='genomic tRNAs', required = False, dest = 'trnas', help = \
 		'Genomic tRNA fasta file, e.g. from gtRNAdb or tRNAscan-SE. Already avalable in data folder for a few model organisms.')
-	inputs.add_argument('-o', '--trnaout', metavar = 'tRNA out file', required = (not '--species' or '-s' in sys.argv) or ('-t' in sys.argv), 
+	inputs.add_argument('-o', '--trnaout', metavar = 'tRNA out file', required = (not '--species' or '-s' in sys.argv) or ('-t' in sys.argv),
 		dest = 'trnaout', help = 'tRNA.out file generated by tRNAscan-SE (also may be available on gtRNAdb). Contains information about tRNA features, including introns.')
-	inputs.add_argument('-m', '--mito-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, nargs = "*", dest = 'mito', \
-		help = 'Mitochondrial/plastid tRNA fasta file(s). Can be multiple space-separated file names to specify both mitochondrial and plastid seqences.\
-			Ensure "plastid" and "mito" are in the file names in this case. Should be downloaded from mitotRNAdb or PtRNAdb for species of interest. Already available in data folder for a few model organisms.')
-
+	inputs.add_argument('-m', '--mito-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, dest = 'mito', \
+		help = 'Mitochondrial tRNA fasta file(s). Should be downloaded from mitotRNAdb for species of interest. Already available in data folder for a few model organisms.')
+	inputs.add_argument('-p', '--plastid-trnas', metavar = 'mitochondrial/plastid tRNAs', required = False, dest = 'plastid', \
+		help = 'Plastid tRNA fasta file(s). Should be downloaded from PtRNAdb for species of interest. Already available in data folder for a few model organisms.')
 	options = parser.add_argument_group("Program options")
 	options.add_argument('--pretRNAs', required = False, dest = 'pretrnas', action = 'store_true',\
 		help = "Input reference sequences are pretRNAs. Enabling this option will disable the removal of intron sequences and addition of 3'-CCA to generate \
@@ -308,7 +308,7 @@ def main():
 
 	parser.add_argument('--version', action='version', version='%(prog)s {}'.format(version.__version__), help = 'Show version number and exit')
 	parser.add_argument('sampledata', help = 'Sample data sheet in text format, tab-separated. Column 1: full path to fastq (or fastq.gz). Column 2: condition/group.')
-	
+
 	parser.set_defaults(threads=1, out="./", max_multi = 3, mito = '', cov_diff = 0.5)
 
 	#########################################
@@ -346,7 +346,7 @@ def main():
 		if args.control_cond not in conditions:
 			raise argparse.ArgumentTypeError('{} not a valid condition in {}'.format(args.control_cond, args.sampledata))
 		if not args.species and not (args.trnas or args.trnaout):
-			parser.error('Must specify valid --species argument or supply -t (tRNA sequences) and -o (tRNAscan out file)!')						
+			parser.error('Must specify valid --species argument or supply -t (tRNA sequences) and -o (tRNAscan out file)!')
 		else:
 			if args.species:
 				if args.species == 'Ggor':
@@ -380,7 +380,7 @@ def main():
 				if args.species == 'Rnor':
 					args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-tRNAs.fa"
 					args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7_eColitK-tRNAs-confidence-set.out"
-					args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-mitotRNAs.fa"				
+					args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/rn7-eColitK/rn7-mitotRNAs.fa"
 				if args.species == 'Spom':
 					args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/schiPomb-eColitK/schiPomb_972H-tRNAs.fa"
 					args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/schiPomb-eColitK/schiPomb_eschColi-tRNAs.out"
@@ -400,15 +400,15 @@ def main():
 				if args.species == 'Atha':
 					args.trnas = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-tRNAs.fa"
 					args.trnaout = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-tRNAs-detailed.out"
-					# two mito files here for plastid and mito references
-					args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-plastidtRNAs.fa " + os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-mitotRNAs.fa"
+					args.mito = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-mitotRNAs.fa"
+					# plastid reference needed for A.thaliana
+					args.plastid = os.path.dirname(os.path.realpath(__file__)) + "/data/araTha1-eColitK/araTha1-plastidtRNAs.fa "
 			else:
 				args.species = args.trnas.split("/")[-1].split(".")[0]
 			mimseq(args.trnas, args.trnaout, args.name, args.species, args.out, args.cluster, args.cluster_id, args.cov_diff, \
 				args.posttrans, args.control_cond, args.threads, args.max_multi, args.snp_tolerance, \
 				args.keep_temp, args.cca, args.double_cca, args.min_cov, args.mismatches, args.remap, args.remap_mismatches, \
-				args.misinc_thresh, args.mito, args.pretrnas, args.local_mod, args.p_adj, args.sampledata)
+				args.misinc_thresh, args.mito, args.plastid, args.pretrnas, args.local_mod, args.p_adj, args.sampledata)
 
 if __name__ == '__main__':
 	main()
-
diff --git a/build/lib/mimseq/mmQuant.py b/build/lib/mimseq/mmQuant.py
@@ -664,7 +664,10 @@ def generateModsTable(sampleGroups, out_dir, name, threads, min_cov, mismatch_di
 			splitBool[cluster].update(set(filtMembers))
 
 			# generate new unsplit cluster names and add to the lookup dictionary (keep tRX naming for these isodecoders)
-			member_IsoNums = tuple(int(iso.split("-")[-2]) if "tRNA" in iso else "tRX" + iso.split("-")[-2] for iso in filtMembers)
+			if "tRNA" in cluster:
+				member_IsoNums = tuple(int(iso.split("-")[-2]) if "tRNA" in iso else "tRX" + iso.split("-")[-2] for iso in filtMembers)
+			elif "tRX" in cluster:
+				member_IsoNums = tuple(int(iso.split("-")[-2]) if "tRX" in iso else "tRNA" + iso.split("-")[-2] for iso in filtMembers)
 			member_IsoNums = sorted(tuple(x for x in member_IsoNums if isinstance(x, int))) + [x for x in member_IsoNums if isinstance(x, str)]
 			member_IsoString = "/" + "/".join([str(iso) for iso in member_IsoNums])
 			newClusterName = "-".join(cluster.split("-")[:-1]) + member_IsoString

diff --git a/build/lib/mimseq/splitClusters.py b/build/lib/mimseq/splitClusters.py
@@ -411,8 +411,11 @@ def getDeconvSizes(splitBool, tRNA_dict, cluster_dict, unique_isodecoderMMs):
     # generate new names for unsplit clusters (tRNA-AA-Anti-IsoX/Y) - write to lookup for later
     unsplitCluster_lookup = defaultdict()
     for cluster, isos in splitBool.items():
-        member_IsoNums = tuple(int(iso.split("-")[-2]) for iso in isos)
-        member_IsoNums = sorted(member_IsoNums)
+        if "tRNA" in cluster:
+            member_IsoNums = tuple(int(iso.split("-")[-2]) if "tRNA" in iso else "tRX" + iso.split("-")[-2] for iso in isos)
+        elif "tRX" in cluster:
+            member_IsoNums = tuple(int(iso.split("-")[-2]) if "tRX" in iso else "tRNA" + iso.split("-")[-2] for iso in isos)
+        member_IsoNums = sorted(tuple(x for x in member_IsoNums if isinstance(x, int))) + [x for x in member_IsoNums if isinstance(x, str)]
         member_IsoString = "/" + "/".join([str(iso) for iso in member_IsoNums])
         newClusterName = "-".join(cluster.split("-")[:-1]) + member_IsoString
         shortClusterName = "-".join(cluster.split("-")[:-1])
@@ -466,11 +469,18 @@ def writeIsodecoderInfo(out_dir, experiment_name, isodecoder_sizes, readRef_unsp
         count = 0
         for num in name.split("-")[-1].split("/"):
             count += 1
-            if not "tRX" in num:
-                iso = base + "-" + num
-            else:
-                num = num.replace("tRX", "")
-                iso = base.replace("tRNA", "tRX") + "-" + num
+            if "tRNA" in base:
+                if not "tRX" in num:
+                    iso = base + "-" + num
+                else:
+                    num = num.replace("tRX", "")
+                    iso = base.replace("tRNA", "tRX") + "-" + num
+            elif "tRX" in base:
+                if not "tRNA" in num:
+                    iso = base + "-" + num
+                else:
+                    num = num.replace("tRX", "")
+                    iso = base.replace("tRNA", "tRX") + "-" + num
             isodecoder_list = [x for x in tRNA_dict.keys() if iso == "-".join(x.split("-")[:-1]) and not "chr" in x]
             iso_min = min([x.split("-")[-1] for x in isodecoder_list])
             gene = iso + "-" + str(iso_min)