From 9b6951e72101aee12b5dcf109e81abea03cfbb93 Mon Sep 17 00:00:00 2001
From: amos <raphenar@mcmaster.ca>
Date: Thu, 24 Aug 2023 15:28:58 -0400
Subject: [PATCH] Fix snsidedict typo, add empty hit_start/hit_end keys for protein input, strip trailing whitespace

---
 app/ConvertRGIJsonToTSV.py |  2 +-
 app/HomologModel.py        | 27 +++++++++++++++----------
 app/OverexpressionModel.py | 41 ++++++++++++++++++++++----------------
 app/VariantModel.py        | 39 +++++++++++++++++++-----------------
 4 files changed, 63 insertions(+), 46 deletions(-)

diff --git a/app/ConvertRGIJsonToTSV.py b/app/ConvertRGIJsonToTSV.py
index 43c8708..492bd5b 100755
--- a/app/ConvertRGIJsonToTSV.py
+++ b/app/ConvertRGIJsonToTSV.py
@@ -10,7 +10,7 @@ def __init__(self, filepath, homolog_file=None, variant_file=None, overexpressio
 		name, ext = os.path.splitext(f_name)
 		self.filepath = os.path.join(f_path, "{}.json".format(f_name))
 		if ext.lower() == ".json":
-			self.filepath = os.path.join(f_path, "{}{}".format(name,ext))			
+			self.filepath = os.path.join(f_path, "{}{}".format(name,ext))
 		self.homolog_file = homolog_file
 		self.variant_file = variant_file
 		self.overexpression_file = overexpression_file
diff --git a/app/HomologModel.py b/app/HomologModel.py
index 51efbd9..b163c2c 100644
--- a/app/HomologModel.py
+++ b/app/HomologModel.py
@@ -65,9 +65,9 @@ def run(self):
 						else:
 							c += 1
 					orffrom = orfInfo[c:]
-				
+
 					modelTypeID = self.extract_nth_bar(alignTitle, 0)
-					
+
 					if modelTypeID == 40292:
 						spacepos = alignTitle.index(' ')
 						hitid = alignTitle[0:spacepos]
@@ -78,7 +78,7 @@ def run(self):
 						seqinModel = modelDescrpt[underscoreinMD+1: modelDescrpt.index(' ')]
 
 						pass_bitscore = "{}".format(self.extract_nth_bar(alignment.title, 1))
-						pass_evalue = "{}".format("n/a")		
+						pass_evalue = "{}".format("n/a")
 
 						# logger.info("pass_evalue: {}".format(pass_evalue))
 						# logger.info("pass_bitscore: {}".format(pass_bitscore))
@@ -94,7 +94,7 @@ def run(self):
 								card_sequence = str(json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"])
 							except Exception as e:
 								logger.warning("Exception : {} -> {} -> Model({}) missing in database. Please generate new database.".format(type(e), e, modelID))
-								
+
 							# if predicted_genes_dict:
 							# 	if orfInfo.strip() in predicted_genes_dict.keys():
 							# 		orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo.decode()], generic_dna).translate(table=11)).strip("*")
@@ -153,7 +153,7 @@ def run(self):
 										ppinsidedict["hit_end"] = (hsp.sbjct_end)*3
 
 										if orfInfo.decode().split(' # ')[0] in predicted_genes_dict:
-											ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] 
+											ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]]
 											# ppinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]], generic_dna).translate(table=11)).strip("*")
 											ppinsidedict["orf_prot_sequence"] =  orf_protein_sequence
 										else:
@@ -165,6 +165,9 @@ def run(self):
 										ppinsidedict["query_end"] = hsp.query_start + realQueryLength
 										ppinsidedict["query_from"] = blast_record.query
 										ppinsidedict["orf_prot_sequence"] = orf_protein_sequence
+										ppinsidedict["hit_start"] = ""
+										ppinsidedict["hit_end"] = ""
+
 
 									elif self.input_type == 'read':
 										pass
@@ -179,7 +182,7 @@ def run(self):
 									insidedict = {}
 									insidedict["type_match"] = "Strict"
 									insidedict["orf_strand"] = self.extract_nth_bar(orfInfo.decode(), 0)
-									insidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1)							
+									insidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1)
 									insidedict["orf_end"] = self.extract_nth_bar(orfInfo.decode(), 2)
 									insidedict["orf_from"] = orffrom.decode()
 									insidedict["model_name"] = json_data[modelID]["model_name"]
@@ -214,20 +217,22 @@ def run(self):
 										insidedict["orf_from"] = self.extract_nth_hash(orfInfo.decode(), 0).rstrip()
 										insidedict["hit_start"] = (hsp.sbjct_start-1)*3
 										insidedict["hit_end"] = (hsp.sbjct_end)*3
-										
+
 										if orfInfo.decode().split(' # ')[0] in predicted_genes_dict:
-											insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] 
+											insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]]
 											# insidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]], generic_dna).translate(table=11)).strip("*")
 											insidedict["orf_prot_sequence"] = orf_protein_sequence
 										else:
 											insidedict["orf_dna_sequence"] = ""
-											insidedict["orf_prot_sequence"] = ""									
+											insidedict["orf_prot_sequence"] = ""
 
 									elif self.input_type == 'protein':
 										insidedict["query_start"] = hsp.query_start
 										insidedict["query_end"] = hsp.query_start + realQueryLength
 										insidedict["query_from"] = blast_record.query
 										insidedict["orf_prot_sequence"] = orf_protein_sequence
+										insidedict["hit_start"] = ""
+										insidedict["hit_end"] = ""
 
 									elif self.input_type == 'read':
 										pass
@@ -293,6 +298,9 @@ def run(self):
 										linsidedict["query_end"] = hsp.query_start + realQueryLength
 										linsidedict["query_from"] = blast_record.query
 										linsidedict["orf_prot_sequence"] = orf_protein_sequence
+										linsidedict["hit_start"] = ""
+										linsidedict["hit_end"] = ""
+
 
 									elif self.input_type == 'read':
 										pass
@@ -308,4 +316,3 @@ def run(self):
 				blastResults = self.results(blastResults, blast_record.query, perfect, strict , loose, self.include_nudge)
 
 			return blastResults
-
diff --git a/app/OverexpressionModel.py b/app/OverexpressionModel.py
index 12ce4c7..ea79b3b 100644
--- a/app/OverexpressionModel.py
+++ b/app/OverexpressionModel.py
@@ -89,7 +89,7 @@ def run(self):
 						temp = ""
 
 						pass_bitscore = "{}".format(self.extract_nth_bar(alignment.title, 1))
-						pass_evalue = "{}".format("n/a")		
+						pass_evalue = "{}".format("n/a")
 
 						# logger.debug("pass_evalue: {}".format(pass_evalue))
 						# logger.debug("pass_bitscore: {}".format(pass_bitscore))
@@ -110,7 +110,7 @@ def run(self):
 								card_sequence = ""
 
 							orf_protein_sequence = ""
-							
+
 							if predicted_genes_dict:
 								if orfInfo.strip() in predicted_genes_dict.keys():
 									orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo.decode()]).translate(table=11)).strip("*")
@@ -168,10 +168,10 @@ def run(self):
 										ppinsidedict["orf_from"] = self.extract_nth_hash(orfInfo.decode(), 0).rstrip()
 										ppinsidedict["hit_start"] = (hsp.sbjct_start-1)*3
 										ppinsidedict["hit_end"] = (hsp.sbjct_end)*3
-                
+
 
 										if orfInfo.decode().split(' # ')[0] in predicted_genes_dict:
-											ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] 
+											ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]]
 											ppinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]]).translate(table=11)).strip("*")
 											# ppinsidedict["orf_prot_sequence"] = orf_protein_sequence
 										else:
@@ -183,6 +183,8 @@ def run(self):
 										ppinsidedict["query_end"] = hsp.query_start + realQueryLength
 										ppinsidedict["query_from"] = blast_record.query
 										ppinsidedict["orf_prot_sequence"] = orf_protein_sequence
+										ppinsidedict["hit_start"] = ""
+										ppinsidedict["hit_end"] = ""
 
 									elif self.input_type == 'read':
 										pass
@@ -212,14 +214,14 @@ def run(self):
 												sinsidedict = {}
 												sinsidedict["type_match"] = "Strict"
 												sinsidedict["orf_strand"] = self.extract_nth_bar(orfInfo.decode(), 0)
-												sinsidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1)							
+												sinsidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1)
 												sinsidedict["orf_end"] = self.extract_nth_bar(orfInfo.decode(), 2)
 												sinsidedict["orf_from"] = orffrom.decode()
 												sinsidedict["model_name"] = json_data[modelID]["model_name"]
 												sinsidedict["model_type"] = json_data[modelID]["model_type"]
 												sinsidedict["model_type_id"] = modelTypeID
 												sinsidedict["model_id"] = modelID
-												sinsidedict["snp"] = eachs							
+												sinsidedict["snp"] = eachs
 												sinsidedict["pass_evalue"] = pass_evalue
 												sinsidedict["pass_bitscore"] = pass_bitscore
 												sinsidedict["ARO_accession"] = json_data[modelID]["ARO_accession"]
@@ -247,21 +249,23 @@ def run(self):
 													sinsidedict["orf_end"] = self.extract_nth_hash(orfInfo.decode(), 2)
 													sinsidedict["orf_from"] = self.extract_nth_hash(orfInfo.decode(), 0).rstrip()
 													sinsidedict["hit_start"] = (hsp.sbjct_start-1)*3
-													snsidedict["hit_end"] = (hsp.sbjct_end)*3
-													
+													sinsidedict["hit_end"] = (hsp.sbjct_end)*3
+
 													if orfInfo.decode().split(' # ')[0] in predicted_genes_dict:
-														sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] 
+														sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]]
 														sinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]]).translate(table=11)).strip("*")
 														# sinsidedict["orf_prot_sequence"] = orf_protein_sequence
 													else:
 														sinsidedict["orf_dna_sequence"] = ""
-														sinsidedict["orf_prot_sequence"] = ""									
+														sinsidedict["orf_prot_sequence"] = ""
 
 												elif self.input_type == 'protein':
 													sinsidedict["query_start"] = hsp.query_start
 													sinsidedict["query_end"] = hsp.query_start + realQueryLength
 													sinsidedict["query_from"] = blast_record.query
 													sinsidedict["orf_prot_sequence"] = orf_protein_sequence
+													sinsidedict["hit_start"] = ""
+													sinsidedict["hit_end"] = ""
 
 												elif self.input_type == 'read':
 													pass
@@ -273,11 +277,11 @@ def run(self):
 									else:
 										if snp_counter == 0:
 											"""If no SNP detected in strict hit."""
-											# logger.debug("Strict hits - no SNP") 
+											# logger.debug("Strict hits - no SNP")
 											insidedict = {}
 											insidedict["type_match"] = "Strict"
 											insidedict["orf_strand"] = self.extract_nth_bar(orfInfo.decode(), 0)
-											insidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1)							
+											insidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1)
 											insidedict["orf_end"] = self.extract_nth_bar(orfInfo.decode(), 2)
 											insidedict["orf_from"] = orffrom.decode()
 											insidedict["model_name"] = json_data[modelID]["model_name"]
@@ -312,20 +316,22 @@ def run(self):
 												insidedict["orf_from"] = self.extract_nth_hash(orfInfo.decode(), 0).rstrip()
 												insidedict["hit_start"] = (hsp.sbjct_start-1)*3
 												insidedict["hit_end"] = (hsp.sbjct_end)*3
-												
+
 												if orfInfo.decode().split(' # ')[0] in predicted_genes_dict:
-													insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] 
+													insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]]
 													insidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]]).translate(table=11)).strip("*")
 													# insidedict["orf_prot_sequence"] = orf_protein_sequence
 												else:
 													insidedict["orf_dna_sequence"] = ""
-													insidedict["orf_prot_sequence"] = ""									
+													insidedict["orf_prot_sequence"] = ""
 
 											elif self.input_type == 'protein':
 												insidedict["query_start"] = hsp.query_start
 												insidedict["query_end"] = hsp.query_start + realQueryLength
 												insidedict["query_from"] = blast_record.query
 												insidedict["orf_prot_sequence"] = orf_protein_sequence
+												insidedict["hit_start"] = ""
+												insidedict["hit_end"] = ""
 
 											elif self.input_type == 'read':
 												pass
@@ -390,6 +396,8 @@ def run(self):
 										linsidedict["query_end"] = hsp.query_start + realQueryLength
 										linsidedict["query_from"] = blast_record.query
 										linsidedict["orf_prot_sequence"] = orf_protein_sequence
+										linsidedict["hit_start"] = ""
+										linsidedict["hit_end"] = ""
 
 									elif self.input_type == 'read':
 										pass
@@ -403,6 +411,5 @@ def run(self):
 								logger.warning("{} ---> hsp.bits: {} {} ? {}".format(json_data[modelID]["model_name"],hsp.bits, type(hsp.bits), type(pass_bitscore)))
 
 				blastResults = self.results(blastResults, blast_record.query, perfect, strict , loose, self.include_nudge)
-				
-			return blastResults
 
+			return blastResults
diff --git a/app/VariantModel.py b/app/VariantModel.py
index 6acbf4b..db2fd05 100644
--- a/app/VariantModel.py
+++ b/app/VariantModel.py
@@ -62,7 +62,7 @@ def run(self):
 						else:
 							c += 1
 					orf_from = orf_info[c:]
-					
+
 					model_type_id = self.extract_nth_bar(align_title, 0)
 					# logger.info("model_type_id: {} ".format(model_type_id))
 					space_pos = align_title.index(' ')
@@ -81,7 +81,7 @@ def run(self):
 						except ValueError:
 							true_pass_evalue = float(pass_value[0:pass_value.find(' ')])
 
-						# logger.info("mutation | model_type_id = " + str(align_title))				
+						# logger.info("mutation | model_type_id = " + str(align_title))
 						init = 0
 						evalue_snp = self.extract_nth_bar(align_title, 2)
 						snpl = []
@@ -89,15 +89,15 @@ def run(self):
 						temp = ""
 						evalue_snp_dec = evalue_snp
 						snpl = evalue_snp_dec.split(',')
-						
+
 						for each_snp in snpl:
 							snp_dict_list.append({"original": each_snp[0], "change": each_snp[-1], "position": int(each_snp[1:-1])})
 
 						for hsp in alignment.hsps:
 							query_seq =  hsp.query.replace('-', '')
-							real_query_length = len(query_seq) 
-							sbjct_seq = hsp.sbjct.replace('-', '') 
-							real_sbjct_length = len(sbjct_seq) 
+							real_query_length = len(query_seq)
+							sbjct_seq = hsp.sbjct.replace('-', '')
+							real_sbjct_length = len(sbjct_seq)
 
 							for eachs in snp_dict_list:
 								pos = eachs["position"]
@@ -121,7 +121,7 @@ def run(self):
 
 									if submitted_proteins_dict:
 										orf_protein_sequence = str(submitted_proteins_dict[orf_info.decode().split(" ")[0]])
-										
+
 									# logger.info("mutation | Model:"+str(model_id) + " | pos:" +str(pos) +" | change: "+str(hsp.query[pos - hsp.sbjct_start + \
 									# 			self.find_num_dash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(chan) + " AND wildtype: " + str(hsp.sbjct[pos - hsp.sbjct_start \
 									# 			+self.find_num_dash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(ori))
@@ -131,7 +131,7 @@ def run(self):
 									sbj = int(pos) - hsp.sbjct_start + self.find_num_dash(hsp.sbjct, (int(pos) - hsp.sbjct_start))
 
 									if hsp.query[qry] == chan:
-										query_snps = {}	
+										query_snps = {}
 										# logger.debug("mutation | Model:"+str(model_id) + " | pos:" +str(pos) +" | change: "+str(hsp.query[pos - hsp.sbjct_start + \
 										# 		self.find_num_dash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(chan) + " AND wildtype: " + str(hsp.sbjct[pos - hsp.sbjct_start \
 										# 		+self.find_num_dash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(ori))
@@ -143,7 +143,7 @@ def run(self):
 										# logger.debug("query_snp on frame {} {}".format(hsp.frame, json.dumps(query_snps, indent=2)))
 
 										try:
-											if float(hsp.bits) >= float(true_pass_evalue):		
+											if float(hsp.bits) >= float(true_pass_evalue):
 												sinsidedict = {}
 												sinsidedict["type_match"] = "Strict"
 												sinsidedict["snp"] = eachs
@@ -174,7 +174,7 @@ def run(self):
 													sinsidedict["partial"] = json_data[model_id]["model_sequences"]["sequence"][seq_in_model]["dna_sequence"]["partial"]
 												else:
 													sinsidedict["partial"] = "0"
-												
+
 												if self.input_type == 'contig':
 													sinsidedict["query_start"] = self.extract_nth_hash(orf_info.decode(), 1) + (hsp.query_start - 1)*3
 													sinsidedict["query_end"] = self.extract_nth_hash(orf_info.decode(), 1) + (hsp.query_start - 1)*3 + real_query_length*3 - 1
@@ -184,15 +184,15 @@ def run(self):
 													sinsidedict["orf_from"] = self.extract_nth_hash(orf_info.decode(), 0)
 													sinsidedict["hit_start"] = (hsp.sbjct_start-1)*3
 													sinsidedict["hit_end"] = (hsp.sbjct_end)*3
-													
+
 
 													if orf_info.decode().split(' # ')[0] in predicted_genes_dict:
-														sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orf_info.decode().split(' # ')[0]] 
+														sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orf_info.decode().split(' # ')[0]]
 														# sinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orf_info.decode().split(' # ')[0]], generic_dna).translate(table=11)).strip("*")
 														sinsidedict["orf_prot_sequence"] = orf_protein_sequence
 													else:
 														sinsidedict["orf_dna_sequence"] = ""
-														sinsidedict["orf_prot_sequence"] = ""	
+														sinsidedict["orf_prot_sequence"] = ""
 
 
 												elif self.input_type == 'protein':
@@ -200,6 +200,8 @@ def run(self):
 													sinsidedict["query_end"] = hsp.query_start + real_query_length
 													sinsidedict["query_from"] = blast_record.query
 													sinsidedict["orf_prot_sequence"] = orf_protein_sequence
+													sinsidedict["hit_start"] = ""
+													sinsidedict["hit_end"] = ""
 
 												elif self.input_type == 'read':
 													pass
@@ -215,7 +217,7 @@ def run(self):
 												slinsidedict["snp"] = eachs
 												slinsidedict["query_snp"] = query_snps
 												slinsidedict["orf_strand"] = self.extract_nth_bar(orf_info.decode(), 0)
-												slinsidedict["orf_start"] = self.extract_nth_bar(orf_info.decode(), 1)				
+												slinsidedict["orf_start"] = self.extract_nth_bar(orf_info.decode(), 1)
 												slinsidedict["orf_end"] = self.extract_nth_bar(orf_info.decode(), 2)
 												slinsidedict["orf_from"] = orf_from.decode()
 												slinsidedict["model_name"] = json_data[model_id]["model_name"]
@@ -252,18 +254,20 @@ def run(self):
 													slinsidedict["hit_end"] = (hsp.sbjct_end)*3
 
 													if orf_info.decode().split(' # ')[0] in predicted_genes_dict:
-														slinsidedict["orf_dna_sequence"] = predicted_genes_dict[orf_info.decode().split(' # ')[0]] 
+														slinsidedict["orf_dna_sequence"] = predicted_genes_dict[orf_info.decode().split(' # ')[0]]
 														# slinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orf_info.decode().split(' # ')[0]], generic_dna).translate(table=11)).strip("*")
 														slinsidedict["orf_prot_sequence"] = orf_protein_sequence
 													else:
 														slinsidedict["orf_dna_sequence"] = ""
-														slinsidedict["orf_prot_sequence"] = ""	
+														slinsidedict["orf_prot_sequence"] = ""
 
 												elif self.input_type == 'protein':
 													slinsidedict["query_start"] = hsp.query_start
 													slinsidedict["query_end"] = hsp.query_start + real_query_length
 													slinsidedict["query_from"] = blast_record.query
 													slinsidedict["orf_prot_sequence"] = orf_protein_sequence
+													slinsidedict["hit_start"] = ""
+													slinsidedict["hit_end"] = ""
 
 												elif self.input_type == 'read':
 													pass
@@ -277,6 +281,5 @@ def run(self):
 											logger.warning("{} ---> hsp.bits: {} {} ? {}".format(json_data[model_id]["model_name"],hsp.bits,type(hsp.bits), type(true_pass_evalue)))
 
 				blastResults = self.results(blastResults, blast_record.query, perfect, strict , loose, self.include_nudge)
-						
+
 			return blastResults
-