From 9b6951e72101aee12b5dcf109e81abea03cfbb93 Mon Sep 17 00:00:00 2001 From: amos Date: Thu, 24 Aug 2023 15:28:58 -0400 Subject: [PATCH] fixes typos and adds missing variables --- app/ConvertRGIJsonToTSV.py | 2 +- app/HomologModel.py | 27 +++++++++++++++---------- app/OverexpressionModel.py | 41 ++++++++++++++++++++++---------------- app/VariantModel.py | 39 +++++++++++++++++++----------------- 4 files changed, 63 insertions(+), 46 deletions(-) diff --git a/app/ConvertRGIJsonToTSV.py b/app/ConvertRGIJsonToTSV.py index 43c8708..492bd5b 100755 --- a/app/ConvertRGIJsonToTSV.py +++ b/app/ConvertRGIJsonToTSV.py @@ -10,7 +10,7 @@ def __init__(self, filepath, homolog_file=None, variant_file=None, overexpressio name, ext = os.path.splitext(f_name) self.filepath = os.path.join(f_path, "{}.json".format(f_name)) if ext.lower() == ".json": - self.filepath = os.path.join(f_path, "{}{}".format(name,ext)) + self.filepath = os.path.join(f_path, "{}{}".format(name,ext)) self.homolog_file = homolog_file self.variant_file = variant_file self.overexpression_file = overexpression_file diff --git a/app/HomologModel.py b/app/HomologModel.py index 51efbd9..b163c2c 100644 --- a/app/HomologModel.py +++ b/app/HomologModel.py @@ -65,9 +65,9 @@ def run(self): else: c += 1 orffrom = orfInfo[c:] - + modelTypeID = self.extract_nth_bar(alignTitle, 0) - + if modelTypeID == 40292: spacepos = alignTitle.index(' ') hitid = alignTitle[0:spacepos] @@ -78,7 +78,7 @@ def run(self): seqinModel = modelDescrpt[underscoreinMD+1: modelDescrpt.index(' ')] pass_bitscore = "{}".format(self.extract_nth_bar(alignment.title, 1)) - pass_evalue = "{}".format("n/a") + pass_evalue = "{}".format("n/a") # logger.info("pass_evalue: {}".format(pass_evalue)) # logger.info("pass_bitscore: {}".format(pass_bitscore)) @@ -94,7 +94,7 @@ def run(self): card_sequence = str(json_data[modelID]["model_sequences"]["sequence"][seqinModel]["protein_sequence"]["sequence"]) except Exception as e: logger.warning("Exception : {} -> {} -> Model({}) missing in database. Please generate new database.".format(type(e), e, modelID)) - + # if predicted_genes_dict: # if orfInfo.strip() in predicted_genes_dict.keys(): # orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo.decode()], generic_dna).translate(table=11)).strip("*") @@ -153,7 +153,7 @@ def run(self): ppinsidedict["hit_end"] = (hsp.sbjct_end)*3 if orfInfo.decode().split(' # ')[0] in predicted_genes_dict: - ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] + ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] # ppinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]], generic_dna).translate(table=11)).strip("*") ppinsidedict["orf_prot_sequence"] = orf_protein_sequence else: @@ -165,6 +165,9 @@ def run(self): ppinsidedict["query_end"] = hsp.query_start + realQueryLength ppinsidedict["query_from"] = blast_record.query ppinsidedict["orf_prot_sequence"] = orf_protein_sequence + ppinsidedict["hit_start"] = "" + ppinsidedict["hit_end"] = "" + elif self.input_type == 'read': pass @@ -179,7 +182,7 @@ def run(self): insidedict = {} insidedict["type_match"] = "Strict" insidedict["orf_strand"] = self.extract_nth_bar(orfInfo.decode(), 0) - insidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1) + insidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1) insidedict["orf_end"] = self.extract_nth_bar(orfInfo.decode(), 2) insidedict["orf_from"] = orffrom.decode() insidedict["model_name"] = json_data[modelID]["model_name"] @@ -214,20 +217,22 @@ def run(self): insidedict["orf_from"] = self.extract_nth_hash(orfInfo.decode(), 0).rstrip() insidedict["hit_start"] = (hsp.sbjct_start-1)*3 insidedict["hit_end"] = (hsp.sbjct_end)*3 - + if orfInfo.decode().split(' # ')[0] in predicted_genes_dict: - insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] + insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] # insidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]], generic_dna).translate(table=11)).strip("*") insidedict["orf_prot_sequence"] = orf_protein_sequence else: insidedict["orf_dna_sequence"] = "" - insidedict["orf_prot_sequence"] = "" + insidedict["orf_prot_sequence"] = "" elif self.input_type == 'protein': insidedict["query_start"] = hsp.query_start insidedict["query_end"] = hsp.query_start + realQueryLength insidedict["query_from"] = blast_record.query insidedict["orf_prot_sequence"] = orf_protein_sequence + insidedict["hit_start"] = "" + insidedict["hit_end"] = "" elif self.input_type == 'read': pass @@ -293,6 +298,9 @@ def run(self): linsidedict["query_end"] = hsp.query_start + realQueryLength linsidedict["query_from"] = blast_record.query linsidedict["orf_prot_sequence"] = orf_protein_sequence + linsidedict["hit_start"] = "" + linsidedict["hit_end"] = "" + elif self.input_type == 'read': pass @@ -308,4 +316,3 @@ def run(self): blastResults = self.results(blastResults, blast_record.query, perfect, strict , loose, self.include_nudge) return blastResults - diff --git a/app/OverexpressionModel.py b/app/OverexpressionModel.py index 12ce4c7..ea79b3b 100644 --- a/app/OverexpressionModel.py +++ b/app/OverexpressionModel.py @@ -89,7 +89,7 @@ def run(self): temp = "" pass_bitscore = "{}".format(self.extract_nth_bar(alignment.title, 1)) - pass_evalue = "{}".format("n/a") + pass_evalue = "{}".format("n/a") # logger.debug("pass_evalue: {}".format(pass_evalue)) # logger.debug("pass_bitscore: {}".format(pass_bitscore)) @@ -110,7 +110,7 @@ def run(self): card_sequence = "" orf_protein_sequence = "" - + if predicted_genes_dict: if orfInfo.strip() in predicted_genes_dict.keys(): orf_protein_sequence = str(Seq(predicted_genes_dict[orfInfo.decode()]).translate(table=11)).strip("*") @@ -168,10 +168,10 @@ def run(self): ppinsidedict["orf_from"] = self.extract_nth_hash(orfInfo.decode(), 0).rstrip() ppinsidedict["hit_start"] = (hsp.sbjct_start-1)*3 ppinsidedict["hit_end"] = (hsp.sbjct_end)*3 - + if orfInfo.decode().split(' # ')[0] in predicted_genes_dict: - ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] + ppinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] ppinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]]).translate(table=11)).strip("*") # ppinsidedict["orf_prot_sequence"] = orf_protein_sequence else: @@ -183,6 +183,8 @@ def run(self): ppinsidedict["query_end"] = hsp.query_start + realQueryLength ppinsidedict["query_from"] = blast_record.query ppinsidedict["orf_prot_sequence"] = orf_protein_sequence + ppinsidedict["hit_start"] = "" + ppinsidedict["hit_end"] = "" elif self.input_type == 'read': pass @@ -212,14 +214,14 @@ def run(self): sinsidedict = {} sinsidedict["type_match"] = "Strict" sinsidedict["orf_strand"] = self.extract_nth_bar(orfInfo.decode(), 0) - sinsidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1) + sinsidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1) sinsidedict["orf_end"] = self.extract_nth_bar(orfInfo.decode(), 2) sinsidedict["orf_from"] = orffrom.decode() sinsidedict["model_name"] = json_data[modelID]["model_name"] sinsidedict["model_type"] = json_data[modelID]["model_type"] sinsidedict["model_type_id"] = modelTypeID sinsidedict["model_id"] = modelID - sinsidedict["snp"] = eachs + sinsidedict["snp"] = eachs sinsidedict["pass_evalue"] = pass_evalue sinsidedict["pass_bitscore"] = pass_bitscore sinsidedict["ARO_accession"] = json_data[modelID]["ARO_accession"] @@ -247,21 +249,23 @@ def run(self): sinsidedict["orf_end"] = self.extract_nth_hash(orfInfo.decode(), 2) sinsidedict["orf_from"] = self.extract_nth_hash(orfInfo.decode(), 0).rstrip() sinsidedict["hit_start"] = (hsp.sbjct_start-1)*3 - snsidedict["hit_end"] = (hsp.sbjct_end)*3 - + sinsidedict["hit_end"] = (hsp.sbjct_end)*3 + if orfInfo.decode().split(' # ')[0] in predicted_genes_dict: - sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] + sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] sinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]]).translate(table=11)).strip("*") # sinsidedict["orf_prot_sequence"] = orf_protein_sequence else: sinsidedict["orf_dna_sequence"] = "" - sinsidedict["orf_prot_sequence"] = "" + sinsidedict["orf_prot_sequence"] = "" elif self.input_type == 'protein': sinsidedict["query_start"] = hsp.query_start sinsidedict["query_end"] = hsp.query_start + realQueryLength sinsidedict["query_from"] = blast_record.query sinsidedict["orf_prot_sequence"] = orf_protein_sequence + sinsidedict["hit_start"] = "" + sinsidedict["hit_end"] = "" elif self.input_type == 'read': pass @@ -273,11 +277,11 @@ def run(self): else: if snp_counter == 0: """If no SNP detected in strict hit.""" - # logger.debug("Strict hits - no SNP") + # logger.debug("Strict hits - no SNP") insidedict = {} insidedict["type_match"] = "Strict" insidedict["orf_strand"] = self.extract_nth_bar(orfInfo.decode(), 0) - insidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1) + insidedict["orf_start"] = self.extract_nth_bar(orfInfo.decode(), 1) insidedict["orf_end"] = self.extract_nth_bar(orfInfo.decode(), 2) insidedict["orf_from"] = orffrom.decode() insidedict["model_name"] = json_data[modelID]["model_name"] @@ -312,20 +316,22 @@ def run(self): insidedict["orf_from"] = self.extract_nth_hash(orfInfo.decode(), 0).rstrip() insidedict["hit_start"] = (hsp.sbjct_start-1)*3 insidedict["hit_end"] = (hsp.sbjct_end)*3 - + if orfInfo.decode().split(' # ')[0] in predicted_genes_dict: - insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] + insidedict["orf_dna_sequence"] = predicted_genes_dict[orfInfo.decode().split(' # ')[0]] insidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orfInfo.decode().split(' # ')[0]]).translate(table=11)).strip("*") # insidedict["orf_prot_sequence"] = orf_protein_sequence else: insidedict["orf_dna_sequence"] = "" - insidedict["orf_prot_sequence"] = "" + insidedict["orf_prot_sequence"] = "" elif self.input_type == 'protein': insidedict["query_start"] = hsp.query_start insidedict["query_end"] = hsp.query_start + realQueryLength insidedict["query_from"] = blast_record.query insidedict["orf_prot_sequence"] = orf_protein_sequence + insidedict["hit_start"] = "" + insidedict["hit_end"] = "" elif self.input_type == 'read': pass @@ -390,6 +396,8 @@ def run(self): linsidedict["query_end"] = hsp.query_start + realQueryLength linsidedict["query_from"] = blast_record.query linsidedict["orf_prot_sequence"] = orf_protein_sequence + linsidedict["hit_start"] = "" + linsidedict["hit_end"] = "" elif self.input_type == 'read': pass @@ -403,6 +411,5 @@ def run(self): logger.warning("{} ---> hsp.bits: {} {} ? {}".format(json_data[modelID]["model_name"],hsp.bits, type(hsp.bits), type(pass_bitscore))) blastResults = self.results(blastResults, blast_record.query, perfect, strict , loose, self.include_nudge) - - return blastResults + return blastResults diff --git a/app/VariantModel.py b/app/VariantModel.py index 6acbf4b..db2fd05 100644 --- a/app/VariantModel.py +++ b/app/VariantModel.py @@ -62,7 +62,7 @@ def run(self): else: c += 1 orf_from = orf_info[c:] - + model_type_id = self.extract_nth_bar(align_title, 0) # logger.info("model_type_id: {} ".format(model_type_id)) space_pos = align_title.index(' ') @@ -81,7 +81,7 @@ def run(self): except ValueError: true_pass_evalue = float(pass_value[0:pass_value.find(' ')]) - # logger.info("mutation | model_type_id = " + str(align_title)) + # logger.info("mutation | model_type_id = " + str(align_title)) init = 0 evalue_snp = self.extract_nth_bar(align_title, 2) snpl = [] @@ -89,15 +89,15 @@ def run(self): temp = "" evalue_snp_dec = evalue_snp snpl = evalue_snp_dec.split(',') - + for each_snp in snpl: snp_dict_list.append({"original": each_snp[0], "change": each_snp[-1], "position": int(each_snp[1:-1])}) for hsp in alignment.hsps: query_seq = hsp.query.replace('-', '') - real_query_length = len(query_seq) - sbjct_seq = hsp.sbjct.replace('-', '') - real_sbjct_length = len(sbjct_seq) + real_query_length = len(query_seq) + sbjct_seq = hsp.sbjct.replace('-', '') + real_sbjct_length = len(sbjct_seq) for eachs in snp_dict_list: pos = eachs["position"] @@ -121,7 +121,7 @@ def run(self): if submitted_proteins_dict: orf_protein_sequence = str(submitted_proteins_dict[orf_info.decode().split(" ")[0]]) - + # logger.info("mutation | Model:"+str(model_id) + " | pos:" +str(pos) +" | change: "+str(hsp.query[pos - hsp.sbjct_start + \ # self.find_num_dash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(chan) + " AND wildtype: " + str(hsp.sbjct[pos - hsp.sbjct_start \ # +self.find_num_dash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(ori)) @@ -131,7 +131,7 @@ def run(self): sbj = int(pos) - hsp.sbjct_start + self.find_num_dash(hsp.sbjct, (int(pos) - hsp.sbjct_start)) if hsp.query[qry] == chan: - query_snps = {} + query_snps = {} # logger.debug("mutation | Model:"+str(model_id) + " | pos:" +str(pos) +" | change: "+str(hsp.query[pos - hsp.sbjct_start + \ # self.find_num_dash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(chan) + " AND wildtype: " + str(hsp.sbjct[pos - hsp.sbjct_start \ # +self.find_num_dash(hsp.sbjct, (pos-hsp.sbjct_start))]) + "=" + str(ori)) @@ -143,7 +143,7 @@ def run(self): # logger.debug("query_snp on frame {} {}".format(hsp.frame, json.dumps(query_snps, indent=2))) try: - if float(hsp.bits) >= float(true_pass_evalue): + if float(hsp.bits) >= float(true_pass_evalue): sinsidedict = {} sinsidedict["type_match"] = "Strict" sinsidedict["snp"] = eachs @@ -174,7 +174,7 @@ def run(self): sinsidedict["partial"] = json_data[model_id]["model_sequences"]["sequence"][seq_in_model]["dna_sequence"]["partial"] else: sinsidedict["partial"] = "0" - + if self.input_type == 'contig': sinsidedict["query_start"] = self.extract_nth_hash(orf_info.decode(), 1) + (hsp.query_start - 1)*3 sinsidedict["query_end"] = self.extract_nth_hash(orf_info.decode(), 1) + (hsp.query_start - 1)*3 + real_query_length*3 - 1 @@ -184,15 +184,15 @@ def run(self): sinsidedict["orf_from"] = self.extract_nth_hash(orf_info.decode(), 0) sinsidedict["hit_start"] = (hsp.sbjct_start-1)*3 sinsidedict["hit_end"] = (hsp.sbjct_end)*3 - + if orf_info.decode().split(' # ')[0] in predicted_genes_dict: - sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orf_info.decode().split(' # ')[0]] + sinsidedict["orf_dna_sequence"] = predicted_genes_dict[orf_info.decode().split(' # ')[0]] # sinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orf_info.decode().split(' # ')[0]], generic_dna).translate(table=11)).strip("*") sinsidedict["orf_prot_sequence"] = orf_protein_sequence else: sinsidedict["orf_dna_sequence"] = "" - sinsidedict["orf_prot_sequence"] = "" + sinsidedict["orf_prot_sequence"] = "" elif self.input_type == 'protein': @@ -200,6 +200,8 @@ def run(self): sinsidedict["query_end"] = hsp.query_start + real_query_length sinsidedict["query_from"] = blast_record.query sinsidedict["orf_prot_sequence"] = orf_protein_sequence + sinsidedict["hit_start"] = "" + sinsidedict["hit_end"] = "" elif self.input_type == 'read': pass @@ -215,7 +217,7 @@ def run(self): slinsidedict["snp"] = eachs slinsidedict["query_snp"] = query_snps slinsidedict["orf_strand"] = self.extract_nth_bar(orf_info.decode(), 0) - slinsidedict["orf_start"] = self.extract_nth_bar(orf_info.decode(), 1) + slinsidedict["orf_start"] = self.extract_nth_bar(orf_info.decode(), 1) slinsidedict["orf_end"] = self.extract_nth_bar(orf_info.decode(), 2) slinsidedict["orf_from"] = orf_from.decode() slinsidedict["model_name"] = json_data[model_id]["model_name"] @@ -252,18 +254,20 @@ def run(self): slinsidedict["hit_end"] = (hsp.sbjct_end)*3 if orf_info.decode().split(' # ')[0] in predicted_genes_dict: - slinsidedict["orf_dna_sequence"] = predicted_genes_dict[orf_info.decode().split(' # ')[0]] + slinsidedict["orf_dna_sequence"] = predicted_genes_dict[orf_info.decode().split(' # ')[0]] # slinsidedict["orf_prot_sequence"] = str(Seq(predicted_genes_dict[orf_info.decode().split(' # ')[0]], generic_dna).translate(table=11)).strip("*") slinsidedict["orf_prot_sequence"] = orf_protein_sequence else: slinsidedict["orf_dna_sequence"] = "" - slinsidedict["orf_prot_sequence"] = "" + slinsidedict["orf_prot_sequence"] = "" elif self.input_type == 'protein': slinsidedict["query_start"] = hsp.query_start slinsidedict["query_end"] = hsp.query_start + real_query_length slinsidedict["query_from"] = blast_record.query slinsidedict["orf_prot_sequence"] = orf_protein_sequence + slinsidedict["hit_start"] = "" + slinsidedict["hit_end"] = "" elif self.input_type == 'read': pass @@ -277,6 +281,5 @@ def run(self): logger.warning("{} ---> hsp.bits: {} {} ? {}".format(json_data[model_id]["model_name"],hsp.bits,type(hsp.bits), type(true_pass_evalue))) blastResults = self.results(blastResults, blast_record.query, perfect, strict , loose, self.include_nudge) - + return blastResults -