diff --git a/src/parse/RheaParser.py b/src/parse/RheaParser.py index 31f2202..70b64a0 100644 --- a/src/parse/RheaParser.py +++ b/src/parse/RheaParser.py @@ -65,15 +65,22 @@ def __init__(self, resConfig): self.expasyLocalEc2ClassFile = "" - self.humanUniprotRecordDict = dict() + self.humanPrimaryUniprotRecordDict = dict() + + self.humanSecondaryUniprotRecordDict = dict() + + self.humanPrimaryUniprotAccSet = set() - self.humanUniprotAccSet = set() + self.humanSecondaryUniprotAccSet = set() self.chebiHumanIdSet = set() self.chebiCofactorId = "23357" self.chebiCofactorSet = set() + + self.parsingReviewedProteins = 0 + def processRhea(self): self.buildSupportingUniprotData() @@ -106,9 +113,6 @@ def processRhea(self): self.exportIntermediateFiles() - - - def buildSupportingUniprotData(self): print("building uniprot data store") uniParser = UniprotParser(self.config) @@ -117,22 +121,25 @@ def buildSupportingUniprotData(self): uniParser.parseUniprot() self.humanUniprotRecordDict = uniParser.uniprotRecords - threadedUniprotDict = dict() + threadedPrimaryUniprotDict = dict() + threadedSecondaryUniprotDict = dict() for acc in self.humanUniprotRecordDict: - self.humanUniprotAccSet.add(acc) - p = self.humanUniprotRecordDict[acc] - threadedUniprotDict[acc] = p + self.humanPrimaryUniprotAccSet.add(acc) + p = self.humanPrimaryUniprotRecordDict[acc] + threadedPrimaryUniprotDict[acc] = p for acc2 in p.secondaryAccs: - self.humanUniprotAccSet.add(acc2) - threadedUniprotDict[acc2] = p + self.humanSecondaryUniprotAccSet.add(acc2) + threadedSecondaryUniprotDict[acc2] = p - self.humanUniprotRecordDict = threadedUniprotDict + self.humanPrimaryUniprotRecordDict = threadedPrimaryUniprotDict + self.humanSecondaryUniprotRecordDict = threadedSecondaryUniprotDict print("in rhea uniprot build") - print("length of the dict"+str(len(self.humanUniprotRecordDict.keys()))) - print("length of the set"+str(len(self.humanUniprotAccSet))) - + print("length of the primary dict"+str(len(self.humanPrimaryUniprotRecordDict.keys()))) + print("length of the primary set"+str(len(self.humanPrimaryUniprotAccSet))) + print("length of the secondary dict"+str(len(self.humanSecondaryUniprotRecordDict.keys()))) + print("length of the secondary set"+str(len(self.humanSecondaryUniprotAccSet))) def buildSupportingChebiData(self): print("building chebi data store") @@ -198,10 +205,10 @@ def getRheaFiles(self): rhea2UniprotRemoteFile = uniprotToRheaConf.sourceFileName self.download_files(rhea2UniprotUrl, self.relDir + localDir + rhea2UniprotRemoteFile) - swissProtLocalFileName = uniprotToRheaConf.extractFileName + uniprotProtLocalFileName = uniprotToRheaConf.extractFileName # now gunzip with gzip.open(self.relDir + localDir + rhea2UniprotRemoteFile, 'rb') as f_in: - with open(self.relDir + localDir + swissProtLocalFileName, 'wb') as f_out: + with open(self.relDir + localDir + uniprotProtLocalFileName, 'wb') as f_out: shutil.copyfileobj(f_in, f_out) else: print("Using cached Rhea Uniprot-to-Rhea file.") @@ -760,9 +767,12 @@ def appendUniprotToReaction(self): #print(row) #print("appending protein accessions to reactions..." + str(row.RHEA_ID)+ " " +str(row.ID)) + uniprot = "uniprot:" + row.ID + # !!! just adding human uniprot - if ("uniprot:" + row.ID) in self.humanUniprotAccSet: + if uniprot in self.humanUniprotAccSet: #print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + unis = r2uMap.get("rhea:" + str(row.RHEA_ID)) if unis is None: unis = ['uniprot:'+row.ID] @@ -780,10 +790,10 @@ def appendUniprotToReaction(self): currRxn = self.rheaReactionDict.get(rxn, None) if currRxn is None: currRxn = self.rheaReactionDict.get("rhea:"+rxn, None) - if currRxn is not None: + if currRxn is not None: currRxn.proteins = uniSet #print("setting proteins, len:"+str(len(currRxn.proteins))) - + currRxn.addProteinSet(uniSet, self.humanUniprotRecordDict) # swiss prot @@ -816,7 +826,7 @@ def appendUniprotToReaction(self): if currRxn is not None: currRxn.proteins = uniSet #print("setting proteins, len:"+str(len(currRxn.proteins))) - + currRxn.addProteinSet(uniSet, self.humanUniprotRecordDict) def appendEcToReaction(self): diff --git a/src/parse/UniprotParser.py b/src/parse/UniprotParser.py index fe20b06..b9a80dd 100644 --- a/src/parse/UniprotParser.py +++ b/src/parse/UniprotParser.py @@ -29,6 +29,8 @@ def __init__(self, resConfig): self.uniprotRecords = dict() self.keyFields = ['AC', "DE", "GN", "ID", "DR"] + + self.parsingReviewedProteins = 0 def parseUniprot(self): @@ -63,12 +65,14 @@ def parseUniprot(self): print("starting to parse uniprot trembl dat file") print(extractFile) - self.parseUniprotFile(self.relDir + localDir + extractFile) + self.parsingReviewedProteins = 0 + self.parseUniprotFile(self.relDir + localDir + extractFile, 0) print("number of uniprot trembl records") tremblCount = len(self.uniprotRecords) print(str(tremblCount)) + # now add SwissProt human proteinConfig = self.resourceConfig.getConfig("swissprot_human") file_proteins = proteinConfig.sourceFileName @@ -97,7 +101,8 @@ def parseUniprot(self): print("starting to parse uniprot swissprot dat file") print(extractFile) - self.parseUniprotFile(self.relDir + localDir + extractFile) + self.parsingReviewedProteins = 1 + self.parseUniprotFile(self.relDir + localDir + extractFile, 1) print("number of uniprot trembl PLUSE swissprot records") tremblCount = len(self.uniprotRecords) @@ -108,7 +113,7 @@ def parseUniprot(self): - def parseUniprotFile(self, filePath): + def parseUniprotFile(self, filePath, isProteinReviewed): proteinDB = open(filePath, 'r+', encoding="utf-8") protein = None @@ -125,6 +130,7 @@ def parseUniprotFile(self, filePath): if(start == 1): protein = Protein() + protein.isReviewed = isProteinReviewed start = 0 prefix = line[0:2] @@ -141,12 +147,14 @@ def parseUniprotFile(self, filePath): #print("uniprot key: "+protein.uniprotAcc) protein = Protein() - + protein.isReviewed = isProteinReviewed def processData(self, prefix, line, proteinDB, protein): + #protein.isReviewed = self.parsingReviewedProteins + line = (line[3:(len(line)-1)]).strip() ['AC', "DE", "GN", "ID", "DR"] @@ -165,14 +173,8 @@ def processData(self, prefix, line, proteinDB, protein): protein.uniprotAcc = accs[0] protein.secondaryAccs = accs - if protein.uniprotAcc == 'uniprot:P19835': - print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 in uniprot parser") - print(accs) else: for acc in accs: - if acc == 'uniprot:P19835': - print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 AS A SECONDARY ACC in uniprot parser") - protein.secondaryAccs.append(acc) @@ -185,6 +187,13 @@ def processData(self, prefix, line, proteinDB, protein): if recName[-1] == ";": recName = recName[0:(len(recName)-1)] protein.recName = recName + if(line[0:7] == 'SubName'): + line = line.replace("SubName: Full=", "") + recName = line.split('{')[0].strip() + if recName[-1] == ";": + recName = recName[0:(len(recName)-1)] + protein.recName = recName + elif(prefix == 'GN'): # print("GN process") diff --git a/src/rampEntity/Gene.py b/src/rampEntity/Gene.py index 0076e35..49bb84e 100644 --- a/src/rampEntity/Gene.py +++ b/src/rampEntity/Gene.py @@ -44,6 +44,11 @@ def __init__(self): self.sources = list() self.proteinType = "Unknown" + + # status of review by uniprot. + # TrEMBL uniprot entries are not fully curated (0) + # SwissProt uniprot entries are completely curated (1) + self.isReviewed = 0 def __eq__(self, other): """ diff --git a/src/rampEntity/Protein.py b/src/rampEntity/Protein.py index b8b716e..abd98be 100644 --- a/src/rampEntity/Protein.py +++ b/src/rampEntity/Protein.py @@ -30,9 +30,14 @@ def __init__(self): self.hgncSymbol = "" + # status of review by uniprot. + # TrEMBL uniprot entries are not fully curated (0) + # SwissProt uniprot entries are completely curated (1) + self.isReviewed = 0 + def getPrimaryRecord(self): - s = self.uniprotAcc + "\t" + self.id + "\t" + self.geneName + "\t" + self.recName + "\t" + self.hgncSymbol + "\n" + s = self.uniprotAcc + "\t" + self.id + "\t" + self.geneName + "\t" + self.recName + "\t" + self.hgncSymbol + '\t' + str(self.isReviewed) + "\n" return s def getSecondaryToPrimaryRecord(self): diff --git a/src/rampEntity/RheaReaction.py b/src/rampEntity/RheaReaction.py index 59b370e..f3dfde8 100644 --- a/src/rampEntity/RheaReaction.py +++ b/src/rampEntity/RheaReaction.py @@ -47,6 +47,8 @@ def __init__(self): # mapping rhea id to uniprot ids, can be 1:n self.proteins = [] + self.proteinDict = dict() + self.isTransport = 0 # compounds ids for left side, populate using rhea2ec tsv file. @@ -172,11 +174,12 @@ def getMainReactionToProteinString(self, source): for p in self.proteins: - ids = p.idDict.get(source, None) - if ids is not None and len(ids) > 0: - uniprot = ids[0] - else: - uniprot = 'NA' + uniprot = p.sourceId +# ids = p.idDict.get(source, None) +# if ids is not None and len(ids) > 0: +# uniprot = ids[0] +# else: +# uniprot = 'NA' names = p.commonNameDict.get(source, None) name = " " @@ -205,18 +208,36 @@ def getMainReactionToProteinString(self, source): if name == " ": name = "UNK4" - s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\n" + s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\t" + str(p.isReviewed) + "\n" return s - def getMainReactionToProteinStringAllIds(self, source): + def getMainReactionToProteinStringAllIds(self, source, secondaryIdSet, swissProtIds, tremblProtIds): s = "" + if self.direction != 'UN': + return s + uniprotIdSet = set() + outputCount = 0 + + if swissProtIds is None: + swissCount = 0 + else: + swissCount = len(swissProtIds) + + if tremblProtIds is None: + tremblCount = 0 + else: + tremblCount = len(tremblProtIds) + + + totalIdCount = swissCount + tremblCount + for p in self.proteins: ids = p.idDict.get(source, None) @@ -225,7 +246,20 @@ def getMainReactionToProteinStringAllIds(self, source): for uniprot in ids: - #uniprot = ids[0] + if uniprot in secondaryIdSet: + continue + + if swissProtIds is not None and uniprot in swissProtIds: + isReviewed = 1 + outputCount = outputCount + 1 + elif tremblProtIds is not None and uniprot in tremblProtIds: + isReviewed = 0 + outputCount = outputCount + 1 + else: + # limits reporting of non-rhea proteins + continue + + #uniprot = ids[0] names = p.commonNameDict.get(source, None) name = " " @@ -255,9 +289,13 @@ def getMainReactionToProteinStringAllIds(self, source): # name = "UNK4" if uniprot not in uniprotIdSet: - s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\n" + s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\t" + str(isReviewed) + "\n" uniprotIdSet.add(uniprot) - + + if outputCount < totalIdCount: + print("rhea protein output mismatch. totalIdCount = " + str(totalIdCount), + " outputCount = " + str(outputCount)) + + return s @@ -288,7 +326,7 @@ def getReactionProteinToMetString(self, source): cid = list(namesDict.keys())[0] name = namesDict[cid] - s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t0\t" + met.rampId + "\t" + cid + "\t" + name + "\t" + str(met.isCofactor) + "\n" + s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t0\t" + met.rampId + "\t" + cid + "\t" + name + "\t" + str(met.isCofactor) + str(p.isReviewed) + "\n" for met in self.right_comps: if met.rampId not in hitMets: @@ -352,7 +390,15 @@ def getRheaIdToCompMappingString(self): def getRheaIdToUniprotMappingString(self): s = "" for pid in self.proteins: - s = s + self.rhea_id + "\t" + pid + "\t0\n" + pName = "" + p = self.proteinDict.get(pid, None) + protIsReviewed = 0 + if p is not None: + protIsReviewed = p.isReviewed + else: + protIsReviewed = 3 + + s = s + self.rhea_id + "\t" + pid + "\t0\t" + str(protIsReviewed) + "\n" return s def getCompoundToProteinString(self): @@ -408,7 +454,10 @@ def doIHaveACofactorCompoundCheck(self): # # return 0 - - + def addProteinSet(self, accList, accToProteinMap): + for acc in accList: + p = accToProteinMap.get(acc, None) + if p is not None: + self.proteinDict[acc] = p diff --git a/src/util/EntityBuilder.py b/src/util/EntityBuilder.py index 32ea7f2..dc84c4e 100644 --- a/src/util/EntityBuilder.py +++ b/src/util/EntityBuilder.py @@ -144,6 +144,14 @@ def __init__(self, resourceConfig): self.hmdbStatusLevel = {"predicted":1, "expected":2, "detected":3, "quantified":4} + # a simple list of uniprot accessions that are secondary + # use this for rhea export... to skip any uniprot in this list + self.uniprotSecondaryAccessions = set() + + # suport for delivering the correct proteins for rhea + self.rheaToSwissprotDict = dict() + self.rheaToTremblDict = dict() + def fullBuild(self): """ This high level method performs the entire process of entity construction @@ -162,6 +170,9 @@ def fullBuild(self): self.loadPathways() self.addPathwayCategory() + # collect uniprot secondary accessions + self.getUniprotSecondaryAccessions() + # load genes self.loadGeneList() self.addGeneCommonNameAndSynonyms() @@ -556,7 +567,21 @@ def buildMetaboliteToPathwayConnections(self): # print(str(len(strandedMetSourceIds))) # print(str(len(strandedPathSourceIds))) + def getUniprotSecondaryAccessions(self): + + self.uniprotSecondaryAccessions = set() + file = "../misc/output/uniprot_human/uniprot_acc_mapping.txt" + + data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False) + + for idx, row in data.iterrows(): + altId = row[0] + id = row[1] + + if altId != id: + self.uniprotSecondaryAccessions.add(altId) + def loadGeneList(self, eqMetric = 0): """ @@ -1018,19 +1043,55 @@ def appendRxnProteinsFromRhea(self, path): records = pd.read_table(path, header=None) + self.rheaToSwissprotDict = dict() + self.rheaToTremblDict = dict() + # just read them in first... for idx, record in records.iterrows(): rheaId = record[0] uniprot = record[1] + isReviewed = record[3] rxn = self.reactionDict.get(rheaId, None) + if isReviewed == 1: + idSet = self.rheaToSwissprotDict.get(rheaId, None) + if idSet is None: + idSet = set() + self.rheaToSwissprotDict[rheaId] = idSet + + idSet.add(uniprot) + else: + idSet = self.rheaToTremblDict.get(rheaId, None) + if idSet is None: + idSet = set() + self.rheaToTremblDict[rheaId] = idSet + + idSet.add(uniprot) + + + + if uniprot == "uniprot:Q13574": + print(" Found the uniprot Q13574 in the FILE.............................................................................") if rxn is not None: protein = self.geneList.getGeneById(uniprot) if protein is not None: - protein.soureId = uniprot + if uniprot == "uniprot:Q13574": + print(" Found the uniprot Q13574 actual PROTEIN.............................................................................") + print("Primary ID in protein = "+protein.sourceId) + print("Print all ids in order:") + rids = protein.idDict.get("rhea", None) + + if rids is not None: + for id in rids: + print(id) + + print("End Protein Ids List") + + protein.sourceId = uniprot rxn.proteins.append(protein) + protein.isReviewed = isReviewed else: # we need to make a protein? @@ -1360,7 +1421,10 @@ def writeReactionEntities(self): for rxnId in self.reactionDict: rxn = self.reactionDict[rxnId] - file.write(rxn.getMainReactionToProteinStringAllIds('rhea')) + swissProtIds = self.rheaToSwissprotDict.get(rxnId, None) + tremblProtIds = self.rheaToTremblDict.get(rxnId, None) + if swissProtIds is not None or tremblProtIds is not None: + file.write(rxn.getMainReactionToProteinStringAllIds('rhea', self.uniprotSecondaryAccessions, swissProtIds, tremblProtIds)) file.close()