updates to patch rhea rxn to protein mapping

ncats · Mar 27, 2024 · c5a61b9 · c5a61b9
1 parent bd0d673
commit c5a61b9
Show file tree

Hide file tree

Showing 6 changed files with 190 additions and 48 deletions.
diff --git a/src/parse/RheaParser.py b/src/parse/RheaParser.py
@@ -65,15 +65,22 @@ def __init__(self, resConfig):
 
         self.expasyLocalEc2ClassFile = ""
 
-        self.humanUniprotRecordDict = dict()
+        self.humanPrimaryUniprotRecordDict = dict()
+
+        self.humanSecondaryUniprotRecordDict = dict()
+
+        self.humanPrimaryUniprotAccSet = set()
 
-        self.humanUniprotAccSet = set()
+        self.humanSecondaryUniprotAccSet = set()
 
         self.chebiHumanIdSet = set()
 
         self.chebiCofactorId = "23357"
 
         self.chebiCofactorSet = set()
+
+        self.parsingReviewedProteins = 0
+
 
     def processRhea(self):
         self.buildSupportingUniprotData()
@@ -106,9 +113,6 @@ def processRhea(self):
         self.exportIntermediateFiles()
 
 
-
-
-
     def buildSupportingUniprotData(self):
         print("building uniprot data store")
         uniParser = UniprotParser(self.config)
@@ -117,22 +121,25 @@ def buildSupportingUniprotData(self):
         uniParser.parseUniprot()
         self.humanUniprotRecordDict = uniParser.uniprotRecords
 
-        threadedUniprotDict = dict()
+        threadedPrimaryUniprotDict = dict()
+        threadedSecondaryUniprotDict = dict()
 
         for acc in self.humanUniprotRecordDict:
-            self.humanUniprotAccSet.add(acc)
-            p = self.humanUniprotRecordDict[acc]
-            threadedUniprotDict[acc] = p
+            self.humanPrimaryUniprotAccSet.add(acc)
+            p = self.humanPrimaryUniprotRecordDict[acc]
+            threadedPrimaryUniprotDict[acc] = p
             for acc2 in p.secondaryAccs:
-                self.humanUniprotAccSet.add(acc2)
-                threadedUniprotDict[acc2] = p
+                self.humanSecondaryUniprotAccSet.add(acc2)
+                threadedSecondaryUniprotDict[acc2] = p
 
-        self.humanUniprotRecordDict = threadedUniprotDict        
+        self.humanPrimaryUniprotRecordDict = threadedPrimaryUniprotDict        
+        self.humanSecondaryUniprotRecordDict = threadedSecondaryUniprotDict
 
         print("in rhea uniprot build")
-        print("length of the dict"+str(len(self.humanUniprotRecordDict.keys())))
-        print("length of the set"+str(len(self.humanUniprotAccSet)))
-
+        print("length of the primary dict"+str(len(self.humanPrimaryUniprotRecordDict.keys())))
+        print("length of the primary set"+str(len(self.humanPrimaryUniprotAccSet)))
+        print("length of the secondary dict"+str(len(self.humanSecondaryUniprotRecordDict.keys())))
+        print("length of the secondary set"+str(len(self.humanSecondaryUniprotAccSet)))
 
     def buildSupportingChebiData(self):
         print("building chebi data store")
@@ -198,10 +205,10 @@ def getRheaFiles(self):
             rhea2UniprotRemoteFile = uniprotToRheaConf.sourceFileName
 
             self.download_files(rhea2UniprotUrl, self.relDir + localDir + rhea2UniprotRemoteFile)
-            swissProtLocalFileName = uniprotToRheaConf.extractFileName
+            uniprotProtLocalFileName = uniprotToRheaConf.extractFileName
             # now gunzip
             with gzip.open(self.relDir + localDir + rhea2UniprotRemoteFile, 'rb') as f_in:
-                with open(self.relDir + localDir + swissProtLocalFileName, 'wb') as f_out:
+                with open(self.relDir + localDir + uniprotProtLocalFileName, 'wb') as f_out:
                     shutil.copyfileobj(f_in, f_out)      
         else:
             print("Using cached Rhea Uniprot-to-Rhea file.")
@@ -760,9 +767,12 @@ def appendUniprotToReaction(self):
             #print(row)
             #print("appending protein accessions to reactions..." + str(row.RHEA_ID)+ "  " +str(row.ID))
 
+            uniprot = "uniprot:" + row.ID
+
             # !!! just adding human uniprot            
-            if ("uniprot:" + row.ID) in self.humanUniprotAccSet:
+            if uniprot in self.humanUniprotAccSet:
                 #print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+
                 unis = r2uMap.get("rhea:" + str(row.RHEA_ID))
                 if unis is None:     
                     unis = ['uniprot:'+row.ID]
@@ -780,10 +790,10 @@ def appendUniprotToReaction(self):
             currRxn = self.rheaReactionDict.get(rxn, None)
             if currRxn is None:
                 currRxn = self.rheaReactionDict.get("rhea:"+rxn, None)
-            if currRxn is not None:   
+            if currRxn is not None: 
                 currRxn.proteins = uniSet
                 #print("setting proteins, len:"+str(len(currRxn.proteins)))
-
+                currRxn.addProteinSet(uniSet, self.humanUniprotRecordDict)
 
 
         # swiss prot    
@@ -816,7 +826,7 @@ def appendUniprotToReaction(self):
             if currRxn is not None:   
                 currRxn.proteins = uniSet
                 #print("setting proteins, len:"+str(len(currRxn.proteins)))
-
+                currRxn.addProteinSet(uniSet, self.humanUniprotRecordDict)
 
 
     def appendEcToReaction(self):

diff --git a/src/parse/UniprotParser.py b/src/parse/UniprotParser.py
@@ -29,6 +29,8 @@ def __init__(self, resConfig):
         self.uniprotRecords = dict()
 
         self.keyFields = ['AC', "DE", "GN", "ID", "DR"]
+
+        self.parsingReviewedProteins = 0
 
 
     def parseUniprot(self):
@@ -63,12 +65,14 @@ def parseUniprot(self):
         print("starting to parse uniprot trembl dat file")
         print(extractFile)                
 
-        self.parseUniprotFile(self.relDir + localDir + extractFile)
+        self.parsingReviewedProteins = 0
+        self.parseUniprotFile(self.relDir + localDir + extractFile, 0)
 
         print("number of uniprot trembl records")
         tremblCount = len(self.uniprotRecords)
         print(str(tremblCount))
 
+
         # now add SwissProt human
         proteinConfig = self.resourceConfig.getConfig("swissprot_human")
         file_proteins = proteinConfig.sourceFileName
@@ -97,7 +101,8 @@ def parseUniprot(self):
         print("starting to parse uniprot swissprot dat file")
         print(extractFile)
 
-        self.parseUniprotFile(self.relDir + localDir + extractFile)
+        self.parsingReviewedProteins = 1
+        self.parseUniprotFile(self.relDir + localDir + extractFile, 1)
 
         print("number of uniprot trembl PLUSE swissprot records")
         tremblCount = len(self.uniprotRecords)
@@ -108,7 +113,7 @@ def parseUniprot(self):
 
 
 
-    def parseUniprotFile(self, filePath):
+    def parseUniprotFile(self, filePath, isProteinReviewed):
         proteinDB = open(filePath, 'r+', encoding="utf-8")
 
         protein = None
@@ -125,6 +130,7 @@ def parseUniprotFile(self, filePath):
 
             if(start == 1):
                 protein = Protein()
+                protein.isReviewed = isProteinReviewed
                 start = 0
 
             prefix = line[0:2]
@@ -141,12 +147,14 @@ def parseUniprotFile(self, filePath):
 
                 #print("uniprot key: "+protein.uniprotAcc)
                 protein = Protein()
-
+                protein.isReviewed = isProteinReviewed
 
 
 
     def processData(self, prefix, line, proteinDB, protein):
 
+        #protein.isReviewed = self.parsingReviewedProteins
+
         line = (line[3:(len(line)-1)]).strip()
 
         ['AC', "DE", "GN", "ID", "DR"]
@@ -165,14 +173,8 @@ def processData(self, prefix, line, proteinDB, protein):
                 protein.uniprotAcc = accs[0]
                 protein.secondaryAccs = accs
 
-                if protein.uniprotAcc == 'uniprot:P19835':
-                    print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 in uniprot parser")
-                    print(accs)
             else:
                 for acc in accs:
-                    if acc == 'uniprot:P19835':
-                        print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 AS A SECONDARY ACC in uniprot parser")
-
                     protein.secondaryAccs.append(acc)
 
 
@@ -185,6 +187,13 @@ def processData(self, prefix, line, proteinDB, protein):
                     if recName[-1] == ";":
                         recName = recName[0:(len(recName)-1)]
                     protein.recName = recName
+                if(line[0:7] == 'SubName'):
+                    line = line.replace("SubName: Full=", "")
+                    recName = line.split('{')[0].strip()
+                    if recName[-1] == ";":
+                        recName = recName[0:(len(recName)-1)]
+                    protein.recName = recName
+
 
         elif(prefix == 'GN'):
             # print("GN process")

diff --git a/src/rampEntity/Gene.py b/src/rampEntity/Gene.py
@@ -44,6 +44,11 @@ def __init__(self):
         self.sources = list()
 
         self.proteinType = "Unknown"
+
+        # status of review by uniprot.
+        # TrEMBL uniprot entries are not fully curated (0)
+        # SwissProt uniprot entries are completely curated (1)
+        self.isReviewed = 0
 
     def __eq__(self, other):
         """

diff --git a/src/rampEntity/Protein.py b/src/rampEntity/Protein.py
@@ -30,9 +30,14 @@ def __init__(self):
 
         self.hgncSymbol = ""
 
+        # status of review by uniprot.
+        # TrEMBL uniprot entries are not fully curated (0)
+        # SwissProt uniprot entries are completely curated (1)
+        self.isReviewed = 0
+
 
     def getPrimaryRecord(self):
-        s = self.uniprotAcc + "\t" + self.id + "\t" + self.geneName + "\t" + self.recName + "\t" + self.hgncSymbol + "\n"
+        s = self.uniprotAcc + "\t" + self.id + "\t" + self.geneName + "\t" + self.recName + "\t" + self.hgncSymbol + '\t' + str(self.isReviewed) + "\n"
         return s
 
     def getSecondaryToPrimaryRecord(self):

diff --git a/src/rampEntity/RheaReaction.py b/src/rampEntity/RheaReaction.py
@@ -47,6 +47,8 @@ def __init__(self):
         # mapping rhea id to uniprot ids, can be 1:n
         self.proteins = []
 
+        self.proteinDict = dict()
+
         self.isTransport = 0        
 
         # compounds ids for left side, populate using rhea2ec tsv file.
@@ -172,11 +174,12 @@ def getMainReactionToProteinString(self, source):
 
         for p in self.proteins:
 
-            ids = p.idDict.get(source, None)
-            if ids is not None and len(ids) > 0:
-                uniprot = ids[0]
-            else:
-                uniprot = 'NA'            
+            uniprot = p.sourceId
+#            ids = p.idDict.get(source, None)
+#            if ids is not None and len(ids) > 0:                
+#                uniprot = ids[0]
+#            else:
+#                uniprot = 'NA'            
 
             names = p.commonNameDict.get(source, None)
             name = " "
@@ -205,18 +208,36 @@ def getMainReactionToProteinString(self, source):
             if name == " ":
                 name = "UNK4"
 
-            s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\n"
+            s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\t" + str(p.isReviewed) + "\n"
 
         return s    
 
 
 
-    def getMainReactionToProteinStringAllIds(self, source):
+    def getMainReactionToProteinStringAllIds(self, source, secondaryIdSet, swissProtIds, tremblProtIds):
 
         s = ""
 
+        if self.direction != 'UN':
+            return s
+
         uniprotIdSet = set()
 
+        outputCount = 0
+
+        if swissProtIds is None:
+            swissCount = 0
+        else:
+            swissCount = len(swissProtIds)
+
+        if tremblProtIds is None:
+            tremblCount = 0
+        else:
+            tremblCount = len(tremblProtIds)
+
+
+        totalIdCount = swissCount + tremblCount
+
         for p in self.proteins:
 
             ids = p.idDict.get(source, None)
@@ -225,7 +246,20 @@ def getMainReactionToProteinStringAllIds(self, source):
 
                 for uniprot in ids:
 
-                    #uniprot = ids[0]
+                    if uniprot in secondaryIdSet:                        
+                        continue
+
+                    if swissProtIds is not None and uniprot in swissProtIds:
+                        isReviewed = 1
+                        outputCount = outputCount + 1
+                    elif tremblProtIds is not None and uniprot in tremblProtIds:
+                        isReviewed = 0
+                        outputCount = outputCount + 1
+                    else:
+                        # limits reporting of non-rhea proteins
+                        continue
+
+                    #uniprot = ids[0]                    
 
                     names = p.commonNameDict.get(source, None)
                     name = " "
@@ -255,9 +289,13 @@ def getMainReactionToProteinStringAllIds(self, source):
 #                         name = "UNK4"
 
                     if uniprot not in uniprotIdSet:                        
-                        s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\n"
+                        s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\t" + str(isReviewed) + "\n"
                         uniprotIdSet.add(uniprot)
-
+
+        if outputCount < totalIdCount:
+            print("rhea protein output mismatch. totalIdCount = " + str(totalIdCount), + " outputCount = " + str(outputCount))
+
+
         return s    
 
 
@@ -288,7 +326,7 @@ def getReactionProteinToMetString(self, source):
                             cid = list(namesDict.keys())[0]
                             name = namesDict[cid]
 
-                        s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t0\t" + met.rampId + "\t" + cid + "\t" + name + "\t" + str(met.isCofactor) + "\n"
+                        s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t0\t" + met.rampId + "\t" + cid + "\t" + name + "\t" + str(met.isCofactor) + str(p.isReviewed) + "\n"
 
                 for met in self.right_comps:
                     if met.rampId not in hitMets:
@@ -352,7 +390,15 @@ def getRheaIdToCompMappingString(self):
     def getRheaIdToUniprotMappingString(self):    
         s = ""
         for pid in self.proteins:
-            s = s + self.rhea_id + "\t" + pid + "\t0\n"             
+            pName = ""
+            p = self.proteinDict.get(pid, None)
+            protIsReviewed = 0
+            if p is not None:
+                protIsReviewed = p.isReviewed
+            else:
+                protIsReviewed = 3
+
+            s = s + self.rhea_id + "\t" + pid + "\t0\t" + str(protIsReviewed) + "\n"             
         return s
 
     def getCompoundToProteinString(self):
@@ -408,7 +454,10 @@ def doIHaveACofactorCompoundCheck(self):
 # 
 #         return 0   
 
-
-
+    def addProteinSet(self, accList, accToProteinMap):
+        for acc in accList:
+            p = accToProteinMap.get(acc, None)
+            if p is not None:
+                self.proteinDict[acc] = p