Skip to content

Commit

Permalink
updates to patch rhea rxn to protein mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
johnbraisted committed Mar 27, 2024
1 parent bd0d673 commit c5a61b9
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 48 deletions.
52 changes: 31 additions & 21 deletions src/parse/RheaParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,22 @@ def __init__(self, resConfig):

self.expasyLocalEc2ClassFile = ""

self.humanUniprotRecordDict = dict()
self.humanPrimaryUniprotRecordDict = dict()

self.humanSecondaryUniprotRecordDict = dict()

self.humanPrimaryUniprotAccSet = set()

self.humanUniprotAccSet = set()
self.humanSecondaryUniprotAccSet = set()

self.chebiHumanIdSet = set()

self.chebiCofactorId = "23357"

self.chebiCofactorSet = set()

self.parsingReviewedProteins = 0


def processRhea(self):
self.buildSupportingUniprotData()
Expand Down Expand Up @@ -106,9 +113,6 @@ def processRhea(self):
self.exportIntermediateFiles()





def buildSupportingUniprotData(self):
print("building uniprot data store")
uniParser = UniprotParser(self.config)
Expand All @@ -117,22 +121,25 @@ def buildSupportingUniprotData(self):
uniParser.parseUniprot()
self.humanUniprotRecordDict = uniParser.uniprotRecords

threadedUniprotDict = dict()
threadedPrimaryUniprotDict = dict()
threadedSecondaryUniprotDict = dict()

for acc in self.humanUniprotRecordDict:
self.humanUniprotAccSet.add(acc)
p = self.humanUniprotRecordDict[acc]
threadedUniprotDict[acc] = p
self.humanPrimaryUniprotAccSet.add(acc)
p = self.humanPrimaryUniprotRecordDict[acc]
threadedPrimaryUniprotDict[acc] = p
for acc2 in p.secondaryAccs:
self.humanUniprotAccSet.add(acc2)
threadedUniprotDict[acc2] = p
self.humanSecondaryUniprotAccSet.add(acc2)
threadedSecondaryUniprotDict[acc2] = p

self.humanUniprotRecordDict = threadedUniprotDict
self.humanPrimaryUniprotRecordDict = threadedPrimaryUniprotDict
self.humanSecondaryUniprotRecordDict = threadedSecondaryUniprotDict

print("in rhea uniprot build")
print("length of the dict"+str(len(self.humanUniprotRecordDict.keys())))
print("length of the set"+str(len(self.humanUniprotAccSet)))

print("length of the primary dict"+str(len(self.humanPrimaryUniprotRecordDict.keys())))
print("length of the primary set"+str(len(self.humanPrimaryUniprotAccSet)))
print("length of the secondary dict"+str(len(self.humanSecondaryUniprotRecordDict.keys())))
print("length of the secondary set"+str(len(self.humanSecondaryUniprotAccSet)))

def buildSupportingChebiData(self):
print("building chebi data store")
Expand Down Expand Up @@ -198,10 +205,10 @@ def getRheaFiles(self):
rhea2UniprotRemoteFile = uniprotToRheaConf.sourceFileName

self.download_files(rhea2UniprotUrl, self.relDir + localDir + rhea2UniprotRemoteFile)
swissProtLocalFileName = uniprotToRheaConf.extractFileName
uniprotProtLocalFileName = uniprotToRheaConf.extractFileName
# now gunzip
with gzip.open(self.relDir + localDir + rhea2UniprotRemoteFile, 'rb') as f_in:
with open(self.relDir + localDir + swissProtLocalFileName, 'wb') as f_out:
with open(self.relDir + localDir + uniprotProtLocalFileName, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
else:
print("Using cached Rhea Uniprot-to-Rhea file.")
Expand Down Expand Up @@ -760,9 +767,12 @@ def appendUniprotToReaction(self):
#print(row)
#print("appending protein accessions to reactions..." + str(row.RHEA_ID)+ " " +str(row.ID))

uniprot = "uniprot:" + row.ID

# !!! just adding human uniprot
if ("uniprot:" + row.ID) in self.humanUniprotAccSet:
if uniprot in self.humanUniprotAccSet:
#print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

unis = r2uMap.get("rhea:" + str(row.RHEA_ID))
if unis is None:
unis = ['uniprot:'+row.ID]
Expand All @@ -780,10 +790,10 @@ def appendUniprotToReaction(self):
currRxn = self.rheaReactionDict.get(rxn, None)
if currRxn is None:
currRxn = self.rheaReactionDict.get("rhea:"+rxn, None)
if currRxn is not None:
if currRxn is not None:
currRxn.proteins = uniSet
#print("setting proteins, len:"+str(len(currRxn.proteins)))

currRxn.addProteinSet(uniSet, self.humanUniprotRecordDict)


# swiss prot
Expand Down Expand Up @@ -816,7 +826,7 @@ def appendUniprotToReaction(self):
if currRxn is not None:
currRxn.proteins = uniSet
#print("setting proteins, len:"+str(len(currRxn.proteins)))

currRxn.addProteinSet(uniSet, self.humanUniprotRecordDict)


def appendEcToReaction(self):
Expand Down
29 changes: 19 additions & 10 deletions src/parse/UniprotParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def __init__(self, resConfig):
self.uniprotRecords = dict()

self.keyFields = ['AC', "DE", "GN", "ID", "DR"]

self.parsingReviewedProteins = 0


def parseUniprot(self):
Expand Down Expand Up @@ -63,12 +65,14 @@ def parseUniprot(self):
print("starting to parse uniprot trembl dat file")
print(extractFile)

self.parseUniprotFile(self.relDir + localDir + extractFile)
self.parsingReviewedProteins = 0
self.parseUniprotFile(self.relDir + localDir + extractFile, 0)

print("number of uniprot trembl records")
tremblCount = len(self.uniprotRecords)
print(str(tremblCount))


# now add SwissProt human
proteinConfig = self.resourceConfig.getConfig("swissprot_human")
file_proteins = proteinConfig.sourceFileName
Expand Down Expand Up @@ -97,7 +101,8 @@ def parseUniprot(self):
print("starting to parse uniprot swissprot dat file")
print(extractFile)

self.parseUniprotFile(self.relDir + localDir + extractFile)
self.parsingReviewedProteins = 1
self.parseUniprotFile(self.relDir + localDir + extractFile, 1)

print("number of uniprot trembl PLUSE swissprot records")
tremblCount = len(self.uniprotRecords)
Expand All @@ -108,7 +113,7 @@ def parseUniprot(self):



def parseUniprotFile(self, filePath):
def parseUniprotFile(self, filePath, isProteinReviewed):
proteinDB = open(filePath, 'r+', encoding="utf-8")

protein = None
Expand All @@ -125,6 +130,7 @@ def parseUniprotFile(self, filePath):

if(start == 1):
protein = Protein()
protein.isReviewed = isProteinReviewed
start = 0

prefix = line[0:2]
Expand All @@ -141,12 +147,14 @@ def parseUniprotFile(self, filePath):

#print("uniprot key: "+protein.uniprotAcc)
protein = Protein()

protein.isReviewed = isProteinReviewed



def processData(self, prefix, line, proteinDB, protein):

#protein.isReviewed = self.parsingReviewedProteins

line = (line[3:(len(line)-1)]).strip()

['AC', "DE", "GN", "ID", "DR"]
Expand All @@ -165,14 +173,8 @@ def processData(self, prefix, line, proteinDB, protein):
protein.uniprotAcc = accs[0]
protein.secondaryAccs = accs

if protein.uniprotAcc == 'uniprot:P19835':
print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 in uniprot parser")
print(accs)
else:
for acc in accs:
if acc == 'uniprot:P19835':
print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 AS A SECONDARY ACC in uniprot parser")

protein.secondaryAccs.append(acc)


Expand All @@ -185,6 +187,13 @@ def processData(self, prefix, line, proteinDB, protein):
if recName[-1] == ";":
recName = recName[0:(len(recName)-1)]
protein.recName = recName
if(line[0:7] == 'SubName'):
line = line.replace("SubName: Full=", "")
recName = line.split('{')[0].strip()
if recName[-1] == ";":
recName = recName[0:(len(recName)-1)]
protein.recName = recName


elif(prefix == 'GN'):
# print("GN process")
Expand Down
5 changes: 5 additions & 0 deletions src/rampEntity/Gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ def __init__(self):
self.sources = list()

self.proteinType = "Unknown"

# status of review by uniprot.
# TrEMBL uniprot entries are not fully curated (0)
# SwissProt uniprot entries are completely curated (1)
self.isReviewed = 0

def __eq__(self, other):
"""
Expand Down
7 changes: 6 additions & 1 deletion src/rampEntity/Protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,14 @@ def __init__(self):

self.hgncSymbol = ""

# status of review by uniprot.
# TrEMBL uniprot entries are not fully curated (0)
# SwissProt uniprot entries are completely curated (1)
self.isReviewed = 0


def getPrimaryRecord(self):
s = self.uniprotAcc + "\t" + self.id + "\t" + self.geneName + "\t" + self.recName + "\t" + self.hgncSymbol + "\n"
s = self.uniprotAcc + "\t" + self.id + "\t" + self.geneName + "\t" + self.recName + "\t" + self.hgncSymbol + '\t' + str(self.isReviewed) + "\n"
return s

def getSecondaryToPrimaryRecord(self):
Expand Down
77 changes: 63 additions & 14 deletions src/rampEntity/RheaReaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def __init__(self):
# mapping rhea id to uniprot ids, can be 1:n
self.proteins = []

self.proteinDict = dict()

self.isTransport = 0

# compounds ids for left side, populate using rhea2ec tsv file.
Expand Down Expand Up @@ -172,11 +174,12 @@ def getMainReactionToProteinString(self, source):

for p in self.proteins:

ids = p.idDict.get(source, None)
if ids is not None and len(ids) > 0:
uniprot = ids[0]
else:
uniprot = 'NA'
uniprot = p.sourceId
# ids = p.idDict.get(source, None)
# if ids is not None and len(ids) > 0:
# uniprot = ids[0]
# else:
# uniprot = 'NA'

names = p.commonNameDict.get(source, None)
name = " "
Expand Down Expand Up @@ -205,18 +208,36 @@ def getMainReactionToProteinString(self, source):
if name == " ":
name = "UNK4"

s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\n"
s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\t" + str(p.isReviewed) + "\n"

return s



def getMainReactionToProteinStringAllIds(self, source):
def getMainReactionToProteinStringAllIds(self, source, secondaryIdSet, swissProtIds, tremblProtIds):

s = ""

if self.direction != 'UN':
return s

uniprotIdSet = set()

outputCount = 0

if swissProtIds is None:
swissCount = 0
else:
swissCount = len(swissProtIds)

if tremblProtIds is None:
tremblCount = 0
else:
tremblCount = len(tremblProtIds)


totalIdCount = swissCount + tremblCount

for p in self.proteins:

ids = p.idDict.get(source, None)
Expand All @@ -225,7 +246,20 @@ def getMainReactionToProteinStringAllIds(self, source):

for uniprot in ids:

#uniprot = ids[0]
if uniprot in secondaryIdSet:
continue

if swissProtIds is not None and uniprot in swissProtIds:
isReviewed = 1
outputCount = outputCount + 1
elif tremblProtIds is not None and uniprot in tremblProtIds:
isReviewed = 0
outputCount = outputCount + 1
else:
# limits reporting of non-rhea proteins
continue

#uniprot = ids[0]

names = p.commonNameDict.get(source, None)
name = " "
Expand Down Expand Up @@ -255,9 +289,13 @@ def getMainReactionToProteinStringAllIds(self, source):
# name = "UNK4"

if uniprot not in uniprotIdSet:
s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\n"
s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t" + name + "\t" + str(isReviewed) + "\n"
uniprotIdSet.add(uniprot)


if outputCount < totalIdCount:
print("rhea protein output mismatch. totalIdCount = " + str(totalIdCount), + " outputCount = " + str(outputCount))


return s


Expand Down Expand Up @@ -288,7 +326,7 @@ def getReactionProteinToMetString(self, source):
cid = list(namesDict.keys())[0]
name = namesDict[cid]

s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t0\t" + met.rampId + "\t" + cid + "\t" + name + "\t" + str(met.isCofactor) + "\n"
s = s + self.rxnRampId + "\t" + self.rhea_id + "\t" + p.rampId + "\t" + uniprot + "\t0\t" + met.rampId + "\t" + cid + "\t" + name + "\t" + str(met.isCofactor) + str(p.isReviewed) + "\n"

for met in self.right_comps:
if met.rampId not in hitMets:
Expand Down Expand Up @@ -352,7 +390,15 @@ def getRheaIdToCompMappingString(self):
def getRheaIdToUniprotMappingString(self):
s = ""
for pid in self.proteins:
s = s + self.rhea_id + "\t" + pid + "\t0\n"
pName = ""
p = self.proteinDict.get(pid, None)
protIsReviewed = 0
if p is not None:
protIsReviewed = p.isReviewed
else:
protIsReviewed = 3

s = s + self.rhea_id + "\t" + pid + "\t0\t" + str(protIsReviewed) + "\n"
return s

def getCompoundToProteinString(self):
Expand Down Expand Up @@ -408,7 +454,10 @@ def doIHaveACofactorCompoundCheck(self):
#
# return 0



def addProteinSet(self, accList, accToProteinMap):
for acc in accList:
p = accToProteinMap.get(acc, None)
if p is not None:
self.proteinDict[acc] = p


Loading

0 comments on commit c5a61b9

Please sign in to comment.