diff --git a/config/db_load_resource_config.txt b/config/db_load_resource_config.txt index 41c6c03..8ecfbd8 100644 --- a/config/db_load_resource_config.txt +++ b/config/db_load_resource_config.txt @@ -15,3 +15,4 @@ ready reaction.txt reaction bulk ramp_rxn_id "ramp_rxn_id,rxn_source_id,status,i ready reaction_to_metabolite.txt reaction2met bulk None "ramp_rxn_id,rxn_source_id,ramp_cmpd_id,substrate_product,met_source_id,met_name,is_cofactor" ready reaction_to_protein.txt reaction2protein bulk None "ramp_rxn_id,rxn_source_id,ramp_gene_id,uniprot,protein_name" ready reaction_protein_to_metabolite.txt reaction_protein2met bulk None "ramp_rxn_id,rxn_source_id,ramp_gene_id,gene_source_id,substrate_product,ramp_cmpd_id,cmpd_source_id,cmpd_name,is_cofactor" +ready rheaReactionToEcClass.txt reaction_ec_class bulk None "ramp_rxn_id,rxn_source_id,rxn_class_ec,ec_level,rxn_class,rxn_class_hierarchy" diff --git a/config/external_resource_config.txt b/config/external_resource_config.txt index 998ecd1..89c80a1 100644 --- a/config/external_resource_config.txt +++ b/config/external_resource_config.txt @@ -14,3 +14,4 @@ rhea_to_ec http https://ftp.expasy.org/databases/rhea/tsv/rhea2ec.tsv rhea2ec.ts rhea_rxn_direction http https://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv rhea-directions.tsv rhea-directions.tsv ../misc/data/rhea/ none rhea_rxn_direction_table chebi_to_chebi_relations http http://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv relation.tsv relation.tsv ../misc/data/chebi/ none chebi_relations chebi_ontology_owl http http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz chebi.owl.gz chebi.owl ../misc/data/chebi/ gzip chebi_ontology +expasy_ec2class ftp https://ftp.expasy.org/databases/enzyme/enzclass.txt enzclass.txt enzclass.txt ../misc/data/rhea none expasy_ec2class diff --git a/config/ramp_resource_version_update.txt b/config/ramp_resource_version_update.txt index 1683592..ea4f13a 100644 --- 
a/config/ramp_resource_version_update.txt +++ b/config/ramp_resource_version_update.txt @@ -1,8 +1,8 @@ ramp_db_version db_mod_date status data_source_id data_source_name data_source_url data_source_version -v2.4.0 10/24/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17) -v2.4.0 10/24/2023 current reactome Reactome https://reactome.org/ v86 (Sep 2023) -v2.4.0 10/24/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20231010 (2023-10-10) -v2.4.0 10/24/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17) -v2.4.0 10/24/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 226 (2023-10-01) -v2.4.0 10/24/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-10-24 -v2.4.0 10/24/2023 current rhea Rhea https://www.rhea-db.org/ Release 128 (2023-06-28) +v2.4.1 10/24/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17) +v2.4.1 10/24/2023 current reactome Reactome https://reactome.org/ v86 (Sep 2023) +v2.4.1 10/24/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20231010 (2023-10-10) +v2.4.1 10/24/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17) +v2.4.1 10/24/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 226 (2023-10-01) +v2.4.1 10/24/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-10-24 +v2.4.1 10/24/2023 current rhea Rhea https://www.rhea-db.org/ Release 129 (2023-09-13) diff --git a/main/main.py b/main/main.py index f9fd20e..9b2064f 100755 --- a/main/main.py +++ b/main/main.py @@ -52,192 +52,6 @@ def runEverything(self, resourceConfigFile, getDatabaseFiles = True): print("Getting Rhea info...") rhea.processRhea() - - #Here are the identifiers that are present for each gene: - #kegg: keggid (mainID), 'Ensembl', 'HGNC', 'HPRD', 'NCBI-GeneID', 'NCBI-ProteinID', 'OMIM', 'UniProt', 'Vega', 'miRBase' - #wikipathways: (no mainID), 'Entrez', 'Enzyme Nomenclature', 'Uniprot 
(Uniprot-TrEMBL) - #hmdb: HMDB-protien-accession (mainID), 'Uniprot' - #reactome:Uniprot (mainID) - - """ - print('Generate compound id') - hmdbcompoundnum = sql.createRampCompoundID(hmdb.metaboliteIDDictionary, "hmdb", 0) - print("hmdbcompoundnum: ", hmdbcompoundnum) - keggcompoundnum = sql.createRampCompoundID(kegg.metaboliteIDDictionary, "kegg", hmdbcompoundnum) - wikicompoundnum = sql.createRampCompoundID(wikipathways.metaboliteIDDictionary, "wiki", keggcompoundnum) - print("wikicompoundnum: ", wikicompoundnum) - reactomecompoundnum = sql.createRampCompoundID(reactome.metaboliteIDDictionary, "reactome", wikicompoundnum) - - print('Generate gene id ...') - hmdbgenenum = sql.createRampGeneID(hmdb.geneInfoDictionary, "hmdb", 0) - kegggenenum = sql.createRampGeneID(kegg.geneInfoDictionary, "kegg", hmdbgenenum) - wikigenenum = sql.createRampGeneID(wikipathways.geneInfoDictionary, "wiki", kegggenenum) - reactomegenenum = sql.createRampGeneID(reactome.geneInfoDictionary, "reactome", wikigenenum) - print(" hmdbgenenum ", hmdbgenenum, " kegggenenum ", kegggenenum, " wikigenenum ", wikigenenum, " reactomegenenum ", reactomegenenum) - """ - - """ print('Write to sql file...') - hmdbnumbers = sql.write( - hmdb.metaboliteCommonName, - hmdb.pathwayDictionary, - hmdb.pathwayCategory, - hmdb.metabolitesWithPathwaysDictionary, - hmdb.metabolitesWithSynonymsDictionary, - hmdb.metaboliteIDDictionary, - hmdb.pathwaysWithGenesDictionary, - hmdb.metabolitesLinkedToGenes, - hmdb.geneInfoDictionary, - hmdb.biofluidLocation, - hmdb.biofluid, - hmdb.cellularLocation, - hmdb.cellular, - hmdb.pathwayOntology, - hmdb.exoEndoDictionary, - hmdb.exoEndo, - hmdb.tissueLocation, - hmdb.tissue, - hmdb.metaInchi, - "hmdb", - 0,0) - - wikipathwaysnumbers = sql.write( - wikipathways.metaboliteCommonName, - wikipathways.pathwayDictionary, - wikipathways.pathwayCategory, - wikipathways.metabolitesWithPathwaysDictionary, - wikipathways.metabolitesWithSynonymsDictionary, - 
wikipathways.metaboliteIDDictionary, - wikipathways.pathwaysWithGenesDictionary, - wikipathways.metabolitesLinkedToGenes, - wikipathways.geneInfoDictionary, - wikipathways.biofluidLocation, - wikipathways.biofluid, - wikipathways.cellularLocation, - wikipathways.cellular, - wikipathways.pathwayOntology, - wikipathways.exoEndoDictionary, - wikipathways.exoEndo, - wikipathways.tissueLocation, - wikipathways.tissue, - dict(), - "wiki", - hmdbnumbers[0],hmdbnumbers[1]) - - reactomenumbers = sql.write( - reactome.metaboliteCommonName, - reactome.pathwayDictionary, - reactome.pathwayCategory, - reactome.metabolitesWithPathwaysDictionary, - reactome.metabolitesWithSynonymsDictionary, - reactome.metaboliteIDDictionary, - reactome.pathwaysWithGenesDictionary, - reactome.metabolitesLinkedToGenes, - reactome.geneInfoDictionary, - reactome.biofluidLocation, - reactome.biofluid, - reactome.cellularLocation, - reactome.cellular, - reactome.pathwayOntology, - reactome.exoEndoDictionary, - reactome.exoEndo, - reactome.tissueLocation, - reactome.tissue, - dict(), - "reactome", - wikipathwaysnumbers[0],wikipathwaysnumbers[1]) - - keggnumbers = sql.write( - kegg.metaboliteCommonName, - kegg.pathwayDictionary, - kegg.pathwayCategory, - kegg.metabolitesWithPathwaysDictionary, - kegg.metabolitesWithSynonymsDictionary, - kegg.metaboliteIDDictionary, - kegg.pathwaysWithGenesDictionary, - kegg.metabolitesLinkedToGenes, - kegg.geneInfoDictionary, - kegg.biofluidLocation, - kegg.biofluid, - kegg.cellularLocation, - kegg.cellular, - kegg.pathwayOntology, - kegg.exoEndoDictionary, - kegg.exoEndo, - kegg.tissueLocation, - kegg.tissue, - dict(), - "kegg", - reactomenumbers[0],reactomenumbers[1]) - - - print("Done ... 
for importing database") - - print("Compound:") - stat.analyteOverlaps(sql.rampCompoundIdInWhichDatabases, sql.rampCompoundIDdictionary, "Compound") - print("\n") - print("Gene:") - stat.analyteOverlaps(sql.rampGeneIdInWhichDatabases, sql.rampGeneIDdictionary, "Gene") - - stat.databaseContent(hmdb.pathwayDictionary, - hmdb.pathwayCategory, - hmdb.metabolitesWithPathwaysDictionary, - hmdb.metabolitesWithSynonymsDictionary, - hmdb.metaboliteIDDictionary, - hmdb.pathwaysWithGenesDictionary, - hmdb.geneInfoDictionary, - hmdb.biofluidLocation, - hmdb.biofluid, - hmdb.cellularLocation, - hmdb.cellular, - hmdb.pathwayOntology, - hmdb.exoEndoDictionary, - "hmdb") - - stat.databaseContent(kegg.pathwayDictionary, - kegg.pathwayCategory, - kegg.metabolitesWithPathwaysDictionary, - kegg.metabolitesWithSynonymsDictionary, - kegg.metaboliteIDDictionary, - kegg.pathwaysWithGenesDictionary, - kegg.geneInfoDictionary, - kegg.biofluidLocation, - kegg.biofluid, - kegg.cellularLocation, - kegg.cellular, - kegg.pathwayOntology, - kegg.exoEndoDictionary, - "kegg") - - stat.databaseContent(reactome.pathwayDictionary, - reactome.pathwayCategory, - reactome.metabolitesWithPathwaysDictionary, - reactome.metabolitesWithSynonymsDictionary, - reactome.metaboliteIDDictionary, - reactome.pathwaysWithGenesDictionary, - reactome.geneInfoDictionary, - reactome.biofluidLocation, - reactome.biofluid, - reactome.cellularLocation, - reactome.cellular, - reactome.pathwayOntology, - reactome.exoEndoDictionary, - "reactome") - - stat.databaseContent(wikipathways.pathwayDictionary, - wikipathways.pathwayCategory, - wikipathways.metabolitesWithPathwaysDictionary, - wikipathways.metabolitesWithSynonymsDictionary, - wikipathways.metaboliteIDDictionary, - wikipathways.pathwaysWithGenesDictionary, - wikipathways.geneInfoDictionary, - wikipathways.biofluidLocation, - wikipathways.biofluid, - wikipathways.cellularLocation, - wikipathways.cellular, - wikipathways.pathwayOntology, - wikipathways.exoEndoDictionary, 
- "wiki") - """ # constructs the entity builder builder = EntityBuilder(resourceConf) @@ -253,7 +67,6 @@ def runEverything(self, resourceConfigFile, getDatabaseFiles = True): # Database loading is handled as a separate, un-coupled step. - resourceConfFile = "../config/external_resource_config.txt" main = Main() main.runEverything(resourceConfigFile = resourceConfFile) diff --git a/main/mainDBLoad.py b/main/mainDBLoad.py index 84242e5..fa00665 100644 --- a/main/mainDBLoad.py +++ b/main/mainDBLoad.py @@ -3,7 +3,6 @@ from util.rampDBBulkLoader import rampDBBulkLoader - class mainDBLoad(): def __init__(self): @@ -73,14 +72,12 @@ def loadDBAfterTruncatingTables(self, incrementLevel = 'increment_patch_release' # this process replaced the old system of having Rdata in the package loader.generateAndLoadRampSupplementalData() - - loader = mainDBLoad() # increment level 'increment_patch_release', 'increment_minor_release', # or 'specified' (new version, perhaps major release) loader.loadDBAfterTruncatingTables(incrementLevel = 'specified', - optionalVersionOveride = "2.4.0", - optionalVersionNote = "20231027 Data refresh. Inchi-key harmonization.", + optionalVersionOveride = "2.4.2", + optionalVersionNote = "20231107 Data refresh. Rhea Reaction Classes. 
Reactome gene patch.", truncateTables=True) diff --git a/main/mainSqliteDBLoad.py b/main/mainSqliteDBLoad.py index 99fe76a..dec552d 100644 --- a/main/mainSqliteDBLoad.py +++ b/main/mainSqliteDBLoad.py @@ -79,8 +79,8 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa # increment level 'increment_patch_release', 'increment_minor_release', # or 'specified' (new version, perhaps major release) -loader.loadDBAfterTruncatingTables(sqliteFile = '/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_v2.3.0.sqlite', incrementLevel = 'specified', - optionalVersionOveride = "2.3.0", - optionalVersionNote = "20230727 data update/refresh release", +loader.loadDBAfterTruncatingTables(sqliteFile = '/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_BASE.sqlite', incrementLevel = 'specified', + optionalVersionOveride = "2.4.2", + optionalVersionNote = "20231107 data update, Rhea reaction to EC reaction class. Reactome Genes Patch.", truncateTables=True) diff --git a/src/parse/RheaParser.py b/src/parse/RheaParser.py index 1464608..447ff8b 100644 --- a/src/parse/RheaParser.py +++ b/src/parse/RheaParser.py @@ -12,7 +12,7 @@ from rampConfig.RampConfig import RampConfig import numpy as np -from rdflib import URIRef,Graph +from rdflib import URIRef, Graph import rdflib.namespace from rdflib.namespace import RDF,FOAF,RDFS,DC,DCTERMS from builtins import str @@ -49,6 +49,8 @@ def __init__(self, resConfig): self.rheaProteinDict = dict() + self.rheaEcToClassDict = dict() + self.rheaLocalRdfFile = "" self.rheaLocalRheaToUniprotFile = "" @@ -57,6 +59,8 @@ def __init__(self, resConfig): self.rheaLocalRxnDirectionFile = "" + self.expasyLocalEc2ClassFile = "" + self.humanUniprotRecordDict = dict() self.humanUniprotAccSet = set() @@ -78,10 +82,14 @@ def processRhea(self): # builds reactions objects self.processAllReactions() - + + # this gets expasy ec to enzyme class + self.ecToEnzymeClassFromExpasy() + self.appendUniprotToReaction() self.appendEcToReaction() + 
self.setReactionHumanUniprotState() self.setReactionHumanChebiState() @@ -136,6 +144,7 @@ def getRheaFiles(self): uniprotToRheaConf = self.config.getConfig('uniprot_to_rhea') rheaToEcConf = self.config.getConfig('rhea_to_ec') rheaDirectionConf = self.config.getConfig('rhea_rxn_direction') + expasyEc2EnzymeClassConf = self.config.getConfig('expasy_ec2class') localDir = rdfConf.localDir @@ -200,6 +209,20 @@ def getRheaFiles(self): print("Using cached Rhea reaction direction file.") + # supporting expasy EC to Enzyme Class file + ec2classFile = expasyEc2EnzymeClassConf.extractFileName + + self.expasyLocalEc2ClassFile = self.relDir + localDir + ec2classFile + + if not exists(self.relDir + localDir + ec2classFile): + rheaDirUrl = expasyEc2EnzymeClassConf.sourceURL + rheaDirRemoteFile = expasyEc2EnzymeClassConf.sourceFileName + + self.download_files(rheaDirUrl, self.relDir + localDir + rheaDirRemoteFile) + else: + print("Using cached Expasy ec2enzymeClass file.") + + def constructRDF(self): @@ -594,7 +617,21 @@ def exportIntermediateFiles(self): for acc in self.rheaReactionDict: rxn = self.rheaReactionDict[acc] recordOut.write(rxn.getRheaIdToUniprotMappingString()) - + + recordOut.close() + + + recordsFile = "rhea_reaction_to_ec.txt" + + recordOut = open(dir + recordsFile, 'w', encoding="utf-8") + for acc in self.rheaReactionDict: + rxn = self.rheaReactionDict[acc] + ecList = rxn.ec + if ecList is not None and len(ecList) > 0: + ecBlock = self.buildRxnEcExportBlock(acc, ecList) + if len(ecBlock) > 0: + recordOut.write(ecBlock) + recordOut.close() @@ -699,15 +736,78 @@ def appendEcToReaction(self): print(str(r2u.shape)) for idx, row in r2u.iterrows(): - r2EcMap['rhea:'+str(row.RHEA_ID)] = row.ID + rheaRxnId = 'rhea:'+str(row.RHEA_ID) + ecList = r2EcMap.get(rheaRxnId,None) + if ecList is None: + r2EcMap[rheaRxnId] = [row.ID] + else: + ecList.append(row.ID) for rxn in r2EcMap: - ec = r2EcMap[rxn] + ecList = r2EcMap[rxn] currRxn = self.rheaReactionDict.get(rxn, None) if 
def ecToEnzymeClassFromExpasy(self):
    """
    Populates self.rheaEcToClassDict (EC class code -> enzyme class name)
    from the local Expasy enzyme-class file (self.expasyLocalEc2ClassFile).

    A class line is expected to look like an EC code whose fields may be
    padded with spaces, followed by whitespace and the class name, e.g.
    '1. 1.99.-    <class name>'.  The stored key has the padding removed
    ('1.1.99.-'); the stored value is the class name with surrounding
    whitespace stripped.

    Lines that do not have this shape (header/footer text, blank lines)
    are ignored, so no fixed header or footer line counts are assumed.
    """
    # Local import so this method does not depend on a module-level import.
    import re

    # Four dot-separated fields (a number or '-'), optional padding after
    # each dot, then at least one whitespace character and the class name.
    # Splitting on a single space cannot be used here because the EC field
    # itself contains spaces.
    ecClassLine = re.compile(
        r"^\s*(\d+)\.\s*(\d+|-)\.\s*(\d+|-)\.\s*(\d+|-)\s+(\S.*?)\s*$"
    )

    with open(self.expasyLocalEc2ClassFile, 'r') as ec2c:
        for line in ec2c:
            match = ecClassLine.match(line)
            if match is None:
                continue
            ec = ".".join(match.group(1, 2, 3, 4))
            self.rheaEcToClassDict[ec] = match.group(5)


def buildRxnEcExportBlock(self, rxnId, ecList):
    """
    Builds the export rows linking one reaction to its EC classes.

    For every EC number in ecList, each hierarchy level returned by
    getEcChildren that has an entry in self.rheaEcToClassDict yields one
    tab-delimited, newline-terminated row:

        rxnId, EC class code, EC level, class name, class hierarchy

    EC level is 4 minus the number of '-' placeholders in the code.  The
    class hierarchy is the ' | ' join of the class names found so far for
    that EC number, broadest first.

    Rows that are identical (several EC numbers of one reaction sharing
    the same ancestors) are emitted only once, in first-seen order.

    Returns the concatenated rows, or "" when nothing matched.
    """
    rows = []
    seen = set()
    for ec in ecList:
        hierarchy = []
        for ecc in self.getEcChildren(ec):
            enzClass = self.rheaEcToClassDict.get(ecc, None)
            if enzClass is None:
                # no class name known for this level; skip it
                continue
            hierarchy.append(enzClass)
            ecLevel = 4 - ecc.count("-")
            row = "\t".join(
                (rxnId, ecc, str(ecLevel), enzClass, " | ".join(hierarchy))
            ) + "\n"
            if row not in seen:
                seen.add(row)
                rows.append(row)
    return "".join(rows)


def getEcChildren(self, ec):
    """
    Returns the EC hierarchy for an EC number, broadest level first, e.g.
    '1.14.11.2' -> ['1.-.-.-', '1.14.-.-', '1.14.11.-', '1.14.11.2'].

    Despite the method name, the values are the code itself plus its
    ancestor classes.  Codes that are already partial ('1.1.1.-') or that
    have fewer than four fields ('1.1') are padded with '-' and produce no
    duplicate entries.  An empty or None code returns an empty list.
    """
    if not ec:
        return []

    fields = [field.strip() for field in str(ec).split('.')]
    # pad short codes, ignore anything past the fourth field
    fields = (fields + ["-"] * 4)[:4]

    ecVariants = []
    for depth in range(1, 5):
        variant = ".".join(fields[:depth] + ["-"] * (4 - depth))
        if variant not in ecVariants:
            ecVariants.append(variant)
    return ecVariants
# now we want to grab the NCBI/Entrez 'GeneID' - if childtag == "dbReference": - if child2.get("type") == "GeneID": - geneId = child2.get("id") - geneId = 'entrez:'+geneId +# if childtag == "dbReference": +# if child2.get("type") == "GeneID": +# geneId = child2.get("id") +# geneId = 'entrez:'+geneId # protein to gene can be 1:n, so they have to be stored as a list # lets check for a value - idList = mapping.get("small_e_entrez", None) - if(idList == None): - idList = list() - mapping["small_e_entrez"] = idList +# idList = mapping.get("small_e_entrez", None) +# if(idList == None): +# idList = list() +# mapping["small_e_entrez"] = idList - idList.append(geneId) +# idList.append(geneId) diff --git a/src/rampEntity/RheaReaction.py b/src/rampEntity/RheaReaction.py index def8f61..3a71908 100644 --- a/src/rampEntity/RheaReaction.py +++ b/src/rampEntity/RheaReaction.py @@ -67,11 +67,23 @@ def __init__(self): self.hasAHumanMetabolite = False + self.ecAssociationBlock = [] + def getBasicRecordString(self): ec = self.ec + if ec is None: - ec = "" + ecVal = "" + else: + eCount = 0 + ec = sorted(ec) + for e in ec: + if eCount == 0: + ecVal = e + else: + ecVal = ecVal + "; " + e + dir = self.direction if dir is None: dir = "" @@ -80,15 +92,27 @@ def getBasicRecordString(self): onlyHumanMets = self.hasOnlyHumanMetabolites * 1 s = (self.rhea_id + "\t" + str(self.status) + "\t" + str(self.isTransport) + "\t" +self.direction + "\t" + self.rhea_label + "\t" + - self.rhea_equation + "\t" + self.rhea_html_eq + "\t" + ec + "\t" + str(humanEnzyme) + "\t" + str(onlyHumanMets) +"\n") + self.rhea_equation + "\t" + self.rhea_html_eq + "\t" + ecVal + "\t" + str(humanEnzyme) + "\t" + str(onlyHumanMets) +"\n") return s def getMainRecordString(self): ec = self.ec - if ec is None: - ec = "" + + if ec is None or type(ec) == float: + ecVal = "" + else: + if len(ec) == 1: + ecVal = ec[0] + else: + eCount = 0 + ec = sorted(ec) + for e in ec: + if eCount == 0: + ecVal = e + else: + ecVal = ecVal + "; 
def getRheaReactionToEcString(self):
    """
    Returns the reaction-to-EC export rows for this reaction.

    Each distinct entry of self.ecAssociationBlock (compared after
    stripping surrounding whitespace) becomes one newline-terminated row:

        rxnRampId <TAB> rhea_id <TAB> stripped entry

    Rows keep the order in which the entries were first added, so the
    output is the same on every run.  The stored list is not modified.
    Returns "" when no entries have been added.
    """
    # dict.fromkeys removes duplicates while keeping insertion order;
    # a plain set() would make the row order vary between runs.
    uniqueRows = dict.fromkeys(
        ecData.strip() for ecData in self.ecAssociationBlock
    )
    # str() mirrors getMainRecordString, which also stringifies the ids.
    prefix = str(self.rxnRampId) + "\t" + str(self.rhea_id) + "\t"
    return "".join(prefix + row + "\n" for row in uniqueRows)


def addEcAssociationBlock(self, ecData):
    """
    Appends one EC association entry (a string) to self.ecAssociationBlock.
    Duplicates are tolerated here and removed when the rows are exported.
    """
    self.ecAssociationBlock.append(ecData)
""" + f = open("geneList.log", 'w') + Metabolite.__equalityMetric = eqMetric for src in self.sourceList: @@ -600,10 +602,24 @@ def loadGeneList(self, eqMetric = 0): gene.addId(altId, source) self.geneList.addGene(altId, gene) + if gene.rampId == 'RAMP_G_000008086': + f.write("Creating our gene...RAMP_G_000008086\n") + f.write(gene.rampId+"\n") + f.write("\t".join(gene.idList)+"\n") + #f.write(gene.idDict) + gene.addId(currSourceId, source) gene.addSource(source) self.geneList.addGene(currSourceId, gene) - + + if gene.rampId == 'RAMP_G_000008086': + f.write("Adding IDs to our gene...RAMP_G_000008086\n") + f.write(gene.rampId + "\n") + f.write(currSourceId + "\n") + f.write(source + "\n") + f.write("\t".join(gene.idList) + "\n") + + # this is a sourceId lets add else: # need to check if the alt id already exists as a key id @@ -612,6 +628,14 @@ def loadGeneList(self, eqMetric = 0): gene2.addId(altId, source) gene2.addSource(source) gene2.addId(currSourceId, source) + + + if gene2.rampId == 'RAMP_G_000008086': + f.write("Linked and adding to our gene...RAMP_G_000008086\n") + f.write(altId + "\n") + f.write(source + "\n") + f.write(currSourceId + "\n") + #metaboliteList.addMataboliteByAltId(altId, met2) # this reasigns the primary source id and strands the 'metabolite' record self.geneList.addGene(currSourceId, gene2) @@ -624,6 +648,20 @@ def loadGeneList(self, eqMetric = 0): # we don't want two records # we need to consolidate metabolites... 
I think if(gene2 is not gene): + + if("gene_symbol:MDM2" in gene.idList or "gene_symbol:MDM2" in gene2.idList): + print("SUBSUME GENE\n") + # print(gene.rampId) + # print(gene.idList) + # print(gene.idDict) + + print("///\n") + #print(gene2.rampId) + #print(gene2.idList) + #print(gene2.idDict) + #print(" ") + #print(" ") + # keep the original metabolite (met2) and transfer info gene2.subsumeGene(gene) @@ -639,6 +677,8 @@ def loadGeneList(self, eqMetric = 0): self.geneList.addGene(altId, gene) # safe add, adds unique source to metabolite gene.addSource(source) + f.close() + def loadOntolgies(self): @@ -951,7 +991,7 @@ def processRheaReactions(self): self.buildRxnsFromRhea(rheaPath + "/rhea_primary_records.txt") self.appendRxnProteinsFromRhea(rheaPath + "/rhea_uniprot_mapping.txt") self.appendRxnParticipantsFromRhea(rheaPath + "/rhea_rxn_to_chebi_and_dir.txt") - + self.dumpReactionToEcEnzymeClass(rheaPath + "/rhea_reaction_to_ec.txt") def buildRxnsFromRhea(self, path): print("Building Rhea Reactions") @@ -1018,7 +1058,7 @@ def appendRxnParticipantsFromRhea(self,path): if rxn is not None and met is not None: if met.isCofactor == 1: - print("in append rxn members... cofactor = 1 :)") + # print("in append rxn members... cofactor = 1 :)") rheaCofactCnt = rheaCofactCnt + 1 if(rxnSide == 0): @@ -1031,8 +1071,31 @@ def appendRxnParticipantsFromRhea(self,path): print("in append participants from Rhea... 
def dumpReactionToEcEnzymeClass(self, path, outPath="../misc/sql/rheaReactionToEcClass.txt"):
    """
    Writes the reaction-to-EC-class load file.

    Reads the tab-delimited intermediate file at 'path', whose first
    column is a reaction source id.  Every row whose id is a key of
    self.reactionDict and whose reaction has a non-empty rxnRampId is
    copied to 'outPath' with that rxnRampId prepended as a new first
    column.  All other rows are dropped.

    path    -- intermediate reaction-to-EC file to read (UTF-8)
    outPath -- file to write; defaults to the path used by the original
               implementation
    """
    rowsWritten = 0

    # Both handles are managed by 'with' so they are closed on error too.
    # UTF-8 is given explicitly because the intermediate file is written
    # as UTF-8 by the parser's export step.
    with open(path, 'r', encoding="utf-8") as data, \
            open(outPath, 'w', encoding="utf-8") as rxn2EcClassFile:
        for line in data:
            rheaId = line.split("\t")[0]
            rxn = self.reactionDict.get(rheaId, None)
            if rxn is None:
                continue

            rampRxnId = rxn.rxnRampId
            # skips both "" and None ids
            if not rampRxnId:
                continue

            # the last input line may lack a newline; keep rows separate
            if not line.endswith("\n"):
                line = line + "\n"
            rxn2EcClassFile.write(str(rampRxnId) + "\t" + line)
            rowsWritten = rowsWritten + 1

    # one summary line instead of several prints per input row
    print("Rhea reaction to EC class rows written: " + str(rowsWritten))