Commit

sqlite load, data refresh Oct_2023 config, inchi-key harmonization patch
johnbraisted committed Oct 27, 2023
1 parent 0973f6b commit 1867aac
Showing 12 changed files with 188 additions and 138 deletions.
2 changes: 1 addition & 1 deletion config/db_load_resource_config.txt
@@ -4,7 +4,7 @@ ready analytesource.txt source bulk None "sourceId,rampId,IDtype,geneOrCompound,
ready analytesynonym.txt analytesynonym bulk None "Synonym,rampId,geneOrCompound,source"
ready analytetopathway.txt analytehaspathway bulk None "rampId,pathwayRampId,pathwaySource"
ready analyte.txt analyte bulk rampId "rampId,type"
ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId, proteinType"
ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId,proteinType"
empty reactomecatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId"
empty wikicatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId"
ready chemProps.txt chem_props bulk None "ramp_id,chem_data_source,chem_source_id,iso_smiles,inchi_key_prefix,inchi_key,inchi,mw,monoisotop_mass,common_name,mol_formula"
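Each row of this load config appears to describe one bulk-load step: a status flag, the staged source file, the destination table, a load mode, a primary-key column (or None), and a quoted comma-separated column list; the change above simply drops a stray space from the catalyzed column list. A minimal parsing sketch under that assumed field order (LoadSpec and parseLoadConfigLine are illustrative names, not part of the repo):

```python
import shlex
from typing import List, NamedTuple, Optional

class LoadSpec(NamedTuple):
    status: str              # e.g. "ready" or "empty"
    sourceFile: str          # staged file, e.g. "catalyzes.txt"
    table: str               # destination table, e.g. "catalyzed"
    loadMode: str            # e.g. "bulk"
    primaryKey: Optional[str]
    columns: List[str]       # names from the quoted, comma-separated field

def parseLoadConfigLine(line: str) -> LoadSpec:
    # shlex keeps the quoted column list together as one token
    status, sourceFile, table, loadMode, pk, cols = shlex.split(line)
    return LoadSpec(status, sourceFile, table, loadMode,
                    None if pk == "None" else pk,
                    [c.strip() for c in cols.split(",")])

spec = parseLoadConfigLine(
    'ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId,proteinType"')
print(spec.columns)   # ['rampCompoundId', 'rampGeneId', 'proteinType']
```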
2 changes: 1 addition & 1 deletion config/external_resource_config.txt
@@ -4,7 +4,7 @@ hmdb_gene http https://hmdb.ca/system/downloads/current/hmdb_proteins.zip hmdb_p
hmdb_met_sdf http https://hmdb.ca/system/downloads/current/structures.zip structures.zip structures.sdf ../misc/data/chemprops/hmdb/ zip chem_props_sdf
reactome_met http http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_mets
reactome_gene http http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_genes
wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20230710-rdf-wp.zip wikipathways-20230710-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231010-rdf-wp.zip wikipathways-20231010-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
chebi_met_sdf ftp https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf ../misc/data/chemprops/chebi/ gzip chem_props_sdf
lipidmaps_met http https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip LMSD.sdf.zip structures.sdf ../misc/data/chemprops/lipidmaps/ zip chem_props_sdf
uniprot_human http https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz uniprot_sprot_human.dat.gz uniprot_sprot_human.dat ../misc/data/uniprot_human/ gzip proteins
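Each row here seems to define one remote resource: an id, protocol, download URL, the downloaded file name, the extracted file name, a local target directory, the compression scheme (zip, gzip, or none), and a resource type; the only edit is bumping the WikiPathways RDF bundle from 20230710 to 20231010. A hedged sketch of the kind of fetch-and-unpack step such a row could drive (fetchResource is a hypothetical helper, not the project's downloader):

```python
import gzip
import os
import shutil
import urllib.request
import zipfile

def fetchResource(url, downloadName, localDir, compression):
    """Download one configured resource and unpack it according to its compression field."""
    os.makedirs(localDir, exist_ok=True)
    target = os.path.join(localDir, downloadName)
    urllib.request.urlretrieve(url, target)

    if compression == "zip":
        with zipfile.ZipFile(target) as zf:
            zf.extractall(localDir)
    elif compression == "gzip":
        # strip the .gz suffix for the extracted file name
        with gzip.open(target, "rb") as src, open(target[:-3], "wb") as dst:
            shutil.copyfileobj(src, dst)
    # compression == "none": the downloaded file is used as-is

# e.g. the WikiPathways RDF bundle updated in this commit
fetchResource(
    "https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231010-rdf-wp.zip",
    "wikipathways-20231010-rdf-wp.zip",
    "../misc/data/wikipathwaysRDF/",
    "zip")
```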
14 changes: 7 additions & 7 deletions config/ramp_resource_version_update.txt
@@ -1,8 +1,8 @@
ramp_db_version db_mod_date status data_source_id data_source_name data_source_url data_source_version
v2.3.0 7/20/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17)
v2.3.0 7/20/2023 current reactome Reactome https://reactome.org/ v85 (May 2023)
v2.3.0 7/20/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20230710 (2023-07-10)
v2.3.0 7/20/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17)
v2.3.0 7/20/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 223 (2023-07-01)
v2.3.0 7/20/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-07-12
v2.3.0 8/3/2022 current rhea Rhea https://www.rhea-db.org/ Release 128 (2023-06-28)
v2.4.0 10/24/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17)
v2.4.0 10/24/2023 current reactome Reactome https://reactome.org/ v86 (Sep 2023)
v2.4.0 10/24/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20231010 (2023-10-10)
v2.4.0 10/24/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17)
v2.4.0 10/24/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 226 (2023-10-01)
v2.4.0 10/24/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-10-24
v2.4.0 10/24/2023 current rhea Rhea https://www.rhea-db.org/ Release 128 (2023-06-28)
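The version table is refreshed for v2.4.0 with the October 2023 source releases. Since the version strings themselves contain spaces, the file is presumably tab-delimited; a small sketch for reading it and keeping the rows marked current (the default path assumes the repo's convention of running from main/):

```python
import csv

def readCurrentSourceVersions(path="../config/ramp_resource_version_update.txt"):
    """Read the tab-delimited version table and keep rows whose status is 'current'."""
    with open(path, newline="") as fh:
        rows = list(csv.DictReader(fh, delimiter="\t"))
    return [r for r in rows if r["status"] == "current"]

for row in readCurrentSourceVersions():
    print(row["data_source_id"], row["data_source_version"])
```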
88 changes: 47 additions & 41 deletions main/main.py
@@ -16,63 +16,66 @@
class Main():

def runEverything(self, resourceConfigFile, getDatabaseFiles = True):

start = time.time()

sql = writeToSQL()

# build the ramp resource config
resourceConf = RampConfig()
resourceConf.loadConfig(resourceConfigFile)

#stat = getStatistics()
stat = getStatistics()
hmdb = hmdbData(resourceConf)
wikipathways = WikipathwaysRDF(resourceConf)
reactome = reactomeData(resourceConf)
kegg = KeggData()
lipidmaps = lipidmapsChemData(resourceConf)
rhea = RheaParser(resourceConf)

# works based on your computer, setup working directory
os.chdir('../main/')
#
# #kegg.getEverything(False)
# #print("KEGG Wonder")
# print("Getting hmdb...")
# hmdb.getEverything(True)
# print("Getting wiki...")
# wikipathways.getEverything(True)
# print("Getting reactome...")
# reactome.getEverything(True)
#
# # This parses and writes lipid maps
# # sql write will be handled by EntityBuilder
# print("Getting LipidMaps...")
# lipidmaps.getEverything(True)
#
# print("Getting Rhea info...")
# rhea.processRhea()

#kegg.getEverything(False)
#print("KEGG Wonder")
print("Getting hmdb...")
hmdb.getEverything(True)
print("Getting wiki...")
wikipathways.getEverything(True)
print("Getting reactome...")
reactome.getEverything(True)

# This parses and writes lipid maps
# sql write will be handled by EntityBuilder
print("Getting LipidMaps...")
lipidmaps.getEverything(True)

print("Getting Rhea info...")
rhea.processRhea()

#Here are the identifiers that are present for each gene:
#kegg: keggid (mainID), 'Ensembl', 'HGNC', 'HPRD', 'NCBI-GeneID', 'NCBI-ProteinID', 'OMIM', 'UniProt', 'Vega', 'miRBase'
#wikipathways: (no mainID), 'Entrez', 'Enzyme Nomenclature', 'Uniprot (Uniprot-TrEMBL)'
#hmdb: HMDB-protein-accession (mainID), 'Uniprot'
#reactome:Uniprot (mainID)

"""
print('Generate compound id')
hmdbcompoundnum = sql.createRampCompoundID(hmdb.metaboliteIDDictionary, "hmdb", 0)
print("hmdbcompoundnum: ", hmdbcompoundnum)
keggcompoundnum = sql.createRampCompoundID(kegg.metaboliteIDDictionary, "kegg", hmdbcompoundnum)
wikicompoundnum = sql.createRampCompoundID(wikipathways.metaboliteIDDictionary, "wiki", keggcompoundnum)
print("wikicompoundnum: ", wikicompoundnum)
reactomecompoundnum = sql.createRampCompoundID(reactome.metaboliteIDDictionary, "reactome", wikicompoundnum)
print('Generate gene id ...')
hmdbgenenum = sql.createRampGeneID(hmdb.geneInfoDictionary, "hmdb", 0)
kegggenenum = sql.createRampGeneID(kegg.geneInfoDictionary, "kegg", hmdbgenenum)
wikigenenum = sql.createRampGeneID(wikipathways.geneInfoDictionary, "wiki", kegggenenum)
reactomegenenum = sql.createRampGeneID(reactome.geneInfoDictionary, "reactome", wikigenenum)
print(" hmdbgenenum ", hmdbgenenum, " kegggenenum ", kegggenenum, " wikigenenum ", wikigenenum, " reactomegenenum ", reactomegenenum)
"""

#
# #Here are the identifiers that are present for each gene:
# #kegg: keggid (mainID), 'Ensembl', 'HGNC', 'HPRD', 'NCBI-GeneID', 'NCBI-ProteinID', 'OMIM', 'UniProt', 'Vega', 'miRBase'
# #wikipathways: (no mainID), 'Entrez', 'Enzyme Nomenclature', 'Uniprot (Uniprot-TrEMBL)'
# #hmdb: HMDB-protein-accession (mainID), 'Uniprot'
# #reactome:Uniprot (mainID)
#
# print('Generate compound id')
# hmdbcompoundnum = sql.createRampCompoundID(hmdb.metaboliteIDDictionary, "hmdb", 0)
# print("hmdbcompoundnum: ", hmdbcompoundnum)
# keggcompoundnum = sql.createRampCompoundID(kegg.metaboliteIDDictionary, "kegg", hmdbcompoundnum)
# wikicompoundnum = sql.createRampCompoundID(wikipathways.metaboliteIDDictionary, "wiki", keggcompoundnum)
# print("wikicompoundnum: ", wikicompoundnum)
# reactomecompoundnum = sql.createRampCompoundID(reactome.metaboliteIDDictionary, "reactome", wikicompoundnum)
#
# print('Generate gene id ...')
# hmdbgenenum = sql.createRampGeneID(hmdb.geneInfoDictionary, "hmdb", 0)
# kegggenenum = sql.createRampGeneID(kegg.geneInfoDictionary, "kegg", hmdbgenenum)
# wikigenenum = sql.createRampGeneID(wikipathways.geneInfoDictionary, "wiki", kegggenenum)
# reactomegenenum = sql.createRampGeneID(reactome.geneInfoDictionary, "reactome", wikigenenum)
# print(" hmdbgenenum ", hmdbgenenum, " kegggenenum ", kegggenenum, " wikigenenum ", wikigenenum, " reactomegenenum ", reactomegenenum)
""" print('Write to sql file...')
hmdbnumbers = sql.write(
hmdb.metaboliteCommonName,
@@ -244,7 +247,10 @@ def runEverything(self, resourceConfigFile, getDatabaseFiles = True):
# the results are files for DB loading in /misc/sql

builder.fullBuild()

print(time.time() - start)


# Database loading is handled as a separate, un-coupled step.


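The net effect of the main.py changes is to re-enable the fetch/parse steps (HMDB, WikiPathways, Reactome, LipidMaps, Rhea) that were previously commented out, leaving EntityBuilder.fullBuild() to produce the DB load files and the database load itself as a separate, decoupled step. A repo-independent sketch of that orchestration pattern; the step names mirror the diff, but the lambdas are placeholders rather than the project's parser classes:

```python
import time

def runEverything(steps):
    """Run each named fetch/parse step in order and report total elapsed time."""
    start = time.time()
    for label, step in steps:
        print(f"Getting {label}...")
        step()
    print(f"elapsed: {time.time() - start:.1f}s")
    # Database loading is handled as a separate, uncoupled step.

# stub callables standing in for hmdb.getEverything(True), wikipathways.getEverything(True), etc.
steps = [
    ("hmdb", lambda: None),
    ("wiki", lambda: None),
    ("reactome", lambda: None),
    ("LipidMaps", lambda: None),
    ("Rhea info", lambda: None),
]
runEverything(steps)
```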
6 changes: 3 additions & 3 deletions main/mainDBLoad.py
@@ -79,8 +79,8 @@ def loadDBAfterTruncatingTables(self, incrementLevel = 'increment_patch_release'

# increment level 'increment_patch_release', 'increment_minor_release',
# or 'specified' (new version, perhaps major release)
loader.loadDBAfterTruncatingTables(incrementLevel = 'increment_patch_release',
optionalVersionOveride = "",
optionalVersionNote = "20220822 patch release, update chem_props inchi values.",
loader.loadDBAfterTruncatingTables(incrementLevel = 'specified',
optionalVersionOveride = "2.4.0",
optionalVersionNote = "20231027 Data refresh. Inchi-key harmonization.",
truncateTables=True)
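The loader is switched from an automatic patch bump to an explicitly specified version, 2.4.0, with a note describing the refresh. The repo's loader implements its own versioning, but a sketch of how the three increment modes named in the comment could behave (nextVersion is a hypothetical helper, not the project's code):

```python
def nextVersion(current, incrementLevel, override=""):
    """Illustrate the three increment modes:
    'increment_patch_release'  2.3.0 -> 2.3.1
    'increment_minor_release'  2.3.0 -> 2.4.0
    'specified'                use the explicit override, e.g. "2.4.0"
    """
    if incrementLevel == "specified":
        return override
    major, minor, patch = (int(p) for p in current.lstrip("v").split("."))
    if incrementLevel == "increment_minor_release":
        return f"{major}.{minor + 1}.0"
    if incrementLevel == "increment_patch_release":
        return f"{major}.{minor}.{patch + 1}"
    raise ValueError(f"unknown incrementLevel: {incrementLevel}")

print(nextVersion("2.3.0", "specified", "2.4.0"))        # 2.4.0, as in this commit
print(nextVersion("2.3.0", "increment_patch_release"))   # 2.3.1
```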

4 changes: 2 additions & 2 deletions main/mainSqliteDBLoad.py
@@ -36,7 +36,7 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa

# pass the credentials object to the constructed rampDBBulLoader

loader = SQLiteDBBulkLoader(self.dbPropsFile, sqliteFile)
loader = SQLiteDBBulkLoader(dbPropsFile=self.dbPropsFile, sqliteFileName=sqliteFile)


# truncate tables
@@ -79,7 +79,7 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa

# increment level 'increment_patch_release', 'increment_minor_release',
# or 'specified' (new version, perhaps major release)
loader.loadDBAfterTruncatingTables(sqliteFile = '../RaMP_SQLite_v2.3.0_Structure.sqlite', incrementLevel = 'specified',
loader.loadDBAfterTruncatingTables(sqliteFile = '/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_v2.3.0.sqlite', incrementLevel = 'specified',
optionalVersionOveride = "2.3.0",
optionalVersionNote = "20230727 data update/refresh release",
truncateTables=True)
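The functional change in the SQLite loader driver is small: SQLiteDBBulkLoader is now constructed with explicit keyword arguments (dbPropsFile, sqliteFileName) rather than positionally, and the driver points at a different SQLite file. Keyword arguments keep such calls self-documenting and robust if the constructor's parameter order ever changes; a tiny stand-alone illustration with a stub class (the paths are placeholders, and StubLoader is not the repo's loader):

```python
class StubLoader:
    """Stand-in with the same two leading parameters as SQLiteDBBulkLoader."""
    def __init__(self, dbPropsFile, sqliteFileName):
        self.dbPropsFile = dbPropsFile
        self.sqliteFileName = sqliteFileName

# keyword form: unambiguous even if the parameter order changes later
loader = StubLoader(dbPropsFile="../config/example_db.props",        # placeholder path
                    sqliteFileName="../RaMP_SQLite_example.sqlite")   # placeholder path
print(loader.sqliteFileName)
```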
6 changes: 3 additions & 3 deletions src/rampEntity/Metabolite.py
@@ -530,13 +530,13 @@ def getAveMW(self):
for sourceId in molDict:
mol = molDict[sourceId]

if(mol.mw is not None and mol.mw != ""):
if(mol.mw is not None and mol.mw != ""):
mw = float(mol.mw)
if not math.isnan(mw):
mws.append(mw)

if(len(mws) > 0):
medMw = median(mws)

return medMw
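Despite its name, getAveMW computes the median of the molecular weights reported across a metabolite's source records, skipping blank and NaN entries. A self-contained restatement of that logic, using SimpleNamespace stand-ins for the repo's Molecule objects:

```python
import math
from statistics import median
from types import SimpleNamespace

def medianMolecularWeight(molDict):
    """Median MW across a metabolite's source records, skipping blank and NaN values."""
    mws = []
    for sourceId in molDict:
        mol = molDict[sourceId]
        if mol.mw is not None and mol.mw != "":
            mw = float(mol.mw)
            if not math.isnan(mw):
                mws.append(mw)
    return median(mws) if mws else None

mols = {"hmdb:1": SimpleNamespace(mw="180.156"),
        "chebi:1": SimpleNamespace(mw="180.16"),
        "lipidmaps:1": SimpleNamespace(mw="")}      # blank values are ignored
print(medianMolecularWeight(mols))                  # 180.158
```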


8 changes: 4 additions & 4 deletions src/rampEntity/Molecule.py
@@ -22,9 +22,9 @@ def __init__(self):
self.inchiKey = ""

self.inchiKeyPrefix = ""

self.inchiKeyDuplex = ""

self.inchiKeyDuplex = ""

self.inchi = ""

self.mw = None
@@ -53,7 +53,7 @@ def toChemPropsString(self):
if len(self.names) > 0:
name = self.names[0]
s = self.source + "\t" + self.id + "\t" + self.smiles + "\t" + self.inchiKeyPrefix + "\t" + self.inchiKey + "\t" + self.inchi + "\t"

mw = self.mw
mi = self.monoisotopicMass

@@ -91,4 +91,4 @@ def toSynonymsString(self):
return s
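toChemPropsString serializes a molecule's chemical properties as one tab-delimited row (source, source id, SMILES, InChIKey prefix, InChIKey, InChI, then masses and name). A stand-alone sketch of that kind of serializer; the field order follows the diff, while the function name and the example values (ethanol, with a placeholder accession) are illustrative only:

```python
def toChemPropsRecord(source, sourceId, smiles, inchiKeyPrefix, inchiKey, inchi,
                      mw=None, monoisotopicMass=None, name="", molFormula=""):
    """Build one tab-delimited chem-props row; empty strings stand in for missing values."""
    fields = [source, sourceId, smiles, inchiKeyPrefix, inchiKey, inchi,
              "" if mw is None else str(mw),
              "" if monoisotopicMass is None else str(monoisotopicMass),
              name, molFormula]
    return "\t".join(fields) + "\n"

# placeholder accession; structure values are for ethanol
print(toChemPropsRecord("hmdb", "HMDB_placeholder_id", "CCO",
                        "LFQSCWFLJHTTHZ", "LFQSCWFLJHTTHZ-UHFFFAOYSA-N",
                        "InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3",
                        mw=46.07, name="ethanol"))
```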




5 changes: 3 additions & 2 deletions src/util/EntityBuilder.py
@@ -1831,8 +1831,9 @@ def populateExclusionList(self, filePath):
print("Exclusion List Size = " + str(len(list(self.sourceIdToExtIdDict.keys()))))


builder = EntityBuilder()
builder.crossCheckMetaboliteHarmony(buildMetAndCompoundProps = True, criteria = "MW", tolerance = 0.1, pctOrAbs = 'pct')
# builder = EntityBuilder()
# builder.crossCheckMetaboliteHarmony(buildMetAndCompoundProps = True, criteria = "MW", tolerance = 0.1, pctOrAbs = 'pct')

# builder.fullBuild()
# print("starting to load metabolites")
# builder.loadMetaboList()
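This hunk comments out the module-level driver calls so that importing EntityBuilder no longer triggers a metabolite-harmony check as a side effect. An alternative sketch, using the same calls that were commented out, which would sit at the bottom of src/util/EntityBuilder.py (EntityBuilder is defined above it in that module) and keep the ad-hoc driver runnable without import side effects:

```python
# Guarding the driver calls keeps `import EntityBuilder` side-effect free while
# `python EntityBuilder.py` still runs the ad-hoc build.
if __name__ == "__main__":
    builder = EntityBuilder()
    builder.crossCheckMetaboliteHarmony(buildMetAndCompoundProps=True,
                                        criteria="MW", tolerance=0.1, pctOrAbs="pct")
    builder.fullBuild()
```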
26 changes: 16 additions & 10 deletions src/util/RampSupplementalDataBuilder.py
@@ -49,7 +49,13 @@ def createSQLiteEngine(self, sqliteFile=None):
return engine

def createMySQLEngine(self, dbConf = None):
engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=dbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname)), echo=False)
print("In ramp supplimental data builder, building mysql engine")
dbConf.dumpConfig()
print(type(dbConf.port))
conStr = ("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}?port={port}").format(username=dbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname,port=dbConf.port)
print(conStr)
engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}:{port}/{dbname}").format(username=dbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname,port=dbConf.port)), echo=False)

return engine


@@ -123,7 +129,6 @@ def buildAnalyteSet(self, dataSource, geneOrMet):

print("building analyte stat set")

# NOTE the % has to be escaped for mysql, also works for sqlite, but is optional for sqlite.
rampIdPrefix = "RAMP_C%%"
if geneOrMet == 'gene':
rampIdPrefix = "RAMP_G%%"
@@ -135,9 +140,7 @@
df = None

with self.engine.connect() as conn:

print(sql)


df = conn.execute(sql).all()
df = pd.DataFrame(df)

@@ -146,6 +149,10 @@
print("Stats header")
print(df.columns)
print(type(df))

df.columns = ['pathwayRampID', 'Freq', 'pathwaySource']
print(df.columns)


print(df.head(5))

@@ -154,12 +161,11 @@
return df


pwob = RampSupplementalDataBuilder(dbType = "sqlite", sqliteCreds = "X:\\braistedjc\\tmp_work\\RaMP_SQLite_v2.3.1b.sqlite")
#pwob = RampSupplementalDataBuilder(dbType = "sqlite", sqliteCreds = "/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_v2.3.1b.sqlite")
#pwob.listTables()
dm = pwob.buildSimilarityMatrix(matrixType = "analytes")
print(dm.values.sum())

# pwob.buildSimilarityMatrix(matrixType = "genes")
#pwob.buildBaseMatrix(matrixType = "analytes")
#dm = pwob.buildSimilarityMatrix(matrixType = "analytes")
#print(str(dm.values.sum()))

#pwob.buildAnalyteSet("wiki", "met")
#pwob.buildAnalyteSet("wiki", "gene")
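The rewritten createMySQLEngine now carries the configured port into the connection URL; a pymysql SQLAlchemy URL has the form mysql+pymysql://user:password@host:port/dbname. A self-contained sketch of that construction (the credentials below are placeholders). Two small caveats visible in the diff: the debug conStr passes the port as a ?port= query parameter while the engine URL places it after the host, and printing conStr echoes the password to the console; neither affects the engine that is actually returned.

```python
from sqlalchemy import create_engine

def createMySQLEngine(username, conpass, host, port, dbname):
    """Build a SQLAlchemy MySQL engine via pymysql, with the port placed after the host."""
    conStr = "mysql+pymysql://{u}:{p}@{h}:{port}/{db}".format(
        u=username, p=conpass, h=host, port=port, db=dbname)
    return create_engine(conStr, echo=False)

# placeholder credentials for illustration only
engine = createMySQLEngine("ramp_user", "not_a_real_password", "localhost", 3306, "ramp")
print(engine.url.host, engine.url.port)  # localhost 3306
```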
