Commit

sqlite load, data refresh Oct_2023 config, inchi-key harmonization patch
johnbraisted committed Oct 27, 2023
1 parent 0973f6b commit 1867aac
Showing 12 changed files with 188 additions and 138 deletions.
2 changes: 1 addition & 1 deletion config/db_load_resource_config.txt
@@ -4,7 +4,7 @@ ready analytesource.txt source bulk None "sourceId,rampId,IDtype,geneOrCompound,
ready analytesynonym.txt analytesynonym bulk None "Synonym,rampId,geneOrCompound,source"
ready analytetopathway.txt analytehaspathway bulk None "rampId,pathwayRampId,pathwaySource"
ready analyte.txt analyte bulk rampId "rampId,type"
ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId, proteinType"
ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId,proteinType"
empty reactomecatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId"
empty wikicatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId"
ready chemProps.txt chem_props bulk None "ramp_id,chem_data_source,chem_source_id,iso_smiles,inchi_key_prefix,inchi_key,inchi,mw,monoisotop_mass,common_name,mol_formula"
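Each row of this load config appears to describe one bulk-load step: a status flag, the staged source file, the destination table, a load mode, a primary-key column (or None), and a quoted comma-separated column list; the change above simply drops a stray space from the catalyzed column list. A minimal parsing sketch under that assumed field order (LoadSpec and parseLoadConfigLine are illustrative names, not part of the repo):

```python
import shlex
from typing import List, NamedTuple, Optional

class LoadSpec(NamedTuple):
    status: str              # e.g. "ready" or "empty"
    sourceFile: str          # staged file, e.g. "catalyzes.txt"
    table: str               # destination table, e.g. "catalyzed"
    loadMode: str            # e.g. "bulk"
    primaryKey: Optional[str]
    columns: List[str]       # names from the quoted, comma-separated field

def parseLoadConfigLine(line: str) -> LoadSpec:
    # shlex keeps the quoted column list together as one token
    status, sourceFile, table, loadMode, pk, cols = shlex.split(line)
    return LoadSpec(status, sourceFile, table, loadMode,
                    None if pk == "None" else pk,
                    [c.strip() for c in cols.split(",")])

spec = parseLoadConfigLine(
    'ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId,proteinType"')
print(spec.columns)   # ['rampCompoundId', 'rampGeneId', 'proteinType']
```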
2 changes: 1 addition & 1 deletion config/external_resource_config.txt
@@ -4,7 +4,7 @@ hmdb_gene http https://hmdb.ca/system/downloads/current/hmdb_proteins.zip hmdb_p
hmdb_met_sdf http https://hmdb.ca/system/downloads/current/structures.zip structures.zip structures.sdf ../misc/data/chemprops/hmdb/ zip chem_props_sdf
reactome_met http http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_mets
reactome_gene http http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_genes
wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20230710-rdf-wp.zip wikipathways-20230710-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231010-rdf-wp.zip wikipathways-20231010-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
chebi_met_sdf ftp https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf ../misc/data/chemprops/chebi/ gzip chem_props_sdf
lipidmaps_met http https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip LMSD.sdf.zip structures.sdf ../misc/data/chemprops/lipidmaps/ zip chem_props_sdf
uniprot_human http https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz uniprot_sprot_human.dat.gz uniprot_sprot_human.dat ../misc/data/uniprot_human/ gzip proteins
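Each row here seems to define one remote resource: an id, protocol, download URL, the downloaded file name, the extracted file name, a local target directory, the compression scheme (zip, gzip, or none), and a resource type; the only edit is bumping the WikiPathways RDF bundle from 20230710 to 20231010. A hedged sketch of the kind of fetch-and-unpack step such a row could drive (fetchResource is a hypothetical helper, not the project's downloader):

```python
import gzip
import os
import shutil
import urllib.request
import zipfile

def fetchResource(url, downloadName, localDir, compression):
    """Download one configured resource and unpack it according to its compression field."""
    os.makedirs(localDir, exist_ok=True)
    target = os.path.join(localDir, downloadName)
    urllib.request.urlretrieve(url, target)

    if compression == "zip":
        with zipfile.ZipFile(target) as zf:
            zf.extractall(localDir)
    elif compression == "gzip":
        # strip the .gz suffix for the extracted file name
        with gzip.open(target, "rb") as src, open(target[:-3], "wb") as dst:
            shutil.copyfileobj(src, dst)
    # compression == "none": the downloaded file is used as-is

# e.g. the WikiPathways RDF bundle updated in this commit
fetchResource(
    "https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231010-rdf-wp.zip",
    "wikipathways-20231010-rdf-wp.zip",
    "../misc/data/wikipathwaysRDF/",
    "zip")
```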
14 changes: 7 additions & 7 deletions config/ramp_resource_version_update.txt
@@ -1,8 +1,8 @@
ramp_db_version db_mod_date status data_source_id data_source_name data_source_url data_source_version
v2.3.0 7/20/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17)
v2.3.0 7/20/2023 current reactome Reactome https://reactome.org/ v85 (May 2023)
v2.3.0 7/20/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20230710 (2023-07-10)
v2.3.0 7/20/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17)
v2.3.0 7/20/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 223 (2023-07-01)
v2.3.0 7/20/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-07-12
v2.3.0 8/3/2022 current rhea Rhea https://www.rhea-db.org/ Release 128 (2023-06-28)
v2.4.0 10/24/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17)
v2.4.0 10/24/2023 current reactome Reactome https://reactome.org/ v86 (Sep 2023)
v2.4.0 10/24/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20231010 (2023-10-10)
v2.4.0 10/24/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17)
v2.4.0 10/24/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 226 (2023-10-01)
v2.4.0 10/24/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-10-24
v2.4.0 10/24/2023 current rhea Rhea https://www.rhea-db.org/ Release 128 (2023-06-28)
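The version table is refreshed for v2.4.0 with the October 2023 source releases. Since the version strings themselves contain spaces, the file is presumably tab-delimited; a small sketch for reading it and keeping the rows marked current (the default path assumes the repo's convention of running from main/):

```python
import csv

def readCurrentSourceVersions(path="../config/ramp_resource_version_update.txt"):
    """Read the tab-delimited version table and keep rows whose status is 'current'."""
    with open(path, newline="") as fh:
        rows = list(csv.DictReader(fh, delimiter="\t"))
    return [r for r in rows if r["status"] == "current"]

for row in readCurrentSourceVersions():
    print(row["data_source_id"], row["data_source_version"])
```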
88 changes: 47 additions & 41 deletions main/main.py
@@ -16,63 +16,66 @@
class Main():

def runEverything(self, resourceConfigFile, getDatabaseFiles = True):

start = time.time()

sql = writeToSQL()

# build the ramp resource config
resourceConf = RampConfig()
resourceConf.loadConfig(resourceConfigFile)

#stat = getStatistics()
stat = getStatistics()
hmdb = hmdbData(resourceConf)
wikipathways = WikipathwaysRDF(resourceConf)
reactome = reactomeData(resourceConf)
kegg = KeggData()
lipidmaps = lipidmapsChemData(resourceConf)
rhea = RheaParser(resourceConf)

# works based on your computer, setup working directory
os.chdir('../main/')
#
# #kegg.getEverything(False)
# #print("KEGG Wonder")
# print("Getting hmdb...")
# hmdb.getEverything(True)
# print("Getting wiki...")
# wikipathways.getEverything(True)
# print("Getting reactome...")
# reactome.getEverything(True)
#
# # This parses and writes lipid maps
# # sql write will be handled by EntityBuilder
# print("Getting LipidMaps...")
# lipidmaps.getEverything(True)
#
# print("Getting Rhea info...")
# rhea.processRhea()

#kegg.getEverything(False)
#print("KEGG Wonder")
print("Getting hmdb...")
hmdb.getEverything(True)
print("Getting wiki...")
wikipathways.getEverything(True)
print("Getting reactome...")
reactome.getEverything(True)

# This parses and writes lipid maps
# sql write will be handled by EntityBuilder
print("Getting LipidMaps...")
lipidmaps.getEverything(True)

print("Getting Rhea info...")
rhea.processRhea()

#Here are the identifiers that are present for each gene:
#kegg: keggid (mainID), 'Ensembl', 'HGNC', 'HPRD', 'NCBI-GeneID', 'NCBI-ProteinID', 'OMIM', 'UniProt', 'Vega', 'miRBase'
#wikipathways: (no mainID), 'Entrez', 'Enzyme Nomenclature', 'Uniprot (Uniprot-TrEMBL)'
#hmdb: HMDB-protein-accession (mainID), 'Uniprot'
#reactome:Uniprot (mainID)

"""
print('Generate compound id')
hmdbcompoundnum = sql.createRampCompoundID(hmdb.metaboliteIDDictionary, "hmdb", 0)
print("hmdbcompoundnum: ", hmdbcompoundnum)
keggcompoundnum = sql.createRampCompoundID(kegg.metaboliteIDDictionary, "kegg", hmdbcompoundnum)
wikicompoundnum = sql.createRampCompoundID(wikipathways.metaboliteIDDictionary, "wiki", keggcompoundnum)
print("wikicompoundnum: ", wikicompoundnum)
reactomecompoundnum = sql.createRampCompoundID(reactome.metaboliteIDDictionary, "reactome", wikicompoundnum)
print('Generate gene id ...')
hmdbgenenum = sql.createRampGeneID(hmdb.geneInfoDictionary, "hmdb", 0)
kegggenenum = sql.createRampGeneID(kegg.geneInfoDictionary, "kegg", hmdbgenenum)
wikigenenum = sql.createRampGeneID(wikipathways.geneInfoDictionary, "wiki", kegggenenum)
reactomegenenum = sql.createRampGeneID(reactome.geneInfoDictionary, "reactome", wikigenenum)
print(" hmdbgenenum ", hmdbgenenum, " kegggenenum ", kegggenenum, " wikigenenum ", wikigenenum, " reactomegenenum ", reactomegenenum)
"""

#
# #Here are the identifiers that are present for each gene:
# #kegg: keggid (mainID), 'Ensembl', 'HGNC', 'HPRD', 'NCBI-GeneID', 'NCBI-ProteinID', 'OMIM', 'UniProt', 'Vega', 'miRBase'
# #wikipathways: (no mainID), 'Entrez', 'Enzyme Nomenclature', 'Uniprot (Uniprot-TrEMBL)'
# #hmdb: HMDB-protein-accession (mainID), 'Uniprot'
# #reactome:Uniprot (mainID)
#
# print('Generate compound id')
# hmdbcompoundnum = sql.createRampCompoundID(hmdb.metaboliteIDDictionary, "hmdb", 0)
# print("hmdbcompoundnum: ", hmdbcompoundnum)
# keggcompoundnum = sql.createRampCompoundID(kegg.metaboliteIDDictionary, "kegg", hmdbcompoundnum)
# wikicompoundnum = sql.createRampCompoundID(wikipathways.metaboliteIDDictionary, "wiki", keggcompoundnum)
# print("wikicompoundnum: ", wikicompoundnum)
# reactomecompoundnum = sql.createRampCompoundID(reactome.metaboliteIDDictionary, "reactome", wikicompoundnum)
#
# print('Generate gene id ...')
# hmdbgenenum = sql.createRampGeneID(hmdb.geneInfoDictionary, "hmdb", 0)
# kegggenenum = sql.createRampGeneID(kegg.geneInfoDictionary, "kegg", hmdbgenenum)
# wikigenenum = sql.createRampGeneID(wikipathways.geneInfoDictionary, "wiki", kegggenenum)
# reactomegenenum = sql.createRampGeneID(reactome.geneInfoDictionary, "reactome", wikigenenum)
# print(" hmdbgenenum ", hmdbgenenum, " kegggenenum ", kegggenenum, " wikigenenum ", wikigenenum, " reactomegenenum ", reactomegenenum)
""" print('Write to sql file...')
hmdbnumbers = sql.write(
hmdb.metaboliteCommonName,
@@ -244,7 +247,10 @@ def runEverything(self, resourceConfigFile, getDatabaseFiles = True):
# the results are files for DB loading in /misc/sql

builder.fullBuild()

print(time.time() - start)


# Database loading is handled as a separate, un-coupled step.


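The net effect of the main.py changes is to re-enable the fetch/parse steps (HMDB, WikiPathways, Reactome, LipidMaps, Rhea) that were previously commented out, leaving EntityBuilder.fullBuild() to produce the DB load files and the database load itself as a separate, decoupled step. A repo-independent sketch of that orchestration pattern; the step names mirror the diff, but the lambdas are placeholders rather than the project's parser classes:

```python
import time

def runEverything(steps):
    """Run each named fetch/parse step in order and report total elapsed time."""
    start = time.time()
    for label, step in steps:
        print(f"Getting {label}...")
        step()
    print(f"elapsed: {time.time() - start:.1f}s")
    # Database loading is handled as a separate, uncoupled step.

# stub callables standing in for hmdb.getEverything(True), wikipathways.getEverything(True), etc.
steps = [
    ("hmdb", lambda: None),
    ("wiki", lambda: None),
    ("reactome", lambda: None),
    ("LipidMaps", lambda: None),
    ("Rhea info", lambda: None),
]
runEverything(steps)
```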
6 changes: 3 additions & 3 deletions main/mainDBLoad.py
@@ -79,8 +79,8 @@ def loadDBAfterTruncatingTables(self, incrementLevel = 'increment_patch_release'

# increment level 'increment_patch_release', 'increment_minor_release',
# or 'specified' (new version, perhaps major release)
loader.loadDBAfterTruncatingTables(incrementLevel = 'increment_patch_release',
optionalVersionOveride = "",
optionalVersionNote = "20220822 patch release, update chem_props inchi values.",
loader.loadDBAfterTruncatingTables(incrementLevel = 'specified',
optionalVersionOveride = "2.4.0",
optionalVersionNote = "20231027 Data refresh. Inchi-key harmonization.",
truncateTables=True)
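The loader is switched from an automatic patch bump to an explicitly specified version, 2.4.0, with a note describing the refresh. The repo's loader implements its own versioning, but a sketch of how the three increment modes named in the comment could behave (nextVersion is a hypothetical helper, not the project's code):

```python
def nextVersion(current, incrementLevel, override=""):
    """Illustrate the three increment modes:
    'increment_patch_release'  2.3.0 -> 2.3.1
    'increment_minor_release'  2.3.0 -> 2.4.0
    'specified'                use the explicit override, e.g. "2.4.0"
    """
    if incrementLevel == "specified":
        return override
    major, minor, patch = (int(p) for p in current.lstrip("v").split("."))
    if incrementLevel == "increment_minor_release":
        return f"{major}.{minor + 1}.0"
    if incrementLevel == "increment_patch_release":
        return f"{major}.{minor}.{patch + 1}"
    raise ValueError(f"unknown incrementLevel: {incrementLevel}")

print(nextVersion("2.3.0", "specified", "2.4.0"))        # 2.4.0, as in this commit
print(nextVersion("2.3.0", "increment_patch_release"))   # 2.3.1
```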

4 changes: 2 additions & 2 deletions main/mainSqliteDBLoad.py
@@ -36,7 +36,7 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa

# pass the credentials object to the constructed rampDBBulLoader

loader = SQLiteDBBulkLoader(self.dbPropsFile, sqliteFile)
loader = SQLiteDBBulkLoader(dbPropsFile=self.dbPropsFile, sqliteFileName=sqliteFile)


# truncate tables
@@ -79,7 +79,7 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa

# increment level 'increment_patch_release', 'increment_minor_release',
# or 'specified' (new version, perhaps major release)
loader.loadDBAfterTruncatingTables(sqliteFile = '../RaMP_SQLite_v2.3.0_Structure.sqlite', incrementLevel = 'specified',
loader.loadDBAfterTruncatingTables(sqliteFile = '/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_v2.3.0.sqlite', incrementLevel = 'specified',
optionalVersionOveride = "2.3.0",
optionalVersionNote = "20230727 data update/refresh release",
truncateTables=True)
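The functional change in the SQLite loader driver is small: SQLiteDBBulkLoader is now constructed with explicit keyword arguments (dbPropsFile, sqliteFileName) rather than positionally, and the driver points at a different SQLite file. Keyword arguments keep such calls self-documenting and robust if the constructor's parameter order ever changes; a tiny stand-alone illustration with a stub class (the paths are placeholders, and StubLoader is not the repo's loader):

```python
class StubLoader:
    """Stand-in with the same two leading parameters as SQLiteDBBulkLoader."""
    def __init__(self, dbPropsFile, sqliteFileName):
        self.dbPropsFile = dbPropsFile
        self.sqliteFileName = sqliteFileName

# keyword form: unambiguous even if the parameter order changes later
loader = StubLoader(dbPropsFile="../config/example_db.props",        # placeholder path
                    sqliteFileName="../RaMP_SQLite_example.sqlite")   # placeholder path
print(loader.sqliteFileName)
```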
6 changes: 3 additions & 3 deletions src/rampEntity/Metabolite.py
@@ -530,13 +530,13 @@ def getAveMW(self):
for sourceId in molDict:
mol = molDict[sourceId]

if(mol.mw is not None and mol.mw != ""):
if(mol.mw is not None and mol.mw != ""):
mw = float(mol.mw)
if not math.isnan(mw):
mws.append(mw)

if(len(mws) > 0):
medMw = median(mws)

return medMw
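Despite its name, getAveMW computes the median of the molecular weights reported across a metabolite's source records, skipping blank and NaN entries. A self-contained restatement of that logic, using SimpleNamespace stand-ins for the repo's Molecule objects:

```python
import math
from statistics import median
from types import SimpleNamespace

def medianMolecularWeight(molDict):
    """Median MW across a metabolite's source records, skipping blank and NaN values."""
    mws = []
    for sourceId in molDict:
        mol = molDict[sourceId]
        if mol.mw is not None and mol.mw != "":
            mw = float(mol.mw)
            if not math.isnan(mw):
                mws.append(mw)
    return median(mws) if mws else None

mols = {"hmdb:1": SimpleNamespace(mw="180.156"),
        "chebi:1": SimpleNamespace(mw="180.16"),
        "lipidmaps:1": SimpleNamespace(mw="")}      # blank values are ignored
print(medianMolecularWeight(mols))                  # 180.158
```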


8 changes: 4 additions & 4 deletions src/rampEntity/Molecule.py
@@ -22,9 +22,9 @@ def __init__(self):
self.inchiKey = ""

self.inchiKeyPrefix = ""

self.inchiKeyDuplex = ""

self.inchiKeyDuplex = ""

self.inchi = ""

self.mw = None
@@ -53,7 +53,7 @@ def toChemPropsString(self):
if len(self.names) > 0:
name = self.names[0]
s = self.source + "\t" + self.id + "\t" + self.smiles + "\t" + self.inchiKeyPrefix + "\t" + self.inchiKey + "\t" + self.inchi + "\t"

mw = self.mw
mi = self.monoisotopicMass

@@ -91,4 +91,4 @@ def toSynonymsString(self):
return s
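toChemPropsString serializes a molecule's chemical properties as one tab-delimited row (source, source id, SMILES, InChIKey prefix, InChIKey, InChI, then masses and name). A stand-alone sketch of that kind of serializer; the field order follows the diff, while the function name and the example values (ethanol, with a placeholder accession) are illustrative only:

```python
def toChemPropsRecord(source, sourceId, smiles, inchiKeyPrefix, inchiKey, inchi,
                      mw=None, monoisotopicMass=None, name="", molFormula=""):
    """Build one tab-delimited chem-props row; empty strings stand in for missing values."""
    fields = [source, sourceId, smiles, inchiKeyPrefix, inchiKey, inchi,
              "" if mw is None else str(mw),
              "" if monoisotopicMass is None else str(monoisotopicMass),
              name, molFormula]
    return "\t".join(fields) + "\n"

# placeholder accession; structure values are for ethanol
print(toChemPropsRecord("hmdb", "HMDB_placeholder_id", "CCO",
                        "LFQSCWFLJHTTHZ", "LFQSCWFLJHTTHZ-UHFFFAOYSA-N",
                        "InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3",
                        mw=46.07, name="ethanol"))
```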




5 changes: 3 additions & 2 deletions src/util/EntityBuilder.py
@@ -1831,8 +1831,9 @@ def populateExclusionList(self, filePath):
print("Exclusion List Size = " + str(len(list(self.sourceIdToExtIdDict.keys()))))


builder = EntityBuilder()
builder.crossCheckMetaboliteHarmony(buildMetAndCompoundProps = True, criteria = "MW", tolerance = 0.1, pctOrAbs = 'pct')
# builder = EntityBuilder()
# builder.crossCheckMetaboliteHarmony(buildMetAndCompoundProps = True, criteria = "MW", tolerance = 0.1, pctOrAbs = 'pct')

# builder.fullBuild()
# print("starting to load metabolites")
# builder.loadMetaboList()
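This hunk comments out the module-level driver calls so that importing EntityBuilder no longer triggers a metabolite-harmony check as a side effect. An alternative sketch, using the same calls that were commented out, which would sit at the bottom of src/util/EntityBuilder.py (EntityBuilder is defined above it in that module) and keep the ad-hoc driver runnable without import side effects:

```python
# Guarding the driver calls keeps `import EntityBuilder` side-effect free while
# `python EntityBuilder.py` still runs the ad-hoc build.
if __name__ == "__main__":
    builder = EntityBuilder()
    builder.crossCheckMetaboliteHarmony(buildMetAndCompoundProps=True,
                                        criteria="MW", tolerance=0.1, pctOrAbs="pct")
    builder.fullBuild()
```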
26 changes: 16 additions & 10 deletions src/util/RampSupplementalDataBuilder.py
@@ -49,7 +49,13 @@ def createSQLiteEngine(self, sqliteFile=None):
return engine

def createMySQLEngine(self, dbConf = None):
engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=dbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname)), echo=False)
print("In ramp supplimental data builder, building mysql engine")
dbConf.dumpConfig()
print(type(dbConf.port))
conStr = ("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}?port={port}").format(username=dbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname,port=dbConf.port)
print(conStr)
engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}:{port}/{dbname}").format(username=dbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname,port=dbConf.port)), echo=False)

return engine


@@ -123,7 +129,6 @@ def buildAnalyteSet(self, dataSource, geneOrMet):

print("building analyte stat set")

# NOTE the % has to be escaped for mysql, also works for sqlite, but is optional for sqlite.
rampIdPrefix = "RAMP_C%%"
if geneOrMet == 'gene':
rampIdPrefix = "RAMP_G%%"
@@ -135,9 +140,7 @@
df = None

with self.engine.connect() as conn:

print(sql)


df = conn.execute(sql).all()
df = pd.DataFrame(df)

@@ -146,6 +149,10 @@
print("Stats header")
print(df.columns)
print(type(df))

df.columns = ['pathwayRampID', 'Freq', 'pathwaySource']
print(df.columns)


print(df.head(5))

@@ -154,12 +161,11 @@
return df


pwob = RampSupplementalDataBuilder(dbType = "sqlite", sqliteCreds = "X:\\braistedjc\\tmp_work\\RaMP_SQLite_v2.3.1b.sqlite")
#pwob = RampSupplementalDataBuilder(dbType = "sqlite", sqliteCreds = "/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_v2.3.1b.sqlite")
#pwob.listTables()
dm = pwob.buildSimilarityMatrix(matrixType = "analytes")
print(dm.values.sum())

# pwob.buildSimilarityMatrix(matrixType = "genes")
#pwob.buildBaseMatrix(matrixType = "analytes")
#dm = pwob.buildSimilarityMatrix(matrixType = "analytes")
#print(str(dm.values.sum()))

#pwob.buildAnalyteSet("wiki", "met")
#pwob.buildAnalyteSet("wiki", "gene")
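The rewritten createMySQLEngine now carries the configured port into the connection URL; a pymysql SQLAlchemy URL has the form mysql+pymysql://user:password@host:port/dbname. A self-contained sketch of that construction (the credentials below are placeholders). Two small caveats visible in the diff: the debug conStr passes the port as a ?port= query parameter while the engine URL places it after the host, and printing conStr echoes the password to the console; neither affects the engine that is actually returned.

```python
from sqlalchemy import create_engine

def createMySQLEngine(username, conpass, host, port, dbname):
    """Build a SQLAlchemy MySQL engine via pymysql, with the port placed after the host."""
    conStr = "mysql+pymysql://{u}:{p}@{h}:{port}/{db}".format(
        u=username, p=conpass, h=host, port=port, db=dbname)
    return create_engine(conStr, echo=False)

# placeholder credentials for illustration only
engine = createMySQLEngine("ramp_user", "not_a_real_password", "localhost", 3306, "ramp")
print(engine.url.host, engine.url.port)  # localhost 3306
```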
