diff --git a/config/db_load_resource_config.txt b/config/db_load_resource_config.txt index 14f52f5..8ecfbd8 100644 --- a/config/db_load_resource_config.txt +++ b/config/db_load_resource_config.txt @@ -4,7 +4,7 @@ ready analytesource.txt source bulk None "sourceId,rampId,IDtype,geneOrCompound, ready analytesynonym.txt analytesynonym bulk None "Synonym,rampId,geneOrCompound,source" ready analytetopathway.txt analytehaspathway bulk None "rampId,pathwayRampId,pathwaySource" ready analyte.txt analyte bulk rampId "rampId,type" -ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId, proteinType" +ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId,proteinType" empty reactomecatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId" empty wikicatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId" ready chemProps.txt chem_props bulk None "ramp_id,chem_data_source,chem_source_id,iso_smiles,inchi_key_prefix,inchi_key,inchi,mw,monoisotop_mass,common_name,mol_formula" @@ -15,3 +15,4 @@ ready reaction.txt reaction bulk ramp_rxn_id "ramp_rxn_id,rxn_source_id,status,i ready reaction_to_metabolite.txt reaction2met bulk None "ramp_rxn_id,rxn_source_id,ramp_cmpd_id,substrate_product,met_source_id,met_name,is_cofactor" ready reaction_to_protein.txt reaction2protein bulk None "ramp_rxn_id,rxn_source_id,ramp_gene_id,uniprot,protein_name" ready reaction_protein_to_metabolite.txt reaction_protein2met bulk None "ramp_rxn_id,rxn_source_id,ramp_gene_id,gene_source_id,substrate_product,ramp_cmpd_id,cmpd_source_id,cmpd_name,is_cofactor" +ready rheaReactionToEcClass.txt reaction_ec_class bulk None "ramp_rxn_id,rxn_source_id,rxn_class_ec,ec_level,rxn_class,rxn_class_hierarchy" diff --git a/config/external_resource_config.txt b/config/external_resource_config.txt index 2ce0fc6..89c80a1 100644 --- a/config/external_resource_config.txt +++ b/config/external_resource_config.txt @@ -4,7 +4,7 @@ hmdb_gene http 
https://hmdb.ca/system/downloads/current/hmdb_proteins.zip hmdb_p hmdb_met_sdf http https://hmdb.ca/system/downloads/current/structures.zip structures.zip structures.sdf ../misc/data/chemprops/hmdb/ zip chem_props_sdf reactome_met http http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_mets reactome_gene http http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_genes -wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20230710-rdf-wp.zip wikipathways-20230710-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes +wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231010-rdf-wp.zip wikipathways-20231010-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes chebi_met_sdf ftp https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf ../misc/data/chemprops/chebi/ gzip chem_props_sdf lipidmaps_met http https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip LMSD.sdf.zip structures.sdf ../misc/data/chemprops/lipidmaps/ zip chem_props_sdf uniprot_human http https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz uniprot_sprot_human.dat.gz uniprot_sprot_human.dat ../misc/data/uniprot_human/ gzip proteins @@ -14,3 +14,4 @@ rhea_to_ec http https://ftp.expasy.org/databases/rhea/tsv/rhea2ec.tsv rhea2ec.ts rhea_rxn_direction http https://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv rhea-directions.tsv rhea-directions.tsv ../misc/data/rhea/ none rhea_rxn_direction_table chebi_to_chebi_relations http http://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv 
relation.tsv relation.tsv ../misc/data/chebi/ none chebi_relations chebi_ontology_owl http http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz chebi.owl.gz chebi.owl ../misc/data/chebi/ gzip chebi_ontology +expasy_ec2class ftp https://ftp.expasy.org/databases/enzyme/enzclass.txt enzclass.txt enzclass.txt ../misc/data/rhea/ none expasy_ec2class diff --git a/config/ramp_resource_version_update.txt b/config/ramp_resource_version_update.txt index 7a7bf7e..ea4f13a 100644 --- a/config/ramp_resource_version_update.txt +++ b/config/ramp_resource_version_update.txt @@ -1,8 +1,8 @@ ramp_db_version db_mod_date status data_source_id data_source_name data_source_url data_source_version -v2.3.0 7/20/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17) -v2.3.0 7/20/2023 current reactome Reactome https://reactome.org/ v85 (May 2023) -v2.3.0 7/20/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20230710 (2023-07-10) -v2.3.0 7/20/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17) -v2.3.0 7/20/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 223 (2023-07-01) -v2.3.0 7/20/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-07-12 -v2.3.0 8/3/2022 current rhea Rhea https://www.rhea-db.org/ Release 128 (2023-06-28) +v2.4.1 10/24/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17) +v2.4.1 10/24/2023 current reactome Reactome https://reactome.org/ v86 (Sep 2023) +v2.4.1 10/24/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20231010 (2023-10-10) +v2.4.1 10/24/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17) +v2.4.1 10/24/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 226 (2023-10-01) +v2.4.1 10/24/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-10-24 +v2.4.1 10/24/2023 current rhea Rhea https://www.rhea-db.org/ Release 129 (2023-09-13) diff --git a/main/main.py
b/main/main.py index f3e00b4..9b2064f 100755 --- a/main/main.py +++ b/main/main.py @@ -16,225 +16,42 @@ class Main(): def runEverything(self, resourceConfigFile, getDatabaseFiles = True): + + start = time.time() + sql = writeToSQL() # build the ramp resource config resourceConf = RampConfig() resourceConf.loadConfig(resourceConfigFile) - - #stat = getStatistics() + + stat = getStatistics() hmdb = hmdbData(resourceConf) wikipathways = WikipathwaysRDF(resourceConf) reactome = reactomeData(resourceConf) kegg = KeggData() lipidmaps = lipidmapsChemData(resourceConf) rhea = RheaParser(resourceConf) - + # works based on your computer, setup working directory os.chdir('../main/') -# -# #kegg.getEverything(False) -# #print("KEGG Wonder") -# print("Getting hmdb...") -# hmdb.getEverything(True) -# print("Getting wiki...") -# wikipathways.getEverything(True) -# print("Getting reactome...") -# reactome.getEverything(True) -# -# # This parses and writes lipid maps -# # sql write will be handled by EntityBuilder -# print("Getting LipidMaps...") -# lipidmaps.getEverything(True) -# -# print("Getting Rhea info...") -# rhea.processRhea() - - - -# -# #Here are the identifiers that are present for each gene: -# #kegg: keggid (mainID), 'Ensembl', 'HGNC', 'HPRD', 'NCBI-GeneID', 'NCBI-ProteinID', 'OMIM', 'UniProt', 'Vega', 'miRBase' -# #wikipathways: (no mainID), 'Entrez', 'Enzyme Nomenclature', 'Uniprot (Uniprot-TrEMBL) -# #hmdb: HMDB-protien-accession (mainID), 'Uniprot' -# #reactome:Uniprot (mainID) -# -# print('Generate compound id') -# hmdbcompoundnum = sql.createRampCompoundID(hmdb.metaboliteIDDictionary, "hmdb", 0) -# print("hmdbcompoundnum: ", hmdbcompoundnum) -# keggcompoundnum = sql.createRampCompoundID(kegg.metaboliteIDDictionary, "kegg", hmdbcompoundnum) -# wikicompoundnum = sql.createRampCompoundID(wikipathways.metaboliteIDDictionary, "wiki", keggcompoundnum) -# print("wikicompoundnum: ", wikicompoundnum) -# reactomecompoundnum = 
sql.createRampCompoundID(reactome.metaboliteIDDictionary, "reactome", wikicompoundnum) -# -# print('Generate gene id ...') -# hmdbgenenum = sql.createRampGeneID(hmdb.geneInfoDictionary, "hmdb", 0) -# kegggenenum = sql.createRampGeneID(kegg.geneInfoDictionary, "kegg", hmdbgenenum) -# wikigenenum = sql.createRampGeneID(wikipathways.geneInfoDictionary, "wiki", kegggenenum) -# reactomegenenum = sql.createRampGeneID(reactome.geneInfoDictionary, "reactome", wikigenenum) -# print(" hmdbgenenum ", hmdbgenenum, " kegggenenum ", kegggenenum, " wikigenenum ", wikigenenum, " reactomegenenum ", reactomegenenum) - """ print('Write to sql file...') - hmdbnumbers = sql.write( - hmdb.metaboliteCommonName, - hmdb.pathwayDictionary, - hmdb.pathwayCategory, - hmdb.metabolitesWithPathwaysDictionary, - hmdb.metabolitesWithSynonymsDictionary, - hmdb.metaboliteIDDictionary, - hmdb.pathwaysWithGenesDictionary, - hmdb.metabolitesLinkedToGenes, - hmdb.geneInfoDictionary, - hmdb.biofluidLocation, - hmdb.biofluid, - hmdb.cellularLocation, - hmdb.cellular, - hmdb.pathwayOntology, - hmdb.exoEndoDictionary, - hmdb.exoEndo, - hmdb.tissueLocation, - hmdb.tissue, - hmdb.metaInchi, - "hmdb", - 0,0) - - wikipathwaysnumbers = sql.write( - wikipathways.metaboliteCommonName, - wikipathways.pathwayDictionary, - wikipathways.pathwayCategory, - wikipathways.metabolitesWithPathwaysDictionary, - wikipathways.metabolitesWithSynonymsDictionary, - wikipathways.metaboliteIDDictionary, - wikipathways.pathwaysWithGenesDictionary, - wikipathways.metabolitesLinkedToGenes, - wikipathways.geneInfoDictionary, - wikipathways.biofluidLocation, - wikipathways.biofluid, - wikipathways.cellularLocation, - wikipathways.cellular, - wikipathways.pathwayOntology, - wikipathways.exoEndoDictionary, - wikipathways.exoEndo, - wikipathways.tissueLocation, - wikipathways.tissue, - dict(), - "wiki", - hmdbnumbers[0],hmdbnumbers[1]) - - reactomenumbers = sql.write( - reactome.metaboliteCommonName, - reactome.pathwayDictionary, - 
reactome.pathwayCategory, - reactome.metabolitesWithPathwaysDictionary, - reactome.metabolitesWithSynonymsDictionary, - reactome.metaboliteIDDictionary, - reactome.pathwaysWithGenesDictionary, - reactome.metabolitesLinkedToGenes, - reactome.geneInfoDictionary, - reactome.biofluidLocation, - reactome.biofluid, - reactome.cellularLocation, - reactome.cellular, - reactome.pathwayOntology, - reactome.exoEndoDictionary, - reactome.exoEndo, - reactome.tissueLocation, - reactome.tissue, - dict(), - "reactome", - wikipathwaysnumbers[0],wikipathwaysnumbers[1]) - - keggnumbers = sql.write( - kegg.metaboliteCommonName, - kegg.pathwayDictionary, - kegg.pathwayCategory, - kegg.metabolitesWithPathwaysDictionary, - kegg.metabolitesWithSynonymsDictionary, - kegg.metaboliteIDDictionary, - kegg.pathwaysWithGenesDictionary, - kegg.metabolitesLinkedToGenes, - kegg.geneInfoDictionary, - kegg.biofluidLocation, - kegg.biofluid, - kegg.cellularLocation, - kegg.cellular, - kegg.pathwayOntology, - kegg.exoEndoDictionary, - kegg.exoEndo, - kegg.tissueLocation, - kegg.tissue, - dict(), - "kegg", - reactomenumbers[0],reactomenumbers[1]) - - print("Done ... 
for importing database") + #kegg.getEverything(False) + #print("KEGG Wonder") + print("Getting hmdb...") + hmdb.getEverything(True) + print("Getting wiki...") + wikipathways.getEverything(True) + print("Getting reactome...") + reactome.getEverything(True) - print("Compound:") - stat.analyteOverlaps(sql.rampCompoundIdInWhichDatabases, sql.rampCompoundIDdictionary, "Compound") - print("\n") - print("Gene:") - stat.analyteOverlaps(sql.rampGeneIdInWhichDatabases, sql.rampGeneIDdictionary, "Gene") + # This parses and writes lipid maps + # sql write will be handled by EntityBuilder + print("Getting LipidMaps...") + lipidmaps.getEverything(True) - stat.databaseContent(hmdb.pathwayDictionary, - hmdb.pathwayCategory, - hmdb.metabolitesWithPathwaysDictionary, - hmdb.metabolitesWithSynonymsDictionary, - hmdb.metaboliteIDDictionary, - hmdb.pathwaysWithGenesDictionary, - hmdb.geneInfoDictionary, - hmdb.biofluidLocation, - hmdb.biofluid, - hmdb.cellularLocation, - hmdb.cellular, - hmdb.pathwayOntology, - hmdb.exoEndoDictionary, - "hmdb") - - stat.databaseContent(kegg.pathwayDictionary, - kegg.pathwayCategory, - kegg.metabolitesWithPathwaysDictionary, - kegg.metabolitesWithSynonymsDictionary, - kegg.metaboliteIDDictionary, - kegg.pathwaysWithGenesDictionary, - kegg.geneInfoDictionary, - kegg.biofluidLocation, - kegg.biofluid, - kegg.cellularLocation, - kegg.cellular, - kegg.pathwayOntology, - kegg.exoEndoDictionary, - "kegg") - - stat.databaseContent(reactome.pathwayDictionary, - reactome.pathwayCategory, - reactome.metabolitesWithPathwaysDictionary, - reactome.metabolitesWithSynonymsDictionary, - reactome.metaboliteIDDictionary, - reactome.pathwaysWithGenesDictionary, - reactome.geneInfoDictionary, - reactome.biofluidLocation, - reactome.biofluid, - reactome.cellularLocation, - reactome.cellular, - reactome.pathwayOntology, - reactome.exoEndoDictionary, - "reactome") - - stat.databaseContent(wikipathways.pathwayDictionary, - wikipathways.pathwayCategory, - 
wikipathways.metabolitesWithPathwaysDictionary, - wikipathways.metabolitesWithSynonymsDictionary, - wikipathways.metaboliteIDDictionary, - wikipathways.pathwaysWithGenesDictionary, - wikipathways.geneInfoDictionary, - wikipathways.biofluidLocation, - wikipathways.biofluid, - wikipathways.cellularLocation, - wikipathways.cellular, - wikipathways.pathwayOntology, - wikipathways.exoEndoDictionary, - "wiki") - """ + print("Getting Rhea info...") + rhea.processRhea() # constructs the entity builder builder = EntityBuilder(resourceConf) @@ -244,10 +61,12 @@ def runEverything(self, resourceConfigFile, getDatabaseFiles = True): # the result are files for DB loading in /misc/sql builder.fullBuild() + + print(time.time() - start) + # Database loading is handled as a separate, un-coupled step. - resourceConfFile = "../config/external_resource_config.txt" main = Main() main.runEverything(resourceConfigFile = resourceConfFile) diff --git a/main/mainDBLoad.py b/main/mainDBLoad.py index 705ddaa..fa00665 100644 --- a/main/mainDBLoad.py +++ b/main/mainDBLoad.py @@ -3,7 +3,6 @@ from util.rampDBBulkLoader import rampDBBulkLoader - class mainDBLoad(): def __init__(self): @@ -68,14 +67,17 @@ def loadDBAfterTruncatingTables(self, incrementLevel = 'increment_patch_release' # this method populates a table that reflects the current status of the database. # metrics such as gene and metabolite counts for reach data sets are tallied. 
loader.updateDataStatusSummary() - + + # generate pathway similarity matrices, analyte lists and whatnot + # this process replaced the old system of having Rdata in the package + loader.generateAndLoadRampSupplementalData() loader = mainDBLoad() # increment level 'increment_patch_release', 'increment_minor_release', # or 'specified' (new version, perhaps major release) -loader.loadDBAfterTruncatingTables(incrementLevel = 'increment_patch_release', - optionalVersionOveride = "", - optionalVersionNote = "20220822 patch release, update chem_props inchi values.", +loader.loadDBAfterTruncatingTables(incrementLevel = 'specified', + optionalVersionOveride = "2.4.2", + optionalVersionNote = "20231107 Data refresh. Rhea Reaction Classes. Reactome gene patch.", truncateTables=True) diff --git a/main/mainSqliteDBLoad.py b/main/mainSqliteDBLoad.py index db2d482..dec552d 100644 --- a/main/mainSqliteDBLoad.py +++ b/main/mainSqliteDBLoad.py @@ -36,7 +36,7 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa # pass the credentials object to the constructed rampDBBulLoader - loader = SQLiteDBBulkLoader(self.dbPropsFile, sqliteFile) + loader = SQLiteDBBulkLoader(dbPropsFile=self.dbPropsFile, sqliteFileName=sqliteFile) # truncate tables @@ -79,8 +79,8 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa # increment level 'increment_patch_release', 'increment_minor_release', # or 'specified' (new version, perhaps major release) -loader.loadDBAfterTruncatingTables(sqliteFile = '../RaMP_SQLite_v2.3.0_Structure.sqlite', incrementLevel = 'specified', - optionalVersionOveride = "2.3.0", - optionalVersionNote = "20230727 data update/refresh release", +loader.loadDBAfterTruncatingTables(sqliteFile = '/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_BASE.sqlite', incrementLevel = 'specified', + optionalVersionOveride = "2.4.2", + optionalVersionNote = "20231107 data update, Rhea reaction to EC reaction class. 
Reactome Genes Patch.", truncateTables=True) diff --git a/schema/RaMP_MySQL_BASE_2.4.0.sql b/schema/RaMP_MySQL_BASE_2.4.0.sql new file mode 100644 index 0000000..b423ce3 --- /dev/null +++ b/schema/RaMP_MySQL_BASE_2.4.0.sql @@ -0,0 +1,387 @@ +-- MySQL dump 10.13 Distrib 8.0.21, for Win64 (x86_64) +-- +-- Host: localhost Database: ramp +-- ------------------------------------------------------ +-- Server version 8.0.28 + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!50503 SET NAMES utf8 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `analyte` +-- + +DROP TABLE IF EXISTS `analyte`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `analyte` ( + `rampId` varchar(30) NOT NULL, + `type` varchar(30) DEFAULT NULL, + PRIMARY KEY (`rampId`), + KEY `analyte_rampId_RampID_IDX` (`rampId`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `analytehasontology` +-- + +DROP TABLE IF EXISTS `analytehasontology`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `analytehasontology` ( + `rampCompoundId` varchar(30) DEFAULT NULL, + `rampOntologyId` varchar(30) DEFAULT NULL, + KEY `analyte_ont_ramp_id_idx` (`rampCompoundId`), + KEY `analyte_ont_id_idx` (`rampOntologyId`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +/*!40101 SET 
character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `analytehaspathway` +-- + +DROP TABLE IF EXISTS `analytehaspathway`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `analytehaspathway` ( + `rampId` varchar(30) DEFAULT NULL, + `pathwayRampId` varchar(30) DEFAULT NULL, + `pathwaySource` varchar(30) DEFAULT NULL, + KEY `pathwayRampID_IDX` (`pathwayRampId`) USING BTREE, + KEY `ahp_RampID_IDX` (`rampId`) USING BTREE, + KEY `ahp_path_source_IDX` (`pathwaySource`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `analytesynonym` +-- + +DROP TABLE IF EXISTS `analytesynonym`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `analytesynonym` ( + `Synonym` varchar(500) DEFAULT NULL, + `rampId` varchar(30) DEFAULT NULL, + `geneOrCompound` varchar(30) DEFAULT NULL, + `source` varchar(30) DEFAULT NULL, + KEY `analSyn_RampID_IDX` (`rampId`) USING BTREE, + KEY `analSyn_syn_IDX` (`Synonym`) USING BTREE, + KEY `idx_analytesynonym_geneOrCompound` (`geneOrCompound`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `catalyzed` +-- + +DROP TABLE IF EXISTS `catalyzed`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `catalyzed` ( + `rampCompoundId` varchar(30) DEFAULT NULL, + `rampGeneId` varchar(30) DEFAULT NULL, + `proteinType` varchar(32) DEFAULT NULL, + KEY `catal_gene_idx` (`rampGeneId`), + KEY `catal_comp_idx` (`rampCompoundId`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `chem_props` +-- + +DROP TABLE IF EXISTS `chem_props`; +/*!40101 SET @saved_cs_client = 
@@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `chem_props` ( + `ramp_id` varchar(30) NOT NULL, + `chem_data_source` varchar(32) DEFAULT NULL, + `chem_source_id` varchar(45) DEFAULT NULL, + `iso_smiles` varchar(256) DEFAULT NULL, + `inchi_key_prefix` varchar(32) DEFAULT NULL, + `inchi_key` varchar(32) DEFAULT NULL, + `inchi` varchar(4096) DEFAULT NULL, + `mw` float DEFAULT NULL, + `monoisotop_mass` float DEFAULT NULL, + `common_name` varchar(1024) DEFAULT NULL, + `mol_formula` varchar(64) DEFAULT NULL, + KEY `prop_source_idx` (`chem_data_source`) USING BTREE, + KEY `inchi_key_idx` (`inchi_key`) USING BTREE, + KEY `inchi_key_prefix_idx` (`inchi_key_prefix`) USING BTREE, + KEY `ramp_id_idx` (`ramp_id`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COMMENT='Holds metabolite properties for all ramp metabolites'; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `db_version` +-- + +DROP TABLE IF EXISTS `db_version`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `db_version` ( + `ramp_version` varchar(20) NOT NULL, + `load_timestamp` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, + `version_notes` varchar(256) DEFAULT NULL, + `met_intersects_json` varchar(10000) DEFAULT NULL, + `gene_intersects_json` varchar(10000) DEFAULT NULL, + `met_intersects_json_pw_mapped` varchar(10000) DEFAULT NULL, + `gene_intersects_json_pw_mapped` varchar(10000) DEFAULT NULL, + `db_sql_url` varchar(256) DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=latin1; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `entity_status_info` +-- + +DROP TABLE IF EXISTS `entity_status_info`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `entity_status_info` ( + `status_category` varchar(64) NOT NULL, + `entity_source_id` 
varchar(32) NOT NULL, + `entity_source_name` varchar(45) NOT NULL, + `entity_count` int NOT NULL +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COMMENT='holds entity counts'; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `metabolite_class` +-- + +DROP TABLE IF EXISTS `metabolite_class`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `metabolite_class` ( + `ramp_id` varchar(32) NOT NULL, + `class_source_id` varchar(32) NOT NULL, + `class_level_name` varchar(128) NOT NULL, + `class_name` varchar(128) NOT NULL, + `source` varchar(32) NOT NULL, + KEY `ramp_id_metclass_idx` (`ramp_id`), + KEY `class_source_id_metclass_idx` (`class_source_id`), + KEY `class_name_metclass_idx` (`class_name`), + KEY `class_source_metclass_idx` (`source`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COMMENT='holds rampid and class source id to metabolic class levels and names'; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `ontology` +-- + +DROP TABLE IF EXISTS `ontology`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `ontology` ( + `rampOntologyId` varchar(30) DEFAULT NULL, + `commonName` varchar(64) DEFAULT NULL, + `HMDBOntologyType` varchar(30) DEFAULT NULL, + `metCount` int DEFAULT '0', + KEY `ontol_parent_idx` (`commonName`), + KEY `ontol_ramp_id_idx` (`rampOntologyId`), + KEY `ontol_term_idx` (`HMDBOntologyType`), + KEY `ontol_metCount_idx` (`metCount`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `pathway` +-- + +DROP TABLE IF EXISTS `pathway`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `pathway` ( + `pathwayRampId` varchar(30) NOT NULL, + `sourceId` varchar(30) DEFAULT NULL, + 
`type` varchar(30) DEFAULT NULL, + `pathwayCategory` varchar(30) DEFAULT NULL, + `pathwayName` varchar(250) DEFAULT NULL, + PRIMARY KEY (`pathwayRampId`), + KEY `pathway_RampID_IDX` (`pathwayRampId`) USING BTREE, + KEY `idx_pathway_sourceId` (`sourceId`), + KEY `idx_pathway_type` (`type`), + KEY `idx_pathway_pathwayCategory` (`pathwayCategory`), + FULLTEXT KEY `idx_pathway_pathwayName` (`pathwayName`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `ramp_data_object` +-- + +DROP TABLE IF EXISTS `ramp_data_object`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `ramp_data_object` ( + `data_key` varchar(32) NOT NULL, + `data_blob` longblob NOT NULL, + PRIMARY KEY (`data_key`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `reaction` +-- + +DROP TABLE IF EXISTS `reaction`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `reaction` ( + `ramp_rxn_id` varchar(16) NOT NULL, + `rxn_source_id` varchar(16) NOT NULL, + `status` int NOT NULL, + `is_transport` int NOT NULL, + `direction` varchar(8) NOT NULL, + `label` varchar(256) NOT NULL, + `equation` varchar(256) NOT NULL, + `html_equation` varchar(256) NOT NULL, + `ec_num` varchar(256) DEFAULT NULL, + `has_human_prot` int NOT NULL, + `only_human_mets` int NOT NULL, + PRIMARY KEY (`ramp_rxn_id`), + KEY `reaction_src_id_idx` (`rxn_source_id`), + KEY `reaction_ec_num_idx` (`ec_num`), + KEY `reaction_has_human_prot_idx` (`has_human_prot`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COMMENT='ramp primary reaction annotation table.'; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `reaction2met` +-- + +DROP TABLE IF EXISTS `reaction2met`; 
+/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `reaction2met` ( + `ramp_rxn_id` varchar(16) NOT NULL, + `rxn_source_id` varchar(16) NOT NULL, + `ramp_cmpd_id` varchar(16) NOT NULL, + `substrate_product` int NOT NULL, + `met_source_id` varchar(32) NOT NULL, + `met_name` varchar(256) DEFAULT NULL, + `is_cofactor` int NOT NULL DEFAULT '0', + KEY `rxn2met_rxn_ramp_id_idx` (`ramp_rxn_id`), + KEY `rxn2met_rxn_source_id_idx` (`rxn_source_id`), + KEY `rxn2met_met_ramp_id_idx` (`ramp_cmpd_id`), + KEY `rxn2met_subs_prod_idx` (`substrate_product`), + KEY `rxn2met_met_source_id_idx` (`met_source_id`), + KEY `rxn2met_iscofactor_idx` (`is_cofactor`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COMMENT='holds reaction to metabolite mapping.'; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `reaction2protein` +-- + +DROP TABLE IF EXISTS `reaction2protein`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `reaction2protein` ( + `ramp_rxn_id` varchar(16) NOT NULL, + `rxn_source_id` varchar(16) NOT NULL, + `ramp_gene_id` varchar(16) NOT NULL, + `uniprot` varchar(16) NOT NULL, + `protein_name` varchar(16) NOT NULL, + KEY `rxn_prot_ramp_id_idx` (`ramp_rxn_id`), + KEY `rxn2prot_source_id_idx` (`rxn_source_id`), + KEY `rxn2prot_ramp_gene_id_idx` (`ramp_gene_id`), + KEY `rxn2prot_uniprot_idx` (`uniprot`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1 COMMENT='maps reaction ids to associated uniprot ids and their ramp ids'; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `reaction_protein2met` +-- + +DROP TABLE IF EXISTS `reaction_protein2met`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `reaction_protein2met` ( + `ramp_rxn_id` varchar(16) NOT NULL, + `rxn_source_id` varchar(16) NOT 
NULL, + `ramp_gene_id` varchar(16) NOT NULL, + `gene_source_id` varchar(16) NOT NULL, + `substrate_product` int NOT NULL, + `ramp_cmpd_id` varchar(16) NOT NULL, + `cmpd_source_id` varchar(45) NOT NULL, + `cmpd_name` varchar(256) DEFAULT NULL, + `is_cofactor` int NOT NULL DEFAULT '0', + KEY `rxn_p2m_rxn_ramp_id_idx` (`ramp_rxn_id`), + KEY `rxn_p2m_rxn_source_id_idx` (`rxn_source_id`), + KEY `rxn_p2m_rxn_gene_ramp_id_idx` (`ramp_gene_id`), + KEY `rxn_p2m_gene_source_id_idx` (`gene_source_id`), + KEY `rxn_subprod_idx` (`substrate_product`), + KEY `rxn_p2m_ramp_cmpd_id_idx` (`ramp_cmpd_id`), + KEY `rxn_p2m_cmpd_source_id_idx` (`cmpd_source_id`), + KEY `rxn_p2m_iscofactor_idx` (`is_cofactor`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `source` +-- + +DROP TABLE IF EXISTS `source`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `source` ( + `sourceId` varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, + `rampId` varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, + `IDtype` varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, + `geneOrCompound` varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, + `commonName` varchar(256) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, + `priorityHMDBStatus` varchar(32) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, + `dataSource` varchar(32) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, + `pathwayCount` int NOT NULL DEFAULT '0', + KEY `source_RampID_IDX` (`rampId`) USING BTREE, + KEY `source_sid_RampID_IDX` (`sourceId`) USING BTREE, + KEY `source_comName_RampID_IDX` (`commonName`) USING BTREE, + KEY `source_datasrc_IDX` (`dataSource`), + KEY `source_pathCount_IDX` (`pathwayCount`), + KEY `idx_source_geneOrCompound` (`geneOrCompound`), + KEY `idx_source_IDtype` (`IDtype`) +) ENGINE=InnoDB 
DEFAULT CHARSET=utf8mb3 COLLATE=utf8_bin; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `version_info` +-- + +DROP TABLE IF EXISTS `version_info`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `version_info` ( + `ramp_db_version` varchar(16) NOT NULL, + `db_mod_date` date NOT NULL, + `status` varchar(16) NOT NULL, + `data_source_id` varchar(32) NOT NULL, + `data_source_name` varchar(128) NOT NULL, + `data_source_url` varchar(128) NOT NULL, + `data_source_version` varchar(128) NOT NULL, + KEY `status_index` (`status`), + KEY `data_source_index` (`data_source_id`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; +/*!40101 SET character_set_client = @saved_cs_client */; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed on 2023-10-30 14:02:17 diff --git a/schema/RaMP_SQLite_BASE_2.4.0.sqlite b/schema/RaMP_SQLite_BASE_2.4.0.sqlite new file mode 100644 index 0000000..1289a5f Binary files /dev/null and b/schema/RaMP_SQLite_BASE_2.4.0.sqlite differ diff --git a/src/chemprop/ChemWrangler.py b/src/chemprop/ChemWrangler.py index 40a8c28..1748399 100644 --- a/src/chemprop/ChemWrangler.py +++ b/src/chemprop/ChemWrangler.py @@ -162,7 +162,8 @@ def readHMDBSDF(self, source, filePath): mol.smiles = sdfDB.readline().strip() if line == '> ': mol.inchiKey = sdfDB.readline().strip() - mol.inchiKeyPrefix = mol.inchiKey.split("-")[0] + mol.inchiKeyPrefix = mol.inchiKey.split("-")[0] + mol.inchiKeyDuplex = mol.inchiKey.split("-")[0] + "-" + mol.inchiKey.split("-")[1] if 
line == '> ': mol.inchi = sdfDB.readline().strip() if line == '> ': @@ -211,7 +212,8 @@ def readChebiSDF(self, source, filePath): mol.smiles = sdfDB.readline().strip() if line == '> ': mol.inchiKey = sdfDB.readline().strip() - mol.inchiKeyPrefix = mol.inchiKey.split("-")[0] + mol.inchiKeyPrefix = mol.inchiKey.split("-")[0] + mol.inchiKeyDuplex = mol.inchiKey.split("-")[0] + "-" + mol.inchiKey.split("-")[1] if line == '> ': mol.inchi = sdfDB.readline().strip() if line == '> ': @@ -320,7 +322,8 @@ def readLipidMapsSDF(self, source, filePath): mol.smiles = sdfDB.readline().strip() if line == '> ': mol.inchiKey = sdfDB.readline().strip() - mol.inchiKeyPrefix = mol.inchiKey.split("-")[0] + mol.inchiKeyPrefix = mol.inchiKey.split("-")[0] + mol.inchiKeyDuplex = mol.inchiKey.split("-")[0] + "-" + mol.inchiKey.split("-")[1] if line == '> ': mol.inchi = sdfDB.readline().strip() if line == '> ': diff --git a/src/parse/RheaParser.py b/src/parse/RheaParser.py index 1464608..447ff8b 100644 --- a/src/parse/RheaParser.py +++ b/src/parse/RheaParser.py @@ -12,7 +12,7 @@ from rampConfig.RampConfig import RampConfig import numpy as np -from rdflib import URIRef,Graph +from rdflib import URIRef, Graph import rdflib.namespace from rdflib.namespace import RDF,FOAF,RDFS,DC,DCTERMS from builtins import str @@ -49,6 +49,8 @@ def __init__(self, resConfig): self.rheaProteinDict = dict() + self.rheaEcToClassDict = dict() + self.rheaLocalRdfFile = "" self.rheaLocalRheaToUniprotFile = "" @@ -57,6 +59,8 @@ def __init__(self, resConfig): self.rheaLocalRxnDirectionFile = "" + self.expasyLocalEc2ClassFile = "" + self.humanUniprotRecordDict = dict() self.humanUniprotAccSet = set() @@ -78,10 +82,14 @@ def processRhea(self): # builds reactions objects self.processAllReactions() - + + # this gets expasy ec to enzyme class + self.ecToEnzymeClassFromExpasy() + self.appendUniprotToReaction() self.appendEcToReaction() + self.setReactionHumanUniprotState() self.setReactionHumanChebiState() @@ -136,6 +144,7 
@@ def getRheaFiles(self): uniprotToRheaConf = self.config.getConfig('uniprot_to_rhea') rheaToEcConf = self.config.getConfig('rhea_to_ec') rheaDirectionConf = self.config.getConfig('rhea_rxn_direction') + expasyEc2EnzymeClassConf = self.config.getConfig('expasy_ec2class') localDir = rdfConf.localDir @@ -200,6 +209,20 @@ def getRheaFiles(self): print("Using cached Rhea reaction direction file.") + # supporting expasy EC to Enzyme Class file + ec2classFile = expasyEc2EnzymeClassConf.extractFileName + + self.expasyLocalEc2ClassFile = self.relDir + localDir + ec2classFile + + if not exists(self.relDir + localDir + ec2classFile): + rheaDirUrl = expasyEc2EnzymeClassConf.sourceURL + rheaDirRemoteFile = expasyEc2EnzymeClassConf.sourceFileName + + self.download_files(rheaDirUrl, self.relDir + localDir + rheaDirRemoteFile) + else: + print("Using cached Expasy ec2enzymeClass file.") + + def constructRDF(self): @@ -594,7 +617,21 @@ def exportIntermediateFiles(self): for acc in self.rheaReactionDict: rxn = self.rheaReactionDict[acc] recordOut.write(rxn.getRheaIdToUniprotMappingString()) - + + recordOut.close() + + + recordsFile = "rhea_reaction_to_ec.txt" + + recordOut = open(dir + recordsFile, 'w', encoding="utf-8") + for acc in self.rheaReactionDict: + rxn = self.rheaReactionDict[acc] + ecList = rxn.ec + if ecList is not None and len(ecList) > 0: + ecBlock = self.buildRxnEcExportBlock(acc, ecList) + if len(ecBlock) > 0: + recordOut.write(ecBlock) + recordOut.close() @@ -699,15 +736,78 @@ def appendEcToReaction(self): print(str(r2u.shape)) for idx, row in r2u.iterrows(): - r2EcMap['rhea:'+str(row.RHEA_ID)] = row.ID + rheaRxnId = 'rhea:'+str(row.RHEA_ID) + ecList = r2EcMap.get(rheaRxnId,None) + if ecList is None: + r2EcMap[rheaRxnId] = [row.ID] + else: + ecList.append(row.ID) for rxn in r2EcMap: - ec = r2EcMap[rxn] + ecList = r2EcMap[rxn] currRxn = self.rheaReactionDict.get(rxn, None) if currRxn is not None: - currRxn.ec = ec + currRxn.ec = list(set(ecList)) + + def 
ecToEnzymeClassFromExpasy(self): + + # ec2class = pd.read_csv(self.expasyLocalEc2ClassFile, sep="\t", skiprows=11, skipfooter=5) + with open(self.expasyLocalEc2ClassFile, 'r') as ec2c: + ec2classStrings = ec2c.readlines() + + start = 11 + end = len(ec2classStrings) - 6 + + for i in range(start, end): + line = ec2classStrings[i].strip() + ec_data = line.split(" ") + ec = ec_data[0] + enzClass = ec_data[1] + if len(ec_data) == 3: + enzClass = ec_data[2] + ec = ec.replace(" ", "") + enzClass = enzClass.strip() + self.rheaEcToClassDict[ec] = enzClass + + + + def buildRxnEcExportBlock(self, rxnId, ecList): + ecBlock = "" + enzClassJoin = "" + for ec in ecList: + ecChildren = self.getEcChildren(ec) + enzClassJoin = "" + i = 0 + for ecc in ecChildren: + enzClass = self.rheaEcToClassDict.get(ecc, None) + if enzClass is not None: + if i == 0: + enzClassJoin = enzClass + # just mark that we are past the first entry + i = 1 + else: + # concatentate the enzyme class info :), I think this is finally correct :) + enzClassJoin = enzClassJoin + " | " + enzClass + ecLevel = 4 - ecc.count("-") + ecBlock = ecBlock + rxnId + "\t" + ecc + "\t" + str(ecLevel) + "\t" + enzClass + "\t" + enzClassJoin + "\n" + + + return ecBlock + + + def getEcChildren(self, ec): + data = ec.split('.') + ecVariants = [ec] + ecVariants.append(data[0] + "." + data[1] + "." + data[2] + ".-") + ecVariants.append(data[0] + "." + data[1] + ".-.-") + ecVariants.append(data[0] + ".-.-.-") + ecVariants = sorted(ecVariants) + return ecVariants + + + #rConf = RampConfig() #rConf.loadConfig("../../config/external_resource_config.txt") # # # diff --git a/src/parse/reactomeData.py b/src/parse/reactomeData.py index 144cb41..219b25e 100755 --- a/src/parse/reactomeData.py +++ b/src/parse/reactomeData.py @@ -418,18 +418,18 @@ def getCommonNameFromUniprot(self): # we now have uniprot to 'common_name', really gene id. 
# now we want to grab the NCBI/Entrez 'GeneID' - if childtag == "dbReference": - if child2.get("type") == "GeneID": - geneId = child2.get("id") - geneId = 'entrez:'+geneId +# if childtag == "dbReference": +# if child2.get("type") == "GeneID": +# geneId = child2.get("id") +# geneId = 'entrez:'+geneId # protein to gene can be 1:n, so they have to be stored as a list # lets check for a value - idList = mapping.get("small_e_entrez", None) - if(idList == None): - idList = list() - mapping["small_e_entrez"] = idList +# idList = mapping.get("small_e_entrez", None) +# if(idList == None): +# idList = list() +# mapping["small_e_entrez"] = idList - idList.append(geneId) +# idList.append(geneId) diff --git a/src/rampEntity/Metabolite.py b/src/rampEntity/Metabolite.py index 5394553..0ff5398 100644 --- a/src/rampEntity/Metabolite.py +++ b/src/rampEntity/Metabolite.py @@ -3,6 +3,8 @@ @author: braistedjc ''' +from statistics import median +import math class Metabolite(object): ''' @@ -467,6 +469,26 @@ def getInchiPrefixes(self): inchiPrefixes.append(mol.inchiKeyPrefix) return inchiPrefixes + def getInchiKeys(self): + inchiKeys = [] + for source in self.chemPropsMolecules: + molDict = self.chemPropsMolecules[source] + for sourceId in molDict: + mol = molDict[sourceId] + if mol.inchiKey is not "" and mol.inchiKey not in inchiKeys: + inchiKeys.append(mol.inchiKey) + return inchiKeys + + def getInchiKeyDuplexes(self): + inchiKeyDuplexes = [] + for source in self.chemPropsMolecules: + molDict = self.chemPropsMolecules[source] + for sourceId in molDict: + mol = molDict[sourceId] + if mol.inchiKeyDuplex is not "" and mol.inchiKeyDuplex not in inchiKeyDuplexes: + inchiKeyDuplexes.append(mol.inchiKeyDuplex) + return inchiKeyDuplexes + def addInchiNeighbor(self, otherMet): if self is not otherMet: if otherMet not in self.inchiPrefixNeigbors: @@ -498,6 +520,23 @@ def getNeighbors(self, neighbors): if(neighbor not in neighbors): neighbors.append(neighbor) neighbor.getNeighbors(neighbors) + + 
def getAveMW(self): + mws = [] + medMw = 0.0 + for source in self.chemPropsMolecules: + molDict = self.chemPropsMolecules[source] + for sourceId in molDict: + mol = molDict[sourceId] + + if(mol.mw is not None and mol.mw != ""): + mw = float(mol.mw) + if not math.isnan(mw): + mws.append(mw) + + if(len(mws) > 0): + medMw = median(mws) + + return medMw - \ No newline at end of file diff --git a/src/rampEntity/MetaboliteList.py b/src/rampEntity/MetaboliteList.py index 8f34e22..7b6a81d 100644 --- a/src/rampEntity/MetaboliteList.py +++ b/src/rampEntity/MetaboliteList.py @@ -106,8 +106,56 @@ def printChemPropSummaryStats(self): print("Tot Molecules records: " + str(molRecords)) + def buildInchiKeyPrefixToMetaboliteMappingMassMediated(self, mwCut = 500.0): + + self.inchikeyPrefixToMetab + + inchiPrefMap = dict() + noInchiMet = list() + + mets = self.getUniqueMetabolites() + + # build a mapping from inchi prefix to metabolites + for met in mets: + haveInchiPrefix = False + + aMW = met.getAveMW() + + if aMW >= mwCut: + inchiKeyPrefixes = met.getInchiPrefixes() + else: + inchiKeyPrefixes = met.getInchiKeyDuplexes() + + for prefix in inchiKeyPrefixes: + haveInchiPrefix = True + metList = self.inchikeyPrefixToMetab.get(prefix, None) + if metList is None: + metList = list() + metList.append(met) + self.inchikeyPrefixToMetab[prefix] = metList + else: + metList.append(met) +# if len(met.chemPropsMolecules) > 0: +# for source in met.chemPropsMolecules: +# +# molDict = met.chemPropsMolecules[molname] +# for sourceId in molDict: +# mol = molDict[sourceId] +# if len(mol.inchiKeyPrefix) > 0: +# haveInchiPrefix = True +# metList = self.inchikeyPrefixToMetab.get(mol.inchiKeyPrefix, None) +# if metList is None: +# metList = list() +# metList.append(met) +# self.inchikeyPrefixToMetab[mol.inchiKeyPrefix] = metList +# else: +# metList.append(met) + if not haveInchiPrefix: + noInchiMet.append(met) + + - def buildInchiKeyPrefixToMetaboliteMapping(self): + def 
buildInchiKeyPrefixToMetaboliteMapping(self, mwCut = 500.0): self.inchikeyPrefixToMetab @@ -119,7 +167,14 @@ def buildInchiKeyPrefixToMetaboliteMapping(self): # build a mapping from inchi prefix to metabolites for met in mets: haveInchiPrefix = False - inchiKeyPrefixes = met.getInchiPrefixes() + + aMW = met.getAveMW() + + if aMW >= mwCut: + inchiKeyPrefixes = met.getInchiPrefixes() + else: + inchiKeyPrefixes = met.getInchiKeyDuplexes() + for prefix in inchiKeyPrefixes: haveInchiPrefix = True metList = self.inchikeyPrefixToMetab.get(prefix, None) diff --git a/src/rampEntity/Molecule.py b/src/rampEntity/Molecule.py index 4ef390a..489925f 100644 --- a/src/rampEntity/Molecule.py +++ b/src/rampEntity/Molecule.py @@ -23,13 +23,15 @@ def __init__(self): self.inchiKeyPrefix = "" + self.inchiKeyDuplex = "" + self.inchi = "" - self.mw = "" + self.mw = None self.formula = "" - self.monoisotopicMass = "" + self.monoisotopicMass = None self.names = [] @@ -51,7 +53,17 @@ def toChemPropsString(self): if len(self.names) > 0: name = self.names[0] s = self.source + "\t" + self.id + "\t" + self.smiles + "\t" + self.inchiKeyPrefix + "\t" + self.inchiKey + "\t" + self.inchi + "\t" - s = s + self.mw + "\t" + self.monoisotopicMass + "\t" + name + "\t" + self.formula+ "\n" + + mw = self.mw + mi = self.monoisotopicMass + + if(mw is None): + mw = "" + + if(mi is None): + mi = "" + + s = s + str(mw) + "\t" + str(mi) + "\t" + name + "\t" + self.formula+ "\n" return s def toSourceString(self): @@ -79,4 +91,4 @@ def toSynonymsString(self): return s - \ No newline at end of file + diff --git a/src/rampEntity/RheaReaction.py b/src/rampEntity/RheaReaction.py index def8f61..3a71908 100644 --- a/src/rampEntity/RheaReaction.py +++ b/src/rampEntity/RheaReaction.py @@ -67,11 +67,23 @@ def __init__(self): self.hasAHumanMetabolite = False + self.ecAssociationBlock = [] + def getBasicRecordString(self): ec = self.ec + if ec is None: - ec = "" + ecVal = "" + else: + eCount = 0 + ec = sorted(ec) + for e in 
ec: + if eCount == 0: + ecVal = e + else: + ecVal = ecVal + "; " + e + dir = self.direction if dir is None: dir = "" @@ -80,15 +92,27 @@ def getBasicRecordString(self): onlyHumanMets = self.hasOnlyHumanMetabolites * 1 s = (self.rhea_id + "\t" + str(self.status) + "\t" + str(self.isTransport) + "\t" +self.direction + "\t" + self.rhea_label + "\t" + - self.rhea_equation + "\t" + self.rhea_html_eq + "\t" + ec + "\t" + str(humanEnzyme) + "\t" + str(onlyHumanMets) +"\n") + self.rhea_equation + "\t" + self.rhea_html_eq + "\t" + ecVal + "\t" + str(humanEnzyme) + "\t" + str(onlyHumanMets) +"\n") return s def getMainRecordString(self): ec = self.ec - if ec is None: - ec = "" + + if ec is None or type(ec) == float: + ecVal = "" + else: + if len(ec) == 1: + ecVal = ec[0] + else: + eCount = 0 + ec = sorted(ec) + for e in ec: + if eCount == 0: + ecVal = e + else: + ecVal = ecVal + "; " + e direction = self.direction @@ -100,7 +124,7 @@ def getMainRecordString(self): s = str(self.rxnRampId) + "\t" + str(self.rhea_id) + "\t" + str(self.status) + "\t" + str(self.isTransport) + "\t" s = s + str(direction) + "\t" + str(self.rhea_label) + "\t" - s = s + str(self.rhea_equation) + "\t" + str(self.rhea_html_eq) + "\t" + str(ec) + "\t" + str(humanEnzyme) + "\t" + str(onlyHumanMets) + "\n" + s = s + str(self.rhea_equation) + "\t" + str(self.rhea_html_eq) + "\t" + str(ecVal) + "\t" + str(humanEnzyme) + "\t" + str(onlyHumanMets) + "\n" return s @@ -236,7 +260,7 @@ def assignPrimaryFields(self, dataVals): self.rhea_label = dataVals[4] self.rhea_equation = dataVals[5] self.rhea_html_eq = dataVals[6] - self.ec = dataVals[7] + self.ec = [dataVals[7]] self.hasHumanEnzyme = dataVals[8] self.hasOnlyHumanMetabolites = dataVals[9] @@ -287,3 +311,17 @@ def getCompoundToProteinString(self): s = s + self.rhea_id + "\t" + pid + "\t" + cid + "\t1\n" return s + + def getRheaReactionToEcString(self): + ecBlock = "" + if len(self.ecAssociationBlock) > 0: + self.ecAssociationBlock = 
list(set(self.ecAssociationBlock)) + for ecData in self.ecAssociationBlock: + ecBlock = ecBlock + self.rxnRampId + "\t" + self.rhea_id + "\t" + ecData.strip() + "\n" + + return ecBlock + + def addEcAssociationBlock(self, ecData): + self.ecAssociationBlock.append(ecData) + + diff --git a/src/util/EntityBuilder.py b/src/util/EntityBuilder.py index 6c46c0b..701f07f 100644 --- a/src/util/EntityBuilder.py +++ b/src/util/EntityBuilder.py @@ -562,6 +562,8 @@ def loadGeneList(self, eqMetric = 0): Populates the gene list from all data sources using the geneInfoDictionary files. This builds gene entities and merges based on common ids. """ + f = open("geneList.log", 'w') + Metabolite.__equalityMetric = eqMetric for src in self.sourceList: @@ -600,10 +602,24 @@ def loadGeneList(self, eqMetric = 0): gene.addId(altId, source) self.geneList.addGene(altId, gene) + if gene.rampId == 'RAMP_G_000008086': + f.write("Creating our gene...RAMP_G_000008086\n") + f.write(gene.rampId+"\n") + f.write("\t".join(gene.idList)+"\n") + #f.write(gene.idDict) + gene.addId(currSourceId, source) gene.addSource(source) self.geneList.addGene(currSourceId, gene) - + + if gene.rampId == 'RAMP_G_000008086': + f.write("Adding IDs to our gene...RAMP_G_000008086\n") + f.write(gene.rampId + "\n") + f.write(currSourceId + "\n") + f.write(source + "\n") + f.write("\t".join(gene.idList) + "\n") + + # this is a sourceId lets add else: # need to check if the alt id already exists as a key id @@ -612,6 +628,14 @@ def loadGeneList(self, eqMetric = 0): gene2.addId(altId, source) gene2.addSource(source) gene2.addId(currSourceId, source) + + + if gene2.rampId == 'RAMP_G_000008086': + f.write("Linked and adding to our gene...RAMP_G_000008086\n") + f.write(altId + "\n") + f.write(source + "\n") + f.write(currSourceId + "\n") + #metaboliteList.addMataboliteByAltId(altId, met2) # this reasigns the primary source id and strands the 'metabolite' record self.geneList.addGene(currSourceId, gene2) @@ -624,6 +648,20 @@ def 
loadGeneList(self, eqMetric = 0): # we don't want two records # we need to consolidate metabolites... I think if(gene2 is not gene): + + if("gene_symbol:MDM2" in gene.idList or "gene_symbol:MDM2" in gene2.idList): + print("SUBSUME GENE\n") + # print(gene.rampId) + # print(gene.idList) + # print(gene.idDict) + + print("///\n") + #print(gene2.rampId) + #print(gene2.idList) + #print(gene2.idDict) + #print(" ") + #print(" ") + # keep the original metabolite (met2) and transfer info gene2.subsumeGene(gene) @@ -639,6 +677,8 @@ def loadGeneList(self, eqMetric = 0): self.geneList.addGene(altId, gene) # safe add, adds unique source to metabolite gene.addSource(source) + f.close() + def loadOntolgies(self): @@ -951,7 +991,7 @@ def processRheaReactions(self): self.buildRxnsFromRhea(rheaPath + "/rhea_primary_records.txt") self.appendRxnProteinsFromRhea(rheaPath + "/rhea_uniprot_mapping.txt") self.appendRxnParticipantsFromRhea(rheaPath + "/rhea_rxn_to_chebi_and_dir.txt") - + self.dumpReactionToEcEnzymeClass(rheaPath + "/rhea_reaction_to_ec.txt") def buildRxnsFromRhea(self, path): print("Building Rhea Reactions") @@ -1018,7 +1058,7 @@ def appendRxnParticipantsFromRhea(self,path): if rxn is not None and met is not None: if met.isCofactor == 1: - print("in append rxn members... cofactor = 1 :)") + # print("in append rxn members... cofactor = 1 :)") rheaCofactCnt = rheaCofactCnt + 1 if(rxnSide == 0): @@ -1031,8 +1071,31 @@ def appendRxnParticipantsFromRhea(self,path): print("in append participants from Rhea... 
have a None rxn for id: "+rheaId) print("Rhea cofact count/est: "+str(rheaCofactCnt)) + + def dumpReactionToEcEnzymeClass(self, path): + + rxn2EcClassFile = open("../misc/sql/rheaReactionToEcClass.txt", 'w') + + with open(path, 'r') as data: + for line in data: + print("reading rh2ec file") + print(line) + sline = line.split("\t") + rheaId = sline[0] + print(rheaId) + rxn = self.reactionDict.get(rheaId, None) + + if rxn is not None: + rampRxnId = rxn.rxnRampId + if rampRxnId != "": + rxn2EcClassFile.write(rampRxnId + "\t" + line) + rxn2EcClassFile.close() + print("reaction dict key examples") + print(list(self.reactionDict.keys())[0:4]) + + # def fullBuild(self): # """ # This high level method performs the entire process of entity construction @@ -1287,9 +1350,6 @@ def writeReactionEntities(self): file.write(rxn.getReactionProteinToMetString('rhea')) file.close() - - - def remove_whitespace(self, dF): @@ -1831,8 +1891,9 @@ def populateExclusionList(self, filePath): print("Exclusion List Size = " + str(len(list(self.sourceIdToExtIdDict.keys())))) -builder = EntityBuilder() -builder.crossCheckMetaboliteHarmony(buildMetAndCompoundProps = True, criteria = "MW", tolerance = 0.1, pctOrAbs = 'pct') +# builder = EntityBuilder() +# builder.crossCheckMetaboliteHarmony(buildMetAndCompoundProps = True, criteria = "MW", tolerance = 0.1, pctOrAbs = 'pct') + # builder.fullBuild() # print("starting to load metabolites") # builder.loadMetaboList() diff --git a/src/util/RampSupplementalDataBuilder.py b/src/util/RampSupplementalDataBuilder.py index 36b4d52..9853288 100644 --- a/src/util/RampSupplementalDataBuilder.py +++ b/src/util/RampSupplementalDataBuilder.py @@ -7,6 +7,7 @@ from sqlalchemy import create_engine from sqlalchemy import MetaData from sklearn.metrics.pairwise import pairwise_distances +#from .rampDBBulkLoader import dbConfig class RampSupplementalDataBuilder(object): ''' @@ -14,7 +15,7 @@ class RampSupplementalDataBuilder(object): ''' - def __init__(self, dbType, 
credInfo): + def __init__(self, dbType, sqliteCreds=None, dbConf=None): ''' Constructor ''' @@ -22,13 +23,15 @@ def __init__(self, dbType, credInfo): self.dbType = dbType # a MySQL RaMP db_properties file, or an SQLite DB file - self.credInfo = credInfo + self.credInfo = sqliteCreds # sqlalchemy engine to provide connections to DB self.engine = None if self.dbType == 'sqlite': self.engine = self.createSQLiteEngine(self.credInfo) + else: + self.engine = self.createMySQLEngine(dbConf) # all analyte pathway similarity matrix self.analyteResult = None @@ -44,12 +47,23 @@ def __init__(self, dbType, credInfo): def createSQLiteEngine(self, sqliteFile=None): engine = create_engine('sqlite:///'+sqliteFile, echo=False) return engine + + def createMySQLEngine(self, dbConf = None): + print("In ramp supplimental data builder, building mysql engine") + dbConf.dumpConfig() + print(type(dbConf.port)) + conStr = ("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}?port={port}").format(username=dbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname,port=dbConf.port) + print(conStr) + engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}:{port}/{dbname}").format(username=dbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname,port=dbConf.port)), echo=False) + + return engine + def listTables(self): if self.dbType == 'mysql': sql = 'show tables' elif self.dbType == 'sqlite': - sql = "SELECT name FROM sqlite_master WHERE type ='table' AND name NOT LIKE 'sqlite_%'"; + sql = "SELECT name FROM sqlite_master WHERE type ='table' AND name NOT LIKE 'sqlite_%%'"; else: print("Unsupported DB Type: " + self.dbType) return @@ -70,14 +84,14 @@ def buildAnalyteSetStats(self): def buildSimilarityMatrix(self, matrixType): df = None - analyteKey = 'RAMP_%' + analyteKey = 'RAMP_%%' minPathwaySize = 10 if matrixType == 'mets': - analyteKey = 'RAMP_C%' + analyteKey = 'RAMP_C%%' minPathwaySize = 5 elif matrixType == 'genes': 
- analyteKey = 'RAMP_G%' + analyteKey = 'RAMP_G%%' minPathwaySize = 5 sql = "select ap.pathwayRampId, ap.rampID from analytehaspathway ap, pathway p "\ @@ -115,17 +129,18 @@ def buildAnalyteSet(self, dataSource, geneOrMet): print("building analyte stat set") - rampIdPrefix = "RAMP_C%" - if geneOrMet == 'genes': - rampIdPrefix = "RAMP_G%" + rampIdPrefix = "RAMP_C%%" + if geneOrMet == 'gene': + rampIdPrefix = "RAMP_G%%" - sql = "select ap.pathwayRampId, count(distinct(ap.rampId)) as Freq, p.type as pathwaySource "\ + sql = "select ap.pathwayRampId as pathwayRampId, count(distinct(ap.rampId)) as Freq, p.type as pathwaySource "\ "from analytehaspathway ap, pathway p "\ "where p.type = '" + dataSource + "' and ap.pathwayRampId = p.pathwayRampId and ap.rampId like '" + rampIdPrefix + "' group by ap.pathwayRampId" df = None with self.engine.connect() as conn: + df = conn.execute(sql).all() df = pd.DataFrame(df) @@ -133,16 +148,24 @@ def buildAnalyteSet(self, dataSource, geneOrMet): print(df.shape) print("Stats header") print(df.columns) + print(type(df)) + + df.columns = ['pathwayRampID', 'Freq', 'pathwaySource'] + print(df.columns) + + + print(df.head(5)) conn.close() return df -#pwob = PathwayOverlapBuilder(dbType = "sqlite", credInfo = "X:\\braistedjc\\tmp_work\\RaMP_SQLite_v2.3.0_Structure.sqlite") +#pwob = RampSupplementalDataBuilder(dbType = "sqlite", sqliteCreds = "/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_v2.3.1b.sqlite") #pwob.listTables() #pwob.buildBaseMatrix(matrixType = "analytes") -# pwob.buildSimilarityMatrix(matrixType = "genes") +#dm = pwob.buildSimilarityMatrix(matrixType = "analytes") +#print(str(dm.values.sum())) #pwob.buildAnalyteSet("wiki", "met") #pwob.buildAnalyteSet("wiki", "gene") diff --git a/src/util/SQLiteDBBulkLoader.py b/src/util/SQLiteDBBulkLoader.py index 1a60747..84c8ce9 100644 --- a/src/util/SQLiteDBBulkLoader.py +++ b/src/util/SQLiteDBBulkLoader.py @@ -924,7 +924,7 @@ def truncateTables(self, tablesToSkip): def 
generateAndLoadRampSupplementalData(self): - dataBuilder = RampSupplementalDataBuilder(dbType = 'sqlite', credInfo = self.sqliteFileName) + dataBuilder = RampSupplementalDataBuilder(dbType = 'sqlite', sqliteCreds = self.sqliteFileName) dataSources = ['reactome', 'wiki', 'kegg'] analyteTypes = ['metab', 'gene'] @@ -932,7 +932,7 @@ def generateAndLoadRampSupplementalData(self): pwSimMat_analytes = dataBuilder.buildSimilarityMatrix(matrixType='analytes') pwSimMat_mets = dataBuilder.buildSimilarityMatrix(matrixType='mets') pwSimMat_genes = dataBuilder.buildSimilarityMatrix(matrixType='genes') - + analyteSets = dict() for source in dataSources: @@ -941,31 +941,43 @@ def generateAndLoadRampSupplementalData(self): #pwSimMat_mets.to_csv("C:/Users/braistedjc/Desktop/Analysis/Ramp/Junk_Test_Mets_Sim_Mat.txt", sep="\t") - analytesSim = pwSimMat_mets.to_csv(sep="\t") - analytesSim = zlib.compress(analytesSim.encode()) + #analytesSim = pwSimMat_mets.to_csv(sep="\t") + #analytesSim = zlib.compress(analytesSim.encode()) + sqlDelete = "delete from ramp_data_object" - sql = "insert into ramp_data_object (data_key, data_blob) values (:data_key, :data_object)" + + sql = "insert into ramp_data_object (data_key, data_blob) values (:data_key, :data_blob)" with self.engine.connect() as conn: + conn.execute(sqlDelete) + + #meta_data = MetaData(bind=conn) + #meta_data.reflect() + #dataObj = meta_data.tables['ramp_data_object'] + vals = dict() vals['data_key'] = 'analyte_result' objVal = pwSimMat_analytes.to_csv(sep="\t") objVal = zlib.compress(objVal.encode()) - vals['data_object'] = objVal + vals['data_blob'] = objVal conn.execute(sql, vals) + #conn.execute(dataObj.insert(), vals) vals['data_key'] = 'metabolites_result' objVal = pwSimMat_mets.to_csv(sep="\t") objVal = zlib.compress(objVal.encode()) - vals['data_object'] = objVal + vals['data_blob'] = objVal conn.execute(sql, vals) - + #conn.execute(dataObj.insert(), vals) + + vals['data_key'] = 'genes_result' objVal = 
pwSimMat_genes.to_csv(sep="\t") objVal = zlib.compress(objVal.encode()) - vals['data_object'] = objVal + vals['data_blob'] = objVal conn.execute(sql, vals) + #conn.execute(dataObj.insert(), vals) for analyteKey in analyteSets: @@ -974,10 +986,14 @@ def generateAndLoadRampSupplementalData(self): objVal = analyteSets[analyteKey] objVal = objVal.to_csv(sep="\t") objVal = zlib.compress(objVal.encode()) - vals['data_object'] = objVal + vals['data_blob'] = objVal + # conn.execute(dataObj.insert(), vals) conn.execute(sql, vals) - conn.close() + conn.close() + + + @@ -1036,7 +1052,7 @@ def __init__(self): self.size = 0 self.id = "" - +#loader = SQLiteDBBulkLoader(dbPropsFile='../../config/ramp_resource_version_update.txt', sqliteFileName="/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_v2.3.0.sqlite") #loader = SQLiteDBBulkLoader(dbPropsFile='../../config/ramp_resource_version_update.txt', sqliteFileName="/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_v2.3.0_Structure.sqlite") #loader.generateAndLoadRampSupplementalData() diff --git a/src/util/rampDBBulkLoader.py b/src/util/rampDBBulkLoader.py index 623b5dc..b3da878 100644 --- a/src/util/rampDBBulkLoader.py +++ b/src/util/rampDBBulkLoader.py @@ -6,11 +6,13 @@ import sys import mysql.connector import pandas as pd +import zlib from pandas.api.types import is_string_dtype import os.path from os import path from sqlalchemy import create_engine from sqlalchemy import MetaData +from sqlalchemy import bindparam import logging from jproperties import Properties from urllib.parse import quote_plus @@ -18,6 +20,7 @@ import time from datetime import date import json +from util.RampSupplementalDataBuilder import RampSupplementalDataBuilder class rampDBBulkLoader(object): @@ -46,7 +49,7 @@ def __init__(self, dbPropsFile): pd.set_option('display.max_columns', None) - + self.engine = self.createMySQLEngine() def remove_whitespace(self, dF): for colName in dF.columns: @@ -55,6 +58,17 @@ def remove_whitespace(self, dF): print("fixing 
column...") return dF + def createMySQLEngine(self): + print("In supplemental data builder, building mysql engine") + #self.dbConf.dumpConfig() + #print(type(dbConf.port)) + #conStr = ("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}?port={port}").format(username=seldbConf.username, conpass=dbConf.conpass, host_url=dbConf.host,dbname=dbConf.dbname,port=dbConf.port) + #print(conStr) + engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}:{port}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname,port=self.dbConf.port)), echo=False) + + return engine + + @@ -107,7 +121,7 @@ def loadFile(self, resource, engine): #data = pd.read_csv(file_path, sep="\t+", header=None, index_col=None, engine="python") data = pd.read_table(file_path, sep="\t", header=None, names=colNames, index_col=None, engine="python") df = pd.DataFrame(data) - + # issue with whitespace df = self.remove_whitespace(df) @@ -152,6 +166,7 @@ def loadIgnore(self, engine, resource): user=self.dbConf.username, password=self.dbConf.conpass, db=self.dbConf.dbname, + port=self.dbConf.port, charset = 'utf8', use_unicode=True) #conn.set_charset_collation('utf16') @@ -246,7 +261,7 @@ def load(self, rampResourceConfigFile): fileResource.initFileResource(config) resources.append(fileResource) - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + engine = self.engine print("Hey in loading loop now") for resource in resources: @@ -265,7 +280,7 @@ def load(self, rampResourceConfigFile): def updateVersionInfo(self, infoFile): print("Updating Version Info") - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, 
host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + engine = self.engine sql = "select ramp_version, load_timestamp from db_version order by load_timestamp desc limit 1" @@ -302,7 +317,7 @@ def updateDataStatusSummary(self): print("starting update entity summary") - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + engine = self.engine sqlMets = "select dataSource, count(distinct(rampId)) from source where geneOrCompound = 'compound' and dataSource not like '%%kegg' group by dataSource" sqlKeggMets = "select count(distinct(rampId)) from source where geneOrCompound = 'compound' and dataSource like '%%_kegg'" @@ -399,7 +414,7 @@ def updateDBVersion(self, incrementLevel = 'increment_patch_release', optionalVe self.dbConf.dumpConfig() - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + engine = self.engine versionSQL = "select * from db_version where load_timestamp = (select max(load_timestamp) from db_version)" @@ -465,7 +480,8 @@ def updateEntityIntersects(self, filterComps=False): vals.append({'met_intersects_json':cmpdIntersects, 'gene_intersects_json':geneIntersects, 'met_intersects_json_pw_mapped':cmpdIntersectsInPW, 'gene_intersects_json_pw_mapped':geneIntersectsInPW}) if self.currDBVersion != None: - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + + engine = self.engine with engine.connect() as conn: meta_data = MetaData(bind=conn) @@ -814,7 +830,7 @@ def updateSourcePathwayCount(self): "where ap.pathwaySource != 'hmdb' group by ap.rampId) 
as metPathwayInfo "\ "set source.pathwayCount = metPathwayInfo.pathwayCount where source.rampId = metPathwayInfo.rampId" - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + engine = self.engine with engine.connect() as conn: conn.execute(sql) @@ -830,7 +846,7 @@ def updateOntologyMetaboliteCounts(self): "(select rampOntologyId, count(distinct(rampCompoundId)) as metCount from analytehasontology group by rampOntologyId)"\ "as ontologyMetInfo set ontology.metCount = ontologyMetInfo.metCount where ontology.rampOntologyId = ontologyMetInfo.rampOntologyId" - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + engine = self.engine with engine.connect() as conn: conn.execute(sql) @@ -841,7 +857,7 @@ def updateOntologyMetaboliteCounts(self): def updateCurrentDBVersionDumpURL(self, dumpUrl): self.dbConf.dumpConfig() - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + engine = self.engine print("Updating DB Version") @@ -860,7 +876,9 @@ def updateCurrentDBVersionDumpURL(self, dumpUrl): def truncateTables(self, tablesToSkip): self.dbConf.dumpConfig() - engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) + engine = self.engine + + #engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, 
host_url=self.dbConf.host,dbname=self.dbConf.dbname)), echo=False) print("Updating DB Version") @@ -874,6 +892,86 @@ def truncateTables(self, tablesToSkip): conn.execute("truncate "+tableName) conn.close() + + + + def generateAndLoadRampSupplementalData(self): + + dataBuilder = RampSupplementalDataBuilder(dbType = 'mysql', sqliteCreds = None, dbConf = self.dbConf) + + dataSources = ['reactome', 'wiki', 'kegg'] + analyteTypes = ['metab', 'gene'] + + pwSimMat_analytes = dataBuilder.buildSimilarityMatrix(matrixType='analytes') + pwSimMat_mets = dataBuilder.buildSimilarityMatrix(matrixType='mets') + pwSimMat_genes = dataBuilder.buildSimilarityMatrix(matrixType='genes') + + analyteSets = dict() + + for source in dataSources: + for analyteType in analyteTypes: + analyteSets[source + "_" + analyteType] = dataBuilder.buildAnalyteSet(dataSource=source, geneOrMet=analyteType) + + #pwSimMat_mets.to_csv("C:/Users/braistedjc/Desktop/Analysis/Ramp/Junk_Test_Mets_Sim_Mat.txt", sep="\t") + + #analytesSim = pwSimMat_mets.to_csv(sep="\t") + #analytesSim = zlib.compress(analytesSim.encode()) + sqlDelete = "delete from ramp_data_object" + + + # sql = "insert into ramp_data_object (data_key, data_blob) values (:data_key, :data_blob)" + sql = "insert into ramp_data_object (data_key, data_blob) values (%s, %s)" + + baseSQL = "insert into ramp_data_object values (" + engine = self.engine + # engine = create_engine((("mysql+pymysql://{username}:{conpass}@{host_url}/{dbname}?port={port}").format(username=self.dbConf.username, conpass=self.dbConf.conpass, host_url=self.dbConf.host,dbname=self.dbConf.dbname,port=self.dbConf.port)), echo=False) + + with engine.connect() as conn: + conn.execute(sqlDelete) + + #meta_data = MetaData(bind=conn) + #meta_data.reflect() + #dataObj = meta_data.tables['ramp_data_object'] + + vals = dict() + + #vals['data_key'] = 'analyte_result' + objVal = pwSimMat_analytes.to_csv(sep="\t") + objVal = zlib.compress(objVal.encode()) + #vals['data_blob'] = 
bindparam(objVal) + #sql = baseSQL + "'analyte_result'" + ', ' + objVal + ')' + conn.execute(sql, 'analyte_result', objVal) + #conn.execute(dataObj.insert(), vals) + + #vals['data_key'] = 'metabolites_result' + objVal = pwSimMat_mets.to_csv(sep="\t") + objVal = zlib.compress(objVal.encode()) + #vals['data_blob'] = objVal + conn.execute(sql, 'metabolites_result', objVal) + + #conn.execute(sql, vals) + #conn.execute(dataObj.insert(), vals) + + + #vals['data_key'] = 'genes_result' + objVal = pwSimMat_genes.to_csv(sep="\t") + objVal = zlib.compress(objVal.encode()) + #vals['data_blob'] = objVal + conn.execute(sql, 'genes_result', objVal) + + #conn.execute(sql, vals) + #conn.execute(dataObj.insert(), vals) + + for analyteKey in analyteSets: + print("Analyte_Key: "+analyteKey) + #vals['data_key'] = analyteKey + objVal = analyteSets[analyteKey] + objVal = objVal.to_csv(sep="\t") + objVal = zlib.compress(objVal.encode()) + #vals['data_blob'] = objVal + conn.execute(sql, analyteKey, objVal) + + conn.close() class dbConfig(object): @@ -889,12 +987,14 @@ def __init__(self, configFile): self.username = dbConfig.get("username").data self.host = dbConfig.get("host").data self.dbname = dbConfig.get("dbname").data + self.port = int(dbConfig.get("port").data) def dumpConfig(self): print(self.host) print(self.dbname) print(self.username) - print(self.conpass) + print(self.conpass) + print(self.port) class rampFileResource(object):