Fix errors and clean up

Add requirements.txt for easy Python environment creation
ncats · May 29, 2024 · 3c6bd05 · 3c6bd05
1 parent c5a61b9
commit 3c6bd05
Show file tree

Hide file tree

Showing 9 changed files with 112 additions and 195 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ misc/output/
 misc/jupyter/
 conf/*db_props*
 
+.idea
diff --git a/config/external_resource_config.txt b/config/external_resource_config.txt
@@ -4,7 +4,7 @@ hmdb_gene	http	https://hmdb.ca/system/downloads/current/hmdb_proteins.zip	hmdb_p
 hmdb_met_sdf	http	https://hmdb.ca/system/downloads/current/structures.zip	structures.zip	structures.sdf	../misc/data/chemprops/hmdb/	zip	chem_props_sdf
 reactome_met	http	http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt	ChEBI2Reactome_All_Levels.txt	ChEBI2Reactome_All_Levels.txt	../misc/data/reactome/	none	pathways_mets
 reactome_gene	http	http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt	UniProt2Reactome_All_Levels.txt	UniProt2Reactome_All_Levels.txt	../misc/data/reactome/	none	pathways_genes
-wiki_pathways_mets_genes	http	https://data.wikipathways.org/20240210/rdf/wikipathways-20240210-rdf-authors.zip	wikipathways-20240210-rdf-authors.zip	./authors/	../misc/data/wikipathwaysRDF/	zip	pathways_mets_genes
+wiki_pathways_mets_genes	http	https://data.wikipathways.org/20230810/rdf/wikipathways-20230810-rdf-wp.zip	wikipathways-20230810-rdf-wp.zip	./authors/	../misc/data/wikipathwaysRDF/	zip	pathways_mets_genes
 chebi_met_sdf	ftp	https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz	ChEBI_complete_3star.sdf.gz	ChEBI_complete_3star.sdf	../misc/data/chemprops/chebi/	gzip	chem_props_sdf
 lipidmaps_met	http	https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip	LMSD.sdf.zip	structures.sdf	../misc/data/chemprops/lipidmaps/	zip	chem_props_sdf
 swissprot_human	http	https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz	uniprot_sprot_human.dat.gz	uniprot_sprot_human.dat	../misc/data/uniprot_human/	gzip	proteins
@@ -17,3 +17,4 @@ rhea_rxn_direction	http	https://ftp.expasy.org/databases/rhea/tsv/rhea-direction
 chebi_to_chebi_relations	http	http://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv	relation.tsv	relation.tsv	../misc/data/chebi/	none	chebi_relations
 chebi_ontology_owl	http	http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz	chebi.owl.gz	chebi.owl	../misc/data/chebi/	gzip	chebi_ontology
 expasy_ec2class	ftp	https://ftp.expasy.org/databases/enzyme/enzclass.txt	enzclass.txt	enzclass.txt	../misc/data/rhea	none	expasy_ec2class
+expasy_enzyme_dat	ftp	https://ftp.expasy.org/databases/enzyme/enzyme.dat	enzyme.dat	enzyme.dat	../misc/data/rhea	none	expasy_enzyme_dat
diff --git a/main/mainSqliteDBLoad.py b/main/mainSqliteDBLoad.py
@@ -79,8 +79,8 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa
 
 # increment level 'increment_patch_release', 'increment_minor_release', 
 # or 'specified' (new version, perhaps major release)
-loader.loadDBAfterTruncatingTables(sqliteFile = '/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_BASE.sqlite', incrementLevel = 'specified',  
-                                   optionalVersionOveride = "2.4.2", 
-                                   optionalVersionNote = "20231107 data update, Rhea reaction to EC reaction class. Reactome Genes Patch.", 
+loader.loadDBAfterTruncatingTables(sqliteFile = 'RaMP_SQLite_BASE.sqlite', incrementLevel = 'specified',
+                                   optionalVersionOveride = "2.6.0",
+                                   optionalVersionNote = "20240524 data update",
                                    truncateTables=True)
 
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,10 @@
+rdflib==6.0.2
+pandas==1.5.3
+lxml
+libchebipy
+pubchempy
+
+sqlalchemy==1.4.52
+jproperties
+scikit-learn
+numpy
diff --git a/src/chemprop/ChemWrangler.py b/src/chemprop/ChemWrangler.py
@@ -565,7 +565,7 @@ def getCatalyticDistances(self, catalyzesFile):
 
         print("computing catalytic distances")
 
-        catMat = pd.read_csv(catalyzesFile, sep='\t', header=None)
+        catMat = pd.read_csv(catalyzesFile, sep='\t', header=None, engine='python')
         catMat.columns = ["compound","protein"]
 
         lowestDist = dict()

diff --git a/src/parse/RheaParser.py b/src/parse/RheaParser.py
@@ -126,7 +126,7 @@ def buildSupportingUniprotData(self):
 
         for acc in self.humanUniprotRecordDict:
             self.humanPrimaryUniprotAccSet.add(acc)
-            p = self.humanPrimaryUniprotRecordDict[acc]
+            p = self.humanUniprotRecordDict[acc]
             threadedPrimaryUniprotDict[acc] = p
             for acc2 in p.secondaryAccs:
                 self.humanSecondaryUniprotAccSet.add(acc2)
@@ -458,7 +458,7 @@ def processReactions(self, g, res):
 
     def processReactionDirectionInfo(self):
 
-        dirTable = pd.read_csv(self.rheaLocalRxnDirectionFile, sep="\t", header=0)
+        dirTable = pd.read_csv(self.rheaLocalRxnDirectionFile, sep="\t", header=0, engine='python')
 
         dirMapping = dict()
 
@@ -557,7 +557,7 @@ def setReactionHumanUniprotState(self):
         for rheaId in self.rheaReactionDict:
             rxn = self.rheaReactionDict[rheaId]
             for p in rxn.proteins:
-                if p in self.humanUniprotAccSet:
+                if p in self.humanPrimaryUniprotAccSet:
                     if rxn.status == 1:
                         numHumanUniprot = numHumanUniprot + 1
                         if rxn.direction == 'UN':
@@ -757,7 +757,7 @@ def exportIntermediateFiles(self):
     def appendUniprotToReaction(self):
         #self.rheaLocalRheaToEcFile
         #self.rheaLocalRheaToUniprotFile
-        r2u = pd.read_csv(self.rheaLocalRheaToUniprotFile, sep="\t", header=0)
+        r2u = pd.read_csv(self.rheaLocalRheaToUniprotFile, sep="\t", header=0, engine='python')
 
         r2uMap = dict()
 
@@ -770,7 +770,7 @@ def appendUniprotToReaction(self):
             uniprot = "uniprot:" + row.ID
 
             # !!! just adding human uniprot            
-            if uniprot in self.humanUniprotAccSet:
+            if uniprot in self.humanPrimaryUniprotAccSet:
                 #print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
 
                 unis = r2uMap.get("rhea:" + str(row.RHEA_ID))
@@ -797,7 +797,7 @@ def appendUniprotToReaction(self):
 
 
         # swiss prot    
-        r2u = pd.read_csv(self.rheaLocalRheaToSwissprotFile, sep="\t", header=0)
+        r2u = pd.read_csv(self.rheaLocalRheaToSwissprotFile, sep="\t", header=0, engine='python')
 
         print(str(r2u.shape))
 
@@ -806,7 +806,7 @@ def appendUniprotToReaction(self):
             #print("appending protein accessions to reactions..." + str(row.RHEA_ID)+ "  " +str(row.ID))
 
             # !!! just adding human uniprot            
-            if ("uniprot:" + row.ID) in self.humanUniprotAccSet:
+            if ("uniprot:" + row.ID) in self.humanPrimaryUniprotAccSet:
                 #print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                 unis = r2uMap.get("rhea:" + str(row.RHEA_ID))
                 if unis is None:     
@@ -832,7 +832,7 @@ def appendUniprotToReaction(self):
     def appendEcToReaction(self):
         #self.rheaLocalRheaToEcFile
         #self.rheaLocalRheaToUniprotFile
-        r2u = pd.read_csv(self.rheaLocalRheaToEcFile, sep="\t", header=0)
+        r2u = pd.read_csv(self.rheaLocalRheaToEcFile, sep="\t", header=0, engine='python')
 
         r2EcMap = dict()
 
@@ -854,7 +854,7 @@ def appendEcToReaction(self):
 
     def ecToEnzymeClassFromExpasy(self):
 
-        # ec2class = pd.read_csv(self.expasyLocalEc2ClassFile, sep="\t", skiprows=11, skipfooter=5)
+        # ec2class = pd.read_csv(self.expasyLocalEc2ClassFile, sep="\t", skiprows=11, skipfooter=5, engine='python')
         with open(self.expasyLocalEc2ClassFile, 'r') as ec2c:
             ec2classStrings = ec2c.readlines()
 

diff --git a/src/util/EntityBuilder.py b/src/util/EntityBuilder.py
@@ -89,7 +89,10 @@ def __init__(self, resourceConfig):
         # This data source list will eventually be populated by config file
         self.source = DataSource()        
         self.sourceList.append(self.source)
-
+
+        if not os.path.exists(self.source.exportPath):
+            os.makedirs(self.source.exportPath)
+
         self.dataSource2 = DataSource()
         self.dataSource2.sourceName = 'reactome'
         self.dataSource2.filePrefix = 'reactome'
@@ -209,11 +212,6 @@ def fullBuild(self):
         self.metaboliteList.collapseMetsOnInchiKeyPrefix()
 
         # loader file writes
-
-        # make sql directory if it doesn't exist
-        if not exists("../misc/sql"):
-            os.mkdir("../misc/sql")
-
         self.writePathways()
         self.writeAnalyteSource()
         self.writeAnalyteSynonyms()
@@ -248,7 +246,7 @@ def loadMetaboList(self, eqMetric = 0):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -359,7 +357,7 @@ def addMetaboliteCommonName(self):
             source = src.sourceName
             file = src.sourceLocPath + "/" + src.filePrefix + "metaboliteCommonName.txt"
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -382,7 +380,7 @@ def addMetaboliteHMDBStatus(self):
             # capture id to status dictionary
             hmdbStatus = dict()
             file = hmdbSrc.sourceLocPath + "/" + hmdbSrc.filePrefix + "metStatus.txt"
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
 
             for i,row in data.iterrows():
                 hmdbStatus[row[0]] = row[1]
@@ -418,7 +416,7 @@ def addMetaboliteSynonyms(self):
             if not os.path.exists(file) or os.path.getsize(file) < 1:
                 return
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -463,7 +461,7 @@ def loadPathways(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -490,7 +488,7 @@ def addPathwayCategory(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, engine='python')
 
             for i,row in data.iterrows():
                 pathway = self.pathList.getPathwayBySourceId(row[0])
@@ -520,7 +518,7 @@ def buildMetaboliteToPathwayConnections(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -573,7 +571,7 @@ def getUniprotSecondaryAccessions(self):
 
         file = "../misc/output/uniprot_human/uniprot_acc_mapping.txt"
 
-        data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+        data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
 
         for idx, row in data.iterrows():
             altId = row[0]
@@ -602,7 +600,7 @@ def loadGeneList(self, eqMetric = 0):
                 print("in add gene list... geneInfoDictionary not found for :" + file)
                 continue
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
             df.drop_duplicates(inplace = True)
@@ -722,7 +720,7 @@ def loadOntolgies(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -739,7 +737,7 @@ def loadOntolgies(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -756,7 +754,7 @@ def loadOntolgies(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -773,7 +771,7 @@ def loadOntolgies(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -789,7 +787,7 @@ def loadOntolgies(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -806,7 +804,7 @@ def loadOntolgies(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -823,7 +821,7 @@ def loadOntolgies(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -862,7 +860,7 @@ def addGeneCommonNameAndSynonyms(self):
             if not(path.exists(file)):
                 continue
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -885,7 +883,7 @@ def metaboliteClassConnections(self):
             file = src.sourceLocPath + "/" + src.filePrefix + "metaboliteClass.txt"
 
             if(path.exists(file) and src.haveChemClassInfo):    
-                data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)            
+                data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
                 df = pd.DataFrame(data)
                 df = self.remove_whitespace(df)
 
@@ -916,7 +914,7 @@ def buildGeneToPathwayConnections(self):
             if not(path.exists(file)):
                 break
 
-            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+            data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
             df = pd.DataFrame(data)
             df = self.remove_whitespace(df)
 
@@ -969,7 +967,7 @@ def loadMetaboliteToGene(self):
             if path.exists(file):
 #                print ("metaboliteToGene mappings for " + source)
 
-                data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
+                data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
                 df = pd.DataFrame(data)
                 df = self.remove_whitespace(df)
 
@@ -1958,7 +1956,7 @@ def isMappingProblem(self, sourceId, extId):
 
     def populateExclusionList(self, filePath):
 
-        data = pd.read_csv(filePath, delimiter=r'\t+', header=0, index_col=None)
+        data = pd.read_csv(filePath, delimiter=r'\t+', header=0, index_col=None, engine='python')
         df = pd.DataFrame(data)
 
         for i,row in df.iterrows():