Skip to content

Commit

Permalink
fix errors and cleanup
Browse files Browse the repository at this point in the history
add requirements.txt for easy python environment creation
  • Loading branch information
KeithKelleher committed May 29, 2024
1 parent c5a61b9 commit 3c6bd05
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 195 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ misc/output/
misc/jupyter/
conf/*db_props*

.idea
3 changes: 2 additions & 1 deletion config/external_resource_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ hmdb_gene http https://hmdb.ca/system/downloads/current/hmdb_proteins.zip hmdb_p
hmdb_met_sdf http https://hmdb.ca/system/downloads/current/structures.zip structures.zip structures.sdf ../misc/data/chemprops/hmdb/ zip chem_props_sdf
reactome_met http http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_mets
reactome_gene http http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_genes
wiki_pathways_mets_genes http https://data.wikipathways.org/20240210/rdf/wikipathways-20240210-rdf-authors.zip wikipathways-20240210-rdf-authors.zip ./authors/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
wiki_pathways_mets_genes http https://data.wikipathways.org/20230810/rdf/wikipathways-20230810-rdf-wp.zip wikipathways-20230810-rdf-wp.zip ./authors/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
chebi_met_sdf ftp https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf ../misc/data/chemprops/chebi/ gzip chem_props_sdf
lipidmaps_met http https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip LMSD.sdf.zip structures.sdf ../misc/data/chemprops/lipidmaps/ zip chem_props_sdf
swissprot_human http https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz uniprot_sprot_human.dat.gz uniprot_sprot_human.dat ../misc/data/uniprot_human/ gzip proteins
Expand All @@ -17,3 +17,4 @@ rhea_rxn_direction http https://ftp.expasy.org/databases/rhea/tsv/rhea-direction
chebi_to_chebi_relations http http://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv relation.tsv relation.tsv ../misc/data/chebi/ none chebi_relations
chebi_ontology_owl http http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz chebi.owl.gz chebi.owl ../misc/data/chebi/ gzip chebi_ontology
expasy_ec2class ftp https://ftp.expasy.org/databases/enzyme/enzclass.txt enzclass.txt enzclass.txt ../misc/data/rhea none expasy_ec2class
expasy_enzyme_dat ftp https://ftp.expasy.org/databases/enzyme/enzyme.dat enzyme.dat enzyme.dat ../misc/data/rhea none expasy_enzyme_dat
6 changes: 3 additions & 3 deletions main/mainSqliteDBLoad.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa

# increment level 'increment_patch_release', 'increment_minor_release',
# or 'specified' (new version, perhaps major release)
loader.loadDBAfterTruncatingTables(sqliteFile = '/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_BASE.sqlite', incrementLevel = 'specified',
optionalVersionOveride = "2.4.2",
optionalVersionNote = "20231107 data update, Rhea reaction to EC reaction class. Reactome Genes Patch.",
loader.loadDBAfterTruncatingTables(sqliteFile = 'RaMP_SQLite_BASE.sqlite', incrementLevel = 'specified',
optionalVersionOveride = "2.6.0",
optionalVersionNote = "20240524 data update",
truncateTables=True)

10 changes: 10 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
rdflib==6.0.2
pandas==1.5.3
lxml
libchebipy
pubchempy

sqlalchemy==1.4.52
jproperties
scikit-learn
numpy
2 changes: 1 addition & 1 deletion src/chemprop/ChemWrangler.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def getCatalyticDistances(self, catalyzesFile):

print("computing catalytic distances")

catMat = pd.read_csv(catalyzesFile, sep='\t', header=None)
catMat = pd.read_csv(catalyzesFile, sep='\t', header=None, engine='python')
catMat.columns = ["compound","protein"]

lowestDist = dict()
Expand Down
18 changes: 9 additions & 9 deletions src/parse/RheaParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def buildSupportingUniprotData(self):

for acc in self.humanUniprotRecordDict:
self.humanPrimaryUniprotAccSet.add(acc)
p = self.humanPrimaryUniprotRecordDict[acc]
p = self.humanUniprotRecordDict[acc]
threadedPrimaryUniprotDict[acc] = p
for acc2 in p.secondaryAccs:
self.humanSecondaryUniprotAccSet.add(acc2)
Expand Down Expand Up @@ -458,7 +458,7 @@ def processReactions(self, g, res):

def processReactionDirectionInfo(self):

dirTable = pd.read_csv(self.rheaLocalRxnDirectionFile, sep="\t", header=0)
dirTable = pd.read_csv(self.rheaLocalRxnDirectionFile, sep="\t", header=0, engine='python')

dirMapping = dict()

Expand Down Expand Up @@ -557,7 +557,7 @@ def setReactionHumanUniprotState(self):
for rheaId in self.rheaReactionDict:
rxn = self.rheaReactionDict[rheaId]
for p in rxn.proteins:
if p in self.humanUniprotAccSet:
if p in self.humanPrimaryUniprotAccSet:
if rxn.status == 1:
numHumanUniprot = numHumanUniprot + 1
if rxn.direction == 'UN':
Expand Down Expand Up @@ -757,7 +757,7 @@ def exportIntermediateFiles(self):
def appendUniprotToReaction(self):
#self.rheaLocalRheaToEcFile
#self.rheaLocalRheaToUniprotFile
r2u = pd.read_csv(self.rheaLocalRheaToUniprotFile, sep="\t", header=0)
r2u = pd.read_csv(self.rheaLocalRheaToUniprotFile, sep="\t", header=0, engine='python')

r2uMap = dict()

Expand All @@ -770,7 +770,7 @@ def appendUniprotToReaction(self):
uniprot = "uniprot:" + row.ID

# !!! just adding human uniprot
if uniprot in self.humanUniprotAccSet:
if uniprot in self.humanPrimaryUniprotAccSet:
#print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

unis = r2uMap.get("rhea:" + str(row.RHEA_ID))
Expand All @@ -797,7 +797,7 @@ def appendUniprotToReaction(self):


# swiss prot
r2u = pd.read_csv(self.rheaLocalRheaToSwissprotFile, sep="\t", header=0)
r2u = pd.read_csv(self.rheaLocalRheaToSwissprotFile, sep="\t", header=0, engine='python')

print(str(r2u.shape))

Expand All @@ -806,7 +806,7 @@ def appendUniprotToReaction(self):
#print("appending protein accessions to reactions..." + str(row.RHEA_ID)+ " " +str(row.ID))

# !!! just adding human uniprot
if ("uniprot:" + row.ID) in self.humanUniprotAccSet:
if ("uniprot:" + row.ID) in self.humanPrimaryUniprotAccSet:
#print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
unis = r2uMap.get("rhea:" + str(row.RHEA_ID))
if unis is None:
Expand All @@ -832,7 +832,7 @@ def appendUniprotToReaction(self):
def appendEcToReaction(self):
#self.rheaLocalRheaToEcFile
#self.rheaLocalRheaToUniprotFile
r2u = pd.read_csv(self.rheaLocalRheaToEcFile, sep="\t", header=0)
r2u = pd.read_csv(self.rheaLocalRheaToEcFile, sep="\t", header=0, engine='python')

r2EcMap = dict()

Expand All @@ -854,7 +854,7 @@ def appendEcToReaction(self):

def ecToEnzymeClassFromExpasy(self):

# ec2class = pd.read_csv(self.expasyLocalEc2ClassFile, sep="\t", skiprows=11, skipfooter=5)
# ec2class = pd.read_csv(self.expasyLocalEc2ClassFile, sep="\t", skiprows=11, skipfooter=5, engine='python')
with open(self.expasyLocalEc2ClassFile, 'r') as ec2c:
ec2classStrings = ec2c.readlines()

Expand Down
52 changes: 25 additions & 27 deletions src/util/EntityBuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,10 @@ def __init__(self, resourceConfig):
# This data source list will eventually be populated by config file
self.source = DataSource()
self.sourceList.append(self.source)


if not os.path.exists(self.source.exportPath):
os.makedirs(self.source.exportPath)

self.dataSource2 = DataSource()
self.dataSource2.sourceName = 'reactome'
self.dataSource2.filePrefix = 'reactome'
Expand Down Expand Up @@ -209,11 +212,6 @@ def fullBuild(self):
self.metaboliteList.collapseMetsOnInchiKeyPrefix()

# loader file writes

# make sql directory if it doesn't exist
if not exists("../misc/sql"):
os.mkdir("../misc/sql")

self.writePathways()
self.writeAnalyteSource()
self.writeAnalyteSynonyms()
Expand Down Expand Up @@ -248,7 +246,7 @@ def loadMetaboList(self, eqMetric = 0):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand Down Expand Up @@ -359,7 +357,7 @@ def addMetaboliteCommonName(self):
source = src.sourceName
file = src.sourceLocPath + "/" + src.filePrefix + "metaboliteCommonName.txt"

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -382,7 +380,7 @@ def addMetaboliteHMDBStatus(self):
# capture id to status dictionary
hmdbStatus = dict()
file = hmdbSrc.sourceLocPath + "/" + hmdbSrc.filePrefix + "metStatus.txt"
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')

for i,row in data.iterrows():
hmdbStatus[row[0]] = row[1]
Expand Down Expand Up @@ -418,7 +416,7 @@ def addMetaboliteSynonyms(self):
if not os.path.exists(file) or os.path.getsize(file) < 1:
return

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand Down Expand Up @@ -463,7 +461,7 @@ def loadPathways(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -490,7 +488,7 @@ def addPathwayCategory(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, engine='python')

for i,row in data.iterrows():
pathway = self.pathList.getPathwayBySourceId(row[0])
Expand Down Expand Up @@ -520,7 +518,7 @@ def buildMetaboliteToPathwayConnections(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand Down Expand Up @@ -573,7 +571,7 @@ def getUniprotSecondaryAccessions(self):

file = "../misc/output/uniprot_human/uniprot_acc_mapping.txt"

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')

for idx, row in data.iterrows():
altId = row[0]
Expand Down Expand Up @@ -602,7 +600,7 @@ def loadGeneList(self, eqMetric = 0):
print("in add gene list... geneInfoDictionary not found for :" + file)
continue

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)
df.drop_duplicates(inplace = True)
Expand Down Expand Up @@ -722,7 +720,7 @@ def loadOntolgies(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -739,7 +737,7 @@ def loadOntolgies(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -756,7 +754,7 @@ def loadOntolgies(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -773,7 +771,7 @@ def loadOntolgies(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -789,7 +787,7 @@ def loadOntolgies(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -806,7 +804,7 @@ def loadOntolgies(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -823,7 +821,7 @@ def loadOntolgies(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand Down Expand Up @@ -862,7 +860,7 @@ def addGeneCommonNameAndSynonyms(self):
if not(path.exists(file)):
continue

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand All @@ -885,7 +883,7 @@ def metaboliteClassConnections(self):
file = src.sourceLocPath + "/" + src.filePrefix + "metaboliteClass.txt"

if(path.exists(file) and src.haveChemClassInfo):
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand Down Expand Up @@ -916,7 +914,7 @@ def buildGeneToPathwayConnections(self):
if not(path.exists(file)):
break

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand Down Expand Up @@ -969,7 +967,7 @@ def loadMetaboliteToGene(self):
if path.exists(file):
# print ("metaboliteToGene mappings for " + source)

data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False)
data = pd.read_csv(file, delimiter=r'\t+', header=None, index_col=None, na_filter = False, engine='python')
df = pd.DataFrame(data)
df = self.remove_whitespace(df)

Expand Down Expand Up @@ -1958,7 +1956,7 @@ def isMappingProblem(self, sourceId, extId):

def populateExclusionList(self, filePath):

data = pd.read_csv(filePath, delimiter=r'\t+', header=0, index_col=None)
data = pd.read_csv(filePath, delimiter=r'\t+', header=0, index_col=None, engine='python')
df = pd.DataFrame(data)

for i,row in df.iterrows():
Expand Down
Loading

0 comments on commit 3c6bd05

Please sign in to comment.