Skip to content

Commit

Permalink
Merge pull request #7 from ncats/sqlite_inchi_duplex_patch
Browse files (browse the repository at this point in the history)
Sqlite inchi duplex patch
  • Loading branch information
johnbraisted authored Feb 16, 2024
2 parents 6bed4f3 + 17ba30f commit 41a3427
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 109 deletions.
35 changes: 26 additions & 9 deletions src/parse/reactomeData.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,11 +389,16 @@ def getCommonNameFromUniprot(self):
files = os.listdir("../misc/data/Uniprot/")
path = "../misc/data/Uniprot/"
i = 0
haveMapping = False
haveGeneSymbol = False
#print('Parsing UniProt files ...')
for f in files:
i = i + 1
#if i % 1000 == 0:
#print('Processing {} files'.format(i))

haveMapping = False
haveGeneSymbol = False
try:
tree = ET.parse(path + f)
geneid = f.replace(".xml","")
Expand All @@ -410,31 +415,43 @@ def getCommonNameFromUniprot(self):
#print(geneid+":"+name.text)
try:
mapping = self.geneInfoDictionary['uniprot:'+geneid]
haveMapping = True
mapping["common_name"] = "gene_symbol:"+name.text
haveGeneSymbol = True
except KeyError:
pass
print("Key Error for "+geneid+" in file "+f)

# print("Raw data does not have this ID ...")
# print(geneid)

# we now have uniprot to 'common_name', which is really the gene symbol.
# now we want to grab the NCBI/Entrez 'GeneID'
# if childtag == "dbReference":
# if child2.get("type") == "GeneID":
# geneId = child2.get("id")
# geneId = 'entrez:'+geneId
if childtag == "dbReference":
if child2.get("type") == "GeneID":

if not haveMapping:
print("Hey we are adding a gene id but don't have new mapping. Uniprot:"+f)
# we don't have the mapping for the protein based from above...
# jump to next child... eventually next file.
continue

geneId = child2.get("id")
geneId = 'entrez:'+geneId
# protein to gene can be 1:n, so the gene ids have to be stored as a list
# let's check for an existing value
# idList = mapping.get("small_e_entrez", None)
# if(idList == None):
# idList = list()
# mapping["small_e_entrez"] = idList
idList = mapping.get("small_e_entrez", None)
if(idList == None):
idList = list()
mapping["small_e_entrez"] = idList

# idList.append(geneId)
idList.append(geneId)



except ET.ParseError:
print("Skip {} ...".format(f))

pass

# def checkFiles(self):
Expand Down
68 changes: 0 additions & 68 deletions src/update/RaMPDatabase.py

This file was deleted.

27 changes: 0 additions & 27 deletions src/update/RaMPFixer.py

This file was deleted.

Binary file removed src/update/__pycache__/RaMPDatabase.cpython-35.pyc
Binary file not shown.
Binary file removed src/update/__pycache__/RaMPFixer.cpython-35.pyc
Binary file not shown.
18 changes: 13 additions & 5 deletions src/util/SQLiteDBBulkLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,6 @@ def loadFile(self, resource, engine):
df = df.drop_duplicates(ignore_index=False, inplace=False, keep='first')
print(str(df.shape))

print(df.head(n=5))
table = resource.destTable
# this loads the data frame into the table.
try:
Expand Down Expand Up @@ -834,19 +833,28 @@ def updateSourcePathwayCount(self):
# conn.execute(sql)
# conn.close()

sql = "select ap.rampId, count(distinct(ap.pathwayRampId)) as pathwayCount from analytehaspathway ap "\
sql = "select count(distinct(ap.pathwayRampId)) as pathwayCount, ap.rampId from analytehaspathway ap "\
"where ap.pathwaySource != 'hmdb' group by ap.rampId"

sql2 = "update source set pathwayCount = :pathwayCount where rampId = :rampId"

with self.engine.connect() as conn:
df = conn.execute(sql).all()
df = pd.DataFrame(df)
df.columns = ["rampId", "pathwayCount"]

df.columns = ["pathwayCount", "rampId"]

print("setting pw count... shape=")
print(df.shape)
print(df.head(10))

k = 0
for i,row in df.iterrows():
k = k + 1
if k < 10:
print(row)
print("\n")
conn.execute(sql2, row)

conn.close()

print("Finished: updating pathway counts in source table")
Expand Down

0 comments on commit 41a3427

Please sign in to comment.