forked from unmtransinfo/ProteinGraphML
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBuildKG.py
executable file
·202 lines (177 loc) · 8.65 KB
/
BuildKG.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env python3
"""
Create Protein Disease knowledge graph from DB adapter 'OlegDB' or 'TCRD'.
"""
import sys, os, re, argparse, time, json
import logging
from networkx.readwrite.json_graph import cytoscape_data
from networkx.readwrite.graphml import generate_graphml
from ProteinGraphML.DataAdapter import OlegDB, TCRD
from ProteinGraphML.GraphTools import ProteinDiseaseAssociationGraph
if __name__ == "__main__":
"""
Create a Protein-Disease knowledge graph (KG) from database and save it as a pickle file.
Also, save KG data in log, cyjs, graphml, and tsv format.
"""
DBS = ['olegdb', 'tcrd']
parser = argparse.ArgumentParser(description='Create Protein-Disease knowledge graph (KG) from source RDB')
parser.add_argument('--db', choices=DBS, default="tcrd", help='{0}'.format(str(DBS)))
parser.add_argument('--o', dest="ofile", help='output pickled KG')
parser.add_argument('--logfile', help='KG construction log.')
parser.add_argument('--cyjsfile', help='Save KG as CYJS.')
parser.add_argument('--graphmlfile', help='Save KG as GraphML.')
parser.add_argument('--tsvfile', help='Save KG as TSV.')
# parser.add_argument('--test', help='Build KG but do not save.')
parser.add_argument("-v", "--verbose", action="count", default=0, help="verbosity")
args = parser.parse_args()
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
level=(logging.DEBUG if args.verbose > 1 else logging.INFO))
t0 = time.time()
## Construct base protein-disease map from ProteinDiseaseAssociationGraph.
## Db is PonyORM db (https://docs.ponyorm.org/api_reference.html).
# Make TCRD as the default DB
dbad = OlegDB() if args.db == "olegdb" else TCRD()
pdg = ProteinDiseaseAssociationGraph(dbad)
## ProteinDiseaseAssociationGraph object has helper methods, but
## NetworkX methods also available.
## https://networkx.github.io/documentation/stable/reference/
logging.info('Total nodes: %d; edges: %d' % (pdg.graph.order(), pdg.graph.size()))
## Filter by proteins of interest; this list comes from a DB adapter, but any set will do.
proteins = dbad.loadTotalProteinList().protein_id
proteinSet = set(proteins)
logging.info('Protein set: %d' % (len(proteinSet)))
# Using attach() add edges from DB.
# With this method create graph, which can be saved, avoiding
# need for rebuilding for different diseases, models and analyses.
# Also filter by proteins of interest, in this case it is our original list.
pdg.attach(dbad.loadPPI(proteinSet))
pdg.attach(dbad.loadKegg(proteinSet))
pdg.attach(dbad.loadReactome(proteinSet))
pdg.attach(dbad.loadGo(proteinSet))
try:
pdg.attach(dbad.loadInterpro(proteinSet))
except Exception as e:
logging.error("InterPro failed to load: {0}".format(e))
# TCRD only: (Would these add value?)
try:
pdg.attach(dbad.loadOMIM(proteinSet))
except Exception as e:
logging.error("OMIM failed to load: {0}".format(e))
# try:
# pdg.attach(dbad.loadPfam(proteinSet))
# except Exception as e:
# logging.error("Pfam failed to load: {0}".format(e))
# try:
# pdg.attach(dbad.loadProsite(proteinSet))
# except Exception as e:
# logging.error("Prosite failed to load: {0}".format(e))
# Count node types based on IDs using NetworkX API.
keggNodes = [n for n in list(pdg.graph.nodes) if isinstance(n, str) and n[0:3] == "hsa"]
logging.info("KEGG nodes: %d" % (len(keggNodes)))
reactomeNodes = [n for n in list(pdg.graph.nodes) if isinstance(n, str) and n[0:2] == "R-"]
logging.info("REACTOME nodes: %d" % (len(reactomeNodes)))
goNodes = [n for n in list(pdg.graph.nodes) if isinstance(n, str) and n[0:3] == "GO:"]
logging.info("GO nodes: %d" % (len(goNodes)))
interNodes = [n for n in list(pdg.graph.nodes) if isinstance(n, str) and n[0:3] == "IPR"]
logging.info("INTERPRO nodes: %d" % (len(interNodes)))
# pfamNodes = [n for n in list(pdg.graph.nodes) if isinstance(n, str) and n[0:2]=="PF"]
# logging.info("Pfam nodes: %d"%(len(pfamNodes)))
# prositeNodes = [n for n in list(pdg.graph.nodes) if isinstance(n, str) and n[0:2]=="PS"]
# logging.info("PROSITE nodes: %d"%(len(prositeNodes)))
# Fetch node/edge information from db.
idDescription = dbad.fetchPathwayIdDescription()
idSymbol = dbad.fetchSymbolForProteinId()
try:
idUniprot = dbad.fetchUniprotForProteinId()
except Exception as e:
logging.error("No Uniprot in OlegDB: {0}".format(e))
idUniprot = {}
# add name, symbol and uniprot id to graph nodes
for n in pdg.graph.nodes:
if n in idUniprot:
pdg.graph.nodes[n]['UniprotId'] = idUniprot[n]
else:
pdg.graph.nodes[n]['UniprotId'] = ''
if n in idSymbol:
pdg.graph.nodes[n]['Symbol'] = idSymbol[n]
else:
pdg.graph.nodes[n]['Symbol'] = ''
if n in idDescription:
pdg.graph.nodes[n]['Description'] = idDescription[n]
else:
pdg.graph.nodes[n]['Description'] = ''
# print(pdg.graph.nodes.data())
# Save graph in pickle format.
if args.ofile is not None:
logging.info("Saving pickled graph to: {0}".format(args.ofile))
pdg.save(args.ofile)
# Save graph in CYJS format.
if args.cyjsfile is not None:
logging.info("Saving KG to CYJS file: {0}".format(args.cyjsfile))
gdata = cytoscape_data(pdg.graph)
with open(args.cyjsfile, 'w') as fcyjs:
json.dump(gdata, fcyjs, indent=2)
# Save graph in GraphML format.
if args.graphmlfile is not None:
logging.info("Saving KG to GraphML file: {0}".format(args.graphmlfile))
with open(args.graphmlfile, 'w') as fgraphml:
for line in generate_graphml(pdg.graph, encoding="utf-8", prettyprint=True):
fgraphml.write(line + "\n")
# Log node and edge info.
if args.logfile is not None:
edgeCount = 0
nodeCount = 0
with open(args.logfile, 'w') as flog:
allNodes = set(pdg.graph.nodes)
for node in allNodes:
nodeCount += 1
try:
flog.write('NODE ' + '{id:"' + str(node) + '", desc:"' + idDescription[node] + '"}' + '\n')
except:
logging.error('Node not found: {0}'.format(node))
allEdges = set(pdg.graph.edges)
for edge in allEdges:
edgeCount += 1
try:
flog.write('EDGE ' + '{idSource:"' + str(edge[0]) + '", idTarget:"' + str(edge[1]) + '"}' + '\n')
except:
logging.error('Edge node not found: {0}'.format(node))
logging.info('{0} nodes, {1} edges written to {2}'.format(nodeCount, edgeCount, args.logfile))
# TSV node and edge info, importable by Neo4j.
if args.tsvfile is not None:
edgeCount = 0;
nodeCount = 0;
with open(args.tsvfile, 'w') as fout:
fout.write('node_or_edge\tclass\tid\tname\tsourceId\ttargetId\n')
allNodes = set(pdg.graph.nodes)
for node in allNodes:
nodeCount += 1
try:
name = idDescription[node]
except:
logging.error('idDescription[node] not found: {0}'.format(node))
continue
if re.match(r'\d+$', str(node)):
node_class = "PROTEIN"
elif str(node)[0:3] == "hsa":
node_class = "KEGG"
elif str(node)[0:2] == "R-":
node_class = "REACTOME"
elif str(node)[0:3] == "GO:":
node_class = "GO"
elif str(node)[0:3] == "IPR":
node_class = "INTERPRO"
elif str(node)[0:2] == "MP":
node_class = "MP"
else:
node_class = "Unknown"
fout.write('node\t' + node_class + '\t' + str(node) + '\t' + idDescription[node] + '\t\t\n')
allEdges = set(pdg.graph.edges)
for edge in allEdges:
# if re.match(r'\d+$', str(edge[0])) and re.match(r'\d+$', str(edge[1])):
# continue #protein-protein edges: STRING?
edgeCount += 1
fout.write('edge\t\t\t\t' + str(edge[0]) + '\t' + str(edge[1]) + '\n')
logging.info('{0} nodes, {1} edges written to {2}'.format(nodeCount, edgeCount, args.tsvfile))
logging.info('{0}: elapsed time: {1}'.format(os.path.basename(sys.argv[0]),
time.strftime('%Hh:%Mm:%Ss', time.gmtime(time.time() - t0))))