forked from unmtransinfo/ProteinGraphML
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRunML_OLD.py
executable file
·141 lines (112 loc) · 5.48 KB
/
RunML_OLD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python3
###
import sys,os,time,argparse,logging
import pyreadr,pickle
import numpy as np
import pandas as pd
import networkx as nx
from ProteinGraphML.DataAdapter import OlegDB,selectAsDF
from ProteinGraphML.GraphTools import ProteinDiseaseAssociationGraph
from ProteinGraphML.MLTools.MetapathFeatures import metapathFeatures,ProteinInteractionNode,KeggNode,ReactomeNode,GoNode,InterproNode,getMetapaths
from ProteinGraphML.MLTools.Data import BinaryLabel
from ProteinGraphML.MLTools.Models import XGBoostModel
from ProteinGraphML.MLTools.Procedures import *
from ProteinGraphML.Analysis import Visualize
# Wall-clock start of the run; the elapsed-time log at the end of the script
# is computed relative to this value.
t0 = time.time()

# Default input directory: the "DataForML/" folder under the current working
# directory (overridable with --dir).
DATA_DIR = '{0}/DataForML/'.format(os.getcwd())

# Default number of cross-validation folds (overridable with --crossval_folds).
NUM_OF_FOLDS = 2

# Default pickled knowledge graph (overridable with --kgfile).
# An earlier default, kept for reference, was "newCURRENT_GRAPH".
DEFAULT_GRAPH = "ProteinDisease_GRAPH.pkl"

# Default comma-separated list of static feature sets (overridable with
# --static_data).
DEFAULT_STATIC_FEATURES = "gtex,lincs,ccle,hpa"

# Procedure names accepted as the positional command-line argument.
PROCEDURES = ["XGBCrossVal", "XGBCrossValPred"]
# Command-line interface: a positional procedure name plus optional inputs.
# The parsed namespace is exposed as the plain dict `argData`.
parser = argparse.ArgumentParser(
    description='Run ML Procedure',
    epilog='--disease or --file must be specified; available procedures: {0}'.format(str(PROCEDURES)),
)
parser.add_argument(
    'procedure', metavar='procedure', type=str, choices=PROCEDURES, nargs='+',
    help='ML procedure to run',
)
parser.add_argument(
    '--disease', metavar='disease', type=str, nargs='?',
    help='Mammalian Phenotype ID, e.g. MP_0000180',
)
parser.add_argument(
    '--file', type=str, nargs='?',
    help='input file, pickled training set, e.g. "diabetes.pkl"',
)
parser.add_argument(
    '--dir', default=DATA_DIR,
    help='input dir (default: "{0}")'.format(DATA_DIR),
)
parser.add_argument(
    '--resultdir', type=str, nargs='?',
    help='folder where results will be saved, e.g. "diabetes_no_lincs"',
)
parser.add_argument(
    '--crossval_folds', type=int, default=NUM_OF_FOLDS,
    help='number of folds for average CV (default: "{0}")'.format(NUM_OF_FOLDS),
)
parser.add_argument(
    '--kgfile', default=DEFAULT_GRAPH,
    help='input pickled KG (default: "{0}")'.format(DEFAULT_GRAPH),
)
parser.add_argument(
    '--static_data', default=DEFAULT_STATIC_FEATURES,
    help='(default: "{0}")'.format(DEFAULT_STATIC_FEATURES),
)
parser.add_argument("-v", "--verbose", action="count", default=0, help="verbosity")
argData = vars(parser.parse_args())

# DEBUG logging requires the verbosity flag at least twice (count > 1);
# otherwise the level is INFO.
logLevel = logging.DEBUG if argData['verbose'] > 1 else logging.INFO
logging.basicConfig(format='%(levelname)s:%(message)s', level=logLevel)
# Resolve the training input. Exactly one of --disease / --file is expected:
#   --file    -> load a pickled training set from <--dir>/<--file>
#   --disease -> use the disease id directly
disease = argData['disease']
fileName = argData['file']
fileData = None  # stays None unless a training set is loaded from --file

if disease is None and fileName is None:
    # No input at all: argparse prints usage and exits with a non-zero status.
    parser.error("--disease or --file must be specified.")
elif disease is not None and fileName is not None:
    # Both inputs given. This used to be logged as an error while the run
    # carried on with --disease and silently ignored --file; abort instead.
    logging.error('Wrong parameters passed')
    parser.error("--disease and --file cannot be used together.")
elif fileName is not None:
    # os.path.join tolerates a --dir value with or without a trailing slash.
    pklFile = os.path.join(argData['dir'], fileName)
    diseaseName = fileName.split('.')[0]
    try:
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # --file at training sets produced by a trusted source.
        with open(pklFile, 'rb') as f:
            fileData = pickle.load(f)
    except (OSError, pickle.PickleError, EOFError,
            AttributeError, ImportError, IndexError) as err:
        # Missing/unreadable file or a corrupt/incompatible pickle. Report the
        # actual cause and exit non-zero so callers can detect the failure.
        logging.error('Must generate pickled training set file for the given disease')
        logging.error('Could not load "{0}": {1}'.format(pklFile, err))
        sys.exit(1)
else:
    logging.info("running on this disease: {0}".format(disease))
    diseaseName = disease
# Original author note, still open: "CANT FIND THIS DISEASE" (an unknown
# disease id is not validated here).
# Several procedure names may be given on the command line (nargs='+'), but
# only the first one is run.
Procedure = argData['procedure'][0]
logging.info(f'Procedure: {Procedure}')

# Load the knowledge graph named by --kgfile.
# Original author notes, kept as open issues:
#   - "CANT FIND THIS GRAPH": a missing graph file is not handled here
#   - some diseases cause a "DIVIDE BY 0 error"
graphString = argData['kgfile']
currentGraph = ProteinDiseaseAssociationGraph.load(graphString)
logging.info(f"GRAPH {graphString} LOADED")
# Get result directory and number of folds.
resultDir = argData['resultdir']  # folder where all results will be stored
if resultDir is None:
    logging.error('Result directory is needed')
    # Exit with a non-zero status so shells and pipelines see the failure;
    # the bare exit() used before reported success (status 0).
    sys.exit(1)
nfolds = argData['crossval_folds']  # applicable for average CV
# Node types handed to metapathFeatures to build the metapath feature sets.
nodes = [ProteinInteractionNode, KeggNode, ReactomeNode, GoNode, InterproNode]

# Static feature sets come from the comma-separated --static_data value.
# Surrounding whitespace is stripped and empty entries are dropped, so that
# `--static_data ""` yields no static features (instead of one set named '')
# and "gtex, lincs" is read the same as "gtex,lincs".
staticFeatures = [
    name.strip()
    for name in argData['static_data'].split(',')
    if name.strip()
]
logging.info(staticFeatures)
logging.info("--- USING {0} METAPATH FEATURE SETS".format(len(nodes)))
logging.info("--- USING {0} STATIC FEATURE SETS".format(len(staticFeatures)))
# Look up pathway-id descriptions and protein name/symbol data via the DB
# adapter; both are passed on to the ML procedure later in the script.
dbAdapter = OlegDB()
idDescription = dbAdapter.fetchPathwayIdDescription()
idNameSymbol = dbAdapter.fetchSymbolForProteinId()

# Build the feature table. The pre-loaded training set is forwarded as
# `loadedLists` only when one was read from --file; otherwise the keyword is
# omitted entirely. Missing values are filled with 0.
# NOTE(review): disabled logging in the original indexed fileData[True] and
# fileData[False] for positive/negative labels -- presumably fileData is keyed
# by label; confirm against the training-set generator.
loadedKwargs = {} if fileData is None else {'loadedLists': fileData}
rawFeatures = metapathFeatures(
    disease, currentGraph, nodes, idDescription, staticFeatures, **loadedKwargs
)
trainData = rawFeatures.fillna(0)
# Model persistence is disabled. The inactive snippet is kept for reference;
# it reads argData['modeldir'], which the parser in this script does not define.
#   if not os.path.isdir(argData['modeldir']): os.mkdir(argData['modeldir'])
#   if ('.pkl' in diseaseName):
#       modelName = argData['modeldir'] + diseaseName.split('.')[0] + '.model'
#   else:
#       modelName = argData['modeldir'] + diseaseName + '.model'

# Wrap the feature table in a BinaryLabel and hand it to the selected
# procedure. The procedure is looked up by name in this module's namespace,
# which the star import from ProteinGraphML.MLTools.Procedures populates.
dataset = BinaryLabel()
dataset.loadData(trainData)
runProcedure = locals()[Procedure]
runProcedure(dataset, idDescription, idNameSymbol, resultDir, nfolds)

# Earlier experiments called XGBoostModel().cross_val_predict(...) directly
# with the metrics "roc", "acc", "ConfusionMatrix" and "report"; that path is
# no longer used here.

# Report total wall-clock time since the script started.
elapsed = time.strftime('%Hh:%Mm:%Ss', time.gmtime(time.time() - t0))
logging.info('{0}: elapsed time: {1}'.format(os.path.basename(sys.argv[0]), elapsed))