# trainmodel_pipeline.py
#from extractSmallSet import *
#from generateDescriptors import *
import pandas as pd
import matplotlib
import utilities
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use the standalone package
from sklearn.metrics import classification_report as clsr

def clean_dataset(df):
    """Drop rows containing NaN or infinite values and cast everything to float64."""
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

df = pd.read_csv("descDFSmallSet.csv")

# Remove the fields from the data set that we don't want to include in the
# model (there are none in this case).

# Replace categorical data with one-hot encoded data; there are none in this
# case either, so the original dataframe is used directly.
features_df = df

# Remove NaN and infinite entries that may be present.
features_df = clean_dataset(features_df)

# Create the X and y arrays.
y = features_df['Outcome'].to_numpy()
print(len(features_df.columns))

# Remove the Outcome from the feature data, and the index column that got
# generated while reading in the csv.
features_df.drop('Outcome', axis=1, inplace=True)
features_df.drop(features_df.columns[[0]], axis=1, inplace=True)
print(len(features_df.columns))
#utilities.viewTable(features_df)
X = features_df.to_numpy().astype(np.float64)

# Split the data set into a training set (70%) and a test set (30%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
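
# Note: the Outcome classes appear imbalanced (class_weight="balanced" is used
# below), so a stratified split that preserves the class ratio in both sets may
# be preferable. A possible variant (an assumption, not part of the original
# pipeline):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)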
#utilities.viewTable(features_df)
# mds = MDS(n_components=1000)
# transData = mds.fit_transform(X_train[0:1000])
#
# plt.scatter(transData[0], transData[1])
# plt.show()
# print ("Actives in the training set %d" % y_train.sum())
# print ("Actives in the test set %d" % y_test.sum())
#
# clf = DummyClassifier(strategy='most_frequent',random_state=0)
# clf.fit(X_train, y_train)
#
# #print("Dummy score for Training Set: %.4f" % cross_val_score(clf, X_train, y_train, scoring='roc_auc'))
# print(cross_val_score(clf, X_test, y_test, scoring='accuracy'))
# print(cross_val_score(clf, X_test, y_test, scoring='average_precision'))
#print(cross_val_score(clf, X_test, y_test, scoring='f1'))
# model = svm.SVC(C= 1, gamma= 0.0000001)
# model.fit(X_train, y_train)
#

# Train a random forest; class_weight="balanced" reweights the classes to
# compensate for the imbalanced Outcome label.
modelRF = RandomForestClassifier(n_estimators=1000, max_depth=5, max_features=10, class_weight="balanced")
modelRF.fit(X_train, y_train)
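
# A quick check of which descriptors drive the forest, using the standard
# feature_importances_ attribute (a sketch; the top-10 cutoff is arbitrary,
# and the column order matches features_df after the drops above):
importances = modelRF.feature_importances_
for i in np.argsort(importances)[::-1][:10]:
    print("%s: %.4f" % (features_df.columns[i], importances[i]))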
#
# #y_pred = modelRF.predict(X_test)
#
# joblib.dump(model, 'trainedSte3Model.pkl')
#
y_pred = modelRF.predict(X_test)
y_predTr = modelRF.predict(X_train)
#print(y_pred)
print(clsr(y_test, y_pred))
print(clsr(y_train, y_predTr))
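
# A confusion matrix complements the per-class report above (a sketch using
# sklearn.metrics; rows are true classes, columns are predicted classes):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))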
#print (y_test,y_pred)
# print "For RF"
# print("Test set score: %.4f" % modelRF.score(X_test, y_test))
# print("Training Set score: %.4f" % modelRF.score(X_train, y_train))
# print cross_val_score(modelRF, X_test, y_test, scoring='f1')
# accuracy = cross_val_score(modelRF, X_test, y_test, scoring='accuracy')
# precision = cross_val_score(modelRF, X_test, y_test, scoring='average_precision')
# print cross_val_score(modelRF, X_test, y_test, scoring='roc_auc')
#
# print accuracy,precision, (accuracy+precision)/2
#print sum(y_pred), sum(y_test)

modelSVM = svm.SVC()
param_grid = [
    {'C': [1, 10, 100, 1000], 'gamma': [1e-8, 1e-10, 1e-11], 'kernel': ['rbf'], 'class_weight': ['balanced']}
]
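
# The gamma grid above can equivalently be generated with np.logspace; a
# sketch of a slightly denser sweep (an assumption, not the original grid):
# param_grid = [{'C': [1, 10, 100, 1000], 'gamma': list(np.logspace(-11, -8, 4)), 'kernel': ['rbf'], 'class_weight': ['balanced']}]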
# param_grid = {
#
# "max_depth": [2,5,10,20,50,100],
# "n_estimators" : [10,50,100,200,500]
# # 'n_estimators': [500, 1000, 3000],
# # 'max_depth': [4, 6],
# # 'min_samples_leaf': [3, 5, 9, 17],
# # 'learning_rate': [0.1, 0.05, 0.02, 0.01],
# # 'max_features': [1.0, 0.3, 0.1],
# }
# Define the grid search we want to run, with four CPUs in parallel.
gs_cv = GridSearchCV(modelSVM, param_grid, n_jobs=4, verbose=100, scoring="f1")

# Run the grid search - on the training data only! This fit must run before
# cv_results_ and best_params_ can be read below.
gs_cv.fit(X_train, y_train)

# Print the parameters that gave the best result.
print(gs_cv.cv_results_)
print("\n\nThe optimized parameters are:")
print(gs_cv.best_params_)
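
# With the default refit=True, GridSearchCV retrains the best parameter
# combination on the full training set and exposes it as best_estimator_;
# a sketch of checking that model against the held-out test set:
print("Best cross-validated F1: %.4f" % gs_cv.best_score_)
print(clsr(y_test, gs_cv.best_estimator_.predict(X_test)))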
#
# print "For SVM"
# print("model.score for Test set: %.4f" % model.score(X_test, y_test))
# print("model.score for Training set: %.4f" % model.score(X_train, y_train))
# print("F1 score for Test Set: %.4f" % cross_val_score(model, X_test, y_test, scoring='f1'))
# print("F1 score for Training Set: %.4f" % cross_val_score(model, X_train, y_train, scoring='f1'))