-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathsolution_03_potato.py
94 lines (69 loc) · 3.73 KB
/
solution_03_potato.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
## Split the data into a train and a test set: 25% of the samples are held
## out, `stratify=y` asks for the split to follow the class labels, and the
## fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dfTT,
    y,
    test_size=0.25,
    stratify=y,
    random_state=668141,
)

## Counter is a handy class which takes a list and builds a dictionary whose
## keys are the unique items and whose values are their number of occurrences.
from collections import Counter

print("train set", Counter(y_train))
print("test set", Counter(y_test))
# Start with a feature selection: score the columns with chi2 on the
# training data only and keep the 20 best ones.
skb = SelectKBest(chi2, k=20).fit(X_train, y_train)

# Apply the same boolean column mask to both sets so that they keep
# exactly the same features.
kept_columns = skb.get_support()
X_train_reduced = X_train.loc[:, kept_columns]
X_test_reduced = X_test.loc[:, kept_columns]
# Training pipeline with a single step named "classifier". The decision tree
# is only a placeholder: every grid below replaces it through its
# 'classifier' entry.
pipe = Pipeline(steps=[("classifier", DecisionTreeClassifier())])

# Candidate learning algorithms, each with its own hyperparameter grid.
# Keys prefixed with 'classifier__' are forwarded to the "classifier" step.
grid_param = [
    # Random forest: both split criteria, 1 to 901 trees in steps of 100.
    {
        'classifier': [RandomForestClassifier(n_jobs=-1, class_weight='balanced')],
        'classifier__criterion': ['entropy', 'gini'],
        'classifier__n_estimators': np.arange(1, 1000, 100),
        'classifier__max_depth': [2],
        'classifier__min_samples_split': [2],
        'classifier__min_samples_leaf': [1],
    },
    # Gradient boosting: learning rates from 0.01 in steps of 0.02 (below 0.1),
    # 1 to 81 trees in steps of 20.
    {
        "classifier": [GradientBoostingClassifier()],
        'classifier__learning_rate': np.arange(0.01, 0.1, 0.02),
        'classifier__n_estimators': np.arange(1, 100, 20),
        'classifier__max_depth': [2],
        'classifier__min_samples_split': [2],
        'classifier__min_samples_leaf': [1],
    },
    # AdaBoost: 1 to 801 estimators in steps of 200, same learning rates.
    {
        'classifier': [AdaBoostClassifier()],
        'classifier__n_estimators': np.arange(1, 1000, 200),
        'classifier__learning_rate': np.arange(0.01, 0.1, 0.02),
    },
]
## NOTE: %%time is a Jupyter *cell magic*, not Python. It measures and reports
# the time a cell takes to run, but it only works when it is the very first
# line of the cell. Anywhere else (as here) it is an error, so it is kept
# commented out: to use it, copy it uncommented to the top of the cell.
# %%time

# Portable replacement for the magic: time the search with the standard library.
import time

grid_search_start = time.perf_counter()

# 5-fold cross-validated search over every candidate in grid_param, ranked by ROC AUC.
gridsearch_Potato = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1, scoring='roc_auc')

# Fit grid search; the object returned by fit() is what later code calls best_model_Potato.
best_model_Potato = gridsearch_Potato.fit(X_train_reduced, y_train)

print(f"Grid search wall time: {time.perf_counter() - grid_search_start:.1f} s")
print(best_model_Potato.best_params_)
print("Model roc_auc on test set:", best_model_Potato.score(X_test_reduced, y_test))

## predicting the labels on the test set
y_pred_test = best_model_Potato.predict(X_test_reduced)
# Summarise the selected hyperparameters on one line. Each entry reads
# "<name> <value>", where <name> is the part of the grid key after the first
# "__" (it is empty for a key without "__").
title = [
    f"{param.partition('__')[-1]} {value}"
    for param, value in best_model_Potato.best_params_.items()
]
plotTitle = " - ".join(title)
# Score the tuned model on the held-out potato test set. The grid search is
# built with scoring='roc_auc', so score() reports that metric, not accuracy.
# Only objects defined in this file are used here; the test-set predictions
# are already available in y_pred_test.
y_test_score = best_model_Potato.score(X_test_reduced, y_test)
print('Grid best parameter (max. roc_auc) model on test: ', y_test_score)
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
# (true labels on the y axis, predicted labels on the x axis).
confusion_m = confusion_matrix(y_test, y_pred_test)
class_names = ['White', 'Yellow']

plt.figure(figsize=(5.5, 4))
sns.heatmap(
    confusion_m,
    annot=True,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel('True label')
plt.xlabel('Predicted label')

# ROC curve of the fitted grid search on the test set.
from sklearn.metrics import RocCurveDisplay

RocCurveDisplay.from_estimator(best_model_Potato, X_test_reduced, y_test)
## Take the fitted classifier out of the best pipeline (its first and only step).
## NOTE: despite the name RF, this is whichever candidate the grid search selected.
RF = best_model_Potato.best_estimator_.steps[0][1]

# One importance weight per feature the classifier was trained on.
w = RF.feature_importances_

# Names of the columns kept by the feature selection, in the same order.
selectedFeatures = dfTT.columns[skb.get_support()]
featureW = pd.DataFrame({'feature': selectedFeatures, 'weight': w})

# Sort by absolute weight, largest first.
featureWsorted = featureW.sort_values(
    by=['weight'],
    ascending=False,
    key=pd.Series.abs,
)

# Show only the features whose weight is not (numerically) zero.
print('Features sorted per importance:')
has_weight = ~np.isclose(featureWsorted["weight"], 0)
featureWsorted.loc[has_weight]