ML_models_WHITE.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statistics import mean
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
# Load the white-wine dataset
df_white = pd.read_csv('Wine DATASET/WHITE/winequality-white.csv')
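# Note (assumption): the raw UCI copy of winequality-white.csv uses ';' as the field
# separator, in which case the read above would need e.g.
#   df_white = pd.read_csv('Wine DATASET/WHITE/winequality-white.csv', sep=';')
# The path used here presumably points to a comma-separated copy.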
# df_white = df_white.drop(labels= ['fixed acidity', 'sulphates', 'citric acid', 'chlorides', 'pH', 'residual sugar'], axis=1)
df_white = df_white.drop(labels=['density', 'total sulfur dioxide'], axis=1)
# Create the target array y from the last column (quality)
# Alternatives: y = pd.Series.to_numpy(y_series)  or  y = np.array(df_white.iloc[:, -1])
y_series = df_white.iloc[:, -1]
# Encode the quality labels as consecutive integers starting at 0
converter = LabelEncoder()
y = converter.fit_transform(y_series)
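# For reference, LabelEncoder sorts the distinct quality values and maps them to
# 0..n_classes-1. A minimal sketch (assuming the usual white-wine quality scores
# of 3 to 9; check converter.classes_ against the actual CSV):
#   converter.classes_        # e.g. array([3, 4, 5, 6, 7, 8, 9])
#   converter.transform([6])  # e.g. array([3])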
# SCALING THE FEATURES: standardise each feature to zero mean and unit variance
# (note that the scaler is fit on the full dataset before the train/test split)
scaler = StandardScaler()
X = scaler.fit_transform(np.array(df_white.iloc[:, :-1], dtype=float))
# Hold out 898 samples for the final test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=898, random_state=40)
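# The quality classes are imbalanced, so a stratified hold-out split would be a
# possible variation (a sketch, not part of the original experiment):
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=898, random_state=40, stratify=y)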
# # DECISION TREE
# tree_clf = DecisionTreeClassifier(max_depth=None)
# tree_clf.fit(X_train, y_train)
# y_test_pred = tree_clf.predict(X_test)
#
# RANDOM FOREST (fitted inside the cross-validation loop below)
rnd_clf = RandomForestClassifier(n_estimators=200)
# rnd_clf.fit(X_train, y_train)
# y_test_pred = rnd_clf.predict(X_test)
# 10-fold stratified cross-validation on the training split
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
list_of_accuracies = []
for train_index, valid_index in skf.split(X_train, y_train):
    # the indices returned by skf.split refer to X_train / y_train, not to the full X / y
    X_train_fold, X_valid_fold = X_train[train_index], X_train[valid_index]
    y_train_fold, y_valid_fold = y_train[train_index], y_train[valid_index]
    rnd_clf.fit(X_train_fold, y_train_fold)
    y_valid_pred = rnd_clf.predict(X_valid_fold)
    list_of_accuracies.append(accuracy_score(y_true=y_valid_fold, y_pred=y_valid_pred))
print("List of accuracies: ", list_of_accuracies)
print("The best accuracy: ", max(list_of_accuracies))
print("Minimum accuracy: ", min(list_of_accuracies))
print("Overall accuracy: ", mean(list_of_accuracies))
y_test_pred = rnd_clf.predict(X_test)
# #### Extract feature importances (this analysis motivated the column drops above)
# print('FEATURE IMPORTANCE:')
# print(rnd_clf.feature_importances_)
# feat_importances = pd.Series(rnd_clf.feature_importances_, index=df_white.iloc[:, :-1].columns)
# feat_importances.nlargest(len(feat_importances)).plot(kind='barh')
# plt.show()
# => deciding to drop the following columns:
# fixed acidity; sulphates; citric acid; chlorides; pH; residual sugar
# ## POLYNOMIAL KERNEL SVC
# svm_clf = Pipeline([("svm_clf", SVC(kernel='poly', C=10, degree=6))])
# svm_clf.fit(X_train, y_train)
# y_test_pred = svm_clf.predict(X_test)
print("Accuracy for wine quality: ", accuracy_score(y_true=y_test,y_pred=y_test_pred))
print(classification_report(y_test,y_test_pred))
matrix = confusion_matrix(y_test, y_test_pred)
print(matrix)
# displaying the confusion_matrix
display = ConfusionMatrixDisplay(matrix)
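# Without display_labels the axes show the encoded classes 0..n-1; to label them
# with the original quality scores instead, one option (a sketch, not in the
# original) is:
# display = ConfusionMatrixDisplay(matrix, display_labels=converter.classes_)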
# Plot the confusion matrix with the chosen colour map
display.plot(cmap='Greens', xticks_rotation=25)
# Other possible options for colour map are:
# 'autumn_r', 'Blues', 'cool', 'Greens', 'Greys', 'PuRd', 'copper_r'
# Setting fontsize for xticks and yticks
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Giving name to the plot
plt.title('Confusion Matrix', fontsize=24)
# Showing the plot
plt.show()