-
Notifications
You must be signed in to change notification settings - Fork 0
/
Exc2.py
223 lines (202 loc) · 7.39 KB
/
Exc2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import matplotlib.pyplot as plt
import nltk as nltk
import numpy as np
import pandas as pd
import pandasql as pdsql
import pydotplus
import scikitplot as skplt
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
def pysql(q):
    """Run the SQL query *q* through pandasql and return its result.

    The module's own ``globals()`` are passed as the lookup environment, so
    table names in the query refer to module-level variables of this script
    (the query further down selects ``from cf``).
    """
    return pdsql.sqldf(q, globals())
# Load the whole table and keep only the columns that are needed.
df = pd.read_csv('Hotels_data_Changed.csv')
# Alternative input for Naive Bayes after cleaning the data:
# df = pd.read_csv('Hotels_data_Changed_minimaized.csv')

# The SQL query below refers to these columns by their space-free names,
# so rename all of them in a single call.
df.rename(
    columns={
        'Snapshot Date': 'SnapshotDate',
        'Checkin Date': 'CheckinDate',
        'Discount Code': 'DiscountCode',
        'Hotel Name': 'HotelName',
    },
    inplace=True,
)
keep_col = ['SnapshotDate', 'CheckinDate', 'DiscountCode', 'HotelName', 'DayDiff', 'WeekDay', 'DiscountDiff']
cf = df[keep_col]

# Group-by query: one row per (SnapshotDate, CheckinDate, HotelName, DayDiff,
# WeekDay) combination, carrying the largest DiscountDiff of that group.
# NOTE(review): DiscountCode is neither aggregated nor part of GROUP BY; with
# pandasql's default SQLite backend its value presumably comes from the row
# that holds max(DiscountDiff) -- confirm before switching backends.
query = (
    'select SnapshotDate, CheckinDate, DiscountCode, HotelName, DayDiff, WeekDay, max(DiscountDiff) '
    'from cf '
    'group by SnapshotDate, CheckinDate, HotelName, DayDiff, WeekDay'
)
df = pysql(query)
# PART 2.2
# Columns used as model inputs.
features = ['SnapshotDate', 'CheckinDate', 'HotelName', 'WeekDay', 'DayDiff']
# Drop the rows with missing values.
df = df.dropna()


def _encode_column(frame, column):
    """Replace ``frame[column]`` with integer codes and return the encoder.

    The column is fitted and transformed in one vectorized call instead of
    calling ``transform`` once per row, which yields the same codes.

    Args:
        frame: DataFrame whose column is overwritten in place.
        column: name of the column to encode.

    Returns:
        The fitted ``LabelEncoder`` for that column.
    """
    encoder = preprocessing.LabelEncoder()
    frame[column] = encoder.fit_transform(frame[column])
    return encoder


def _single_value_translator(encoder):
    """Return a function that maps one raw value to its integer code."""
    def translate(row):
        return encoder.transform([row])[0]
    return translate


# Convert the non-numeric features to integer codes. The fitted encoders stay
# available under their original names for translating values afterwards.
wde = _encode_column(df, 'WeekDay')
hne = _encode_column(df, 'HotelName')
cde = _encode_column(df, 'CheckinDate')
sde = _encode_column(df, 'SnapshotDate')

# Single-value translators, kept under their original names.
translate1 = _single_value_translator(wde)
translate2 = _single_value_translator(hne)
translate3 = _single_value_translator(cde)
translate4 = _single_value_translator(sde)

# Model inputs (X), target (y) and the input column names.
X = df[features]
y = df["DiscountCode"]
columns_names = X.columns.values
# Separator line printed between the individual statistics.
_RULE = "------------------------------------------------------------"


def _print_statistics(y_true, y_pred):
    """Print confusion matrix, accuracy and per-class TP/FP/FN counts.

    Args:
        y_true: true labels of the evaluation split.
        y_pred: labels predicted for the same rows.

    Returns:
        Tuple ``(matrix, accuracy, tp, fp, fn)``: the labelled confusion
        matrix as a DataFrame, the accuracy score, the diagonal of the matrix,
        and the column sums / row sums minus that diagonal.
    """
    counts = confusion_matrix(y_true, y_pred)
    # Number the classes by position (1..n) instead of hard-coding four
    # labels, so the DataFrame can be built for any number of classes.
    # For four classes the labels are identical to the previous fixed ones.
    positions = range(1, counts.shape[0] + 1)
    matrix = pd.DataFrame(
        counts,
        columns=['Predicted %d' % i for i in positions],
        index=['True %d' % i for i in positions],
    )
    print("-------------------------STATISTICS-------------------------")
    print("confusion_matrix:")
    print(matrix)
    print(_RULE)
    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print("accuracy is: %s" % (accuracy,))
    print(_RULE)
    # TP: the diagonal of the confusion matrix
    tp = np.diag(matrix)
    print("TP is: %s" % (tp,))
    print(_RULE)
    # FP: column totals ("Predicted ...") minus the diagonal
    fp = matrix.sum(axis=0) - tp
    print("FP:")
    print(fp)
    print(_RULE)
    # FN: row totals ("True ...") minus the diagonal
    fn = matrix.sum(axis=1) - tp
    print("FN:")
    print(fn)
    print(_RULE)
    return matrix, accuracy, tp, fp, fn


def _show_roc(y_true, y_probas):
    """Plot the ROC curves for the given class probabilities and show them.

    Args:
        y_true: true labels of the evaluation split.
        y_probas: output of the classifier's ``predict_proba`` for that split.
    """
    print("ROC:")
    # NOTE(review): scikit-plot documents plot_roc_curve as deprecated in
    # favour of plot_roc -- confirm the installed version before switching.
    skplt.metrics.plot_roc_curve(y_true, y_probas)
    plt.show()
    print("see diagram")
    print(_RULE)
    print()


# DECISION TREE CLASSIFIER
print("------------------------DECISION TREE-----------------------")
print()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
model = tree.DecisionTreeClassifier()
model.fit(X_train, y_train)
# Test the algorithm and show statistics.
y_predict = model.predict(X_test)
matrix, accuracy, tp, fp, fn = _print_statistics(y_test, y_predict)
# Feature importances, printed in the same order as the column names.
most = model.feature_importances_
print("the most fetcher important: ")
print(columns_names)
print(most)
y_predict2 = model.predict_proba(X_test)
_show_roc(y_test, y_predict2)

# Draw the tree (disabled):
# dot_data = tree.export_graphviz(model,
#                                 feature_names=features,
#                                 out_file=None,
#                                 filled=True,
#                                 rounded=True)
# graph = pydotplus.graph_from_dot_data(dot_data)
# graph.write_png('tree.png')

# NAIVE BAYES CLASSIFIER
print("-------------------------NAIVE BAYES------------------------")
nb = GaussianNB()
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y, random_state=1)
nb.fit(X1_train, y1_train)
predicted = nb.predict(X1_test)
predicted_probas = nb.predict_proba(X1_test)
matrix, accuracy, tp, fp, fn = _print_statistics(y1_test, predicted)
_show_roc(y1_test, predicted_probas)

# KNN CLASSIFIER
print("-------------------------KNN------------------------")
print()
# NOTE(review): KNN is evaluated on a different split (test_size=0.33,
# random_state=42) than the two models above (random_state=1, default
# test size), so its numbers are not directly comparable -- confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
# Test the algorithm and show statistics.
y_predict = knn.predict(X_test)
matrix, accuracy, tp, fp, fn = _print_statistics(y_test, y_predict)
y_predict2 = knn.predict_proba(X_test)
_show_roc(y_test, y_predict2)