-
Notifications
You must be signed in to change notification settings - Fork 0
/
gridSearch.py
96 lines (68 loc) · 3.14 KB
/
gridSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
A simple script that demonstrates how we can use grid search to set the parameters of a classifier
"""
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#read the reviews and their polarities from a given file
def loadData(fname):
    """Read tab-separated reviews and their polarity labels from a file.

    Every non-blank line is expected to hold the review text, a tab
    character, and then the label. Review text is lower-cased; labels are
    returned as the raw strings found in the file.

    Args:
        fname: Path of the text file to read.

    Returns:
        A ``(reviews, labels)`` tuple of two parallel lists of strings.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    reviews = []
    labels = []
    # ``with`` guarantees the file is closed even when a malformed line
    # raises part-way through the loop.
    with open(fname) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing on the two-value unpack.
                continue
            # Split on the LAST tab so a tab inside the review text does not
            # break the record; the label is always the final field.
            review, sep, rating = line.rpartition('\t')
            if not sep:
                raise ValueError(
                    '{}: line {} has no tab separator: {!r}'.format(
                        fname, line_no, line))
            reviews.append(review.lower())
            labels.append(rating)
    return reviews, labels
#load the training and testing reviews together with their labels
rev_train,labels_train=loadData('reviews_train.txt')
rev_test,labels_test=loadData('reviews_test.txt')
#Build a counter based on the training dataset (NLTK's English stop words are ignored)
counter = CountVectorizer(stop_words=stopwords.words('english'))
#learn the vocabulary and transform each training doc into a count vector in a single pass
#(fit_transform gives the same matrix as fit followed by transform without tokenizing the corpus twice)
counts_train = counter.fit_transform(rev_train)
#transform the testing data using the vocabulary learned from the training data
counts_test = counter.transform(rev_test)
#the three base classifiers whose parameters are tuned below
KNN_classifier=KNeighborsClassifier()
#liblinear supports both the 'l1' and 'l2' penalties in LREG_grid;
#the default lbfgs solver rejects 'l1', which would make half of the grid fail to fit
LREG_classifier=LogisticRegression(solver='liblinear')
DT_classifier = DecisionTreeClassifier()
#=======================================================================================
#build the parameter grid
KNN_grid = [{'n_neighbors': [1,3,5,7,9,11,13,15,17], 'weights':['uniform','distance']}]
#build a grid search to find the best parameters
gridsearchKNN = GridSearchCV(KNN_classifier, KNN_grid, cv=5)
#run the grid search
gridsearchKNN.fit(counts_train,labels_train)
#=======================================================================================
#build the parameter grid
DT_grid = [{'max_depth': [3,4,5,6,7,8,9,10,11,12],'criterion':['gini','entropy']}]
#build a grid search to find the best parameters
gridsearchDT = GridSearchCV(DT_classifier, DT_grid, cv=5)
#run the grid search
gridsearchDT.fit(counts_train,labels_train)
#=======================================================================================
#build the parameter grid
LREG_grid = [ {'C':[0.5,1,1.5,2],'penalty':['l1','l2']}]
#build a grid search to find the best parameters
gridsearchLREG = GridSearchCV(LREG_classifier, LREG_grid, cv=5)
#run the grid search
gridsearchLREG.fit(counts_train,labels_train)
#=======================================================================================
#GridSearchCV tunes clones of the estimators, so KNN_classifier, LREG_classifier and
#DT_classifier still hold their default parameters at this point.
#Build the voting ensemble from the tuned best_estimator_ of each search instead,
#otherwise the results of the three grid searches are never used.
predictors=[('knn',gridsearchKNN.best_estimator_),('lreg',gridsearchLREG.best_estimator_),('dt',gridsearchDT.best_estimator_)]
VT=VotingClassifier(predictors)
VT.fit(counts_train,labels_train)
#use the VT classifier to predict
predicted=VT.predict(counts_test)
#print the accuracy (accuracy_score expects the true labels first, then the predictions)
print (accuracy_score(labels_test,predicted))
"""
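USE THIS IF YOU WANT TO SEE THE ACCURACY FOR EACH PARAM CONFIGURATION IN A GRID
#print the mean cross-validation score for each parameter setting
#(cv_results_ replaces the grid_scores_ attribute, which no longer exists in scikit-learn)
for params, mean_score in zip(gridsearchKNN.cv_results_['params'], gridsearchKNN.cv_results_['mean_test_score']):
    print(params, mean_score)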
"""