# forked from arjunkalburgi/nltktutorial — tutprt16.py
'''
Text Classifier: An algorithm that is able to classify a text based on its contents.
Classic example: classifying an email as spam or not - applies to two distinct classes.
Sentiment Analysis - positive or negative connotation.
'''
import nltk
import random
from nltk.corpus import movie_reviews

# Build one (word_list, sentiment) pair per review file: the first element is
# the review's tokenized words, the second its "pos"/"neg" category label.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
# Shuffle so the pos/neg examples are interleaved rather than grouped by label.
random.shuffle(documents)
''' Now we can see the review and the sentiment analysis of that rating! '''
# Count how often each (lower-cased) word occurs across all movie reviews,
# so the most common words can serve as candidate features.
''' wise to keep out stopwords? '''
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
''' Now we can see the most common words: all_words.most_common(15)
And how many times a word appears: all_words["stupid"]'''
def find_features(document, word_features=None):
    """Map each candidate feature word to whether it occurs in *document*.

    Args:
        document: Iterable of word tokens (e.g. one tokenized review).
        word_features: Optional explicit list of candidate feature words.
            When omitted, the 3000 most frequent words from the global
            ``all_words`` frequency distribution are used.  (The original
            took ``list(all_words.keys())[:3000]``, which is insertion
            order, not frequency order — ``most_common`` matches the
            documented "top 3000" intent.)

    Returns:
        dict: ``{word: bool}`` — True iff the word appears in *document*.
    """
    if word_features is None:
        # Top 3000 words by corpus frequency, from the module-level FreqDist.
        word_features = [w for w, _count in all_words.most_common(3000)]
    words = set(document)  # set for O(1) membership tests
    return {w: (w in words) for w in word_features}
# print((find_features(movie_reviews.words("neg/cv000_29416.txt"))))

# Convert every (words, label) pair into a (feature_dict, label) pair.
featuresets = [(find_features(review_words), label)
               for (review_words, label) in documents]
'''
Naive Bayes Algorithm
Classifies things as either positive or negative sentiment- part of nltk's library
training_set:
The idea is that we are looking at all the words and analyzing that if a word appears more
often in negative reviews then that words is negative, or the opposite for positive.
testing_set:
The idea here is that based on the words that the computer knows are negative or positive,
we ask the computer to tell us if the review is positive or negative.
Then we can check if the computer was right or wrong.
'''
split_point = 1900  # first 1900 reviews train the model, the rest evaluate it
training_set = featuresets[:split_point]
testing_set = featuresets[split_point:]
classifier = nltk.NaiveBayesClassifier.train(training_set)
'''
Saving the trained algorithm (as Python objects)
and then opening the trained algorithm previously saved.
import pickle
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
'''
# Report held-out accuracy and the features the model leans on most.
nb_accuracy = nltk.classify.accuracy(classifier, testing_set) * 100
print("Original Naive Bayes Algorithm accuracy %:", nb_accuracy)
classifier.show_most_informative_features(15)
'''
Incorporating SciKitLearn
For better machine learning
'''
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


def _train_and_report(label, estimator):
    """Wrap *estimator* in an nltk SklearnClassifier, train it on the
    module-level training_set, print its accuracy on testing_set, and
    return the trained wrapper.

    Replaces seven copy-pasted train-and-print stanzas with one helper;
    the printed messages are byte-identical to the originals.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(label + " accuracy %:",
          nltk.classify.accuracy(wrapped, testing_set) * 100)
    return wrapped


MNB_classifier = _train_and_report(
    "Multinomial Naive Bayes Algorithm", MultinomialNB())
# Disabled in the original — presumably GaussianNB rejects this sparse
# feature representation; TODO confirm before re-enabling:
# GNB_classifier = _train_and_report("Gaussian Naive Bayes Algorithm", GaussianNB())
BNB_classifier = _train_and_report(
    "Bernoulli Naive Bayes Algorithm", BernoulliNB())
LogisticRegression_classifier = _train_and_report(
    "LogisticRegression Algorithm", LogisticRegression())
SGDClassifier_classifier = _train_and_report(
    "SGDClassifier Algorithm", SGDClassifier())
SVC_classifier = _train_and_report("SVC Algorithm", SVC())
LinearSVC_classifier = _train_and_report("LinearSVC Algorithm", LinearSVC())
NuSVC_classifier = _train_and_report("NuSVC Algorithm", NuSVC())
'''
Now the idea is that we can combine all the algorithms so that we can determine the best.
This is done with a 'vote'
This will help us be more consistent
'''
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several trained nltk classifiers.

    Each wrapped classifier casts one vote per input; the most common
    label (statistics.mode) wins.  NOTE(review): with an even split of
    votes, statistics.mode raises StatisticsError on Python < 3.8 and
    returns the first-seen mode on 3.8+ — prefer an odd number of
    classifiers.
    """

    def __init__(self, *classifiers):
        # The already-trained classifiers whose predictions are combined.
        self._classifiers = classifiers

    def _votes(self, features):
        """Return one predicted label per underlying classifier."""
        return [c.classify(features) for c in self._classifiers]

    def classify(self, features):
        """Return the majority-vote label for *features*."""
        return mode(self._votes(features))

    def confidence(self, features):
        """Return the fraction (0..1) of classifiers agreeing with the winner."""
        votes = self._votes(features)
        return votes.count(mode(votes)) / len(votes)
# Combine the trained classifiers into one majority-vote ensemble.
# (SVC_classifier is deliberately left out, matching the original code.)
voted_classifier = VoteClassifier(classifier, MNB_classifier, BNB_classifier,
                                  LogisticRegression_classifier,
                                  SGDClassifier_classifier,
                                  LinearSVC_classifier, NuSVC_classifier)
print("VoteClassifier Algorithm accuracy %:",
      (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)
print("")
print("Sentiment Analysis and Confidence /% for 6 of the reviews in the testing_set")
# One loop replaces six copy-pasted print statements; output is identical.
for features, _true_label in testing_set[:6]:
    print("Classification:", voted_classifier.classify(features),
          "- Confidence %:", voted_classifier.confidence(features) * 100)