# train_clf.py
import nltk
import random
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
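
# NOTE: word_tokenize and nltk.pos_tag require the NLTK "punkt" and
# "averaged_perceptron_tagger" resources; if they are missing, download them
# once with nltk.download("punkt") and nltk.download("averaged_perceptron_tagger").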


def find_features(document, word_features):
    # Build a feature dict mapping each feature word to whether it
    # appears in the document.
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features


def main():
    # Each line of the data files is one short review.
    with open("sentiment_data/positive.txt", "r") as pos_file:
        short_pos = pos_file.read()
    with open("sentiment_data/negative.txt", "r") as neg_file:
        short_neg = neg_file.read()

    all_words = []
    documents = []
    # Keep only adjectives: POS tags starting with "J" (JJ, JJR, JJS).
    allowed_word_types = ["J"]

    # Add every positive and negative review to the documents list with its
    # label, and collect the adjectives from each review as candidate features.
    for review in short_pos.split('\n'):
        documents.append((review, "pos"))
        words = word_tokenize(review)
        pos = nltk.pos_tag(words)
        for w in pos:
            if w[1][0] in allowed_word_types:
                all_words.append(w[0].lower())

    for review in short_neg.split('\n'):
        documents.append((review, "neg"))
        words = word_tokenize(review)
        pos = nltk.pos_tag(words)
        for w in pos:
            if w[1][0] in allowed_word_types:
                all_words.append(w[0].lower())

    all_words = nltk.FreqDist(all_words)
    # Use the 5000 most frequent words as features.
    word_features = [word for word, count in all_words.most_common(5000)]

    # Pickle the feature words, labeled documents, and frequency distribution.
    data_types = [word_features, documents, all_words]
    data_types_string = ["word_features", "documents", "all_words"]
    for data, name in zip(data_types, data_types_string):
        with open("pickled/data/" + name + ".pickle", "wb") as write_data:
            pickle.dump(data, write_data)

    random.shuffle(documents)

    # Turn each labeled review into a (feature dict, label) pair, then split
    # into training and testing sets.
    featuresets = [(find_features(review, word_features), category)
                   for (review, category) in documents]
    training_set = featuresets[:10000]
    testing_set = featuresets[10000:]
print("Training: NB_classifier")
NB_classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Done training: NB_classifier")
MNB_classifier = SklearnClassifier(MultinomialNB())
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LinearSVC_classifier = SklearnClassifier(LinearSVC())
classifiers = [NB_classifier,
MNB_classifier,
BernoulliNB_classifier,
LogisticRegression_classifier,
LinearSVC_classifier]
classifier_strings = ["NB_classifier",
"MNB_classifier",
"BernoulliNB_classifier",
"LogisticRegression_classifier",
"LinearSVC_classifier"]

    # Train each classifier (NB_classifier was already trained above) and
    # pickle the result.
    for clf, name in zip(classifiers, classifier_strings):
        if name != "NB_classifier":
            print("Training: " + name)
            clf.train(training_set)
            print("Done training: " + name)
        string = name + ".pickle"
        print("Pickling: " + string)
        with open("pickled/algorithms/" + string, "wb") as write_data:
            pickle.dump(clf, write_data)
        print("Done Pickling: " + string)


if __name__ == "__main__":
    main()
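
# Minimal usage sketch (an assumption-laden example, not part of the training
# script): load a pickled classifier and the saved word_features, then classify
# new text with the same find_features helper. Assumes this file is importable
# as train_clf and that main() has already written the pickles below.
#
#   import pickle
#   from train_clf import find_features
#
#   with open("pickled/data/word_features.pickle", "rb") as f:
#       word_features = pickle.load(f)
#   with open("pickled/algorithms/LinearSVC_classifier.pickle", "rb") as f:
#       classifier = pickle.load(f)
#
#   feats = find_features("a smart, funny and moving film", word_features)
#   print(classifier.classify(feats))  # "pos" or "neg"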