-
Notifications
You must be signed in to change notification settings - Fork 0
/
TestSentiment.py
66 lines (52 loc) · 2.13 KB
/
TestSentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""Build a sentiment analysis / polarity model
Sentiment analysis can be cast as a binary text classification problem:
a linear classifier is fitted on features extracted from the text of the
user messages so as to guess whether the opinion of the author is
positive or negative.

In this example we will use a movie review dataset.
NOTE(review): the data path used by classify() points at a 20 newsgroups
directory rather than a movie review dataset -- confirm which is intended.
"""
# Author: Olivier Grisel <[email protected]>
# License: Simplified BSD
import sys
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from classification import remove_sklearn_incompatible
def classify(
        path=('/home/james/Documents/text_analytics/data/'
              'twenty_newsgroups/20news-bydate-test'),
        runs=10,
        test_size=0.25):
    """Fit and score a TF-IDF + linear SGD text classifier on random splits.

    The documents under ``path`` are loaded once with ``load_files``. The
    same pipeline is then re-fitted ``runs`` times, each time on a fresh
    random train/test split, and the accuracy of every run is printed,
    followed by the mean accuracy over all runs.

    Parameters
    ----------
    path : str
        Directory handed to ``remove_sklearn_incompatible`` and then to
        ``load_files``. Defaults to the location the script was originally
        written against.
    runs : int
        Number of random train/test splits to evaluate (default 10).
    test_size : float
        Value forwarded to ``train_test_split`` as ``test_size``
        (default 0.25).

    Returns
    -------
    float
        Mean accuracy over all runs.

    Raises
    ------
    ValueError
        If ``runs`` is smaller than 1 (the mean would be undefined).
    """
    if runs < 1:
        raise ValueError("runs must be a positive integer, got %r" % (runs,))

    # NOTE(review): remove_sklearn_incompatible is a project helper that is
    # not visible here; presumably it cleans up files load_files cannot
    # handle -- confirm, as it may modify the directory contents.
    remove_sklearn_incompatible(path)
    dataset = load_files(path, shuffle=False)

    # Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare (min_df) or too frequent (max_df).
    # Alternatives previously tried for the 'clf' step: LinearSVC(C=1000),
    # MultinomialNB() and Perceptron().
    # NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 and removed
    # in 0.21; on newer releases the equivalent is `max_iter=5, tol=None`.
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              n_iter=5, random_state=42)),
    ])

    total_accuracy = 0.0
    for run in range(runs):
        # No random_state on the split: every run deliberately sees a
        # different partition so the mean averages over split variance.
        docs_train, docs_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=test_size)

        # Re-fit the whole pipeline on this run's training documents.
        clf = pipeline.fit(docs_train, y_train)
        y_predicted = clf.predict(docs_test)

        # Fraction of test documents whose predicted label is correct.
        accuracy = np.mean(y_predicted == y_test)
        # Single-string form prints identically on Python 2 and Python 3.
        print("%d %s" % (run, accuracy))
        total_accuracy += accuracy

    mean_accuracy = total_accuracy / runs
    print(mean_accuracy)
    return mean_accuracy
# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    classify()