'''
CLASS: Naive Bayes spam classifier using sklearn
Files used:
../data/ham/
../data/spam/
File source:
http://spamassassin.apache.org/publiccorpus/
300 ham emails from 20021010_easy_ham.tar.bz2
300 spam emails from 20021010_spam.tar.bz2
'''
## READING FILES
# getting a list of filenames
import glob
ham_files = glob.glob("../data/ham/*")
spam_files = glob.glob("../data/spam/*")
# read the ham file contents into a list (each element is one email)
ham_text = []
for filename in ham_files:
    # errors='ignore' skips bytes that can't be decoded (some corpus emails are
    # not valid UTF-8); the original 'rU' mode is Python 2 only
    with open(filename, errors='ignore') as f:
        ham_text.append(f.read())
# read the spam file contents into a list (each element is one email)
spam_text = []
for filename in spam_files:
    with open(filename, errors='ignore') as f:
        spam_text.append(f.read())
# use the first 200 ham and first 200 spam as training data
train_text = ham_text[:200] + spam_text[:200]
train_labels = [0]*200 + [1]*200
# use the last 100 ham and last 100 spam as testing data
test_text = ham_text[200:] + spam_text[200:]
test_labels = [0]*100 + [1]*100
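# note: scikit-learn could produce a shuffled split instead of this manual
# slicing; a minimal (unused) sketch:
# from sklearn.model_selection import train_test_split
# train_text, test_text, train_labels, test_labels = train_test_split(
#     ham_text + spam_text, [0]*300 + [1]*300, test_size=200, random_state=1)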
## COUNTVECTORIZER: 'convert text into a matrix of token counts'
# learn the 'vocabulary' of the training data
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(decode_error="ignore")
vect.fit(train_text)
# store feature names and examine them
train_features = vect.get_feature_names_out()   # vect.get_feature_names() in older scikit-learn
len(train_features)
train_features[:50]
train_features[10000:10002]
# transform training data into a 'document-term matrix'
train_dtm = vect.transform(train_text)
type(train_dtm)
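# note: train_dtm is a scipy.sparse matrix of token counts; the fit and
# transform steps above could equivalently be combined into one call:
# train_dtm = vect.fit_transform(train_text)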
# convert train_dtm to a regular array and examine it
train_arr = train_dtm.toarray()
train_arr.shape
train_arr
sum(train_arr[0])
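# a quick sketch: vect.vocabulary_ maps each token to its column index, so a
# single count can be looked up directly ('free' is just an example token)
idx = vect.vocabulary_.get('free')
train_arr[0, idx] if idx is not None else None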
## SIMPLE SUMMARIES OF THE DATA
# sum the rows and columns
import numpy as np
tokens_per_email = np.sum(train_arr, axis=1) # sum of each row
tokens_per_email
count_per_token = np.sum(train_arr, axis=0) # sum of each column
count_per_token[:50]
# find the most frequent token
np.max(count_per_token)
np.argmax(count_per_token)
train_features[np.argmax(count_per_token)]
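# a quick sketch: the ten most frequent tokens overall, via argsort
top_tokens = np.argsort(count_per_token)[-10:][::-1]
[train_features[i] for i in top_tokens]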
## MODEL BUILDING AND EVALUATION
# train a Naive Bayes model on the training data
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, train_labels)
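# a peek under the hood: MultinomialNB stores per-class counts and smoothed
# token log-probabilities (alpha=1 Laplace smoothing by default)
nb.class_count_        # number of training emails in each class
nb.feature_log_prob_   # log P(token | class), one row per class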
# transform testing data into a document-term matrix
test_dtm = vect.transform(test_text)
test_dtm
# make predictions on test data and compare to true labels
preds = nb.predict(test_dtm)
preds
from sklearn import metrics
print(metrics.accuracy_score(test_labels, preds))
print(metrics.confusion_matrix(test_labels, preds))
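# a quick sketch: unpack the confusion matrix cells
# (scikit-learn orders them tn, fp, fn, tp for binary 0/1 labels)
tn, fp, fn, tp = metrics.confusion_matrix(test_labels, preds).ravel()
print(tp / (tp + fn))   # sensitivity (true positive rate)
print(tn / (tn + fp))   # specificity (true negative rate)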
# predict (poorly calibrated) probabilities and calculate AUC
probs = nb.predict_proba(test_dtm)[:, 1]
probs
print(metrics.roc_auc_score(test_labels, probs))
# plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(test_labels, probs)
import matplotlib.pyplot as plt
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
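# add a dashed diagonal as the AUC = 0.5 baseline, then render the figure
plt.plot([0, 1], [0, 1], linestyle='--')
plt.show()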
# pretend we didn't have a test set, and use cross-validation instead
from sklearn.model_selection import cross_val_score   # sklearn.cross_validation in older scikit-learn
cross_val_score(MultinomialNB(), train_dtm, train_labels, cv=5, scoring="accuracy")
cross_val_score(MultinomialNB(), train_dtm, train_labels, cv=5, scoring="roc_auc")
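# summarize each metric by its mean across the 5 folds
cross_val_score(MultinomialNB(), train_dtm, train_labels, cv=5, scoring="accuracy").mean()
cross_val_score(MultinomialNB(), train_dtm, train_labels, cv=5, scoring="roc_auc").mean()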
## FIND THE 'HAMMIEST' AND 'SPAMMIEST' TOKENS
# split train_arr into ham and spam sections
ham_arr = train_arr[:200]
spam_arr = train_arr[200:]
ham_arr
spam_arr
# calculate count of each token
ham_count_per_token = np.sum(ham_arr, axis=0) + 1
spam_count_per_token = np.sum(spam_arr, axis=0) + 1
# alternative method for accessing counts
ham_count_per_token = nb.feature_count_[0] + 1
spam_count_per_token = nb.feature_count_[1] + 1
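# note: the +1 is add-one (Laplace) smoothing, the same idea as MultinomialNB's
# default alpha=1; it prevents division by zero in the ratios computed below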
# calculate the rate of each token (average count per email in each class)
ham_token_rate = ham_count_per_token / 200.0
spam_token_rate = spam_count_per_token / 200.0
ham_token_rate
spam_token_rate
# for each token, calculate ratio of ham-to-spam
ham_to_spam_ratio = ham_token_rate/spam_token_rate
np.max(ham_to_spam_ratio)
ham_arr[:, np.argmax(ham_to_spam_ratio)] # count of that token in ham emails
spam_arr[:, np.argmax(ham_to_spam_ratio)] # count of that token in spam emails
train_features[np.argmax(ham_to_spam_ratio)] # hammiest token
# for each token, calculate ratio of spam-to-ham
spam_to_ham_ratio = spam_token_rate/ham_token_rate
np.max(spam_to_ham_ratio)
spam_arr[:, np.argmax(spam_to_ham_ratio)] # count of that token in spam emails
ham_arr[:, np.argmax(spam_to_ham_ratio)] # count of that token in ham emails
train_features[np.argmax(spam_to_ham_ratio)] # spammiest token
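# a quick sketch: the ten 'spammiest' tokens by this ratio, via argsort
top_spam = np.argsort(spam_to_ham_ratio)[-10:][::-1]
[train_features[i] for i in top_spam]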