naivebayesevaluator.py
#!/usr/bin/env python
'''
Contains the implementation of the Naive Bayes evaluator for tweets, built
upon the generic Evaluator.

Options:
    --csvout -> write results to a CSV file
    --stdout -> print results to STDOUT

Usage:
    python naivebayesevaluator.py [(--csvout | --stdout) --dev -g <grams-list>
                                   -w <weights-start,weights-end,step>
                                   -t <thresholds-start,thresholds-end,step>]

For example,
    python naivebayesevaluator.py --csvout --dev -g 1 1,2 -w 0.1,1.5,0.5 -t 1.0,2.0,0.5
should evaluate Naive Bayes (on the DEV set) using:
    1. unigrams, and unigrams + bigrams,
    2. weights 0.1, 0.6, 1.1,
    3. threshold values 1.0, 1.5, 2.0,
and store the results in the file 'stats/nbevaluatorstats<current datetime>.csv'.

    python naivebayesevaluator.py -g 1 1,2
should print the accuracy of Naive Bayes on the TRAINING set.
'''
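
# Illustrative sketch (assuming the helpers defined further below behave as
# written) of how the example invocation above is expanded before evaluation:
#   -g 1 1,2        -> processGrams(["1", "1,2"]) -> [[1], [1, 2]]
#   -w 0.1,1.5,0.5  -> processWT("0.1,1.5,0.5")   -> [0.1, 0.6, 1.1]
#   -t 1.0,2.0,0.5  -> processWT("1.0,2.0,0.5")   -> [1.0, 1.5, 2.0]
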
import argparse
import csv
import datetime
import sys

from evaluator import Evaluator
from naivebayesclassifier import NaiveBayesClassifier


class NaiveBayesEvaluator(Evaluator):

    def __init__(self, trainfile, devfile, testfile, *args, **kargs):
        Evaluator.__init__(self, trainfile, devfile, testfile, *args, **kargs)
        self.allthresholds = kargs.get("allthresholds")
        self.csvout = kargs.get("csvout", False)
        self.results = []

    def flushToCSV(self):
        '''Write the accumulated evaluation results to a timestamped CSV file.'''
        fname = "stats/nbevaluatorstats%s.csv" % \
            str(datetime.datetime.now()).replace(' ', '-')
        with open(fname, "wb") as f:
            w = csv.writer(f, delimiter=',', quotechar='"')
            # write out header
            w.writerow(["Classifier Info",
                        "Accuracy for Positives (%)",
                        "Accuracy for Negatives (%)",
                        "Accuracy for (Positives|Negatives) (%)",
                        "Correlation for (Positives|Negatives) (%)"])
            for row in self.results:
                w.writerow(row)
        print "Flushing results of Naive Bayes evaluation into '%s'..." % fname

    def run(self):
        if not self.usedev:
            # No dev set requested: train one classifier per grams setting
            # and report its accuracy on STDOUT.
            for grams in self.allgrams:
                c = NaiveBayesClassifier(self.rawfname,
                                         grams=grams)
                c.trainClassifier()
                self.stdout = True
                self.evaluate(c)
            return
        # Dev set: grid-search over grams, weights and (neg, pos) thresholds.
        for grams in self.allgrams:
            c = NaiveBayesClassifier(self.rawfname,
                                     grams=grams)
            c.trainClassifier()
            for w in self.allweights:
                c.setWeight(w)
                for t1 in self.allthresholds:
                    for t2 in self.allthresholds:
                        c.setThresholds(neg=t1, pos=t2)
                        cinfo, accpos, accneg, accall, corrall = self.evaluate(c)
                        self.results.append([cinfo, accpos, accneg,
                                             accall, corrall])
        if self.csvout:
            self.flushToCSV()
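

# Note: with --dev, run() performs a full grid search; the number of evaluated
# configurations is len(allgrams) * len(allweights) * len(allthresholds) ** 2
# (the thresholds loop runs once for the negative and once for the positive
# cutoff). For the example invocation in the module docstring that is
# 2 * 3 * 3 * 3 = 54 evaluations.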


def processGrams(glist):
    return [[int(eachr) for eachr in each.split(',')] for each in glist]


def floatrange(start, end, step):
    '''Like range(), but for floats; includes `end` when the step lands on it.'''
    return [start + step*x for x in range(int((end-start)/step)+1)]


def processWT(wstr):
    start, end, step = [float(res) for res in wstr.split(',')]
    return floatrange(start, end, step)
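
# For instance, processWT("0.0,1.0,0.25") -> [0.0, 0.25, 0.5, 0.75, 1.0];
# floatrange includes the end point whenever the step divides the range evenly.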


def main():
    trainfile = "trainingandtestdata/training.csv"
    devfile = "trainingandtestdata/devset.csv"
    testfile = "trainingandtestdata/testing.csv"

    parser = argparse.ArgumentParser()
    parser.add_argument("--csvout", dest="csvout",
                        action="store_true", default=False)
    parser.add_argument("--stdout", dest="stdout",
                        action="store_true", default=False)
    parser.add_argument("--dev", dest="dev",
                        action="store_true", default=False)
    parser.add_argument("-g", dest="g", nargs="+",
                        metavar="x,y,z,..", required=True)
    parser.add_argument("-w", dest="w",
                        metavar="START,END,STEP", required=False)
    parser.add_argument("-t", dest="t",
                        metavar="START,END,STEP", required=False)

    args = parser.parse_args()
    grams = processGrams(args.g)
    try:
        if args.g and args.w and args.t:
            weights = processWT(args.w)
            thresholds = processWT(args.t)
        else:
            # Without -w and -t there is nothing to grid-search, so fall back
            # to a plain run on the training set.
            weights = thresholds = []
            args.dev = False

        nbEvaluator = NaiveBayesEvaluator(trainfile, devfile, testfile,
                                          allgrams=grams,
                                          allweights=weights,
                                          allthresholds=thresholds,
                                          csvout=args.csvout,
                                          stdout=args.stdout,
                                          usedev=args.dev)
        nbEvaluator.run()
    except:
        parser.print_help()


if __name__ == "__main__":
    main()
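
# A minimal sketch (assuming a stats file produced by --csvout exists; the
# filename below is a hypothetical placeholder) of reading the results back:
#
#   import csv
#   with open("stats/nbevaluatorstats<some datetime>.csv") as f:
#       reader = csv.reader(f, delimiter=',', quotechar='"')
#       header = next(reader)   # column names written by flushToCSV
#       for cinfo, accpos, accneg, accall, corrall in reader:
#           print cinfo, accpos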