-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnblearn.py
86 lines (74 loc) · 3.61 KB
/
nblearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#Program will be invoked as python3 nblearn.py TRAININGFILE MODELFILE
#TRAININGFILE will be spam_training.txt for the spam dataset, and sentiment_training.txt for the sentiment dataset
#MODELFILE will be spam.nb, and sentiment.nb
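#MODELFILE will contain the log-probabilities of words with respect to each class, stored in one pickled dictionary
#The words are stored inside a nested dictionary keyed by class name; bookkeeping values live under 'Meta_Info'
#e.g. {'spam': {'pharmacy': -1.10}, 'ham': {'china': -1.61}, 'Meta_Info': {'names_of_classes': ['spam', 'ham'], 'size_vocab': 2, 'total_no_docs': 2}}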
#P(word|class)=(frequency of word in document of class C)/(Total number of words in documents of Class C)
#Add-one smoothing=(freq+1)/(total words+size of vocabulary)
#To prevent underflow we take the logs of probabilities
import sys
import pickle
import math
def calculate_word_prob(ip, op):
    """Train a Naive Bayes model from ``ip`` and pickle it to ``op``.

    Every non-blank line of the training file is one document: the first
    whitespace-separated token is the class label and the remaining tokens
    are the document's words. Blank lines are ignored and are not counted
    as documents.

    The pickled object is a single dictionary:

    * ``model[label][word]`` -- ``log((freq + 1) / (class_words + size_vocab))``,
      i.e. the add-one smoothed log P(word | label), stored only for words
      that occurred in that class.
    * ``model['Meta_Info']`` -- bookkeeping: ``names_of_classes`` (labels in
      first-seen order), ``size_vocab`` (distinct words over all classes),
      ``total_no_docs``, and per label ``<label>_Docs_Count``,
      ``<label>_words`` (total word tokens) and ``P(<label>)`` (log prior).

    The ``Meta_Info`` dictionary is also printed to stdout.

    Args:
        ip: path of the training file to read.
        op: path of the model file to write (binary pickle).

    Raises:
        ValueError: if a document uses the reserved label ``'Meta_Info'``.
        OSError: if either file cannot be opened.
    """
    meta = {
        'names_of_classes': [],  # class labels in order of first appearance
        'size_vocab': 0,         # number of distinct words over all classes
        'total_no_docs': 0,      # number of (non-blank) training documents
    }
    words_dict = {'Meta_Info': meta}
    vocabulary = set()  # distinct words seen in any class

    with open(ip, 'r') as trainingfile:
        for line in trainingfile:
            tokens = line.split()
            if not tokens:
                # A blank line carries no label; counting it as a document
                # would skew the class priors.
                continue
            class_name, doc_words = tokens[0], tokens[1:]
            if class_name == 'Meta_Info':
                # The label would collide with the bookkeeping entry.
                raise ValueError(
                    "'Meta_Info' is reserved and cannot be used as a class label"
                )

            meta['total_no_docs'] += 1
            if class_name in words_dict:
                ## class seen before: just bump its document count
                meta[class_name + "_Docs_Count"] += 1
            else:
                ## first document of this class: register it
                meta['names_of_classes'].append(class_name)
                words_dict[class_name] = {}
                meta[class_name + "_Docs_Count"] = 1
                meta[class_name + '_words'] = 0

            ## accumulate raw word frequencies for this class
            meta[class_name + '_words'] += len(doc_words)
            class_counts = words_dict[class_name]
            for word in doc_words:
                class_counts[word] = class_counts.get(word, 0) + 1
            vocabulary.update(doc_words)

    meta['size_vocab'] = len(vocabulary)

    total_docs = meta['total_no_docs']
    vocab_size = meta['size_vocab']
    for label in meta['names_of_classes']:
        ## log prior: P(class) = docs of that class / total docs
        meta["P(" + label + ")"] = math.log(meta[label + "_Docs_Count"] / total_docs)
        ## add-one smoothing: the denominator is the number of word tokens
        ## of the class plus the vocabulary size; logs prevent underflow
        denominator = meta[label + "_words"] + vocab_size
        words_dict[label] = {
            word: math.log((freq + 1) / denominator)
            for word, freq in words_dict[label].items()
        }

    print(meta)
    # Open the model file only once training has succeeded, so a failure
    # above never leaves a truncated model behind.
    with open(op, 'wb') as modelfile:
        pickle.dump(words_dict, modelfile)
    return
def main():
    """Command-line entry point: ``python3 nblearn.py TRAININGFILE MODELFILE``.

    Reads the training-file and model-file paths from ``sys.argv`` and
    trains the model. Exits with a usage message when either path is
    missing instead of failing with an IndexError traceback. Arguments
    after the first two are ignored.
    """
    if len(sys.argv) < 3:
        raise SystemExit("usage: python3 nblearn.py TRAININGFILE MODELFILE")
    calculate_word_prob(sys.argv[1], sys.argv[2])
    return


# Train only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()