-
Notifications
You must be signed in to change notification settings - Fork 7
/
z_score.py
89 lines (81 loc) · 3.11 KB
/
z_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 27 11:38:22 2014
@author: hussam
"""
import utility as ut
from math import *
import sent_analysis as twzscore2
def loadzscore(corpus, path):
    """Load the three per-class z-score dictionaries from disk.

    Parameters
    ----------
    corpus : str
        'tw' loads the tweet dictionaries (zneg/znet/zpos.txt),
        'txt' loads the text-review dictionaries (cr0/cr1/cr2.txt).
    path : str
        Base directory (must end with the path separator); the files are
        expected under ``path + "data/"``.

    Returns
    -------
    list of dict
        Three dicts mapping token -> z-score (float), in order
        [negative, neutral, positive].

    Raises
    ------
    ValueError
        If *corpus* is neither 'tw' nor 'txt' (the original fell through
        and crashed with a confusing NameError instead).
    """
    if corpus == 'tw':
        names = ("data/zneg.txt", "data/znet.txt", "data/zpos.txt")
    elif corpus == 'txt':
        # Debug trace kept from the original implementation.
        print("*****************************************************")
        print(path)
        names = ("data/cr0.txt", "data/cr1.txt", "data/cr2.txt")
    else:
        raise ValueError("corpus must be 'tw' or 'txt', got %r" % (corpus,))
    dict_3 = [dict(), dict(), dict()]
    for i, filename in enumerate(names):
        # Context manager closes each file (the original leaked all three
        # handles opened via open(...).readlines()).
        with open(path + filename, "r") as f:
            for line in f:
                # Each line is "token<TAB>zscore".
                parts = line.replace("\n", "").split("\t")
                dict_3[i][parts[0]] = float(parts[1])
    return dict_3
def getrealZscore(data, labels, path="", tokenizer=None):
    """Compute per-class z-scores for every token and write them to disk.

    For a token t and class i the z-score is
        z = (count_i(t) - n_i * p) / sqrt(n_i * p * (1 - p))
    where p is t's overall relative frequency and n_i is the total token
    count of class i.  Results are written to ``path + "data/zneg.txt"``,
    ``.../znet.txt`` and ``.../zpos.txt`` (classes 0, 1, 2).

    #TODO Add an option to construct real Zscore for text.
    # Normally we should reconstruct cr0, cr1, cr2 in a training phase
    # (not hard-coded in the repository).

    Parameters
    ----------
    data : sequence of str
        Sentences / tweets.
    labels : sequence of int
        Class of each sentence: 0 = negative, 1 = neutral, 2 = positive.
    path : str
        Base output directory ending with the path separator; must contain
        a ``data/`` subdirectory.  (Bug fix: the original referenced an
        undefined global ``path`` and always crashed with NameError.)
    tokenizer : callable or None
        Splits a sentence into tokens; defaults to
        ``twzscore2.splitfun4tweet`` as in the original code.

    Notes
    -----
    Raises ZeroDivisionError when any class has zero tokens (n_i == 0),
    exactly as the original formula did.
    """
    if tokenizer is None:
        tokenizer = twzscore2.splitfun4tweet
    allvocab = dict()
    dict_3 = [dict(), dict(), dict()]
    count_3 = [0, 0, 0]
    nvocab = 0
    for sentence, polarity in zip(data, labels):
        for token in tokenizer(sentence):
            if len(token) > 1:  # single-character tokens are ignored
                dict_3[polarity][token] = dict_3[polarity].get(token, 0) + 1
                count_3[polarity] += 1
                allvocab[token] = allvocab.get(token, 0) + 1
                nvocab += 1
    # One list of output lines per class.  The original prepended every new
    # line to a growing string (quadratic), so the files came out in reverse
    # vocabulary order; we reproduce that ordering by writing the collected
    # lines reversed.
    lines = [[], [], []]
    for token, freq in allvocab.items():
        p = freq / float(nvocab)
        for i in range(3):
            observed = dict_3[i].get(token, 0)  # 0 when the class never saw the token
            z = (observed - count_3[i] * p) / (sqrt(count_3[i] * p * (1 - p)))
            lines[i].append(token + "\t" + str(z) + "\n")
    for i, name in enumerate(("data/zneg.txt", "data/znet.txt", "data/zpos.txt")):
        # Context manager closes each output file (the original leaked them).
        with open(path + name, "w") as out:
            out.writelines(reversed(lines[i]))
if __name__ == '__main__':
    # Bug fix: the original called ``zscore2.readFile`` but no ``zscore2``
    # name exists -- the module is imported at the top of this file as
    # ``twzscore2`` (import sent_analysis as twzscore2).
    filename = "./corpus/twitter-train-cleansed-B.txt"
    data, labels = twzscore2.readFile(filename)
    getrealZscore(data, labels)