-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcoh_tc_read.py
128 lines (105 loc) · 3.43 KB
/
coh_tc_read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import sys
from gensim import corpora
from topic.topicio import TopicIO
from coherence.umass import TopicCoherence
import utils.name_convention as name
#
# syntax: python coh_tc_read.py <input directory name> <corpus type> <# of topics> <src> <word count>
# <input directory name> the directory that has the required corpus
# <corpus type> default to bag of words. b for binary, t for tf-idf, anything else or missing for bag of words
# <# of topics> number of topics. default to 8
# <src> src folder which contains documents for LDA
# <word count> the number of top words used in the calculation of topic coherence
# <startw> the start point of collecting words
# <tfidf> t: use tfidf topic coherence measure anything else or missing: use regular topic coherence measure
#
# Read command line parameters
#
if len(sys.argv) <= 1:
dname = 'pp_test_LDA'
else:
dname = sys.argv[1]
if len(sys.argv) <= 2:
corpus_type = "bow"
else:
if sys.argv[2] == "t":
corpus_type = "tfidf"
elif sys.argv[2] == "b":
corpus_type = "binary"
else:
corpus_type = "bow"
if len(sys.argv) <= 3:
topics_count = 8;
else:
topics_count = int(sys.argv[3]);
if len(sys.argv) <= 4:
src = "pp_test"
else:
src = sys.argv[4]
if len(sys.argv) <= 5:
word_count = 10
else:
word_count = int(sys.argv[5])
if len(sys.argv) <= 6:
startw = 0
else:
startw = int(sys.argv[6])
if len(sys.argv) <= 7:
tfidf = False
else:
if sys.argv[7] == "t":
tfidf = True
else:
tfidf = False
if tfidf:
epsilon = 0.0001
else:
epsilon = 1
# Resolve the output directory for this (corpus type, topic count, src)
# combination, then echo the effective configuration so a run is easy to
# audit from its log.
# Fix: Python-2-only `print` statements converted to single-argument
# print() calls, which behave identically on Python 2 and 3; also added
# the missing space after "corpus type :" for label consistency.
output = name.get_output_dir(corpus_type, topics_count, src)
print("input directory : " + dname)
print("corpus type : " + corpus_type)
print("# of topics : " + str(topics_count))
print("src : " + src)
print("# of words used for topic coherence: " + str(word_count))
print("output : " + output)
print("word count : " + str(word_count))
print("startw : " + str(startw))
print("Tfidf : " + str(tfidf))
print("\n")
# Load the gensim dictionary that was built for this corpus.
dictionary = corpora.Dictionary.load(dname + "/dict.dict")
print(dictionary)

# Helpers: topic (de)serialization and the UMass coherence measure.
topics_io = TopicIO()
tc = TopicCoherence()

# Read every persisted topic, order each topic's words by decreasing
# frequency in place, then keep the word_count-word window starting at
# startw from each topic.
tlist = topics_io.read_topics(output + "/topics")
for topic in tlist:
    topic.sort()
tlist2 = [topic.list(word_count, start=startw) for topic in tlist]

# Load the pre-computed term-frequency and co-occurrence tables that the
# coherence formula consumes for this exact parameter combination.
tf_file = name.tc_tf_file(dname, corpus_type, topics_count, startw, tfidf)
co_occur_file = name.tc_co_occur_file(dname, corpus_type, topics_count, startw, tfidf)
wd_dict = tc.read_into_dict(tf_file)
cofreq_dict = tc.read_into_dict(co_occur_file)
# Score each topic's coherence over its top word_count words, writing the
# per-pair contributions into a "topic <i>" section of the contribution
# file and collecting (index, coherence, words) tuples for ranking.
ofilename = name.tc_contribution(output, word_count, startw, tfidf)
ctlist = []
# Fix: context manager so the contribution file is deterministically
# flushed and closed (the original handle was never closed and relied on
# interpreter finalization).
with open(ofilename, "w") as ofile:
    for index, t in enumerate(tlist2):
        t = t[:word_count]          # defensive cap at word_count entries
        subt = [wt[0] for wt in t]  # bare words, weights dropped
        ofile.write("topic " + str(index) + "\n")
        ctlist.append((index,
                       tc.coherence_dict(subt, wd_dict, cofreq_dict, ofile,
                                         epsilon=epsilon),
                       t))
        ofile.write("\n")
# Rank topics by descending coherence and write the final report: each
# topic header ("topic <index> <coherence>") followed by its words and
# their scores.
# NOTE: reversed(sorted(...)) is kept rather than sorted(reverse=True)
# to preserve the original's ordering of equal-coherence topics.
ctlist = list(reversed(sorted(ctlist, key=lambda x: x[1])))
ofilename = name.tc_output_file(output, word_count, startw, tfidf)
# Fix: context manager guarantees the results file is closed and flushed
# (the original handle was never closed).
with open(ofilename, "w") as ofile:
    for tctuple in ctlist:
        ofile.write("topic " + str(tctuple[0]) + " " + str(tctuple[1]) + "\n\n")
        for item in tctuple[2]:
            ofile.write(item[0] + " : " + str(item[1]) + "\n")
        ofile.write("\n\n")