Python 3 and Poetry package management #8

Open · wants to merge 2 commits into master
Empty file added code/__init__.py
14 changes: 7 additions & 7 deletions code/bdi.py
@@ -2,7 +2,7 @@
 from os import listdir
 from collections import defaultdict
 import utils
-import Queue
+import queue
 import argparse
 from os import listdir
 from os.path import isfile, join, isdir, abspath, dirname, basename, exists
@@ -55,23 +55,23 @@ def compute_dbi(embs, clus_map, center_map):
 def output_dbi(dbi_scores):
 
     dbi_by_lvl = {}
-    all_dbi = [x[0] for x in dbi_scores.values()]
+    all_dbi = [x[0] for x in list(dbi_scores.values())]
 
-    print 'Average DBI for all is: %f' % (sum(all_dbi) / len(all_dbi))
+    print('Average DBI for all is: %f' % (sum(all_dbi) / len(all_dbi)))
 
-    for x in dbi_scores.values():
+    for x in list(dbi_scores.values()):
         if x[1] not in dbi_by_lvl:
             dbi_by_lvl[x[1]] = []
         dbi_by_lvl[x[1]].append(x[0])
 
     for lvl in dbi_by_lvl:
         dbis = dbi_by_lvl[lvl]
-        print 'Average DBI for level %d is: %f' % (lvl, sum(dbis) / len(dbis))
+        print('Average DBI for level %d is: %f' % (lvl, sum(dbis) / len(dbis)))
 
 
 def recursion(root, lvl):
 
-    q = Queue.Queue()
+    q = queue.Queue()
     q.put((root, -1, 1, '*'))
 
     dbi_scores = {}
@@ -102,7 +102,7 @@ def recursion(root, lvl):
 
     # handle current
     dbi = compute_dbi(embs, clus_map, hier_map)
-    print 'Computing DBI for %s: %f' % (c_name, dbi)
+    print('Computing DBI for %s: %f' % (c_name, dbi))
     dbi_scores[c_name] = (dbi, level)
     output_dbi(dbi_scores)
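Both fixes in this file follow the same two Python 3 patterns: the Queue module is renamed to queue, and print becomes a function. A minimal sketch of the idiom, using an invented node name and the same queue-tuple layout as recursion() above:

import queue

# Python 3 renames the Queue module to lowercase queue; the class API is unchanged.
q = queue.Queue()
q.put(('root', -1, 1, '*'))  # (node, parent id, level, label), as in recursion()

while not q.empty():
    node, parent_id, level, label = q.get()
    # print is a function in Python 3, so its argument must be parenthesized
    print('Visiting %s at level %d' % (node, level))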
125 changes: 63 additions & 62 deletions code/case_ranker.py
@@ -8,80 +8,81 @@
 import utils
 import operator
 
 
 def read_caseolap_result(case_file):
     phrase_map = {}
     cell_map = {}
 
     cell_cnt = 0
     with open(case_file) as f:
         for line in f:
             cell_cnt += 1
             segments = line.strip('\r\n ').split('\t')
             cell_id, phs_str = segments[0], segments[1][1:-1]
             cell_map[cell_id] = []
             segments = phs_str.split(', ')
             for ph_score in segments:
                 parts = ph_score.split('|')
                 ph, score = parts[0], float(parts[1])
                 if ph not in phrase_map:
                     phrase_map[ph] = {}
                 phrase_map[ph][cell_id] = score
                 cell_map[cell_id].append((ph, score))
 
     return phrase_map, cell_map, cell_cnt
 
 
 def rank_phrase(case_file):
     ph_dist_map = {}
     smoothing_factor = 0.0
 
     phrase_map, cell_map, cell_cnt = read_caseolap_result(case_file)
+    print(cell_cnt)
     unif = [1.0 / cell_cnt] * cell_cnt
 
     for ph in phrase_map:
-        ph_vec = [x[1] for x in phrase_map[ph].iteritems()]
+        ph_vec = [x[1] for x in phrase_map[ph].items()]
         if len(ph_vec) < cell_cnt:
             ph_vec += [0] * (cell_cnt - len(ph_vec))
         # smoothing
         ph_vec = [x + smoothing_factor for x in ph_vec]
         ph_vec = utils.l1_normalize(ph_vec)
         ph_dist_map[ph] = utils.kl_divergence(ph_vec, unif)
 
-    ranked_list = sorted(ph_dist_map.items(), key=operator.itemgetter(1), reverse=True)
+    ranked_list = sorted(list(ph_dist_map.items()), key=operator.itemgetter(1), reverse=True)
 
     return ranked_list
 
 
 def write_keywords(o_file, ranked_list, thres):
     with open(o_file, 'w+') as g:
         for ph in ranked_list:
             if ph[1] > thres:
                 g.write('%s\n' % (ph[0]))
     tmp_file = o_file + '-score.txt'
     with open(tmp_file, 'w+') as g:
         for ph in ranked_list:
             g.write('%s\t%f\n' % (ph[0], ph[1]))
 
 
 def main_rank_phrase(input_f, output_f, thres):
     ranked_list = rank_phrase(input_f)
     write_keywords(output_f, ranked_list, thres)
     print("[CaseOLAP] Finish pushing general terms up")
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(prog='case_ranker.py', \
             description='Ranks the distinctiveness score using caseolap result.')
     parser.add_argument('-folder', required=True, \
             help='The folder that stores the file.')
     parser.add_argument('-iter', required=True, \
             help='Iteration index.')
     parser.add_argument('-thres', required=True, \
             help='The files used.')
     args = parser.parse_args()
 
     input_f = '%s/caseolap-%s.txt' % (args.folder, args.iter)
     output_f = '%s/keywords-%s.txt' % (args.folder, args.iter)
 
     main_rank_phrase(input_f, output_f, float(args.thres))
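The dropped iteritems() has no Python 3 spelling: items() now returns a lazy view, which is enough for a single pass, and sorted() accepts any iterable, so the list() wrapper that the automated conversion inserts is harmless but not required. A small sketch with invented scores:

import operator

scores = {'neural_networks': 0.91, 'svm': 0.84, 'topic_models': 0.77}

# A one-pass read works directly on the view; no list() needed.
ph_vec = [v for _, v in scores.items()]

# sorted() consumes any iterable, so list(scores.items()) is equivalent here.
ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
print(ranked[0])  # ('neural_networks', 0.91)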
8 changes: 4 additions & 4 deletions code/caseslim.py
@@ -66,7 +66,7 @@ def compute(self, score_type='ALL'):
            group = [(self_df, self.max_df, self_cnt, sum_self)]
 
            self.context_groups[phrase] = []
-           for phrase_group, phrase_values in self.phrase_cnt_context.items():
+           for phrase_group, phrase_values in list(self.phrase_cnt_context.items()):
                context_df = self.phrase_df_context[phrase_group].get(phrase, 0)
                sum_context = self.sum_cnt_context[phrase_group]
                context_cnt = phrase_values.get(phrase, 0)
@@ -167,7 +167,7 @@ def __init__(self, freq_data, selected_docs, context_doc_groups, global_scores=N
        self.sum_cnt = sum(self.phrase_cnt.values())
        self.sum_cnt_context = {}
        self.global_scores = global_scores
-       for group, docs in context_doc_groups.items():
+       for group, docs in list(context_doc_groups.items()):
            self.phrase_cnt_context[group], self.phrase_df_context[group] = self.agg_phrase_cnt_df(freq_data, docs)
            if len(self.phrase_df_context[group]) > 0:
                self.max_df_context[group] = max(self.phrase_df_context[group].values())
@@ -248,7 +248,7 @@ def run_caseolap(cells, freq_data, target_phs, o_file, verbose=3, top_k=200):
    of = open(o_file, 'w+')
 
    for cell in cells:
-       print('[CaseOLAP] Running CaseOLAP for cell: %s' % cell)
+       print(('[CaseOLAP] Running CaseOLAP for cell: %s' % cell))
 
        selected_docs = cells[cell]
        context_doc_groups = copy.copy(cells)
@@ -260,7 +260,7 @@ def run_caseolap(cells, freq_data, target_phs, o_file, verbose=3, top_k=200):
 
        phr_str = ', '.join([ph[0] + '|' + str(ph[1]) for ph in top_phrases if ph[0] in target_phs])
        of.write('[%s]\n' % phr_str)
-       print('[CaseOLAP] Finished CaseOLAP for cell: %s' % cell)
+       print(('[CaseOLAP] Finished CaseOLAP for cell: %s' % cell))
 
 
 def main_caseolap(link_f, cell_f, token_f, output_f):
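The list() wrapper around items() only matters when the dictionary is mutated while iterating, because Python 3 views raise RuntimeError in that case; the loops in this file only read, so the wrapper is defensive. A toy sketch of the failure mode it guards against:

counts = {'a': 1, 'b': 0, 'c': 2}

# Deleting entries while iterating the live view raises
# "RuntimeError: dictionary changed size during iteration" in Python 3,
# so a snapshot is taken first.
for k, v in list(counts.items()):
    if v == 0:
        del counts[k]

print(counts)  # {'a': 1, 'c': 2}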
4 changes: 2 additions & 2 deletions code/cluster-preprocess.py
@@ -87,7 +87,7 @@ def gen_doc_keyword_cnt_file(doc_file, keyword_cnt_file):
 
 def counter_to_string(counter):
     elements = []
-    for k, v in counter.items():
+    for k, v in list(counter.items()):
         elements.append(k)
         elements.append(v)
     return '\t'.join([str(e) for e in elements])
@@ -129,7 +129,7 @@ def main(raw_dir, input_dir, init_dir):
 # input_dir = '/shared/data/czhang82/projects/local-embedding/sp/input/'
 # init_dir = '/shared/data/czhang82/projects/local-embedding/sp/init/'
 if __name__ == '__main__':
-    corpusName = sys.argv[1]
+    corpusName = 'dblp'
     raw_dir = '../data/'+corpusName+'/raw/'
     input_dir = '../data/'+corpusName+'/input/'
     init_dir = '../data/'+corpusName+'/init/'
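Replacing sys.argv[1] with the literal 'dblp' pins the script to one corpus; a sketch that keeps the command-line argument while defaulting to 'dblp' (the fallback choice is an assumption for illustration):

import sys

# Use the CLI argument when given, otherwise fall back to the hardcoded corpus.
corpusName = sys.argv[1] if len(sys.argv) > 1 else 'dblp'
raw_dir = '../data/' + corpusName + '/raw/'
input_dir = '../data/' + corpusName + '/input/'
init_dir = '../data/' + corpusName + '/init/'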
8 changes: 4 additions & 4 deletions code/cluster.py
@@ -28,7 +28,7 @@ def fit(self):
         self.membership = labels
         self.center_ids = self.gen_center_idx()
         self.inertia_scores = self.clus.inertia_
-        print('Clustering concentration score:', self.inertia_scores)
+        print(('Clustering concentration score:', self.inertia_scores))
 
     # find the idx of each cluster center
     def gen_center_idx(self):
@@ -58,13 +58,13 @@ def calc_cosine(self, vec_a, vec_b):
 def run_clustering(full_data, doc_id_file, filter_keyword_file, n_cluster, parent_direcotry, parent_description,\
                    cluster_keyword_file, hierarchy_file, doc_membership_file):
     dataset = SubDataSet(full_data, doc_id_file, filter_keyword_file)
-    print('Start clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description)
+    print(('Start clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description))
     ## TODO: change later here for n_cluster selection from a range
     clus = Clusterer(dataset.embeddings, n_cluster)
     clus.fit()
-    print('Done clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description)
+    print(('Done clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description))
     dataset.write_cluster_members(clus, cluster_keyword_file, parent_direcotry)
     center_names = dataset.write_cluster_centers(clus, parent_description, hierarchy_file)
     dataset.write_document_membership(clus, doc_membership_file, parent_direcotry)
-    print('Done saving cluster results for ', len(dataset.keywords), ' keywords under parent:', parent_description)
+    print(('Done saving cluster results for ', len(dataset.keywords), ' keywords under parent:', parent_description))
     return center_names
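The doubled parentheses produced by the automated conversion turn a multi-argument print statement into a single-tuple call, which changes the output format; a sketch of the difference with an invented score:

inertia = 42.5

# What the converted code prints: one tuple, with quotes and parentheses.
print(('Clustering concentration score:', inertia))
# ('Clustering concentration score:', 42.5)

# Passing separate arguments restores the Python 2 output.
print('Clustering concentration score:', inertia)
# Clustering concentration score: 42.5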
16 changes: 8 additions & 8 deletions code/compress.py
@@ -1,7 +1,7 @@
 import argparse
 import utils
 import operator
-import Queue
+import queue
 import math
 from os import listdir
 from os.path import isfile, join, isdir, abspath, dirname, basename, exists
@@ -37,12 +37,12 @@ def parse_reidx(reidx_f):
             if len(pd_map[ph]) > 0:
                 ph_idf[ph] = math.log(float(d_cnt) / len(pd_map[ph]))
 
-    print 'Inverted Index file read.'
+    print('Inverted Index file read.')
 
 
 
 def get_rep(folder, c_id, N):
-    print('Start get representative phrases for %s, %s ========================' % (folder, c_id))
+    print(('Start get representative phrases for %s, %s ========================' % (folder, c_id)))
     # print folder
     par_folder = dirname(folder)
     cur_label = basename(folder)
@@ -72,7 +72,7 @@ def get_rep(folder, c_id, N):
             emb_dist = utils.cossim(embs[ph], embs[cur_label])
             ph_scores[ph] = score * emb_dist
 
-        ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True)
+        ph_scores = sorted(list(ph_scores.items()), key=operator.itemgetter(1), reverse=True)
 
         for (ph, score) in ph_scores:
             if ph not in result_phrases and ph in kws:
@@ -81,7 +81,7 @@ def get_rep(folder, c_id, N):
                 break
 
     elif ph_idf == None:
-        print 'looking at embeddings for %s' % folder
+        print('looking at embeddings for %s' % folder)
 
         ph_f = '%s/embeddings.txt' % par_folder
         kw_f = '%s/keywords.txt' % par_folder
@@ -119,7 +119,7 @@ def get_rep(folder, c_id, N):
                 result_phrases.append(ph)
     else:
         # Using TF-IDF to generate
-        print 'looking at tf-idf for %s' % folder
+        print('looking at tf-idf for %s' % folder)
         d_clus_f = '%s/paper_cluster.txt' % par_folder
         kw_clus_f = '%s/cluster_keywords.txt' % par_folder
         docs = []
@@ -148,7 +148,7 @@ def get_rep(folder, c_id, N):
                 continue
             ph_scores[ph] = 1 + math.log(ph_scores[ph])
             ph_scores[ph] *= ph_idf[ph]
-        ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True)
+        ph_scores = sorted(list(ph_scores.items()), key=operator.itemgetter(1), reverse=True)
 
         for (ph, score) in ph_scores:
             if ph not in result_phrases:
@@ -161,7 +161,7 @@ def get_rep(folder, c_id, N):
 
 def recursion(root, o_file, N):
 
-    q = Queue.Queue()
+    q = queue.Queue()
     q.put((root, -1, '*'))
 
     g = open(o_file, 'w+')
4 changes: 2 additions & 2 deletions code/dataset.py
@@ -76,7 +76,7 @@ def load_keywords(self, keyword_file, full_data):
             if keyword in full_data.embeddings:
                 keywords.append(keyword)
             else:
-                print(keyword, ' not in the embedding file')
+                print((keyword, ' not in the embedding file'))
         return keywords
 
     def gen_keyword_id(self):
@@ -210,4 +210,4 @@ def assign_document(self, doc_membership):
     keyword_file = data_dir + 'input/candidates.txt'
     embedding_file = data_dir + 'input/embeddings.txt'
     dataset = DataSet(embedding_file, document_file, keyword_file)
-    print(len(dataset.get_candidate_embeddings()))
+    print((len(dataset.get_candidate_embeddings())))