From 27a411f2d40161d10f12807258b13077a1111ca3 Mon Sep 17 00:00:00 2001 From: Cezar Sas Date: Tue, 14 Dec 2021 08:39:26 +0100 Subject: [PATCH 1/2] Switched to python 3, and added poetry package management for easy environment setup --- code/__init__.py | 0 code/bdi.py | 14 +- code/case_ranker.py | 125 +-- code/caseslim.py | 8 +- code/cluster-preprocess.py | 4 +- code/cluster.py | 8 +- code/compress.py | 16 +- code/dataset.py | 4 +- .../fetch_papers_relevant.py | 26 +- code/embedding_deprecated/metrics.py | 32 +- code/eval.py | 12 +- code/gen_eval.py | 12 +- code/labeling.py | 18 +- code/local_embedding_training.py | 4 +- code/main.py | 15 +- code/paras.py | 8 +- code/postprocess/redundancy.py | 4 +- code/postprocess/visualize.py | 10 +- code/preprocess.py | 4 +- code/preprocess/AutoPhraseOutput.py | 14 +- code/preprocess/SegPhraseOutput.py | 16 +- code/preprocess/main.py | 6 +- code/run.sh | 8 +- code/sim_emb.py | 14 +- code/spherecluster/__init__.py | 12 + code/spherecluster/spherecluster/__init__.py | 6 + .../spherecluster/spherical_kmeans.py | 372 +++++++ .../spherecluster/tests/test_common.py | 6 + .../tests/test_von_mises_fisher_mixture.py | 205 ++++ code/spherecluster/spherecluster/util.py | 57 ++ .../spherecluster/von_mises_fisher_mixture.py | 946 ++++++++++++++++++ code/spherecluster/spherical_kmeans.py | 424 ++++++++ code/spherecluster/util.py | 57 ++ .../spherecluster/von_mises_fisher_mixture.py | 946 ++++++++++++++++++ code/taxonomy.py | 2 +- code/utils.py | 12 +- pyproject.toml | 21 + 37 files changed, 3253 insertions(+), 195 deletions(-) create mode 100644 code/__init__.py mode change 100644 => 100755 code/run.sh create mode 100644 code/spherecluster/__init__.py create mode 100644 code/spherecluster/spherecluster/__init__.py create mode 100644 code/spherecluster/spherecluster/spherical_kmeans.py create mode 100644 code/spherecluster/spherecluster/tests/test_common.py create mode 100644 code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py create mode 100644 code/spherecluster/spherecluster/util.py create mode 100644 code/spherecluster/spherecluster/von_mises_fisher_mixture.py create mode 100644 code/spherecluster/spherical_kmeans.py create mode 100644 code/spherecluster/util.py create mode 100644 code/spherecluster/von_mises_fisher_mixture.py create mode 100644 pyproject.toml diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/code/bdi.py b/code/bdi.py index 0913bed..5a09a8a 100644 --- a/code/bdi.py +++ b/code/bdi.py @@ -2,7 +2,7 @@ from os import listdir from collections import defaultdict import utils -import Queue +import queue import argparse from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -55,23 +55,23 @@ def compute_dbi(embs, clus_map, center_map): def output_dbi(dbi_scores): dbi_by_lvl = {} - all_dbi = [x[0] for x in dbi_scores.values()] + all_dbi = [x[0] for x in list(dbi_scores.values())] - print 'Average DBI for all is: %f' % (sum(all_dbi) / len(all_dbi)) + print('Average DBI for all is: %f' % (sum(all_dbi) / len(all_dbi))) - for x in dbi_scores.values(): + for x in list(dbi_scores.values()): if x[1] not in dbi_by_lvl: dbi_by_lvl[x[1]] = [] dbi_by_lvl[x[1]].append(x[0]) for lvl in dbi_by_lvl: dbis = dbi_by_lvl[lvl] - print 'Average DBI for level %d is: %f' % (lvl, sum(dbis) / len(dbis)) + print('Average DBI for level %d is: %f' % (lvl, sum(dbis) / len(dbis))) def recursion(root, lvl): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1, 1, 
'*')) dbi_scores = {} @@ -102,7 +102,7 @@ def recursion(root, lvl): # handle current dbi = compute_dbi(embs, clus_map, hier_map) - print 'Computing DBI for %s: %f' % (c_name, dbi) + print('Computing DBI for %s: %f' % (c_name, dbi)) dbi_scores[c_name] = (dbi, level) output_dbi(dbi_scores) diff --git a/code/case_ranker.py b/code/case_ranker.py index 22da68c..e27cece 100644 --- a/code/case_ranker.py +++ b/code/case_ranker.py @@ -8,80 +8,81 @@ import utils import operator + def read_caseolap_result(case_file): - phrase_map = {} - cell_map = {} - - cell_cnt = 0 - with open(case_file) as f: - for line in f: - cell_cnt += 1 - segments = line.strip('\r\n ').split('\t') - cell_id, phs_str = segments[0], segments[1][1:-1] - cell_map[cell_id] = [] - segments = phs_str.split(', ') - for ph_score in segments: - parts = ph_score.split('|') - ph, score = parts[0], float(parts[1]) - if ph not in phrase_map: - phrase_map[ph] = {} - phrase_map[ph][cell_id] = score - cell_map[cell_id].append((ph, score)) - - return phrase_map, cell_map, cell_cnt + phrase_map = {} + cell_map = {} + + cell_cnt = 0 + with open(case_file) as f: + for line in f: + cell_cnt += 1 + segments = line.strip('\r\n ').split('\t') + cell_id, phs_str = segments[0], segments[1][1:-1] + cell_map[cell_id] = [] + segments = phs_str.split(', ') + for ph_score in segments: + parts = ph_score.split('|') + ph, score = parts[0], float(parts[1]) + if ph not in phrase_map: + phrase_map[ph] = {} + phrase_map[ph][cell_id] = score + cell_map[cell_id].append((ph, score)) + + return phrase_map, cell_map, cell_cnt def rank_phrase(case_file): + ph_dist_map = {} + smoothing_factor = 0.0 - ph_dist_map = {} - smoothing_factor = 0.0 - - phrase_map, cell_map, cell_cnt = read_caseolap_result(case_file) + phrase_map, cell_map, cell_cnt = read_caseolap_result(case_file) + print(cell_cnt) + unif = [1.0 / cell_cnt] * cell_cnt - unif = [1.0 / cell_cnt] * cell_cnt + for ph in phrase_map: + ph_vec = [x[1] for x in phrase_map[ph].items()] + if len(ph_vec) < cell_cnt: + ph_vec += [0] * (cell_cnt - len(ph_vec)) + # smoothing + ph_vec = [x + smoothing_factor for x in ph_vec] + ph_vec = utils.l1_normalize(ph_vec) + ph_dist_map[ph] = utils.kl_divergence(ph_vec, unif) - for ph in phrase_map: - ph_vec = [x[1] for x in phrase_map[ph].iteritems()] - if len(ph_vec) < cell_cnt: - ph_vec += [0] * (cell_cnt - len(ph_vec)) - # smoothing - ph_vec = [x + smoothing_factor for x in ph_vec] - ph_vec = utils.l1_normalize(ph_vec) - ph_dist_map[ph] = utils.kl_divergence(ph_vec, unif) + ranked_list = sorted(list(ph_dist_map.items()), key=operator.itemgetter(1), reverse=True) - ranked_list = sorted(ph_dist_map.items(), key=operator.itemgetter(1), reverse=True) - - return ranked_list + return ranked_list def write_keywords(o_file, ranked_list, thres): - with open(o_file, 'w+') as g: - for ph in ranked_list: - if ph[1] > thres: - g.write('%s\n' % (ph[0])) - tmp_file = o_file + '-score.txt' - with open(tmp_file, 'w+') as g: - for ph in ranked_list: - g.write('%s\t%f\n' % (ph[0], ph[1])) + with open(o_file, 'w+') as g: + for ph in ranked_list: + if ph[1] > thres: + g.write('%s\n' % (ph[0])) + tmp_file = o_file + '-score.txt' + with open(tmp_file, 'w+') as g: + for ph in ranked_list: + g.write('%s\t%f\n' % (ph[0], ph[1])) -def main_rank_phrase(input_f, output_f, thres): - ranked_list = rank_phrase(input_f) - write_keywords(output_f, ranked_list, thres) - print("[CaseOLAP] Finish pushing general terms up") -if __name__ == "__main__": - parser = argparse.ArgumentParser(prog='case_ranker.py', \ - 
description='Ranks the distinctiveness score using caseolap result.') - parser.add_argument('-folder', required=True, \ - help='The folder that stores the file.') - parser.add_argument('-iter', required=True, \ - help='Iteration index.') - parser.add_argument('-thres', required=True, \ - help='The files used.') - args = parser.parse_args() - - input_f = '%s/caseolap-%s.txt' % (args.folder, args.iter) - output_f = '%s/keywords-%s.txt' % (args.folder, args.iter) +def main_rank_phrase(input_f, output_f, thres): + ranked_list = rank_phrase(input_f) + write_keywords(output_f, ranked_list, thres) + print("[CaseOLAP] Finish pushing general terms up") - main_rank_phrase(input_f, output_f, float(args.thres)) +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog='case_ranker.py', \ + description='Ranks the distinctiveness score using caseolap result.') + parser.add_argument('-folder', required=True, \ + help='The folder that stores the file.') + parser.add_argument('-iter', required=True, \ + help='Iteration index.') + parser.add_argument('-thres', required=True, \ + help='The files used.') + args = parser.parse_args() + + input_f = '%s/caseolap-%s.txt' % (args.folder, args.iter) + output_f = '%s/keywords-%s.txt' % (args.folder, args.iter) + + main_rank_phrase(input_f, output_f, float(args.thres)) diff --git a/code/caseslim.py b/code/caseslim.py index 65a5cfb..0b5b076 100644 --- a/code/caseslim.py +++ b/code/caseslim.py @@ -66,7 +66,7 @@ def compute(self, score_type='ALL'): group = [(self_df, self.max_df, self_cnt, sum_self)] self.context_groups[phrase] = [] - for phrase_group, phrase_values in self.phrase_cnt_context.items(): + for phrase_group, phrase_values in list(self.phrase_cnt_context.items()): context_df = self.phrase_df_context[phrase_group].get(phrase, 0) sum_context = self.sum_cnt_context[phrase_group] context_cnt = phrase_values.get(phrase, 0) @@ -167,7 +167,7 @@ def __init__(self, freq_data, selected_docs, context_doc_groups, global_scores=N self.sum_cnt = sum(self.phrase_cnt.values()) self.sum_cnt_context = {} self.global_scores = global_scores - for group, docs in context_doc_groups.items(): + for group, docs in list(context_doc_groups.items()): self.phrase_cnt_context[group], self.phrase_df_context[group] = self.agg_phrase_cnt_df(freq_data, docs) if len(self.phrase_df_context[group]) > 0: self.max_df_context[group] = max(self.phrase_df_context[group].values()) @@ -248,7 +248,7 @@ def run_caseolap(cells, freq_data, target_phs, o_file, verbose=3, top_k=200): of = open(o_file, 'w+') for cell in cells: - print('[CaseOLAP] Running CaseOLAP for cell: %s' % cell) + print(('[CaseOLAP] Running CaseOLAP for cell: %s' % cell)) selected_docs = cells[cell] context_doc_groups = copy.copy(cells) @@ -260,7 +260,7 @@ def run_caseolap(cells, freq_data, target_phs, o_file, verbose=3, top_k=200): phr_str = ', '.join([ph[0] + '|' + str(ph[1]) for ph in top_phrases if ph[0] in target_phs]) of.write('[%s]\n' % phr_str) - print('[CaseOLAP] Finished CaseOLAP for cell: %s' % cell) + print(('[CaseOLAP] Finished CaseOLAP for cell: %s' % cell)) def main_caseolap(link_f, cell_f, token_f, output_f): diff --git a/code/cluster-preprocess.py b/code/cluster-preprocess.py index c547540..44ea0eb 100644 --- a/code/cluster-preprocess.py +++ b/code/cluster-preprocess.py @@ -87,7 +87,7 @@ def gen_doc_keyword_cnt_file(doc_file, keyword_cnt_file): def counter_to_string(counter): elements = [] - for k, v in counter.items(): + for k, v in list(counter.items()): elements.append(k) elements.append(v) return 
'\t'.join([str(e) for e in elements]) @@ -129,7 +129,7 @@ def main(raw_dir, input_dir, init_dir): # input_dir = '/shared/data/czhang82/projects/local-embedding/sp/input/' # init_dir = '/shared/data/czhang82/projects/local-embedding/sp/init/' if __name__ == '__main__': - corpusName = sys.argv[1] + corpusName = 'dblp' raw_dir = '../data/'+corpusName+'/raw/' input_dir = '../data/'+corpusName+'/input/' init_dir = '../data/'+corpusName+'/init/' diff --git a/code/cluster.py b/code/cluster.py index 9bc25ff..5e89e0f 100644 --- a/code/cluster.py +++ b/code/cluster.py @@ -28,7 +28,7 @@ def fit(self): self.membership = labels self.center_ids = self.gen_center_idx() self.inertia_scores = self.clus.inertia_ - print('Clustering concentration score:', self.inertia_scores) + print(('Clustering concentration score:', self.inertia_scores)) # find the idx of each cluster center def gen_center_idx(self): @@ -58,13 +58,13 @@ def calc_cosine(self, vec_a, vec_b): def run_clustering(full_data, doc_id_file, filter_keyword_file, n_cluster, parent_direcotry, parent_description,\ cluster_keyword_file, hierarchy_file, doc_membership_file): dataset = SubDataSet(full_data, doc_id_file, filter_keyword_file) - print('Start clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Start clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description)) ## TODO: change later here for n_cluster selection from a range clus = Clusterer(dataset.embeddings, n_cluster) clus.fit() - print('Done clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Done clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description)) dataset.write_cluster_members(clus, cluster_keyword_file, parent_direcotry) center_names = dataset.write_cluster_centers(clus, parent_description, hierarchy_file) dataset.write_document_membership(clus, doc_membership_file, parent_direcotry) - print('Done saving cluster results for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Done saving cluster results for ', len(dataset.keywords), ' keywords under parent:', parent_description)) return center_names diff --git a/code/compress.py b/code/compress.py index d444ad5..4fff169 100644 --- a/code/compress.py +++ b/code/compress.py @@ -1,7 +1,7 @@ import argparse import utils import operator -import Queue +import queue import math from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -37,12 +37,12 @@ def parse_reidx(reidx_f): if len(pd_map[ph]) > 0: ph_idf[ph] = math.log(float(d_cnt) / len(pd_map[ph])) - print 'Inverted Index file read.' 
+ print('Inverted Index file read.') def get_rep(folder, c_id, N): - print('Start get representative phrases for %s, %s ========================' % (folder, c_id)) + print(('Start get representative phrases for %s, %s ========================' % (folder, c_id))) # print folder par_folder = dirname(folder) cur_label = basename(folder) @@ -72,7 +72,7 @@ def get_rep(folder, c_id, N): emb_dist = utils.cossim(embs[ph], embs[cur_label]) ph_scores[ph] = score * emb_dist - ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True) + ph_scores = sorted(list(ph_scores.items()), key=operator.itemgetter(1), reverse=True) for (ph, score) in ph_scores: if ph not in result_phrases and ph in kws: @@ -81,7 +81,7 @@ def get_rep(folder, c_id, N): break elif ph_idf == None: - print 'looking at embeddings for %s' % folder + print('looking at embeddings for %s' % folder) ph_f = '%s/embeddings.txt' % par_folder kw_f = '%s/keywords.txt' % par_folder @@ -119,7 +119,7 @@ def get_rep(folder, c_id, N): result_phrases.append(ph) else: # Using TF-IDF to generate - print 'looking at tf-idf for %s' % folder + print('looking at tf-idf for %s' % folder) d_clus_f = '%s/paper_cluster.txt' % par_folder kw_clus_f = '%s/cluster_keywords.txt' % par_folder docs = [] @@ -148,7 +148,7 @@ def get_rep(folder, c_id, N): continue ph_scores[ph] = 1 + math.log(ph_scores[ph]) ph_scores[ph] *= ph_idf[ph] - ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True) + ph_scores = sorted(list(ph_scores.items()), key=operator.itemgetter(1), reverse=True) for (ph, score) in ph_scores: if ph not in result_phrases: @@ -161,7 +161,7 @@ def get_rep(folder, c_id, N): def recursion(root, o_file, N): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1, '*')) g = open(o_file, 'w+') diff --git a/code/dataset.py b/code/dataset.py index f167919..600adf5 100644 --- a/code/dataset.py +++ b/code/dataset.py @@ -76,7 +76,7 @@ def load_keywords(self, keyword_file, full_data): if keyword in full_data.embeddings: keywords.append(keyword) else: - print(keyword, ' not in the embedding file') + print((keyword, ' not in the embedding file')) return keywords def gen_keyword_id(self): @@ -210,4 +210,4 @@ def assign_document(self, doc_membership): keyword_file = data_dir + 'input/candidates.txt' embedding_file = data_dir + 'input/embeddings.txt' dataset = DataSet(embedding_file, document_file, keyword_file) - print(len(dataset.get_candidate_embeddings())) + print((len(dataset.get_candidate_embeddings()))) diff --git a/code/embedding_deprecated/fetch_papers_relevant.py b/code/embedding_deprecated/fetch_papers_relevant.py index ab5e0cf..d070b1a 100644 --- a/code/embedding_deprecated/fetch_papers_relevant.py +++ b/code/embedding_deprecated/fetch_papers_relevant.py @@ -21,19 +21,19 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): targets, target_set = [term], set([term]) - for i in xrange(N): + for i in range(N): q, s = relevant_dict[term][i][0], relevant_dict[term][i][1] if s > thread and q not in target_set: targets.append(q) target_set.add(q) paper_set = set() - print "Query :", term, len(targets), "---" + print("Query :", term, len(targets), "---") for t in targets: if t not in tp_map: - print "(%s)" % t, + print("(%s)" % t, end=' ') continue paper_set = paper_set | tp_map[t] - print "(%s, %d) " % (t, len(paper_set)), + print("(%s, %d) " % (t, len(paper_set)), end=' ') return paper_set @@ -71,15 +71,15 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): for term in query_terms: relevant_dict[term] = [] - 
print "%s/%s" % (args.relevant, term) + print("%s/%s" % (args.relevant, term)) data = open("%s/%s" % (args.relevant, term)) - print term + print(term) for line in data: values = line.strip().split() relevant_dict[term].append((values[0], float(values[1]))) - print values[0], + print(values[0], end=' ') all_terms.add(values[0]) - print "\n" + print("\n") # read paper-term term_paper_map = dict() @@ -93,13 +93,13 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): if term not in term_paper_map: term_paper_map[term] = set() term_paper_map[term].add(paper) - print "Finish reading paper_term.net" + print("Finish reading paper_term.net") data = open(args.query) for line in data: queries = line.strip().split("#") paper_set = None - for i in xrange(len(queries)): + for i in range(len(queries)): if i == 0: paper_set = fetch_paper(threshold, N, queries[i], relevant_dict, term_paper_map) @@ -107,9 +107,9 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): paper_set = paper_set | fetch_paper(threshold, N, queries[i], relevant_dict, term_paper_map) - print "For query :", queries[i], - print "Before :", len(term_paper_map[queries[i]]), - print "After :", len(paper_set) + print("For query :", queries[i], end=' ') + print("Before :", len(term_paper_map[queries[i]]), end=' ') + print("After :", len(paper_set)) path = "%s/%s" % (args.output, "_".join(queries)) diff --git a/code/embedding_deprecated/metrics.py b/code/embedding_deprecated/metrics.py index 8658cc3..07b1761 100644 --- a/code/embedding_deprecated/metrics.py +++ b/code/embedding_deprecated/metrics.py @@ -29,23 +29,23 @@ def precision(candidates, ground_truth): p_5, p_10, p_20 = 0, 0, 0 true_positive = 0 - for i in xrange(5): + for i in range(5): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_5 = float(true_positive) / 5.0 - for i in xrange(5, 10): + for i in range(5, 10): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_10 = float(true_positive) / 10.0 - for i in xrange(10, 20): + for i in range(10, 20): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_20 = float(true_positive) / 20.0 true_positive = 0 - for i in xrange(min(R, C)): + for i in range(min(R, C)): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 @@ -58,7 +58,7 @@ def MAP(candidates, ground_truth): C = len(candidates) R = len(ground_truth) true_positive, precision, total_precision = 0, 0, 0 - for n in xrange(1, min(C, R) + 1): + for n in range(1, min(C, R) + 1): if candidates[n - 1] in ground_truth: true_positive += 1 precision = float(true_positive) / float(n) @@ -68,7 +68,7 @@ def MAP(candidates, ground_truth): def RR(candidates, ground_truth): result = 0 nCandidates = len(candidates) - for n in xrange(nCandidates): + for n in range(nCandidates): if candidates[n] in ground_truth: result = n + 1 break @@ -77,7 +77,7 @@ def RR(candidates, ground_truth): def NDCG(candidates, ground_truth, K = 20): DCG, IDCG = 0, 0 ndcg_5, ndcg_10, ndcg_20 = 0, 0, 0 - for i in xrange(K): + for i in range(K): x = 1.0 / log(i + 2, 2) IDCG += x if i < len(candidates) and candidates[i] in ground_truth: @@ -93,7 +93,7 @@ def NDCG(candidates, ground_truth, K = 20): def bpref(candidates, ground_truth): R = len(candidates) false_positive, bpref, k = 0, 0, 0 - for n in xrange(R): + for n in range(R): if candidates[n] not in ground_truth: false_positive += 1 else: @@ -104,11 +104,11 @@ def bpref(candidates, ground_truth): return bpref / float(k) def printResults(name, result): - 
print name, - print "%.4f" % (np.mean(result)), + print(name, end=' ') + print("%.4f" % (np.mean(result)), end=' ') for res in result: - print "%.4f" % res, - print + print("%.4f" % res, end=' ') + print() if __name__ == "__main__": parser = argparse.ArgumentParser(prog='metrics.py', \ @@ -131,11 +131,11 @@ def printResults(name, result): for line in query_data: query = line.split("\n")[0].replace(" ", "_") queries.append(query) - print "measure overall", " ".join(queries) + print("measure overall", " ".join(queries)) Precision_5, Precision_10, Precision_20, Precision_R, l_MAP, l_MRR, l_bpref = \ - list([[] for i in xrange(7)]) - NDCG_5, NDCG_10, NDCG_20 = list([[] for i in xrange(3)]) + list([[] for i in range(7)]) + NDCG_5, NDCG_10, NDCG_20 = list([[] for i in range(3)]) for query in queries: candidate_file = input_folder + "/" + query + "/author.weights" @@ -145,7 +145,7 @@ def printResults(name, result): for line in data: value = line.split("\n")[0].split() scores[value[0]] = float(value[1]) - scores = sorted(scores.items(), key=lambda x:x[1], reverse=True) + scores = sorted(list(scores.items()), key=lambda x:x[1], reverse=True) candidates = [x[0] for x in scores] diff --git a/code/eval.py b/code/eval.py index bd67933..ff239a5 100644 --- a/code/eval.py +++ b/code/eval.py @@ -13,7 +13,7 @@ def handler(folder): if True: files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'intrusion_exp' in f] - print files + print(files) intru_gold = '%s/%s' % (folder, 'intrusion_gold.txt') @@ -51,9 +51,9 @@ def handler(folder): # print idx - print '\nIntrusion results!!' + print('\nIntrusion results!!') for method in total: - print '%s accuracy: %d/%d - %f' % (method, correct[method], total[method], float(correct[method]) / total[method]) + print('%s accuracy: %d/%d - %f' % (method, correct[method], total[method], float(correct[method]) / total[method])) # files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'isa_exp' in f] @@ -95,7 +95,7 @@ def handler(folder): # subdomain result files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'subdomain_exp' in f] - print files + print(files) intru_gold = '%s/%s' % (folder, 'subdomain_gold.txt') @@ -129,9 +129,9 @@ def handler(folder): if value == 'n': incorrect[method] += 1 - print '\nSubdomain results!!' 
+ print('\nSubdomain results!!') for method in total: - print '%s accuracy: %d/%d/%d - %f' % (method, correct[method], incorrect[method], total[method], float(correct[method]) / total[method]) + print('%s accuracy: %d/%d/%d - %f' % (method, correct[method], incorrect[method], total[method], float(correct[method]) / total[method])) if __name__ == "__main__": diff --git a/code/gen_eval.py b/code/gen_eval.py index 1407708..cdc8926 100644 --- a/code/gen_eval.py +++ b/code/gen_eval.py @@ -59,11 +59,11 @@ def gen_isa_pairs(tax, isa_N, case_N): p_phs = '|'.join(p_node.ph_list[:isa_N]) rmd_phs = '|'.join(rmd_node.ph_list[:isa_N]) if len(p_phs) == 0 or len(rmd_phs) == 0: - print tax - print n_phs - print node.name - print p_node.name - print rmd_node.name + print(tax) + print(n_phs) + print(node.name) + print(p_node.name) + print(rmd_node.name) exit(1) order = random.choice([0, 1]) p_id = 0 @@ -122,7 +122,7 @@ def handler(folder, output, N, isa_N, case_N): subdomain_map = {} for tax_name in taxs: - print tax_name + print(tax_name) # generate intrusion pairs # intru_f = '%s/%s.intrusion' % (output, tax_name) intru_maps[tax_name] = gen_intrusion_pairs(taxs[tax_name], N, case_N) diff --git a/code/labeling.py b/code/labeling.py index f05ef0d..4234e9e 100644 --- a/code/labeling.py +++ b/code/labeling.py @@ -1,13 +1,13 @@ import argparse import utils import operator -import Queue +import queue from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename from case_ranker import read_caseolap_result, rank_phrase def label_emb_centric(folder, c_id): - print 'Start labeling for %s, %s ========================' % (folder, c_id) + print('Start labeling for %s, %s ========================' % (folder, c_id)) # print folder par_folder = dirname(folder) cur_label = basename(folder) @@ -18,7 +18,7 @@ def label_emb_centric(folder, c_id): # generate word2vec phrases embs = utils.load_embeddings(emb_f) if cur_label not in embs: - print 'Error!!!' 
+ print('Error!!!') exit(1) N = 100 worst = -100 @@ -58,8 +58,8 @@ def label_emb_centric(folder, c_id): label_cands[ph] = score - ranked_list = sorted(label_cands.items(), key=operator.itemgetter(1), reverse=True) - print ranked_list + ranked_list = sorted(list(label_cands.items()), key=operator.itemgetter(1), reverse=True) + print(ranked_list) return ranked_list[0][0] @@ -129,7 +129,7 @@ def label_emb_centric(folder, c_id): def recursion(root): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1)) label_map = {} @@ -149,10 +149,10 @@ def recursion(root): l = label_emb_centric(c_folder, str(c_id)) cur_label = basename(c_folder) label_map[cur_label] = l - print 'label for %s is: %s\n' % (c_folder, l) + print('label for %s is: %s\n' % (c_folder, l)) except: - for (o_l, l) in label_map.items(): - print '%s ==> %s' % (o_l, l) + for (o_l, l) in list(label_map.items()): + print('%s ==> %s' % (o_l, l)) if __name__ == "__main__": diff --git a/code/local_embedding_training.py b/code/local_embedding_training.py index 7d6fcca..dcab154 100644 --- a/code/local_embedding_training.py +++ b/code/local_embedding_training.py @@ -10,7 +10,7 @@ import os def read_files(folder, parent): - print("[Local-embedding] Reading file:", parent) + print(("[Local-embedding] Reading file:", parent)) emb_file = '%s/embeddings.txt' % folder hier_file = '%s/hierarchy.txt' % folder keyword_file = '%s/keywords.txt' % folder ## here only consider those remaining keywords @@ -106,7 +106,7 @@ def run_word2vec(pd_map, docs, cates, folder): for ph in cates[cate]: c_docs = c_docs.union(pd_map[ph]) - print('Starting cell %s with %d docs.' % (cate, len(c_docs))) + print(('Starting cell %s with %d docs.' % (cate, len(c_docs)))) # save file # sub_folder = '%s/%s' % (folder, cate) diff --git a/code/main.py b/code/main.py index b7a0ff7..e464c02 100644 --- a/code/main.py +++ b/code/main.py @@ -49,20 +49,22 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ n_expand, level, caseolap=True, local_embedding=True): if level > MAX_LEVEL: return - print('============================= Running level ', level, ' and node ', parent, '=============================') + print(('============================= Running level ', level, ' and node ', parent, '=============================')) start = time.time() df = DataFiles(input_dir, node_dir) ## TODO: Everytime we need to read-in the whole corpus, which can be slow. 
full_data = DataSet(df.embedding_file, df.doc_file) end = time.time() - print('[Main] Done reading the full data using time %s seconds' % (end-start)) + print(('[Main] Done reading the full data using time %s seconds' % (end-start))) # filter the keywords if caseolap is False: try: children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent, \ df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file) - except: + except Exception as e: + print(e) + print(e) print('Clustering not finished.') return copyfile(df.seed_keyword_file, df.filtered_keyword_file) @@ -74,7 +76,8 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ try: children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,\ df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file) - except: + except Exception as e: + print(e) print('Clustering not finished.') return @@ -82,7 +85,7 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ main_caseolap(df.link_file, df.doc_membership_file, df.cluster_keyword_file, df.caseolap_keyword_file) main_rank_phrase(df.caseolap_keyword_file, df.filtered_keyword_file, filter_thre) end = time.time() - print("[Main] Finish running CaseOALP using %s (seconds)" % (end - start)) + print(("[Main] Finish running CaseOALP using %s (seconds)" % (end - start))) # prepare the embedding for child level if level < MAX_LEVEL: @@ -96,7 +99,7 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ start = time.time() main_local_embedding(node_dir, df.doc_file, df.index_file, parent, n_expand) end = time.time() - print("[Main] Finish running local embedding training using %s (seconds)" % (end - start)) + print(("[Main] Finish running local embedding training using %s (seconds)" % (end - start))) for child in children: recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter, \ diff --git a/code/paras.py b/code/paras.py index 027ae53..f29e54b 100644 --- a/code/paras.py +++ b/code/paras.py @@ -99,14 +99,14 @@ def load_sp_params(): def load_dblp_params_method(): pd = dict() - pd['data_dir'] = '/shared/data/jiaming/local-embedding/data/dblp/' + pd['data_dir'] = '/home/sasce/PycharmProjects/taxogen/data/dblp/' pd['doc_file'] = pd['data_dir'] + 'input/papers.txt' pd['doc_keyword_cnt_file'] = pd['data_dir'] + 'input/keyword_cnt.txt' pd['input_dir'] = pd['data_dir'] + 'input/' pd['root_node_dir'] = pd['data_dir'] + 'cluster/' pd['n_cluster'] = 5 - pd['filter_thre'] = 0.25 - pd['n_expand'] = 100 - pd['n_cluster_iter'] = 2 + pd['filter_thre'] = 0.15 + pd['n_expand'] = 200 + pd['n_cluster_iter'] = 5 return pd diff --git a/code/postprocess/redundancy.py b/code/postprocess/redundancy.py index 256810c..af505ea 100644 --- a/code/postprocess/redundancy.py +++ b/code/postprocess/redundancy.py @@ -31,7 +31,7 @@ def load_taxonomy(tax_file): def compute_pmi(inverted_index, tax): values = [] - for node, keywords in tax.items(): + for node, keywords in list(tax.items()): pairs = set(frozenset([i, j]) for i in keywords for j in keywords if i != j) for p in pairs: pair = list(p) @@ -54,7 +54,7 @@ def main(idx_file, taxonomy_dir, output_file): inverted_index = load_inverted_index(idx_file) # print len(inverted_index) taxonomy_files = get_tax_filenames(taxonomy_dir) - print taxonomy_files + print(taxonomy_files) with open(output_file, 'a') as fout: for tax_file in taxonomy_files: tax = load_taxonomy(tax_file) diff --git 
a/code/postprocess/visualize.py b/code/postprocess/visualize.py index 87ed098..f63f99a 100644 --- a/code/postprocess/visualize.py +++ b/code/postprocess/visualize.py @@ -13,7 +13,7 @@ def load_nodes(node_file, min_level=1, max_level=3, prefix_list=['*']): node_content = [] nodes[node_id] = node_content prune_nodes = {} - for node_id, node_content in nodes.items(): + for node_id, node_content in list(nodes.items()): # prune nodes if not has_one_prefix(node_id, prefix_list): continue @@ -43,11 +43,11 @@ def is_exact_prefix(s, prefix): def gen_edges(nodes): - node_ids = nodes.keys() + node_ids = list(nodes.keys()) node_ids.sort(key=lambda x: len(x)) edges = [] - for i in xrange(len(nodes) - 1): - for j in xrange(i+1, len(nodes)): + for i in range(len(nodes) - 1): + for j in range(i+1, len(nodes)): if is_parent(node_ids[i], node_ids[j]): edges.append([node_ids[i], node_ids[j]]) return edges @@ -75,7 +75,7 @@ def gen_node_label(node_id, node_content): def draw(nodes, edges, output_file): d = Digraph(node_attr={'shape': 'record'}) - for node_id, node_content in nodes.items(): + for node_id, node_content in list(nodes.items()): d.node(node_id, gen_node_label(node_id, node_content)) for e in edges: d.edge(e[0], e[1]) diff --git a/code/preprocess.py b/code/preprocess.py index 10446bc..8499e5e 100644 --- a/code/preprocess.py +++ b/code/preprocess.py @@ -17,7 +17,7 @@ def get_candidates(folder, o_file): phrase = line.split(' ')[0] quality_phrases.add(phrase) - print('Quality phrase count: ' + str(len(quality_phrases))) + print(('Quality phrase count: ' + str(len(quality_phrases)))) with open(o_file, 'w+') as g: for phrase in quality_phrases: @@ -49,7 +49,7 @@ def get_reidx_file(text, cand_f, o_file): pd_map[t].add(str(idx)) idx += 1 if idx % 10000 == 0: - print("[Construct Inverted Index] Parse %s documents" % idx) + print(("[Construct Inverted Index] Parse %s documents" % idx)) with open(o_file, 'w+') as g: for ph in pd_map: diff --git a/code/preprocess/AutoPhraseOutput.py b/code/preprocess/AutoPhraseOutput.py index cf7b8e0..3b72141 100644 --- a/code/preprocess/AutoPhraseOutput.py +++ b/code/preprocess/AutoPhraseOutput.py @@ -67,7 +67,7 @@ def parse_one_doc(self, doc): # sanity checking if (len(q) != 0): - print("[ERROR]: mismatched in document: %s" % doc) + print(("[ERROR]: mismatched in document: %s" % doc)) def save_phrase_to_pos_sequence(self, output_path=""): with open(output_path, "w") as fout: @@ -88,7 +88,7 @@ def load_phrase_to_pos_sequence(self, input_path=""): def obtain_pos_sequence_to_score(self): pos_sequence_2_score = {} - for v in self.phrase_to_pos_sequence.values(): + for v in list(self.phrase_to_pos_sequence.values()): for pos_sequence in v: if pos_sequence not in pos_sequence_2_score: pos_sequence_list = pos_sequence.split() @@ -103,8 +103,8 @@ def obtain_pos_sequence_to_score(self): self.pos_sequence_to_score = pos_sequence_2_score - print(len(self.pos_sequence_to_score)) - print(self.pos_sequence_to_score) + print((len(self.pos_sequence_to_score))) + print((self.pos_sequence_to_score)) def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): candidate_phrase = [] @@ -113,15 +113,15 @@ def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): freq = sum(self.phrase_to_pos_sequence[phrase].values()) if freq < min_sup: continue - for pos_sequence in self.phrase_to_pos_sequence[phrase].keys(): + for pos_sequence in list(self.phrase_to_pos_sequence[phrase].keys()): pos_sequence_weight = float(self.phrase_to_pos_sequence[phrase][pos_sequence]) / freq pos_sequence_score 
= self.pos_sequence_to_score[pos_sequence] phrase_score += (pos_sequence_weight * pos_sequence_score) if phrase_score >= threshold: # print(phrase, phrase_score) candidate_phrase.append(phrase) - print(len(candidate_phrase)) - print(candidate_phrase[0:10]) + print((len(candidate_phrase))) + print((candidate_phrase[0:10])) self.candidate_phrase = candidate_phrase def save_candidate_phrase(self, output_path=""): diff --git a/code/preprocess/SegPhraseOutput.py b/code/preprocess/SegPhraseOutput.py index d869a0f..ccd9a68 100644 --- a/code/preprocess/SegPhraseOutput.py +++ b/code/preprocess/SegPhraseOutput.py @@ -65,7 +65,7 @@ def parse_one_doc(self, doc): # sanity checking if (len(q) != 0): - print("[ERROR]: mismatched in document: %s" % doc) + print(("[ERROR]: mismatched in document: %s" % doc)) def save_phrase_to_pos_sequence(self, output_path=""): with open(output_path, "w") as fout: @@ -83,11 +83,11 @@ def load_phrase_to_pos_sequence(self, input_path=""): phrase = seg[0] pos_sequence = eval(seg[1]) self.phrase_to_pos_sequence[phrase] = pos_sequence - print("[INFO] Number of phrases before NP pruning = ", len(self.phrase_to_pos_sequence)) + print(("[INFO] Number of phrases before NP pruning = ", len(self.phrase_to_pos_sequence))) def obtain_pos_sequence_to_score(self): pos_sequence_2_score = {} - for v in self.phrase_to_pos_sequence.values(): + for v in list(self.phrase_to_pos_sequence.values()): for pos_sequence in v: if pos_sequence not in pos_sequence_2_score: pos_sequence_list = pos_sequence.split() @@ -102,8 +102,8 @@ def obtain_pos_sequence_to_score(self): self.pos_sequence_to_score = pos_sequence_2_score - print(len(self.pos_sequence_to_score)) - print(self.pos_sequence_to_score) + print((len(self.pos_sequence_to_score))) + print((self.pos_sequence_to_score)) def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): candidate_phrase = [] @@ -112,15 +112,15 @@ def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): freq = sum(self.phrase_to_pos_sequence[phrase].values()) if freq < min_sup: continue - for pos_sequence in self.phrase_to_pos_sequence[phrase].keys(): + for pos_sequence in list(self.phrase_to_pos_sequence[phrase].keys()): pos_sequence_weight = float(self.phrase_to_pos_sequence[phrase][pos_sequence]) / freq pos_sequence_score = self.pos_sequence_to_score[pos_sequence] phrase_score += (pos_sequence_weight * pos_sequence_score) if phrase_score >= threshold: # print(phrase, phrase_score) candidate_phrase.append(phrase) - print(len(candidate_phrase)) - print(candidate_phrase[0:10]) + print((len(candidate_phrase))) + print((candidate_phrase[0:10])) self.candidate_phrase = candidate_phrase def save_candidate_phrase(self, output_path=""): diff --git a/code/preprocess/main.py b/code/preprocess/main.py index b19a78c..8431e81 100644 --- a/code/preprocess/main.py +++ b/code/preprocess/main.py @@ -3,9 +3,9 @@ __description__: Based on AutoPhrase output, generate 1) papers.txt, 2) keywords.txt __latest_update__: Auguest 2, 2017 ''' -from config import * -from AutoPhraseOutput import AutoPhraseOutput -from SegPhraseOutput import SegPhraseOutput +from .config import * +from .AutoPhraseOutput import AutoPhraseOutput +from .SegPhraseOutput import SegPhraseOutput import re from pattern.en import parsetree from pattern.en import parse diff --git a/code/run.sh b/code/run.sh old mode 100644 new mode 100755 index 1bcf2ff..8f0b7db --- a/code/run.sh +++ b/code/run.sh @@ -2,14 +2,16 @@ ## Name of the input corpus corpusName=dblp ## Name of the taxonomy -taxonName=our-l3-0.25 
+taxonName=our-l3-0.15 ## If need preprocessing from raw input, set it to be 1, otherwise, set 0 -FIRST_RUN=${FIRST_RUN:- 0} +FIRST_RUN=${FIRST_RUN:- 1} + +source activate '/home/sasce/.cache/pypoetry/virtualenvs/taxogen-Q1ywnX6T-py3.8/bin/python' if [ $FIRST_RUN -eq 1 ]; then echo 'Start data preprocessing' ## compile word2vec for embedding learning - gcc word2vec.c -o word2veec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result + gcc word2vec.c -o word2vec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result ## create initial folder if not exist if [ ! -d ../data/$corpusName/init ]; then diff --git a/code/sim_emb.py b/code/sim_emb.py index fc5ff17..0367eef 100644 --- a/code/sim_emb.py +++ b/code/sim_emb.py @@ -1,7 +1,7 @@ import argparse import utils import operator -import Queue +import queue import math from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -28,7 +28,7 @@ def compare(a_f, b_f): b_emb_f = '%s/%s' % (b_f, emb_f) if not exists(a_emb_f) or not exists(b_emb_f): - print 'Embedding file not found' + print('Embedding file not found') exit(1) embs_a = load_embeddings(a_emb_f) @@ -37,9 +37,9 @@ def compare(a_f, b_f): embs_groups = [embs_a, embs_b] while 1: - n_name = raw_input("Enter your node: ") + n_name = input("Enter your node: ") if n_name not in embs_a or n_name not in embs_b: - print '%s not found' % n_name + print('%s not found' % n_name) for embs in embs_groups: t_emb = embs[n_name] @@ -47,11 +47,11 @@ def compare(a_f, b_f): for key in embs: sim_map[key] = utils.cossim(t_emb, embs[key]) - sim_map = sorted(sim_map.items(), key=operator.itemgetter(1), reverse=True) + sim_map = sorted(list(sim_map.items()), key=operator.itemgetter(1), reverse=True) output_str = '\n'.join([sim_map[i][0] + '\t' + str(sim_map[i][1]) for i in range(10)]) # print sim_map[:10] - print output_str - print 'group finished\n' + print(output_str) + print('group finished\n') diff --git a/code/spherecluster/__init__.py b/code/spherecluster/__init__.py new file mode 100644 index 0000000..5a1e5aa --- /dev/null +++ b/code/spherecluster/__init__.py @@ -0,0 +1,12 @@ +from __future__ import absolute_import +from .spherical_kmeans import SphericalKMeans +from .von_mises_fisher_mixture import VonMisesFisherMixture +from .util import sample_vMF +""" +@author = 'Jason Laska', +@author_email = 'jason@claralabs.com', +@url = 'https://github.com/jasonlaska/spherecluster/' +@edited = 'https://github.com/rfayat/spherecluster' +""" + +__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"] diff --git a/code/spherecluster/spherecluster/__init__.py b/code/spherecluster/spherecluster/__init__.py new file mode 100644 index 0000000..5d2c502 --- /dev/null +++ b/code/spherecluster/spherecluster/__init__.py @@ -0,0 +1,6 @@ +from __future__ import absolute_import +from .spherical_kmeans import SphericalKMeans +from .von_mises_fisher_mixture import VonMisesFisherMixture +from .util import sample_vMF + +__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"] diff --git a/code/spherecluster/spherecluster/spherical_kmeans.py b/code/spherecluster/spherecluster/spherical_kmeans.py new file mode 100644 index 0000000..d56e03c --- /dev/null +++ b/code/spherecluster/spherecluster/spherical_kmeans.py @@ -0,0 +1,372 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed + +from sklearn.cluster import KMeans + +# from sklearn.cluster import _k_means +from sklearn.cluster import _k_means_fast as _k_means 
+from sklearn.cluster.k_means_ import ( + _check_sample_weight, + _init_centroids, + _labels_inertia, + _tolerance, + _validate_center_shape, +) +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state +from sklearn.utils.extmath import row_norms, squared_norm +from sklearn.utils.validation import _num_samples + + +def _spherical_kmeans_single_lloyd( + X, + n_clusters, + sample_weight=None, + max_iter=300, + init="k-means++", + verbose=False, + x_squared_norms=None, + random_state=None, + tol=1e-4, + precompute_distances=True, +): + """ + Modified from sklearn.cluster.k_means_.k_means_single_lloyd. + """ + random_state = check_random_state(random_state) + + sample_weight = _check_sample_weight(sample_weight, X) + + best_labels, best_inertia, best_centers = None, None, None + + # init + centers = _init_centroids( + X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms + ) + if verbose: + print("Initialization complete") + + # Allocate memory to store the distances for each sample to its + # closer center for reallocation in case of ties + distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) + + # iterations + for i in range(max_iter): + centers_old = centers.copy() + + # labels assignment + # TODO: _labels_inertia should be done with cosine distance + # since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized + # this doesn't really matter. + labels, inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + centers, + precompute_distances=precompute_distances, + distances=distances, + ) + + # computation of the means + if sp.issparse(X): + centers = _k_means._centers_sparse( + X, sample_weight, labels, n_clusters, distances + ) + else: + centers = _k_means._centers_dense( + X.astype(np.float), + sample_weight.astype(np.float), + labels, + n_clusters, + distances.astype(np.float), + ) + + # l2-normalize centers (this is the main contibution here) + centers = normalize(centers) + + if verbose: + print("Iteration %2d, inertia %.3f" % (i, inertia)) + + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + + center_shift_total = squared_norm(centers_old - centers) + if center_shift_total <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (i, center_shift_total, tol) + ) + break + + if center_shift_total > 0: + # rerun E-step in case of non-convergence so that predicted labels + # match cluster centers + best_labels, best_inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + best_centers, + precompute_distances=precompute_distances, + distances=distances, + ) + + return best_labels, best_inertia, best_centers, i + 1 + + +def spherical_k_means( + X, + n_clusters, + sample_weight=None, + init="k-means++", + n_init=10, + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + n_jobs=1, + algorithm="auto", + return_n_iter=False, +): + """Modified from sklearn.cluster.k_means_.k_means. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% n_init + ) + random_state = check_random_state(random_state) + + if max_iter <= 0: + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % max_iter + ) + + best_inertia = np.infty + # avoid forcing order when copy_x=False + order = "C" if copy_x else None + X = check_array( + X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x + ) + # verify that the number of samples given is larger than k + if _num_samples(X) < n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters) + ) + tol = _tolerance(X, tol) + + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, order="C", copy=True) + _validate_center_shape(X, n_clusters, init) + + if n_init != 1: + warnings.warn( + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) + n_init = 1 + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if n_jobs == 1: + # For a single thread, less memory is needed if we just store one set + # of the best results (as opposed to one set per run per thread). + for it in range(n_init): + # run a k-means once + labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd( + X, + n_clusters, + sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + random_state=random_state, + ) + + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + best_n_iter = n_iter_ + else: + # parallelisation of k-means runs + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=0)( + delayed(_spherical_kmeans_single_lloyd)( + X, + n_clusters, + sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + # Change seed to ensure variety + random_state=seed, + ) + for seed in seeds + ) + + # Get results with the lowest inertia + labels, inertia, centers, n_iters = zip(*results) + best = np.argmin(inertia) + best_labels = labels[best] + best_inertia = inertia[best] + best_centers = centers[best] + best_n_iter = n_iters[best] + + if return_n_iter: + return best_centers, best_labels, best_inertia, best_n_iter + else: + return best_centers, best_labels, best_inertia + + +class SphericalKMeans(KMeans): + """Spherical K-Means clustering + + Modfication of sklearn.cluster.KMeans where cluster centers are normalized + (projected onto the sphere) in each iteration. + + Parameters + ---------- + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init : {'k-means++', 'random' or an ndarray} + Method for initialization, defaults to 'k-means++': + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. 
+ 'random': choose k observations (rows) at random from data for + the initial centroids. + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + tol : float, default: 1e-4 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + + normalize : boolean, default True + Normalize the input to have unnit norm. + + Attributes + ---------- + + cluster_centers_ : array, [n_clusters, n_features] + Coordinates of cluster centers + + labels_ : + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + """ + + def __init__( + self, + n_clusters=8, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + n_jobs=1, + verbose=0, + random_state=None, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + self.copy_x = copy_x + self.n_jobs = n_jobs + self.normalize = normalize + + def fit(self, X, y=None, sample_weight=None): + """Compute k-means clustering. + + Parameters + ---------- + + X : array-like or sparse matrix, shape=(n_samples, n_features) + + y : Ignored + not used, present here for API consistency by convention. + + sample_weight : array-like, shape (n_samples,), optional + The weights for each observation in X. 
If None, all observations + are assigned equal weight (default: None) + """ + if self.normalize: + X = normalize(X) + + random_state = check_random_state(self.random_state) + + # TODO: add check that all data is unit-normalized + + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means( + X, + n_clusters=self.n_clusters, + sample_weight=sample_weight, + init=self.init, + n_init=self.n_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self.tol, + random_state=random_state, + copy_x=self.copy_x, + n_jobs=self.n_jobs, + return_n_iter=True, + ) + + return self diff --git a/code/spherecluster/spherecluster/tests/test_common.py b/code/spherecluster/spherecluster/tests/test_common.py new file mode 100644 index 0000000..119f1c4 --- /dev/null +++ b/code/spherecluster/spherecluster/tests/test_common.py @@ -0,0 +1,6 @@ +from sklearn.utils.estimator_checks import check_estimator +from spherecluster import SphericalKMeans + + +def test_estimator_spherical_k_means(): + return check_estimator(SphericalKMeans) diff --git a/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py b/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py new file mode 100644 index 0000000..29e6d73 --- /dev/null +++ b/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py @@ -0,0 +1,205 @@ +import pytest +import scipy as sp +import numpy as np +from numpy.testing import assert_almost_equal, assert_array_equal +from spherecluster import VonMisesFisherMixture +from spherecluster import von_mises_fisher_mixture +from spherecluster import sample_vMF + + +def test_vmf_log_dense(): + """ + Test that approximation approaches whatever scipy has. + """ + n_examples = 2 + n_features = 50 + + kappas = np.linspace(2, 600, 20) + + mu = np.random.randn(n_features) + mu /= np.linalg.norm(mu) + + X = np.random.randn(n_examples, n_features) + for ee in range(n_examples): + X[ee, :] /= np.linalg.norm(X[ee, :]) + + diffs = [] + for kappa in kappas: + v = von_mises_fisher_mixture._vmf_log(X, kappa, mu) + + v_approx = von_mises_fisher_mixture._vmf_log_asymptotic(X, kappa, mu) + + normalized_approx_diff = np.linalg.norm(v - v_approx) / np.linalg.norm(v) + print(normalized_approx_diff) + diffs.append(normalized_approx_diff) + + assert diffs[0] > 10 * diffs[-1] + + +def test_vmf_log_detect_breakage(): + """ + Find where scipy approximation breaks down. + This doesn't really test anything but demonstrates where approximation + should be applied instead. 
+ """ + n_examples = 3 + kappas = [5, 30, 100, 1000, 5000] + n_features = range(2, 500) + + breakage_points = [] + for kappa in kappas: + first_breakage = None + for n_f in n_features: + mu = np.random.randn(n_f) + mu /= np.linalg.norm(mu) + + X = np.random.randn(n_examples, n_f) + for ee in range(n_examples): + X[ee, :] /= np.linalg.norm(X[ee, :]) + + try: + von_mises_fisher_mixture._vmf_log(X, kappa, mu) + except: + if first_breakage is None: + first_breakage = n_f + + breakage_points.append(first_breakage) + print( + "Scipy vmf_log breaks for kappa={} at n_features={}".format( + kappa, first_breakage + ) + ) + + print(breakage_points) + assert_array_equal(breakage_points, [141, 420, 311, 3, 3]) + + +def test_maximization(): + num_points = 5000 + n_features = 500 + posterior = np.ones((1, num_points)) + + kappas = [5000, 8000, 16400] + for kappa in kappas: + mu = np.random.randn(n_features) + mu /= np.linalg.norm(mu) + + X = sample_vMF(mu, kappa, num_points) + + centers, weights, concentrations = von_mises_fisher_mixture._maximization( + X, posterior + ) + + print("center estimate error", np.linalg.norm(centers[0, :] - mu)) + print( + "kappa estimate", + np.abs(kappa - concentrations[0]) / kappa, + kappa, + concentrations[0], + ) + + assert_almost_equal(1., weights[0]) + assert_almost_equal(0.0, np.abs(kappa - concentrations[0]) / kappa, decimal=2) + assert_almost_equal(0.0, np.linalg.norm(centers[0, :] - mu), decimal=2) + + +@pytest.mark.parametrize( + "params_in", + [ + {"posterior_type": "soft"}, + {"posterior_type": "hard"}, + {"posterior_type": "soft", "n_jobs": 2}, + {"posterior_type": "hard", "n_jobs": 3}, + {"posterior_type": "hard", "force_weights": np.ones(5) / 5.}, + {"posterior_type": "soft", "n_jobs": -1}, + ], +) +def test_integration_dense(params_in): + n_clusters = 5 + n_examples = 20 + n_features = 100 + X = np.random.randn(n_examples, n_features) + for ee in range(n_examples): + X[ee, :] /= np.linalg.norm(X[ee, :]) + + params_in.update({"n_clusters": n_clusters}) + movmf = VonMisesFisherMixture(**params_in) + movmf.fit(X) + + assert movmf.cluster_centers_.shape == (n_clusters, n_features) + assert len(movmf.concentrations_) == n_clusters + assert len(movmf.weights_) == n_clusters + assert len(movmf.labels_) == n_examples + assert len(movmf.posterior_) == n_clusters + + for center in movmf.cluster_centers_: + assert_almost_equal(np.linalg.norm(center), 1.0) + + for concentration in movmf.concentrations_: + assert concentration > 0 + + for weight in movmf.weights_: + assert not np.isnan(weight) + + plabels = movmf.predict(X) + assert_array_equal(plabels, movmf.labels_) + + ll = movmf.log_likelihood(X) + ll_labels = np.zeros(movmf.labels_.shape) + for ee in range(n_examples): + ll_labels[ee] = np.argmax(ll[:, ee]) + + assert_array_equal(ll_labels, movmf.labels_) + + +@pytest.mark.parametrize( + "params_in", + [ + {"posterior_type": "soft"}, + {"posterior_type": "hard"}, + {"posterior_type": "soft", "n_jobs": 2}, + {"posterior_type": "hard", "n_jobs": 3}, + {"posterior_type": "hard", "force_weights": np.ones(5) / 5.}, + {"posterior_type": "soft", "n_jobs": -1}, + ], +) +def test_integration_sparse(params_in): + n_clusters = 5 + n_examples = 20 + n_features = 100 + n_nonzero = 10 + X = sp.sparse.csr_matrix((n_examples, n_features)) + for ee in range(n_examples): + ridx = np.random.randint(n_features, size=(n_nonzero)) + random_values = np.random.randn(n_nonzero) + random_values = random_values / np.linalg.norm(random_values) + X[ee, ridx] = random_values + + 
params_in.update({"n_clusters": n_clusters}) + movmf = VonMisesFisherMixture(**params_in) + movmf.fit(X) + + assert movmf.cluster_centers_.shape == (n_clusters, n_features) + assert len(movmf.concentrations_) == n_clusters + assert len(movmf.weights_) == n_clusters + assert len(movmf.labels_) == n_examples + assert len(movmf.posterior_) == n_clusters + + for center in movmf.cluster_centers_: + assert_almost_equal(np.linalg.norm(center), 1.0) + + for concentration in movmf.concentrations_: + assert concentration > 0 + + for weight in movmf.weights_: + assert not np.isnan(weight) + + plabels = movmf.predict(X) + assert_array_equal(plabels, movmf.labels_) + + ll = movmf.log_likelihood(X) + ll_labels = np.zeros(movmf.labels_.shape) + for ee in range(n_examples): + ll_labels[ee] = np.argmax(ll[:, ee]) + + assert_array_equal(ll_labels, movmf.labels_) diff --git a/code/spherecluster/spherecluster/util.py b/code/spherecluster/spherecluster/util.py new file mode 100644 index 0000000..bb68261 --- /dev/null +++ b/code/spherecluster/spherecluster/util.py @@ -0,0 +1,57 @@ +""" +Generate multivariate von Mises Fisher samples. + +This solution originally appears here: +http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python + +Also see: + +Sampling from vMF on S^2: + https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf + http://www.stat.pitt.edu/sungkyu/software/randvonMisesFisher3.pdf +""" +import numpy as np + + +def sample_vMF(mu, kappa, num_samples): + """Generate num_samples N-dimensional samples from von Mises Fisher + distribution around center mu \in R^N with concentration kappa. + """ + dim = len(mu) + result = np.zeros((num_samples, dim)) + for nn in range(num_samples): + # sample offset from center (on sphere) with spread kappa + w = _sample_weight(kappa, dim) + + # sample a point v on the unit sphere that's orthogonal to mu + v = _sample_orthonormal_to(mu) + + # compute new point + result[nn, :] = v * np.sqrt(1. - w ** 2) + w * mu + + return result + + +def _sample_weight(kappa, dim): + """Rejection sampling scheme for sampling distance from center on + surface of the sphere. + """ + dim = dim - 1 # since S^{n-1} + b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) + x = (1. - b) / (1. + b) + c = kappa * x + dim * np.log(1 - x ** 2) + + while True: + z = np.random.beta(dim / 2., dim / 2.) + w = (1. - (1. + b) * z) / (1. - (1. - b) * z) + u = np.random.uniform(low=0, high=1) + if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): + return w + + +def _sample_orthonormal_to(mu): + """Sample point on sphere orthogonal to mu.""" + v = np.random.randn(mu.shape[0]) + proj_mu_v = mu * np.dot(mu, v) / np.linalg.norm(mu) + orthto = v - proj_mu_v + return orthto / np.linalg.norm(orthto) diff --git a/code/spherecluster/spherecluster/von_mises_fisher_mixture.py b/code/spherecluster/spherecluster/von_mises_fisher_mixture.py new file mode 100644 index 0000000..53c4c67 --- /dev/null +++ b/code/spherecluster/spherecluster/von_mises_fisher_mixture.py @@ -0,0 +1,946 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed +from numpy import i0 # modified Bessel function of first kind order 0, I_0 +from scipy.special import iv # modified Bessel function of first kind, I_v +from scipy.special import logsumexp + +from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin +from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape +from sklearn.metrics.pairwise import cosine_distances +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state, as_float_array +from sklearn.utils.extmath import squared_norm +from sklearn.utils.validation import FLOAT_DTYPES +from sklearn.utils.validation import check_is_fitted + +from . import spherical_kmeans + +MAX_CONTENTRATION = 1e10 + + +def _inertia_from_labels(X, centers, labels): + """Compute inertia with cosine distance using known labels. + """ + n_examples, n_features = X.shape + inertia = np.zeros((n_examples,)) + for ee in range(n_examples): + inertia[ee] = 1 - X[ee, :].dot(centers[int(labels[ee]), :].T) + + return np.sum(inertia) + + +def _labels_inertia(X, centers): + """Compute labels and inertia with cosine distance. + """ + n_examples, n_features = X.shape + n_clusters, n_features = centers.shape + + labels = np.zeros((n_examples,)) + inertia = np.zeros((n_examples,)) + + for ee in range(n_examples): + dists = np.zeros((n_clusters,)) + for cc in range(n_clusters): + dists[cc] = 1 - X[ee, :].dot(centers[cc, :].T) + + labels[ee] = np.argmin(dists) + inertia[ee] = dists[int(labels[ee])] + + return labels, np.sum(inertia) + + +def _vmf_log(X, kappa, mu): + """Computs log(vMF(X, kappa, mu)) using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + n_examples, n_features = X.shape + return np.log(_vmf_normalize(kappa, n_features) * np.exp(kappa * X.dot(mu).T)) + + +def _vmf_normalize(kappa, dim): + """Compute normalization constant using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + num = np.power(kappa, dim / 2.0 - 1.0) + + if dim / 2.0 - 1.0 < 1e-15: + denom = np.power(2.0 * np.pi, dim / 2.0) * i0(kappa) + else: + denom = np.power(2.0 * np.pi, dim / 2.0) * iv(dim / 2.0 - 1.0, kappa) + + if np.isinf(num): + raise ValueError("VMF scaling numerator was inf.") + + if np.isinf(denom): + raise ValueError("VMF scaling denominator was inf.") + + if np.abs(denom) < 1e-15: + raise ValueError("VMF scaling denominator was 0.") + + return num / denom + + +def _log_H_asymptotic(nu, kappa): + """Compute the Amos-type upper bound asymptotic approximation on H where + log(H_\nu)(\kappa) = \int_0^\kappa R_\nu(t) dt. 
+ + See "lH_asymptotic <-" in movMF.R and utility function implementation notes + from https://cran.r-project.org/web/packages/movMF/index.html + """ + beta = np.sqrt((nu + 0.5) ** 2) + kappa_l = np.min([kappa, np.sqrt((3.0 * nu + 11.0 / 2.0) * (nu + 3.0 / 2.0))]) + return _S(kappa, nu + 0.5, beta) + ( + _S(kappa_l, nu, nu + 2.0) - _S(kappa_l, nu + 0.5, beta) + ) + + +def _S(kappa, alpha, beta): + """Compute the antiderivative of the Amos-type bound G on the modified + Bessel function ratio. + + Note: Handles scalar kappa, alpha, and beta only. + + See "S <-" in movMF.R and utility function implementation notes from + https://cran.r-project.org/web/packages/movMF/index.html + """ + kappa = 1.0 * np.abs(kappa) + alpha = 1.0 * alpha + beta = 1.0 * np.abs(beta) + a_plus_b = alpha + beta + u = np.sqrt(kappa ** 2 + beta ** 2) + if alpha == 0: + alpha_scale = 0 + else: + alpha_scale = alpha * np.log((alpha + u) / a_plus_b) + + return u - beta - alpha_scale + + +def _vmf_log_asymptotic(X, kappa, mu): + """Compute log(f(x|theta)) via Amos approximation + + log(f(x|theta)) = theta' x - log(H_{d/2-1})(\|theta\|) + + where theta = kappa * X, \|theta\| = kappa. + + Computing _vmf_log helps with numerical stability / loss of precision for + for large values of kappa and n_features. + + See utility function implementation notes in movMF.R from + https://cran.r-project.org/web/packages/movMF/index.html + """ + n_examples, n_features = X.shape + log_vfm = kappa * X.dot(mu).T + -_log_H_asymptotic(n_features / 2.0 - 1.0, kappa) + + return log_vfm + + +def _log_likelihood(X, centers, weights, concentrations): + if len(np.shape(X)) != 2: + X = X.reshape((1, len(X))) + + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + return posterior + + +def _init_unit_centers(X, n_clusters, random_state, init): + """Initializes unit norm centers. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + init: (string) one of + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
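+
+    Examples
+    --------
+    A minimal sketch of the expected call (values are illustrative only):
+
+    >>> X = np.random.randn(100, 10)
+    >>> X = X / np.linalg.norm(X, axis=1, keepdims=True)
+    >>> centers = _init_unit_centers(X, 3, np.random.RandomState(0), "random")
+    >>> centers.shape
+    (3, 10)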
+ """ + n_examples, n_features = np.shape(X) + if isinstance(init, np.ndarray): + n_init_clusters, n_init_features = init.shape + assert n_init_clusters == n_clusters + assert n_init_features == n_features + + # ensure unit normed centers + centers = init + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "spherical-k-means": + labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd( + X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++" + ) + + return centers + + elif init == "random": + centers = np.random.randn(n_clusters, n_features) + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "k-means++": + centers = _init_centroids( + X, + n_clusters, + "k-means++", + random_state=random_state, + x_squared_norms=np.ones((n_examples,)), + ) + + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "random-orthonormal": + centers = np.random.randn(n_clusters, n_features) + q, r = np.linalg.qr(centers.T, mode="reduced") + + return q.T + + elif init == "random-class": + centers = np.zeros((n_clusters, n_features)) + for cc in range(n_clusters): + while np.linalg.norm(centers[cc, :]) == 0: + labels = np.random.randint(0, n_clusters, n_examples) + centers[cc, :] = X[labels == cc, :].sum(axis=0) + + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + +def _expectation(X, centers, weights, concentrations, posterior_type="soft"): + """Compute the log-likelihood of each datapoint being in each cluster. + + Parameters + ---------- + centers (mu) : array, [n_centers x n_features] + weights (alpha) : array, [n_centers, ] (alpha) + concentrations (kappa) : array, [n_centers, ] + + Returns + ---------- + posterior : array, [n_centers, n_examples] + """ + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + if posterior_type == "soft": + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + elif posterior_type == "hard": + weights_log = np.log(weights) + weighted_f_log = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[np.argmax(weighted_f_log[:, ee]), ee] = 1.0 + + return posterior + + +def _maximization(X, posterior, force_weights=None): + """Estimate new centers, weights, and concentrations from + + Parameters + ---------- + posterior : array, [n_centers, n_examples] + The posterior matrix from the expectation step. + + force_weights : None or array, [n_centers, ] + If None is passed, will estimate weights. + If an array is passed, will use instead of estimating. 
+ + Returns + ---------- + centers (mu) : array, [n_centers x n_features] + weights (alpha) : array, [n_centers, ] (alpha) + concentrations (kappa) : array, [n_centers, ] + """ + n_examples, n_features = X.shape + n_clusters, n_examples = posterior.shape + concentrations = np.zeros((n_clusters,)) + centers = np.zeros((n_clusters, n_features)) + if force_weights is None: + weights = np.zeros((n_clusters,)) + + for cc in range(n_clusters): + # update weights (alpha) + if force_weights is None: + weights[cc] = np.mean(posterior[cc, :]) + else: + weights = force_weights + + # update centers (mu) + X_scaled = X.copy() + if sp.issparse(X): + X_scaled.data *= posterior[cc, :].repeat(np.diff(X_scaled.indptr)) + else: + for ee in range(n_examples): + X_scaled[ee, :] *= posterior[cc, ee] + + centers[cc, :] = X_scaled.sum(axis=0) + + # normalize centers + center_norm = np.linalg.norm(centers[cc, :]) + if center_norm > 1e-8: + centers[cc, :] = centers[cc, :] / center_norm + + # update concentration (kappa) [TODO: add other kappa approximations] + rbar = center_norm / (n_examples * weights[cc]) + concentrations[cc] = rbar * n_features - np.power(rbar, 3.0) + if np.abs(rbar - 1.0) < 1e-10: + concentrations[cc] = MAX_CONTENTRATION + else: + concentrations[cc] /= 1.0 - np.power(rbar, 2.0) + + # let python know we can free this (good for large dense X) + del X_scaled + + return centers, weights, concentrations + + +def _movMF( + X, + n_clusters, + posterior_type="soft", + force_weights=None, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, +): + """Mixture of von Mises Fisher clustering. + + Implements the algorithms (i) and (ii) from + + "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" + by Banerjee, Dhillon, Ghosh, and Sra. + + TODO: Currently only supports Banerjee et al 2005 approximation of kappa, + however, there are numerous other approximations see _update_params. + + Attribution + ---------- + Approximation of log-vmf distribution function from movMF R-package. + + movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions + by Kurt Hornik, Bettina Grun, 2014 + + Find more at: + https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf + https://cran.r-project.org/web/packages/movMF/index.html + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + posterior_type: 'soft' or 'hard' + Type of posterior computed in exepectation step. + See note about attribute: self.posterior_ + + force_weights : None or array [n_clusters, ] + If None, the algorithm will estimate the weights. + If an array of weights, algorithm will estimate concentrations and + centers with given weights. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init: (string) one of + random-class [default]: random class assignment & centroid computation + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
+ + tol : float, default: 1e-6 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + """ + random_state = check_random_state(random_state) + n_examples, n_features = np.shape(X) + + # init centers (mus) + centers = _init_unit_centers(X, n_clusters, random_state, init) + + # init weights (alphas) + if force_weights is None: + weights = np.ones((n_clusters,)) + weights = weights / np.sum(weights) + else: + weights = force_weights + + # init concentrations (kappas) + concentrations = np.ones((n_clusters,)) + + if verbose: + print("Initialization complete") + + for iter in range(max_iter): + centers_prev = centers.copy() + + # expectation step + posterior = _expectation( + X, centers, weights, concentrations, posterior_type=posterior_type + ) + + # maximization step + centers, weights, concentrations = _maximization( + X, posterior, force_weights=force_weights + ) + + # check convergence + tolcheck = squared_norm(centers_prev - centers) + if tolcheck <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (iter, tolcheck, tol) + ) + break + + # labels come for free via posterior + labels = np.zeros((n_examples,)) + for ee in range(n_examples): + labels[ee] = np.argmax(posterior[:, ee]) + + inertia = _inertia_from_labels(X, centers, labels) + + return centers, weights, concentrations, posterior, labels, inertia + + +def movMF( + X, + n_clusters, + posterior_type="soft", + force_weights=None, + n_init=10, + n_jobs=1, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, + copy_x=True, +): + """Wrapper for parallelization of _movMF and running n_init times. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% n_init + ) + random_state = check_random_state(random_state) + + if max_iter <= 0: + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % max_iter + ) + + best_inertia = np.infty + X = as_float_array(X, copy=copy_x) + tol = _tolerance(X, tol) + + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, copy=True) + _validate_center_shape(X, n_clusters, init) + + if n_init != 1: + warnings.warn( + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) + n_init = 1 + + # defaults + best_centers = None + best_labels = None + best_weights = None + best_concentrations = None + best_posterior = None + best_inertia = None + + if n_jobs == 1: + # For a single thread, less memory is needed if we just store one set + # of the best results (as opposed to one set per run per thread). + for it in range(n_init): + # cluster on the sphere + (centers, weights, concentrations, posterior, labels, inertia) = _movMF( + X, + n_clusters, + posterior_type=posterior_type, + force_weights=force_weights, + max_iter=max_iter, + verbose=verbose, + init=init, + random_state=random_state, + tol=tol, + ) + + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_centers = centers.copy() + best_labels = labels.copy() + best_weights = weights.copy() + best_concentrations = concentrations.copy() + best_posterior = posterior.copy() + best_inertia = inertia + else: + # parallelisation of movMF runs + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=0)( + delayed(_movMF)( + X, + n_clusters, + posterior_type=posterior_type, + force_weights=force_weights, + max_iter=max_iter, + verbose=verbose, + init=init, + random_state=random_state, + tol=tol, + ) + for seed in seeds + ) + + # Get results with the lowest inertia + centers, weights, concentrations, posteriors, labels, inertia = zip(*results) + best = np.argmin(inertia) + best_labels = labels[best] + best_inertia = inertia[best] + best_centers = centers[best] + best_concentrations = concentrations[best] + best_posterior = posteriors[best] + best_weights = weights[best] + + return ( + best_centers, + best_labels, + best_inertia, + best_weights, + best_concentrations, + best_posterior, + ) + + +class VonMisesFisherMixture(BaseEstimator, ClusterMixin, TransformerMixin): + """Estimator for Mixture of von Mises Fisher clustering on the unit sphere. + + Implements the algorithms (i) and (ii) from + + "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" + by Banerjee, Dhillon, Ghosh, and Sra. + + TODO: Currently only supports Banerjee et al 2005 approximation of kappa, + however, there are numerous other approximations see _update_params. + + Attribution + ---------- + Approximation of log-vmf distribution function from movMF R-package. + + movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions + by Kurt Hornik, Bettina Grun, 2014 + + Find more at: + https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf + https://cran.r-project.org/web/packages/movMF/index.html + + Basic sklearn scaffolding from sklearn.cluster.KMeans. + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + posterior_type: 'soft' or 'hard' + Type of posterior computed in exepectation step. 
+ See note about attribute: self.posterior_ + + force_weights : None or array [n_clusters, ] + If None, the algorithm will estimate the weights. + If an array of weights, algorithm will estimate concentrations and + centers with given weights. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init: (string) one of + random-class [default]: random class assignment & centroid computation + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + tol : float, default: 1e-6 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + + normalize : boolean, default True + Normalize the input to have unnit norm. + + Attributes + ---------- + + cluster_centers_ : array, [n_clusters, n_features] + Coordinates of cluster centers + + labels_ : + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + + weights_ : array, [n_clusters,] + Weights of each cluster in vMF distribution (alpha). + + concentrations_ : array [n_clusters,] + Concentration parameter for each cluster (kappa). + Larger values correspond to more concentrated clusters. + + posterior_ : array, [n_clusters, n_examples] + Each column corresponds to the posterio distribution for and example. + + If posterior_type='hard' is used, there will only be one non-zero per + column, its index corresponding to the example's cluster label. + + If posterior_type='soft' is used, this matrix will be dense and the + column values correspond to soft clustering weights. 
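+
+    Examples
+    --------
+    A minimal usage sketch (shapes and values are illustrative, not taken from
+    the original package documentation):
+
+    >>> import numpy as np
+    >>> X = np.random.randn(50, 20)
+    >>> X = X / np.linalg.norm(X, axis=1, keepdims=True)
+    >>> vmf = VonMisesFisherMixture(n_clusters=3, posterior_type='soft')
+    >>> labels = vmf.fit_predict(X)
+    >>> vmf.cluster_centers_.shape
+    (3, 20)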
+ """ + + def __init__( + self, + n_clusters=5, + posterior_type="soft", + force_weights=None, + n_init=10, + n_jobs=1, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.posterior_type = posterior_type + self.force_weights = force_weights + self.n_init = n_init + self.n_jobs = n_jobs + self.max_iter = max_iter + self.verbose = verbose + self.init = init + self.random_state = random_state + self.tol = tol + self.copy_x = copy_x + self.normalize = normalize + + def _check_force_weights(self): + if self.force_weights is None: + return + + if len(self.force_weights) != self.n_clusters: + raise ValueError( + ( + "len(force_weights)={} but must equal " + "n_clusters={}".format(len(self.force_weights), self.n_clusters) + ) + ) + + def _check_fit_data(self, X): + """Verify that the number of samples given is larger than k""" + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + n_samples, n_features = X.shape + if X.shape[0] < self.n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" + % (X.shape[0], self.n_clusters) + ) + + for ee in range(n_samples): + if sp.issparse(X): + n = sp.linalg.norm(X[ee, :]) + else: + n = np.linalg.norm(X[ee, :]) + + if np.abs(n - 1.0) > 1e-4: + raise ValueError("Data l2-norm must be 1, found {}".format(n)) + + return X + + def _check_test_data(self, X): + X = check_array(X, accept_sparse="csr", dtype=FLOAT_DTYPES) + n_samples, n_features = X.shape + expected_n_features = self.cluster_centers_.shape[1] + if not n_features == expected_n_features: + raise ValueError( + "Incorrect number of features. " + "Got %d features, expected %d" % (n_features, expected_n_features) + ) + + for ee in range(n_samples): + if sp.issparse(X): + n = sp.linalg.norm(X[ee, :]) + else: + n = np.linalg.norm(X[ee, :]) + + if np.abs(n - 1.0) > 1e-4: + raise ValueError("Data l2-norm must be 1, found {}".format(n)) + + return X + + def fit(self, X, y=None): + """Compute mixture of von Mises Fisher clustering. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + """ + if self.normalize: + X = normalize(X) + + self._check_force_weights() + random_state = check_random_state(self.random_state) + X = self._check_fit_data(X) + + ( + self.cluster_centers_, + self.labels_, + self.inertia_, + self.weights_, + self.concentrations_, + self.posterior_, + ) = movMF( + X, + self.n_clusters, + posterior_type=self.posterior_type, + force_weights=self.force_weights, + n_init=self.n_init, + n_jobs=self.n_jobs, + max_iter=self.max_iter, + verbose=self.verbose, + init=self.init, + random_state=random_state, + tol=self.tol, + copy_x=self.copy_x, + ) + + return self + + def fit_predict(self, X, y=None): + """Compute cluster centers and predict cluster index for each sample. + Convenience method; equivalent to calling fit(X) followed by + predict(X). + """ + return self.fit(X).labels_ + + def fit_transform(self, X, y=None): + """Compute clustering and transform X to cluster-distance space. + Equivalent to fit(X).transform(X), but more efficiently implemented. + """ + # Currently, this just skips a copy of the data if it is not in + # np.array or CSR format already. + # XXX This skips _check_test_data, which may change the dtype; + # we should refactor the input validation. + return self.fit(X)._transform(X) + + def transform(self, X, y=None): + """Transform X to a cluster-distance space. 
+ In the new space, each dimension is the cosine distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to transform. + + Returns + ------- + X_new : array, shape [n_samples, k] + X transformed in the new space. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + X = self._check_test_data(X) + return self._transform(X) + + def _transform(self, X): + """guts of transform method; no input validation""" + return cosine_distances(X, self.cluster_centers_) + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Note: Does not check that each point is on the sphere. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to predict. + + Returns + ------- + labels : array, shape [n_samples,] + Index of the cluster each sample belongs to. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + + X = self._check_test_data(X) + return _labels_inertia(X, self.cluster_centers_)[0] + + def score(self, X, y=None): + """Inertia score (sum of all distances to closest cluster). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data. + + Returns + ------- + score : float + Larger score is better. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + X = self._check_test_data(X) + return -_labels_inertia(X, self.cluster_centers_)[1] + + def log_likelihood(self, X): + check_is_fitted(self) + + return _log_likelihood( + X, self.cluster_centers_, self.weights_, self.concentrations_ + ) diff --git a/code/spherecluster/spherical_kmeans.py b/code/spherecluster/spherical_kmeans.py new file mode 100644 index 0000000..882eba1 --- /dev/null +++ b/code/spherecluster/spherical_kmeans.py @@ -0,0 +1,424 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed + +from sklearn.cluster import KMeans + +from sklearn.cluster import _k_means_fast as _k_means +from sklearn.cluster import _k_means_lloyd +from sklearn.cluster._kmeans import ( + _check_sample_weight, + _labels_inertia, + _tolerance, +) +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state +from sklearn.utils.extmath import row_norms, squared_norm +from sklearn.utils.validation import _num_samples +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + + +def _spherical_kmeans_single_lloyd( + X, + n_clusters, + centers_init, + sample_weight=None, + max_iter=300, + init="k-means++", + verbose=False, + x_squared_norms=None, + random_state=None, + tol=1e-4, + n_threads=1, +): + """ + Modified from sklearn.cluster.k_means_.k_means_single_lloyd. + """ + random_state = check_random_state(random_state) + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + best_labels, best_inertia, best_centers = None, None, None + + # Allocate memory to store the distances for each sample to its + # closer center for reallocation in case of ties + distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) + + # Buffers to avoid new allocations at each iteration. 
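+    # centers / centers_new are swapped on each Lloyd iteration, while labels,
+    # weight_in_clusters and center_shift are reused, so no per-iteration
+    # allocations are required.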
+ centers = centers_init + centers_new = np.zeros_like(centers) + labels = np.full(X.shape[0], -1, dtype=np.int32) + labels_old = labels.copy() + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + center_shift = np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + lloyd_iter = _k_means_lloyd.lloyd_iter_chunked_sparse + _inertia = _k_means._inertia_sparse + else: + lloyd_iter = _k_means_lloyd.lloyd_iter_chunked_dense + _inertia = _k_means._inertia_dense + + # iterations + for i in range(max_iter): + centers_old = centers.copy() + + # labels assignment + # TODO: _labels_inertia should be done with cosine distance + # since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized + # this doesn't really matter. + labels, inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + centers, + ) + lloyd_iter(X, sample_weight, x_squared_norms, centers, centers_new, + weight_in_clusters, labels, center_shift, n_threads) + + centers, centers_new = centers_new, centers + # l2-normalize centers (this is the main contibution here) + centers = normalize(centers) + + if verbose: + print("Iteration %2d, inertia %.3f" % (i, inertia)) + + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + + center_shift_total = squared_norm(centers_old - centers) + if center_shift_total <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (i, center_shift_total, tol) + ) + break + + if center_shift_total > 0: + # rerun E-step in case of non-convergence so that predicted labels + # match cluster centers + best_labels, best_inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + best_centers, + ) + + return best_labels, best_inertia, best_centers, i + 1 + + +def spherical_k_means( + X, + n_clusters, + centers_init, + sample_weight=None, + init="k-means++", + n_init=10, + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + n_jobs=1, + algorithm="auto", + return_n_iter=False, +): + """Modified from sklearn.cluster.k_means_.k_means. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." % n_init + ) + random_state = check_random_state(random_state) + + if max_iter <= 0: + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % max_iter + ) + + best_inertia = np.infty + # avoid forcing order when copy_x=False + order = "C" if copy_x else None + X = check_array( + X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x + ) + # verify that the number of samples given is larger than k + if _num_samples(X) < n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters) + ) + tol = _tolerance(X, tol) + + if hasattr(init, "__array__"): + if n_init != 1: + warnings.warn( + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) + n_init = 1 + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if n_jobs == 1: + # For a single thread, less memory is needed if we just store one set + # of the best results (as opposed to one set per run per thread). 
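+        # Each of the n_init restarts below starts from the same centers_init;
+        # only the run with the lowest inertia is kept.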
+ for it in range(n_init): + # run a k-means once + labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd( + X, + n_clusters, + centers_init=centers_init, + sample_weight=sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + random_state=random_state, + ) + + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + best_n_iter = n_iter_ + else: + # parallelisation of k-means runs + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=0)( + delayed(_spherical_kmeans_single_lloyd)( + X, + n_clusters, + sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + # Change seed to ensure variety + random_state=seed, + ) + for seed in seeds + ) + + # Get results with the lowest inertia + labels, inertia, centers, n_iters = zip(*results) + best = np.argmin(inertia) + best_labels = labels[best] + best_inertia = inertia[best] + best_centers = centers[best] + best_n_iter = n_iters[best] + + if return_n_iter: + return best_centers, best_labels, best_inertia, best_n_iter + else: + return best_centers, best_labels, best_inertia + + +class SphericalKMeans(KMeans): + """Spherical K-Means clustering + + Modfication of sklearn.cluster.KMeans where cluster centers are normalized + (projected onto the sphere) in each iteration. + + Parameters + ---------- + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init : {'k-means++', 'random' or an ndarray} + Method for initialization, defaults to 'k-means++': + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + 'random': choose k observations (rows) at random from data for + the initial centroids. + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + tol : float, default: 1e-4 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. 
If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + + normalize : boolean, default True + Normalize the input to have unnit norm. + + Attributes + ---------- + + cluster_centers_ : array, [n_clusters, n_features] + Coordinates of cluster centers + + labels_ : + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + """ + + def __init__( + self, + n_clusters=8, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + n_jobs=1, + verbose=0, + random_state=None, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + self.copy_x = copy_x + self.n_jobs = n_jobs + self.normalize = normalize + + def _check_params(self, X): + # n_jobs + if self.n_jobs != 'deprecated': + warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" + " removed in 1.0 (renaming of 0.25).", FutureWarning) + self._n_threads = self.n_jobs + else: + self._n_threads = None + self._n_threads = _openmp_effective_n_threads(self._n_threads) + + # n_init + if self.n_init <= 0: + raise ValueError( + f"n_init should be > 0, got {self.n_init} instead.") + self._n_init = self.n_init + + # max_iter + if self.max_iter <= 0: + raise ValueError( + f"max_iter should be > 0, got {self.max_iter} instead.") + + # n_clusters + if X.shape[0] < self.n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={self.n_clusters}.") + + # tol + self._tol = _tolerance(X, self.tol) + + # init + if not (hasattr(self.init, '__array__') or callable(self.init) + or (isinstance(self.init, str) + and self.init in ["k-means++", "random"])): + raise ValueError( + f"init should be either 'k-means++', 'random', a ndarray or a " + f"callable, got '{self.init}' instead.") + + if hasattr(self.init, '__array__') and self._n_init != 1: + warnings.warn( + f"Explicit initial center position passed: performing only" + f" one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2) + self._n_init = 1 + + + def fit(self, X, y=None, sample_weight=None): + """Compute k-means clustering. + + Parameters + ---------- + + X : array-like or sparse matrix, shape=(n_samples, n_features) + + y : Ignored + not used, present here for API consistency by convention. + + sample_weight : array-like, shape (n_samples,), optional + The weights for each observation in X. 
If None, all observations + are assigned equal weight (default: None) + """ + if self.normalize: + X = normalize(X) + self._check_params(X) + + random_state = check_random_state(self.random_state) + + # Validate init array + init = self.init + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, order="C", copy=True) + self._validate_center_shape(X, init) + + # TODO: add check that all data is unit-normalized + x_squared_norms = row_norms(X, squared=True) + centers_init = self._init_centroids( + X, + init=self.init, + random_state=random_state, + x_squared_norms=x_squared_norms + ) + + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means( + X, + n_clusters=self.n_clusters, + centers_init=centers_init, + sample_weight=sample_weight, + init=self.init, + n_init=self.n_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self.tol, + random_state=random_state, + copy_x=self.copy_x, + n_jobs=self.n_jobs, + return_n_iter=True, + ) + + return self diff --git a/code/spherecluster/util.py b/code/spherecluster/util.py new file mode 100644 index 0000000..bb68261 --- /dev/null +++ b/code/spherecluster/util.py @@ -0,0 +1,57 @@ +""" +Generate multivariate von Mises Fisher samples. + +This solution originally appears here: +http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python + +Also see: + +Sampling from vMF on S^2: + https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf + http://www.stat.pitt.edu/sungkyu/software/randvonMisesFisher3.pdf +""" +import numpy as np + + +def sample_vMF(mu, kappa, num_samples): + """Generate num_samples N-dimensional samples from von Mises Fisher + distribution around center mu \in R^N with concentration kappa. + """ + dim = len(mu) + result = np.zeros((num_samples, dim)) + for nn in range(num_samples): + # sample offset from center (on sphere) with spread kappa + w = _sample_weight(kappa, dim) + + # sample a point v on the unit sphere that's orthogonal to mu + v = _sample_orthonormal_to(mu) + + # compute new point + result[nn, :] = v * np.sqrt(1. - w ** 2) + w * mu + + return result + + +def _sample_weight(kappa, dim): + """Rejection sampling scheme for sampling distance from center on + surface of the sphere. + """ + dim = dim - 1 # since S^{n-1} + b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) + x = (1. - b) / (1. + b) + c = kappa * x + dim * np.log(1 - x ** 2) + + while True: + z = np.random.beta(dim / 2., dim / 2.) + w = (1. - (1. + b) * z) / (1. - (1. - b) * z) + u = np.random.uniform(low=0, high=1) + if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): + return w + + +def _sample_orthonormal_to(mu): + """Sample point on sphere orthogonal to mu.""" + v = np.random.randn(mu.shape[0]) + proj_mu_v = mu * np.dot(mu, v) / np.linalg.norm(mu) + orthto = v - proj_mu_v + return orthto / np.linalg.norm(orthto) diff --git a/code/spherecluster/von_mises_fisher_mixture.py b/code/spherecluster/von_mises_fisher_mixture.py new file mode 100644 index 0000000..7193f35 --- /dev/null +++ b/code/spherecluster/von_mises_fisher_mixture.py @@ -0,0 +1,946 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed +from numpy import i0 # modified Bessel function of first kind order 0, I_0 +from scipy.special import iv # modified Bessel function of first kind, I_v +from scipy.special import logsumexp + + +from sklearn.cluster import KMeans +from sklearn.cluster._kmeans import _tolerance, _kmeans_plusplus +from sklearn.metrics.pairwise import cosine_distances +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state, as_float_array +from sklearn.utils.extmath import squared_norm +from sklearn.utils.validation import FLOAT_DTYPES +from sklearn.utils.validation import check_is_fitted +from . import spherical_kmeans +# _init_centroids + +MAX_CONTENTRATION = 1e10 + + +def _inertia_from_labels(X, centers, labels): + """Compute inertia with cosine distance using known labels. + """ + n_examples, n_features = X.shape + inertia = np.zeros((n_examples,)) + for ee in range(n_examples): + inertia[ee] = 1 - X[ee, :].dot(centers[int(labels[ee]), :].T) + + return np.sum(inertia) + + +def _labels_inertia(X, centers): + """Compute labels and inertia with cosine distance. + """ + n_examples, n_features = X.shape + n_clusters, n_features = centers.shape + + labels = np.zeros((n_examples,)) + inertia = np.zeros((n_examples,)) + + for ee in range(n_examples): + dists = np.zeros((n_clusters,)) + for cc in range(n_clusters): + dists[cc] = 1 - X[ee, :].dot(centers[cc, :].T) + + labels[ee] = np.argmin(dists) + inertia[ee] = dists[int(labels[ee])] + + return labels, np.sum(inertia) + + +def _vmf_log(X, kappa, mu): + """Computs log(vMF(X, kappa, mu)) using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + n_examples, n_features = X.shape + return np.log(_vmf_normalize(kappa, n_features) * np.exp(kappa * X.dot(mu).T)) + + +def _vmf_normalize(kappa, dim): + """Compute normalization constant using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + num = np.power(kappa, dim / 2.0 - 1.0) + + if dim / 2.0 - 1.0 < 1e-15: + denom = np.power(2.0 * np.pi, dim / 2.0) * i0(kappa) + else: + denom = np.power(2.0 * np.pi, dim / 2.0) * iv(dim / 2.0 - 1.0, kappa) + + if np.isinf(num): + raise ValueError("VMF scaling numerator was inf.") + + if np.isinf(denom): + raise ValueError("VMF scaling denominator was inf.") + + if np.abs(denom) < 1e-15: + raise ValueError("VMF scaling denominator was 0.") + + return num / denom + + +def _log_H_asymptotic(nu, kappa): + """Compute the Amos-type upper bound asymptotic approximation on H where + log(H_\nu)(\kappa) = \int_0^\kappa R_\nu(t) dt. 
+ + See "lH_asymptotic <-" in movMF.R and utility function implementation notes + from https://cran.r-project.org/web/packages/movMF/index.html + """ + beta = np.sqrt((nu + 0.5) ** 2) + kappa_l = np.min([kappa, np.sqrt((3.0 * nu + 11.0 / 2.0) * (nu + 3.0 / 2.0))]) + return _S(kappa, nu + 0.5, beta) + ( + _S(kappa_l, nu, nu + 2.0) - _S(kappa_l, nu + 0.5, beta) + ) + + +def _S(kappa, alpha, beta): + """Compute the antiderivative of the Amos-type bound G on the modified + Bessel function ratio. + + Note: Handles scalar kappa, alpha, and beta only. + + See "S <-" in movMF.R and utility function implementation notes from + https://cran.r-project.org/web/packages/movMF/index.html + """ + kappa = 1.0 * np.abs(kappa) + alpha = 1.0 * alpha + beta = 1.0 * np.abs(beta) + a_plus_b = alpha + beta + u = np.sqrt(kappa ** 2 + beta ** 2) + if alpha == 0: + alpha_scale = 0 + else: + alpha_scale = alpha * np.log((alpha + u) / a_plus_b) + + return u - beta - alpha_scale + + +def _vmf_log_asymptotic(X, kappa, mu): + """Compute log(f(x|theta)) via Amos approximation + + log(f(x|theta)) = theta' x - log(H_{d/2-1})(\|theta\|) + + where theta = kappa * X, \|theta\| = kappa. + + Computing _vmf_log helps with numerical stability / loss of precision for + for large values of kappa and n_features. + + See utility function implementation notes in movMF.R from + https://cran.r-project.org/web/packages/movMF/index.html + """ + n_examples, n_features = X.shape + log_vfm = kappa * X.dot(mu).T + -_log_H_asymptotic(n_features / 2.0 - 1.0, kappa) + + return log_vfm + + +def _log_likelihood(X, centers, weights, concentrations): + if len(np.shape(X)) != 2: + X = X.reshape((1, len(X))) + + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + return posterior + + +def _init_unit_centers(X, n_clusters, random_state, init): + """Initializes unit norm centers. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + init: (string) one of + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
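+
+    Note: the string "random-class" (random class assignment followed by a
+    per-cluster mean) is also accepted by this function; it is the default
+    init used by _movMF.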
+ """ + n_examples, n_features = np.shape(X) + if isinstance(init, np.ndarray): + n_init_clusters, n_init_features = init.shape + assert n_init_clusters == n_clusters + assert n_init_features == n_features + + # ensure unit normed centers + centers = init + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "spherical-k-means": + labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd( + X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++" + ) + + return centers + + elif init == "random": + centers = np.random.randn(n_clusters, n_features) + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "k-means++": + centers, _ = _kmeans_plusplus(X, n_clusters, + random_state=random_state, + x_squared_norms=np.ones((n_examples,))) + + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "random-orthonormal": + centers = np.random.randn(n_clusters, n_features) + q, r = np.linalg.qr(centers.T, mode="reduced") + + return q.T + + elif init == "random-class": + centers = np.zeros((n_clusters, n_features)) + for cc in range(n_clusters): + while np.linalg.norm(centers[cc, :]) == 0: + labels = np.random.randint(0, n_clusters, n_examples) + centers[cc, :] = X[labels == cc, :].sum(axis=0) + + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + +def _expectation(X, centers, weights, concentrations, posterior_type="soft"): + """Compute the log-likelihood of each datapoint being in each cluster. + + Parameters + ---------- + centers (mu) : array, [n_centers x n_features] + weights (alpha) : array, [n_centers, ] (alpha) + concentrations (kappa) : array, [n_centers, ] + + Returns + ---------- + posterior : array, [n_centers, n_examples] + """ + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + if posterior_type == "soft": + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + elif posterior_type == "hard": + weights_log = np.log(weights) + weighted_f_log = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[np.argmax(weighted_f_log[:, ee]), ee] = 1.0 + + return posterior + + +def _maximization(X, posterior, force_weights=None): + """Estimate new centers, weights, and concentrations from + + Parameters + ---------- + posterior : array, [n_centers, n_examples] + The posterior matrix from the expectation step. + + force_weights : None or array, [n_centers, ] + If None is passed, will estimate weights. + If an array is passed, will use instead of estimating. 
+ + Returns + ---------- + centers (mu) : array, [n_centers x n_features] + weights (alpha) : array, [n_centers, ] (alpha) + concentrations (kappa) : array, [n_centers, ] + """ + n_examples, n_features = X.shape + n_clusters, n_examples = posterior.shape + concentrations = np.zeros((n_clusters,)) + centers = np.zeros((n_clusters, n_features)) + if force_weights is None: + weights = np.zeros((n_clusters,)) + + for cc in range(n_clusters): + # update weights (alpha) + if force_weights is None: + weights[cc] = np.mean(posterior[cc, :]) + else: + weights = force_weights + + # update centers (mu) + X_scaled = X.copy() + if sp.issparse(X): + X_scaled.data *= posterior[cc, :].repeat(np.diff(X_scaled.indptr)) + else: + for ee in range(n_examples): + X_scaled[ee, :] *= posterior[cc, ee] + + centers[cc, :] = X_scaled.sum(axis=0) + + # normalize centers + center_norm = np.linalg.norm(centers[cc, :]) + if center_norm > 1e-8: + centers[cc, :] = centers[cc, :] / center_norm + + # update concentration (kappa) [TODO: add other kappa approximations] + rbar = center_norm / (n_examples * weights[cc]) + concentrations[cc] = rbar * n_features - np.power(rbar, 3.0) + if np.abs(rbar - 1.0) < 1e-10: + concentrations[cc] = MAX_CONTENTRATION + else: + concentrations[cc] /= 1.0 - np.power(rbar, 2.0) + + # let python know we can free this (good for large dense X) + del X_scaled + + return centers, weights, concentrations + + +def _movMF( + X, + n_clusters, + posterior_type="soft", + force_weights=None, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, +): + """Mixture of von Mises Fisher clustering. + + Implements the algorithms (i) and (ii) from + + "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" + by Banerjee, Dhillon, Ghosh, and Sra. + + TODO: Currently only supports Banerjee et al 2005 approximation of kappa, + however, there are numerous other approximations see _update_params. + + Attribution + ---------- + Approximation of log-vmf distribution function from movMF R-package. + + movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions + by Kurt Hornik, Bettina Grun, 2014 + + Find more at: + https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf + https://cran.r-project.org/web/packages/movMF/index.html + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + posterior_type: 'soft' or 'hard' + Type of posterior computed in exepectation step. + See note about attribute: self.posterior_ + + force_weights : None or array [n_clusters, ] + If None, the algorithm will estimate the weights. + If an array of weights, algorithm will estimate concentrations and + centers with given weights. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init: (string) one of + random-class [default]: random class assignment & centroid computation + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
+ + tol : float, default: 1e-6 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + """ + random_state = check_random_state(random_state) + n_examples, n_features = np.shape(X) + + # init centers (mus) + centers = _init_unit_centers(X, n_clusters, random_state, init) + + # init weights (alphas) + if force_weights is None: + weights = np.ones((n_clusters,)) + weights = weights / np.sum(weights) + else: + weights = force_weights + + # init concentrations (kappas) + concentrations = np.ones((n_clusters,)) + + if verbose: + print("Initialization complete") + + for iter in range(max_iter): + centers_prev = centers.copy() + + # expectation step + posterior = _expectation( + X, centers, weights, concentrations, posterior_type=posterior_type + ) + + # maximization step + centers, weights, concentrations = _maximization( + X, posterior, force_weights=force_weights + ) + + # check convergence + tolcheck = squared_norm(centers_prev - centers) + if tolcheck <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (iter, tolcheck, tol) + ) + break + + # labels come for free via posterior + labels = np.zeros((n_examples,)) + for ee in range(n_examples): + labels[ee] = np.argmax(posterior[:, ee]) + + inertia = _inertia_from_labels(X, centers, labels) + + return centers, weights, concentrations, posterior, labels, inertia + + +def movMF( + X, + n_clusters, + posterior_type="soft", + force_weights=None, + n_init=10, + n_jobs=1, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, + copy_x=True, +): + """Wrapper for parallelization of _movMF and running n_init times. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% n_init
+        )
+    random_state = check_random_state(random_state)
+
+    if max_iter <= 0:
+        raise ValueError(
+            "Number of iterations should be a positive number,"
+            " got %d instead" % max_iter
+        )
+
+    best_inertia = np.infty
+    X = as_float_array(X, copy=copy_x)
+    tol = _tolerance(X, tol)
+
+    if hasattr(init, "__array__"):
+        if n_init != 1:
+            warnings.warn(
+                "Explicit initial center position passed: "
+                "performing only one init in k-means instead of n_init=%d" % n_init,
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            n_init = 1
+
+    # defaults
+    best_centers = None
+    best_labels = None
+    best_weights = None
+    best_concentrations = None
+    best_posterior = None
+    best_inertia = None
+
+    if n_jobs == 1:
+        # For a single thread, less memory is needed if we just store one set
+        # of the best results (as opposed to one set per run per thread).
+        for it in range(n_init):
+            # cluster on the sphere
+            (centers, weights, concentrations, posterior, labels, inertia) = _movMF(
+                X,
+                n_clusters,
+                posterior_type=posterior_type,
+                force_weights=force_weights,
+                max_iter=max_iter,
+                verbose=verbose,
+                init=init,
+                random_state=random_state,
+                tol=tol,
+            )
+
+            # determine if these results are the best so far
+            if best_inertia is None or inertia < best_inertia:
+                best_centers = centers.copy()
+                best_labels = labels.copy()
+                best_weights = weights.copy()
+                best_concentrations = concentrations.copy()
+                best_posterior = posterior.copy()
+                best_inertia = inertia
+    else:
+        # parallelisation of movMF runs
+        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
+        results = Parallel(n_jobs=n_jobs, verbose=0)(
+            delayed(_movMF)(
+                X,
+                n_clusters,
+                posterior_type=posterior_type,
+                force_weights=force_weights,
+                max_iter=max_iter,
+                verbose=verbose,
+                init=init,
+                random_state=seed,  # use a distinct seed per run so the n_init restarts differ
+                tol=tol,
+            )
+            for seed in seeds
+        )
+
+        # Get results with the lowest inertia
+        centers, weights, concentrations, posteriors, labels, inertia = zip(*results)
+        best = np.argmin(inertia)
+        best_labels = labels[best]
+        best_inertia = inertia[best]
+        best_centers = centers[best]
+        best_concentrations = concentrations[best]
+        best_posterior = posteriors[best]
+        best_weights = weights[best]
+
+    return (
+        best_centers,
+        best_labels,
+        best_inertia,
+        best_weights,
+        best_concentrations,
+        best_posterior,
+    )
+
+
+class VonMisesFisherMixture(KMeans):
+    """Estimator for Mixture of von Mises Fisher clustering on the unit sphere.
+
+    Implements the algorithms (i) and (ii) from
+
+    "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions"
+    by Banerjee, Dhillon, Ghosh, and Sra.
+
+    TODO: Currently only supports Banerjee et al 2005 approximation of kappa,
+          however, there are numerous other approximations see _update_params.
+
+    Attribution
+    ----------
+    Approximation of log-vmf distribution function from movMF R-package.
+
+    movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions
+    by Kurt Hornik, Bettina Grun, 2014
+
+    Find more at:
+        https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf
+        https://cran.r-project.org/web/packages/movMF/index.html
+
+    Basic sklearn scaffolding from sklearn.cluster.KMeans.
+
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form as well as the number of
+        centroids to generate.
+
+    posterior_type: 'soft' or 'hard'
+        Type of posterior computed in expectation step.
+        See note about attribute: self.posterior_
+
+    force_weights : None or array [n_clusters, ]
+        If None, the algorithm will estimate the weights.
+        If an array of weights, algorithm will estimate concentrations and
+        centers with given weights.
+
+    max_iter : int, default: 300
+        Maximum number of iterations of the k-means algorithm for a
+        single run.
+
+    n_init : int, default: 10
+        Number of times the k-means algorithm will be run with different
+        centroid seeds. The final results will be the best output of
+        n_init consecutive runs in terms of inertia.
+
+    init: (string) one of
+        random-class [default]: random class assignment & centroid computation
+        k-means++ : uses sklearn k-means++ initialization algorithm
+        spherical-k-means : use centroids from one pass of spherical k-means
+        random : random unit norm vectors
+        random-orthonormal : random orthonormal vectors
+        If an ndarray is passed, it should be of shape (n_clusters, n_features)
+        and gives the initial centers.
+
+    tol : float, default: 1e-6
+        Relative tolerance with regards to inertia to declare convergence
+
+    n_jobs : int
+        The number of jobs to use for the computation. This works by computing
+        each of the n_init runs in parallel.
+        If -1 all CPUs are used. If 1 is given, no parallel computing code is
+        used at all, which is useful for debugging. For n_jobs below -1,
+        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
+        are used.
+
+    random_state : integer or numpy.RandomState, optional
+        The generator used to initialize the centers. If an integer is
+        given, it fixes the seed. Defaults to the global numpy random
+        number generator.
+
+    verbose : int, default 0
+        Verbosity mode.
+
+    copy_x : boolean, default True
+        When pre-computing distances it is more numerically accurate to center
+        the data first. If copy_x is True, then the original data is not
+        modified. If False, the original data is modified, and put back before
+        the function returns, but small numerical differences may be introduced
+        by subtracting and then adding the data mean.
+
+    normalize : boolean, default True
+        Normalize the input to have unit norm.
+
+    Attributes
+    ----------
+
+    cluster_centers_ : array, [n_clusters, n_features]
+        Coordinates of cluster centers
+
+    labels_ :
+        Labels of each point
+
+    inertia_ : float
+        Sum of distances of samples to their closest cluster center.
+
+    weights_ : array, [n_clusters,]
+        Weights of each cluster in vMF distribution (alpha).
+
+    concentrations_ : array [n_clusters,]
+        Concentration parameter for each cluster (kappa).
+        Larger values correspond to more concentrated clusters.
+
+    posterior_ : array, [n_clusters, n_examples]
+        Each column corresponds to the posterior distribution for an example.
+
+        If posterior_type='hard' is used, there will only be one non-zero per
+        column, its index corresponding to the example's cluster label.
+
+        If posterior_type='soft' is used, this matrix will be dense and the
+        column values correspond to soft clustering weights.
+ """ + + def __init__( + self, + n_clusters=5, + posterior_type="soft", + force_weights=None, + n_init=10, + n_jobs=1, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.posterior_type = posterior_type + self.force_weights = force_weights + self.n_init = n_init + self.n_jobs = n_jobs + self.max_iter = max_iter + self.verbose = verbose + self.init = init + self.random_state = random_state + self.tol = tol + self.copy_x = copy_x + self.normalize = normalize + + def _check_force_weights(self): + if self.force_weights is None: + return + + if len(self.force_weights) != self.n_clusters: + raise ValueError( + ( + "len(force_weights)={} but must equal " + "n_clusters={}".format(len(self.force_weights), self.n_clusters) + ) + ) + + def _check_fit_data(self, X): + """Verify that the number of samples given is larger than k""" + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + n_samples, n_features = X.shape + if X.shape[0] < self.n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" + % (X.shape[0], self.n_clusters) + ) + + for ee in range(n_samples): + if sp.issparse(X): + n = sp.linalg.norm(X[ee, :]) + else: + n = np.linalg.norm(X[ee, :]) + + if np.abs(n - 1.0) > 1e-4: + raise ValueError("Data l2-norm must be 1, found {}".format(n)) + + return X + + def _check_test_data(self, X): + X = check_array(X, accept_sparse="csr", dtype=FLOAT_DTYPES) + n_samples, n_features = X.shape + expected_n_features = self.cluster_centers_.shape[1] + if not n_features == expected_n_features: + raise ValueError( + "Incorrect number of features. " + "Got %d features, expected %d" % (n_features, expected_n_features) + ) + + for ee in range(n_samples): + if sp.issparse(X): + n = sp.linalg.norm(X[ee, :]) + else: + n = np.linalg.norm(X[ee, :]) + + if np.abs(n - 1.0) > 1e-4: + raise ValueError("Data l2-norm must be 1, found {}".format(n)) + + return X + + def fit(self, X, y=None): + """Compute mixture of von Mises Fisher clustering. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + """ + if self.normalize: + X = normalize(X) + + # Validate init array + init = self.init + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, order="C", copy=True) + self._validate_center_shape(X, init) + + self._check_force_weights() + random_state = check_random_state(self.random_state) + X = self._check_fit_data(X) + + ( + self.cluster_centers_, + self.labels_, + self.inertia_, + self.weights_, + self.concentrations_, + self.posterior_, + ) = movMF( + X, + self.n_clusters, + posterior_type=self.posterior_type, + force_weights=self.force_weights, + n_init=self.n_init, + n_jobs=self.n_jobs, + max_iter=self.max_iter, + verbose=self.verbose, + init=self.init, + random_state=random_state, + tol=self.tol, + copy_x=self.copy_x, + ) + + return self + + def fit_predict(self, X, y=None): + """Compute cluster centers and predict cluster index for each sample. + Convenience method; equivalent to calling fit(X) followed by + predict(X). + """ + return self.fit(X).labels_ + + def fit_transform(self, X, y=None): + """Compute clustering and transform X to cluster-distance space. + Equivalent to fit(X).transform(X), but more efficiently implemented. + """ + # Currently, this just skips a copy of the data if it is not in + # np.array or CSR format already. 
+ # XXX This skips _check_test_data, which may change the dtype; + # we should refactor the input validation. + return self.fit(X)._transform(X) + + def transform(self, X, y=None): + """Transform X to a cluster-distance space. + In the new space, each dimension is the cosine distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to transform. + + Returns + ------- + X_new : array, shape [n_samples, k] + X transformed in the new space. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + X = self._check_test_data(X) + return self._transform(X) + + def _transform(self, X): + """guts of transform method; no input validation""" + return cosine_distances(X, self.cluster_centers_) + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Note: Does not check that each point is on the sphere. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to predict. + + Returns + ------- + labels : array, shape [n_samples,] + Index of the cluster each sample belongs to. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + + X = self._check_test_data(X) + return _labels_inertia(X, self.cluster_centers_)[0] + + def score(self, X, y=None): + """Inertia score (sum of all distances to closest cluster). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data. + + Returns + ------- + score : float + Larger score is better. 
+ """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + X = self._check_test_data(X) + return -_labels_inertia(X, self.cluster_centers_)[1] + + def log_likelihood(self, X): + check_is_fitted(self) + + return _log_likelihood( + X, self.cluster_centers_, self.weights_, self.concentrations_ + ) diff --git a/code/taxonomy.py b/code/taxonomy.py index 3c5eb20..00794cd 100644 --- a/code/taxonomy.py +++ b/code/taxonomy.py @@ -53,7 +53,7 @@ def find_node(self, name): return None def sample_a_node(self): - return random.choice(self.all_nodes.values()) + return random.choice(list(self.all_nodes.values())) diff --git a/code/utils.py b/code/utils.py index 085e722..ca29d88 100644 --- a/code/utils.py +++ b/code/utils.py @@ -31,7 +31,7 @@ def avg_weighted_colors(color_list, c_size): for (color, weight) in color_list: w_color = [x * weight for x in color] # print w_color - result_color = map(operator.add, result_color, w_color) + result_color = list(map(operator.add, result_color, w_color)) # print result_color return l1_normalize(result_color) @@ -57,7 +57,7 @@ def cossim(p, q): def euclidean_distance(p, q): if len(p) != len(q): - print 'Euclidean distance error: p, q have different length' + print('Euclidean distance error: p, q have different length') distance = 0 @@ -69,7 +69,7 @@ def euclidean_distance(p, q): def euclidean_cluster(ps, c): if len(ps) == 0 or c == None: - print 'Cluster is empty' + print('Cluster is empty') distance = 0 @@ -83,7 +83,7 @@ def euclidean_cluster(ps, c): def dot_product(p, q): if len(p) != len(q): - print 'KL divergence error: p, q have different length' + print('KL divergence error: p, q have different length') p_len = q_len = mix_len = 0 @@ -122,7 +122,7 @@ def avg_emb_with_distinct(ele_map, embs_from, dist_map, vec_size): avg_emb = [0] * vec_size t_weight = 0 - for key, value in ele_map.iteritems(): + for key, value in ele_map.items(): t_emb = embs_from[key] w = value * dist_map[key] for i in range(vec_size): @@ -140,7 +140,7 @@ def avg_emb(ele_map, embs_from, vec_size): avg_emb = [0] * vec_size t_weight = 0 - for key, value in ele_map.iteritems(): + for key, value in ele_map.items(): t_emb = embs_from[key] w = value for i in range(vec_size): diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3e630e5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[tool.poetry] +name = "taxogen" +version = "0.1.0" +description = "" +authors = ["Cezar Sas "] + +[tool.poetry.dependencies] +python = ">=3.8,<3.11" +numpy = "^1.21.4" +scipy = "^1.7.3" +scikit-learn = "0.24.2" +PyYAML = "^6.0" + + +[tool.poetry.dev-dependencies] +2to3 = "^1.0" + + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" From 9e231940c94e10257806f31a92f1206dab819b1d Mon Sep 17 00:00:00 2001 From: Cezar Sas Date: Tue, 14 Dec 2021 09:52:40 +0100 Subject: [PATCH 2/2] Minor fixes --- code/spherecluster/spherecluster/__init__.py | 6 - .../spherecluster/spherical_kmeans.py | 372 ------- .../spherecluster/tests/test_common.py | 6 - .../tests/test_von_mises_fisher_mixture.py | 205 ---- code/spherecluster/spherecluster/util.py | 57 -- .../spherecluster/von_mises_fisher_mixture.py | 946 ------------------ 6 files changed, 1592 deletions(-) delete mode 100644 code/spherecluster/spherecluster/__init__.py delete mode 100644 code/spherecluster/spherecluster/spherical_kmeans.py delete mode 100644 code/spherecluster/spherecluster/tests/test_common.py delete mode 100644 
code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py delete mode 100644 code/spherecluster/spherecluster/util.py delete mode 100644 code/spherecluster/spherecluster/von_mises_fisher_mixture.py diff --git a/code/spherecluster/spherecluster/__init__.py b/code/spherecluster/spherecluster/__init__.py deleted file mode 100644 index 5d2c502..0000000 --- a/code/spherecluster/spherecluster/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import absolute_import -from .spherical_kmeans import SphericalKMeans -from .von_mises_fisher_mixture import VonMisesFisherMixture -from .util import sample_vMF - -__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"] diff --git a/code/spherecluster/spherecluster/spherical_kmeans.py b/code/spherecluster/spherecluster/spherical_kmeans.py deleted file mode 100644 index d56e03c..0000000 --- a/code/spherecluster/spherecluster/spherical_kmeans.py +++ /dev/null @@ -1,372 +0,0 @@ -import warnings - -import numpy as np -import scipy.sparse as sp -from joblib import Parallel, delayed - -from sklearn.cluster import KMeans - -# from sklearn.cluster import _k_means -from sklearn.cluster import _k_means_fast as _k_means -from sklearn.cluster.k_means_ import ( - _check_sample_weight, - _init_centroids, - _labels_inertia, - _tolerance, - _validate_center_shape, -) -from sklearn.preprocessing import normalize -from sklearn.utils import check_array, check_random_state -from sklearn.utils.extmath import row_norms, squared_norm -from sklearn.utils.validation import _num_samples - - -def _spherical_kmeans_single_lloyd( - X, - n_clusters, - sample_weight=None, - max_iter=300, - init="k-means++", - verbose=False, - x_squared_norms=None, - random_state=None, - tol=1e-4, - precompute_distances=True, -): - """ - Modified from sklearn.cluster.k_means_.k_means_single_lloyd. - """ - random_state = check_random_state(random_state) - - sample_weight = _check_sample_weight(sample_weight, X) - - best_labels, best_inertia, best_centers = None, None, None - - # init - centers = _init_centroids( - X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms - ) - if verbose: - print("Initialization complete") - - # Allocate memory to store the distances for each sample to its - # closer center for reallocation in case of ties - distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) - - # iterations - for i in range(max_iter): - centers_old = centers.copy() - - # labels assignment - # TODO: _labels_inertia should be done with cosine distance - # since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized - # this doesn't really matter. 
- labels, inertia = _labels_inertia( - X, - sample_weight, - x_squared_norms, - centers, - precompute_distances=precompute_distances, - distances=distances, - ) - - # computation of the means - if sp.issparse(X): - centers = _k_means._centers_sparse( - X, sample_weight, labels, n_clusters, distances - ) - else: - centers = _k_means._centers_dense( - X.astype(np.float), - sample_weight.astype(np.float), - labels, - n_clusters, - distances.astype(np.float), - ) - - # l2-normalize centers (this is the main contibution here) - centers = normalize(centers) - - if verbose: - print("Iteration %2d, inertia %.3f" % (i, inertia)) - - if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() - best_inertia = inertia - - center_shift_total = squared_norm(centers_old - centers) - if center_shift_total <= tol: - if verbose: - print( - "Converged at iteration %d: " - "center shift %e within tolerance %e" % (i, center_shift_total, tol) - ) - break - - if center_shift_total > 0: - # rerun E-step in case of non-convergence so that predicted labels - # match cluster centers - best_labels, best_inertia = _labels_inertia( - X, - sample_weight, - x_squared_norms, - best_centers, - precompute_distances=precompute_distances, - distances=distances, - ) - - return best_labels, best_inertia, best_centers, i + 1 - - -def spherical_k_means( - X, - n_clusters, - sample_weight=None, - init="k-means++", - n_init=10, - max_iter=300, - verbose=False, - tol=1e-4, - random_state=None, - copy_x=True, - n_jobs=1, - algorithm="auto", - return_n_iter=False, -): - """Modified from sklearn.cluster.k_means_.k_means. - """ - if n_init <= 0: - raise ValueError( - "Invalid number of initializations." - " n_init=%d must be bigger than zero." % n_init - ) - random_state = check_random_state(random_state) - - if max_iter <= 0: - raise ValueError( - "Number of iterations should be a positive number," - " got %d instead" % max_iter - ) - - best_inertia = np.infty - # avoid forcing order when copy_x=False - order = "C" if copy_x else None - X = check_array( - X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x - ) - # verify that the number of samples given is larger than k - if _num_samples(X) < n_clusters: - raise ValueError( - "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters) - ) - tol = _tolerance(X, tol) - - if hasattr(init, "__array__"): - init = check_array(init, dtype=X.dtype.type, order="C", copy=True) - _validate_center_shape(X, n_clusters, init) - - if n_init != 1: - warnings.warn( - "Explicit initial center position passed: " - "performing only one init in k-means instead of n_init=%d" % n_init, - RuntimeWarning, - stacklevel=2, - ) - n_init = 1 - - # precompute squared norms of data points - x_squared_norms = row_norms(X, squared=True) - - if n_jobs == 1: - # For a single thread, less memory is needed if we just store one set - # of the best results (as opposed to one set per run per thread). 
- for it in range(n_init): - # run a k-means once - labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd( - X, - n_clusters, - sample_weight, - max_iter=max_iter, - init=init, - verbose=verbose, - tol=tol, - x_squared_norms=x_squared_norms, - random_state=random_state, - ) - - # determine if these results are the best so far - if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() - best_inertia = inertia - best_n_iter = n_iter_ - else: - # parallelisation of k-means runs - seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - results = Parallel(n_jobs=n_jobs, verbose=0)( - delayed(_spherical_kmeans_single_lloyd)( - X, - n_clusters, - sample_weight, - max_iter=max_iter, - init=init, - verbose=verbose, - tol=tol, - x_squared_norms=x_squared_norms, - # Change seed to ensure variety - random_state=seed, - ) - for seed in seeds - ) - - # Get results with the lowest inertia - labels, inertia, centers, n_iters = zip(*results) - best = np.argmin(inertia) - best_labels = labels[best] - best_inertia = inertia[best] - best_centers = centers[best] - best_n_iter = n_iters[best] - - if return_n_iter: - return best_centers, best_labels, best_inertia, best_n_iter - else: - return best_centers, best_labels, best_inertia - - -class SphericalKMeans(KMeans): - """Spherical K-Means clustering - - Modfication of sklearn.cluster.KMeans where cluster centers are normalized - (projected onto the sphere) in each iteration. - - Parameters - ---------- - - n_clusters : int, optional, default: 8 - The number of clusters to form as well as the number of - centroids to generate. - - max_iter : int, default: 300 - Maximum number of iterations of the k-means algorithm for a - single run. - - n_init : int, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - - init : {'k-means++', 'random' or an ndarray} - Method for initialization, defaults to 'k-means++': - 'k-means++' : selects initial cluster centers for k-mean - clustering in a smart way to speed up convergence. See section - Notes in k_init for more details. - 'random': choose k observations (rows) at random from data for - the initial centroids. - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - tol : float, default: 1e-4 - Relative tolerance with regards to inertia to declare convergence - - n_jobs : int - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - verbose : int, default 0 - Verbosity mode. - - copy_x : boolean, default True - When pre-computing distances it is more numerically accurate to center - the data first. If copy_x is True, then the original data is not - modified. If False, the original data is modified, and put back before - the function returns, but small numerical differences may be introduced - by subtracting and then adding the data mean. 
- - normalize : boolean, default True - Normalize the input to have unnit norm. - - Attributes - ---------- - - cluster_centers_ : array, [n_clusters, n_features] - Coordinates of cluster centers - - labels_ : - Labels of each point - - inertia_ : float - Sum of distances of samples to their closest cluster center. - """ - - def __init__( - self, - n_clusters=8, - init="k-means++", - n_init=10, - max_iter=300, - tol=1e-4, - n_jobs=1, - verbose=0, - random_state=None, - copy_x=True, - normalize=True, - ): - self.n_clusters = n_clusters - self.init = init - self.max_iter = max_iter - self.tol = tol - self.n_init = n_init - self.verbose = verbose - self.random_state = random_state - self.copy_x = copy_x - self.n_jobs = n_jobs - self.normalize = normalize - - def fit(self, X, y=None, sample_weight=None): - """Compute k-means clustering. - - Parameters - ---------- - - X : array-like or sparse matrix, shape=(n_samples, n_features) - - y : Ignored - not used, present here for API consistency by convention. - - sample_weight : array-like, shape (n_samples,), optional - The weights for each observation in X. If None, all observations - are assigned equal weight (default: None) - """ - if self.normalize: - X = normalize(X) - - random_state = check_random_state(self.random_state) - - # TODO: add check that all data is unit-normalized - - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means( - X, - n_clusters=self.n_clusters, - sample_weight=sample_weight, - init=self.init, - n_init=self.n_init, - max_iter=self.max_iter, - verbose=self.verbose, - tol=self.tol, - random_state=random_state, - copy_x=self.copy_x, - n_jobs=self.n_jobs, - return_n_iter=True, - ) - - return self diff --git a/code/spherecluster/spherecluster/tests/test_common.py b/code/spherecluster/spherecluster/tests/test_common.py deleted file mode 100644 index 119f1c4..0000000 --- a/code/spherecluster/spherecluster/tests/test_common.py +++ /dev/null @@ -1,6 +0,0 @@ -from sklearn.utils.estimator_checks import check_estimator -from spherecluster import SphericalKMeans - - -def test_estimator_spherical_k_means(): - return check_estimator(SphericalKMeans) diff --git a/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py b/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py deleted file mode 100644 index 29e6d73..0000000 --- a/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py +++ /dev/null @@ -1,205 +0,0 @@ -import pytest -import scipy as sp -import numpy as np -from numpy.testing import assert_almost_equal, assert_array_equal -from spherecluster import VonMisesFisherMixture -from spherecluster import von_mises_fisher_mixture -from spherecluster import sample_vMF - - -def test_vmf_log_dense(): - """ - Test that approximation approaches whatever scipy has. - """ - n_examples = 2 - n_features = 50 - - kappas = np.linspace(2, 600, 20) - - mu = np.random.randn(n_features) - mu /= np.linalg.norm(mu) - - X = np.random.randn(n_examples, n_features) - for ee in range(n_examples): - X[ee, :] /= np.linalg.norm(X[ee, :]) - - diffs = [] - for kappa in kappas: - v = von_mises_fisher_mixture._vmf_log(X, kappa, mu) - - v_approx = von_mises_fisher_mixture._vmf_log_asymptotic(X, kappa, mu) - - normalized_approx_diff = np.linalg.norm(v - v_approx) / np.linalg.norm(v) - print(normalized_approx_diff) - diffs.append(normalized_approx_diff) - - assert diffs[0] > 10 * diffs[-1] - - -def test_vmf_log_detect_breakage(): - """ - Find where scipy approximation breaks down. 
- This doesn't really test anything but demonstrates where approximation - should be applied instead. - """ - n_examples = 3 - kappas = [5, 30, 100, 1000, 5000] - n_features = range(2, 500) - - breakage_points = [] - for kappa in kappas: - first_breakage = None - for n_f in n_features: - mu = np.random.randn(n_f) - mu /= np.linalg.norm(mu) - - X = np.random.randn(n_examples, n_f) - for ee in range(n_examples): - X[ee, :] /= np.linalg.norm(X[ee, :]) - - try: - von_mises_fisher_mixture._vmf_log(X, kappa, mu) - except: - if first_breakage is None: - first_breakage = n_f - - breakage_points.append(first_breakage) - print( - "Scipy vmf_log breaks for kappa={} at n_features={}".format( - kappa, first_breakage - ) - ) - - print(breakage_points) - assert_array_equal(breakage_points, [141, 420, 311, 3, 3]) - - -def test_maximization(): - num_points = 5000 - n_features = 500 - posterior = np.ones((1, num_points)) - - kappas = [5000, 8000, 16400] - for kappa in kappas: - mu = np.random.randn(n_features) - mu /= np.linalg.norm(mu) - - X = sample_vMF(mu, kappa, num_points) - - centers, weights, concentrations = von_mises_fisher_mixture._maximization( - X, posterior - ) - - print("center estimate error", np.linalg.norm(centers[0, :] - mu)) - print( - "kappa estimate", - np.abs(kappa - concentrations[0]) / kappa, - kappa, - concentrations[0], - ) - - assert_almost_equal(1., weights[0]) - assert_almost_equal(0.0, np.abs(kappa - concentrations[0]) / kappa, decimal=2) - assert_almost_equal(0.0, np.linalg.norm(centers[0, :] - mu), decimal=2) - - -@pytest.mark.parametrize( - "params_in", - [ - {"posterior_type": "soft"}, - {"posterior_type": "hard"}, - {"posterior_type": "soft", "n_jobs": 2}, - {"posterior_type": "hard", "n_jobs": 3}, - {"posterior_type": "hard", "force_weights": np.ones(5) / 5.}, - {"posterior_type": "soft", "n_jobs": -1}, - ], -) -def test_integration_dense(params_in): - n_clusters = 5 - n_examples = 20 - n_features = 100 - X = np.random.randn(n_examples, n_features) - for ee in range(n_examples): - X[ee, :] /= np.linalg.norm(X[ee, :]) - - params_in.update({"n_clusters": n_clusters}) - movmf = VonMisesFisherMixture(**params_in) - movmf.fit(X) - - assert movmf.cluster_centers_.shape == (n_clusters, n_features) - assert len(movmf.concentrations_) == n_clusters - assert len(movmf.weights_) == n_clusters - assert len(movmf.labels_) == n_examples - assert len(movmf.posterior_) == n_clusters - - for center in movmf.cluster_centers_: - assert_almost_equal(np.linalg.norm(center), 1.0) - - for concentration in movmf.concentrations_: - assert concentration > 0 - - for weight in movmf.weights_: - assert not np.isnan(weight) - - plabels = movmf.predict(X) - assert_array_equal(plabels, movmf.labels_) - - ll = movmf.log_likelihood(X) - ll_labels = np.zeros(movmf.labels_.shape) - for ee in range(n_examples): - ll_labels[ee] = np.argmax(ll[:, ee]) - - assert_array_equal(ll_labels, movmf.labels_) - - -@pytest.mark.parametrize( - "params_in", - [ - {"posterior_type": "soft"}, - {"posterior_type": "hard"}, - {"posterior_type": "soft", "n_jobs": 2}, - {"posterior_type": "hard", "n_jobs": 3}, - {"posterior_type": "hard", "force_weights": np.ones(5) / 5.}, - {"posterior_type": "soft", "n_jobs": -1}, - ], -) -def test_integration_sparse(params_in): - n_clusters = 5 - n_examples = 20 - n_features = 100 - n_nonzero = 10 - X = sp.sparse.csr_matrix((n_examples, n_features)) - for ee in range(n_examples): - ridx = np.random.randint(n_features, size=(n_nonzero)) - random_values = np.random.randn(n_nonzero) - 
random_values = random_values / np.linalg.norm(random_values) - X[ee, ridx] = random_values - - params_in.update({"n_clusters": n_clusters}) - movmf = VonMisesFisherMixture(**params_in) - movmf.fit(X) - - assert movmf.cluster_centers_.shape == (n_clusters, n_features) - assert len(movmf.concentrations_) == n_clusters - assert len(movmf.weights_) == n_clusters - assert len(movmf.labels_) == n_examples - assert len(movmf.posterior_) == n_clusters - - for center in movmf.cluster_centers_: - assert_almost_equal(np.linalg.norm(center), 1.0) - - for concentration in movmf.concentrations_: - assert concentration > 0 - - for weight in movmf.weights_: - assert not np.isnan(weight) - - plabels = movmf.predict(X) - assert_array_equal(plabels, movmf.labels_) - - ll = movmf.log_likelihood(X) - ll_labels = np.zeros(movmf.labels_.shape) - for ee in range(n_examples): - ll_labels[ee] = np.argmax(ll[:, ee]) - - assert_array_equal(ll_labels, movmf.labels_) diff --git a/code/spherecluster/spherecluster/util.py b/code/spherecluster/spherecluster/util.py deleted file mode 100644 index bb68261..0000000 --- a/code/spherecluster/spherecluster/util.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Generate multivariate von Mises Fisher samples. - -This solution originally appears here: -http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python - -Also see: - -Sampling from vMF on S^2: - https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf - http://www.stat.pitt.edu/sungkyu/software/randvonMisesFisher3.pdf -""" -import numpy as np - - -def sample_vMF(mu, kappa, num_samples): - """Generate num_samples N-dimensional samples from von Mises Fisher - distribution around center mu \in R^N with concentration kappa. - """ - dim = len(mu) - result = np.zeros((num_samples, dim)) - for nn in range(num_samples): - # sample offset from center (on sphere) with spread kappa - w = _sample_weight(kappa, dim) - - # sample a point v on the unit sphere that's orthogonal to mu - v = _sample_orthonormal_to(mu) - - # compute new point - result[nn, :] = v * np.sqrt(1. - w ** 2) + w * mu - - return result - - -def _sample_weight(kappa, dim): - """Rejection sampling scheme for sampling distance from center on - surface of the sphere. - """ - dim = dim - 1 # since S^{n-1} - b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) - x = (1. - b) / (1. + b) - c = kappa * x + dim * np.log(1 - x ** 2) - - while True: - z = np.random.beta(dim / 2., dim / 2.) - w = (1. - (1. + b) * z) / (1. - (1. - b) * z) - u = np.random.uniform(low=0, high=1) - if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): - return w - - -def _sample_orthonormal_to(mu): - """Sample point on sphere orthogonal to mu.""" - v = np.random.randn(mu.shape[0]) - proj_mu_v = mu * np.dot(mu, v) / np.linalg.norm(mu) - orthto = v - proj_mu_v - return orthto / np.linalg.norm(orthto) diff --git a/code/spherecluster/spherecluster/von_mises_fisher_mixture.py b/code/spherecluster/spherecluster/von_mises_fisher_mixture.py deleted file mode 100644 index 53c4c67..0000000 --- a/code/spherecluster/spherecluster/von_mises_fisher_mixture.py +++ /dev/null @@ -1,946 +0,0 @@ -import warnings - -import numpy as np -import scipy.sparse as sp -from joblib import Parallel, delayed -from numpy import i0 # modified Bessel function of first kind order 0, I_0 -from scipy.special import iv # modified Bessel function of first kind, I_v -from scipy.special import logsumexp - -from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin -from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape -from sklearn.metrics.pairwise import cosine_distances -from sklearn.preprocessing import normalize -from sklearn.utils import check_array, check_random_state, as_float_array -from sklearn.utils.extmath import squared_norm -from sklearn.utils.validation import FLOAT_DTYPES -from sklearn.utils.validation import check_is_fitted - -from . import spherical_kmeans - -MAX_CONTENTRATION = 1e10 - - -def _inertia_from_labels(X, centers, labels): - """Compute inertia with cosine distance using known labels. - """ - n_examples, n_features = X.shape - inertia = np.zeros((n_examples,)) - for ee in range(n_examples): - inertia[ee] = 1 - X[ee, :].dot(centers[int(labels[ee]), :].T) - - return np.sum(inertia) - - -def _labels_inertia(X, centers): - """Compute labels and inertia with cosine distance. - """ - n_examples, n_features = X.shape - n_clusters, n_features = centers.shape - - labels = np.zeros((n_examples,)) - inertia = np.zeros((n_examples,)) - - for ee in range(n_examples): - dists = np.zeros((n_clusters,)) - for cc in range(n_clusters): - dists[cc] = 1 - X[ee, :].dot(centers[cc, :].T) - - labels[ee] = np.argmin(dists) - inertia[ee] = dists[int(labels[ee])] - - return labels, np.sum(inertia) - - -def _vmf_log(X, kappa, mu): - """Computs log(vMF(X, kappa, mu)) using built-in numpy/scipy Bessel - approximations. - - Works well on small kappa and mu. - """ - n_examples, n_features = X.shape - return np.log(_vmf_normalize(kappa, n_features) * np.exp(kappa * X.dot(mu).T)) - - -def _vmf_normalize(kappa, dim): - """Compute normalization constant using built-in numpy/scipy Bessel - approximations. - - Works well on small kappa and mu. - """ - num = np.power(kappa, dim / 2.0 - 1.0) - - if dim / 2.0 - 1.0 < 1e-15: - denom = np.power(2.0 * np.pi, dim / 2.0) * i0(kappa) - else: - denom = np.power(2.0 * np.pi, dim / 2.0) * iv(dim / 2.0 - 1.0, kappa) - - if np.isinf(num): - raise ValueError("VMF scaling numerator was inf.") - - if np.isinf(denom): - raise ValueError("VMF scaling denominator was inf.") - - if np.abs(denom) < 1e-15: - raise ValueError("VMF scaling denominator was 0.") - - return num / denom - - -def _log_H_asymptotic(nu, kappa): - """Compute the Amos-type upper bound asymptotic approximation on H where - log(H_\nu)(\kappa) = \int_0^\kappa R_\nu(t) dt. 
- - See "lH_asymptotic <-" in movMF.R and utility function implementation notes - from https://cran.r-project.org/web/packages/movMF/index.html - """ - beta = np.sqrt((nu + 0.5) ** 2) - kappa_l = np.min([kappa, np.sqrt((3.0 * nu + 11.0 / 2.0) * (nu + 3.0 / 2.0))]) - return _S(kappa, nu + 0.5, beta) + ( - _S(kappa_l, nu, nu + 2.0) - _S(kappa_l, nu + 0.5, beta) - ) - - -def _S(kappa, alpha, beta): - """Compute the antiderivative of the Amos-type bound G on the modified - Bessel function ratio. - - Note: Handles scalar kappa, alpha, and beta only. - - See "S <-" in movMF.R and utility function implementation notes from - https://cran.r-project.org/web/packages/movMF/index.html - """ - kappa = 1.0 * np.abs(kappa) - alpha = 1.0 * alpha - beta = 1.0 * np.abs(beta) - a_plus_b = alpha + beta - u = np.sqrt(kappa ** 2 + beta ** 2) - if alpha == 0: - alpha_scale = 0 - else: - alpha_scale = alpha * np.log((alpha + u) / a_plus_b) - - return u - beta - alpha_scale - - -def _vmf_log_asymptotic(X, kappa, mu): - """Compute log(f(x|theta)) via Amos approximation - - log(f(x|theta)) = theta' x - log(H_{d/2-1})(\|theta\|) - - where theta = kappa * X, \|theta\| = kappa. - - Computing _vmf_log helps with numerical stability / loss of precision for - for large values of kappa and n_features. - - See utility function implementation notes in movMF.R from - https://cran.r-project.org/web/packages/movMF/index.html - """ - n_examples, n_features = X.shape - log_vfm = kappa * X.dot(mu).T + -_log_H_asymptotic(n_features / 2.0 - 1.0, kappa) - - return log_vfm - - -def _log_likelihood(X, centers, weights, concentrations): - if len(np.shape(X)) != 2: - X = X.reshape((1, len(X))) - - n_examples, n_features = np.shape(X) - n_clusters, _ = centers.shape - - if n_features <= 50: # works up to about 50 before numrically unstable - vmf_f = _vmf_log - else: - vmf_f = _vmf_log_asymptotic - - f_log = np.zeros((n_clusters, n_examples)) - for cc in range(n_clusters): - f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) - - posterior = np.zeros((n_clusters, n_examples)) - weights_log = np.log(weights) - posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log - for ee in range(n_examples): - posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) - - return posterior - - -def _init_unit_centers(X, n_clusters, random_state, init): - """Initializes unit norm centers. - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - - n_clusters : int, optional, default: 8 - The number of clusters to form as well as the number of - centroids to generate. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - init: (string) one of - k-means++ : uses sklearn k-means++ initialization algorithm - spherical-k-means : use centroids from one pass of spherical k-means - random : random unit norm vectors - random-orthonormal : random orthonormal vectors - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. 
- """ - n_examples, n_features = np.shape(X) - if isinstance(init, np.ndarray): - n_init_clusters, n_init_features = init.shape - assert n_init_clusters == n_clusters - assert n_init_features == n_features - - # ensure unit normed centers - centers = init - for cc in range(n_clusters): - centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) - - return centers - - elif init == "spherical-k-means": - labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd( - X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++" - ) - - return centers - - elif init == "random": - centers = np.random.randn(n_clusters, n_features) - for cc in range(n_clusters): - centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) - - return centers - - elif init == "k-means++": - centers = _init_centroids( - X, - n_clusters, - "k-means++", - random_state=random_state, - x_squared_norms=np.ones((n_examples,)), - ) - - for cc in range(n_clusters): - centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) - - return centers - - elif init == "random-orthonormal": - centers = np.random.randn(n_clusters, n_features) - q, r = np.linalg.qr(centers.T, mode="reduced") - - return q.T - - elif init == "random-class": - centers = np.zeros((n_clusters, n_features)) - for cc in range(n_clusters): - while np.linalg.norm(centers[cc, :]) == 0: - labels = np.random.randint(0, n_clusters, n_examples) - centers[cc, :] = X[labels == cc, :].sum(axis=0) - - for cc in range(n_clusters): - centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) - - return centers - - -def _expectation(X, centers, weights, concentrations, posterior_type="soft"): - """Compute the log-likelihood of each datapoint being in each cluster. - - Parameters - ---------- - centers (mu) : array, [n_centers x n_features] - weights (alpha) : array, [n_centers, ] (alpha) - concentrations (kappa) : array, [n_centers, ] - - Returns - ---------- - posterior : array, [n_centers, n_examples] - """ - n_examples, n_features = np.shape(X) - n_clusters, _ = centers.shape - - if n_features <= 50: # works up to about 50 before numrically unstable - vmf_f = _vmf_log - else: - vmf_f = _vmf_log_asymptotic - - f_log = np.zeros((n_clusters, n_examples)) - for cc in range(n_clusters): - f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) - - posterior = np.zeros((n_clusters, n_examples)) - if posterior_type == "soft": - weights_log = np.log(weights) - posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log - for ee in range(n_examples): - posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) - - elif posterior_type == "hard": - weights_log = np.log(weights) - weighted_f_log = np.tile(weights_log.T, (n_examples, 1)).T + f_log - for ee in range(n_examples): - posterior[np.argmax(weighted_f_log[:, ee]), ee] = 1.0 - - return posterior - - -def _maximization(X, posterior, force_weights=None): - """Estimate new centers, weights, and concentrations from - - Parameters - ---------- - posterior : array, [n_centers, n_examples] - The posterior matrix from the expectation step. - - force_weights : None or array, [n_centers, ] - If None is passed, will estimate weights. - If an array is passed, will use instead of estimating. 
- - Returns - ---------- - centers (mu) : array, [n_centers x n_features] - weights (alpha) : array, [n_centers, ] (alpha) - concentrations (kappa) : array, [n_centers, ] - """ - n_examples, n_features = X.shape - n_clusters, n_examples = posterior.shape - concentrations = np.zeros((n_clusters,)) - centers = np.zeros((n_clusters, n_features)) - if force_weights is None: - weights = np.zeros((n_clusters,)) - - for cc in range(n_clusters): - # update weights (alpha) - if force_weights is None: - weights[cc] = np.mean(posterior[cc, :]) - else: - weights = force_weights - - # update centers (mu) - X_scaled = X.copy() - if sp.issparse(X): - X_scaled.data *= posterior[cc, :].repeat(np.diff(X_scaled.indptr)) - else: - for ee in range(n_examples): - X_scaled[ee, :] *= posterior[cc, ee] - - centers[cc, :] = X_scaled.sum(axis=0) - - # normalize centers - center_norm = np.linalg.norm(centers[cc, :]) - if center_norm > 1e-8: - centers[cc, :] = centers[cc, :] / center_norm - - # update concentration (kappa) [TODO: add other kappa approximations] - rbar = center_norm / (n_examples * weights[cc]) - concentrations[cc] = rbar * n_features - np.power(rbar, 3.0) - if np.abs(rbar - 1.0) < 1e-10: - concentrations[cc] = MAX_CONTENTRATION - else: - concentrations[cc] /= 1.0 - np.power(rbar, 2.0) - - # let python know we can free this (good for large dense X) - del X_scaled - - return centers, weights, concentrations - - -def _movMF( - X, - n_clusters, - posterior_type="soft", - force_weights=None, - max_iter=300, - verbose=False, - init="random-class", - random_state=None, - tol=1e-6, -): - """Mixture of von Mises Fisher clustering. - - Implements the algorithms (i) and (ii) from - - "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" - by Banerjee, Dhillon, Ghosh, and Sra. - - TODO: Currently only supports Banerjee et al 2005 approximation of kappa, - however, there are numerous other approximations see _update_params. - - Attribution - ---------- - Approximation of log-vmf distribution function from movMF R-package. - - movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions - by Kurt Hornik, Bettina Grun, 2014 - - Find more at: - https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf - https://cran.r-project.org/web/packages/movMF/index.html - - Parameters - ---------- - n_clusters : int, optional, default: 8 - The number of clusters to form as well as the number of - centroids to generate. - - posterior_type: 'soft' or 'hard' - Type of posterior computed in exepectation step. - See note about attribute: self.posterior_ - - force_weights : None or array [n_clusters, ] - If None, the algorithm will estimate the weights. - If an array of weights, algorithm will estimate concentrations and - centers with given weights. - - max_iter : int, default: 300 - Maximum number of iterations of the k-means algorithm for a - single run. - - n_init : int, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - - init: (string) one of - random-class [default]: random class assignment & centroid computation - k-means++ : uses sklearn k-means++ initialization algorithm - spherical-k-means : use centroids from one pass of spherical k-means - random : random unit norm vectors - random-orthonormal : random orthonormal vectors - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. 
- - tol : float, default: 1e-6 - Relative tolerance with regards to inertia to declare convergence - - n_jobs : int - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - verbose : int, default 0 - Verbosity mode. - - copy_x : boolean, default True - When pre-computing distances it is more numerically accurate to center - the data first. If copy_x is True, then the original data is not - modified. If False, the original data is modified, and put back before - the function returns, but small numerical differences may be introduced - by subtracting and then adding the data mean. - """ - random_state = check_random_state(random_state) - n_examples, n_features = np.shape(X) - - # init centers (mus) - centers = _init_unit_centers(X, n_clusters, random_state, init) - - # init weights (alphas) - if force_weights is None: - weights = np.ones((n_clusters,)) - weights = weights / np.sum(weights) - else: - weights = force_weights - - # init concentrations (kappas) - concentrations = np.ones((n_clusters,)) - - if verbose: - print("Initialization complete") - - for iter in range(max_iter): - centers_prev = centers.copy() - - # expectation step - posterior = _expectation( - X, centers, weights, concentrations, posterior_type=posterior_type - ) - - # maximization step - centers, weights, concentrations = _maximization( - X, posterior, force_weights=force_weights - ) - - # check convergence - tolcheck = squared_norm(centers_prev - centers) - if tolcheck <= tol: - if verbose: - print( - "Converged at iteration %d: " - "center shift %e within tolerance %e" % (iter, tolcheck, tol) - ) - break - - # labels come for free via posterior - labels = np.zeros((n_examples,)) - for ee in range(n_examples): - labels[ee] = np.argmax(posterior[:, ee]) - - inertia = _inertia_from_labels(X, centers, labels) - - return centers, weights, concentrations, posterior, labels, inertia - - -def movMF( - X, - n_clusters, - posterior_type="soft", - force_weights=None, - n_init=10, - n_jobs=1, - max_iter=300, - verbose=False, - init="random-class", - random_state=None, - tol=1e-6, - copy_x=True, -): - """Wrapper for parallelization of _movMF and running n_init times. - """ - if n_init <= 0: - raise ValueError( - "Invalid number of initializations." - " n_init=%d must be bigger than zero." 
% n_init - ) - random_state = check_random_state(random_state) - - if max_iter <= 0: - raise ValueError( - "Number of iterations should be a positive number," - " got %d instead" % max_iter - ) - - best_inertia = np.infty - X = as_float_array(X, copy=copy_x) - tol = _tolerance(X, tol) - - if hasattr(init, "__array__"): - init = check_array(init, dtype=X.dtype.type, copy=True) - _validate_center_shape(X, n_clusters, init) - - if n_init != 1: - warnings.warn( - "Explicit initial center position passed: " - "performing only one init in k-means instead of n_init=%d" % n_init, - RuntimeWarning, - stacklevel=2, - ) - n_init = 1 - - # defaults - best_centers = None - best_labels = None - best_weights = None - best_concentrations = None - best_posterior = None - best_inertia = None - - if n_jobs == 1: - # For a single thread, less memory is needed if we just store one set - # of the best results (as opposed to one set per run per thread). - for it in range(n_init): - # cluster on the sphere - (centers, weights, concentrations, posterior, labels, inertia) = _movMF( - X, - n_clusters, - posterior_type=posterior_type, - force_weights=force_weights, - max_iter=max_iter, - verbose=verbose, - init=init, - random_state=random_state, - tol=tol, - ) - - # determine if these results are the best so far - if best_inertia is None or inertia < best_inertia: - best_centers = centers.copy() - best_labels = labels.copy() - best_weights = weights.copy() - best_concentrations = concentrations.copy() - best_posterior = posterior.copy() - best_inertia = inertia - else: - # parallelisation of movMF runs - seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - results = Parallel(n_jobs=n_jobs, verbose=0)( - delayed(_movMF)( - X, - n_clusters, - posterior_type=posterior_type, - force_weights=force_weights, - max_iter=max_iter, - verbose=verbose, - init=init, - random_state=random_state, - tol=tol, - ) - for seed in seeds - ) - - # Get results with the lowest inertia - centers, weights, concentrations, posteriors, labels, inertia = zip(*results) - best = np.argmin(inertia) - best_labels = labels[best] - best_inertia = inertia[best] - best_centers = centers[best] - best_concentrations = concentrations[best] - best_posterior = posteriors[best] - best_weights = weights[best] - - return ( - best_centers, - best_labels, - best_inertia, - best_weights, - best_concentrations, - best_posterior, - ) - - -class VonMisesFisherMixture(BaseEstimator, ClusterMixin, TransformerMixin): - """Estimator for Mixture of von Mises Fisher clustering on the unit sphere. - - Implements the algorithms (i) and (ii) from - - "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" - by Banerjee, Dhillon, Ghosh, and Sra. - - TODO: Currently only supports Banerjee et al 2005 approximation of kappa, - however, there are numerous other approximations see _update_params. - - Attribution - ---------- - Approximation of log-vmf distribution function from movMF R-package. - - movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions - by Kurt Hornik, Bettina Grun, 2014 - - Find more at: - https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf - https://cran.r-project.org/web/packages/movMF/index.html - - Basic sklearn scaffolding from sklearn.cluster.KMeans. - - Parameters - ---------- - n_clusters : int, optional, default: 8 - The number of clusters to form as well as the number of - centroids to generate. - - posterior_type: 'soft' or 'hard' - Type of posterior computed in exepectation step. 
- See note about attribute: self.posterior_ - - force_weights : None or array [n_clusters, ] - If None, the algorithm will estimate the weights. - If an array of weights, algorithm will estimate concentrations and - centers with given weights. - - max_iter : int, default: 300 - Maximum number of iterations of the k-means algorithm for a - single run. - - n_init : int, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - - init: (string) one of - random-class [default]: random class assignment & centroid computation - k-means++ : uses sklearn k-means++ initialization algorithm - spherical-k-means : use centroids from one pass of spherical k-means - random : random unit norm vectors - random-orthonormal : random orthonormal vectors - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - tol : float, default: 1e-6 - Relative tolerance with regards to inertia to declare convergence - - n_jobs : int - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - verbose : int, default 0 - Verbosity mode. - - copy_x : boolean, default True - When pre-computing distances it is more numerically accurate to center - the data first. If copy_x is True, then the original data is not - modified. If False, the original data is modified, and put back before - the function returns, but small numerical differences may be introduced - by subtracting and then adding the data mean. - - normalize : boolean, default True - Normalize the input to have unnit norm. - - Attributes - ---------- - - cluster_centers_ : array, [n_clusters, n_features] - Coordinates of cluster centers - - labels_ : - Labels of each point - - inertia_ : float - Sum of distances of samples to their closest cluster center. - - weights_ : array, [n_clusters,] - Weights of each cluster in vMF distribution (alpha). - - concentrations_ : array [n_clusters,] - Concentration parameter for each cluster (kappa). - Larger values correspond to more concentrated clusters. - - posterior_ : array, [n_clusters, n_examples] - Each column corresponds to the posterio distribution for and example. - - If posterior_type='hard' is used, there will only be one non-zero per - column, its index corresponding to the example's cluster label. - - If posterior_type='soft' is used, this matrix will be dense and the - column values correspond to soft clustering weights. 
- """ - - def __init__( - self, - n_clusters=5, - posterior_type="soft", - force_weights=None, - n_init=10, - n_jobs=1, - max_iter=300, - verbose=False, - init="random-class", - random_state=None, - tol=1e-6, - copy_x=True, - normalize=True, - ): - self.n_clusters = n_clusters - self.posterior_type = posterior_type - self.force_weights = force_weights - self.n_init = n_init - self.n_jobs = n_jobs - self.max_iter = max_iter - self.verbose = verbose - self.init = init - self.random_state = random_state - self.tol = tol - self.copy_x = copy_x - self.normalize = normalize - - def _check_force_weights(self): - if self.force_weights is None: - return - - if len(self.force_weights) != self.n_clusters: - raise ValueError( - ( - "len(force_weights)={} but must equal " - "n_clusters={}".format(len(self.force_weights), self.n_clusters) - ) - ) - - def _check_fit_data(self, X): - """Verify that the number of samples given is larger than k""" - X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - n_samples, n_features = X.shape - if X.shape[0] < self.n_clusters: - raise ValueError( - "n_samples=%d should be >= n_clusters=%d" - % (X.shape[0], self.n_clusters) - ) - - for ee in range(n_samples): - if sp.issparse(X): - n = sp.linalg.norm(X[ee, :]) - else: - n = np.linalg.norm(X[ee, :]) - - if np.abs(n - 1.0) > 1e-4: - raise ValueError("Data l2-norm must be 1, found {}".format(n)) - - return X - - def _check_test_data(self, X): - X = check_array(X, accept_sparse="csr", dtype=FLOAT_DTYPES) - n_samples, n_features = X.shape - expected_n_features = self.cluster_centers_.shape[1] - if not n_features == expected_n_features: - raise ValueError( - "Incorrect number of features. " - "Got %d features, expected %d" % (n_features, expected_n_features) - ) - - for ee in range(n_samples): - if sp.issparse(X): - n = sp.linalg.norm(X[ee, :]) - else: - n = np.linalg.norm(X[ee, :]) - - if np.abs(n - 1.0) > 1e-4: - raise ValueError("Data l2-norm must be 1, found {}".format(n)) - - return X - - def fit(self, X, y=None): - """Compute mixture of von Mises Fisher clustering. - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - """ - if self.normalize: - X = normalize(X) - - self._check_force_weights() - random_state = check_random_state(self.random_state) - X = self._check_fit_data(X) - - ( - self.cluster_centers_, - self.labels_, - self.inertia_, - self.weights_, - self.concentrations_, - self.posterior_, - ) = movMF( - X, - self.n_clusters, - posterior_type=self.posterior_type, - force_weights=self.force_weights, - n_init=self.n_init, - n_jobs=self.n_jobs, - max_iter=self.max_iter, - verbose=self.verbose, - init=self.init, - random_state=random_state, - tol=self.tol, - copy_x=self.copy_x, - ) - - return self - - def fit_predict(self, X, y=None): - """Compute cluster centers and predict cluster index for each sample. - Convenience method; equivalent to calling fit(X) followed by - predict(X). - """ - return self.fit(X).labels_ - - def fit_transform(self, X, y=None): - """Compute clustering and transform X to cluster-distance space. - Equivalent to fit(X).transform(X), but more efficiently implemented. - """ - # Currently, this just skips a copy of the data if it is not in - # np.array or CSR format already. - # XXX This skips _check_test_data, which may change the dtype; - # we should refactor the input validation. - return self.fit(X)._transform(X) - - def transform(self, X, y=None): - """Transform X to a cluster-distance space. 
- In the new space, each dimension is the cosine distance to the cluster - centers. Note that even if X is sparse, the array returned by - `transform` will typically be dense. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - New data to transform. - - Returns - ------- - X_new : array, shape [n_samples, k] - X transformed in the new space. - """ - if self.normalize: - X = normalize(X) - - check_is_fitted(self) - X = self._check_test_data(X) - return self._transform(X) - - def _transform(self, X): - """guts of transform method; no input validation""" - return cosine_distances(X, self.cluster_centers_) - - def predict(self, X): - """Predict the closest cluster each sample in X belongs to. - In the vector quantization literature, `cluster_centers_` is called - the code book and each value returned by `predict` is the index of - the closest code in the code book. - - Note: Does not check that each point is on the sphere. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - New data to predict. - - Returns - ------- - labels : array, shape [n_samples,] - Index of the cluster each sample belongs to. - """ - if self.normalize: - X = normalize(X) - - check_is_fitted(self) - - X = self._check_test_data(X) - return _labels_inertia(X, self.cluster_centers_)[0] - - def score(self, X, y=None): - """Inertia score (sum of all distances to closest cluster). - - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - New data. - - Returns - ------- - score : float - Larger score is better. - """ - if self.normalize: - X = normalize(X) - - check_is_fitted(self) - X = self._check_test_data(X) - return -_labels_inertia(X, self.cluster_centers_)[1] - - def log_likelihood(self, X): - check_is_fitted(self) - - return _log_likelihood( - X, self.cluster_centers_, self.weights_, self.concentrations_ - )
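For reference, a minimal usage sketch of the vendored estimators after these two patches. This is not part of either diff; it assumes the snippet is run from inside code/ (so that the top-level spherecluster package added in PATCH 1/2 is importable), that code/spherecluster/__init__.py re-exports the estimators as the removed nested __init__.py did, and that the pinned scikit-learn release still provides the private k-means helpers the vendored modules import. Data shapes and parameter values are illustrative only.

    import numpy as np
    from sklearn.preprocessing import normalize

    # vendored package under code/; assumed to re-export both estimators
    from spherecluster import SphericalKMeans, VonMisesFisherMixture

    # toy data: both estimators cluster on the unit sphere, so rows are l2-normalized
    rng = np.random.RandomState(0)
    X = normalize(rng.randn(200, 50))

    # spherical k-means: centroids are re-normalized after every update
    skm = SphericalKMeans(n_clusters=5, random_state=0)
    skm.fit(X)
    print(skm.cluster_centers_.shape, skm.inertia_)

    # mixture of von Mises-Fisher distributions with soft assignments
    vmf = VonMisesFisherMixture(n_clusters=5, posterior_type="soft", random_state=0)
    vmf.fit(X)
    print(vmf.labels_[:10], vmf.concentrations_)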