From 27a411f2d40161d10f12807258b13077a1111ca3 Mon Sep 17 00:00:00 2001 From: Cezar Sas Date: Tue, 14 Dec 2021 08:39:26 +0100 Subject: [PATCH 1/2] Switched to python 3, and added poetry package management for easy environment setup --- code/__init__.py | 0 code/bdi.py | 14 +- code/case_ranker.py | 125 +-- code/caseslim.py | 8 +- code/cluster-preprocess.py | 4 +- code/cluster.py | 8 +- code/compress.py | 16 +- code/dataset.py | 4 +- .../fetch_papers_relevant.py | 26 +- code/embedding_deprecated/metrics.py | 32 +- code/eval.py | 12 +- code/gen_eval.py | 12 +- code/labeling.py | 18 +- code/local_embedding_training.py | 4 +- code/main.py | 15 +- code/paras.py | 8 +- code/postprocess/redundancy.py | 4 +- code/postprocess/visualize.py | 10 +- code/preprocess.py | 4 +- code/preprocess/AutoPhraseOutput.py | 14 +- code/preprocess/SegPhraseOutput.py | 16 +- code/preprocess/main.py | 6 +- code/run.sh | 8 +- code/sim_emb.py | 14 +- code/spherecluster/__init__.py | 12 + code/spherecluster/spherecluster/__init__.py | 6 + .../spherecluster/spherical_kmeans.py | 372 +++++++ .../spherecluster/tests/test_common.py | 6 + .../tests/test_von_mises_fisher_mixture.py | 205 ++++ code/spherecluster/spherecluster/util.py | 57 ++ .../spherecluster/von_mises_fisher_mixture.py | 946 ++++++++++++++++++ code/spherecluster/spherical_kmeans.py | 424 ++++++++ code/spherecluster/util.py | 57 ++ .../spherecluster/von_mises_fisher_mixture.py | 946 ++++++++++++++++++ code/taxonomy.py | 2 +- code/utils.py | 12 +- pyproject.toml | 21 + 37 files changed, 3253 insertions(+), 195 deletions(-) create mode 100644 code/__init__.py mode change 100644 => 100755 code/run.sh create mode 100644 code/spherecluster/__init__.py create mode 100644 code/spherecluster/spherecluster/__init__.py create mode 100644 code/spherecluster/spherecluster/spherical_kmeans.py create mode 100644 code/spherecluster/spherecluster/tests/test_common.py create mode 100644 code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py create mode 100644 code/spherecluster/spherecluster/util.py create mode 100644 code/spherecluster/spherecluster/von_mises_fisher_mixture.py create mode 100644 code/spherecluster/spherical_kmeans.py create mode 100644 code/spherecluster/util.py create mode 100644 code/spherecluster/von_mises_fisher_mixture.py create mode 100644 pyproject.toml diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/code/bdi.py b/code/bdi.py index 0913bed..5a09a8a 100644 --- a/code/bdi.py +++ b/code/bdi.py @@ -2,7 +2,7 @@ from os import listdir from collections import defaultdict import utils -import Queue +import queue import argparse from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -55,23 +55,23 @@ def compute_dbi(embs, clus_map, center_map): def output_dbi(dbi_scores): dbi_by_lvl = {} - all_dbi = [x[0] for x in dbi_scores.values()] + all_dbi = [x[0] for x in list(dbi_scores.values())] - print 'Average DBI for all is: %f' % (sum(all_dbi) / len(all_dbi)) + print('Average DBI for all is: %f' % (sum(all_dbi) / len(all_dbi))) - for x in dbi_scores.values(): + for x in list(dbi_scores.values()): if x[1] not in dbi_by_lvl: dbi_by_lvl[x[1]] = [] dbi_by_lvl[x[1]].append(x[0]) for lvl in dbi_by_lvl: dbis = dbi_by_lvl[lvl] - print 'Average DBI for level %d is: %f' % (lvl, sum(dbis) / len(dbis)) + print('Average DBI for level %d is: %f' % (lvl, sum(dbis) / len(dbis))) def recursion(root, lvl): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1, 1, 
'*')) dbi_scores = {} @@ -102,7 +102,7 @@ def recursion(root, lvl): # handle current dbi = compute_dbi(embs, clus_map, hier_map) - print 'Computing DBI for %s: %f' % (c_name, dbi) + print('Computing DBI for %s: %f' % (c_name, dbi)) dbi_scores[c_name] = (dbi, level) output_dbi(dbi_scores) diff --git a/code/case_ranker.py b/code/case_ranker.py index 22da68c..e27cece 100644 --- a/code/case_ranker.py +++ b/code/case_ranker.py @@ -8,80 +8,81 @@ import utils import operator + def read_caseolap_result(case_file): - phrase_map = {} - cell_map = {} - - cell_cnt = 0 - with open(case_file) as f: - for line in f: - cell_cnt += 1 - segments = line.strip('\r\n ').split('\t') - cell_id, phs_str = segments[0], segments[1][1:-1] - cell_map[cell_id] = [] - segments = phs_str.split(', ') - for ph_score in segments: - parts = ph_score.split('|') - ph, score = parts[0], float(parts[1]) - if ph not in phrase_map: - phrase_map[ph] = {} - phrase_map[ph][cell_id] = score - cell_map[cell_id].append((ph, score)) - - return phrase_map, cell_map, cell_cnt + phrase_map = {} + cell_map = {} + + cell_cnt = 0 + with open(case_file) as f: + for line in f: + cell_cnt += 1 + segments = line.strip('\r\n ').split('\t') + cell_id, phs_str = segments[0], segments[1][1:-1] + cell_map[cell_id] = [] + segments = phs_str.split(', ') + for ph_score in segments: + parts = ph_score.split('|') + ph, score = parts[0], float(parts[1]) + if ph not in phrase_map: + phrase_map[ph] = {} + phrase_map[ph][cell_id] = score + cell_map[cell_id].append((ph, score)) + + return phrase_map, cell_map, cell_cnt def rank_phrase(case_file): + ph_dist_map = {} + smoothing_factor = 0.0 - ph_dist_map = {} - smoothing_factor = 0.0 - - phrase_map, cell_map, cell_cnt = read_caseolap_result(case_file) + phrase_map, cell_map, cell_cnt = read_caseolap_result(case_file) + print(cell_cnt) + unif = [1.0 / cell_cnt] * cell_cnt - unif = [1.0 / cell_cnt] * cell_cnt + for ph in phrase_map: + ph_vec = [x[1] for x in phrase_map[ph].items()] + if len(ph_vec) < cell_cnt: + ph_vec += [0] * (cell_cnt - len(ph_vec)) + # smoothing + ph_vec = [x + smoothing_factor for x in ph_vec] + ph_vec = utils.l1_normalize(ph_vec) + ph_dist_map[ph] = utils.kl_divergence(ph_vec, unif) - for ph in phrase_map: - ph_vec = [x[1] for x in phrase_map[ph].iteritems()] - if len(ph_vec) < cell_cnt: - ph_vec += [0] * (cell_cnt - len(ph_vec)) - # smoothing - ph_vec = [x + smoothing_factor for x in ph_vec] - ph_vec = utils.l1_normalize(ph_vec) - ph_dist_map[ph] = utils.kl_divergence(ph_vec, unif) + ranked_list = sorted(list(ph_dist_map.items()), key=operator.itemgetter(1), reverse=True) - ranked_list = sorted(ph_dist_map.items(), key=operator.itemgetter(1), reverse=True) - - return ranked_list + return ranked_list def write_keywords(o_file, ranked_list, thres): - with open(o_file, 'w+') as g: - for ph in ranked_list: - if ph[1] > thres: - g.write('%s\n' % (ph[0])) - tmp_file = o_file + '-score.txt' - with open(tmp_file, 'w+') as g: - for ph in ranked_list: - g.write('%s\t%f\n' % (ph[0], ph[1])) + with open(o_file, 'w+') as g: + for ph in ranked_list: + if ph[1] > thres: + g.write('%s\n' % (ph[0])) + tmp_file = o_file + '-score.txt' + with open(tmp_file, 'w+') as g: + for ph in ranked_list: + g.write('%s\t%f\n' % (ph[0], ph[1])) -def main_rank_phrase(input_f, output_f, thres): - ranked_list = rank_phrase(input_f) - write_keywords(output_f, ranked_list, thres) - print("[CaseOLAP] Finish pushing general terms up") -if __name__ == "__main__": - parser = argparse.ArgumentParser(prog='case_ranker.py', \ - 
description='Ranks the distinctiveness score using caseolap result.') - parser.add_argument('-folder', required=True, \ - help='The folder that stores the file.') - parser.add_argument('-iter', required=True, \ - help='Iteration index.') - parser.add_argument('-thres', required=True, \ - help='The files used.') - args = parser.parse_args() - - input_f = '%s/caseolap-%s.txt' % (args.folder, args.iter) - output_f = '%s/keywords-%s.txt' % (args.folder, args.iter) +def main_rank_phrase(input_f, output_f, thres): + ranked_list = rank_phrase(input_f) + write_keywords(output_f, ranked_list, thres) + print("[CaseOLAP] Finish pushing general terms up") - main_rank_phrase(input_f, output_f, float(args.thres)) +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog='case_ranker.py', \ + description='Ranks the distinctiveness score using caseolap result.') + parser.add_argument('-folder', required=True, \ + help='The folder that stores the file.') + parser.add_argument('-iter', required=True, \ + help='Iteration index.') + parser.add_argument('-thres', required=True, \ + help='The files used.') + args = parser.parse_args() + + input_f = '%s/caseolap-%s.txt' % (args.folder, args.iter) + output_f = '%s/keywords-%s.txt' % (args.folder, args.iter) + + main_rank_phrase(input_f, output_f, float(args.thres)) diff --git a/code/caseslim.py b/code/caseslim.py index 65a5cfb..0b5b076 100644 --- a/code/caseslim.py +++ b/code/caseslim.py @@ -66,7 +66,7 @@ def compute(self, score_type='ALL'): group = [(self_df, self.max_df, self_cnt, sum_self)] self.context_groups[phrase] = [] - for phrase_group, phrase_values in self.phrase_cnt_context.items(): + for phrase_group, phrase_values in list(self.phrase_cnt_context.items()): context_df = self.phrase_df_context[phrase_group].get(phrase, 0) sum_context = self.sum_cnt_context[phrase_group] context_cnt = phrase_values.get(phrase, 0) @@ -167,7 +167,7 @@ def __init__(self, freq_data, selected_docs, context_doc_groups, global_scores=N self.sum_cnt = sum(self.phrase_cnt.values()) self.sum_cnt_context = {} self.global_scores = global_scores - for group, docs in context_doc_groups.items(): + for group, docs in list(context_doc_groups.items()): self.phrase_cnt_context[group], self.phrase_df_context[group] = self.agg_phrase_cnt_df(freq_data, docs) if len(self.phrase_df_context[group]) > 0: self.max_df_context[group] = max(self.phrase_df_context[group].values()) @@ -248,7 +248,7 @@ def run_caseolap(cells, freq_data, target_phs, o_file, verbose=3, top_k=200): of = open(o_file, 'w+') for cell in cells: - print('[CaseOLAP] Running CaseOLAP for cell: %s' % cell) + print(('[CaseOLAP] Running CaseOLAP for cell: %s' % cell)) selected_docs = cells[cell] context_doc_groups = copy.copy(cells) @@ -260,7 +260,7 @@ def run_caseolap(cells, freq_data, target_phs, o_file, verbose=3, top_k=200): phr_str = ', '.join([ph[0] + '|' + str(ph[1]) for ph in top_phrases if ph[0] in target_phs]) of.write('[%s]\n' % phr_str) - print('[CaseOLAP] Finished CaseOLAP for cell: %s' % cell) + print(('[CaseOLAP] Finished CaseOLAP for cell: %s' % cell)) def main_caseolap(link_f, cell_f, token_f, output_f): diff --git a/code/cluster-preprocess.py b/code/cluster-preprocess.py index c547540..44ea0eb 100644 --- a/code/cluster-preprocess.py +++ b/code/cluster-preprocess.py @@ -87,7 +87,7 @@ def gen_doc_keyword_cnt_file(doc_file, keyword_cnt_file): def counter_to_string(counter): elements = [] - for k, v in counter.items(): + for k, v in list(counter.items()): elements.append(k) elements.append(v) return 
'\t'.join([str(e) for e in elements]) @@ -129,7 +129,7 @@ def main(raw_dir, input_dir, init_dir): # input_dir = '/shared/data/czhang82/projects/local-embedding/sp/input/' # init_dir = '/shared/data/czhang82/projects/local-embedding/sp/init/' if __name__ == '__main__': - corpusName = sys.argv[1] + corpusName = 'dblp' raw_dir = '../data/'+corpusName+'/raw/' input_dir = '../data/'+corpusName+'/input/' init_dir = '../data/'+corpusName+'/init/' diff --git a/code/cluster.py b/code/cluster.py index 9bc25ff..5e89e0f 100644 --- a/code/cluster.py +++ b/code/cluster.py @@ -28,7 +28,7 @@ def fit(self): self.membership = labels self.center_ids = self.gen_center_idx() self.inertia_scores = self.clus.inertia_ - print('Clustering concentration score:', self.inertia_scores) + print(('Clustering concentration score:', self.inertia_scores)) # find the idx of each cluster center def gen_center_idx(self): @@ -58,13 +58,13 @@ def calc_cosine(self, vec_a, vec_b): def run_clustering(full_data, doc_id_file, filter_keyword_file, n_cluster, parent_direcotry, parent_description,\ cluster_keyword_file, hierarchy_file, doc_membership_file): dataset = SubDataSet(full_data, doc_id_file, filter_keyword_file) - print('Start clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Start clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description)) ## TODO: change later here for n_cluster selection from a range clus = Clusterer(dataset.embeddings, n_cluster) clus.fit() - print('Done clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Done clustering for ', len(dataset.keywords), ' keywords under parent:', parent_description)) dataset.write_cluster_members(clus, cluster_keyword_file, parent_direcotry) center_names = dataset.write_cluster_centers(clus, parent_description, hierarchy_file) dataset.write_document_membership(clus, doc_membership_file, parent_direcotry) - print('Done saving cluster results for ', len(dataset.keywords), ' keywords under parent:', parent_description) + print(('Done saving cluster results for ', len(dataset.keywords), ' keywords under parent:', parent_description)) return center_names diff --git a/code/compress.py b/code/compress.py index d444ad5..4fff169 100644 --- a/code/compress.py +++ b/code/compress.py @@ -1,7 +1,7 @@ import argparse import utils import operator -import Queue +import queue import math from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -37,12 +37,12 @@ def parse_reidx(reidx_f): if len(pd_map[ph]) > 0: ph_idf[ph] = math.log(float(d_cnt) / len(pd_map[ph])) - print 'Inverted Index file read.' 
+ print('Inverted Index file read.') def get_rep(folder, c_id, N): - print('Start get representative phrases for %s, %s ========================' % (folder, c_id)) + print(('Start get representative phrases for %s, %s ========================' % (folder, c_id))) # print folder par_folder = dirname(folder) cur_label = basename(folder) @@ -72,7 +72,7 @@ def get_rep(folder, c_id, N): emb_dist = utils.cossim(embs[ph], embs[cur_label]) ph_scores[ph] = score * emb_dist - ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True) + ph_scores = sorted(list(ph_scores.items()), key=operator.itemgetter(1), reverse=True) for (ph, score) in ph_scores: if ph not in result_phrases and ph in kws: @@ -81,7 +81,7 @@ def get_rep(folder, c_id, N): break elif ph_idf == None: - print 'looking at embeddings for %s' % folder + print('looking at embeddings for %s' % folder) ph_f = '%s/embeddings.txt' % par_folder kw_f = '%s/keywords.txt' % par_folder @@ -119,7 +119,7 @@ def get_rep(folder, c_id, N): result_phrases.append(ph) else: # Using TF-IDF to generate - print 'looking at tf-idf for %s' % folder + print('looking at tf-idf for %s' % folder) d_clus_f = '%s/paper_cluster.txt' % par_folder kw_clus_f = '%s/cluster_keywords.txt' % par_folder docs = [] @@ -148,7 +148,7 @@ def get_rep(folder, c_id, N): continue ph_scores[ph] = 1 + math.log(ph_scores[ph]) ph_scores[ph] *= ph_idf[ph] - ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True) + ph_scores = sorted(list(ph_scores.items()), key=operator.itemgetter(1), reverse=True) for (ph, score) in ph_scores: if ph not in result_phrases: @@ -161,7 +161,7 @@ def get_rep(folder, c_id, N): def recursion(root, o_file, N): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1, '*')) g = open(o_file, 'w+') diff --git a/code/dataset.py b/code/dataset.py index f167919..600adf5 100644 --- a/code/dataset.py +++ b/code/dataset.py @@ -76,7 +76,7 @@ def load_keywords(self, keyword_file, full_data): if keyword in full_data.embeddings: keywords.append(keyword) else: - print(keyword, ' not in the embedding file') + print((keyword, ' not in the embedding file')) return keywords def gen_keyword_id(self): @@ -210,4 +210,4 @@ def assign_document(self, doc_membership): keyword_file = data_dir + 'input/candidates.txt' embedding_file = data_dir + 'input/embeddings.txt' dataset = DataSet(embedding_file, document_file, keyword_file) - print(len(dataset.get_candidate_embeddings())) + print((len(dataset.get_candidate_embeddings()))) diff --git a/code/embedding_deprecated/fetch_papers_relevant.py b/code/embedding_deprecated/fetch_papers_relevant.py index ab5e0cf..d070b1a 100644 --- a/code/embedding_deprecated/fetch_papers_relevant.py +++ b/code/embedding_deprecated/fetch_papers_relevant.py @@ -21,19 +21,19 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): targets, target_set = [term], set([term]) - for i in xrange(N): + for i in range(N): q, s = relevant_dict[term][i][0], relevant_dict[term][i][1] if s > thread and q not in target_set: targets.append(q) target_set.add(q) paper_set = set() - print "Query :", term, len(targets), "---" + print("Query :", term, len(targets), "---") for t in targets: if t not in tp_map: - print "(%s)" % t, + print("(%s)" % t, end=' ') continue paper_set = paper_set | tp_map[t] - print "(%s, %d) " % (t, len(paper_set)), + print("(%s, %d) " % (t, len(paper_set)), end=' ') return paper_set @@ -71,15 +71,15 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): for term in query_terms: relevant_dict[term] = [] - 
print "%s/%s" % (args.relevant, term) + print("%s/%s" % (args.relevant, term)) data = open("%s/%s" % (args.relevant, term)) - print term + print(term) for line in data: values = line.strip().split() relevant_dict[term].append((values[0], float(values[1]))) - print values[0], + print(values[0], end=' ') all_terms.add(values[0]) - print "\n" + print("\n") # read paper-term term_paper_map = dict() @@ -93,13 +93,13 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): if term not in term_paper_map: term_paper_map[term] = set() term_paper_map[term].add(paper) - print "Finish reading paper_term.net" + print("Finish reading paper_term.net") data = open(args.query) for line in data: queries = line.strip().split("#") paper_set = None - for i in xrange(len(queries)): + for i in range(len(queries)): if i == 0: paper_set = fetch_paper(threshold, N, queries[i], relevant_dict, term_paper_map) @@ -107,9 +107,9 @@ def fetch_paper(thread, N, term, relevant_dict, tp_map): paper_set = paper_set | fetch_paper(threshold, N, queries[i], relevant_dict, term_paper_map) - print "For query :", queries[i], - print "Before :", len(term_paper_map[queries[i]]), - print "After :", len(paper_set) + print("For query :", queries[i], end=' ') + print("Before :", len(term_paper_map[queries[i]]), end=' ') + print("After :", len(paper_set)) path = "%s/%s" % (args.output, "_".join(queries)) diff --git a/code/embedding_deprecated/metrics.py b/code/embedding_deprecated/metrics.py index 8658cc3..07b1761 100644 --- a/code/embedding_deprecated/metrics.py +++ b/code/embedding_deprecated/metrics.py @@ -29,23 +29,23 @@ def precision(candidates, ground_truth): p_5, p_10, p_20 = 0, 0, 0 true_positive = 0 - for i in xrange(5): + for i in range(5): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_5 = float(true_positive) / 5.0 - for i in xrange(5, 10): + for i in range(5, 10): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_10 = float(true_positive) / 10.0 - for i in xrange(10, 20): + for i in range(10, 20): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 p_20 = float(true_positive) / 20.0 true_positive = 0 - for i in xrange(min(R, C)): + for i in range(min(R, C)): if i < len(candidates) and candidates[i] in ground_truth: true_positive += 1 @@ -58,7 +58,7 @@ def MAP(candidates, ground_truth): C = len(candidates) R = len(ground_truth) true_positive, precision, total_precision = 0, 0, 0 - for n in xrange(1, min(C, R) + 1): + for n in range(1, min(C, R) + 1): if candidates[n - 1] in ground_truth: true_positive += 1 precision = float(true_positive) / float(n) @@ -68,7 +68,7 @@ def MAP(candidates, ground_truth): def RR(candidates, ground_truth): result = 0 nCandidates = len(candidates) - for n in xrange(nCandidates): + for n in range(nCandidates): if candidates[n] in ground_truth: result = n + 1 break @@ -77,7 +77,7 @@ def RR(candidates, ground_truth): def NDCG(candidates, ground_truth, K = 20): DCG, IDCG = 0, 0 ndcg_5, ndcg_10, ndcg_20 = 0, 0, 0 - for i in xrange(K): + for i in range(K): x = 1.0 / log(i + 2, 2) IDCG += x if i < len(candidates) and candidates[i] in ground_truth: @@ -93,7 +93,7 @@ def NDCG(candidates, ground_truth, K = 20): def bpref(candidates, ground_truth): R = len(candidates) false_positive, bpref, k = 0, 0, 0 - for n in xrange(R): + for n in range(R): if candidates[n] not in ground_truth: false_positive += 1 else: @@ -104,11 +104,11 @@ def bpref(candidates, ground_truth): return bpref / float(k) def printResults(name, result): - 
print name, - print "%.4f" % (np.mean(result)), + print(name, end=' ') + print("%.4f" % (np.mean(result)), end=' ') for res in result: - print "%.4f" % res, - print + print("%.4f" % res, end=' ') + print() if __name__ == "__main__": parser = argparse.ArgumentParser(prog='metrics.py', \ @@ -131,11 +131,11 @@ def printResults(name, result): for line in query_data: query = line.split("\n")[0].replace(" ", "_") queries.append(query) - print "measure overall", " ".join(queries) + print("measure overall", " ".join(queries)) Precision_5, Precision_10, Precision_20, Precision_R, l_MAP, l_MRR, l_bpref = \ - list([[] for i in xrange(7)]) - NDCG_5, NDCG_10, NDCG_20 = list([[] for i in xrange(3)]) + list([[] for i in range(7)]) + NDCG_5, NDCG_10, NDCG_20 = list([[] for i in range(3)]) for query in queries: candidate_file = input_folder + "/" + query + "/author.weights" @@ -145,7 +145,7 @@ def printResults(name, result): for line in data: value = line.split("\n")[0].split() scores[value[0]] = float(value[1]) - scores = sorted(scores.items(), key=lambda x:x[1], reverse=True) + scores = sorted(list(scores.items()), key=lambda x:x[1], reverse=True) candidates = [x[0] for x in scores] diff --git a/code/eval.py b/code/eval.py index bd67933..ff239a5 100644 --- a/code/eval.py +++ b/code/eval.py @@ -13,7 +13,7 @@ def handler(folder): if True: files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'intrusion_exp' in f] - print files + print(files) intru_gold = '%s/%s' % (folder, 'intrusion_gold.txt') @@ -51,9 +51,9 @@ def handler(folder): # print idx - print '\nIntrusion results!!' + print('\nIntrusion results!!') for method in total: - print '%s accuracy: %d/%d - %f' % (method, correct[method], total[method], float(correct[method]) / total[method]) + print('%s accuracy: %d/%d - %f' % (method, correct[method], total[method], float(correct[method]) / total[method])) # files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'isa_exp' in f] @@ -95,7 +95,7 @@ def handler(folder): # subdomain result files = ['%s/%s' % (folder, f) for f in listdir(folder) if isfile(join(folder, f)) and 'subdomain_exp' in f] - print files + print(files) intru_gold = '%s/%s' % (folder, 'subdomain_gold.txt') @@ -129,9 +129,9 @@ def handler(folder): if value == 'n': incorrect[method] += 1 - print '\nSubdomain results!!' 
+ print('\nSubdomain results!!') for method in total: - print '%s accuracy: %d/%d/%d - %f' % (method, correct[method], incorrect[method], total[method], float(correct[method]) / total[method]) + print('%s accuracy: %d/%d/%d - %f' % (method, correct[method], incorrect[method], total[method], float(correct[method]) / total[method])) if __name__ == "__main__": diff --git a/code/gen_eval.py b/code/gen_eval.py index 1407708..cdc8926 100644 --- a/code/gen_eval.py +++ b/code/gen_eval.py @@ -59,11 +59,11 @@ def gen_isa_pairs(tax, isa_N, case_N): p_phs = '|'.join(p_node.ph_list[:isa_N]) rmd_phs = '|'.join(rmd_node.ph_list[:isa_N]) if len(p_phs) == 0 or len(rmd_phs) == 0: - print tax - print n_phs - print node.name - print p_node.name - print rmd_node.name + print(tax) + print(n_phs) + print(node.name) + print(p_node.name) + print(rmd_node.name) exit(1) order = random.choice([0, 1]) p_id = 0 @@ -122,7 +122,7 @@ def handler(folder, output, N, isa_N, case_N): subdomain_map = {} for tax_name in taxs: - print tax_name + print(tax_name) # generate intrusion pairs # intru_f = '%s/%s.intrusion' % (output, tax_name) intru_maps[tax_name] = gen_intrusion_pairs(taxs[tax_name], N, case_N) diff --git a/code/labeling.py b/code/labeling.py index f05ef0d..4234e9e 100644 --- a/code/labeling.py +++ b/code/labeling.py @@ -1,13 +1,13 @@ import argparse import utils import operator -import Queue +import queue from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename from case_ranker import read_caseolap_result, rank_phrase def label_emb_centric(folder, c_id): - print 'Start labeling for %s, %s ========================' % (folder, c_id) + print('Start labeling for %s, %s ========================' % (folder, c_id)) # print folder par_folder = dirname(folder) cur_label = basename(folder) @@ -18,7 +18,7 @@ def label_emb_centric(folder, c_id): # generate word2vec phrases embs = utils.load_embeddings(emb_f) if cur_label not in embs: - print 'Error!!!' 
+ print('Error!!!') exit(1) N = 100 worst = -100 @@ -58,8 +58,8 @@ def label_emb_centric(folder, c_id): label_cands[ph] = score - ranked_list = sorted(label_cands.items(), key=operator.itemgetter(1), reverse=True) - print ranked_list + ranked_list = sorted(list(label_cands.items()), key=operator.itemgetter(1), reverse=True) + print(ranked_list) return ranked_list[0][0] @@ -129,7 +129,7 @@ def label_emb_centric(folder, c_id): def recursion(root): - q = Queue.Queue() + q = queue.Queue() q.put((root, -1)) label_map = {} @@ -149,10 +149,10 @@ def recursion(root): l = label_emb_centric(c_folder, str(c_id)) cur_label = basename(c_folder) label_map[cur_label] = l - print 'label for %s is: %s\n' % (c_folder, l) + print('label for %s is: %s\n' % (c_folder, l)) except: - for (o_l, l) in label_map.items(): - print '%s ==> %s' % (o_l, l) + for (o_l, l) in list(label_map.items()): + print('%s ==> %s' % (o_l, l)) if __name__ == "__main__": diff --git a/code/local_embedding_training.py b/code/local_embedding_training.py index 7d6fcca..dcab154 100644 --- a/code/local_embedding_training.py +++ b/code/local_embedding_training.py @@ -10,7 +10,7 @@ import os def read_files(folder, parent): - print("[Local-embedding] Reading file:", parent) + print(("[Local-embedding] Reading file:", parent)) emb_file = '%s/embeddings.txt' % folder hier_file = '%s/hierarchy.txt' % folder keyword_file = '%s/keywords.txt' % folder ## here only consider those remaining keywords @@ -106,7 +106,7 @@ def run_word2vec(pd_map, docs, cates, folder): for ph in cates[cate]: c_docs = c_docs.union(pd_map[ph]) - print('Starting cell %s with %d docs.' % (cate, len(c_docs))) + print(('Starting cell %s with %d docs.' % (cate, len(c_docs)))) # save file # sub_folder = '%s/%s' % (folder, cate) diff --git a/code/main.py b/code/main.py index b7a0ff7..e464c02 100644 --- a/code/main.py +++ b/code/main.py @@ -49,20 +49,22 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ n_expand, level, caseolap=True, local_embedding=True): if level > MAX_LEVEL: return - print('============================= Running level ', level, ' and node ', parent, '=============================') + print(('============================= Running level ', level, ' and node ', parent, '=============================')) start = time.time() df = DataFiles(input_dir, node_dir) ## TODO: Everytime we need to read-in the whole corpus, which can be slow. 
full_data = DataSet(df.embedding_file, df.doc_file) end = time.time() - print('[Main] Done reading the full data using time %s seconds' % (end-start)) + print(('[Main] Done reading the full data using time %s seconds' % (end-start))) # filter the keywords if caseolap is False: try: children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent, \ df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file) - except: + except Exception as e: + print(e) + print(e) print('Clustering not finished.') return copyfile(df.seed_keyword_file, df.filtered_keyword_file) @@ -74,7 +76,8 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ try: children = run_clustering(full_data, df.doc_id_file, df.seed_keyword_file, n_cluster, node_dir, parent,\ df.cluster_keyword_file, df.hierarchy_file, df.doc_membership_file) - except: + except Exception as e: + print(e) print('Clustering not finished.') return @@ -82,7 +85,7 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ main_caseolap(df.link_file, df.doc_membership_file, df.cluster_keyword_file, df.caseolap_keyword_file) main_rank_phrase(df.caseolap_keyword_file, df.filtered_keyword_file, filter_thre) end = time.time() - print("[Main] Finish running CaseOALP using %s (seconds)" % (end - start)) + print(("[Main] Finish running CaseOALP using %s (seconds)" % (end - start))) # prepare the embedding for child level if level < MAX_LEVEL: @@ -96,7 +99,7 @@ def recur(input_dir, node_dir, n_cluster, parent, n_cluster_iter, filter_thre,\ start = time.time() main_local_embedding(node_dir, df.doc_file, df.index_file, parent, n_expand) end = time.time() - print("[Main] Finish running local embedding training using %s (seconds)" % (end - start)) + print(("[Main] Finish running local embedding training using %s (seconds)" % (end - start))) for child in children: recur(input_dir, node_dir + child + '/', n_cluster, child, n_cluster_iter, \ diff --git a/code/paras.py b/code/paras.py index 027ae53..f29e54b 100644 --- a/code/paras.py +++ b/code/paras.py @@ -99,14 +99,14 @@ def load_sp_params(): def load_dblp_params_method(): pd = dict() - pd['data_dir'] = '/shared/data/jiaming/local-embedding/data/dblp/' + pd['data_dir'] = '/home/sasce/PycharmProjects/taxogen/data/dblp/' pd['doc_file'] = pd['data_dir'] + 'input/papers.txt' pd['doc_keyword_cnt_file'] = pd['data_dir'] + 'input/keyword_cnt.txt' pd['input_dir'] = pd['data_dir'] + 'input/' pd['root_node_dir'] = pd['data_dir'] + 'cluster/' pd['n_cluster'] = 5 - pd['filter_thre'] = 0.25 - pd['n_expand'] = 100 - pd['n_cluster_iter'] = 2 + pd['filter_thre'] = 0.15 + pd['n_expand'] = 200 + pd['n_cluster_iter'] = 5 return pd diff --git a/code/postprocess/redundancy.py b/code/postprocess/redundancy.py index 256810c..af505ea 100644 --- a/code/postprocess/redundancy.py +++ b/code/postprocess/redundancy.py @@ -31,7 +31,7 @@ def load_taxonomy(tax_file): def compute_pmi(inverted_index, tax): values = [] - for node, keywords in tax.items(): + for node, keywords in list(tax.items()): pairs = set(frozenset([i, j]) for i in keywords for j in keywords if i != j) for p in pairs: pair = list(p) @@ -54,7 +54,7 @@ def main(idx_file, taxonomy_dir, output_file): inverted_index = load_inverted_index(idx_file) # print len(inverted_index) taxonomy_files = get_tax_filenames(taxonomy_dir) - print taxonomy_files + print(taxonomy_files) with open(output_file, 'a') as fout: for tax_file in taxonomy_files: tax = load_taxonomy(tax_file) diff --git 
a/code/postprocess/visualize.py b/code/postprocess/visualize.py index 87ed098..f63f99a 100644 --- a/code/postprocess/visualize.py +++ b/code/postprocess/visualize.py @@ -13,7 +13,7 @@ def load_nodes(node_file, min_level=1, max_level=3, prefix_list=['*']): node_content = [] nodes[node_id] = node_content prune_nodes = {} - for node_id, node_content in nodes.items(): + for node_id, node_content in list(nodes.items()): # prune nodes if not has_one_prefix(node_id, prefix_list): continue @@ -43,11 +43,11 @@ def is_exact_prefix(s, prefix): def gen_edges(nodes): - node_ids = nodes.keys() + node_ids = list(nodes.keys()) node_ids.sort(key=lambda x: len(x)) edges = [] - for i in xrange(len(nodes) - 1): - for j in xrange(i+1, len(nodes)): + for i in range(len(nodes) - 1): + for j in range(i+1, len(nodes)): if is_parent(node_ids[i], node_ids[j]): edges.append([node_ids[i], node_ids[j]]) return edges @@ -75,7 +75,7 @@ def gen_node_label(node_id, node_content): def draw(nodes, edges, output_file): d = Digraph(node_attr={'shape': 'record'}) - for node_id, node_content in nodes.items(): + for node_id, node_content in list(nodes.items()): d.node(node_id, gen_node_label(node_id, node_content)) for e in edges: d.edge(e[0], e[1]) diff --git a/code/preprocess.py b/code/preprocess.py index 10446bc..8499e5e 100644 --- a/code/preprocess.py +++ b/code/preprocess.py @@ -17,7 +17,7 @@ def get_candidates(folder, o_file): phrase = line.split(' ')[0] quality_phrases.add(phrase) - print('Quality phrase count: ' + str(len(quality_phrases))) + print(('Quality phrase count: ' + str(len(quality_phrases)))) with open(o_file, 'w+') as g: for phrase in quality_phrases: @@ -49,7 +49,7 @@ def get_reidx_file(text, cand_f, o_file): pd_map[t].add(str(idx)) idx += 1 if idx % 10000 == 0: - print("[Construct Inverted Index] Parse %s documents" % idx) + print(("[Construct Inverted Index] Parse %s documents" % idx)) with open(o_file, 'w+') as g: for ph in pd_map: diff --git a/code/preprocess/AutoPhraseOutput.py b/code/preprocess/AutoPhraseOutput.py index cf7b8e0..3b72141 100644 --- a/code/preprocess/AutoPhraseOutput.py +++ b/code/preprocess/AutoPhraseOutput.py @@ -67,7 +67,7 @@ def parse_one_doc(self, doc): # sanity checking if (len(q) != 0): - print("[ERROR]: mismatched in document: %s" % doc) + print(("[ERROR]: mismatched in document: %s" % doc)) def save_phrase_to_pos_sequence(self, output_path=""): with open(output_path, "w") as fout: @@ -88,7 +88,7 @@ def load_phrase_to_pos_sequence(self, input_path=""): def obtain_pos_sequence_to_score(self): pos_sequence_2_score = {} - for v in self.phrase_to_pos_sequence.values(): + for v in list(self.phrase_to_pos_sequence.values()): for pos_sequence in v: if pos_sequence not in pos_sequence_2_score: pos_sequence_list = pos_sequence.split() @@ -103,8 +103,8 @@ def obtain_pos_sequence_to_score(self): self.pos_sequence_to_score = pos_sequence_2_score - print(len(self.pos_sequence_to_score)) - print(self.pos_sequence_to_score) + print((len(self.pos_sequence_to_score))) + print((self.pos_sequence_to_score)) def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): candidate_phrase = [] @@ -113,15 +113,15 @@ def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): freq = sum(self.phrase_to_pos_sequence[phrase].values()) if freq < min_sup: continue - for pos_sequence in self.phrase_to_pos_sequence[phrase].keys(): + for pos_sequence in list(self.phrase_to_pos_sequence[phrase].keys()): pos_sequence_weight = float(self.phrase_to_pos_sequence[phrase][pos_sequence]) / freq pos_sequence_score 
= self.pos_sequence_to_score[pos_sequence] phrase_score += (pos_sequence_weight * pos_sequence_score) if phrase_score >= threshold: # print(phrase, phrase_score) candidate_phrase.append(phrase) - print(len(candidate_phrase)) - print(candidate_phrase[0:10]) + print((len(candidate_phrase))) + print((candidate_phrase[0:10])) self.candidate_phrase = candidate_phrase def save_candidate_phrase(self, output_path=""): diff --git a/code/preprocess/SegPhraseOutput.py b/code/preprocess/SegPhraseOutput.py index d869a0f..ccd9a68 100644 --- a/code/preprocess/SegPhraseOutput.py +++ b/code/preprocess/SegPhraseOutput.py @@ -65,7 +65,7 @@ def parse_one_doc(self, doc): # sanity checking if (len(q) != 0): - print("[ERROR]: mismatched in document: %s" % doc) + print(("[ERROR]: mismatched in document: %s" % doc)) def save_phrase_to_pos_sequence(self, output_path=""): with open(output_path, "w") as fout: @@ -83,11 +83,11 @@ def load_phrase_to_pos_sequence(self, input_path=""): phrase = seg[0] pos_sequence = eval(seg[1]) self.phrase_to_pos_sequence[phrase] = pos_sequence - print("[INFO] Number of phrases before NP pruning = ", len(self.phrase_to_pos_sequence)) + print(("[INFO] Number of phrases before NP pruning = ", len(self.phrase_to_pos_sequence))) def obtain_pos_sequence_to_score(self): pos_sequence_2_score = {} - for v in self.phrase_to_pos_sequence.values(): + for v in list(self.phrase_to_pos_sequence.values()): for pos_sequence in v: if pos_sequence not in pos_sequence_2_score: pos_sequence_list = pos_sequence.split() @@ -102,8 +102,8 @@ def obtain_pos_sequence_to_score(self): self.pos_sequence_to_score = pos_sequence_2_score - print(len(self.pos_sequence_to_score)) - print(self.pos_sequence_to_score) + print((len(self.pos_sequence_to_score))) + print((self.pos_sequence_to_score)) def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): candidate_phrase = [] @@ -112,15 +112,15 @@ def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5): freq = sum(self.phrase_to_pos_sequence[phrase].values()) if freq < min_sup: continue - for pos_sequence in self.phrase_to_pos_sequence[phrase].keys(): + for pos_sequence in list(self.phrase_to_pos_sequence[phrase].keys()): pos_sequence_weight = float(self.phrase_to_pos_sequence[phrase][pos_sequence]) / freq pos_sequence_score = self.pos_sequence_to_score[pos_sequence] phrase_score += (pos_sequence_weight * pos_sequence_score) if phrase_score >= threshold: # print(phrase, phrase_score) candidate_phrase.append(phrase) - print(len(candidate_phrase)) - print(candidate_phrase[0:10]) + print((len(candidate_phrase))) + print((candidate_phrase[0:10])) self.candidate_phrase = candidate_phrase def save_candidate_phrase(self, output_path=""): diff --git a/code/preprocess/main.py b/code/preprocess/main.py index b19a78c..8431e81 100644 --- a/code/preprocess/main.py +++ b/code/preprocess/main.py @@ -3,9 +3,9 @@ __description__: Based on AutoPhrase output, generate 1) papers.txt, 2) keywords.txt __latest_update__: Auguest 2, 2017 ''' -from config import * -from AutoPhraseOutput import AutoPhraseOutput -from SegPhraseOutput import SegPhraseOutput +from .config import * +from .AutoPhraseOutput import AutoPhraseOutput +from .SegPhraseOutput import SegPhraseOutput import re from pattern.en import parsetree from pattern.en import parse diff --git a/code/run.sh b/code/run.sh old mode 100644 new mode 100755 index 1bcf2ff..8f0b7db --- a/code/run.sh +++ b/code/run.sh @@ -2,14 +2,16 @@ ## Name of the input corpus corpusName=dblp ## Name of the taxonomy -taxonName=our-l3-0.25 
+taxonName=our-l3-0.15 ## If need preprocessing from raw input, set it to be 1, otherwise, set 0 -FIRST_RUN=${FIRST_RUN:- 0} +FIRST_RUN=${FIRST_RUN:- 1} + +source activate '/home/sasce/.cache/pypoetry/virtualenvs/taxogen-Q1ywnX6T-py3.8/bin/python' if [ $FIRST_RUN -eq 1 ]; then echo 'Start data preprocessing' ## compile word2vec for embedding learning - gcc word2vec.c -o word2veec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result + gcc word2vec.c -o word2vec -lm -pthread -O2 -Wall -funroll-loops -Wno-unused-result ## create initial folder if not exist if [ ! -d ../data/$corpusName/init ]; then diff --git a/code/sim_emb.py b/code/sim_emb.py index fc5ff17..0367eef 100644 --- a/code/sim_emb.py +++ b/code/sim_emb.py @@ -1,7 +1,7 @@ import argparse import utils import operator -import Queue +import queue import math from os import listdir from os.path import isfile, join, isdir, abspath, dirname, basename, exists @@ -28,7 +28,7 @@ def compare(a_f, b_f): b_emb_f = '%s/%s' % (b_f, emb_f) if not exists(a_emb_f) or not exists(b_emb_f): - print 'Embedding file not found' + print('Embedding file not found') exit(1) embs_a = load_embeddings(a_emb_f) @@ -37,9 +37,9 @@ def compare(a_f, b_f): embs_groups = [embs_a, embs_b] while 1: - n_name = raw_input("Enter your node: ") + n_name = input("Enter your node: ") if n_name not in embs_a or n_name not in embs_b: - print '%s not found' % n_name + print('%s not found' % n_name) for embs in embs_groups: t_emb = embs[n_name] @@ -47,11 +47,11 @@ def compare(a_f, b_f): for key in embs: sim_map[key] = utils.cossim(t_emb, embs[key]) - sim_map = sorted(sim_map.items(), key=operator.itemgetter(1), reverse=True) + sim_map = sorted(list(sim_map.items()), key=operator.itemgetter(1), reverse=True) output_str = '\n'.join([sim_map[i][0] + '\t' + str(sim_map[i][1]) for i in range(10)]) # print sim_map[:10] - print output_str - print 'group finished\n' + print(output_str) + print('group finished\n') diff --git a/code/spherecluster/__init__.py b/code/spherecluster/__init__.py new file mode 100644 index 0000000..5a1e5aa --- /dev/null +++ b/code/spherecluster/__init__.py @@ -0,0 +1,12 @@ +from __future__ import absolute_import +from .spherical_kmeans import SphericalKMeans +from .von_mises_fisher_mixture import VonMisesFisherMixture +from .util import sample_vMF +""" +@author = 'Jason Laska', +@author_email = 'jason@claralabs.com', +@url = 'https://github.com/jasonlaska/spherecluster/' +@edited = 'https://github.com/rfayat/spherecluster' +""" + +__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"] diff --git a/code/spherecluster/spherecluster/__init__.py b/code/spherecluster/spherecluster/__init__.py new file mode 100644 index 0000000..5d2c502 --- /dev/null +++ b/code/spherecluster/spherecluster/__init__.py @@ -0,0 +1,6 @@ +from __future__ import absolute_import +from .spherical_kmeans import SphericalKMeans +from .von_mises_fisher_mixture import VonMisesFisherMixture +from .util import sample_vMF + +__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"] diff --git a/code/spherecluster/spherecluster/spherical_kmeans.py b/code/spherecluster/spherecluster/spherical_kmeans.py new file mode 100644 index 0000000..d56e03c --- /dev/null +++ b/code/spherecluster/spherecluster/spherical_kmeans.py @@ -0,0 +1,372 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed + +from sklearn.cluster import KMeans + +# from sklearn.cluster import _k_means +from sklearn.cluster import _k_means_fast as _k_means 
+from sklearn.cluster.k_means_ import ( + _check_sample_weight, + _init_centroids, + _labels_inertia, + _tolerance, + _validate_center_shape, +) +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state +from sklearn.utils.extmath import row_norms, squared_norm +from sklearn.utils.validation import _num_samples + + +def _spherical_kmeans_single_lloyd( + X, + n_clusters, + sample_weight=None, + max_iter=300, + init="k-means++", + verbose=False, + x_squared_norms=None, + random_state=None, + tol=1e-4, + precompute_distances=True, +): + """ + Modified from sklearn.cluster.k_means_.k_means_single_lloyd. + """ + random_state = check_random_state(random_state) + + sample_weight = _check_sample_weight(sample_weight, X) + + best_labels, best_inertia, best_centers = None, None, None + + # init + centers = _init_centroids( + X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms + ) + if verbose: + print("Initialization complete") + + # Allocate memory to store the distances for each sample to its + # closer center for reallocation in case of ties + distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) + + # iterations + for i in range(max_iter): + centers_old = centers.copy() + + # labels assignment + # TODO: _labels_inertia should be done with cosine distance + # since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized + # this doesn't really matter. + labels, inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + centers, + precompute_distances=precompute_distances, + distances=distances, + ) + + # computation of the means + if sp.issparse(X): + centers = _k_means._centers_sparse( + X, sample_weight, labels, n_clusters, distances + ) + else: + centers = _k_means._centers_dense( + X.astype(np.float), + sample_weight.astype(np.float), + labels, + n_clusters, + distances.astype(np.float), + ) + + # l2-normalize centers (this is the main contibution here) + centers = normalize(centers) + + if verbose: + print("Iteration %2d, inertia %.3f" % (i, inertia)) + + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + + center_shift_total = squared_norm(centers_old - centers) + if center_shift_total <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (i, center_shift_total, tol) + ) + break + + if center_shift_total > 0: + # rerun E-step in case of non-convergence so that predicted labels + # match cluster centers + best_labels, best_inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + best_centers, + precompute_distances=precompute_distances, + distances=distances, + ) + + return best_labels, best_inertia, best_centers, i + 1 + + +def spherical_k_means( + X, + n_clusters, + sample_weight=None, + init="k-means++", + n_init=10, + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + n_jobs=1, + algorithm="auto", + return_n_iter=False, +): + """Modified from sklearn.cluster.k_means_.k_means. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% n_init + ) + random_state = check_random_state(random_state) + + if max_iter <= 0: + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % max_iter + ) + + best_inertia = np.infty + # avoid forcing order when copy_x=False + order = "C" if copy_x else None + X = check_array( + X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x + ) + # verify that the number of samples given is larger than k + if _num_samples(X) < n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters) + ) + tol = _tolerance(X, tol) + + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, order="C", copy=True) + _validate_center_shape(X, n_clusters, init) + + if n_init != 1: + warnings.warn( + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) + n_init = 1 + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if n_jobs == 1: + # For a single thread, less memory is needed if we just store one set + # of the best results (as opposed to one set per run per thread). + for it in range(n_init): + # run a k-means once + labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd( + X, + n_clusters, + sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + random_state=random_state, + ) + + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + best_n_iter = n_iter_ + else: + # parallelisation of k-means runs + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=0)( + delayed(_spherical_kmeans_single_lloyd)( + X, + n_clusters, + sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + # Change seed to ensure variety + random_state=seed, + ) + for seed in seeds + ) + + # Get results with the lowest inertia + labels, inertia, centers, n_iters = zip(*results) + best = np.argmin(inertia) + best_labels = labels[best] + best_inertia = inertia[best] + best_centers = centers[best] + best_n_iter = n_iters[best] + + if return_n_iter: + return best_centers, best_labels, best_inertia, best_n_iter + else: + return best_centers, best_labels, best_inertia + + +class SphericalKMeans(KMeans): + """Spherical K-Means clustering + + Modfication of sklearn.cluster.KMeans where cluster centers are normalized + (projected onto the sphere) in each iteration. + + Parameters + ---------- + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init : {'k-means++', 'random' or an ndarray} + Method for initialization, defaults to 'k-means++': + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. 
+ 'random': choose k observations (rows) at random from data for + the initial centroids. + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + tol : float, default: 1e-4 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + + normalize : boolean, default True + Normalize the input to have unnit norm. + + Attributes + ---------- + + cluster_centers_ : array, [n_clusters, n_features] + Coordinates of cluster centers + + labels_ : + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + """ + + def __init__( + self, + n_clusters=8, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + n_jobs=1, + verbose=0, + random_state=None, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + self.copy_x = copy_x + self.n_jobs = n_jobs + self.normalize = normalize + + def fit(self, X, y=None, sample_weight=None): + """Compute k-means clustering. + + Parameters + ---------- + + X : array-like or sparse matrix, shape=(n_samples, n_features) + + y : Ignored + not used, present here for API consistency by convention. + + sample_weight : array-like, shape (n_samples,), optional + The weights for each observation in X. 
If None, all observations + are assigned equal weight (default: None) + """ + if self.normalize: + X = normalize(X) + + random_state = check_random_state(self.random_state) + + # TODO: add check that all data is unit-normalized + + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means( + X, + n_clusters=self.n_clusters, + sample_weight=sample_weight, + init=self.init, + n_init=self.n_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self.tol, + random_state=random_state, + copy_x=self.copy_x, + n_jobs=self.n_jobs, + return_n_iter=True, + ) + + return self diff --git a/code/spherecluster/spherecluster/tests/test_common.py b/code/spherecluster/spherecluster/tests/test_common.py new file mode 100644 index 0000000..119f1c4 --- /dev/null +++ b/code/spherecluster/spherecluster/tests/test_common.py @@ -0,0 +1,6 @@ +from sklearn.utils.estimator_checks import check_estimator +from spherecluster import SphericalKMeans + + +def test_estimator_spherical_k_means(): + return check_estimator(SphericalKMeans) diff --git a/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py b/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py new file mode 100644 index 0000000..29e6d73 --- /dev/null +++ b/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py @@ -0,0 +1,205 @@ +import pytest +import scipy as sp +import numpy as np +from numpy.testing import assert_almost_equal, assert_array_equal +from spherecluster import VonMisesFisherMixture +from spherecluster import von_mises_fisher_mixture +from spherecluster import sample_vMF + + +def test_vmf_log_dense(): + """ + Test that approximation approaches whatever scipy has. + """ + n_examples = 2 + n_features = 50 + + kappas = np.linspace(2, 600, 20) + + mu = np.random.randn(n_features) + mu /= np.linalg.norm(mu) + + X = np.random.randn(n_examples, n_features) + for ee in range(n_examples): + X[ee, :] /= np.linalg.norm(X[ee, :]) + + diffs = [] + for kappa in kappas: + v = von_mises_fisher_mixture._vmf_log(X, kappa, mu) + + v_approx = von_mises_fisher_mixture._vmf_log_asymptotic(X, kappa, mu) + + normalized_approx_diff = np.linalg.norm(v - v_approx) / np.linalg.norm(v) + print(normalized_approx_diff) + diffs.append(normalized_approx_diff) + + assert diffs[0] > 10 * diffs[-1] + + +def test_vmf_log_detect_breakage(): + """ + Find where scipy approximation breaks down. + This doesn't really test anything but demonstrates where approximation + should be applied instead. 
+ """ + n_examples = 3 + kappas = [5, 30, 100, 1000, 5000] + n_features = range(2, 500) + + breakage_points = [] + for kappa in kappas: + first_breakage = None + for n_f in n_features: + mu = np.random.randn(n_f) + mu /= np.linalg.norm(mu) + + X = np.random.randn(n_examples, n_f) + for ee in range(n_examples): + X[ee, :] /= np.linalg.norm(X[ee, :]) + + try: + von_mises_fisher_mixture._vmf_log(X, kappa, mu) + except: + if first_breakage is None: + first_breakage = n_f + + breakage_points.append(first_breakage) + print( + "Scipy vmf_log breaks for kappa={} at n_features={}".format( + kappa, first_breakage + ) + ) + + print(breakage_points) + assert_array_equal(breakage_points, [141, 420, 311, 3, 3]) + + +def test_maximization(): + num_points = 5000 + n_features = 500 + posterior = np.ones((1, num_points)) + + kappas = [5000, 8000, 16400] + for kappa in kappas: + mu = np.random.randn(n_features) + mu /= np.linalg.norm(mu) + + X = sample_vMF(mu, kappa, num_points) + + centers, weights, concentrations = von_mises_fisher_mixture._maximization( + X, posterior + ) + + print("center estimate error", np.linalg.norm(centers[0, :] - mu)) + print( + "kappa estimate", + np.abs(kappa - concentrations[0]) / kappa, + kappa, + concentrations[0], + ) + + assert_almost_equal(1., weights[0]) + assert_almost_equal(0.0, np.abs(kappa - concentrations[0]) / kappa, decimal=2) + assert_almost_equal(0.0, np.linalg.norm(centers[0, :] - mu), decimal=2) + + +@pytest.mark.parametrize( + "params_in", + [ + {"posterior_type": "soft"}, + {"posterior_type": "hard"}, + {"posterior_type": "soft", "n_jobs": 2}, + {"posterior_type": "hard", "n_jobs": 3}, + {"posterior_type": "hard", "force_weights": np.ones(5) / 5.}, + {"posterior_type": "soft", "n_jobs": -1}, + ], +) +def test_integration_dense(params_in): + n_clusters = 5 + n_examples = 20 + n_features = 100 + X = np.random.randn(n_examples, n_features) + for ee in range(n_examples): + X[ee, :] /= np.linalg.norm(X[ee, :]) + + params_in.update({"n_clusters": n_clusters}) + movmf = VonMisesFisherMixture(**params_in) + movmf.fit(X) + + assert movmf.cluster_centers_.shape == (n_clusters, n_features) + assert len(movmf.concentrations_) == n_clusters + assert len(movmf.weights_) == n_clusters + assert len(movmf.labels_) == n_examples + assert len(movmf.posterior_) == n_clusters + + for center in movmf.cluster_centers_: + assert_almost_equal(np.linalg.norm(center), 1.0) + + for concentration in movmf.concentrations_: + assert concentration > 0 + + for weight in movmf.weights_: + assert not np.isnan(weight) + + plabels = movmf.predict(X) + assert_array_equal(plabels, movmf.labels_) + + ll = movmf.log_likelihood(X) + ll_labels = np.zeros(movmf.labels_.shape) + for ee in range(n_examples): + ll_labels[ee] = np.argmax(ll[:, ee]) + + assert_array_equal(ll_labels, movmf.labels_) + + +@pytest.mark.parametrize( + "params_in", + [ + {"posterior_type": "soft"}, + {"posterior_type": "hard"}, + {"posterior_type": "soft", "n_jobs": 2}, + {"posterior_type": "hard", "n_jobs": 3}, + {"posterior_type": "hard", "force_weights": np.ones(5) / 5.}, + {"posterior_type": "soft", "n_jobs": -1}, + ], +) +def test_integration_sparse(params_in): + n_clusters = 5 + n_examples = 20 + n_features = 100 + n_nonzero = 10 + X = sp.sparse.csr_matrix((n_examples, n_features)) + for ee in range(n_examples): + ridx = np.random.randint(n_features, size=(n_nonzero)) + random_values = np.random.randn(n_nonzero) + random_values = random_values / np.linalg.norm(random_values) + X[ee, ridx] = random_values + + 
params_in.update({"n_clusters": n_clusters}) + movmf = VonMisesFisherMixture(**params_in) + movmf.fit(X) + + assert movmf.cluster_centers_.shape == (n_clusters, n_features) + assert len(movmf.concentrations_) == n_clusters + assert len(movmf.weights_) == n_clusters + assert len(movmf.labels_) == n_examples + assert len(movmf.posterior_) == n_clusters + + for center in movmf.cluster_centers_: + assert_almost_equal(np.linalg.norm(center), 1.0) + + for concentration in movmf.concentrations_: + assert concentration > 0 + + for weight in movmf.weights_: + assert not np.isnan(weight) + + plabels = movmf.predict(X) + assert_array_equal(plabels, movmf.labels_) + + ll = movmf.log_likelihood(X) + ll_labels = np.zeros(movmf.labels_.shape) + for ee in range(n_examples): + ll_labels[ee] = np.argmax(ll[:, ee]) + + assert_array_equal(ll_labels, movmf.labels_) diff --git a/code/spherecluster/spherecluster/util.py b/code/spherecluster/spherecluster/util.py new file mode 100644 index 0000000..bb68261 --- /dev/null +++ b/code/spherecluster/spherecluster/util.py @@ -0,0 +1,57 @@ +""" +Generate multivariate von Mises Fisher samples. + +This solution originally appears here: +http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python + +Also see: + +Sampling from vMF on S^2: + https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf + http://www.stat.pitt.edu/sungkyu/software/randvonMisesFisher3.pdf +""" +import numpy as np + + +def sample_vMF(mu, kappa, num_samples): + """Generate num_samples N-dimensional samples from von Mises Fisher + distribution around center mu \in R^N with concentration kappa. + """ + dim = len(mu) + result = np.zeros((num_samples, dim)) + for nn in range(num_samples): + # sample offset from center (on sphere) with spread kappa + w = _sample_weight(kappa, dim) + + # sample a point v on the unit sphere that's orthogonal to mu + v = _sample_orthonormal_to(mu) + + # compute new point + result[nn, :] = v * np.sqrt(1. - w ** 2) + w * mu + + return result + + +def _sample_weight(kappa, dim): + """Rejection sampling scheme for sampling distance from center on + surface of the sphere. + """ + dim = dim - 1 # since S^{n-1} + b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) + x = (1. - b) / (1. + b) + c = kappa * x + dim * np.log(1 - x ** 2) + + while True: + z = np.random.beta(dim / 2., dim / 2.) + w = (1. - (1. + b) * z) / (1. - (1. - b) * z) + u = np.random.uniform(low=0, high=1) + if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): + return w + + +def _sample_orthonormal_to(mu): + """Sample point on sphere orthogonal to mu.""" + v = np.random.randn(mu.shape[0]) + proj_mu_v = mu * np.dot(mu, v) / np.linalg.norm(mu) + orthto = v - proj_mu_v + return orthto / np.linalg.norm(orthto) diff --git a/code/spherecluster/spherecluster/von_mises_fisher_mixture.py b/code/spherecluster/spherecluster/von_mises_fisher_mixture.py new file mode 100644 index 0000000..53c4c67 --- /dev/null +++ b/code/spherecluster/spherecluster/von_mises_fisher_mixture.py @@ -0,0 +1,946 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed +from numpy import i0 # modified Bessel function of first kind order 0, I_0 +from scipy.special import iv # modified Bessel function of first kind, I_v +from scipy.special import logsumexp + +from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin +from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape +from sklearn.metrics.pairwise import cosine_distances +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state, as_float_array +from sklearn.utils.extmath import squared_norm +from sklearn.utils.validation import FLOAT_DTYPES +from sklearn.utils.validation import check_is_fitted + +from . import spherical_kmeans + +MAX_CONTENTRATION = 1e10 + + +def _inertia_from_labels(X, centers, labels): + """Compute inertia with cosine distance using known labels. + """ + n_examples, n_features = X.shape + inertia = np.zeros((n_examples,)) + for ee in range(n_examples): + inertia[ee] = 1 - X[ee, :].dot(centers[int(labels[ee]), :].T) + + return np.sum(inertia) + + +def _labels_inertia(X, centers): + """Compute labels and inertia with cosine distance. + """ + n_examples, n_features = X.shape + n_clusters, n_features = centers.shape + + labels = np.zeros((n_examples,)) + inertia = np.zeros((n_examples,)) + + for ee in range(n_examples): + dists = np.zeros((n_clusters,)) + for cc in range(n_clusters): + dists[cc] = 1 - X[ee, :].dot(centers[cc, :].T) + + labels[ee] = np.argmin(dists) + inertia[ee] = dists[int(labels[ee])] + + return labels, np.sum(inertia) + + +def _vmf_log(X, kappa, mu): + """Computs log(vMF(X, kappa, mu)) using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + n_examples, n_features = X.shape + return np.log(_vmf_normalize(kappa, n_features) * np.exp(kappa * X.dot(mu).T)) + + +def _vmf_normalize(kappa, dim): + """Compute normalization constant using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + num = np.power(kappa, dim / 2.0 - 1.0) + + if dim / 2.0 - 1.0 < 1e-15: + denom = np.power(2.0 * np.pi, dim / 2.0) * i0(kappa) + else: + denom = np.power(2.0 * np.pi, dim / 2.0) * iv(dim / 2.0 - 1.0, kappa) + + if np.isinf(num): + raise ValueError("VMF scaling numerator was inf.") + + if np.isinf(denom): + raise ValueError("VMF scaling denominator was inf.") + + if np.abs(denom) < 1e-15: + raise ValueError("VMF scaling denominator was 0.") + + return num / denom + + +def _log_H_asymptotic(nu, kappa): + """Compute the Amos-type upper bound asymptotic approximation on H where + log(H_\nu)(\kappa) = \int_0^\kappa R_\nu(t) dt. 
+ + See "lH_asymptotic <-" in movMF.R and utility function implementation notes + from https://cran.r-project.org/web/packages/movMF/index.html + """ + beta = np.sqrt((nu + 0.5) ** 2) + kappa_l = np.min([kappa, np.sqrt((3.0 * nu + 11.0 / 2.0) * (nu + 3.0 / 2.0))]) + return _S(kappa, nu + 0.5, beta) + ( + _S(kappa_l, nu, nu + 2.0) - _S(kappa_l, nu + 0.5, beta) + ) + + +def _S(kappa, alpha, beta): + """Compute the antiderivative of the Amos-type bound G on the modified + Bessel function ratio. + + Note: Handles scalar kappa, alpha, and beta only. + + See "S <-" in movMF.R and utility function implementation notes from + https://cran.r-project.org/web/packages/movMF/index.html + """ + kappa = 1.0 * np.abs(kappa) + alpha = 1.0 * alpha + beta = 1.0 * np.abs(beta) + a_plus_b = alpha + beta + u = np.sqrt(kappa ** 2 + beta ** 2) + if alpha == 0: + alpha_scale = 0 + else: + alpha_scale = alpha * np.log((alpha + u) / a_plus_b) + + return u - beta - alpha_scale + + +def _vmf_log_asymptotic(X, kappa, mu): + """Compute log(f(x|theta)) via Amos approximation + + log(f(x|theta)) = theta' x - log(H_{d/2-1})(\|theta\|) + + where theta = kappa * X, \|theta\| = kappa. + + Computing _vmf_log helps with numerical stability / loss of precision for + for large values of kappa and n_features. + + See utility function implementation notes in movMF.R from + https://cran.r-project.org/web/packages/movMF/index.html + """ + n_examples, n_features = X.shape + log_vfm = kappa * X.dot(mu).T + -_log_H_asymptotic(n_features / 2.0 - 1.0, kappa) + + return log_vfm + + +def _log_likelihood(X, centers, weights, concentrations): + if len(np.shape(X)) != 2: + X = X.reshape((1, len(X))) + + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + return posterior + + +def _init_unit_centers(X, n_clusters, random_state, init): + """Initializes unit norm centers. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + init: (string) one of + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
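+
+    Examples
+    --------
+    A minimal sketch of the expected call (values are illustrative only):
+
+    >>> X = np.random.randn(100, 10)
+    >>> X = X / np.linalg.norm(X, axis=1, keepdims=True)
+    >>> centers = _init_unit_centers(X, 3, np.random.RandomState(0), "random")
+    >>> centers.shape
+    (3, 10)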
+ """ + n_examples, n_features = np.shape(X) + if isinstance(init, np.ndarray): + n_init_clusters, n_init_features = init.shape + assert n_init_clusters == n_clusters + assert n_init_features == n_features + + # ensure unit normed centers + centers = init + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "spherical-k-means": + labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd( + X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++" + ) + + return centers + + elif init == "random": + centers = np.random.randn(n_clusters, n_features) + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "k-means++": + centers = _init_centroids( + X, + n_clusters, + "k-means++", + random_state=random_state, + x_squared_norms=np.ones((n_examples,)), + ) + + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "random-orthonormal": + centers = np.random.randn(n_clusters, n_features) + q, r = np.linalg.qr(centers.T, mode="reduced") + + return q.T + + elif init == "random-class": + centers = np.zeros((n_clusters, n_features)) + for cc in range(n_clusters): + while np.linalg.norm(centers[cc, :]) == 0: + labels = np.random.randint(0, n_clusters, n_examples) + centers[cc, :] = X[labels == cc, :].sum(axis=0) + + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + +def _expectation(X, centers, weights, concentrations, posterior_type="soft"): + """Compute the log-likelihood of each datapoint being in each cluster. + + Parameters + ---------- + centers (mu) : array, [n_centers x n_features] + weights (alpha) : array, [n_centers, ] (alpha) + concentrations (kappa) : array, [n_centers, ] + + Returns + ---------- + posterior : array, [n_centers, n_examples] + """ + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + if posterior_type == "soft": + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + elif posterior_type == "hard": + weights_log = np.log(weights) + weighted_f_log = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[np.argmax(weighted_f_log[:, ee]), ee] = 1.0 + + return posterior + + +def _maximization(X, posterior, force_weights=None): + """Estimate new centers, weights, and concentrations from + + Parameters + ---------- + posterior : array, [n_centers, n_examples] + The posterior matrix from the expectation step. + + force_weights : None or array, [n_centers, ] + If None is passed, will estimate weights. + If an array is passed, will use instead of estimating. 
+ + Returns + ---------- + centers (mu) : array, [n_centers x n_features] + weights (alpha) : array, [n_centers, ] (alpha) + concentrations (kappa) : array, [n_centers, ] + """ + n_examples, n_features = X.shape + n_clusters, n_examples = posterior.shape + concentrations = np.zeros((n_clusters,)) + centers = np.zeros((n_clusters, n_features)) + if force_weights is None: + weights = np.zeros((n_clusters,)) + + for cc in range(n_clusters): + # update weights (alpha) + if force_weights is None: + weights[cc] = np.mean(posterior[cc, :]) + else: + weights = force_weights + + # update centers (mu) + X_scaled = X.copy() + if sp.issparse(X): + X_scaled.data *= posterior[cc, :].repeat(np.diff(X_scaled.indptr)) + else: + for ee in range(n_examples): + X_scaled[ee, :] *= posterior[cc, ee] + + centers[cc, :] = X_scaled.sum(axis=0) + + # normalize centers + center_norm = np.linalg.norm(centers[cc, :]) + if center_norm > 1e-8: + centers[cc, :] = centers[cc, :] / center_norm + + # update concentration (kappa) [TODO: add other kappa approximations] + rbar = center_norm / (n_examples * weights[cc]) + concentrations[cc] = rbar * n_features - np.power(rbar, 3.0) + if np.abs(rbar - 1.0) < 1e-10: + concentrations[cc] = MAX_CONTENTRATION + else: + concentrations[cc] /= 1.0 - np.power(rbar, 2.0) + + # let python know we can free this (good for large dense X) + del X_scaled + + return centers, weights, concentrations + + +def _movMF( + X, + n_clusters, + posterior_type="soft", + force_weights=None, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, +): + """Mixture of von Mises Fisher clustering. + + Implements the algorithms (i) and (ii) from + + "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" + by Banerjee, Dhillon, Ghosh, and Sra. + + TODO: Currently only supports Banerjee et al 2005 approximation of kappa, + however, there are numerous other approximations see _update_params. + + Attribution + ---------- + Approximation of log-vmf distribution function from movMF R-package. + + movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions + by Kurt Hornik, Bettina Grun, 2014 + + Find more at: + https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf + https://cran.r-project.org/web/packages/movMF/index.html + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + posterior_type: 'soft' or 'hard' + Type of posterior computed in exepectation step. + See note about attribute: self.posterior_ + + force_weights : None or array [n_clusters, ] + If None, the algorithm will estimate the weights. + If an array of weights, algorithm will estimate concentrations and + centers with given weights. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init: (string) one of + random-class [default]: random class assignment & centroid computation + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
+ + tol : float, default: 1e-6 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + """ + random_state = check_random_state(random_state) + n_examples, n_features = np.shape(X) + + # init centers (mus) + centers = _init_unit_centers(X, n_clusters, random_state, init) + + # init weights (alphas) + if force_weights is None: + weights = np.ones((n_clusters,)) + weights = weights / np.sum(weights) + else: + weights = force_weights + + # init concentrations (kappas) + concentrations = np.ones((n_clusters,)) + + if verbose: + print("Initialization complete") + + for iter in range(max_iter): + centers_prev = centers.copy() + + # expectation step + posterior = _expectation( + X, centers, weights, concentrations, posterior_type=posterior_type + ) + + # maximization step + centers, weights, concentrations = _maximization( + X, posterior, force_weights=force_weights + ) + + # check convergence + tolcheck = squared_norm(centers_prev - centers) + if tolcheck <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (iter, tolcheck, tol) + ) + break + + # labels come for free via posterior + labels = np.zeros((n_examples,)) + for ee in range(n_examples): + labels[ee] = np.argmax(posterior[:, ee]) + + inertia = _inertia_from_labels(X, centers, labels) + + return centers, weights, concentrations, posterior, labels, inertia + + +def movMF( + X, + n_clusters, + posterior_type="soft", + force_weights=None, + n_init=10, + n_jobs=1, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, + copy_x=True, +): + """Wrapper for parallelization of _movMF and running n_init times. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% n_init + ) + random_state = check_random_state(random_state) + + if max_iter <= 0: + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % max_iter + ) + + best_inertia = np.infty + X = as_float_array(X, copy=copy_x) + tol = _tolerance(X, tol) + + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, copy=True) + _validate_center_shape(X, n_clusters, init) + + if n_init != 1: + warnings.warn( + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) + n_init = 1 + + # defaults + best_centers = None + best_labels = None + best_weights = None + best_concentrations = None + best_posterior = None + best_inertia = None + + if n_jobs == 1: + # For a single thread, less memory is needed if we just store one set + # of the best results (as opposed to one set per run per thread). + for it in range(n_init): + # cluster on the sphere + (centers, weights, concentrations, posterior, labels, inertia) = _movMF( + X, + n_clusters, + posterior_type=posterior_type, + force_weights=force_weights, + max_iter=max_iter, + verbose=verbose, + init=init, + random_state=random_state, + tol=tol, + ) + + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_centers = centers.copy() + best_labels = labels.copy() + best_weights = weights.copy() + best_concentrations = concentrations.copy() + best_posterior = posterior.copy() + best_inertia = inertia + else: + # parallelisation of movMF runs + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=0)( + delayed(_movMF)( + X, + n_clusters, + posterior_type=posterior_type, + force_weights=force_weights, + max_iter=max_iter, + verbose=verbose, + init=init, + random_state=random_state, + tol=tol, + ) + for seed in seeds + ) + + # Get results with the lowest inertia + centers, weights, concentrations, posteriors, labels, inertia = zip(*results) + best = np.argmin(inertia) + best_labels = labels[best] + best_inertia = inertia[best] + best_centers = centers[best] + best_concentrations = concentrations[best] + best_posterior = posteriors[best] + best_weights = weights[best] + + return ( + best_centers, + best_labels, + best_inertia, + best_weights, + best_concentrations, + best_posterior, + ) + + +class VonMisesFisherMixture(BaseEstimator, ClusterMixin, TransformerMixin): + """Estimator for Mixture of von Mises Fisher clustering on the unit sphere. + + Implements the algorithms (i) and (ii) from + + "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" + by Banerjee, Dhillon, Ghosh, and Sra. + + TODO: Currently only supports Banerjee et al 2005 approximation of kappa, + however, there are numerous other approximations see _update_params. + + Attribution + ---------- + Approximation of log-vmf distribution function from movMF R-package. + + movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions + by Kurt Hornik, Bettina Grun, 2014 + + Find more at: + https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf + https://cran.r-project.org/web/packages/movMF/index.html + + Basic sklearn scaffolding from sklearn.cluster.KMeans. + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + posterior_type: 'soft' or 'hard' + Type of posterior computed in exepectation step. 
+ See note about attribute: self.posterior_ + + force_weights : None or array [n_clusters, ] + If None, the algorithm will estimate the weights. + If an array of weights, algorithm will estimate concentrations and + centers with given weights. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init: (string) one of + random-class [default]: random class assignment & centroid computation + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + tol : float, default: 1e-6 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + + normalize : boolean, default True + Normalize the input to have unnit norm. + + Attributes + ---------- + + cluster_centers_ : array, [n_clusters, n_features] + Coordinates of cluster centers + + labels_ : + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + + weights_ : array, [n_clusters,] + Weights of each cluster in vMF distribution (alpha). + + concentrations_ : array [n_clusters,] + Concentration parameter for each cluster (kappa). + Larger values correspond to more concentrated clusters. + + posterior_ : array, [n_clusters, n_examples] + Each column corresponds to the posterio distribution for and example. + + If posterior_type='hard' is used, there will only be one non-zero per + column, its index corresponding to the example's cluster label. + + If posterior_type='soft' is used, this matrix will be dense and the + column values correspond to soft clustering weights. 
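+
+    Examples
+    --------
+    A minimal usage sketch (shapes and values are illustrative, not taken from
+    the original package documentation):
+
+    >>> import numpy as np
+    >>> X = np.random.randn(50, 20)
+    >>> X = X / np.linalg.norm(X, axis=1, keepdims=True)
+    >>> vmf = VonMisesFisherMixture(n_clusters=3, posterior_type='soft')
+    >>> labels = vmf.fit_predict(X)
+    >>> vmf.cluster_centers_.shape
+    (3, 20)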
+ """ + + def __init__( + self, + n_clusters=5, + posterior_type="soft", + force_weights=None, + n_init=10, + n_jobs=1, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.posterior_type = posterior_type + self.force_weights = force_weights + self.n_init = n_init + self.n_jobs = n_jobs + self.max_iter = max_iter + self.verbose = verbose + self.init = init + self.random_state = random_state + self.tol = tol + self.copy_x = copy_x + self.normalize = normalize + + def _check_force_weights(self): + if self.force_weights is None: + return + + if len(self.force_weights) != self.n_clusters: + raise ValueError( + ( + "len(force_weights)={} but must equal " + "n_clusters={}".format(len(self.force_weights), self.n_clusters) + ) + ) + + def _check_fit_data(self, X): + """Verify that the number of samples given is larger than k""" + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + n_samples, n_features = X.shape + if X.shape[0] < self.n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" + % (X.shape[0], self.n_clusters) + ) + + for ee in range(n_samples): + if sp.issparse(X): + n = sp.linalg.norm(X[ee, :]) + else: + n = np.linalg.norm(X[ee, :]) + + if np.abs(n - 1.0) > 1e-4: + raise ValueError("Data l2-norm must be 1, found {}".format(n)) + + return X + + def _check_test_data(self, X): + X = check_array(X, accept_sparse="csr", dtype=FLOAT_DTYPES) + n_samples, n_features = X.shape + expected_n_features = self.cluster_centers_.shape[1] + if not n_features == expected_n_features: + raise ValueError( + "Incorrect number of features. " + "Got %d features, expected %d" % (n_features, expected_n_features) + ) + + for ee in range(n_samples): + if sp.issparse(X): + n = sp.linalg.norm(X[ee, :]) + else: + n = np.linalg.norm(X[ee, :]) + + if np.abs(n - 1.0) > 1e-4: + raise ValueError("Data l2-norm must be 1, found {}".format(n)) + + return X + + def fit(self, X, y=None): + """Compute mixture of von Mises Fisher clustering. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + """ + if self.normalize: + X = normalize(X) + + self._check_force_weights() + random_state = check_random_state(self.random_state) + X = self._check_fit_data(X) + + ( + self.cluster_centers_, + self.labels_, + self.inertia_, + self.weights_, + self.concentrations_, + self.posterior_, + ) = movMF( + X, + self.n_clusters, + posterior_type=self.posterior_type, + force_weights=self.force_weights, + n_init=self.n_init, + n_jobs=self.n_jobs, + max_iter=self.max_iter, + verbose=self.verbose, + init=self.init, + random_state=random_state, + tol=self.tol, + copy_x=self.copy_x, + ) + + return self + + def fit_predict(self, X, y=None): + """Compute cluster centers and predict cluster index for each sample. + Convenience method; equivalent to calling fit(X) followed by + predict(X). + """ + return self.fit(X).labels_ + + def fit_transform(self, X, y=None): + """Compute clustering and transform X to cluster-distance space. + Equivalent to fit(X).transform(X), but more efficiently implemented. + """ + # Currently, this just skips a copy of the data if it is not in + # np.array or CSR format already. + # XXX This skips _check_test_data, which may change the dtype; + # we should refactor the input validation. + return self.fit(X)._transform(X) + + def transform(self, X, y=None): + """Transform X to a cluster-distance space. 
+ In the new space, each dimension is the cosine distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to transform. + + Returns + ------- + X_new : array, shape [n_samples, k] + X transformed in the new space. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + X = self._check_test_data(X) + return self._transform(X) + + def _transform(self, X): + """guts of transform method; no input validation""" + return cosine_distances(X, self.cluster_centers_) + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Note: Does not check that each point is on the sphere. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to predict. + + Returns + ------- + labels : array, shape [n_samples,] + Index of the cluster each sample belongs to. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + + X = self._check_test_data(X) + return _labels_inertia(X, self.cluster_centers_)[0] + + def score(self, X, y=None): + """Inertia score (sum of all distances to closest cluster). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data. + + Returns + ------- + score : float + Larger score is better. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + X = self._check_test_data(X) + return -_labels_inertia(X, self.cluster_centers_)[1] + + def log_likelihood(self, X): + check_is_fitted(self) + + return _log_likelihood( + X, self.cluster_centers_, self.weights_, self.concentrations_ + ) diff --git a/code/spherecluster/spherical_kmeans.py b/code/spherecluster/spherical_kmeans.py new file mode 100644 index 0000000..882eba1 --- /dev/null +++ b/code/spherecluster/spherical_kmeans.py @@ -0,0 +1,424 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed + +from sklearn.cluster import KMeans + +from sklearn.cluster import _k_means_fast as _k_means +from sklearn.cluster import _k_means_lloyd +from sklearn.cluster._kmeans import ( + _check_sample_weight, + _labels_inertia, + _tolerance, +) +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state +from sklearn.utils.extmath import row_norms, squared_norm +from sklearn.utils.validation import _num_samples +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + + +def _spherical_kmeans_single_lloyd( + X, + n_clusters, + centers_init, + sample_weight=None, + max_iter=300, + init="k-means++", + verbose=False, + x_squared_norms=None, + random_state=None, + tol=1e-4, + n_threads=1, +): + """ + Modified from sklearn.cluster.k_means_.k_means_single_lloyd. + """ + random_state = check_random_state(random_state) + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + best_labels, best_inertia, best_centers = None, None, None + + # Allocate memory to store the distances for each sample to its + # closer center for reallocation in case of ties + distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) + + # Buffers to avoid new allocations at each iteration. 
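+    # centers / centers_new are swapped on each Lloyd iteration, while labels,
+    # weight_in_clusters and center_shift are reused, so no per-iteration
+    # allocations are required.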
+ centers = centers_init + centers_new = np.zeros_like(centers) + labels = np.full(X.shape[0], -1, dtype=np.int32) + labels_old = labels.copy() + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + center_shift = np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + lloyd_iter = _k_means_lloyd.lloyd_iter_chunked_sparse + _inertia = _k_means._inertia_sparse + else: + lloyd_iter = _k_means_lloyd.lloyd_iter_chunked_dense + _inertia = _k_means._inertia_dense + + # iterations + for i in range(max_iter): + centers_old = centers.copy() + + # labels assignment + # TODO: _labels_inertia should be done with cosine distance + # since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized + # this doesn't really matter. + labels, inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + centers, + ) + lloyd_iter(X, sample_weight, x_squared_norms, centers, centers_new, + weight_in_clusters, labels, center_shift, n_threads) + + centers, centers_new = centers_new, centers + # l2-normalize centers (this is the main contibution here) + centers = normalize(centers) + + if verbose: + print("Iteration %2d, inertia %.3f" % (i, inertia)) + + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + + center_shift_total = squared_norm(centers_old - centers) + if center_shift_total <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (i, center_shift_total, tol) + ) + break + + if center_shift_total > 0: + # rerun E-step in case of non-convergence so that predicted labels + # match cluster centers + best_labels, best_inertia = _labels_inertia( + X, + sample_weight, + x_squared_norms, + best_centers, + ) + + return best_labels, best_inertia, best_centers, i + 1 + + +def spherical_k_means( + X, + n_clusters, + centers_init, + sample_weight=None, + init="k-means++", + n_init=10, + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + n_jobs=1, + algorithm="auto", + return_n_iter=False, +): + """Modified from sklearn.cluster.k_means_.k_means. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." % n_init + ) + random_state = check_random_state(random_state) + + if max_iter <= 0: + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % max_iter + ) + + best_inertia = np.infty + # avoid forcing order when copy_x=False + order = "C" if copy_x else None + X = check_array( + X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x + ) + # verify that the number of samples given is larger than k + if _num_samples(X) < n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters) + ) + tol = _tolerance(X, tol) + + if hasattr(init, "__array__"): + if n_init != 1: + warnings.warn( + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) + n_init = 1 + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if n_jobs == 1: + # For a single thread, less memory is needed if we just store one set + # of the best results (as opposed to one set per run per thread). 
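+        # Each of the n_init restarts below starts from the same centers_init;
+        # only the run with the lowest inertia is kept.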
+ for it in range(n_init): + # run a k-means once + labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd( + X, + n_clusters, + centers_init=centers_init, + sample_weight=sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + random_state=random_state, + ) + + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + best_n_iter = n_iter_ + else: + # parallelisation of k-means runs + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=0)( + delayed(_spherical_kmeans_single_lloyd)( + X, + n_clusters, + sample_weight, + max_iter=max_iter, + init=init, + verbose=verbose, + tol=tol, + x_squared_norms=x_squared_norms, + # Change seed to ensure variety + random_state=seed, + ) + for seed in seeds + ) + + # Get results with the lowest inertia + labels, inertia, centers, n_iters = zip(*results) + best = np.argmin(inertia) + best_labels = labels[best] + best_inertia = inertia[best] + best_centers = centers[best] + best_n_iter = n_iters[best] + + if return_n_iter: + return best_centers, best_labels, best_inertia, best_n_iter + else: + return best_centers, best_labels, best_inertia + + +class SphericalKMeans(KMeans): + """Spherical K-Means clustering + + Modfication of sklearn.cluster.KMeans where cluster centers are normalized + (projected onto the sphere) in each iteration. + + Parameters + ---------- + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init : {'k-means++', 'random' or an ndarray} + Method for initialization, defaults to 'k-means++': + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + 'random': choose k observations (rows) at random from data for + the initial centroids. + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. + + tol : float, default: 1e-4 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. 
If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + + normalize : boolean, default True + Normalize the input to have unnit norm. + + Attributes + ---------- + + cluster_centers_ : array, [n_clusters, n_features] + Coordinates of cluster centers + + labels_ : + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + """ + + def __init__( + self, + n_clusters=8, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + n_jobs=1, + verbose=0, + random_state=None, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + self.copy_x = copy_x + self.n_jobs = n_jobs + self.normalize = normalize + + def _check_params(self, X): + # n_jobs + if self.n_jobs != 'deprecated': + warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" + " removed in 1.0 (renaming of 0.25).", FutureWarning) + self._n_threads = self.n_jobs + else: + self._n_threads = None + self._n_threads = _openmp_effective_n_threads(self._n_threads) + + # n_init + if self.n_init <= 0: + raise ValueError( + f"n_init should be > 0, got {self.n_init} instead.") + self._n_init = self.n_init + + # max_iter + if self.max_iter <= 0: + raise ValueError( + f"max_iter should be > 0, got {self.max_iter} instead.") + + # n_clusters + if X.shape[0] < self.n_clusters: + raise ValueError(f"n_samples={X.shape[0]} should be >= " + f"n_clusters={self.n_clusters}.") + + # tol + self._tol = _tolerance(X, self.tol) + + # init + if not (hasattr(self.init, '__array__') or callable(self.init) + or (isinstance(self.init, str) + and self.init in ["k-means++", "random"])): + raise ValueError( + f"init should be either 'k-means++', 'random', a ndarray or a " + f"callable, got '{self.init}' instead.") + + if hasattr(self.init, '__array__') and self._n_init != 1: + warnings.warn( + f"Explicit initial center position passed: performing only" + f" one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}.", RuntimeWarning, stacklevel=2) + self._n_init = 1 + + + def fit(self, X, y=None, sample_weight=None): + """Compute k-means clustering. + + Parameters + ---------- + + X : array-like or sparse matrix, shape=(n_samples, n_features) + + y : Ignored + not used, present here for API consistency by convention. + + sample_weight : array-like, shape (n_samples,), optional + The weights for each observation in X. 
If None, all observations + are assigned equal weight (default: None) + """ + if self.normalize: + X = normalize(X) + self._check_params(X) + + random_state = check_random_state(self.random_state) + + # Validate init array + init = self.init + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, order="C", copy=True) + self._validate_center_shape(X, init) + + # TODO: add check that all data is unit-normalized + x_squared_norms = row_norms(X, squared=True) + centers_init = self._init_centroids( + X, + init=self.init, + random_state=random_state, + x_squared_norms=x_squared_norms + ) + + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means( + X, + n_clusters=self.n_clusters, + centers_init=centers_init, + sample_weight=sample_weight, + init=self.init, + n_init=self.n_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self.tol, + random_state=random_state, + copy_x=self.copy_x, + n_jobs=self.n_jobs, + return_n_iter=True, + ) + + return self diff --git a/code/spherecluster/util.py b/code/spherecluster/util.py new file mode 100644 index 0000000..bb68261 --- /dev/null +++ b/code/spherecluster/util.py @@ -0,0 +1,57 @@ +""" +Generate multivariate von Mises Fisher samples. + +This solution originally appears here: +http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python + +Also see: + +Sampling from vMF on S^2: + https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf + http://www.stat.pitt.edu/sungkyu/software/randvonMisesFisher3.pdf +""" +import numpy as np + + +def sample_vMF(mu, kappa, num_samples): + """Generate num_samples N-dimensional samples from von Mises Fisher + distribution around center mu \in R^N with concentration kappa. + """ + dim = len(mu) + result = np.zeros((num_samples, dim)) + for nn in range(num_samples): + # sample offset from center (on sphere) with spread kappa + w = _sample_weight(kappa, dim) + + # sample a point v on the unit sphere that's orthogonal to mu + v = _sample_orthonormal_to(mu) + + # compute new point + result[nn, :] = v * np.sqrt(1. - w ** 2) + w * mu + + return result + + +def _sample_weight(kappa, dim): + """Rejection sampling scheme for sampling distance from center on + surface of the sphere. + """ + dim = dim - 1 # since S^{n-1} + b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) + x = (1. - b) / (1. + b) + c = kappa * x + dim * np.log(1 - x ** 2) + + while True: + z = np.random.beta(dim / 2., dim / 2.) + w = (1. - (1. + b) * z) / (1. - (1. - b) * z) + u = np.random.uniform(low=0, high=1) + if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): + return w + + +def _sample_orthonormal_to(mu): + """Sample point on sphere orthogonal to mu.""" + v = np.random.randn(mu.shape[0]) + proj_mu_v = mu * np.dot(mu, v) / np.linalg.norm(mu) + orthto = v - proj_mu_v + return orthto / np.linalg.norm(orthto) diff --git a/code/spherecluster/von_mises_fisher_mixture.py b/code/spherecluster/von_mises_fisher_mixture.py new file mode 100644 index 0000000..7193f35 --- /dev/null +++ b/code/spherecluster/von_mises_fisher_mixture.py @@ -0,0 +1,946 @@ +import warnings + +import numpy as np +import scipy.sparse as sp +from joblib import Parallel, delayed +from numpy import i0 # modified Bessel function of first kind order 0, I_0 +from scipy.special import iv # modified Bessel function of first kind, I_v +from scipy.special import logsumexp + + +from sklearn.cluster import KMeans +from sklearn.cluster._kmeans import _tolerance, _kmeans_plusplus +from sklearn.metrics.pairwise import cosine_distances +from sklearn.preprocessing import normalize +from sklearn.utils import check_array, check_random_state, as_float_array +from sklearn.utils.extmath import squared_norm +from sklearn.utils.validation import FLOAT_DTYPES +from sklearn.utils.validation import check_is_fitted +from . import spherical_kmeans +# _init_centroids + +MAX_CONTENTRATION = 1e10 + + +def _inertia_from_labels(X, centers, labels): + """Compute inertia with cosine distance using known labels. + """ + n_examples, n_features = X.shape + inertia = np.zeros((n_examples,)) + for ee in range(n_examples): + inertia[ee] = 1 - X[ee, :].dot(centers[int(labels[ee]), :].T) + + return np.sum(inertia) + + +def _labels_inertia(X, centers): + """Compute labels and inertia with cosine distance. + """ + n_examples, n_features = X.shape + n_clusters, n_features = centers.shape + + labels = np.zeros((n_examples,)) + inertia = np.zeros((n_examples,)) + + for ee in range(n_examples): + dists = np.zeros((n_clusters,)) + for cc in range(n_clusters): + dists[cc] = 1 - X[ee, :].dot(centers[cc, :].T) + + labels[ee] = np.argmin(dists) + inertia[ee] = dists[int(labels[ee])] + + return labels, np.sum(inertia) + + +def _vmf_log(X, kappa, mu): + """Computs log(vMF(X, kappa, mu)) using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + n_examples, n_features = X.shape + return np.log(_vmf_normalize(kappa, n_features) * np.exp(kappa * X.dot(mu).T)) + + +def _vmf_normalize(kappa, dim): + """Compute normalization constant using built-in numpy/scipy Bessel + approximations. + + Works well on small kappa and mu. + """ + num = np.power(kappa, dim / 2.0 - 1.0) + + if dim / 2.0 - 1.0 < 1e-15: + denom = np.power(2.0 * np.pi, dim / 2.0) * i0(kappa) + else: + denom = np.power(2.0 * np.pi, dim / 2.0) * iv(dim / 2.0 - 1.0, kappa) + + if np.isinf(num): + raise ValueError("VMF scaling numerator was inf.") + + if np.isinf(denom): + raise ValueError("VMF scaling denominator was inf.") + + if np.abs(denom) < 1e-15: + raise ValueError("VMF scaling denominator was 0.") + + return num / denom + + +def _log_H_asymptotic(nu, kappa): + """Compute the Amos-type upper bound asymptotic approximation on H where + log(H_\nu)(\kappa) = \int_0^\kappa R_\nu(t) dt. 
+ + See "lH_asymptotic <-" in movMF.R and utility function implementation notes + from https://cran.r-project.org/web/packages/movMF/index.html + """ + beta = np.sqrt((nu + 0.5) ** 2) + kappa_l = np.min([kappa, np.sqrt((3.0 * nu + 11.0 / 2.0) * (nu + 3.0 / 2.0))]) + return _S(kappa, nu + 0.5, beta) + ( + _S(kappa_l, nu, nu + 2.0) - _S(kappa_l, nu + 0.5, beta) + ) + + +def _S(kappa, alpha, beta): + """Compute the antiderivative of the Amos-type bound G on the modified + Bessel function ratio. + + Note: Handles scalar kappa, alpha, and beta only. + + See "S <-" in movMF.R and utility function implementation notes from + https://cran.r-project.org/web/packages/movMF/index.html + """ + kappa = 1.0 * np.abs(kappa) + alpha = 1.0 * alpha + beta = 1.0 * np.abs(beta) + a_plus_b = alpha + beta + u = np.sqrt(kappa ** 2 + beta ** 2) + if alpha == 0: + alpha_scale = 0 + else: + alpha_scale = alpha * np.log((alpha + u) / a_plus_b) + + return u - beta - alpha_scale + + +def _vmf_log_asymptotic(X, kappa, mu): + """Compute log(f(x|theta)) via Amos approximation + + log(f(x|theta)) = theta' x - log(H_{d/2-1})(\|theta\|) + + where theta = kappa * X, \|theta\| = kappa. + + Computing _vmf_log helps with numerical stability / loss of precision for + for large values of kappa and n_features. + + See utility function implementation notes in movMF.R from + https://cran.r-project.org/web/packages/movMF/index.html + """ + n_examples, n_features = X.shape + log_vfm = kappa * X.dot(mu).T + -_log_H_asymptotic(n_features / 2.0 - 1.0, kappa) + + return log_vfm + + +def _log_likelihood(X, centers, weights, concentrations): + if len(np.shape(X)) != 2: + X = X.reshape((1, len(X))) + + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + return posterior + + +def _init_unit_centers(X, n_clusters, random_state, init): + """Initializes unit norm centers. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + init: (string) one of + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
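+
+    Note: the string "random-class" (random class assignment followed by a
+    per-cluster mean) is also accepted by this function; it is the default
+    init used by _movMF.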
+ """ + n_examples, n_features = np.shape(X) + if isinstance(init, np.ndarray): + n_init_clusters, n_init_features = init.shape + assert n_init_clusters == n_clusters + assert n_init_features == n_features + + # ensure unit normed centers + centers = init + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "spherical-k-means": + labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd( + X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++" + ) + + return centers + + elif init == "random": + centers = np.random.randn(n_clusters, n_features) + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "k-means++": + centers, _ = _kmeans_plusplus(X, n_clusters, + random_state=random_state, + x_squared_norms=np.ones((n_examples,))) + + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + elif init == "random-orthonormal": + centers = np.random.randn(n_clusters, n_features) + q, r = np.linalg.qr(centers.T, mode="reduced") + + return q.T + + elif init == "random-class": + centers = np.zeros((n_clusters, n_features)) + for cc in range(n_clusters): + while np.linalg.norm(centers[cc, :]) == 0: + labels = np.random.randint(0, n_clusters, n_examples) + centers[cc, :] = X[labels == cc, :].sum(axis=0) + + for cc in range(n_clusters): + centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) + + return centers + + +def _expectation(X, centers, weights, concentrations, posterior_type="soft"): + """Compute the log-likelihood of each datapoint being in each cluster. + + Parameters + ---------- + centers (mu) : array, [n_centers x n_features] + weights (alpha) : array, [n_centers, ] (alpha) + concentrations (kappa) : array, [n_centers, ] + + Returns + ---------- + posterior : array, [n_centers, n_examples] + """ + n_examples, n_features = np.shape(X) + n_clusters, _ = centers.shape + + if n_features <= 50: # works up to about 50 before numrically unstable + vmf_f = _vmf_log + else: + vmf_f = _vmf_log_asymptotic + + f_log = np.zeros((n_clusters, n_examples)) + for cc in range(n_clusters): + f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) + + posterior = np.zeros((n_clusters, n_examples)) + if posterior_type == "soft": + weights_log = np.log(weights) + posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) + + elif posterior_type == "hard": + weights_log = np.log(weights) + weighted_f_log = np.tile(weights_log.T, (n_examples, 1)).T + f_log + for ee in range(n_examples): + posterior[np.argmax(weighted_f_log[:, ee]), ee] = 1.0 + + return posterior + + +def _maximization(X, posterior, force_weights=None): + """Estimate new centers, weights, and concentrations from + + Parameters + ---------- + posterior : array, [n_centers, n_examples] + The posterior matrix from the expectation step. + + force_weights : None or array, [n_centers, ] + If None is passed, will estimate weights. + If an array is passed, will use instead of estimating. 
+ + Returns + ---------- + centers (mu) : array, [n_centers x n_features] + weights (alpha) : array, [n_centers, ] (alpha) + concentrations (kappa) : array, [n_centers, ] + """ + n_examples, n_features = X.shape + n_clusters, n_examples = posterior.shape + concentrations = np.zeros((n_clusters,)) + centers = np.zeros((n_clusters, n_features)) + if force_weights is None: + weights = np.zeros((n_clusters,)) + + for cc in range(n_clusters): + # update weights (alpha) + if force_weights is None: + weights[cc] = np.mean(posterior[cc, :]) + else: + weights = force_weights + + # update centers (mu) + X_scaled = X.copy() + if sp.issparse(X): + X_scaled.data *= posterior[cc, :].repeat(np.diff(X_scaled.indptr)) + else: + for ee in range(n_examples): + X_scaled[ee, :] *= posterior[cc, ee] + + centers[cc, :] = X_scaled.sum(axis=0) + + # normalize centers + center_norm = np.linalg.norm(centers[cc, :]) + if center_norm > 1e-8: + centers[cc, :] = centers[cc, :] / center_norm + + # update concentration (kappa) [TODO: add other kappa approximations] + rbar = center_norm / (n_examples * weights[cc]) + concentrations[cc] = rbar * n_features - np.power(rbar, 3.0) + if np.abs(rbar - 1.0) < 1e-10: + concentrations[cc] = MAX_CONTENTRATION + else: + concentrations[cc] /= 1.0 - np.power(rbar, 2.0) + + # let python know we can free this (good for large dense X) + del X_scaled + + return centers, weights, concentrations + + +def _movMF( + X, + n_clusters, + posterior_type="soft", + force_weights=None, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, +): + """Mixture of von Mises Fisher clustering. + + Implements the algorithms (i) and (ii) from + + "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" + by Banerjee, Dhillon, Ghosh, and Sra. + + TODO: Currently only supports Banerjee et al 2005 approximation of kappa, + however, there are numerous other approximations see _update_params. + + Attribution + ---------- + Approximation of log-vmf distribution function from movMF R-package. + + movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions + by Kurt Hornik, Bettina Grun, 2014 + + Find more at: + https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf + https://cran.r-project.org/web/packages/movMF/index.html + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of + centroids to generate. + + posterior_type: 'soft' or 'hard' + Type of posterior computed in exepectation step. + See note about attribute: self.posterior_ + + force_weights : None or array [n_clusters, ] + If None, the algorithm will estimate the weights. + If an array of weights, algorithm will estimate concentrations and + centers with given weights. + + max_iter : int, default: 300 + Maximum number of iterations of the k-means algorithm for a + single run. + + n_init : int, default: 10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + init: (string) one of + random-class [default]: random class assignment & centroid computation + k-means++ : uses sklearn k-means++ initialization algorithm + spherical-k-means : use centroids from one pass of spherical k-means + random : random unit norm vectors + random-orthonormal : random orthonormal vectors + If an ndarray is passed, it should be of shape (n_clusters, n_features) + and gives the initial centers. 
+ + tol : float, default: 1e-6 + Relative tolerance with regards to inertia to declare convergence + + n_jobs : int + The number of jobs to use for the computation. This works by computing + each of the n_init runs in parallel. + If -1 all CPUs are used. If 1 is given, no parallel computing code is + used at all, which is useful for debugging. For n_jobs below -1, + (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one + are used. + + random_state : integer or numpy.RandomState, optional + The generator used to initialize the centers. If an integer is + given, it fixes the seed. Defaults to the global numpy random + number generator. + + verbose : int, default 0 + Verbosity mode. + + copy_x : boolean, default True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True, then the original data is not + modified. If False, the original data is modified, and put back before + the function returns, but small numerical differences may be introduced + by subtracting and then adding the data mean. + """ + random_state = check_random_state(random_state) + n_examples, n_features = np.shape(X) + + # init centers (mus) + centers = _init_unit_centers(X, n_clusters, random_state, init) + + # init weights (alphas) + if force_weights is None: + weights = np.ones((n_clusters,)) + weights = weights / np.sum(weights) + else: + weights = force_weights + + # init concentrations (kappas) + concentrations = np.ones((n_clusters,)) + + if verbose: + print("Initialization complete") + + for iter in range(max_iter): + centers_prev = centers.copy() + + # expectation step + posterior = _expectation( + X, centers, weights, concentrations, posterior_type=posterior_type + ) + + # maximization step + centers, weights, concentrations = _maximization( + X, posterior, force_weights=force_weights + ) + + # check convergence + tolcheck = squared_norm(centers_prev - centers) + if tolcheck <= tol: + if verbose: + print( + "Converged at iteration %d: " + "center shift %e within tolerance %e" % (iter, tolcheck, tol) + ) + break + + # labels come for free via posterior + labels = np.zeros((n_examples,)) + for ee in range(n_examples): + labels[ee] = np.argmax(posterior[:, ee]) + + inertia = _inertia_from_labels(X, centers, labels) + + return centers, weights, concentrations, posterior, labels, inertia + + +def movMF( + X, + n_clusters, + posterior_type="soft", + force_weights=None, + n_init=10, + n_jobs=1, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, + copy_x=True, +): + """Wrapper for parallelization of _movMF and running n_init times. + """ + if n_init <= 0: + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% n_init
+        )
+    random_state = check_random_state(random_state)
+
+    if max_iter <= 0:
+        raise ValueError(
+            "Number of iterations should be a positive number,"
+            " got %d instead" % max_iter
+        )
+
+    best_inertia = np.infty
+    X = as_float_array(X, copy=copy_x)
+    tol = _tolerance(X, tol)
+
+    if hasattr(init, "__array__"):
+        if n_init != 1:
+            warnings.warn(
+                "Explicit initial center position passed: "
+                "performing only one init in k-means instead of n_init=%d" % n_init,
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            n_init = 1
+
+    # defaults
+    best_centers = None
+    best_labels = None
+    best_weights = None
+    best_concentrations = None
+    best_posterior = None
+    best_inertia = None
+
+    if n_jobs == 1:
+        # For a single thread, less memory is needed if we just store one set
+        # of the best results (as opposed to one set per run per thread).
+        for it in range(n_init):
+            # cluster on the sphere
+            (centers, weights, concentrations, posterior, labels, inertia) = _movMF(
+                X,
+                n_clusters,
+                posterior_type=posterior_type,
+                force_weights=force_weights,
+                max_iter=max_iter,
+                verbose=verbose,
+                init=init,
+                random_state=random_state,
+                tol=tol,
+            )
+
+            # determine if these results are the best so far
+            if best_inertia is None or inertia < best_inertia:
+                best_centers = centers.copy()
+                best_labels = labels.copy()
+                best_weights = weights.copy()
+                best_concentrations = concentrations.copy()
+                best_posterior = posterior.copy()
+                best_inertia = inertia
+    else:
+        # parallelisation of movMF runs
+        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
+        results = Parallel(n_jobs=n_jobs, verbose=0)(
+            delayed(_movMF)(
+                X,
+                n_clusters,
+                posterior_type=posterior_type,
+                force_weights=force_weights,
+                max_iter=max_iter,
+                verbose=verbose,
+                init=init,
+                random_state=seed,  # use a distinct seed per run so the n_init restarts differ
+                tol=tol,
+            )
+            for seed in seeds
+        )
+
+        # Get results with the lowest inertia
+        centers, weights, concentrations, posteriors, labels, inertia = zip(*results)
+        best = np.argmin(inertia)
+        best_labels = labels[best]
+        best_inertia = inertia[best]
+        best_centers = centers[best]
+        best_concentrations = concentrations[best]
+        best_posterior = posteriors[best]
+        best_weights = weights[best]
+
+    return (
+        best_centers,
+        best_labels,
+        best_inertia,
+        best_weights,
+        best_concentrations,
+        best_posterior,
+    )
+
+
+class VonMisesFisherMixture(KMeans):
+    """Estimator for Mixture of von Mises Fisher clustering on the unit sphere.
+
+    Implements the algorithms (i) and (ii) from
+
+    "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions"
+    by Banerjee, Dhillon, Ghosh, and Sra.
+
+    TODO: Currently only supports Banerjee et al 2005 approximation of kappa,
+          however, there are numerous other approximations see _update_params.
+
+    Attribution
+    ----------
+    Approximation of log-vmf distribution function from movMF R-package.
+
+    movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions
+    by Kurt Hornik, Bettina Grun, 2014
+
+    Find more at:
+        https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf
+        https://cran.r-project.org/web/packages/movMF/index.html
+
+    Basic sklearn scaffolding from sklearn.cluster.KMeans.
+
+    Parameters
+    ----------
+    n_clusters : int, optional, default: 8
+        The number of clusters to form as well as the number of
+        centroids to generate.
+
+    posterior_type: 'soft' or 'hard'
+        Type of posterior computed in expectation step.
+        See note about attribute: self.posterior_
+
+    force_weights : None or array [n_clusters, ]
+        If None, the algorithm will estimate the weights.
+        If an array of weights, algorithm will estimate concentrations and
+        centers with given weights.
+
+    max_iter : int, default: 300
+        Maximum number of iterations of the k-means algorithm for a
+        single run.
+
+    n_init : int, default: 10
+        Number of times the k-means algorithm will be run with different
+        centroid seeds. The final results will be the best output of
+        n_init consecutive runs in terms of inertia.
+
+    init: (string) one of
+        random-class [default]: random class assignment & centroid computation
+        k-means++ : uses sklearn k-means++ initialization algorithm
+        spherical-k-means : use centroids from one pass of spherical k-means
+        random : random unit norm vectors
+        random-orthonormal : random orthonormal vectors
+        If an ndarray is passed, it should be of shape (n_clusters, n_features)
+        and gives the initial centers.
+
+    tol : float, default: 1e-6
+        Relative tolerance with regards to inertia to declare convergence
+
+    n_jobs : int
+        The number of jobs to use for the computation. This works by computing
+        each of the n_init runs in parallel.
+        If -1 all CPUs are used. If 1 is given, no parallel computing code is
+        used at all, which is useful for debugging. For n_jobs below -1,
+        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
+        are used.
+
+    random_state : integer or numpy.RandomState, optional
+        The generator used to initialize the centers. If an integer is
+        given, it fixes the seed. Defaults to the global numpy random
+        number generator.
+
+    verbose : int, default 0
+        Verbosity mode.
+
+    copy_x : boolean, default True
+        When pre-computing distances it is more numerically accurate to center
+        the data first. If copy_x is True, then the original data is not
+        modified. If False, the original data is modified, and put back before
+        the function returns, but small numerical differences may be introduced
+        by subtracting and then adding the data mean.
+
+    normalize : boolean, default True
+        Normalize the input to have unit norm.
+
+    Attributes
+    ----------
+
+    cluster_centers_ : array, [n_clusters, n_features]
+        Coordinates of cluster centers
+
+    labels_ :
+        Labels of each point
+
+    inertia_ : float
+        Sum of distances of samples to their closest cluster center.
+
+    weights_ : array, [n_clusters,]
+        Weights of each cluster in vMF distribution (alpha).
+
+    concentrations_ : array [n_clusters,]
+        Concentration parameter for each cluster (kappa).
+        Larger values correspond to more concentrated clusters.
+
+    posterior_ : array, [n_clusters, n_examples]
+        Each column corresponds to the posterior distribution for an example.
+
+        If posterior_type='hard' is used, there will only be one non-zero per
+        column, its index corresponding to the example's cluster label.
+
+        If posterior_type='soft' is used, this matrix will be dense and the
+        column values correspond to soft clustering weights.
+ """ + + def __init__( + self, + n_clusters=5, + posterior_type="soft", + force_weights=None, + n_init=10, + n_jobs=1, + max_iter=300, + verbose=False, + init="random-class", + random_state=None, + tol=1e-6, + copy_x=True, + normalize=True, + ): + self.n_clusters = n_clusters + self.posterior_type = posterior_type + self.force_weights = force_weights + self.n_init = n_init + self.n_jobs = n_jobs + self.max_iter = max_iter + self.verbose = verbose + self.init = init + self.random_state = random_state + self.tol = tol + self.copy_x = copy_x + self.normalize = normalize + + def _check_force_weights(self): + if self.force_weights is None: + return + + if len(self.force_weights) != self.n_clusters: + raise ValueError( + ( + "len(force_weights)={} but must equal " + "n_clusters={}".format(len(self.force_weights), self.n_clusters) + ) + ) + + def _check_fit_data(self, X): + """Verify that the number of samples given is larger than k""" + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + n_samples, n_features = X.shape + if X.shape[0] < self.n_clusters: + raise ValueError( + "n_samples=%d should be >= n_clusters=%d" + % (X.shape[0], self.n_clusters) + ) + + for ee in range(n_samples): + if sp.issparse(X): + n = sp.linalg.norm(X[ee, :]) + else: + n = np.linalg.norm(X[ee, :]) + + if np.abs(n - 1.0) > 1e-4: + raise ValueError("Data l2-norm must be 1, found {}".format(n)) + + return X + + def _check_test_data(self, X): + X = check_array(X, accept_sparse="csr", dtype=FLOAT_DTYPES) + n_samples, n_features = X.shape + expected_n_features = self.cluster_centers_.shape[1] + if not n_features == expected_n_features: + raise ValueError( + "Incorrect number of features. " + "Got %d features, expected %d" % (n_features, expected_n_features) + ) + + for ee in range(n_samples): + if sp.issparse(X): + n = sp.linalg.norm(X[ee, :]) + else: + n = np.linalg.norm(X[ee, :]) + + if np.abs(n - 1.0) > 1e-4: + raise ValueError("Data l2-norm must be 1, found {}".format(n)) + + return X + + def fit(self, X, y=None): + """Compute mixture of von Mises Fisher clustering. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + """ + if self.normalize: + X = normalize(X) + + # Validate init array + init = self.init + if hasattr(init, "__array__"): + init = check_array(init, dtype=X.dtype.type, order="C", copy=True) + self._validate_center_shape(X, init) + + self._check_force_weights() + random_state = check_random_state(self.random_state) + X = self._check_fit_data(X) + + ( + self.cluster_centers_, + self.labels_, + self.inertia_, + self.weights_, + self.concentrations_, + self.posterior_, + ) = movMF( + X, + self.n_clusters, + posterior_type=self.posterior_type, + force_weights=self.force_weights, + n_init=self.n_init, + n_jobs=self.n_jobs, + max_iter=self.max_iter, + verbose=self.verbose, + init=self.init, + random_state=random_state, + tol=self.tol, + copy_x=self.copy_x, + ) + + return self + + def fit_predict(self, X, y=None): + """Compute cluster centers and predict cluster index for each sample. + Convenience method; equivalent to calling fit(X) followed by + predict(X). + """ + return self.fit(X).labels_ + + def fit_transform(self, X, y=None): + """Compute clustering and transform X to cluster-distance space. + Equivalent to fit(X).transform(X), but more efficiently implemented. + """ + # Currently, this just skips a copy of the data if it is not in + # np.array or CSR format already. 
+ # XXX This skips _check_test_data, which may change the dtype; + # we should refactor the input validation. + return self.fit(X)._transform(X) + + def transform(self, X, y=None): + """Transform X to a cluster-distance space. + In the new space, each dimension is the cosine distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to transform. + + Returns + ------- + X_new : array, shape [n_samples, k] + X transformed in the new space. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + X = self._check_test_data(X) + return self._transform(X) + + def _transform(self, X): + """guts of transform method; no input validation""" + return cosine_distances(X, self.cluster_centers_) + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Note: Does not check that each point is on the sphere. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data to predict. + + Returns + ------- + labels : array, shape [n_samples,] + Index of the cluster each sample belongs to. + """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + + X = self._check_test_data(X) + return _labels_inertia(X, self.cluster_centers_)[0] + + def score(self, X, y=None): + """Inertia score (sum of all distances to closest cluster). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + New data. + + Returns + ------- + score : float + Larger score is better. 
+ """ + if self.normalize: + X = normalize(X) + + check_is_fitted(self) + X = self._check_test_data(X) + return -_labels_inertia(X, self.cluster_centers_)[1] + + def log_likelihood(self, X): + check_is_fitted(self) + + return _log_likelihood( + X, self.cluster_centers_, self.weights_, self.concentrations_ + ) diff --git a/code/taxonomy.py b/code/taxonomy.py index 3c5eb20..00794cd 100644 --- a/code/taxonomy.py +++ b/code/taxonomy.py @@ -53,7 +53,7 @@ def find_node(self, name): return None def sample_a_node(self): - return random.choice(self.all_nodes.values()) + return random.choice(list(self.all_nodes.values())) diff --git a/code/utils.py b/code/utils.py index 085e722..ca29d88 100644 --- a/code/utils.py +++ b/code/utils.py @@ -31,7 +31,7 @@ def avg_weighted_colors(color_list, c_size): for (color, weight) in color_list: w_color = [x * weight for x in color] # print w_color - result_color = map(operator.add, result_color, w_color) + result_color = list(map(operator.add, result_color, w_color)) # print result_color return l1_normalize(result_color) @@ -57,7 +57,7 @@ def cossim(p, q): def euclidean_distance(p, q): if len(p) != len(q): - print 'Euclidean distance error: p, q have different length' + print('Euclidean distance error: p, q have different length') distance = 0 @@ -69,7 +69,7 @@ def euclidean_distance(p, q): def euclidean_cluster(ps, c): if len(ps) == 0 or c == None: - print 'Cluster is empty' + print('Cluster is empty') distance = 0 @@ -83,7 +83,7 @@ def euclidean_cluster(ps, c): def dot_product(p, q): if len(p) != len(q): - print 'KL divergence error: p, q have different length' + print('KL divergence error: p, q have different length') p_len = q_len = mix_len = 0 @@ -122,7 +122,7 @@ def avg_emb_with_distinct(ele_map, embs_from, dist_map, vec_size): avg_emb = [0] * vec_size t_weight = 0 - for key, value in ele_map.iteritems(): + for key, value in ele_map.items(): t_emb = embs_from[key] w = value * dist_map[key] for i in range(vec_size): @@ -140,7 +140,7 @@ def avg_emb(ele_map, embs_from, vec_size): avg_emb = [0] * vec_size t_weight = 0 - for key, value in ele_map.iteritems(): + for key, value in ele_map.items(): t_emb = embs_from[key] w = value for i in range(vec_size): diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3e630e5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[tool.poetry] +name = "taxogen" +version = "0.1.0" +description = "" +authors = ["Cezar Sas "] + +[tool.poetry.dependencies] +python = ">=3.8,<3.11" +numpy = "^1.21.4" +scipy = "^1.7.3" +scikit-learn = "0.24.2" +PyYAML = "^6.0" + + +[tool.poetry.dev-dependencies] +2to3 = "^1.0" + + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" From 9e231940c94e10257806f31a92f1206dab819b1d Mon Sep 17 00:00:00 2001 From: Cezar Sas Date: Tue, 14 Dec 2021 09:52:40 +0100 Subject: [PATCH 2/2] Minor fixes --- code/spherecluster/spherecluster/__init__.py | 6 - .../spherecluster/spherical_kmeans.py | 372 ------- .../spherecluster/tests/test_common.py | 6 - .../tests/test_von_mises_fisher_mixture.py | 205 ---- code/spherecluster/spherecluster/util.py | 57 -- .../spherecluster/von_mises_fisher_mixture.py | 946 ------------------ 6 files changed, 1592 deletions(-) delete mode 100644 code/spherecluster/spherecluster/__init__.py delete mode 100644 code/spherecluster/spherecluster/spherical_kmeans.py delete mode 100644 code/spherecluster/spherecluster/tests/test_common.py delete mode 100644 
code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py delete mode 100644 code/spherecluster/spherecluster/util.py delete mode 100644 code/spherecluster/spherecluster/von_mises_fisher_mixture.py diff --git a/code/spherecluster/spherecluster/__init__.py b/code/spherecluster/spherecluster/__init__.py deleted file mode 100644 index 5d2c502..0000000 --- a/code/spherecluster/spherecluster/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import absolute_import -from .spherical_kmeans import SphericalKMeans -from .von_mises_fisher_mixture import VonMisesFisherMixture -from .util import sample_vMF - -__all__ = ["SphericalKMeans", "VonMisesFisherMixture", "sample_vMF"] diff --git a/code/spherecluster/spherecluster/spherical_kmeans.py b/code/spherecluster/spherecluster/spherical_kmeans.py deleted file mode 100644 index d56e03c..0000000 --- a/code/spherecluster/spherecluster/spherical_kmeans.py +++ /dev/null @@ -1,372 +0,0 @@ -import warnings - -import numpy as np -import scipy.sparse as sp -from joblib import Parallel, delayed - -from sklearn.cluster import KMeans - -# from sklearn.cluster import _k_means -from sklearn.cluster import _k_means_fast as _k_means -from sklearn.cluster.k_means_ import ( - _check_sample_weight, - _init_centroids, - _labels_inertia, - _tolerance, - _validate_center_shape, -) -from sklearn.preprocessing import normalize -from sklearn.utils import check_array, check_random_state -from sklearn.utils.extmath import row_norms, squared_norm -from sklearn.utils.validation import _num_samples - - -def _spherical_kmeans_single_lloyd( - X, - n_clusters, - sample_weight=None, - max_iter=300, - init="k-means++", - verbose=False, - x_squared_norms=None, - random_state=None, - tol=1e-4, - precompute_distances=True, -): - """ - Modified from sklearn.cluster.k_means_.k_means_single_lloyd. - """ - random_state = check_random_state(random_state) - - sample_weight = _check_sample_weight(sample_weight, X) - - best_labels, best_inertia, best_centers = None, None, None - - # init - centers = _init_centroids( - X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms - ) - if verbose: - print("Initialization complete") - - # Allocate memory to store the distances for each sample to its - # closer center for reallocation in case of ties - distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) - - # iterations - for i in range(max_iter): - centers_old = centers.copy() - - # labels assignment - # TODO: _labels_inertia should be done with cosine distance - # since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized - # this doesn't really matter. 
- labels, inertia = _labels_inertia( - X, - sample_weight, - x_squared_norms, - centers, - precompute_distances=precompute_distances, - distances=distances, - ) - - # computation of the means - if sp.issparse(X): - centers = _k_means._centers_sparse( - X, sample_weight, labels, n_clusters, distances - ) - else: - centers = _k_means._centers_dense( - X.astype(np.float), - sample_weight.astype(np.float), - labels, - n_clusters, - distances.astype(np.float), - ) - - # l2-normalize centers (this is the main contibution here) - centers = normalize(centers) - - if verbose: - print("Iteration %2d, inertia %.3f" % (i, inertia)) - - if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() - best_inertia = inertia - - center_shift_total = squared_norm(centers_old - centers) - if center_shift_total <= tol: - if verbose: - print( - "Converged at iteration %d: " - "center shift %e within tolerance %e" % (i, center_shift_total, tol) - ) - break - - if center_shift_total > 0: - # rerun E-step in case of non-convergence so that predicted labels - # match cluster centers - best_labels, best_inertia = _labels_inertia( - X, - sample_weight, - x_squared_norms, - best_centers, - precompute_distances=precompute_distances, - distances=distances, - ) - - return best_labels, best_inertia, best_centers, i + 1 - - -def spherical_k_means( - X, - n_clusters, - sample_weight=None, - init="k-means++", - n_init=10, - max_iter=300, - verbose=False, - tol=1e-4, - random_state=None, - copy_x=True, - n_jobs=1, - algorithm="auto", - return_n_iter=False, -): - """Modified from sklearn.cluster.k_means_.k_means. - """ - if n_init <= 0: - raise ValueError( - "Invalid number of initializations." - " n_init=%d must be bigger than zero." % n_init - ) - random_state = check_random_state(random_state) - - if max_iter <= 0: - raise ValueError( - "Number of iterations should be a positive number," - " got %d instead" % max_iter - ) - - best_inertia = np.infty - # avoid forcing order when copy_x=False - order = "C" if copy_x else None - X = check_array( - X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x - ) - # verify that the number of samples given is larger than k - if _num_samples(X) < n_clusters: - raise ValueError( - "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters) - ) - tol = _tolerance(X, tol) - - if hasattr(init, "__array__"): - init = check_array(init, dtype=X.dtype.type, order="C", copy=True) - _validate_center_shape(X, n_clusters, init) - - if n_init != 1: - warnings.warn( - "Explicit initial center position passed: " - "performing only one init in k-means instead of n_init=%d" % n_init, - RuntimeWarning, - stacklevel=2, - ) - n_init = 1 - - # precompute squared norms of data points - x_squared_norms = row_norms(X, squared=True) - - if n_jobs == 1: - # For a single thread, less memory is needed if we just store one set - # of the best results (as opposed to one set per run per thread). 
- for it in range(n_init): - # run a k-means once - labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd( - X, - n_clusters, - sample_weight, - max_iter=max_iter, - init=init, - verbose=verbose, - tol=tol, - x_squared_norms=x_squared_norms, - random_state=random_state, - ) - - # determine if these results are the best so far - if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() - best_inertia = inertia - best_n_iter = n_iter_ - else: - # parallelisation of k-means runs - seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - results = Parallel(n_jobs=n_jobs, verbose=0)( - delayed(_spherical_kmeans_single_lloyd)( - X, - n_clusters, - sample_weight, - max_iter=max_iter, - init=init, - verbose=verbose, - tol=tol, - x_squared_norms=x_squared_norms, - # Change seed to ensure variety - random_state=seed, - ) - for seed in seeds - ) - - # Get results with the lowest inertia - labels, inertia, centers, n_iters = zip(*results) - best = np.argmin(inertia) - best_labels = labels[best] - best_inertia = inertia[best] - best_centers = centers[best] - best_n_iter = n_iters[best] - - if return_n_iter: - return best_centers, best_labels, best_inertia, best_n_iter - else: - return best_centers, best_labels, best_inertia - - -class SphericalKMeans(KMeans): - """Spherical K-Means clustering - - Modfication of sklearn.cluster.KMeans where cluster centers are normalized - (projected onto the sphere) in each iteration. - - Parameters - ---------- - - n_clusters : int, optional, default: 8 - The number of clusters to form as well as the number of - centroids to generate. - - max_iter : int, default: 300 - Maximum number of iterations of the k-means algorithm for a - single run. - - n_init : int, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - - init : {'k-means++', 'random' or an ndarray} - Method for initialization, defaults to 'k-means++': - 'k-means++' : selects initial cluster centers for k-mean - clustering in a smart way to speed up convergence. See section - Notes in k_init for more details. - 'random': choose k observations (rows) at random from data for - the initial centroids. - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - tol : float, default: 1e-4 - Relative tolerance with regards to inertia to declare convergence - - n_jobs : int - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - verbose : int, default 0 - Verbosity mode. - - copy_x : boolean, default True - When pre-computing distances it is more numerically accurate to center - the data first. If copy_x is True, then the original data is not - modified. If False, the original data is modified, and put back before - the function returns, but small numerical differences may be introduced - by subtracting and then adding the data mean. 
- - normalize : boolean, default True - Normalize the input to have unnit norm. - - Attributes - ---------- - - cluster_centers_ : array, [n_clusters, n_features] - Coordinates of cluster centers - - labels_ : - Labels of each point - - inertia_ : float - Sum of distances of samples to their closest cluster center. - """ - - def __init__( - self, - n_clusters=8, - init="k-means++", - n_init=10, - max_iter=300, - tol=1e-4, - n_jobs=1, - verbose=0, - random_state=None, - copy_x=True, - normalize=True, - ): - self.n_clusters = n_clusters - self.init = init - self.max_iter = max_iter - self.tol = tol - self.n_init = n_init - self.verbose = verbose - self.random_state = random_state - self.copy_x = copy_x - self.n_jobs = n_jobs - self.normalize = normalize - - def fit(self, X, y=None, sample_weight=None): - """Compute k-means clustering. - - Parameters - ---------- - - X : array-like or sparse matrix, shape=(n_samples, n_features) - - y : Ignored - not used, present here for API consistency by convention. - - sample_weight : array-like, shape (n_samples,), optional - The weights for each observation in X. If None, all observations - are assigned equal weight (default: None) - """ - if self.normalize: - X = normalize(X) - - random_state = check_random_state(self.random_state) - - # TODO: add check that all data is unit-normalized - - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = spherical_k_means( - X, - n_clusters=self.n_clusters, - sample_weight=sample_weight, - init=self.init, - n_init=self.n_init, - max_iter=self.max_iter, - verbose=self.verbose, - tol=self.tol, - random_state=random_state, - copy_x=self.copy_x, - n_jobs=self.n_jobs, - return_n_iter=True, - ) - - return self diff --git a/code/spherecluster/spherecluster/tests/test_common.py b/code/spherecluster/spherecluster/tests/test_common.py deleted file mode 100644 index 119f1c4..0000000 --- a/code/spherecluster/spherecluster/tests/test_common.py +++ /dev/null @@ -1,6 +0,0 @@ -from sklearn.utils.estimator_checks import check_estimator -from spherecluster import SphericalKMeans - - -def test_estimator_spherical_k_means(): - return check_estimator(SphericalKMeans) diff --git a/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py b/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py deleted file mode 100644 index 29e6d73..0000000 --- a/code/spherecluster/spherecluster/tests/test_von_mises_fisher_mixture.py +++ /dev/null @@ -1,205 +0,0 @@ -import pytest -import scipy as sp -import numpy as np -from numpy.testing import assert_almost_equal, assert_array_equal -from spherecluster import VonMisesFisherMixture -from spherecluster import von_mises_fisher_mixture -from spherecluster import sample_vMF - - -def test_vmf_log_dense(): - """ - Test that approximation approaches whatever scipy has. - """ - n_examples = 2 - n_features = 50 - - kappas = np.linspace(2, 600, 20) - - mu = np.random.randn(n_features) - mu /= np.linalg.norm(mu) - - X = np.random.randn(n_examples, n_features) - for ee in range(n_examples): - X[ee, :] /= np.linalg.norm(X[ee, :]) - - diffs = [] - for kappa in kappas: - v = von_mises_fisher_mixture._vmf_log(X, kappa, mu) - - v_approx = von_mises_fisher_mixture._vmf_log_asymptotic(X, kappa, mu) - - normalized_approx_diff = np.linalg.norm(v - v_approx) / np.linalg.norm(v) - print(normalized_approx_diff) - diffs.append(normalized_approx_diff) - - assert diffs[0] > 10 * diffs[-1] - - -def test_vmf_log_detect_breakage(): - """ - Find where scipy approximation breaks down. 
- This doesn't really test anything but demonstrates where approximation - should be applied instead. - """ - n_examples = 3 - kappas = [5, 30, 100, 1000, 5000] - n_features = range(2, 500) - - breakage_points = [] - for kappa in kappas: - first_breakage = None - for n_f in n_features: - mu = np.random.randn(n_f) - mu /= np.linalg.norm(mu) - - X = np.random.randn(n_examples, n_f) - for ee in range(n_examples): - X[ee, :] /= np.linalg.norm(X[ee, :]) - - try: - von_mises_fisher_mixture._vmf_log(X, kappa, mu) - except: - if first_breakage is None: - first_breakage = n_f - - breakage_points.append(first_breakage) - print( - "Scipy vmf_log breaks for kappa={} at n_features={}".format( - kappa, first_breakage - ) - ) - - print(breakage_points) - assert_array_equal(breakage_points, [141, 420, 311, 3, 3]) - - -def test_maximization(): - num_points = 5000 - n_features = 500 - posterior = np.ones((1, num_points)) - - kappas = [5000, 8000, 16400] - for kappa in kappas: - mu = np.random.randn(n_features) - mu /= np.linalg.norm(mu) - - X = sample_vMF(mu, kappa, num_points) - - centers, weights, concentrations = von_mises_fisher_mixture._maximization( - X, posterior - ) - - print("center estimate error", np.linalg.norm(centers[0, :] - mu)) - print( - "kappa estimate", - np.abs(kappa - concentrations[0]) / kappa, - kappa, - concentrations[0], - ) - - assert_almost_equal(1., weights[0]) - assert_almost_equal(0.0, np.abs(kappa - concentrations[0]) / kappa, decimal=2) - assert_almost_equal(0.0, np.linalg.norm(centers[0, :] - mu), decimal=2) - - -@pytest.mark.parametrize( - "params_in", - [ - {"posterior_type": "soft"}, - {"posterior_type": "hard"}, - {"posterior_type": "soft", "n_jobs": 2}, - {"posterior_type": "hard", "n_jobs": 3}, - {"posterior_type": "hard", "force_weights": np.ones(5) / 5.}, - {"posterior_type": "soft", "n_jobs": -1}, - ], -) -def test_integration_dense(params_in): - n_clusters = 5 - n_examples = 20 - n_features = 100 - X = np.random.randn(n_examples, n_features) - for ee in range(n_examples): - X[ee, :] /= np.linalg.norm(X[ee, :]) - - params_in.update({"n_clusters": n_clusters}) - movmf = VonMisesFisherMixture(**params_in) - movmf.fit(X) - - assert movmf.cluster_centers_.shape == (n_clusters, n_features) - assert len(movmf.concentrations_) == n_clusters - assert len(movmf.weights_) == n_clusters - assert len(movmf.labels_) == n_examples - assert len(movmf.posterior_) == n_clusters - - for center in movmf.cluster_centers_: - assert_almost_equal(np.linalg.norm(center), 1.0) - - for concentration in movmf.concentrations_: - assert concentration > 0 - - for weight in movmf.weights_: - assert not np.isnan(weight) - - plabels = movmf.predict(X) - assert_array_equal(plabels, movmf.labels_) - - ll = movmf.log_likelihood(X) - ll_labels = np.zeros(movmf.labels_.shape) - for ee in range(n_examples): - ll_labels[ee] = np.argmax(ll[:, ee]) - - assert_array_equal(ll_labels, movmf.labels_) - - -@pytest.mark.parametrize( - "params_in", - [ - {"posterior_type": "soft"}, - {"posterior_type": "hard"}, - {"posterior_type": "soft", "n_jobs": 2}, - {"posterior_type": "hard", "n_jobs": 3}, - {"posterior_type": "hard", "force_weights": np.ones(5) / 5.}, - {"posterior_type": "soft", "n_jobs": -1}, - ], -) -def test_integration_sparse(params_in): - n_clusters = 5 - n_examples = 20 - n_features = 100 - n_nonzero = 10 - X = sp.sparse.csr_matrix((n_examples, n_features)) - for ee in range(n_examples): - ridx = np.random.randint(n_features, size=(n_nonzero)) - random_values = np.random.randn(n_nonzero) - 
random_values = random_values / np.linalg.norm(random_values) - X[ee, ridx] = random_values - - params_in.update({"n_clusters": n_clusters}) - movmf = VonMisesFisherMixture(**params_in) - movmf.fit(X) - - assert movmf.cluster_centers_.shape == (n_clusters, n_features) - assert len(movmf.concentrations_) == n_clusters - assert len(movmf.weights_) == n_clusters - assert len(movmf.labels_) == n_examples - assert len(movmf.posterior_) == n_clusters - - for center in movmf.cluster_centers_: - assert_almost_equal(np.linalg.norm(center), 1.0) - - for concentration in movmf.concentrations_: - assert concentration > 0 - - for weight in movmf.weights_: - assert not np.isnan(weight) - - plabels = movmf.predict(X) - assert_array_equal(plabels, movmf.labels_) - - ll = movmf.log_likelihood(X) - ll_labels = np.zeros(movmf.labels_.shape) - for ee in range(n_examples): - ll_labels[ee] = np.argmax(ll[:, ee]) - - assert_array_equal(ll_labels, movmf.labels_) diff --git a/code/spherecluster/spherecluster/util.py b/code/spherecluster/spherecluster/util.py deleted file mode 100644 index bb68261..0000000 --- a/code/spherecluster/spherecluster/util.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Generate multivariate von Mises Fisher samples. - -This solution originally appears here: -http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python - -Also see: - -Sampling from vMF on S^2: - https://www.mitsuba-renderer.org/~wenzel/files/vmf.pdf - http://www.stat.pitt.edu/sungkyu/software/randvonMisesFisher3.pdf -""" -import numpy as np - - -def sample_vMF(mu, kappa, num_samples): - """Generate num_samples N-dimensional samples from von Mises Fisher - distribution around center mu \in R^N with concentration kappa. - """ - dim = len(mu) - result = np.zeros((num_samples, dim)) - for nn in range(num_samples): - # sample offset from center (on sphere) with spread kappa - w = _sample_weight(kappa, dim) - - # sample a point v on the unit sphere that's orthogonal to mu - v = _sample_orthonormal_to(mu) - - # compute new point - result[nn, :] = v * np.sqrt(1. - w ** 2) + w * mu - - return result - - -def _sample_weight(kappa, dim): - """Rejection sampling scheme for sampling distance from center on - surface of the sphere. - """ - dim = dim - 1 # since S^{n-1} - b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) - x = (1. - b) / (1. + b) - c = kappa * x + dim * np.log(1 - x ** 2) - - while True: - z = np.random.beta(dim / 2., dim / 2.) - w = (1. - (1. + b) * z) / (1. - (1. - b) * z) - u = np.random.uniform(low=0, high=1) - if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): - return w - - -def _sample_orthonormal_to(mu): - """Sample point on sphere orthogonal to mu.""" - v = np.random.randn(mu.shape[0]) - proj_mu_v = mu * np.dot(mu, v) / np.linalg.norm(mu) - orthto = v - proj_mu_v - return orthto / np.linalg.norm(orthto) diff --git a/code/spherecluster/spherecluster/von_mises_fisher_mixture.py b/code/spherecluster/spherecluster/von_mises_fisher_mixture.py deleted file mode 100644 index 53c4c67..0000000 --- a/code/spherecluster/spherecluster/von_mises_fisher_mixture.py +++ /dev/null @@ -1,946 +0,0 @@ -import warnings - -import numpy as np -import scipy.sparse as sp -from joblib import Parallel, delayed -from numpy import i0 # modified Bessel function of first kind order 0, I_0 -from scipy.special import iv # modified Bessel function of first kind, I_v -from scipy.special import logsumexp - -from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin -from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape -from sklearn.metrics.pairwise import cosine_distances -from sklearn.preprocessing import normalize -from sklearn.utils import check_array, check_random_state, as_float_array -from sklearn.utils.extmath import squared_norm -from sklearn.utils.validation import FLOAT_DTYPES -from sklearn.utils.validation import check_is_fitted - -from . import spherical_kmeans - -MAX_CONTENTRATION = 1e10 - - -def _inertia_from_labels(X, centers, labels): - """Compute inertia with cosine distance using known labels. - """ - n_examples, n_features = X.shape - inertia = np.zeros((n_examples,)) - for ee in range(n_examples): - inertia[ee] = 1 - X[ee, :].dot(centers[int(labels[ee]), :].T) - - return np.sum(inertia) - - -def _labels_inertia(X, centers): - """Compute labels and inertia with cosine distance. - """ - n_examples, n_features = X.shape - n_clusters, n_features = centers.shape - - labels = np.zeros((n_examples,)) - inertia = np.zeros((n_examples,)) - - for ee in range(n_examples): - dists = np.zeros((n_clusters,)) - for cc in range(n_clusters): - dists[cc] = 1 - X[ee, :].dot(centers[cc, :].T) - - labels[ee] = np.argmin(dists) - inertia[ee] = dists[int(labels[ee])] - - return labels, np.sum(inertia) - - -def _vmf_log(X, kappa, mu): - """Computs log(vMF(X, kappa, mu)) using built-in numpy/scipy Bessel - approximations. - - Works well on small kappa and mu. - """ - n_examples, n_features = X.shape - return np.log(_vmf_normalize(kappa, n_features) * np.exp(kappa * X.dot(mu).T)) - - -def _vmf_normalize(kappa, dim): - """Compute normalization constant using built-in numpy/scipy Bessel - approximations. - - Works well on small kappa and mu. - """ - num = np.power(kappa, dim / 2.0 - 1.0) - - if dim / 2.0 - 1.0 < 1e-15: - denom = np.power(2.0 * np.pi, dim / 2.0) * i0(kappa) - else: - denom = np.power(2.0 * np.pi, dim / 2.0) * iv(dim / 2.0 - 1.0, kappa) - - if np.isinf(num): - raise ValueError("VMF scaling numerator was inf.") - - if np.isinf(denom): - raise ValueError("VMF scaling denominator was inf.") - - if np.abs(denom) < 1e-15: - raise ValueError("VMF scaling denominator was 0.") - - return num / denom - - -def _log_H_asymptotic(nu, kappa): - """Compute the Amos-type upper bound asymptotic approximation on H where - log(H_\nu)(\kappa) = \int_0^\kappa R_\nu(t) dt. 
- - See "lH_asymptotic <-" in movMF.R and utility function implementation notes - from https://cran.r-project.org/web/packages/movMF/index.html - """ - beta = np.sqrt((nu + 0.5) ** 2) - kappa_l = np.min([kappa, np.sqrt((3.0 * nu + 11.0 / 2.0) * (nu + 3.0 / 2.0))]) - return _S(kappa, nu + 0.5, beta) + ( - _S(kappa_l, nu, nu + 2.0) - _S(kappa_l, nu + 0.5, beta) - ) - - -def _S(kappa, alpha, beta): - """Compute the antiderivative of the Amos-type bound G on the modified - Bessel function ratio. - - Note: Handles scalar kappa, alpha, and beta only. - - See "S <-" in movMF.R and utility function implementation notes from - https://cran.r-project.org/web/packages/movMF/index.html - """ - kappa = 1.0 * np.abs(kappa) - alpha = 1.0 * alpha - beta = 1.0 * np.abs(beta) - a_plus_b = alpha + beta - u = np.sqrt(kappa ** 2 + beta ** 2) - if alpha == 0: - alpha_scale = 0 - else: - alpha_scale = alpha * np.log((alpha + u) / a_plus_b) - - return u - beta - alpha_scale - - -def _vmf_log_asymptotic(X, kappa, mu): - """Compute log(f(x|theta)) via Amos approximation - - log(f(x|theta)) = theta' x - log(H_{d/2-1})(\|theta\|) - - where theta = kappa * X, \|theta\| = kappa. - - Computing _vmf_log helps with numerical stability / loss of precision for - for large values of kappa and n_features. - - See utility function implementation notes in movMF.R from - https://cran.r-project.org/web/packages/movMF/index.html - """ - n_examples, n_features = X.shape - log_vfm = kappa * X.dot(mu).T + -_log_H_asymptotic(n_features / 2.0 - 1.0, kappa) - - return log_vfm - - -def _log_likelihood(X, centers, weights, concentrations): - if len(np.shape(X)) != 2: - X = X.reshape((1, len(X))) - - n_examples, n_features = np.shape(X) - n_clusters, _ = centers.shape - - if n_features <= 50: # works up to about 50 before numrically unstable - vmf_f = _vmf_log - else: - vmf_f = _vmf_log_asymptotic - - f_log = np.zeros((n_clusters, n_examples)) - for cc in range(n_clusters): - f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) - - posterior = np.zeros((n_clusters, n_examples)) - weights_log = np.log(weights) - posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log - for ee in range(n_examples): - posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) - - return posterior - - -def _init_unit_centers(X, n_clusters, random_state, init): - """Initializes unit norm centers. - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - - n_clusters : int, optional, default: 8 - The number of clusters to form as well as the number of - centroids to generate. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - init: (string) one of - k-means++ : uses sklearn k-means++ initialization algorithm - spherical-k-means : use centroids from one pass of spherical k-means - random : random unit norm vectors - random-orthonormal : random orthonormal vectors - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. 
- """ - n_examples, n_features = np.shape(X) - if isinstance(init, np.ndarray): - n_init_clusters, n_init_features = init.shape - assert n_init_clusters == n_clusters - assert n_init_features == n_features - - # ensure unit normed centers - centers = init - for cc in range(n_clusters): - centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) - - return centers - - elif init == "spherical-k-means": - labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd( - X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++" - ) - - return centers - - elif init == "random": - centers = np.random.randn(n_clusters, n_features) - for cc in range(n_clusters): - centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) - - return centers - - elif init == "k-means++": - centers = _init_centroids( - X, - n_clusters, - "k-means++", - random_state=random_state, - x_squared_norms=np.ones((n_examples,)), - ) - - for cc in range(n_clusters): - centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) - - return centers - - elif init == "random-orthonormal": - centers = np.random.randn(n_clusters, n_features) - q, r = np.linalg.qr(centers.T, mode="reduced") - - return q.T - - elif init == "random-class": - centers = np.zeros((n_clusters, n_features)) - for cc in range(n_clusters): - while np.linalg.norm(centers[cc, :]) == 0: - labels = np.random.randint(0, n_clusters, n_examples) - centers[cc, :] = X[labels == cc, :].sum(axis=0) - - for cc in range(n_clusters): - centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :]) - - return centers - - -def _expectation(X, centers, weights, concentrations, posterior_type="soft"): - """Compute the log-likelihood of each datapoint being in each cluster. - - Parameters - ---------- - centers (mu) : array, [n_centers x n_features] - weights (alpha) : array, [n_centers, ] (alpha) - concentrations (kappa) : array, [n_centers, ] - - Returns - ---------- - posterior : array, [n_centers, n_examples] - """ - n_examples, n_features = np.shape(X) - n_clusters, _ = centers.shape - - if n_features <= 50: # works up to about 50 before numrically unstable - vmf_f = _vmf_log - else: - vmf_f = _vmf_log_asymptotic - - f_log = np.zeros((n_clusters, n_examples)) - for cc in range(n_clusters): - f_log[cc, :] = vmf_f(X, concentrations[cc], centers[cc, :]) - - posterior = np.zeros((n_clusters, n_examples)) - if posterior_type == "soft": - weights_log = np.log(weights) - posterior = np.tile(weights_log.T, (n_examples, 1)).T + f_log - for ee in range(n_examples): - posterior[:, ee] = np.exp(posterior[:, ee] - logsumexp(posterior[:, ee])) - - elif posterior_type == "hard": - weights_log = np.log(weights) - weighted_f_log = np.tile(weights_log.T, (n_examples, 1)).T + f_log - for ee in range(n_examples): - posterior[np.argmax(weighted_f_log[:, ee]), ee] = 1.0 - - return posterior - - -def _maximization(X, posterior, force_weights=None): - """Estimate new centers, weights, and concentrations from - - Parameters - ---------- - posterior : array, [n_centers, n_examples] - The posterior matrix from the expectation step. - - force_weights : None or array, [n_centers, ] - If None is passed, will estimate weights. - If an array is passed, will use instead of estimating. 
- - Returns - ---------- - centers (mu) : array, [n_centers x n_features] - weights (alpha) : array, [n_centers, ] (alpha) - concentrations (kappa) : array, [n_centers, ] - """ - n_examples, n_features = X.shape - n_clusters, n_examples = posterior.shape - concentrations = np.zeros((n_clusters,)) - centers = np.zeros((n_clusters, n_features)) - if force_weights is None: - weights = np.zeros((n_clusters,)) - - for cc in range(n_clusters): - # update weights (alpha) - if force_weights is None: - weights[cc] = np.mean(posterior[cc, :]) - else: - weights = force_weights - - # update centers (mu) - X_scaled = X.copy() - if sp.issparse(X): - X_scaled.data *= posterior[cc, :].repeat(np.diff(X_scaled.indptr)) - else: - for ee in range(n_examples): - X_scaled[ee, :] *= posterior[cc, ee] - - centers[cc, :] = X_scaled.sum(axis=0) - - # normalize centers - center_norm = np.linalg.norm(centers[cc, :]) - if center_norm > 1e-8: - centers[cc, :] = centers[cc, :] / center_norm - - # update concentration (kappa) [TODO: add other kappa approximations] - rbar = center_norm / (n_examples * weights[cc]) - concentrations[cc] = rbar * n_features - np.power(rbar, 3.0) - if np.abs(rbar - 1.0) < 1e-10: - concentrations[cc] = MAX_CONTENTRATION - else: - concentrations[cc] /= 1.0 - np.power(rbar, 2.0) - - # let python know we can free this (good for large dense X) - del X_scaled - - return centers, weights, concentrations - - -def _movMF( - X, - n_clusters, - posterior_type="soft", - force_weights=None, - max_iter=300, - verbose=False, - init="random-class", - random_state=None, - tol=1e-6, -): - """Mixture of von Mises Fisher clustering. - - Implements the algorithms (i) and (ii) from - - "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" - by Banerjee, Dhillon, Ghosh, and Sra. - - TODO: Currently only supports Banerjee et al 2005 approximation of kappa, - however, there are numerous other approximations see _update_params. - - Attribution - ---------- - Approximation of log-vmf distribution function from movMF R-package. - - movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions - by Kurt Hornik, Bettina Grun, 2014 - - Find more at: - https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf - https://cran.r-project.org/web/packages/movMF/index.html - - Parameters - ---------- - n_clusters : int, optional, default: 8 - The number of clusters to form as well as the number of - centroids to generate. - - posterior_type: 'soft' or 'hard' - Type of posterior computed in exepectation step. - See note about attribute: self.posterior_ - - force_weights : None or array [n_clusters, ] - If None, the algorithm will estimate the weights. - If an array of weights, algorithm will estimate concentrations and - centers with given weights. - - max_iter : int, default: 300 - Maximum number of iterations of the k-means algorithm for a - single run. - - n_init : int, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - - init: (string) one of - random-class [default]: random class assignment & centroid computation - k-means++ : uses sklearn k-means++ initialization algorithm - spherical-k-means : use centroids from one pass of spherical k-means - random : random unit norm vectors - random-orthonormal : random orthonormal vectors - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. 
- - tol : float, default: 1e-6 - Relative tolerance with regards to inertia to declare convergence - - n_jobs : int - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - verbose : int, default 0 - Verbosity mode. - - copy_x : boolean, default True - When pre-computing distances it is more numerically accurate to center - the data first. If copy_x is True, then the original data is not - modified. If False, the original data is modified, and put back before - the function returns, but small numerical differences may be introduced - by subtracting and then adding the data mean. - """ - random_state = check_random_state(random_state) - n_examples, n_features = np.shape(X) - - # init centers (mus) - centers = _init_unit_centers(X, n_clusters, random_state, init) - - # init weights (alphas) - if force_weights is None: - weights = np.ones((n_clusters,)) - weights = weights / np.sum(weights) - else: - weights = force_weights - - # init concentrations (kappas) - concentrations = np.ones((n_clusters,)) - - if verbose: - print("Initialization complete") - - for iter in range(max_iter): - centers_prev = centers.copy() - - # expectation step - posterior = _expectation( - X, centers, weights, concentrations, posterior_type=posterior_type - ) - - # maximization step - centers, weights, concentrations = _maximization( - X, posterior, force_weights=force_weights - ) - - # check convergence - tolcheck = squared_norm(centers_prev - centers) - if tolcheck <= tol: - if verbose: - print( - "Converged at iteration %d: " - "center shift %e within tolerance %e" % (iter, tolcheck, tol) - ) - break - - # labels come for free via posterior - labels = np.zeros((n_examples,)) - for ee in range(n_examples): - labels[ee] = np.argmax(posterior[:, ee]) - - inertia = _inertia_from_labels(X, centers, labels) - - return centers, weights, concentrations, posterior, labels, inertia - - -def movMF( - X, - n_clusters, - posterior_type="soft", - force_weights=None, - n_init=10, - n_jobs=1, - max_iter=300, - verbose=False, - init="random-class", - random_state=None, - tol=1e-6, - copy_x=True, -): - """Wrapper for parallelization of _movMF and running n_init times. - """ - if n_init <= 0: - raise ValueError( - "Invalid number of initializations." - " n_init=%d must be bigger than zero." 
% n_init - ) - random_state = check_random_state(random_state) - - if max_iter <= 0: - raise ValueError( - "Number of iterations should be a positive number," - " got %d instead" % max_iter - ) - - best_inertia = np.infty - X = as_float_array(X, copy=copy_x) - tol = _tolerance(X, tol) - - if hasattr(init, "__array__"): - init = check_array(init, dtype=X.dtype.type, copy=True) - _validate_center_shape(X, n_clusters, init) - - if n_init != 1: - warnings.warn( - "Explicit initial center position passed: " - "performing only one init in k-means instead of n_init=%d" % n_init, - RuntimeWarning, - stacklevel=2, - ) - n_init = 1 - - # defaults - best_centers = None - best_labels = None - best_weights = None - best_concentrations = None - best_posterior = None - best_inertia = None - - if n_jobs == 1: - # For a single thread, less memory is needed if we just store one set - # of the best results (as opposed to one set per run per thread). - for it in range(n_init): - # cluster on the sphere - (centers, weights, concentrations, posterior, labels, inertia) = _movMF( - X, - n_clusters, - posterior_type=posterior_type, - force_weights=force_weights, - max_iter=max_iter, - verbose=verbose, - init=init, - random_state=random_state, - tol=tol, - ) - - # determine if these results are the best so far - if best_inertia is None or inertia < best_inertia: - best_centers = centers.copy() - best_labels = labels.copy() - best_weights = weights.copy() - best_concentrations = concentrations.copy() - best_posterior = posterior.copy() - best_inertia = inertia - else: - # parallelisation of movMF runs - seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - results = Parallel(n_jobs=n_jobs, verbose=0)( - delayed(_movMF)( - X, - n_clusters, - posterior_type=posterior_type, - force_weights=force_weights, - max_iter=max_iter, - verbose=verbose, - init=init, - random_state=random_state, - tol=tol, - ) - for seed in seeds - ) - - # Get results with the lowest inertia - centers, weights, concentrations, posteriors, labels, inertia = zip(*results) - best = np.argmin(inertia) - best_labels = labels[best] - best_inertia = inertia[best] - best_centers = centers[best] - best_concentrations = concentrations[best] - best_posterior = posteriors[best] - best_weights = weights[best] - - return ( - best_centers, - best_labels, - best_inertia, - best_weights, - best_concentrations, - best_posterior, - ) - - -class VonMisesFisherMixture(BaseEstimator, ClusterMixin, TransformerMixin): - """Estimator for Mixture of von Mises Fisher clustering on the unit sphere. - - Implements the algorithms (i) and (ii) from - - "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions" - by Banerjee, Dhillon, Ghosh, and Sra. - - TODO: Currently only supports Banerjee et al 2005 approximation of kappa, - however, there are numerous other approximations see _update_params. - - Attribution - ---------- - Approximation of log-vmf distribution function from movMF R-package. - - movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions - by Kurt Hornik, Bettina Grun, 2014 - - Find more at: - https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf - https://cran.r-project.org/web/packages/movMF/index.html - - Basic sklearn scaffolding from sklearn.cluster.KMeans. - - Parameters - ---------- - n_clusters : int, optional, default: 8 - The number of clusters to form as well as the number of - centroids to generate. - - posterior_type: 'soft' or 'hard' - Type of posterior computed in exepectation step. 
- See note about attribute: self.posterior_ - - force_weights : None or array [n_clusters, ] - If None, the algorithm will estimate the weights. - If an array of weights, algorithm will estimate concentrations and - centers with given weights. - - max_iter : int, default: 300 - Maximum number of iterations of the k-means algorithm for a - single run. - - n_init : int, default: 10 - Number of time the k-means algorithm will be run with different - centroid seeds. The final results will be the best output of - n_init consecutive runs in terms of inertia. - - init: (string) one of - random-class [default]: random class assignment & centroid computation - k-means++ : uses sklearn k-means++ initialization algorithm - spherical-k-means : use centroids from one pass of spherical k-means - random : random unit norm vectors - random-orthonormal : random orthonormal vectors - If an ndarray is passed, it should be of shape (n_clusters, n_features) - and gives the initial centers. - - tol : float, default: 1e-6 - Relative tolerance with regards to inertia to declare convergence - - n_jobs : int - The number of jobs to use for the computation. This works by computing - each of the n_init runs in parallel. - If -1 all CPUs are used. If 1 is given, no parallel computing code is - used at all, which is useful for debugging. For n_jobs below -1, - (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one - are used. - - random_state : integer or numpy.RandomState, optional - The generator used to initialize the centers. If an integer is - given, it fixes the seed. Defaults to the global numpy random - number generator. - - verbose : int, default 0 - Verbosity mode. - - copy_x : boolean, default True - When pre-computing distances it is more numerically accurate to center - the data first. If copy_x is True, then the original data is not - modified. If False, the original data is modified, and put back before - the function returns, but small numerical differences may be introduced - by subtracting and then adding the data mean. - - normalize : boolean, default True - Normalize the input to have unnit norm. - - Attributes - ---------- - - cluster_centers_ : array, [n_clusters, n_features] - Coordinates of cluster centers - - labels_ : - Labels of each point - - inertia_ : float - Sum of distances of samples to their closest cluster center. - - weights_ : array, [n_clusters,] - Weights of each cluster in vMF distribution (alpha). - - concentrations_ : array [n_clusters,] - Concentration parameter for each cluster (kappa). - Larger values correspond to more concentrated clusters. - - posterior_ : array, [n_clusters, n_examples] - Each column corresponds to the posterio distribution for and example. - - If posterior_type='hard' is used, there will only be one non-zero per - column, its index corresponding to the example's cluster label. - - If posterior_type='soft' is used, this matrix will be dense and the - column values correspond to soft clustering weights. 
- """ - - def __init__( - self, - n_clusters=5, - posterior_type="soft", - force_weights=None, - n_init=10, - n_jobs=1, - max_iter=300, - verbose=False, - init="random-class", - random_state=None, - tol=1e-6, - copy_x=True, - normalize=True, - ): - self.n_clusters = n_clusters - self.posterior_type = posterior_type - self.force_weights = force_weights - self.n_init = n_init - self.n_jobs = n_jobs - self.max_iter = max_iter - self.verbose = verbose - self.init = init - self.random_state = random_state - self.tol = tol - self.copy_x = copy_x - self.normalize = normalize - - def _check_force_weights(self): - if self.force_weights is None: - return - - if len(self.force_weights) != self.n_clusters: - raise ValueError( - ( - "len(force_weights)={} but must equal " - "n_clusters={}".format(len(self.force_weights), self.n_clusters) - ) - ) - - def _check_fit_data(self, X): - """Verify that the number of samples given is larger than k""" - X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - n_samples, n_features = X.shape - if X.shape[0] < self.n_clusters: - raise ValueError( - "n_samples=%d should be >= n_clusters=%d" - % (X.shape[0], self.n_clusters) - ) - - for ee in range(n_samples): - if sp.issparse(X): - n = sp.linalg.norm(X[ee, :]) - else: - n = np.linalg.norm(X[ee, :]) - - if np.abs(n - 1.0) > 1e-4: - raise ValueError("Data l2-norm must be 1, found {}".format(n)) - - return X - - def _check_test_data(self, X): - X = check_array(X, accept_sparse="csr", dtype=FLOAT_DTYPES) - n_samples, n_features = X.shape - expected_n_features = self.cluster_centers_.shape[1] - if not n_features == expected_n_features: - raise ValueError( - "Incorrect number of features. " - "Got %d features, expected %d" % (n_features, expected_n_features) - ) - - for ee in range(n_samples): - if sp.issparse(X): - n = sp.linalg.norm(X[ee, :]) - else: - n = np.linalg.norm(X[ee, :]) - - if np.abs(n - 1.0) > 1e-4: - raise ValueError("Data l2-norm must be 1, found {}".format(n)) - - return X - - def fit(self, X, y=None): - """Compute mixture of von Mises Fisher clustering. - - Parameters - ---------- - X : array-like or sparse matrix, shape=(n_samples, n_features) - """ - if self.normalize: - X = normalize(X) - - self._check_force_weights() - random_state = check_random_state(self.random_state) - X = self._check_fit_data(X) - - ( - self.cluster_centers_, - self.labels_, - self.inertia_, - self.weights_, - self.concentrations_, - self.posterior_, - ) = movMF( - X, - self.n_clusters, - posterior_type=self.posterior_type, - force_weights=self.force_weights, - n_init=self.n_init, - n_jobs=self.n_jobs, - max_iter=self.max_iter, - verbose=self.verbose, - init=self.init, - random_state=random_state, - tol=self.tol, - copy_x=self.copy_x, - ) - - return self - - def fit_predict(self, X, y=None): - """Compute cluster centers and predict cluster index for each sample. - Convenience method; equivalent to calling fit(X) followed by - predict(X). - """ - return self.fit(X).labels_ - - def fit_transform(self, X, y=None): - """Compute clustering and transform X to cluster-distance space. - Equivalent to fit(X).transform(X), but more efficiently implemented. - """ - # Currently, this just skips a copy of the data if it is not in - # np.array or CSR format already. - # XXX This skips _check_test_data, which may change the dtype; - # we should refactor the input validation. - return self.fit(X)._transform(X) - - def transform(self, X, y=None): - """Transform X to a cluster-distance space. 
- In the new space, each dimension is the cosine distance to the cluster - centers. Note that even if X is sparse, the array returned by - `transform` will typically be dense. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - New data to transform. - - Returns - ------- - X_new : array, shape [n_samples, k] - X transformed in the new space. - """ - if self.normalize: - X = normalize(X) - - check_is_fitted(self) - X = self._check_test_data(X) - return self._transform(X) - - def _transform(self, X): - """guts of transform method; no input validation""" - return cosine_distances(X, self.cluster_centers_) - - def predict(self, X): - """Predict the closest cluster each sample in X belongs to. - In the vector quantization literature, `cluster_centers_` is called - the code book and each value returned by `predict` is the index of - the closest code in the code book. - - Note: Does not check that each point is on the sphere. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - New data to predict. - - Returns - ------- - labels : array, shape [n_samples,] - Index of the cluster each sample belongs to. - """ - if self.normalize: - X = normalize(X) - - check_is_fitted(self) - - X = self._check_test_data(X) - return _labels_inertia(X, self.cluster_centers_)[0] - - def score(self, X, y=None): - """Inertia score (sum of all distances to closest cluster). - - Parameters - ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - New data. - - Returns - ------- - score : float - Larger score is better. - """ - if self.normalize: - X = normalize(X) - - check_is_fitted(self) - X = self._check_test_data(X) - return -_labels_inertia(X, self.cluster_centers_)[1] - - def log_likelihood(self, X): - check_is_fitted(self) - - return _log_likelihood( - X, self.cluster_centers_, self.weights_, self.concentrations_ - )
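For reference, a minimal usage sketch of the vendored estimators after these two patches. This is not part of either diff; it assumes the snippet is run from inside code/ (so that the top-level spherecluster package added in PATCH 1/2 is importable), that code/spherecluster/__init__.py re-exports the estimators as the removed nested __init__.py did, and that the pinned scikit-learn release still provides the private k-means helpers the vendored modules import. Data shapes and parameter values are illustrative only.

    import numpy as np
    from sklearn.preprocessing import normalize

    # vendored package under code/; assumed to re-export both estimators
    from spherecluster import SphericalKMeans, VonMisesFisherMixture

    # toy data: both estimators cluster on the unit sphere, so rows are l2-normalized
    rng = np.random.RandomState(0)
    X = normalize(rng.randn(200, 50))

    # spherical k-means: centroids are re-normalized after every update
    skm = SphericalKMeans(n_clusters=5, random_state=0)
    skm.fit(X)
    print(skm.cluster_centers_.shape, skm.inertia_)

    # mixture of von Mises-Fisher distributions with soft assignments
    vmf = VonMisesFisherMixture(n_clusters=5, posterior_type="soft", random_state=0)
    vmf.fit(X)
    print(vmf.labels_[:10], vmf.concentrations_)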