@@ -57,7 +57,7 @@ def _run_hdbscan_assign(
57
57
:param min_samples: The number of samples in a neighborhood for a point
58
58
:param ancillary_df: (optional) Ancillary data to include in the clustering
59
59
:param out_path: The output path to save the clustering artifacts to
60
- :return: The average similarity score for each cluster, cluster ids, cluster means, and coverage
60
+ :return: The average similarity score for each cluster, exemplar_df, cluster ids, cluster means, and coverage
61
61
"""
62
62
info (f'Clustering using HDBSCAN using alpha { alpha } ...' )
63
63
@@ -80,8 +80,15 @@ def _run_hdbscan_assign(
80
80
# Get the number of samples which is the number of rows in the dataframe - this is used mostly for calculating coverage
81
81
num_samples = df .shape [0 ]
82
82
83
- tsne = TSNE (n_components = 2 , perplexity = 40 , metric = "cosine" , n_jobs = 8 , random_state = 42 , verbose = True )
84
- embedding = tsne .fit_transform (df .values )
83
+ # Perplexity must be less than the number of samples
84
+ perplexity = min (30 , num_samples - 1 )
85
+
86
+ # t-SNE does not work well when there are only a few samples
87
+ if num_samples > 100 :
88
+ tsne = TSNE (n_components = 2 , perplexity = perplexity , metric = "cosine" , n_jobs = 8 , random_state = 42 , verbose = True )
89
+ embedding = tsne .fit_transform (df .values )
90
+ else :
91
+ embedding = df .values
85
92
x = MinMaxScaler ().fit_transform (embedding ) # scale the embedding to 0-1
86
93
87
94
# Cluster the embeddings using HDBSCAN
@@ -131,13 +138,20 @@ def _run_hdbscan_assign(
131
138
# Get the index of the highest scores for each unique cluster sorted in increasing order
132
139
# and use this as a representative image for the cluster
133
140
max_scores = cluster_df .sort_values ('cluster' , ascending = True ).groupby ('cluster' )['score' ].idxmax ()
134
- # Remove the first and last index which are the unassigned cluster and the noise cluster
135
- max_scores = max_scores [1 :- 1 ]
141
+ # Remove the last index which is the -1 cluster
142
+ max_scores = max_scores [:- 1 ]
136
143
137
- # Get the representative embeddings for the max scoring each cluster
144
+ # Get the representative embeddings for the highest-scoring exemplar of each cluster and store them in a numpy array
138
145
exemplar_emb = [image_emb [i ] for i in max_scores ]
139
146
exemplar_emb = np .array (exemplar_emb )
140
147
148
+ # Save the exemplar embeddings to a dataframe with some metadata
149
+ exemplar_df = pd .DataFrame ()
150
+ exemplar_df ['cluster' ] = [f'Unknown C{ i } ' for i in range (0 , len (max_scores ))]
151
+ if ancillary_df is not None and 'image_path' in ancillary_df .columns :
152
+ exemplar_df ['image_path' ] = ancillary_df .iloc [max_scores ]['image_path' ].tolist ()
153
+ exemplar_df ['embedding' ] = exemplar_emb .tolist ()
154
+
141
155
# Reassign the unknowns to the closest cluster - this is only needed if the coverage is less than 1
142
156
clustered = labels >= 0
143
157
coverage = np .sum (clustered ) / num_samples
@@ -215,7 +229,7 @@ def _run_hdbscan_assign(
215
229
with open (f'{ out_path } /{ prefix } _summary.json' , 'w' ) as f :
216
230
json .dump (params , f )
217
231
218
- return avg_sim_scores , clusters , cluster_means , coverage
232
+ return avg_sim_scores , exemplar_df , clusters , cluster_means , coverage
219
233
220
234
221
235
def cluster_vits (
@@ -298,7 +312,7 @@ def cluster_vits(
298
312
ancillary_df = df_dets
299
313
300
314
# Cluster the images
301
- cluster_sim , unique_clusters , cluster_means , coverage = _run_hdbscan_assign (prefix ,
315
+ cluster_sim , exemplar_df , unique_clusters , cluster_means , coverage = _run_hdbscan_assign (prefix ,
302
316
image_emb ,
303
317
alpha ,
304
318
cluster_selection_epsilon ,
@@ -345,6 +359,10 @@ def cluster_vits(
345
359
range (0 , len (unique_clusters ))]
346
360
pool .starmap (cluster_grid , args )
347
361
362
+ # Save the exemplar embeddings with the model type
363
+ exemplar_df ['model' ] = model
364
+ exemplar_df .to_csv (output_path / f'{ prefix } _exemplars.csv' , index = False )
365
+
348
366
info (f"Number of images { len (images )} " )
349
367
info (f"Number of clusters { len (unique_clusters )} " )
350
368
info (f"Coverage { coverage :.2f} " )
0 commit comments