
Commit 2306316

feat: add export of the exemplars, handle small clustering input by bypassing t-SNE (which fails on few samples), and make dino_vits8 the default
1 parent 5d083df commit 2306316

File tree: 4 files changed, +35 -15 lines


README.md (+4 -2)

@@ -110,7 +110,7 @@ Options:
 
 ## File organization
 
-The sdcat toolkit generates data in the following folders:
+The sdcat toolkit generates data in the following folders. Here, we assume both detection and clustering are output to the same root folder:
 
 ```
 /data/20230504-MBARI/
@@ -126,7 +126,9 @@ The sdcat toolkit generates data in the following folders:
 ├── det_filtered                # The filtered detections from the model
 ├── det_filtered_clustered      # Clustered detections from the model
 ├── crops                       # Crops of the detections
-├── dino_vits8...               # The model output, i.e. cached embeddings, clustered detections, etc.
+├── dino_vits8...date           # The clustering results - one folder per run of the clustering algorithm
+├── dino_vits8..exemplars.csv   # Exemplar embeddings - examples with the highest cosine similarity within a cluster
+├── dino_vits8..detections.csv  # The detections with the cluster id
 ├── stats.txt                   # Statistics of the detections
 └── vizresults                  # Visualizations of the detections (boxes overlaid on images)
     ├── DSC01833.jpg
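
The exemplar CSV above holds one row per cluster, with the embedding serialized as a stringified list (it is written via `exemplar_emb.tolist()` in the cluster.py diff below). A minimal sketch of reading it back, assuming pandas; the path is illustrative, since the `dino_vits8..` prefix is abbreviated in the tree above:

```python
# Hypothetical loader for the exemplars CSV; the path below is illustrative.
# Column names ('cluster', 'image_path', 'embedding', 'model') follow the
# exemplar_df written by cluster.py in this commit.
import ast
import numpy as np
import pandas as pd

df = pd.read_csv('/data/20230504-MBARI/dino_vits8..exemplars.csv')
# Each 'embedding' cell was serialized by DataFrame.to_csv, so it is a
# stringified list that must be parsed back into numbers
embeddings = np.array([ast.literal_eval(e) for e in df['embedding']])
print(df['cluster'].tolist(), embeddings.shape)
```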

sdcat/cluster/cluster.py (+26 -8)

@@ -57,7 +57,7 @@ def _run_hdbscan_assign(
     :param min_samples: The number of samples in a neighborhood for a point
     :param ancillary_df: (optional) Ancillary data to include in the clustering
     :param out_path: The output path to save the clustering artifacts to
-    :return: The average similarity score for each cluster, cluster ids, cluster means, and coverage
+    :return: The average similarity score for each cluster, exemplar_df, cluster ids, cluster means, and coverage
     """
     info(f'Clustering using HDBSCAN using alpha {alpha}...')
 
@@ -80,8 +80,15 @@ def _run_hdbscan_assign(
     # Get the number of samples which is the number of rows in the dataframe - this is used mostly for calculating coverage
     num_samples = df.shape[0]
 
-    tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42, verbose=True)
-    embedding = tsne.fit_transform(df.values)
+    # Perplexity must be less than the number of samples
+    perplexity = min(30, num_samples - 1)
+
+    # t-SNE does not work well when we have only a few samples
+    if num_samples > 100:
+        tsne = TSNE(n_components=2, perplexity=perplexity, metric="cosine", n_jobs=8, random_state=42, verbose=True)
+        embedding = tsne.fit_transform(df.values)
+    else:
+        embedding = df.values
     x = MinMaxScaler().fit_transform(embedding)  # scale the embedding to 0-1
 
     # Cluster the embeddings using HDBSCAN
@@ -131,13 +138,20 @@ def _run_hdbscan_assign(
     # Get the index of the highest scores for each unique cluster sorted in increasing order
     # and use this as a representative image for the cluster
     max_scores = cluster_df.sort_values('cluster', ascending=True).groupby('cluster')['score'].idxmax()
-    # Remove the first and last index which are the unassigned cluster and the noise cluster
-    max_scores = max_scores[1:-1]
+    # Remove the last index which is the -1 cluster
+    max_scores = max_scores[:-1]
 
-    # Get the representative embeddings for the max scoring each cluster
+    # Get the representative embeddings for the max scoring exemplars for each cluster and store them in a numpy array
     exemplar_emb = [image_emb[i] for i in max_scores]
     exemplar_emb = np.array(exemplar_emb)
 
+    # Save the exemplar embeddings to a dataframe with some metadata
+    exemplar_df = pd.DataFrame()
+    exemplar_df['cluster'] = [f'Unknown C{i}' for i in range(0, len(max_scores))]
+    if ancillary_df is not None and 'image_path' in ancillary_df.columns:
+        exemplar_df['image_path'] = ancillary_df.iloc[max_scores]['image_path'].tolist()
+    exemplar_df['embedding'] = exemplar_emb.tolist()
+
     # Reassign the unknowns to the closest cluster - this is only needed if the coverage is less than 1
     clustered = labels >= 0
     coverage = np.sum(clustered) / num_samples
@@ -215,7 +229,7 @@ def _run_hdbscan_assign(
     with open(f'{out_path}/{prefix}_summary.json', 'w') as f:
         json.dump(params, f)
 
-    return avg_sim_scores, clusters, cluster_means, coverage
+    return avg_sim_scores, exemplar_df, clusters, cluster_means, coverage
 
 
 def cluster_vits(
@@ -298,7 +312,7 @@ def cluster_vits(
         ancillary_df = df_dets
 
     # Cluster the images
-    cluster_sim, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
+    cluster_sim, exemplar_df, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
                                                                                 image_emb,
                                                                                 alpha,
                                                                                 cluster_selection_epsilon,
@@ -345,6 +359,10 @@ def cluster_vits(
                                  range(0, len(unique_clusters))]
         pool.starmap(cluster_grid, args)
 
+    # Save the exemplar embeddings with the model type
+    exemplar_df['model'] = model
+    exemplar_df.to_csv(output_path / f'{prefix}_exemplars.csv', index=False)
+
     info(f"Number of images {len(images)}")
     info(f"Number of clusters {len(unique_clusters)}")
     info(f"Coverage {coverage:.2f}")

sdcat/cluster/commands.py (+3 -3)

@@ -134,7 +134,7 @@ def run_cluster_det(det_dir, save_dir, device, config_ini, alpha, cluster_select
                                axis=1)
 
     # Add in a column for the unique crop name for each detection with a unique id
-    df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
+    df['cluster'] = -1  # -1 is the default value and means that the image is not in a cluster
 
     # Remove small or large detections before clustering
     size_before = len(df)
@@ -328,11 +328,11 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     crop_path = save_dir / 'crops'
     crop_path.mkdir(parents=True, exist_ok=True)
     df['crop_path'] = df.apply(lambda row:
-                               f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png',
+                               f'{crop_path}/{Path(row["image_path"]).stem}.png',
                                axis=1)
 
     # Add in a column for the unique crop name for each detection with a unique id
-    df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
+    df['cluster'] = -1  # -1 is the default value and means that the image is not in a cluster
 
     # Replace any NaNs with 0
     df.fillna(0)
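
The crop-naming change in run_cluster_roi swaps an opaque uuid5 digest for the source image's stem, which makes each crop traceable to its image by eye. A small illustration of the two schemes (the path is made up); note that the stem-based name assumes stems are unique across the input images, whereas the uuid5 name was unique per full path:

```python
# Old uuid5-based crop name versus the new stem-based one; the image path
# here is hypothetical.
import uuid
from pathlib import Path

image_path = '/data/20230504-MBARI/images/DSC01833.jpg'
old_name = f'{uuid.uuid5(uuid.NAMESPACE_DNS, image_path)}.png'  # opaque, unique per full path
new_name = f'{Path(image_path).stem}.png'                       # 'DSC01833.png', human-readable
print(old_name, new_name)
```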

sdcat/config/config.ini (+2 -2)

@@ -37,8 +37,8 @@ min_similarity = 0.70
 # dino_vits8 has block_size=8 which can be good for very small objects
 # dino_vits14 has block_size=14
 # Smaller block_size means more patches and more accurate fine-grained clustering on smaller objects
-;model = dino_vits8
-model = dinov2_vits14
+model = dino_vits8
+;model = dinov2_vits14
 ;model = dinov2_vitb14
 
 [detect]
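
A minimal sketch of reading the new default model from config.ini with the standard-library configparser; the 'cluster' section name is an assumption here, since the hunk above does not show which section holds these keys:

```python
# Sketch only: the section name 'cluster' is assumed, not shown in the diff.
from configparser import ConfigParser

config = ConfigParser()
config.read('sdcat/config/config.ini')
model = config.get('cluster', 'model', fallback='dino_vits8')
print(model)  # 'dino_vits8' after this commit
```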
