
Commit 2306316

feat: add export of the exemplars, handle small clustering input by bypassing t-SNE (which fails on few samples), and make dino_vits8 the default
1 parent 5d083df commit 2306316

File tree: 4 files changed, +35 -15 lines


README.md (+4 -2)

@@ -110,7 +110,7 @@ Options:
 
 ## File organization
 
-The sdcat toolkit generates data in the following folders:
+The sdcat toolkit generates data in the following folders. Here, we assume both detection and clustering are output to the same root folder:
 
 ```
 /data/20230504-MBARI/
@@ -126,7 +126,9 @@ The sdcat toolkit generates data in the following folders:
 ├── det_filtered                # The filtered detections from the model
 ├── det_filtered_clustered      # Clustered detections from the model
 ├── crops                       # Crops of the detections
-├── dino_vits8...               # The model output, i.e. cached embeddings, clustered detections, etc.
+├── dino_vits8...date           # The clustering results - one folder per run of the clustering algorithm
+├── dino_vits8..exemplars.csv   # Exemplar embeddings - examples with the highest cosine similarity within a cluster
+├── dino_vits8..detections.csv  # The detections with the cluster id
 ├── stats.txt                   # Statistics of the detections
 └── vizresults                  # Visualizations of the detections (boxes overlaid on images)
     ├── DSC01833.jpg
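
The exemplar CSV above holds one row per cluster, with the embedding serialized as a stringified list (it is written via `exemplar_emb.tolist()` in the cluster.py diff below). A minimal sketch of reading it back, assuming pandas; the path is illustrative, since the `dino_vits8..` prefix is abbreviated in the tree above:

```python
# Hypothetical loader for the exemplars CSV; the path below is illustrative.
# Column names ('cluster', 'image_path', 'embedding', 'model') follow the
# exemplar_df written by cluster.py in this commit.
import ast
import numpy as np
import pandas as pd

df = pd.read_csv('/data/20230504-MBARI/dino_vits8..exemplars.csv')
# Each 'embedding' cell was serialized by DataFrame.to_csv, so it is a
# stringified list that must be parsed back into numbers
embeddings = np.array([ast.literal_eval(e) for e in df['embedding']])
print(df['cluster'].tolist(), embeddings.shape)
```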

sdcat/cluster/cluster.py (+26 -8)

@@ -57,7 +57,7 @@ def _run_hdbscan_assign(
     :param min_samples: The number of samples in a neighborhood for a point
     :param ancillary_df: (optional) Ancillary data to include in the clustering
     :param out_path: The output path to save the clustering artifacts to
-    :return: The average similarity score for each cluster, cluster ids, cluster means, and coverage
+    :return: The average similarity score for each cluster, exemplar_df, cluster ids, cluster means, and coverage
     """
     info(f'Clustering using HDBSCAN using alpha {alpha}...')
 
@@ -80,8 +80,15 @@ def _run_hdbscan_assign(
     # Get the number of samples which is the number of rows in the dataframe - this is used mostly for calculating coverage
     num_samples = df.shape[0]
 
-    tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42, verbose=True)
-    embedding = tsne.fit_transform(df.values)
+    # Perplexity must be less than the number of samples
+    perplexity = min(30, num_samples - 1)
+
+    # t-SNE does not work well when we have only a few samples
+    if num_samples > 100:
+        tsne = TSNE(n_components=2, perplexity=perplexity, metric="cosine", n_jobs=8, random_state=42, verbose=True)
+        embedding = tsne.fit_transform(df.values)
+    else:
+        embedding = df.values
     x = MinMaxScaler().fit_transform(embedding)  # scale the embedding to 0-1
 
     # Cluster the embeddings using HDBSCAN
@@ -131,13 +138,20 @@ def _run_hdbscan_assign(
     # Get the index of the highest scores for each unique cluster sorted in increasing order
     # and use this as a representative image for the cluster
     max_scores = cluster_df.sort_values('cluster', ascending=True).groupby('cluster')['score'].idxmax()
-    # Remove the first and last index which are the unassigned cluster and the noise cluster
-    max_scores = max_scores[1:-1]
+    # Remove the last index which is the -1 cluster
+    max_scores = max_scores[:-1]
 
-    # Get the representative embeddings for the max scoring each cluster
+    # Get the representative embeddings for the max scoring exemplars for each cluster and store them in a numpy array
     exemplar_emb = [image_emb[i] for i in max_scores]
     exemplar_emb = np.array(exemplar_emb)
 
+    # Save the exemplar embeddings to a dataframe with some metadata
+    exemplar_df = pd.DataFrame()
+    exemplar_df['cluster'] = [f'Unknown C{i}' for i in range(0, len(max_scores))]
+    if ancillary_df is not None and 'image_path' in ancillary_df.columns:
+        exemplar_df['image_path'] = ancillary_df.iloc[max_scores]['image_path'].tolist()
+    exemplar_df['embedding'] = exemplar_emb.tolist()
+
     # Reassign the unknowns to the closest cluster - this is only needed if the coverage is less than 1
     clustered = labels >= 0
     coverage = np.sum(clustered) / num_samples
@@ -215,7 +229,7 @@ def _run_hdbscan_assign(
     with open(f'{out_path}/{prefix}_summary.json', 'w') as f:
         json.dump(params, f)
 
-    return avg_sim_scores, clusters, cluster_means, coverage
+    return avg_sim_scores, exemplar_df, clusters, cluster_means, coverage
 
 
 def cluster_vits(
@@ -298,7 +312,7 @@ def cluster_vits(
         ancillary_df = df_dets
 
     # Cluster the images
-    cluster_sim, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
+    cluster_sim, exemplar_df, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
                                                                                 image_emb,
                                                                                 alpha,
                                                                                 cluster_selection_epsilon,
@@ -345,6 +359,10 @@ def cluster_vits(
                                  range(0, len(unique_clusters))]
         pool.starmap(cluster_grid, args)
 
+    # Save the exemplar embeddings with the model type
+    exemplar_df['model'] = model
+    exemplar_df.to_csv(output_path / f'{prefix}_exemplars.csv', index=False)
+
     info(f"Number of images {len(images)}")
     info(f"Number of clusters {len(unique_clusters)}")
     info(f"Coverage {coverage:.2f}")

sdcat/cluster/commands.py (+3 -3)

@@ -134,7 +134,7 @@ def run_cluster_det(det_dir, save_dir, device, config_ini, alpha, cluster_select
                                axis=1)
 
     # Add in a column for the unique crop name for each detection with a unique id
-    df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
+    df['cluster'] = -1  # -1 is the default value and means that the image is not in a cluster
 
     # Remove small or large detections before clustering
     size_before = len(df)
@@ -328,11 +328,11 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     crop_path = save_dir / 'crops'
     crop_path.mkdir(parents=True, exist_ok=True)
     df['crop_path'] = df.apply(lambda row:
-                               f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png',
+                               f'{crop_path}/{Path(row["image_path"]).stem}.png',
                                axis=1)
 
     # Add in a column for the unique crop name for each detection with a unique id
-    df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
+    df['cluster'] = -1  # -1 is the default value and means that the image is not in a cluster
 
     # Replace any NaNs with 0
     df.fillna(0)
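
The crop-naming change in run_cluster_roi swaps an opaque uuid5 digest for the source image's stem, which makes each crop traceable to its image by eye. A small illustration of the two schemes (the path is made up); note that the stem-based name assumes stems are unique across the input images, whereas the uuid5 name was unique per full path:

```python
# Old uuid5-based crop name versus the new stem-based one; the image path
# here is hypothetical.
import uuid
from pathlib import Path

image_path = '/data/20230504-MBARI/images/DSC01833.jpg'
old_name = f'{uuid.uuid5(uuid.NAMESPACE_DNS, image_path)}.png'  # opaque, unique per full path
new_name = f'{Path(image_path).stem}.png'                       # 'DSC01833.png', human-readable
print(old_name, new_name)
```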

sdcat/config/config.ini (+2 -2)

@@ -37,8 +37,8 @@ min_similarity = 0.70
 # dino_vits8 has block_size=8 which can be good for very small objects
 # dino_vits14 has block_size=14
 # Smaller block_size means more patches and more accurate fine-grained clustering on smaller objects
-;model = dino_vits8
-model = dinov2_vits14
+model = dino_vits8
+;model = dinov2_vits14
 ;model = dinov2_vitb14
 
 [detect]
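
A minimal sketch of reading the new default model from config.ini with the standard-library configparser; the 'cluster' section name is an assumption here, since the hunk above does not show which section holds these keys:

```python
# Sketch only: the section name 'cluster' is assumed, not shown in the diff.
from configparser import ConfigParser

config = ConfigParser()
config.read('sdcat/config/config.ini')
model = config.get('cluster', 'model', fallback='dino_vits8')
print(model)  # 'dino_vits8' after this commit
```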
