diff --git a/README.md b/README.md
index f7e39be..91e98c1 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,7 @@ Options:
 
 ## File organization
 
-The sdcat toolkit generates data in the following folders:
+The sdcat toolkit generates data in the following folders. Here, we assume both the detection and clustering output are saved to the same root folder:
 
 ```
 /data/20230504-MBARI/
@@ -126,7 +126,9 @@ The sdcat toolkit generates data in the following folders:
 ├── det_filtered                # The filtered detections from the model
 ├── det_filtered_clustered      # Clustered detections from the model
 ├── crops                       # Crops of the detections
-├── dino_vits8...               # The model output, i.e. cached embeddings, clustered detections, etc.
+├── dino_vits8...date           # The clustering results - one folder per run of the clustering algorithm
+├── dino_vits8..exemplars.csv   # Exemplar embeddings - examples with the highest cosine similarity within a cluster
+├── dino_vits8..detections.csv  # The detections with the cluster id
 ├── stats.txt                   # Statistics of the detections
 └── vizresults                  # Visualizations of the detections (boxes overlaid on images)
     ├── DSC01833.jpg
diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py
index ba68e0d..a7ef7f8 100644
--- a/sdcat/cluster/cluster.py
+++ b/sdcat/cluster/cluster.py
@@ -57,7 +57,7 @@ def _run_hdbscan_assign(
     :param min_samples: The number of samples in a neighborhood for a point
     :param ancillary_df: (optional) Ancillary data to include in the clustering
     :param out_path: The output path to save the clustering artifacts to
-    :return: The average similarity score for each cluster, cluster ids, cluster means, and coverage
+    :return: The average similarity score for each cluster, exemplar_df, cluster ids, cluster means, and coverage
     """
 
     info(f'Clustering using HDBSCAN using alpha {alpha}...')
@@ -80,8 +80,15 @@ def _run_hdbscan_assign(
     # Get the number of samples which is the number of rows in the dataframe - this is used mostly for calculating coverage
     num_samples = df.shape[0]
 
-    tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42, verbose=True)
-    embedding = tsne.fit_transform(df.values)
+    # Perplexity must be less than the number of samples
+    perplexity = min(30, num_samples - 1)
+
+    # t-SNE does not work well when there are only a few samples, so skip the projection for small datasets
+    if num_samples > 100:
+        tsne = TSNE(n_components=2, perplexity=perplexity, metric="cosine", n_jobs=8, random_state=42, verbose=True)
+        embedding = tsne.fit_transform(df.values)
+    else:
+        embedding = df.values
 
     x = MinMaxScaler().fit_transform(embedding)  # scale the embedding to 0-1
 
     # Cluster the embeddings using HDBSCAN
@@ -131,13 +138,20 @@ def _run_hdbscan_assign(
     # Get the index of the highest scores for each unique cluster sorted in increasing order
     # and use this as a representative image for the cluster
    max_scores = cluster_df.sort_values('cluster', ascending=True).groupby('cluster')['score'].idxmax()
-    # Remove the first and last index which are the unassigned cluster and the noise cluster
-    max_scores = max_scores[1:-1]
+    # Remove the last index which is the -1 noise cluster
+    max_scores = max_scores[:-1]
 
-    # Get the representative embeddings for the max scoring each cluster
+    # Get the representative embeddings for the max scoring exemplars for each cluster and store them in a numpy array
     exemplar_emb = [image_emb[i] for i in max_scores]
     exemplar_emb = np.array(exemplar_emb)
 
+    # Save the exemplar embeddings to a dataframe with some metadata
+    exemplar_df = pd.DataFrame()
+    exemplar_df['cluster'] = [f'Unknown C{i}' for i in range(0, len(max_scores))]
+    if ancillary_df is not None and 'image_path' in ancillary_df.columns:
+        exemplar_df['image_path'] = ancillary_df.iloc[max_scores]['image_path'].tolist()
+    exemplar_df['embedding'] = exemplar_emb.tolist()
+
     # Reassign the unknowns to the closest cluster - this is only needed if the coverage is less than 1
     clustered = labels >= 0
     coverage = np.sum(clustered) / num_samples
@@ -215,7 +229,7 @@ def _run_hdbscan_assign(
     with open(f'{out_path}/{prefix}_summary.json', 'w') as f:
         json.dump(params, f)
 
-    return avg_sim_scores, clusters, cluster_means, coverage
+    return avg_sim_scores, exemplar_df, clusters, cluster_means, coverage
 
 
 def cluster_vits(
@@ -298,7 +312,7 @@ def cluster_vits(
         ancillary_df = df_dets
 
     # Cluster the images
-    cluster_sim, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
+    cluster_sim, exemplar_df, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
                                                                                 image_emb,
                                                                                 alpha,
                                                                                 cluster_selection_epsilon,
@@ -345,6 +359,10 @@ def cluster_vits(
                 range(0, len(unique_clusters))]
         pool.starmap(cluster_grid, args)
 
+    # Save the exemplar embeddings with the model type
+    exemplar_df['model'] = model
+    exemplar_df.to_csv(output_path / f'{prefix}_exemplars.csv', index=False)
+
     info(f"Number of images {len(images)}")
     info(f"Number of clusters {len(unique_clusters)}")
     info(f"Coverage {coverage:.2f}")
diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index ae9ab90..b80cca1 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -134,7 +134,7 @@ def run_cluster_det(det_dir, save_dir, device, config_ini, alpha, cluster_select
                              axis=1)
 
     # Add in a column for the unique crop name for each detection with a unique id
-    df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
+    df['cluster'] = -1  # -1 is the default value and means that the image is not in a cluster
 
     # Remove small or large detections before clustering
     size_before = len(df)
@@ -328,11 +328,11 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     crop_path = save_dir / 'crops'
     crop_path.mkdir(parents=True, exist_ok=True)
     df['crop_path'] = df.apply(lambda row:
-                               f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png',
+                               f'{crop_path}/{Path(row["image_path"]).stem}.png',
                                axis=1)
 
     # Add in a column for the unique crop name for each detection with a unique id
-    df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
+    df['cluster'] = -1  # -1 is the default value and means that the image is not in a cluster
 
     # Replace any NaNs with 0
     df.fillna(0)
diff --git a/sdcat/config/config.ini b/sdcat/config/config.ini
index 8528c70..ef3a1a6 100644
--- a/sdcat/config/config.ini
+++ b/sdcat/config/config.ini
@@ -37,8 +37,8 @@ min_similarity = 0.70
# dino_vits8 has block_size=8 which can be good for very small objects
# dino_vits14 has block_size=14
# Smaller block_size means more patches and more accurate fine-grained clustering on smaller objects
-;model = dino_vits8
-model = dinov2_vits14
+model = dino_vits8
+;model = dinov2_vits14
;model = dinov2_vitb14

[detect]
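
Note on the t-SNE change in `cluster.py`: scikit-learn's `TSNE` rejects a perplexity greater than or equal to the number of samples, which is why the patch caps perplexity at `num_samples - 1` and skips the projection entirely for small inputs. A minimal standalone sketch of the same guard, assuming the 100-sample threshold from the diff (the helper name `reduce_embeddings` is ours, not part of the patch):

```python
import numpy as np
from sklearn.manifold import TSNE


def reduce_embeddings(emb: np.ndarray) -> np.ndarray:
    """Project embeddings to 2D with t-SNE, mirroring the guard in
    _run_hdbscan_assign: skip the projection when there are too few
    samples, and keep perplexity below the sample count."""
    num_samples = emb.shape[0]
    # Perplexity must be less than the number of samples or TSNE raises a ValueError
    perplexity = min(30, num_samples - 1)
    if num_samples > 100:
        tsne = TSNE(n_components=2, perplexity=perplexity, metric="cosine",
                    n_jobs=8, random_state=42)
        return tsne.fit_transform(emb)
    # Too few samples for a stable t-SNE projection; cluster the raw embeddings instead
    return emb
```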
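The new `{prefix}_exemplars.csv` output is written with `DataFrame.to_csv`, so the `embedding` column is serialized as the string form of a Python list. A sketch of how a downstream consumer might load it back (the file name here is a placeholder; substitute the actual `output_path` and `prefix` used by `cluster_vits`; the column names match the diff):

```python
from ast import literal_eval

import numpy as np
import pandas as pd

# Placeholder path - use the actual output_path / f'{prefix}_exemplars.csv'
df = pd.read_csv('dino_vits8_exemplars.csv')

# to_csv stores each embedding list as its string repr, so parse it back
embeddings = np.array([literal_eval(e) for e in df['embedding']])

print(df[['cluster', 'model']].head())  # e.g. 'Unknown C0', 'dino_vits8'
print(f'{len(df)} exemplars, embedding dimension {embeddings.shape[1]}')
```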