diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py index 3a77ea3..4346c14 100644 --- a/sdcat/cluster/cluster.py +++ b/sdcat/cluster/cluster.py @@ -10,7 +10,6 @@ import json import seaborn as sns import numpy as np -from matplotlib import pyplot as plt from umap import UMAP from hdbscan import HDBSCAN from sklearn.metrics.pairwise import cosine_similarity @@ -75,10 +74,13 @@ def _run_hdbscan_assign( # Get the number of samples which is the number of rows in the dataframe - this is used mostly for calculating coverage num_samples = df.shape[0] - # Compute the cosine similarity matrix - cosine_sim_matrix = cosine_similarity(df.values) - distance_matrix = 1 - cosine_sim_matrix - x = distance_matrix.astype(np.float64) + # from sklearn.manifold import TSNE + from sklearn.preprocessing import MinMaxScaler + from MulticoreTSNE import MulticoreTSNE as TSNE + + tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42, verbose=True) + embedding = tsne.fit_transform(df.values) + x = MinMaxScaler().fit_transform(embedding) # scale the embedding to 0-1 # Cluster the embeddings using HDBSCAN if have_gpu: @@ -93,7 +95,7 @@ def _run_hdbscan_assign( labels = scan.fit_predict(x) else: scan = HDBSCAN( - metric='precomputed', + metric='l2', allow_single_cluster=True, min_cluster_size=min_cluster_size, min_samples=min_samples, @@ -206,6 +208,7 @@ def _run_hdbscan_assign( df = pd.DataFrame({'x': xx[clustered, 0], 'y': xx[clustered, 1], 'labels': labels[clustered]}) p = sns.jointplot(data=df, x='x', y='y', hue='labels') p.savefig(f"{out_path}/{prefix}_summary.png") + info(f"Saved {out_path}/{prefix}_summary.png") with open(f'{out_path}/{prefix}_summary.json', 'w') as f: json.dump(params, f) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 72aa06a..7768bab 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -259,3 +259,99 @@ def is_day(utc_dt): shutil.copy(Path(config_ini), save_dir / 
f'{prefix}_config.ini') else: warn(f'No detections found to cluster') + +@click.command('roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.') +@common_args.config_ini +@click.option('--roi-dir', help='Input folder(s) with raw ROI images', multiple=True) +@click.option('--save-dir', help='Output directory to save clustered detection results') +@click.option('--device', help='Device to use, e.g. cpu or cuda:0', type=str) +@click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float) +@click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float) +@click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g.
5, 15', type=int) +def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size): + config = cfg.Config(config_ini) + min_samples = int(config('cluster', 'min_samples')) + alpha = alpha if alpha else float(config('cluster', 'alpha')) + min_cluster_size = min_cluster_size if min_cluster_size else int(config('cluster', 'min_cluster_size')) + cluster_selection_epsilon = cluster_selection_epsilon if cluster_selection_epsilon is not None else float(config('cluster','cluster_selection_epsilon')) # an explicit 0 is a valid epsilon, so only fall back to the config when the option was not given + min_similarity = float(config('cluster', 'min_similarity')) + model = config('cluster', 'model') + + if device: + num_devices = torch.cuda.device_count() + info(f'{num_devices} cuda devices available') + info(f'Using device {device}') + if 'cuda' in device: + device_num = device.split(':')[-1] + info(f'Setting CUDA_VISIBLE_DEVICES to {device_num}') + torch.cuda.set_device(device) + os.environ['CUDA_VISIBLE_DEVICES'] = device_num + + save_dir = Path(save_dir) + save_dir.mkdir(parents=True, exist_ok=True) + + # Grab all images from the input directories, searching each one recursively + supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG'] + images = [] + + for r in roi_dir: + roi_path = Path(r) + for ext in supported_extensions: + images.extend(roi_path.rglob(f'*{ext}')) + + # Create a dataframe with one row per image found, stored in an image_path column + df = pd.DataFrame() + df['image_path'] = images + + # Convert the image_path column to a string + df['image_path'] = df['image_path'].astype(str) + + info(f'Found {len(df)} detections in {roi_dir}') + + if len(df) == 0: + info(f'No detections found in {roi_dir}') + return + + # Sort the dataframe by image_path so the row order does not depend on the directory traversal order + df = df.sort_values(by='image_path') + + # Add the image_width and image_height columns to the dataframe; the context manager closes each image file after its size is read + for index, row in df.iterrows(): + with Image.open(row['image_path']) as im: + df.at[index,
'image_width'] = im.size[0] + df.at[index, 'image_height'] = im.size[1] + df['image_width'] = df['image_width'].astype(int) + df['image_height'] = df['image_height'].astype(int) + + # Create a unique crop name for each detection with a unique id + crop_path = save_dir / 'crops' + crop_path.mkdir(parents=True, exist_ok=True) + df['crop_path'] = df.apply(lambda row: + f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png', + axis=1) + + # Add in a column for the cluster assignment of each image + df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster + + # Replace any NaNs with 0; fillna returns a new dataframe, so the result must be assigned back + df = df.fillna(0) + + # Print the first 5 rows of the dataframe + info(df.head(5)) + + if len(df) > 0: + # A prefix for the output files to make sure the output is unique for each execution + prefix = f'{model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}' + + # Cluster the detections + df_cluster = cluster_vits(prefix, model, df, save_dir, alpha, cluster_selection_epsilon, min_similarity, + min_cluster_size, min_samples, roi=True) + + # Merge the results with the original DataFrame + df.update(df_cluster) + + # Save the clustered detections to a csv file and a copy of the config.ini file + df.to_csv(save_dir / f'{prefix}_cluster_detections.csv', index=False, header=True) + shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini') + else: + warn(f'No detections found to cluster') diff --git a/sdcat/cluster/embedding.py b/sdcat/cluster/embedding.py index 7c68dc7..2e5caf0 100644 --- a/sdcat/cluster/embedding.py +++ b/sdcat/cluster/embedding.py @@ -11,6 +11,7 @@ from numpy import save, load import numpy as np from sahi.utils.torch import torch +from torchvision import transforms as pth_transforms import torch.nn as nn import cv2 from sdcat.logger import info, err @@ -119,8 +120,10 @@ def compute_embedding(images: list, model_name: str): image = np.array(square_img) - # Convert the image to a tensor + norm_transform =
pth_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) img_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0 + # Normalize the tensor with the mean and std of the ImageNet dataset + img_tensor = norm_transform(img_tensor) img_tensor = img_tensor.unsqueeze(0) # Add batch dimension if 'cuda' in device: img_tensor = img_tensor.to(device)