Merge pull request #5 from dleemiller/feature/clustering
Feature/clustering
dleemiller authored Jul 18, 2024
2 parents a716b9d + 7431dd2 commit 54b8984
Showing 6 changed files with 288 additions and 2 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -52,6 +52,7 @@ print(ranked_docs)

# additional inference methods
wl.deduplicate(candidates, threshold=0.8) # fuzzy deduplication
wl.cluster(docs, k=5, max_iterations=100, tolerance=1e-4) # cluster labels using k-means (k-means++ init)
wl.filter(query, candidates, threshold=0.3) # filter candidates based on query
wl.topk(query, candidates, k=3) # return topk strings based on query
```
@@ -78,6 +79,7 @@ The final weights are saved *after* weighting, projection and truncation of the
It's a good option for some NLP-lite tasks. You can train sklearn classifiers on it, perform basic semantic matching, fuzzy deduplication, ranking, and clustering.
I think it should work well for creating LLM output evaluators, or for other preparatory tasks involved in multi-hop or agentic workflows.
You can perform your own LLM surgery and train your own model on consumer GPUs in a few hours.
Because it is fast and lightweight, it makes a good "Swiss Army knife" utility for exploratory analysis and utility applications.
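
As a hedged illustration of the "NLP-lite" use cases above, here is a minimal sketch of training an sklearn classifier on the embeddings (assuming `WordLlama.load()` is the loader used elsewhere in this README and that `embed()` returns a 2-D numpy array; the documents and labels are made up for illustration):

```python
# Minimal sketch: a tiny sklearn classifier on WordLlama embeddings.
# Assumes WordLlama.load() is the loader shown elsewhere in this README.
from sklearn.linear_model import LogisticRegression
from wordllama import WordLlama

wl = WordLlama.load()
docs = ["great product", "terrible support", "works as expected", "refund please"]
labels = [1, 0, 1, 0]  # hypothetical sentiment labels

X = wl.embed(docs)  # dense document embeddings
clf = LogisticRegression().fit(X, labels)
print(clf.predict(wl.embed(["awful experience"])))
```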

## MTEB Results (l2_supercat)

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -44,3 +44,5 @@ classifiers = { file = "classifiers.txt" }

[tool.setuptools_scm]
write_to = "wordllama/_version.py"
version_scheme = "post-release"
local_scheme = "no-local-version"
96 changes: 96 additions & 0 deletions tests/test_kmeans.py
@@ -0,0 +1,96 @@
import unittest
import numpy as np

from wordllama.algorithms.kmeans import (
kmeans_plusplus_initialization,
kmeans_clustering,
)


class TestKMeansClustering(unittest.TestCase):
def setUp(self):
self.random_state = np.random.RandomState(42)
self.embeddings = np.array(
[
[0.1, 0.2, 0.3],
[0.2, 0.1, 0.3],
[0.8, 0.7, 0.6],
[0.9, 0.8, 0.7],
[0.4, 0.5, 0.6],
[0.5, 0.4, 0.7],
]
)

def test_kmeans_plusplus_initialization(self):
k = 2
centroids = kmeans_plusplus_initialization(
self.embeddings, k, self.random_state
)

self.assertEqual(centroids.shape[0], k)
self.assertEqual(centroids.shape[1], self.embeddings.shape[1])

# Check that centroids are among the original points
for centroid in centroids:
self.assertTrue(
any(np.allclose(centroid, point) for point in self.embeddings)
)

def test_kmeans_clustering_convergence(self):
k = 2
labels, losses = kmeans_clustering(
self.embeddings, k, random_state=self.random_state
)

self.assertEqual(len(labels), self.embeddings.shape[0])
self.assertGreater(len(losses), 0)

# Check that the losses decrease over iterations
for i in range(1, len(losses)):
self.assertLessEqual(losses[i], losses[i - 1])

def test_kmeans_clustering_labels(self):
k = 2
labels, _ = kmeans_clustering(
self.embeddings, k, random_state=self.random_state
)

# Check that labels are within the valid range
for label in labels:
self.assertIn(label, range(k))

def test_kmeans_clustering_different_k(self):
k = 3
labels, _ = kmeans_clustering(
self.embeddings, k, random_state=self.random_state
)

self.assertEqual(len(labels), self.embeddings.shape[0])

# Check that labels are within the valid range
for label in labels:
self.assertIn(label, range(k))

def test_kmeans_clustering_random_state(self):
k = 2
labels1, losses1 = kmeans_clustering(self.embeddings, k, random_state=42)
labels2, losses2 = kmeans_clustering(self.embeddings, k, random_state=42)

self.assertEqual(labels1, labels2)
self.assertEqual(losses1, losses2)

def test_kmeans_clustering_different_initializations(self):
k = 2
labels1, losses1 = kmeans_clustering(
self.embeddings, k, random_state=42, n_init=1
)
labels2, losses2 = kmeans_clustering(
self.embeddings, k, random_state=42, n_init=10
)

self.assertEqual(len(labels1), len(labels2))
self.assertEqual(len(losses2), len(losses1))


if __name__ == "__main__":
unittest.main()
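
For reference, a small sketch of running this test module programmatically (assuming the file lives at tests/test_kmeans.py and the package is importable):

```python
import unittest

# Discover and run the k-means tests; equivalent to running the module directly.
suite = unittest.defaultTestLoader.discover("tests", pattern="test_kmeans.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```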
1 change: 1 addition & 0 deletions wordllama/algorithms/__init__.py
@@ -0,0 +1 @@
from .kmeans import kmeans_clustering
145 changes: 145 additions & 0 deletions wordllama/algorithms/kmeans.py
@@ -0,0 +1,145 @@
import numpy as np
from typing import List, Tuple


def kmeans_plusplus_initialization(
embeddings: np.ndarray, k: int, random_state: np.random.RandomState
) -> np.ndarray:
"""
Initialize centroids using the K-Means++ algorithm.
Parameters:
embeddings (np.ndarray): The input data points (embeddings) to cluster.
k (int): The number of clusters.
random_state (np.random.RandomState): Random state for reproducibility.
Returns:
np.ndarray: The initialized centroids.
"""
n_samples, n_features = embeddings.shape
centroids = np.empty((k, n_features), dtype=embeddings.dtype)

# Step 1a: Choose the first centroid randomly from the data points
centroids[0] = embeddings[random_state.randint(n_samples)]
distances = np.linalg.norm(embeddings - centroids[0], axis=1)

for i in range(1, k):
# Step 1b: Compute the probability distribution based on squared distances
probabilities = distances**2
probabilities /= probabilities.sum()
cumulative_probabilities = probabilities.cumsum()
r = random_state.rand()
index = np.searchsorted(cumulative_probabilities, r)
centroids[i] = embeddings[index]

# Update distances to the nearest centroid for the next iteration
new_distances = np.linalg.norm(embeddings - centroids[i], axis=1)
distances = np.minimum(distances, new_distances)

return centroids


def calculate_inertia(
embeddings: np.ndarray, labels: np.ndarray, centroids: np.ndarray
) -> float:
"""
Calculate the inertia (sum of squared distances to the closest centroid).
Parameters:
embeddings (np.ndarray): The input data points (embeddings) to cluster.
labels (np.ndarray): The cluster labels for each point.
centroids (np.ndarray): The cluster centroids.
Returns:
float: The calculated inertia.
"""
inertia = 0.0
for i, centroid in enumerate(centroids):
cluster_points = embeddings[labels == i]
inertia += np.sum((cluster_points - centroid) ** 2)
return inertia


def kmeans_clustering(
embeddings: np.ndarray,
k: int,
max_iterations: int = 100,
tolerance: float = 1e-4,
n_init: int = 10,
min_iterations: int = 5,
random_state=None,
) -> Tuple[List[int], List[float]]:
"""
Perform K-Means clustering on the provided embeddings.
Parameters:
embeddings (np.ndarray): The input data points (embeddings) to cluster.
k (int): The number of clusters.
max_iterations (int, optional): The maximum number of iterations to run the algorithm. Defaults to 100.
tolerance (float, optional): The tolerance to declare convergence. Defaults to 1e-4.
n_init (int, optional): Number of times the algorithm will be run with different centroid seeds. The final result will be the best output in terms of loss. Defaults to 10.
min_iterations (int, optional): Minimum number of iterations before checking for convergence. Defaults to 5.
random_state (int or np.random.RandomState, optional): Random state for reproducibility.
Returns:
Tuple[List[int], List[float]]: A tuple containing the cluster labels and the list of loss values for each iteration.
"""
if random_state is None:
random_state = np.random.RandomState()
elif isinstance(random_state, int):
random_state = np.random.RandomState(random_state)

best_labels = None
best_inertia = float("inf")
best_losses = None

for init_run in range(n_init):
centroids = kmeans_plusplus_initialization(embeddings, k, random_state)

prev_inertia = float("inf")
losses = []

for iteration in range(max_iterations):
# Step 2: Assign each point to the nearest centroid
distances = np.sqrt(
((embeddings[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(
axis=2
)
)
labels = np.argmin(distances, axis=1)

# Compute the inertia (loss) for this assignment
inertia = calculate_inertia(embeddings, labels, centroids)
losses.append(inertia)

# Check for convergence based on inertia
if iteration >= min_iterations and abs(prev_inertia - inertia) < tolerance:
break

prev_inertia = inertia

# Step 3: Update centroids to the mean of the points in each cluster
new_centroids = np.array(
[
embeddings[labels == i].mean(axis=0)
if np.sum(labels == i) > 0
else centroids[i]
for i in range(k)
]
)

# Check for convergence based on centroids
if iteration >= min_iterations and np.allclose(
centroids, new_centroids, atol=tolerance
):
break

centroids = new_centroids

# Check if this initialization run has the best result
if inertia < best_inertia:
best_inertia = inertia
best_labels = labels
best_losses = losses

return best_labels.tolist(), best_losses
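
As a hedged usage sketch of the function added above, called directly on synthetic data (the points and parameters here are made up for illustration):

```python
# Sketch: cluster two synthetic blobs with kmeans_clustering.
import numpy as np
from wordllama.algorithms.kmeans import kmeans_clustering

rng = np.random.RandomState(0)
points = np.vstack([
    rng.normal(0.0, 0.1, size=(20, 2)),  # blob around (0, 0)
    rng.normal(1.0, 0.1, size=(20, 2)),  # blob around (1, 1)
])

labels, losses = kmeans_clustering(points, k=2, random_state=0)
print(labels[:5], losses[-1])  # cluster ids for the first points and the final inertia
```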
44 changes: 42 additions & 2 deletions wordllama/inference.py
@@ -1,9 +1,10 @@
import numpy as np
from tokenizers import Tokenizer
from typing import Union, List
from typing import Union, List, Tuple
import logging

from wordllama.config import WordLlamaConfig
from .algorithms import kmeans_clustering
from .config import WordLlamaConfig

# Set up logging
logging.basicConfig(level=logging.INFO)
@@ -323,3 +324,42 @@ def filter(
if score > threshold
]
return filtered_docs

def cluster(
self,
docs: List[str],
k: int,
max_iterations: int = 100,
tolerance: float = 1e-4,
n_init: int = 10,
min_iterations: int = 5,
random_state=None,
) -> Tuple[List[int], float]:
"""
Cluster the given text collection into k clusters.
Parameters:
docs (List[str]): The list of text documents to cluster.
k (int): The number of clusters.
max_iterations (int, optional): The maximum number of iterations to run the algorithm. Defaults to 100.
tolerance (float, optional): The tolerance to declare convergence. Defaults to 1e-4.
n_init (int, optional): Number of times the algorithm will be run with different centroid seeds. The final result will be the best output in terms of loss. Defaults to 10.
min_iterations (int, optional): Minimum number of iterations before checking for convergence. Defaults to 5.
random_state (int or np.random.RandomState, optional): Random state for reproducibility.
Returns:
Tuple[List[int], float]: A list of cluster labels and the final loss (inertia)
"""
if self.binary:
raise ValueError("KMeans clustering only implemented for dense embeddings")
embeddings = self.embed(docs, norm=True)
cluster_labels, loss = kmeans_clustering(
embeddings,
k,
max_iterations=max_iterations,
tolerance=tolerance,
n_init=n_init,
min_iterations=min_iterations,
random_state=random_state,
)
return cluster_labels, loss[-1].item()
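
A hedged usage sketch of the new method (assuming `wl` is a loaded WordLlama inference instance, as in the README snippet earlier in this commit; the documents are made up for illustration):

```python
# Sketch: cluster a small document collection with the new cluster() method.
docs = [
    "the cat sat on the mat",
    "dogs are loyal companions",
    "stock prices fell sharply today",
    "the market rallied after earnings",
]
labels, inertia = wl.cluster(docs, k=2, max_iterations=100, tolerance=1e-4, random_state=42)
print(labels, inertia)  # one cluster id per document, plus the final loss
```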
