Change embeddings and predict methods to accept both a single input and a list, closes #2. Generate vectors for out of vocabulary tokens with no subtokens, closes #3.
neuml · Jan 27, 2025 · 0672744 · 0672744
1 parent 38edd3e
commit 0672744
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -85,8 +85,8 @@ from staticvectors import StaticVectors
 
 model = StaticVectors("/path/to/vectors.magnitude")
 
-# Get word vectors
-model.embeddings(["hello"])
+# Get word vector
+model.embeddings("hello")
 ```
 
 ### Convert and quantize

diff --git a/src/python/staticvectors/model.py b/src/python/staticvectors/model.py
@@ -37,48 +37,29 @@ def __init__(self, path=None):
         if path:
             self.load(path)
 
-    def load(self, path):
-        """
-        Loads model at path.
-
-        Args:
-            path: model path
-        """
-
-        # Create a storage reader
-        reader = StorageFactory.create(path)
-
-        # Load model data
-        self.config, self.vectors, self.quantization, self.weights, self.tokens, self.labels, self.counts = reader.load()
-
-        # Load additional parameters with classification models
-        if self.isclassification():
-            for token in self.tokens:
-                self.cache[token] = self.tokenize(token)
-
-        # Create model loss when label weights are available
-        self.loss = LossFactory.create(self.config["loss"], self.counts, self.weights) if self.weights is not None else None
-
-    def embeddings(self, tokens, normalize=True):
+    def embeddings(self, token, normalize=True):
         """
-        Gets embeddings vectors for tokens.
+        Gets the embeddings vector for a single token, or an array of embeddings vectors for a list of tokens.
 
         Args:
-            tokens: list of tokens to get
+            token: token|list
             normalize: if True (default), vectors will be normalized
 
         Returns:
             array of embeddings vectors
         """
 
+        # Format inputs
+        tokens = [token] if isinstance(token, str) else token
+
         embeddings = []
-        for token in tokens:
+        for x in tokens:
             if self.isclassification():
                 # Vectors from a FastText model
-                embeddings.append(self.query(token))
+                embeddings.append(self.query(x))
             else:
                 # Vectors from a vectors dump
-                embeddings.append(self.lookup(token))
+                embeddings.append(self.lookup(x))
 
         # Get embeddings as np.array
         embeddings = np.array(embeddings)
@@ -87,23 +68,55 @@ def embeddings(self, tokens, normalize=True):
         if normalize:
             self.normalize(embeddings)
 
-        return embeddings
+        return embeddings[0] if isinstance(token, str) else embeddings
 
     def predict(self, text, limit=1):
         """
         Predicts a label for text. This only works for supervised classification models.
 
         Args:
-            text: input text
+            text: text|list
             limit: maximum labels to return
+
+        Returns:
+            predictions as [(label, score)] for a single text, or a list of [(label, score)] lists (one per input) when text is a list
         """
 
         if not self.loss:
             raise ValueError("Predictions only supported with classification models")
 
-        # Create query vector from input text
-        vector = self.query(text)
-        return [(self.labels[uid].replace(self.config["label"], ""), score) for uid, score in self.loss(vector, limit)]
+        # Format inputs
+        texts = [text] if isinstance(text, str) else text
+
+        results = []
+        for x in texts:
+            # Create query vector from input text
+            vector = self.query(x)
+            results.append([(self.labels[uid].replace(self.config["label"], ""), score) for uid, score in self.loss(vector, limit)])
+
+        return results[0] if isinstance(text, str) else results
+
+    def load(self, path):
+        """
+        Loads model at path.
+
+        Args:
+            path: model path
+        """
+
+        # Create a storage reader
+        reader = StorageFactory.create(path)
+
+        # Load model data
+        self.config, self.vectors, self.quantization, self.weights, self.tokens, self.labels, self.counts = reader.load()
+
+        # Load additional parameters with classification models
+        if self.isclassification():
+            for token in self.tokens:
+                self.cache[token] = self.tokenize(token)
+
+        # Create model loss when label weights are available
+        self.loss = LossFactory.create(self.config["loss"], self.counts, self.weights) if self.weights is not None else None
 
     def isclassification(self):
         """
@@ -168,7 +181,8 @@ def lookup(self, token):
             # Generate vector for out of vocabulary term
             tokenids = [self.tokens[subtoken] for subtoken in self.tokenizer(token, minn, maxn) if subtoken in self.tokens]
 
-        return self.getvectors(np.array(tokenids)).mean(axis=0)
+        # Generate a mean vector for all subtokens. Otherwise, generate a token vector.
+        return self.getvectors(np.array(tokenids)).mean(axis=0) if tokenids else self.generate(token)
 
     def query(self, text):
         """
@@ -265,3 +279,22 @@ def ids(self, tokens):
         """
 
         return len(self.tokens) + (self.hasher(tokens) % self.config["bucket"])
+
+    def generate(self, token):
+        """
+        Generates a vector for an out of vocabulary token. This is a deterministic algorithm: the same
+        vector will be generated for the same token.
+
+        Args:
+            token: token
+
+        Returns:
+            vector
+        """
+
+        # Get a hash for the token and use it as the random seed
+        seed = self.hasher([token])[0]
+        random = np.random.default_rng(seed)
+
+        # Generate the vector
+        return random.uniform(-1, 1, self.config["dim"])
diff --git a/src/python/staticvectors/storage/database.py b/src/python/staticvectors/storage/database.py
@@ -13,7 +13,7 @@
 # pylint: disable=W0223
 class Database(Storage):
     """
-    SQLite storage format. Also supports legacy magnitude-light databases (https://github.com/neuml/magnitude).
+    SQLite storage format. Also supports legacy magnitude-lite databases (https://github.com/neuml/magnitude).
 
     Configuration, vectors and vocabulary are all stored in SQLite. Configuration can optionally be mirrored in a JSON file to facilitate
     loading from the Hugging Face Hub.

diff --git a/test/python/testmodel.py b/test/python/testmodel.py
@@ -0,0 +1,31 @@
+"""
+Model module
+"""
+
+import unittest
+
+from staticvectors import StaticVectors
+
+
+class TestModel(unittest.TestCase):
+    """
+    Model tests.
+    """
+
+    def testGenerate(self):
+        """
+        Test generating a vector for an out of vocabulary token
+        """
+
+        # Create model for testing
+        model1, model2 = StaticVectors(), StaticVectors()
+
+        # Set the dimensions for testing
+        model1.config = {"dim": 100}
+        model2.config = {"dim": 100}
+
+        # Generate vectors from two different models for same token and test they are the same
+        self.assertTrue((model1.generate("abc") == model2.generate("abc")).all())
+
+        # Repeat and confirm it's still the same
+        self.assertTrue((model1.generate("abc") == model2.generate("abc")).all())