Skip to content

Commit

Permalink
Change embeddings and predict methods to accept both a single input a…
Browse files Browse the repository at this point in the history
…nd a list, closes #2. Generate vectors for out of vocabulary tokens with no subtokens, closes #3.
  • Loading branch information
davidmezzetti committed Jan 27, 2025
1 parent 38edd3e commit 0672744
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 37 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ from staticvectors import StaticVectors

model = StaticVectors("/path/to/vectors.magnitude")

# Get word vectors
model.embeddings(["hello"])
# Get word vector
model.embeddings("hello")
```

### Convert and quantize
Expand Down
101 changes: 67 additions & 34 deletions src/python/staticvectors/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,48 +37,29 @@ def __init__(self, path=None):
if path:
self.load(path)

def load(self, path):
"""
Loads model at path.
Args:
path: model path
"""

# Create a storage reader
reader = StorageFactory.create(path)

# Load model data
self.config, self.vectors, self.quantization, self.weights, self.tokens, self.labels, self.counts = reader.load()

# Load additional parameters with classification models
if self.isclassification():
for token in self.tokens:
self.cache[token] = self.tokenize(token)

# Create model loss when label weights are available
self.loss = LossFactory.create(self.config["loss"], self.counts, self.weights) if self.weights is not None else None

def embeddings(self, tokens, normalize=True):
def embeddings(self, token, normalize=True):
"""
Gets embeddings vectors for tokens.
Gets embeddings vector for token.
Args:
tokens: list of tokens to get
token: token|list
normalize: if True (default), vectors will be normalized
Returns:
array of embeddings vectors
"""

# format inputs
tokens = [token] if isinstance(token, str) else token

embeddings = []
for token in tokens:
for x in tokens:
if self.isclassification():
# Vectors from a FastText model
embeddings.append(self.query(token))
embeddings.append(self.query(x))
else:
# Vectors from a vectors dump
embeddings.append(self.lookup(token))
embeddings.append(self.lookup(x))

# Get embeddings as np.array
embeddings = np.array(embeddings)
Expand All @@ -87,23 +68,55 @@ def embeddings(self, tokens, normalize=True):
if normalize:
self.normalize(embeddings)

return embeddings
return embeddings[0] if isinstance(token, str) else embeddings

def predict(self, text, limit=1):
"""
Predicts a label for text. This only works for supervised classification models.
Args:
text: input text
text: text|list
limit: maximum labels to return
Returns:
predictions as [(label, score)]
"""

if not self.loss:
raise ValueError("Predictions only supported with classification models")

# Create query vector from input text
vector = self.query(text)
return [(self.labels[uid].replace(self.config["label"], ""), score) for uid, score in self.loss(vector, limit)]
# Format inputs
texts = [text] if isinstance(text, str) else text

results = []
for x in texts:
# Create query vector from input text
vector = self.query(x)
results.append([(self.labels[uid].replace(self.config["label"], ""), score) for uid, score in self.loss(vector, limit)])

return results[0] if isinstance(text, str) else results

def load(self, path):
"""
Loads model at path and populates this instance's state from it.

A storage reader is created for path and its load() result is unpacked into
config, vectors, quantization, weights, tokens, labels and counts.

Args:
path: model path
"""

# Create a storage reader
reader = StorageFactory.create(path)

# Load model data
# reader.load() must return exactly seven values, in this order
self.config, self.vectors, self.quantization, self.weights, self.tokens, self.labels, self.counts = reader.load()

# Load additional parameters with classification models
# Each token in self.tokens is tokenized once here and stored in self.cache under that token
if self.isclassification():
for token in self.tokens:
self.cache[token] = self.tokenize(token)

# Create model loss when label weights are available
# self.loss is set to None when self.weights is None; self.config["loss"] is only read when weights exist
# NOTE(review): indentation was lost in this view - confirm whether this assignment belongs inside the
# isclassification() branch above or runs for every model
self.loss = LossFactory.create(self.config["loss"], self.counts, self.weights) if self.weights is not None else None

def isclassification(self):
"""
Expand Down Expand Up @@ -168,7 +181,8 @@ def lookup(self, token):
# Generate vector for out of vocabulary term
tokenids = [self.tokens[subtoken] for subtoken in self.tokenizer(token, minn, maxn) if subtoken in self.tokens]

return self.getvectors(np.array(tokenids)).mean(axis=0)
# Generate a mean vector for all subtokens. Otherwise, generate a token vector.
return self.getvectors(np.array(tokenids)).mean(axis=0) if tokenids else self.generate(token)

def query(self, text):
"""
Expand Down Expand Up @@ -265,3 +279,22 @@ def ids(self, tokens):
"""

return len(self.tokens) + (self.hasher(tokens) % self.config["bucket"])

def generate(self, token):
    """
    Builds a vector for an out of vocabulary token.

    The token hash seeds the random number generator, so a given token always
    produces the same vector.

    Args:
        token: token

    Returns:
        vector
    """

    # Hash the token, the first hash value is the random seed
    hashes = self.hasher([token])
    rng = np.random.default_rng(hashes[0])

    # Draw config["dim"] values from the seeded generator, uniformly between -1 and 1
    return rng.uniform(low=-1, high=1, size=self.config["dim"])
2 changes: 1 addition & 1 deletion src/python/staticvectors/storage/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# pylint: disable=W0223
class Database(Storage):
"""
SQLite storage format. Also supports legacy magnitude-light databases (https://github.com/neuml/magnitude).
SQLite storage format. Also supports legacy magnitude-lite databases (https://github.com/neuml/magnitude).
Configuration, vectors and vocabulary are all stored in SQLite. Configuration can optionally be mirrored in a JSON file to facilitate
loading from the Hugging Face Hub.
Expand Down
31 changes: 31 additions & 0 deletions test/python/testmodel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Model module
"""

import unittest

from staticvectors import StaticVectors


class TestModel(unittest.TestCase):
    """
    Tests for the StaticVectors model.
    """

    def testGenerate(self):
        """
        Checks that out of vocabulary vector generation gives the same result across models and calls
        """

        # Two independent models, each with its own config setting the vector dimensions
        first, second = StaticVectors(), StaticVectors()
        first.config = {"dim": 100}
        second.config = {"dim": 100}

        # Same token must give the same vector from both models, on the first call and on a repeat call
        for _ in range(2):
            matches = first.generate("abc") == second.generate("abc")
            self.assertTrue(matches.all())

0 comments on commit 0672744

Please sign in to comment.