Build Index from regex #125

Draft · wants to merge 22 commits into base: main

Changes from 1 commit
37 changes: 11 additions & 26 deletions tests/fsm/test_statistical.py
@@ -1,27 +1,12 @@
 from typing import Callable, List, Optional
 
 import numpy as np
-from outlines_core.fsm.guide import RegexGuide
+from outlines_core.fsm import Guide, Index, Vocabulary
 from pytest import approx
 from scipy.stats import ks_2samp
 
 
 def test_generate_length():
-    class MockTokenizer:
-        vocabulary = {"0": 1, "1": 2, "eos": 3}
-        inverse_vocabulary = {1: "0", 2: "1", 3: ""}
-        special_tokens = {"eos"}
-        eos_token_id = 3
-
-        def length(self):
-            return len(self.vocabulary)
-
-        def convert_token_to_string(self, token):
-            return token
-
-        def decode(self, token):
-            return self.inverse_vocabulary[token]
-
     class NextToken:
         def __init__(
             self,
@@ -43,17 +28,18 @@ def __call__(
             next_t = [self.rng.choice(self.states, p=prob / np.sum(prob))]
             return tokens + next_t if tokens is not None else next_t
 
-    def generate(model, tokenizer, regex_str) -> Optional[List[int]]:
-        n_tokens = tokenizer.length()
+    def generate(model, regex_str) -> Optional[List[int]]:
+        vocabulary = Vocabulary(3, {"0": [1], "1": [2], "2": [4]})
+        index = Index(regex_str, vocabulary)
+        guide = Guide(index)
 
-        fsm = RegexGuide.from_regex(regex_str, tokenizer)
-        state: int = fsm.initial_state
+        n_tokens = len(vocabulary)
         tokens = None
-        while state != -1:
-            allowed = fsm.get_next_instruction(state).tokens
+        allowed = guide.get_start_tokens()
+        while not guide.is_finished():
             mask: List[int] = [1 if s in allowed else 0 for s in range(1, n_tokens + 1)]
             tokens = model(tokens, mask=mask)
-            state = fsm.get_next_state(state, tokens[-1])
+            allowed = guide.read_next_token(tokens[-1])
Contributor Author
@dpsimpson Could you please take a look at this failing test: https://github.com/dottxt-ai/outlines-core/actions/runs/12695987037/job/35389081344

The interface has changed and I updated the test here somewhat accordingly, but it needs to be checked. For example, I added a third entry to the Vocabulary (instead of the eos token that was there before) just to keep the dimensions right, and I suspect that could be incorrect 😅

But overall, since it's statistical, it goes over my head to fully understand the intentions and properly adjust the expected test values or the construction logic, so I would really appreciate your help here 🙏
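For reference, a minimal sketch of how `generate` might look if the new `Vocabulary` is meant to mirror the old `MockTokenizer` exactly (eos token id 3, only `"0" -> 1` and `"1" -> 2` in the map, mask length fixed at 3). This is an assumption about the intended construction, not a confirmed fix; it only reuses the `Guide`/`Index`/`Vocabulary` calls already shown in the diff above.

```python
from typing import List, Optional

from outlines_core.fsm import Guide, Index, Vocabulary


def generate(model, regex_str) -> Optional[List[int]]:
    # Assumption: mirror the old MockTokenizer instead of padding with "2": [4].
    # eos token id is 3; only the two real tokens "0" -> 1 and "1" -> 2 are mapped.
    vocabulary = Vocabulary(3, {"0": [1], "1": [2]})
    guide = Guide(Index(regex_str, vocabulary))

    # Old behaviour: tokenizer.length() == 3 (two tokens plus eos),
    # so the mask spans token ids 1..3, with id 3 acting as eos.
    n_tokens = 3
    tokens = None
    allowed = guide.get_start_tokens()
    while not guide.is_finished():
        mask: List[int] = [1 if s in allowed else 0 for s in range(1, n_tokens + 1)]
        tokens = model(tokens, mask=mask)
        allowed = guide.read_next_token(tokens[-1])
    return tokens
```

If `len(vocabulary)` on this branch already accounts for the eos token, the hard-coded `n_tokens = 3` above collapses back to the diff's `len(vocabulary)`, and the extra `"2": [4]` entry would not be needed at all.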

         return tokens
 
     def prob_non_markov(tokens: List[int]) -> np.array:
@@ -75,16 +61,15 @@ def prob_markov(token: List[int]) -> np.array:
 
     n_samples: int = 250
     regex_str: str = r"11[01]+|0[01]*"
-    tokenizer = MockTokenizer()
     model1 = NextToken(prob_markov, p0, states, 30127)
     model2 = NextToken(prob_non_markov, p0, states, 24601)
 
     lengths1: np.array = np.zeros((n_samples,))
     lengths2: np.array = np.zeros((n_samples,))
     for i in range(n_samples):
-        out1: List[int] = generate(model1, tokenizer, regex_str)
+        out1: List[int] = generate(model1, regex_str)
         lengths1[i] = len(out1) - 1  # take off the eos token
-        out2: List[int] = generate(model2, tokenizer, regex_str)
+        out2: List[int] = generate(model2, regex_str)
         lengths2[i] = len(out2) - 1  # take off the eos token
 
     # 2 sample KS test to check that lengths has the same distribution as