From 13ce1fd194bd71a3c85d3544c3620fd0e921b94c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?=
Date: Thu, 15 Jun 2023 14:44:55 +0200
Subject: [PATCH] Add the `Sequence` base class

---
 outlines/text/sequences/sequence.py   | 260 +++++++++++++++++
 pyproject.toml                        |   2 +-
 tests/text/sequences/test_sequence.py | 393 ++++++++++++++++++++++++++
 3 files changed, 654 insertions(+), 1 deletion(-)
 create mode 100644 outlines/text/sequences/sequence.py
 create mode 100644 tests/text/sequences/test_sequence.py

diff --git a/outlines/text/sequences/sequence.py b/outlines/text/sequences/sequence.py
new file mode 100644
index 000000000..bea23de4c
--- /dev/null
+++ b/outlines/text/sequences/sequence.py
@@ -0,0 +1,260 @@
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+from numpy.random import Generator
+from numpy.typing import NDArray
+
+
+class Sequence:
+    """Represents a sequence generation method."""
+
+    def __init__(self, model, max_tokens: Optional[int] = None):
+        """Create a `Sequence` instance.
+
+        Parameters
+        ----------
+        model
+            The instance of the model used to generate next-token probabilities.
+        max_tokens
+            The maximum number of tokens that will be generated if no termination
+            condition is met.
+
+        """
+        self.model = model
+        self.max_tokens = max_tokens
+
+    def is_finished(self, token_ids: NDArray[np.int64]) -> NDArray[np.bool_]:
+        """Determine whether we should stop the generation."""
+        raise NotImplementedError(
+            "`Sequence.is_finished` must be implemented by subclasses."
+        )
+
+    def step(
+        self,
+        rng: Generator,
+        token_ids: NDArray[np.int64],
+        attention_mask: NDArray[np.int64],
+        samples: int = 1,
+    ) -> Tuple[NDArray[np.int64], NDArray[float]]:
+        """Generate one or several tokens that complete the input sequence.
+
+        The sampling step consists in using a model to generate next-token
+        logits and then sample `samples`-many new tokens from a categorical
+        distribution parametrized by these logits.
+
+        Parameters
+        ----------
+        rng
+            NumPy random number Generator instance
+        token_ids
+            The token ids passed as an input to the model, of shape `batch_shape +
+            (num_tokens,)`, where `num_tokens` is the sequences' length.
+        attention_mask
+            The attention mask passed to the model alongside `token_ids`.
+        samples
+            The number of continuations to sample from the next-token probability
+            distribution.
+
+        Returns
+        -------
+        A tuple with an array of shape `new_batch_shape + (num_tokens+1,)` that
+        contains the completed sequences (input token ids and generated token
+        ids) and an array of shape `new_batch_shape + (vocab_size,)` that
+        contains the next token probabilities.
+        `new_batch_shape` is computed by removing dimensions of size one in
+        `(samples,) + batch_shape`.
+
+        """
+        num_input_dims = token_ids.ndim
+        probs = self.model(token_ids, attention_mask)
+
+        # Sample `samples`-many new tokens
+        next_token_ids = vectorized_random_choice(rng, probs, samples)
+
+        # Add the missing `num_tokens` and `num_sample` dimensions
+        next_token_ids = np.expand_dims(next_token_ids, -1)
+        token_ids = np.expand_dims(token_ids, 0)
+
+        # Expand the input `token_ids` array to be able to concatenate several
+        # samples.
+        if samples > 1:
+            repetitions = (samples,) + (1,) * num_input_dims
+            token_ids = np.tile(token_ids, repetitions)
+            probs = np.tile(probs, repetitions)
+
+        token_ids = np.concatenate([token_ids, next_token_ids], axis=-1)
+
+        # Merge sample and batch dimensions by removing dimensions of length
+        # 1. The shape of the resulting arrays is `new_batch_shape + (num_tokens,)`
+        # and `new_batch_shape + (vocab_size,)` respectively.
+        token_ids = np.atleast_2d(token_ids.squeeze())
+        probs = np.atleast_2d(probs.squeeze())
+
+        return token_ids, probs
+
+    def expand_attention_mask(
+        self, attention_mask: NDArray[np.int64]
+    ) -> NDArray[np.int64]:
+        """Expand the attention mask after the last completion."""
+        batch_shape = attention_mask.shape[:-1]
+        attention_mask = np.concatenate(
+            [attention_mask, np.broadcast_to([1], batch_shape + (1,))], axis=-1
+        )
+        return attention_mask
+
+    def update_token_ids(
+        self,
+        is_finished: NDArray[np.bool_],
+        token_ids: NDArray[np.int64],
+        token_ids_unfinished: NDArray[np.int64],
+    ) -> NDArray[np.int64]:
+        """Update the array of token ids after the last completion.
+
+        We only generate new tokens for the sequences that are not finished. We thus
+        update the array with the new tokens, and append pad tokens to the finished
+        sequences.
+
+        Parameters
+        ----------
+        is_finished
+            Boolean array that indicates which sequences are finished.
+        token_ids
+            Array that contains the sequences before the generation's last step.
+        token_ids_unfinished
+            Array that contains the sequences of the unfinished sequences
+            after the generation's last step.
+
+        Returns
+        -------
+        The updated array of sequences, in which the finished sequences have been
+        extended with a pad token.
+
+        """
+        batch_shape = token_ids.shape[:-1]
+        num_tokens = token_ids.shape[-1]
+        new_token_ids = np.empty(batch_shape + (num_tokens + 1,), dtype=np.int64)
+
+        token_ids_finished = token_ids[is_finished]
+        batch_shape_finished = token_ids_finished.shape[:-1]
+        token_ids_finished = np.concatenate(
+            [
+                token_ids_finished,
+                np.broadcast_to(
+                    [self.model.tokenizer.pad_token_id], batch_shape_finished + (1,)
+                ),
+            ],
+            axis=-1,
+        )
+
+        new_token_ids[~is_finished] = token_ids_unfinished
+        new_token_ids[is_finished] = token_ids_finished
+
+        return new_token_ids
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        samples: int = 1,
+        rng: Optional[Generator] = None,
+    ) -> Union[str, List[str]]:
+        """Generate a new sequence given a prompt.
+
+        Parameters
+        ----------
+        prompt
+            The input prompt.
+        samples
+            The number of samples to generate for each prompt.
+        rng
+            NumPy random number Generator instance. A new, unseeded generator is
+            created when it is not provided.
+
+        Returns
+        -------
+        The full sequence that contains the prompts and the generated string.
+
+        """
+        # Create the generator at call time: a generator built in the signature
+        # would be instantiated once and silently shared by every call.
+        if rng is None:
+            rng = np.random.default_rng()
+
+        token_ids, attention_mask = self.model.tokenizer.encode(prompt)
+        num_prompt_tokens = token_ids.shape[-1]
+
+        if samples > 1:
+            token_ids, _ = self.step(rng, token_ids, attention_mask, samples)
+            is_finished = self.is_finished(token_ids)
+
+            num_batch_dims = token_ids.ndim - 1
+            repetitions = (samples,) + (1,) * num_batch_dims
+            attention_mask = np.tile(attention_mask, repetitions)
+            attention_mask = self.expand_attention_mask(attention_mask)
+        else:
+            batch_shape = token_ids.shape[:-1]
+            is_finished = np.zeros(batch_shape, dtype=np.bool_)
+
+        while True:
+            num_generated_tokens = token_ids.shape[-1] - num_prompt_tokens
+            if np.all(is_finished) or num_generated_tokens == self.max_tokens:
+                break
+
+            token_ids_unfinished = token_ids[~is_finished]
+            attention_mask_unfinished = attention_mask[~is_finished]
+            token_ids_unfinished, _ = self.step(
+                rng, token_ids_unfinished, attention_mask_unfinished
+            )
+
+            token_ids = self.update_token_ids(
+                is_finished, token_ids, token_ids_unfinished
+            )
+            attention_mask = self.expand_attention_mask(attention_mask)
+            is_finished[~is_finished] = self.is_finished(token_ids_unfinished).flatten()
+
+        result = self.model.tokenizer.decode(token_ids)
+
+        if len(result) == 1:
+            return result[0]
+
+        return result
+
+
+vsearchsorted = np.vectorize(np.searchsorted, otypes=[int], signature="(n),()->()")
+
+
+def vectorized_random_choice(
+    rng: Generator,
+    p: NDArray[np.float64],
+    samples: int = 1,
+):
+    """Vectorized implementation of `np.random.choice`.
+
+    `np.random.choice` does not support arrays of probability. This implements
+    the equivalent of this function where the `p` argument can be a matrix.
+
+    Note
+    ----
+    Samples are drawn by inverting the cumulative distribution of each
+    probability vector with `np.searchsorted`.
+
+    Parameters
+    ----------
+    rng
+        NumPy random number Generator instance
+    p
+        An array of probabilities of shape `batch_shape + (num_items,)`; each
+        vector along the last axis must sum to 1.
+    samples
+        The number of samples to take for each probability vector.
+
+    Returns
+    -------
+    An array of shape `(samples,) + batch_shape` that contains the sampled indices.
+
+    """
+
+    cumsum = np.expand_dims(p.cumsum(axis=-1), 0)
+    rand = rng.random((samples,) + p.shape[:-1])
+    idx = vsearchsorted(cumsum, rand)
+
+    return idx
diff --git a/pyproject.toml b/pyproject.toml
index 01222eeb9..62c7ae99a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,8 +75,8 @@ module = [
     "diffusers",
     "jinja2",
     "joblib",
-    "numpy.*",
     "openai",
+    "numpy.*",
     "perscache.*",
     "PIL",
     "PIL.Image",
diff --git a/tests/text/sequences/test_sequence.py b/tests/text/sequences/test_sequence.py
new file mode 100644
index 000000000..946990102
--- /dev/null
+++ b/tests/text/sequences/test_sequence.py
@@ -0,0 +1,393 @@
+from typing import Dict, List, Union
+
+import numpy as np
+import pytest
+from numpy.testing import assert_array_equal
+
+from outlines.text.sequences.sequence import Sequence, vectorized_random_choice
+
+
+def test_vectorized_random_choice():
+    rng = np.random.default_rng(0)
+
+    probs = np.array([[1, 0, 0, 0]])
+    sample = vectorized_random_choice(rng, probs)
+    assert sample.shape == (1, 1)
+    assert_array_equal(sample, np.zeros((1, 1)))
+
+    probs = np.array([[1, 0, 0, 0]])
+    sample = vectorized_random_choice(rng, probs, samples=3)
+    assert sample.shape == (3, 1)
+    assert_array_equal(sample, np.zeros((3, 1)))
+
+    probs = np.tile(np.array([[1, 0, 0, 0]]), (2, 1))
+    sample = vectorized_random_choice(rng, probs)
+    assert sample.shape == (1, 2)
+    assert_array_equal(sample, np.zeros((1, 2)))
+
+    probs = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
+    sample = vectorized_random_choice(rng, probs, samples=3)
+    assert sample.shape == (3, 2)
+    assert_array_equal(sample, [[0, 1], [0, 1], [0, 1]])
+
+    probs = np.array([[[1, 0, 0, 0], [0, 1, 0, 0]], [[0, 0, 1, 0], [0, 0, 0, 1]]])
+    sample = vectorized_random_choice(rng, probs, samples=3)
+    assert sample.shape == (3, 2, 2)
+    assert_array_equal(sample, [[[0, 1], [2, 3]], [[0, 1], [2, 3]], [[0, 1], [2, 3]]])
+
+
+def test_sequence_error():
+    with pytest.raises(NotImplementedError, match="must be implemented"):
+        sequence = Sequence(None)
+        sequence.is_finished(np.array([1]))
+
+
+def ModelStep(logits):
+    """Mock model to test `Sequence.step`"""
+
+    logits = np.array([logits])
+
+    def call(input_ids, *_):
+        """Call the model.
+
+        We first repeat the logits `num_sequences` times, and then
+        reshape the resulting array to match the batch size.
+
+        """
+        import math
+
+        batch_shape = input_ids.shape[:-1]
+        vocab_shape = (logits.shape[-1],)
+        shaped_logits = np.tile(logits, (math.prod(batch_shape), 1))
+        return shaped_logits.reshape(batch_shape + vocab_shape)
+
+    return call
+
+
+def test_sequence_step():
+    rng = np.random.default_rng(0)
+
+    logits = np.array([0, 1, 0, 0])
+    model = ModelStep(logits)
+
+    sequence = Sequence(model)
+
+    input_ids = np.array([[1, 2]])
+    token_ids, probs = sequence.step(rng, input_ids, np.ones((1, 2)))
+    assert_array_equal(token_ids, [[1, 2, 1]])
+    assert probs.shape == (1, 4)
+
+
+def test_sequence_step_batch():
+    rng = np.random.default_rng(0)
+
+    logits = np.array([0, 1, 0, 0])
+    model = ModelStep(logits)
+
+    sequence = Sequence(model)
+
+    input_ids = np.array([[1, 2], [3, 4]])
+    token_ids, probs = sequence.step(rng, input_ids, np.ones((2, 2)))
+    assert_array_equal(token_ids, [[1, 2, 1], [3, 4, 1]])
+    assert probs.shape == (2, 4)
+
+
+def test_sequence_step_sample():
+    rng = np.random.default_rng(0)
+
+    logits = np.array([0, 1, 0, 0])
+    model = ModelStep(logits)
+
+    sequence = Sequence(model)
+    input_ids = np.array([[1, 2]])
+    token_ids, probs = sequence.step(rng, input_ids, np.ones((1, 2)), samples=3)
+    assert_array_equal(token_ids, [[1, 2, 1], [1, 2, 1], [1, 2, 1]])
+    assert probs.shape == (3, 4)
+
+
+def test_sequence_sample_batch():
+    rng = np.random.default_rng(0)
+
+    logits = np.array([0, 1, 0, 0])
+    model = ModelStep(logits)
+
+    sequence = Sequence(model)
+    input_ids = np.array([[1, 2, 1], [3, 4, 1]])
+    token_ids, probs = sequence.step(rng, input_ids, np.ones((2, 3)), samples=3)
+    assert_array_equal(
+        token_ids,
+        [
+            [[1, 2, 1, 1], [3, 4, 1, 1]],
+            [[1, 2, 1, 1], [3, 4, 1, 1]],
+            [[1, 2, 1, 1], [3, 4, 1, 1]],
+        ],
+    )
+    assert probs.shape == (3, 2, 4)
+
+
+def test_sequence_step_loop():
+    """Make sure that we can feed `step`'s output back as an input."""
+
+    rng = np.random.default_rng(0)
+
+    logits = np.array([0, 1, 0, 0])
+    model = ModelStep(logits)
+
+    sequence = Sequence(model)
+    input_ids = np.array([[1, 2]])
+    token_ids, _ = sequence.step(rng, input_ids, np.ones((1, 2)))
+    token_ids, probs = sequence.step(rng, token_ids, np.ones((1, 3)))
+    assert_array_equal(token_ids, [[1, 2, 1, 1]])
+    assert probs.shape == (1, 4)
+
+    input_ids = np.array([[1, 2], [3, 4]])
+    token_ids, _ = sequence.step(rng, input_ids, np.ones((2, 2)))
+    token_ids, probs = sequence.step(rng, token_ids, np.ones((2, 3)))
+    assert_array_equal(token_ids, [[1, 2, 1, 1], [3, 4, 1, 1]])
+    assert probs.shape == (2, 4)
+
+    # The number of samples becomes the batch size at the next iteration.
+    input_ids = np.array([[1, 2]])
+    token_ids, _ = sequence.step(rng, input_ids, np.ones((1, 2)), samples=3)
+    token_ids, probs = sequence.step(rng, token_ids, np.ones((3, 3)))
+    assert_array_equal(token_ids, [[1, 2, 1, 1], [1, 2, 1, 1], [1, 2, 1, 1]])
+    assert probs.shape == (3, 4)
+
+
+def test_sequence_step_loop_general():
+    rng = np.random.default_rng(0)
+
+    logits = np.array([0, 1, 0, 0])
+    model = ModelStep(logits)
+
+    sequence = Sequence(model)
+    input_ids = np.array([[1, 2, 1], [3, 4, 1]])
+    token_ids, _ = sequence.step(rng, input_ids, np.ones((1, 3)), samples=3)
+    result, _ = sequence.step(rng, token_ids, np.ones((3, 4)))
+    assert result.shape == (3, 2, 5)
+    assert_array_equal(
+        result,
+        [
+            [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]],
+            [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]],
+            [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]],
+        ],
+    )
+
+
+class TokenizerUpdateTokens:
+    pad_token_id = -1
+
+
+class ModelUpdateTokens:
+    tokenizer = TokenizerUpdateTokens()
+
+
+def test_update_token_ids_all_unfinished():
+    sequence = Sequence(ModelUpdateTokens())
+
+    previous_token_ids = np.array([[1, 1], [1, 1]])
+    is_finished = np.array([False, False])
+    token_ids_unfinished = np.array([[1, 1, 1], [1, 1, 1]])
+
+    result = sequence.update_token_ids(
+        is_finished, previous_token_ids, token_ids_unfinished
+    )
+    assert_array_equal(result, [[1, 1, 1], [1, 1, 1]])
+
+
+def test_update_token_ids_some_unfinished():
+    "Makes sure that the pad token is appended to finished sequences."
+    sequence = Sequence(ModelUpdateTokens())
+
+    previous_token_ids = np.array([[1, 1], [1, 1]])
+    token_ids_unfinished = np.array([[1, 1, 1]])
+    is_finished = np.array([True, False])
+    result = sequence.update_token_ids(
+        is_finished, previous_token_ids, token_ids_unfinished
+    )
+    assert_array_equal(result, [[1, 1, -1], [1, 1, 1]])
+
+
+@pytest.mark.xfail
+def test_update_token_ids_larger_dimensions():
+    sequence = Sequence(ModelUpdateTokens())
+
+    previous_token_ids = np.array([[1, 1], [1, 1]])
+    is_finished = np.array([False, False])
+    token_ids_unfinished = np.array([[1, 1, 1], [1, 1, 1]])
+    result = sequence.update_token_ids(
+        is_finished, previous_token_ids, token_ids_unfinished
+    )
+    assert_array_equal(result, [[1, 1, -1], [1, 1, 1]])
+
+
+class MockModel:
+    def __init__(self, tokenizer, logits):
+        self.tokenizer = tokenizer
+        self.logits = np.array(logits)
+        self.iteration_idx = 0
+
+    def __call__(self, input_ids, *_):
+        import math
+
+        batch_shape = input_ids.shape[:-1]
+        vocab_shape = (self.logits.shape[-1],)
+        shaped_logits = np.tile(
+            self.logits[self.iteration_idx], (math.prod(batch_shape), 1)
+        )
+        self.iteration_idx += 1
+
+        return shaped_logits.reshape(batch_shape + vocab_shape)
+
+
+class MockTokenizer:
+    def __init__(self, vocabulary: Dict[str, int]):
+        self.vocabulary = vocabulary
+        self.pad_token_id = -1
+
+    def encode(self, prompts: Union[str, List[str]]):
+        if isinstance(prompts, str):
+            prompts = [prompts]
+
+        token_ids = np.array([[self.vocabulary[prompt]] for prompt in prompts])
+        attention_mask = np.ones_like(token_ids)
+
+        return token_ids, attention_mask
+
+    def decode(self, token_ids):
+        return token_ids
+
+
+def test_call_single_prompt():
+    class FinishAfterTwo(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+            self.iteration_idx = 0
+
+        def is_finished(self, token_ids):
+            """Finish generating the sequence after two iterations"""
+            if self.iteration_idx == 0:
+                self.iteration_idx += 1
+                return np.array([False])
+            else:
+                return np.array([True])
+
+    tokenizer = MockTokenizer({"Test": 0, "a": 1, "b": 2})
+    model = MockModel(tokenizer, [[1, 0, 0], [0, 1, 0]])
+    sequence = FinishAfterTwo(model)
+
+    result = sequence("Test")
+    assert_array_equal(result, [0, 0, 1])
+
+
+def test_call_prompt_list():
+    class Tokenizer:
+        def __init__(self, vocabulary: Dict[str, int]):
+            self.vocabulary = vocabulary
+            self.pad_token_id = -1
+
+        def __call__(self, prompts: List[str], **_):
+            return {
+                "input_ids": np.array([[self.vocabulary[prompt]] for prompt in prompts])
+            }
+
+        def batch_decode(self, token_ids):
+            return token_ids
+
+    class FinishAfterThree(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+            self.iteration_idx = 0
+
+        def is_finished(self, token_ids):
+            """Finish generating the first and third sequences after two
+            iterations and the second one after three iterations.
+
+            """
+            if self.iteration_idx == 0:
+                self.iteration_idx += 1
+                return np.array([False, False, False])
+            elif self.iteration_idx == 1:
+                self.iteration_idx += 1
+                return np.array([True, False, True])
+            else:
+                return np.array([True])  # We only consider the unfinished sequences
+
+    tokenizer = MockTokenizer(
+        {"Test1": 0, "Test2": 1, "a": 2, "b": 3, "c": 4, "Test3": 5}
+    )
+    model = MockModel(
+        tokenizer,
+        [[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0]],
+    )
+    sequence = FinishAfterThree(model)
+
+    result = sequence(["Test1", "Test2", "Test3"])
+    assert_array_equal(result, [[0, 2, 3, -1], [1, 2, 3, 4], [5, 2, 3, -1]])
+
+
+def test_call_single_prompt_samples():
+    class FinishAfterTwo(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+            self.iteration_idx = 0
+
+        def is_finished(self, token_ids):
+            if self.iteration_idx == 0:
+                self.iteration_idx += 1
+                return np.array([False, False, False])
+            else:
+                return np.array([True, True, True])
+
+    tokenizer = MockTokenizer({"a": 0, "b": 1, "c": 2, "Test": 4})
+    model = MockModel(tokenizer, [[1, 0, 0, 0], [0, 1, 0, 0]])
+    sequence = FinishAfterTwo(model)
+    result = sequence("Test", samples=3)
+    assert_array_equal(result, [[4, 0, 1], [4, 0, 1], [4, 0, 1]])
+
+    class FinishAfterOne(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+
+        def is_finished(self, token_ids):
+            return np.array([True, True, True])
+
+    tokenizer = MockTokenizer({"a": 0, "b": 1, "c": 3, "Test": 4})
+    model = MockModel(tokenizer, [[1, 0, 0, 0], [0, 1, 0, 0]])
+    sequence = FinishAfterOne(model)
+    result = sequence("Test", samples=3)
+    assert_array_equal(result, [[4, 0], [4, 0], [4, 0]])
+
+
+def test_call_prompt_list_samples():
+    class FinishAfterThree(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+            self.iteration_idx = 0
+
+        def is_finished(self, token_ids):
+            if self.iteration_idx == 0:
+                self.iteration_idx += 1
+                batch_shape = token_ids.shape[:-1]
+                return np.zeros(batch_shape, dtype=np.bool_)
+            elif self.iteration_idx == 1:
+                self.iteration_idx += 1
+                return np.array(
+                    [[True, False, True], [True, False, True], [True, False, True]]
+                )
+            else:
+                return np.array([True, True, True])
+
+    tokenizer = MockTokenizer(
+        {"a": 0, "b": 1, "c": 2, "Test1": 3, "Test2": 4, "Test3": 5}
+    )
+    model = MockModel(
+        tokenizer, [[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]]
+    )
+    sequence = FinishAfterThree(model)
+
+    result = sequence(["Test1", "Test2", "Test3"], samples=3)
+    assert_array_equal(
+        result, np.tile([[3, 0, 1, -1], [4, 0, 1, 2], [5, 0, 1, -1]], (3, 1, 1))
+    )