From 13ce1fd194bd71a3c85d3544c3620fd0e921b94c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@thetypicalset.com>
Date: Thu, 15 Jun 2023 14:44:55 +0200
Subject: [PATCH] Add the `Sequence` base class

---
 outlines/text/sequences/sequence.py   | 250 ++++++++++++++++
 pyproject.toml                        |   2 +-
 tests/text/sequences/test_sequence.py | 393 ++++++++++++++++++++++++++
 3 files changed, 644 insertions(+), 1 deletion(-)
 create mode 100644 outlines/text/sequences/sequence.py
 create mode 100644 tests/text/sequences/test_sequence.py

diff --git a/outlines/text/sequences/sequence.py b/outlines/text/sequences/sequence.py
new file mode 100644
index 000000000..bea23de4c
--- /dev/null
+++ b/outlines/text/sequences/sequence.py
@@ -0,0 +1,250 @@
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+from numpy.random import Generator
+from numpy.typing import NDArray
+
+
+class Sequence:
+    """Represents a sequence generation method."""
+
+    def __init__(self, model, max_tokens: Optional[int] = None):
+        """Create a `Sequence` instance.
+
+        Parameters
+        ----------
+        model
+            The instance of the model used to generate next-token probabilities.
+        max_tokens
+            The maximum number of tokens that will be generated if no termination
+            condition is met. `None` (the default) means there is no limit.
+
+        """
+        self.model = model
+        self.max_tokens = max_tokens
+
+    def is_finished(self, token_ids: NDArray[np.int64]) -> NDArray[np.bool_]:
+        """Determine whether we should stop the generation."""
+        raise NotImplementedError(
+            "`Sequence.is_finished` must be implemented by subclasses."
+        )
+
+    def step(
+        self,
+        rng: Generator,
+        token_ids: NDArray[np.int64],
+        attention_mask: NDArray[np.int64],
+        samples: int = 1,
+    ) -> Tuple[NDArray[np.int64], NDArray[float]]:
+        """Generate one or several tokens that complete the input sequence.
+
+        The sampling step consists in using a model to generate next-token
+        probabilities and then sample `samples`-many new tokens from the
+        categorical distribution parametrized by these probabilities.
+
+        Parameters
+        ----------
+        rng
+            NumPy random number Generator instance
+        token_ids
+            The token ids passed as an input to the model, of shape `batch_shape
+            + (num_tokens,)`, where `num_tokens` is the sequences' length.
+        attention_mask
+            The attention mask of `token_ids`. It is passed to the model as its
+            second argument.
+        samples
+            The number of continuations to sample from the next-token probability
+            distribution.
+
+        Returns
+        -------
+        A tuple with an array of shape `new_batch_shape + (num_tokens+1,)` that
+        contains the completed sequences (input token ids and generated token
+        ids) and an array of shape `new_batch_shape + (vocab_size,)` that
+        contains the next token probabilities.
+        `new_batch_shape` is computed by removing dimensions of size one in
+        `(samples,) + batch_shape`.
+
+        """
+        num_input_dims = token_ids.ndim
+        probs = self.model(token_ids, attention_mask)
+
+        # Sample `samples`-many new tokens
+        next_token_ids = vectorized_random_choice(rng, probs, samples)
+
+        # Add the missing `num_tokens` and `num_sample` dimensions
+        next_token_ids = np.expand_dims(next_token_ids, -1)
+        token_ids = np.expand_dims(token_ids, 0)
+
+        # Tile the inputs along the new sample axis so they can be concatenated.
+        if samples > 1:
+            repetitions = (samples,) + (1,) * num_input_dims
+            token_ids = np.tile(token_ids, repetitions)
+            probs = np.tile(probs, repetitions)
+
+        token_ids = np.concatenate([token_ids, next_token_ids], axis=-1)
+
+        # Merge sample and batch dimensions by removing dimensions of length 1,
+        # which yields the shapes documented in the docstring's Returns section.
+        token_ids = np.atleast_2d(token_ids.squeeze())
+        probs = np.atleast_2d(probs.squeeze())
+
+        return token_ids, probs
+
+    def expand_attention_mask(
+        self, attention_mask: NDArray[np.int64]
+    ) -> NDArray[np.int64]:
+        """Expand the attention mask after the last completion."""
+        batch_shape = attention_mask.shape[:-1]
+        return np.concatenate(
+            [attention_mask, np.broadcast_to([1], batch_shape + (1,))], axis=-1
+        )
+
+    def update_token_ids(
+        self,
+        is_finished: NDArray[np.bool_],
+        token_ids: NDArray[np.int64],
+        token_ids_unfinished: NDArray[np.int64],
+    ) -> NDArray[np.int64]:
+        """Update the array of token ids after the last completion.
+
+        We only generate new tokens for the sequences that are not finished. We thus
+        update the array with the new tokens, and append pad tokens to the finished
+        sequences.
+
+        Parameters
+        ----------
+        is_finished
+            Boolean array that indicates which sequences are finished.
+        token_ids
+            Array that contains the sequences before the generation's last step.
+        token_ids_unfinished
+            Array that contains the sequences of the unfinished sequences
+            after the generation's last step.
+
+        Returns
+        -------
+        An array that contains the updated sequences: the new tokens for the
+        unfinished sequences, and a pad token appended to the finished ones.
+
+        """
+        batch_shape = token_ids.shape[:-1]
+        num_tokens = token_ids.shape[-1]
+        new_token_ids = np.empty(batch_shape + (num_tokens + 1,), dtype=np.int64)
+
+        token_ids_finished = token_ids[is_finished]
+        batch_shape_finished = token_ids_finished.shape[:-1]
+        padding = np.broadcast_to(
+            [self.model.tokenizer.pad_token_id], batch_shape_finished + (1,)
+        )
+        token_ids_finished = np.concatenate([token_ids_finished, padding], axis=-1)
+
+        new_token_ids[~is_finished] = token_ids_unfinished
+        new_token_ids[is_finished] = token_ids_finished
+
+        return new_token_ids
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        samples: int = 1,
+        rng: Optional[Generator] = None,
+    ) -> Union[str, List[str]]:
+        """Generate a new sequence given a prompt.
+
+        Parameters
+        ----------
+        prompt
+            The input prompt.
+        samples
+            The number of samples to generate for each prompt.
+        rng
+            NumPy random number Generator instance. A new generator is created
+            for the call when it is omitted.
+
+        Returns
+        -------
+        The full sequence that contains the prompts and the generated string.
+
+        """
+        rng = np.random.default_rng() if rng is None else rng
+
+        token_ids, attention_mask = self.model.tokenizer.encode(prompt)
+        num_prompt_tokens = token_ids.shape[-1]
+        max_tokens = np.inf if self.max_tokens is None else self.max_tokens
+
+        if samples > 1:
+            token_ids, _ = self.step(rng, token_ids, attention_mask, samples)
+            is_finished = self.is_finished(token_ids)
+
+            num_batch_dims = token_ids.ndim - 1
+            repetitions = (samples,) + (1,) * num_batch_dims
+            attention_mask = np.tile(attention_mask, repetitions)
+            attention_mask = self.expand_attention_mask(attention_mask)
+        else:
+            batch_shape = token_ids.shape[:-1]
+            is_finished = np.zeros(batch_shape, dtype=np.bool_)
+
+        while True:
+            num_generated_tokens = token_ids.shape[-1] - num_prompt_tokens
+            if np.all(is_finished) or num_generated_tokens >= max_tokens:
+                break
+
+            token_ids_unfinished = token_ids[~is_finished]
+            attention_mask_unfinished = attention_mask[~is_finished]
+            token_ids_unfinished, _ = self.step(
+                rng, token_ids_unfinished, attention_mask_unfinished
+            )
+
+            token_ids = self.update_token_ids(
+                is_finished, token_ids, token_ids_unfinished
+            )
+            attention_mask = self.expand_attention_mask(attention_mask)
+            is_finished[~is_finished] = self.is_finished(token_ids_unfinished).flatten()
+
+        result = self.model.tokenizer.decode(token_ids)
+
+        if len(result) == 1:
+            return result[0]
+        return result
+
+
+vsearchsorted = np.vectorize(
+    np.searchsorted, otypes=[int], signature="(n),()->()", excluded={"side"}
+)
+
+
+def vectorized_random_choice(
+    rng: Generator,
+    p: NDArray[np.float64],
+    samples: int = 1,
+):
+    """Vectorized implementation of `np.random.choice`.
+
+    `np.random.choice` does not support arrays of probability. This implements
+    the equivalent of this function where the `p` argument can be a matrix.
+
+    Parameters
+    ----------
+    rng
+        NumPy random number Generator instance
+    p
+        An array of probabilities of shape `batch_shape + (num_items,)`. Each
+        vector along the last axis must sum to 1.
+    samples
+        The number of samples to take for each probability vector.
+
+    Returns
+    -------
+    An array of item indices of shape `(samples,) + batch_shape`
+
+    """
+    cumsum = p.cumsum(axis=-1)
+    # Rescale so the last entry is exactly 1: with round-off the raw cumulative
+    # sum can end below 1, and a larger draw would index past the last item.
+    cumsum = np.expand_dims(cumsum / cumsum[..., -1:], 0)
+    rand = rng.random((samples,) + p.shape[:-1])
+    # `side="right"` skips zero-probability items whose cumulative sum ties the draw.
+    idx = vsearchsorted(cumsum, rand, side="right")
+
+    return idx
diff --git a/pyproject.toml b/pyproject.toml
index 01222eeb9..62c7ae99a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,8 +75,8 @@ module = [
     "diffusers",
     "jinja2",
     "joblib",
-    "numpy.*",
     "openai",
+    "numpy.*",
     "perscache.*",
     "PIL",
     "PIL.Image",
diff --git a/tests/text/sequences/test_sequence.py b/tests/text/sequences/test_sequence.py
new file mode 100644
index 000000000..946990102
--- /dev/null
+++ b/tests/text/sequences/test_sequence.py
@@ -0,0 +1,393 @@
+from typing import Dict, List, Union
+
+import numpy as np
+import pytest
+from numpy.testing import assert_array_equal
+
+from outlines.text.sequences.sequence import Sequence, vectorized_random_choice
+
+
+def test_vectorized_random_choice():
+    """Degenerate (one-hot) distributions make every draw deterministic."""
+    rng = np.random.default_rng(0)
+    first_item = np.array([[1, 0, 0, 0]])
+
+    # One probability vector, one sample.
+    draws = vectorized_random_choice(rng, first_item)
+    assert draws.shape == (1, 1)
+    assert_array_equal(draws, np.zeros((1, 1)))
+
+    # One probability vector, several samples.
+    draws = vectorized_random_choice(rng, first_item, samples=3)
+    assert draws.shape == (3, 1)
+    assert_array_equal(draws, np.zeros((3, 1)))
+
+    # A batch of two identical probability vectors.
+    draws = vectorized_random_choice(rng, np.tile(first_item, (2, 1)))
+    assert draws.shape == (1, 2)
+    assert_array_equal(draws, np.zeros((1, 2)))
+
+    # A batch of two different vectors, then a batch with two batch dimensions.
+    pair = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
+    grid = np.array([pair, [[0, 0, 1, 0], [0, 0, 0, 1]]])
+    for probs, expected in ((pair, [0, 1]), (grid, [[0, 1], [2, 3]])):
+        draws = vectorized_random_choice(rng, probs, samples=3)
+        assert draws.shape == (3,) + probs.shape[:-1]
+        assert_array_equal(draws, [expected] * 3)
+
+
+def test_sequence_error():
+    """The base class leaves `is_finished` to its subclasses."""
+    with pytest.raises(NotImplementedError, match="must be implemented"):
+        Sequence(None).is_finished(np.array([1]))
+
+
+def ModelStep(logits):
+    """Build a mock model that returns the same `logits` for every sequence."""
+    import math
+
+    row = np.array([logits])
+    vocab_shape = (row.shape[-1],)
+
+    def call(input_ids, *_):
+        """Return `logits` once per input sequence.
+
+        The row of logits is repeated as many times as there are sequences in
+        the batch, then reshaped to `batch_shape + (vocab_size,)`. The extra
+        positional arguments (e.g. the attention mask) are ignored.
+
+        """
+        batch_shape = input_ids.shape[:-1]
+        num_sequences = math.prod(batch_shape)
+        return np.tile(row, (num_sequences, 1)).reshape(batch_shape + vocab_shape)
+
+    return call
+
+
+def test_sequence_step():
+    """One sequence and one sample: exactly one token is appended."""
+    generator = np.random.default_rng(0)
+    sequence = Sequence(ModelStep(np.array([0, 1, 0, 0])))
+
+    prompt_ids = np.array([[1, 2]])
+    attention_mask = np.ones((1, 2))
+    completed, next_probs = sequence.step(generator, prompt_ids, attention_mask)
+
+    # The mock model puts all the probability mass on token 1.
+    assert_array_equal(completed, [[1, 2, 1]])
+    assert next_probs.shape == (1, 4)
+
+
+def test_sequence_step_batch():
+    """Each sequence of a batch gains one token when `samples` is 1."""
+    generator = np.random.default_rng(0)
+    sequence = Sequence(ModelStep(np.array([0, 1, 0, 0])))
+
+    batch_ids = np.array([[1, 2], [3, 4]])
+    attention_mask = np.ones((2, 2))
+    completed, next_probs = sequence.step(generator, batch_ids, attention_mask)
+
+    # One row per input sequence, for the tokens as well as the probabilities.
+    assert_array_equal(completed, [[1, 2, 1], [3, 4, 1]])
+    assert next_probs.shape == (2, 4)
+
+
+def test_sequence_step_sample():
+    """Sampling three continuations of one sequence yields a batch of three."""
+    generator = np.random.default_rng(0)
+    sequence = Sequence(ModelStep(np.array([0, 1, 0, 0])))
+
+    prompt_ids = np.array([[1, 2]])
+    mask = np.ones((1, 2))
+    completed, next_probs = sequence.step(generator, prompt_ids, mask, samples=3)
+
+    assert_array_equal(completed, [[1, 2, 1]] * 3)
+    assert next_probs.shape == (3, 4)
+
+
+def test_sequence_sample_batch():
+    """Sampling from a batch adds a leading `samples` dimension to the output."""
+    generator = np.random.default_rng(0)
+    sequence = Sequence(ModelStep(np.array([0, 1, 0, 0])))
+
+    batch_ids = np.array([[1, 2, 1], [3, 4, 1]])
+    mask = np.ones((2, 3))
+    completed, next_probs = sequence.step(generator, batch_ids, mask, samples=3)
+
+    # The mock model always picks token 1, so all three samples are identical.
+    expected = [
+        [[1, 2, 1, 1], [3, 4, 1, 1]],
+        [[1, 2, 1, 1], [3, 4, 1, 1]],
+        [[1, 2, 1, 1], [3, 4, 1, 1]],
+    ]
+
+    assert_array_equal(completed, expected)
+    assert next_probs.shape == (3, 2, 4)
+
+
+def test_sequence_step_loop():
+    """Check that the output of `step` is a valid input for the next `step`."""
+
+    generator = np.random.default_rng(0)
+    sequence = Sequence(ModelStep(np.array([0, 1, 0, 0])))
+
+    # A single sequence.
+    prompt_ids = np.array([[1, 2]])
+    first, _ = sequence.step(generator, prompt_ids, np.ones((1, 2)))
+    second, next_probs = sequence.step(generator, first, np.ones((1, 3)))
+    assert_array_equal(second, [[1, 2, 1, 1]])
+    assert next_probs.shape == (1, 4)
+
+    # A batch of two sequences.
+    batch_ids = np.array([[1, 2], [3, 4]])
+    first, _ = sequence.step(generator, batch_ids, np.ones((2, 2)))
+    second, next_probs = sequence.step(generator, first, np.ones((2, 3)))
+    assert_array_equal(second, [[1, 2, 1, 1], [3, 4, 1, 1]])
+    assert next_probs.shape == (2, 4)
+
+    # Several samples of a single sequence: the number of samples becomes the
+    # batch size at the next iteration.
+    prompt_ids = np.array([[1, 2]])
+    first, _ = sequence.step(generator, prompt_ids, np.ones((1, 2)), samples=3)
+    second, next_probs = sequence.step(generator, first, np.ones((3, 3)))
+    assert_array_equal(second, [[1, 2, 1, 1]] * 3)
+    assert next_probs.shape == (3, 4)
+
+
+def test_sequence_step_loop_general():
+    """Chain two steps on a batch of sequences sampled several times."""
+    generator = np.random.default_rng(0)
+    sequence = Sequence(ModelStep(np.array([0, 1, 0, 0])))
+
+    batch_ids = np.array([[1, 2, 1], [3, 4, 1]])
+    # The sample dimension comes first in the output of the first step.
+    sampled, _ = sequence.step(generator, batch_ids, np.ones((1, 3)), samples=3)
+    completed, _ = sequence.step(generator, sampled, np.ones((3, 4)))
+
+    # Two tokens were appended to each of the 3 x 2 sequences.
+    expected = [
+        [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]],
+        [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]],
+        [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]],
+    ]
+
+    assert completed.shape == (3, 2, 5)
+    assert_array_equal(completed, expected)
+
+
+class TokenizerUpdateTokens:
+    pad_token_id = -1  # the tokenizer attribute read by `update_token_ids`
+
+
+class ModelUpdateTokens:
+    tokenizer = TokenizerUpdateTokens()  # class attribute, shared by instances
+
+
+def test_update_token_ids_all_unfinished():
+    """Without finished sequences the new token ids are returned as they are."""
+    sequence = Sequence(ModelUpdateTokens())
+
+    before_step = np.array([[1, 1], [1, 1]])
+    after_step = np.array([[1, 1, 1], [1, 1, 1]])
+    none_finished = np.array([False, False])
+
+    updated = sequence.update_token_ids(none_finished, before_step, after_step)
+
+    assert_array_equal(updated, [[1, 1, 1], [1, 1, 1]])
+
+
+def test_update_token_ids_some_unfinished():
+    """Make sure that the pad token is appended to finished sequences."""
+    sequence = Sequence(ModelUpdateTokens())
+
+    before_step = np.array([[1, 1], [1, 1]])
+    after_step = np.array([[1, 1, 1]])  # only the second sequence was extended
+    first_finished = np.array([True, False])
+
+    updated = sequence.update_token_ids(first_finished, before_step, after_step)
+
+    assert_array_equal(updated, [[1, 1, -1], [1, 1, 1]])
+
+
+@pytest.mark.xfail
+def test_update_token_ids_larger_dimensions():
+    sequence = Sequence(ModelUpdateTokens())
+    # NOTE(review): same inputs as the all-unfinished test; presumably a placeholder.
+    previous_token_ids = np.array([[1, 1], [1, 1]])
+    is_finished = np.array([False, False])
+    token_ids_unfinished = np.array([[1, 1, 1], [1, 1, 1]])
+    result = sequence.update_token_ids(
+        is_finished, previous_token_ids, token_ids_unfinished
+    )
+    assert_array_equal(result, [[1, 1, -1], [1, 1, 1]])  # expects a pad (-1) in row 0
+
+
+class MockModel:
+    """Mock model that replays one row of `logits` per call."""
+
+    def __init__(self, tokenizer, logits):
+        self.tokenizer = tokenizer
+        self.logits = np.array(logits)
+        self.iteration_idx = 0  # index of the row returned by the next call
+
+    def __call__(self, input_ids, *_):
+        """Return the current row of logits once per sequence in the batch."""
+        import math
+
+        batch_shape = input_ids.shape[:-1]
+        row = self.logits[self.iteration_idx]
+        tiled = np.tile(row, (math.prod(batch_shape), 1))
+        self.iteration_idx += 1
+        return tiled.reshape(batch_shape + (self.logits.shape[-1],))
+
+
+class MockTokenizer:
+    """Mock tokenizer that maps each whole prompt to a single token id."""
+
+    def __init__(self, vocabulary: Dict[str, int]):
+        self.vocabulary = vocabulary
+        self.pad_token_id = -1
+
+    def encode(self, prompts: Union[str, List[str]]):
+        """Return the token ids, shape `(num_prompts, 1)`, and a mask of ones."""
+        batch = [prompts] if isinstance(prompts, str) else prompts
+        token_ids = np.array([[self.vocabulary[text]] for text in batch])
+        return token_ids, np.ones_like(token_ids)
+
+    def decode(self, token_ids):
+        """Return the token ids unchanged."""
+        return token_ids
+
+
+def test_call_single_prompt():
+    """Generate from a single prompt until `is_finished` stops the loop."""
+
+    class FinishAfterTwo(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+            self.num_checks = 0
+
+        def is_finished(self, token_ids):
+            """Report the sequence as finished from the second check on."""
+            self.num_checks += 1
+            return np.array([self.num_checks > 1])
+
+    tokenizer = MockTokenizer({"Test": 0, "a": 1, "b": 2})
+    model = MockModel(tokenizer, [[1, 0, 0], [0, 1, 0]])
+
+    # Token 0 is sampled at the first step, and token 1 at the second one.
+    completion = FinishAfterTwo(model)("Test")
+
+    assert_array_equal(completion, [0, 0, 1])
+
+
+def test_call_prompt_list():
+    """Generate from a batch of prompts that finish at different steps.
+
+    The first and third sequences finish after two tokens and are completed
+    with the pad token, while the second one receives a third token.
+
+    """
+
+    class FinishAfterThree(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+            self.iteration_idx = 0
+
+        def is_finished(self, token_ids):
+            """Finish generating the first and third sequences after two
+            iterations and the second one after three iterations.
+
+            """
+            if self.iteration_idx == 0:
+                self.iteration_idx += 1
+                return np.array([False, False, False])
+            elif self.iteration_idx == 1:
+                self.iteration_idx += 1
+                return np.array([True, False, True])
+            else:
+                return np.array([True])  # We only consider the unfinished sequences
+
+    tokenizer = MockTokenizer(
+        {"Test1": 0, "Test2": 1, "a": 2, "b": 3, "c": 4, "Test3": 5}
+    )
+    # One row of next-token probabilities per step: "a", then "b", then "c".
+    model = MockModel(
+        tokenizer,
+        [[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0]],
+    )
+    sequence = FinishAfterThree(model)
+
+    result = sequence(["Test1", "Test2", "Test3"])
+    expected = [
+        [0, 2, 3, -1],  # finished after two tokens, then padded
+        [1, 2, 3, 4],  # received a third token
+        [5, 2, 3, -1],  # finished after two tokens, then padded
+    ]
+    assert_array_equal(result, expected)
+
+
+def test_call_single_prompt_samples():
+    """Generate several samples from a single prompt."""
+
+    class FinishAfterTwo(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+            self.num_checks = 0
+
+        def is_finished(self, token_ids):
+            """Report the three samples as finished from the second check on."""
+            self.num_checks += 1
+            return np.array([self.num_checks > 1] * 3)
+
+    tokenizer = MockTokenizer({"a": 0, "b": 1, "c": 2, "Test": 4})
+    model = MockModel(tokenizer, [[1, 0, 0, 0], [0, 1, 0, 0]])
+    sequence = FinishAfterTwo(model)
+    result = sequence("Test", samples=3)
+    assert_array_equal(result, [[4, 0, 1]] * 3)
+
+    class FinishAfterOne(Sequence):
+        def is_finished(self, token_ids):
+            """Report the three samples as finished at the first check."""
+            return np.array([True, True, True])
+
+    # All the samples are finished after the first token, so the generation
+    # loop stops right away.
+    tokenizer = MockTokenizer({"a": 0, "b": 1, "c": 3, "Test": 4})
+    model = MockModel(tokenizer, [[1, 0, 0, 0], [0, 1, 0, 0]])
+    sequence = FinishAfterOne(model)
+    result = sequence("Test", samples=3)
+    assert_array_equal(result, [[4, 0]] * 3)
+
+
+def test_call_prompt_list_samples():
+    """Generate several samples for each prompt of a batch."""
+
+    class FinishAfterThree(Sequence):
+        def __init__(self, model):
+            super().__init__(model)
+            self.iteration_idx = 0
+
+        def is_finished(self, token_ids):
+            """Finish the samples of the first and third prompts one step early.
+
+            The first call receives the whole `(samples, prompts)` batch, the
+            last one only the sequences that are still unfinished.
+
+            """
+            self.iteration_idx += 1
+            if self.iteration_idx == 1:
+                return np.zeros(token_ids.shape[:-1], dtype=np.bool_)
+            if self.iteration_idx == 2:
+                return np.tile([True, False, True], (3, 1))
+            return np.array([True, True, True])
+
+    vocabulary = {"a": 0, "b": 1, "c": 2, "Test1": 3, "Test2": 4, "Test3": 5}
+    step_probs = [[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]]
+    sequence = FinishAfterThree(MockModel(MockTokenizer(vocabulary), step_probs))
+
+    result = sequence(["Test1", "Test2", "Test3"], samples=3)
+
+    # The three samples are identical: only the second prompt gets a third token.
+    one_sample = [[3, 0, 1, -1], [4, 0, 1, 2], [5, 0, 1, -1]]
+    assert_array_equal(result, np.tile(one_sample, (3, 1, 1)))