Skip to content

Commit

Permalink
Generate text using any interegular FSM
Browse files Browse the repository at this point in the history
  • Loading branch information
miftahmoha authored and rlouf committed Feb 25, 2024
1 parent 9c4e7f4 commit 7a21043
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 1 deletion.
83 changes: 83 additions & 0 deletions docs/reference/custom_fsm_ops.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Custom FSM Operations

```RegexFSM.from_interegular_fsm``` builds a ```RegexFSM``` directly from an ```interegular.FSM```, so any FSM produced with the operations available in ```interegular``` can be used to guide generation.

## Examples

### ```difference```

Returns an FSM which recognises only the strings recognised by the first FSM in the list, but none of the others.

```python
list_of_strings_pattern = r"""\["[^"\s]*"(?:,"[^"\s]*")*\]"""
pink_elephant_pattern = """.*(pink|elephant).*"""

list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm()
pink_elephant_fsm = interegular.parse_pattern(pink_elephant_pattern).to_fsm()

list_of_strings_fsm.accepts('["a","pink","elephant"]')
# True

difference_fsm = list_of_strings_fsm - pink_elephant_fsm

difference_fsm.accepts('["a","pink","elephant"]')
# False
difference_fsm.accepts('["a","blue","donkey"]')
# True
```

### ```union```

Returns an FSM which accepts any sequence of symbols that is accepted by at least one of the two original FSMs.

```python
list_of_strings_pattern = r"""\["[^"\s]*"(?:,"[^"\s]*")*\]"""
tuple_of_strings_pattern = r"""\("[^"\s]*"(?:,"[^"\s]*")*\)"""

list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm()
tuple_of_strings_fsm = interegular.parse_pattern(tuple_of_strings_pattern).to_fsm()

list_of_strings_fsm.accepts('("a","pink","elephant")')
# False

union_fsm = list_of_strings_fsm|tuple_of_strings_fsm

union_fsm.accepts('["a","pink","elephant"]')
# True
union_fsm.accepts('("a","blue","donkey")')
# True
```

### ```intersection```

Returns an FSM which accepts any sequence of symbols that is accepted by both of the original FSMs.

```python
list_of_strings_pattern = r"""\["[^"\s]*"(?:,"[^"\s]*")*\]"""
pink_elephant_pattern = """.*(pink|elephant).*"""

list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm()
pink_elephant_fsm = interegular.parse_pattern(pink_elephant_pattern).to_fsm()

list_of_strings_fsm.accepts('["a","blue","donkey"]')
# True

intersection_fsm = list_of_strings_fsm & pink_elephant_fsm

intersection_fsm.accepts('["a","pink","elephant"]')
# True
intersection_fsm.accepts('["a","blue","donkey"]')
# False
```

_More operations are available; see https://github.com/MegaIng/interegular/blob/master/interegular/fsm.py for the full list._

# Loading Custom FSM

```python
import outlines

generator = outlines.generate.fsm(model, custom_fsm)

response = generator(prompt)
```
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ nav:
- Regex: reference/regex.md
- Types: reference/types.md
- Samplers: reference/samplers.md
- Custom FSM operations: reference/custom_fsm_ops.md
- Utilities:
- Serve with vLLM: reference/vllm.md
- Prompt templating: reference/prompting.md
Expand Down
42 changes: 41 additions & 1 deletion outlines/fsm/fsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ class RegexFSM(FSM):
def __init__(self, regex_string: str, tokenizer):
@cache()
def create_states_mapping(
regex_string: str, cacheable_vocabulary: Tuple[Tuple[str, int]]
regex_string: str, cacheable_vocabulary: Tuple[Tuple[str, int], ...]
) -> Tuple[dict, set]:
"""Create the variables related to the mapping between states and tokens
The parameters of the function are used for caching purpose
Expand Down Expand Up @@ -182,6 +182,46 @@ def next_state(self, state: FSMState, token_id: int) -> FSMState:

return FSMState(next_state)

@classmethod
def from_interegular_fsm(
    cls, interegular_fsm: interegular.fsm.FSM, tokenizer: "Tokenizer"
) -> "RegexFSM":
    """Build a `RegexFSM` from an existing `interegular` FSM.

    Alternate constructor that takes an already-built FSM instead of a
    regex string. The FSM is reduced, made deterministic and indexed
    against the tokenizer.

    Parameters
    ----------
    interegular_fsm
        The finite-state machine to index.
    tokenizer
        The tokenizer used to build the index; its `vocabulary` and
        `eos_token_id` are stored on the returned instance.

    Returns
    -------
    RegexFSM
        An instance with `states_to_token_maps`, `empty_token_ids`,
        `vocabulary` and `eos_token_id` set.

    Raises
    ------
    ValueError
        If none of the indexed transitions leads to a final state of the
        FSM.

    """
    # Allocate the instance without running `__init__`, which takes a regex
    # string rather than an FSM.
    instance = cls.__new__(cls)

    # The index is built directly here. No cache is involved, so there is no
    # need to build a sorted, hashable copy of the vocabulary as a cache key.
    regex_fsm, _ = make_deterministic_fsm(interegular_fsm.reduce())
    states_to_token_maps, empty_token_ids = create_fsm_index_tokenizer(
        regex_fsm, tokenizer
    )

    # We make sure that it is possible to generate strings in the language
    # of the FSM with the tokens present in the model's vocabulary.
    if not any(
        regex_fsm.finals.intersection(token_map.values())
        for token_map in states_to_token_maps.values()
    ):
        raise ValueError(
            "The vocabulary does not allow us to build a sequence that matches the input regex"
        )

    instance.states_to_token_maps = states_to_token_maps
    instance.empty_token_ids = empty_token_ids
    instance.vocabulary = tokenizer.vocabulary.values()
    instance.eos_token_id = tokenizer.eos_token_id
    return instance

def copy(self) -> "RegexFSM":
    """Return the FSM to use as a copy of this one.

    No new object is created: the instance itself is handed back, so the
    caller and the "copy" share the same object.
    """
    # NOTE(review): presumably safe because the instance holds no
    # per-sequence mutable state -- confirm against the rest of the class.
    return self
Expand Down
1 change: 1 addition & 0 deletions outlines/generate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .cfg import cfg
from .choice import choice
from .format import format
from .fsm import fsm
from .json import json
from .regex import regex
from .text import text
14 changes: 14 additions & 0 deletions outlines/generate/fsm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import interegular

from outlines.fsm.fsm import RegexFSM
from outlines.generate.api import SequenceGenerator
from outlines.samplers import Sampler, multinomial


def fsm(
    model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial()
) -> SequenceGenerator:
    """Create a sequence generator guided by an `interegular` FSM.

    Parameters
    ----------
    model
        The model to generate with; its `tokenizer` is used to index the
        FSM and its `device` is passed to the generator.
    fsm
        The `interegular` finite-state machine to follow.
    sampler
        The sampler passed to the generator.

    Returns
    -------
    SequenceGenerator
        A generator built from the indexed FSM, the model and the sampler.

    """
    # Keep the indexed FSM under its own name instead of rebinding the `fsm`
    # parameter to an object of a different type.
    indexed_fsm = RegexFSM.from_interegular_fsm(fsm, model.tokenizer)
    return SequenceGenerator(indexed_fsm, model, sampler, model.device)

0 comments on commit 7a21043

Please sign in to comment.