Generate text using any interegular FSM

dottxt-ai · Feb 25, 2024 · 0beb5de · 0beb5de
1 parent d23807b
commit 0beb5de
Show file tree

Hide file tree

Showing 6 changed files with 141 additions and 2 deletions.
diff --git a/docs/reference/custom_fsm_ops.md b/docs/reference/custom_fsm_ops.md
@@ -0,0 +1,83 @@
+# Custom FSM Operations
+
+```RegexFSM.from_interegular_fsm``` builds a ```RegexFSM``` directly from an ```interegular.FSM```, so any FSM obtained by combining FSMs with the operations available in ```interegular``` can be used to guide generation.
+
+## Examples
+
+### ```difference``` 
+
+Returns an FSM which recognises only the strings that are recognised by the first FSM and by none of the others.
+
+```python
+list_of_strings_pattern = r"""\["[^"\s]*"(?:,"[^"\s]*")*\]"""
+pink_elephant_pattern = """.*(pink|elephant).*"""
+
+list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm()
+pink_elephant_fsm = interegular.parse_pattern(pink_elephant_pattern).to_fsm()
+
+list_of_strings_fsm.accepts('["a","pink","elephant"]')
+# True
+
+difference_fsm = list_of_strings_fsm - pink_elephant_fsm
+
+difference_fsm.accepts('["a","pink","elephant"]')
+# False
+difference_fsm.accepts('["a","blue","donkey"]')
+# True
+```
+
+### ```union```
+
+Returns an FSM which accepts any sequence of symbols that is accepted by at least one of the original FSMs.
+
+```python
+list_of_strings_pattern = r"""\["[^"\s]*"(?:,"[^"\s]*")*\]"""
+tuple_of_strings_pattern = r"""\("[^"\s]*"(?:,"[^"\s]*")*\)"""
+
+list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm()
+tuple_of_strings_fsm = interegular.parse_pattern(tuple_of_strings_pattern).to_fsm()
+
+list_of_strings_fsm.accepts('("a","pink","elephant")')
+# False
+
+union_fsm = list_of_strings_fsm|tuple_of_strings_fsm
+
+union_fsm.accepts('["a","pink","elephant"]')
+# True
+union_fsm.accepts('("a","blue","donkey")')
+# True
+```
+
+### ```intersection```
+
+Returns an FSM which accepts any sequence of symbols that is accepted by both of the original FSMs.
+
+```python
+list_of_strings_pattern = r"""\["[^"\s]*"(?:,"[^"\s]*")*\]"""
+pink_elephant_pattern = """.*(pink|elephant).*"""
+
+list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm()
+pink_elephant_fsm = interegular.parse_pattern(pink_elephant_pattern).to_fsm()
+
+list_of_strings_fsm.accepts('["a","blue","donkey"]')
+# True
+
+intersection_fsm = list_of_strings_fsm & pink_elephant_fsm
+
+intersection_fsm.accepts('["a","pink","elephant"]')
+# True
+intersection_fsm.accepts('["a","blue","donkey"]')
+# False
+```
+
+_More operations are available; see the [`interegular` FSM source](https://github.com/MegaIng/interegular/blob/master/interegular/fsm.py) for the full list._
+
+# Loading Custom FSM
+
+```python
+import outlines
+
+generator = outlines.generate.fsm(model, custom_fsm)
+
+response = generator(prompt)
+```
diff --git a/docs/welcome.md b/docs/welcome.md
@@ -28,7 +28,7 @@ We support Openai, but the true power of Outlines〰 is unleashed with Open Sour
     [:octicons-arrow-right-24: Deploy outlines](reference/vllm.md)
 
 
--   :material-regex:{ .lg .middle } __Make LLMs follows a Regex__
+-   :material-regex:{ .lg .middle } __Make LLMs follow a Regex__
 
     ---
 

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -123,6 +123,7 @@ nav:
         - Regex: reference/regex.md
         - Types: reference/types.md
         - Samplers: reference/samplers.md
+        - Custom FSM operations: reference/custom_fsm_ops.md
     - Utilities:
         - Serve with vLLM: reference/vllm.md
         - Prompt templating: reference/prompting.md

diff --git a/outlines/fsm/fsm.py b/outlines/fsm/fsm.py
@@ -94,7 +94,7 @@ class RegexFSM(FSM):
     def __init__(self, regex_string: str, tokenizer):
         @cache()
         def create_states_mapping(
-            regex_string: str, cacheable_vocabulary: Tuple[Tuple[str, int]]
+            regex_string: str, cacheable_vocabulary: Tuple[Tuple[str, int], ...]
         ) -> Tuple[dict, set]:
             """Create the variables related to the mapping between states and tokens
             The parameters of the function are used for caching purpose
@@ -182,6 +182,46 @@ def next_state(self, state: FSMState, token_id: int) -> FSMState:
 
         return FSMState(next_state)
 
+    @classmethod
+    def from_interegular_fsm(
+        cls, interegular_fsm: interegular.fsm.FSM, tokenizer: "Tokenizer"
+    ):
+        from_interegular_instance = cls.__new__(cls)
+
+        def create_states_mapping_from_interegular_fsm(
+            fsm: interegular.fsm.FSM, cacheable_vocabulary: Tuple[Tuple[str, int], ...]
+        ) -> Tuple[dict, set]:
+            """Create the variables related to the mapping between states and tokens
+            The parameters of the function are used for caching purpose
+            """
+            regex_fsm, _ = make_deterministic_fsm(fsm.reduce())
+            states_to_token_maps, empty_token_ids = create_fsm_index_tokenizer(
+                regex_fsm, tokenizer
+            )
+
+            # We make sure that it is possible to generate strings in the language
+            # of the regular expression with the tokens present in the model's
+            # vocabulary.
+            if not any(
+                regex_fsm.finals.intersection(v.values())
+                for v in states_to_token_maps.values()
+            ):
+                raise ValueError(
+                    "The vocabulary does not allow us to build a sequence that matches the input regex"
+                )
+
+            return states_to_token_maps, empty_token_ids
+
+        (
+            from_interegular_instance.states_to_token_maps,
+            from_interegular_instance.empty_token_ids,
+        ) = create_states_mapping_from_interegular_fsm(
+            interegular_fsm, tuple(sorted(tokenizer.vocabulary.items()))
+        )
+        from_interegular_instance.vocabulary = tokenizer.vocabulary.values()
+        from_interegular_instance.eos_token_id = tokenizer.eos_token_id
+        return from_interegular_instance
+
     def copy(self) -> "RegexFSM":
         """Create a copy of the FSM."""
         return self

diff --git a/outlines/generate/__init__.py b/outlines/generate/__init__.py
@@ -2,6 +2,7 @@
 from .cfg import cfg
 from .choice import choice
 from .format import format
+from .fsm import fsm
 from .json import json
 from .regex import regex
 from .text import text
diff --git a/outlines/generate/fsm.py b/outlines/generate/fsm.py
@@ -0,0 +1,14 @@
+import interegular
+
+from outlines.fsm.fsm import RegexFSM
+from outlines.generate.api import SequenceGenerator
+from outlines.samplers import Sampler, multinomial
+
+
+def fsm(
+    model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial()
+) -> SequenceGenerator:
+    fsm = RegexFSM.from_interegular_fsm(fsm, model.tokenizer)
+    device = model.device
+    generator = SequenceGenerator(fsm, model, sampler, device)
+    return generator