diff --git a/docs/reference/custom_fsm_ops.md b/docs/reference/custom_fsm_ops.md new file mode 100644 index 000000000..5043ba11f --- /dev/null +++ b/docs/reference/custom_fsm_ops.md @@ -0,0 +1,83 @@ +# Custom FSM Operations + +```RegexFSM.from_interegular_fsm``` leverages the flexibility of ```interegular.FSM``` to use the available operations in ```interegular```. + +## Examples + +### ```difference``` + +Returns an FSM which recognises only the strings recognised by the first FSM in the list, but none of the others. + +```python +list_of_strings_pattern = """\["[^"\s]*"(?:,"[^"\s]*")*\]""" +pink_elephant_pattern = """.*(pink|elephant).*""" + +list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm() +pink_elephant_fsm = interegular.parse_pattern(pink_elephant_pattern).to_fsm() + +list_of_strings_fsm.accepts('["a","pink","elephant"]') +# True + +difference_fsm = list_of_strings_fsm - pink_elephant_fsm + +difference_fsm.accepts('["a","pink","elephant"]') +# False +difference_fsm.accepts('["a","blue","donkey"]') +# True +``` + +### ```union``` + +Returns a finite state machine which accepts any sequence of symbols that is accepted by either self or other. + +```python +list_of_strings_pattern = """\["[^"\s]*"(?:,"[^"\s]*")*\]""" +tuple_of_strings_pattern = """\("[^"\s]*"(?:,"[^"\s]*")*\)""" + +list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm() +tuple_of_strings_fsm = interegular.parse_pattern(tuple_of_strings_pattern).to_fsm() + +list_of_strings_fsm.accepts('("a","pink","elephant")') +# False + +union_fsm = list_of_strings_fsm|tuple_of_strings_fsm + +union_fsm.accepts('["a","pink","elephant"]') +# True +union_fsm.accepts('("a","blue","donkey")') +# True +``` + +### ```intersection``` + +Returns an FSM which accepts any sequence of symbols that is accepted by both of the original FSMs.
+ +```python +list_of_strings_pattern = """\["[^"\s]*"(?:,"[^"\s]*")*\]""" +pink_elephant_pattern = """.*(pink|elephant).*""" + +list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm() +pink_elephant_fsm = interegular.parse_pattern(pink_elephant_pattern).to_fsm() + +list_of_strings_fsm.accepts('["a","blue","donkey"]') +# True + +intersection_fsm = list_of_strings_fsm & pink_elephant_fsm + +intersection_fsm.accepts('["a","pink","elephant"]') +# True +intersection_fsm.accepts('["a","blue","donkey"]') +# False +``` + +_More operations are available; see https://github.com/MegaIng/interegular/blob/master/interegular/fsm.py for details._ + +# Loading Custom FSM + +```python +import outlines + +generator = outlines.generate.fsm(model, custom_fsm) + +response = generator(prompt) +``` diff --git a/docs/welcome.md b/docs/welcome.md index 231a0e3cb..ca619e56c 100644 --- a/docs/welcome.md +++ b/docs/welcome.md @@ -28,7 +28,7 @@ We support Openai, but the true power of Outlines〰 is unleashed with Open Sour [:octicons-arrow-right-24: Deploy outlines](reference/vllm.md) -- :material-regex:{ .lg .middle } __Make LLMs follows a Regex__ +- :material-regex:{ .lg .middle } __Make LLMs follow a Regex__ --- diff --git a/mkdocs.yml b/mkdocs.yml index f9abe927c..86ccf326d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -123,6 +123,7 @@ nav: - Regex: reference/regex.md - Types: reference/types.md - Samplers: reference/samplers.md + - Custom FSM operations: reference/custom_fsm_ops.md - Utilities: - Serve with vLLM: reference/vllm.md - Prompt templating: reference/prompting.md diff --git a/outlines/fsm/fsm.py b/outlines/fsm/fsm.py index 6ec3ff9e4..8e5be82e1 100644 --- a/outlines/fsm/fsm.py +++ b/outlines/fsm/fsm.py @@ -94,7 +94,7 @@ class RegexFSM(FSM): def __init__(self, regex_string: str, tokenizer): @cache() def create_states_mapping( - regex_string: str, cacheable_vocabulary: Tuple[Tuple[str, int]] + regex_string: str, cacheable_vocabulary: Tuple[Tuple[str, 
int], ...] ) -> Tuple[dict, set]: """Create the variables related to the mapping between states and tokens The parameters of the function are used for caching purpose @@ -182,6 +182,46 @@ def next_state(self, state: FSMState, token_id: int) -> FSMState: return FSMState(next_state) + @classmethod + def from_interegular_fsm( + cls, interegular_fsm: interegular.fsm.FSM, tokenizer: "Tokenizer" + ): + """Build an instance from an ``interegular`` FSM rather than a regex string. + + The instance is allocated with ``cls.__new__`` (``__init__`` is not called) and its ``states_to_token_maps``, ``empty_token_ids``, ``vocabulary`` and ``eos_token_id`` attributes are set from ``interegular_fsm`` and ``tokenizer``. + + Raises ``ValueError`` when no value in any of the computed token maps is one of the final states of the reduced, deterministic FSM. + """ + from_interegular_instance = cls.__new__(cls) + + def create_states_mapping_from_interegular_fsm( + fsm: interegular.fsm.FSM, cacheable_vocabulary: Tuple[Tuple[str, int], ...] + ) -> Tuple[dict, set]: + """Create the variables related to the mapping between states and tokens. + + NOTE(review): this helper is not decorated with ``@cache()`` and never reads ``cacheable_vocabulary`` (``tokenizer`` is taken from the enclosing scope), so the parameters do not serve a caching purpose here; confirm whether caching was intended. + """ + regex_fsm, _ = make_deterministic_fsm(fsm.reduce()) + states_to_token_maps, empty_token_ids = create_fsm_index_tokenizer( + regex_fsm, tokenizer + ) + + # We make sure that it is possible to generate strings in the language + # of the regular expression with the tokens present in the model's + # vocabulary. 
+ if not any( + regex_fsm.finals.intersection(v.values()) + for v in states_to_token_maps.values() + ): + raise ValueError( + "The vocabulary does not allow us to build a sequence that matches the input regex" + ) + + return states_to_token_maps, empty_token_ids + + ( + from_interegular_instance.states_to_token_maps, + from_interegular_instance.empty_token_ids, + ) = create_states_mapping_from_interegular_fsm( + interegular_fsm, tuple(sorted(tokenizer.vocabulary.items())) + ) + from_interegular_instance.vocabulary = tokenizer.vocabulary.values() + from_interegular_instance.eos_token_id = tokenizer.eos_token_id + return from_interegular_instance + def copy(self) -> "RegexFSM": """Create a copy of the FSM.""" return self diff --git a/outlines/generate/__init__.py b/outlines/generate/__init__.py index be98b89ca..f28cbd80d 100644 --- a/outlines/generate/__init__.py +++ b/outlines/generate/__init__.py @@ -2,6 +2,7 @@ from .cfg import cfg from .choice import choice from .format import format +from .fsm import fsm from .json import json from .regex import regex from .text import text diff --git a/outlines/generate/fsm.py b/outlines/generate/fsm.py new file mode 100644 index 000000000..6f9b1b84b --- /dev/null +++ b/outlines/generate/fsm.py @@ -0,0 +1,14 @@ +import interegular + +from outlines.fsm.fsm import RegexFSM +from outlines.generate.api import SequenceGenerator +from outlines.samplers import Sampler, multinomial + + +def fsm( + model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial() +) -> SequenceGenerator: + """Create a ``SequenceGenerator`` driven by an ``interegular`` FSM. + + ``fsm`` is converted with ``RegexFSM.from_interegular_fsm`` using ``model.tokenizer``, and the generator is built with ``sampler`` and ``model.device``. + """ + fsm = RegexFSM.from_interegular_fsm(fsm, model.tokenizer) + device = model.device + generator = SequenceGenerator(fsm, model, sampler, device) + return generator