From 9ddf5e796fc24d0e4f157283e659c18f1bf9df9a Mon Sep 17 00:00:00 2001 From: saattrupdan Date: Thu, 14 Nov 2024 08:19:35 +0100 Subject: [PATCH] fix: Handle Salamandra and OpenCoder tokenizers --- python/outlines_core/fsm/regex.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/outlines_core/fsm/regex.py b/python/outlines_core/fsm/regex.py index af337e34..b4d2c2c8 100644 --- a/python/outlines_core/fsm/regex.py +++ b/python/outlines_core/fsm/regex.py @@ -342,9 +342,11 @@ def make_deterministic_fsm(fsm: FSM) -> Tuple[BetterFSM, Dict[int, int]]: re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$") -# The "▁*" prefix is required to handle Gemma and GPT-SW3 tokenizers, and the "\.*" -# suffix is required to handle the NorwAI tokenizer. -re_replacement_seq = re.compile(r"^▁*�+\.*$") +# The "▁*" prefix is required to handle Gemma and GPT-SW3 tokenizers. +# The "\.*" suffix is required to handle the NorwAI tokenizer. +# The "\.*" prefix is required to handle the Salamandra tokenizer. +# The "s*" suffix is required to handle the OpenCoder tokenizer. +re_replacement_seq = re.compile(r"^▁*\.*�+\.*s*$") # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode