diff --git a/outlines/fsm/regex.py b/outlines/fsm/regex.py index 8cfd81ead..6b105a7b9 100644 --- a/outlines/fsm/regex.py +++ b/outlines/fsm/regex.py @@ -905,9 +905,15 @@ def reduced_vocabulary( ) if token_str: - # invalid utf-8 sequences are replaced with � (\ufffd), but there - # might also be tokens specifically for �, ��, ���, etc. - if "\ufffd" in token_str and not re_replacement_seq.match(token): + if isinstance(token, bytes): + # Handle BPE tokenizers where the tokens are directly stored as bytes + # https://github.com/QwenLM/Qwen/blob/main/tokenization_note.md#regular-tokens + token_str = "".join(byte_symbol(b) for b in token) + + elif "\ufffd" in token_str and not re_replacement_seq.match(token): + # invalid utf-8 sequences are replaced with � (\ufffd), but there + # might also be tokens specifically for �, ��, ���, etc. + if re_llama_byte_token.match(token): # llama-like tokenizers have <0xXX> tokens for all # bytes >= 0x80 and represent all incomplete utf-8 diff --git a/tests/fsm/test_regex.py b/tests/fsm/test_regex.py index 7418deca2..1789c4a7c 100644 --- a/tests/fsm/test_regex.py +++ b/tests/fsm/test_regex.py @@ -714,8 +714,29 @@ def test_reduced_vocabulary_with_rare_tokens(rare_token): [1]: https://github.com/dottxt-ai/outlines/pull/763 [2]: https://github.com/dottxt-ai/outlines/pull/948 + [3]: https://github.com/dottxt-ai/outlines/pull/1153 """ tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") tokenizer = TransformerTokenizer(tokenizer=tokenizer) tokenizer.vocabulary[rare_token] = max(tokenizer.vocabulary.values()) + 1 reduced_vocabulary(tokenizer) + + +def test_reduced_vocabulary_with_byte_tokens(): + class MockTokenizer: + vocabulary = { + "string": 1, + b"\xa1": 2, # Qwen-Style + "eos": 3, + } + special_tokens = {"eos"} + eos_token_id = 3 + + def convert_token_to_string(self, token): + return b"\xef\xbf\xbd".decode() + + reduced_vocab = reduced_vocabulary(MockTokenizer()) + + # See fsm.regex.get_token_transition_keys() + # FSM 
transition keys represent bytes as "\x00" followed by the byte's uppercase hex value
    assert reduced_vocab[0][1][0] == "\x00A1"