
enable actual-byte tokens in reduced_vocabulary
lapp0 committed Sep 14, 2024
1 parent 0b9a3f1 commit 0909276
Showing 4 changed files with 31 additions and 5 deletions.
docs/overrides/home.html: 2 changes (1 addition & 1 deletion)
@@ -117,4 +117,4 @@ <h2 class="subtitle" style="font-weight: 400; padding-top: 1rem;">
 </section>
 {% endblock %}
 {% block content %}{% endblock %}
-{% block footer %}{% endblock %}
+{% block footer %}{% endblock %}
docs/overrides/main.html: 1 change (0 additions & 1 deletion)
@@ -1,2 +1 @@
 {% extends "base.html" %}
-
outlines/fsm/regex.py: 12 changes (9 additions & 3 deletions)
@@ -905,9 +905,15 @@ def reduced_vocabulary(
         )
 
         if token_str:
-            # invalid utf-8 sequences are replaced with � (\ufffd), but there
-            # might also be tokens specifically for �, ��, ���, etc.
-            if "\ufffd" in token_str and not re_replacement_seq.match(token):
+            if isinstance(token, bytes):
+                # Handle BPE tokenizers where the tokens are directly stored as bytes
+                # https://github.com/QwenLM/Qwen/blob/main/tokenization_note.md#regular-tokens
+                token_str = "".join(byte_symbol(b) for b in token)
+
+            elif "\ufffd" in token_str and not re_replacement_seq.match(token):
+                # invalid utf-8 sequences are replaced with � (\ufffd), but there
+                # might also be tokens specifically for �, ��, ���, etc.
+
                 if re_llama_byte_token.match(token):
                     # llama-like tokenizers have <0xXX> tokens for all
                     # bytes >= 0x80 and represent all incomplete utf-8
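For context, a rough sketch of the mapping the new isinstance(token, bytes) branch performs. The byte_symbol helper is defined elsewhere in outlines/fsm/regex.py and is not part of this diff; the null-prefixed uppercase-hex form assumed below is inferred from the test added in this commit, not from byte_symbol's actual definition.

# Assumed stand-in for outlines.fsm.regex.byte_symbol: non-ASCII bytes become a
# "\x00" prefix plus two uppercase hex digits, plain ASCII bytes stay as characters.
def byte_symbol(byte: int) -> str:
    return "\x00" + format(byte, "02X") if byte >= 0x80 else chr(byte)

token = b"\xa1"  # a Qwen-style raw-bytes vocabulary entry
token_str = "".join(byte_symbol(b) for b in token)
assert token_str == "\x00A1"  # the transition-key form checked by the new test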
tests/fsm/test_regex.py: 21 changes (21 additions & 0 deletions)
@@ -714,8 +714,29 @@ def test_reduced_vocabulary_with_rare_tokens(rare_token):
     [1]: https://github.com/dottxt-ai/outlines/pull/763
     [2]: https://github.com/dottxt-ai/outlines/pull/948
     [3]: https://github.com/dottxt-ai/outlines/pull/1153
     """
     tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
     tokenizer = TransformerTokenizer(tokenizer=tokenizer)
     tokenizer.vocabulary[rare_token] = max(tokenizer.vocabulary.values()) + 1
     reduced_vocabulary(tokenizer)
+
+
+def test_reduced_vocabulary_with_byte_tokens():
+    class MockTokenizer:
+        vocabulary = {
+            "string": 1,
+            b"\xa1": 2,  # Qwen-Style
+            "eos": 3,
+        }
+        special_tokens = {"eos"}
+        eos_token_id = 3
+
+        def convert_token_to_string(self, token):
+            return b"\xef\xbf\xbd".decode()
+
+    reduced_vocab = reduced_vocabulary(MockTokenizer())
+
+    # See fsm.regex.get_token_transition_keys()
+    # FSM transition keys represents bytes as <null_prefix><hex_byte>
+    assert reduced_vocab[0][1][0] == "\x00A1"

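As a usage illustration (not part of the commit): with this change, reduced_vocabulary should accept tokenizers whose vocabularies store raw-bytes entries, such as Qwen's. A minimal sketch, assuming the TransformerTokenizer wrapper used earlier in this test file and the import paths below; the model name is only an example, and the shape of the return value is inferred from the assertion above.

from transformers import AutoTokenizer

from outlines.fsm.regex import reduced_vocabulary
from outlines.models.transformers import TransformerTokenizer

# A byte-level vocabulary (e.g. Qwen) exercises the new isinstance(token, bytes) branch.
hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
tokenizer = TransformerTokenizer(tokenizer=hf_tokenizer)

reduced_vocab = reduced_vocabulary(tokenizer)
# Per the assertion above, reduced_vocab[0] holds (token_str, token_ids) entries;
# bytes tokens appear as "\x00"-prefixed hex symbols rather than "\ufffd".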