robust incremental decode for leading space (#581)
* robust incremental decode for leading space

* speed up lookup as prefix_space_tokens is shorter than no_prefix_space_tokens

* add a unit test and fix Qwen eos_token_id handling
AllentDan authored Oct 19, 2023
1 parent 70a5c63 commit 186bfd2
Showing 2 changed files with 48 additions and 22 deletions.
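
Background on the fix: when text is decoded incrementally during streaming, SentencePiece-style tokenizers drop the word-boundary space encoded in a leading '▁' piece, so consecutive chunks run together. A minimal sketch of the symptom, assuming a SentencePiece-based Hugging Face tokenizer such as the InternLM model used in the new test (the exact token split and outputs are model-dependent):

```python
from transformers import AutoTokenizer

# Assumption: network access to the Hugging Face Hub; any SentencePiece-based
# tokenizer shows the same behaviour.
tok = AutoTokenizer.from_pretrained('internlm/internlm-chat-7b',
                                    trust_remote_code=True)
ids = tok.encode('hello world', add_special_tokens=False)
print(tok.decode(ids))      # typically 'hello world' -- full decode keeps the space
print(tok.decode(ids[1:]))  # typically 'world' -- the space carried by '▁world' is lost
```

The patch therefore tracks which token ids carry a prefix space and re-inserts the space whenever an incremental decode starts with one of them and the decoded text does not already begin with a space.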
46 changes: 24 additions & 22 deletions lmdeploy/tokenizer.py
@@ -16,7 +16,7 @@ class SentencePieceTokenizer:
     def __init__(self, model_file: str):
         from sentencepiece import SentencePieceProcessor
         self.model = SentencePieceProcessor(model_file=model_file)
-        self._no_prefix_space_tokens = None
+        self._prefix_space_tokens = None

     @property
     def vocab_size(self):
@@ -34,19 +34,20 @@ def eos_token_id(self):
         return self.model.eos_id()

     @property
-    def no_prefix_space_tokens(self):
+    def prefix_space_tokens(self):
         """tokens without prefix space."""
-        if self._no_prefix_space_tokens is None:
+        if self._prefix_space_tokens is None:
             vocab = self.model.IdToPiece(list(range(self.vocab_size)))
-            self._no_prefix_space_tokens = {
+            self._prefix_space_tokens = {
                 i
-                for i, tok in enumerate(vocab) if not tok.startswith('▁')
+                for i, tok in enumerate(vocab) if tok.startswith('▁')
             }
-        return self._no_prefix_space_tokens
+        return self._prefix_space_tokens

     def _maybe_add_prefix_space(self, tokens, decoded):
         """maybe add prefix space for incremental decoding."""
-        if len(tokens) and tokens[0] not in self.no_prefix_space_tokens:
+        if len(tokens) and not decoded.startswith(' ') and\
+                tokens[0] in self.prefix_space_tokens:
             return ' ' + decoded
         else:
             return decoded
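
For orientation, a hedged, standalone sketch of the decoding step these pieces implement. The free-standing function and its name are illustrative assumptions; in the diff the same logic lives in `_maybe_add_prefix_space` and the class's decode path:

```python
from sentencepiece import SentencePieceProcessor

def incremental_decode(sp: SentencePieceProcessor, prefix_space_tokens: set,
                       new_tokens: list) -> str:
    """Decode only the newly generated tokens, restoring a leading space."""
    text = sp.Decode(new_tokens)
    # Decode() drops the word-boundary space of a leading '▁' piece, so put
    # it back when the first new token is known to carry a prefix space.
    if new_tokens and not text.startswith(' ') and \
            new_tokens[0] in prefix_space_tokens:
        text = ' ' + text
    return text
```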
@@ -111,8 +112,7 @@ class HuggingFaceTokenizer:
     """

     def __init__(self, model_dir: str):
-        from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
-                                  LlamaTokenizerFast)
+        from transformers import AutoTokenizer
         model_file = osp.join(model_dir, 'tokenizer.model')
         backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
         model_file_exists = osp.exists(model_file)
@@ -121,9 +121,7 @@ def __init__(self, model_dir: str):
             'It may take long time to initialize the tokenizer.')
         self.model = AutoTokenizer.from_pretrained(model_dir,
                                                    trust_remote_code=True)
-        self.need_padding = isinstance(self.model, LlamaTokenizerFast) \
-            or isinstance(self.model, CodeLlamaTokenizerFast)
-        self._no_prefix_space_tokens = None
+        self._prefix_space_tokens = None
         # save tokenizer.json to reuse
         if not osp.exists(backend_tokenizer_file) and model_file_exists:
             if hasattr(self.model, 'backend_tokenizer'):
@@ -132,9 +130,12 @@ def __init__(self, model_dir: str):
         if self.model.eos_token_id is None:
             generation_config_file = osp.join(model_dir,
                                               'generation_config.json')
-            with open(generation_config_file, 'r') as f:
-                cfg = json.load(f)
-            self.model.eos_token_id = cfg['eos_token_id']
+            if osp.exists(generation_config_file):
+                with open(generation_config_file, 'r') as f:
+                    cfg = json.load(f)
+                self.model.eos_token_id = cfg['eos_token_id']
+            elif hasattr(self.model, 'eod_id'):  # Qwen remote
+                self.model.eos_token_id = self.model.eod_id

     @property
     def vocab_size(self):
@@ -152,21 +153,22 @@ def eos_token_id(self):
         return self.model.eos_token_id

     @property
-    def no_prefix_space_tokens(self):
+    def prefix_space_tokens(self):
         """tokens without prefix space."""
-        if self._no_prefix_space_tokens is None:
+        if self._prefix_space_tokens is None:
             vocab = self.model.convert_ids_to_tokens(
                 list(range(self.vocab_size)))
-            self._no_prefix_space_tokens = {
+            self._prefix_space_tokens = {
                 i
-                for i, tok in enumerate(vocab) if not tok.startswith('▁')
+                for i, tok in enumerate(vocab)
+                if tok.startswith('▁' if isinstance(tok, str) else b' ')
             }
-        return self._no_prefix_space_tokens
+        return self._prefix_space_tokens

     def _maybe_add_prefix_space(self, tokens, decoded):
         """maybe add prefix space for incremental decoding."""
-        if self.need_padding and len(
-                tokens) and tokens[0] not in self.no_prefix_space_tokens:
+        if len(tokens) and not decoded.startswith(' ') and\
+                tokens[0] in self.prefix_space_tokens:
             return ' ' + decoded
         else:
             return decoded
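A detail worth noting in the HuggingFace variant above: fast SentencePiece tokenizers expose pieces as strings whose word-boundary marker is '▁', while byte-level vocabularies (e.g. Qwen's remote tokenizer) return bytes objects from convert_ids_to_tokens, where the marker is a literal leading space. Hence the dual str/bytes check. A tiny self-contained illustration with made-up pieces:

```python
# Illustrative only: these vocabulary pieces are made up, not from a real model.
pieces = ['▁hello', 'world', b' hi', b'!']
prefix_space_tokens = {
    i for i, tok in enumerate(pieces)
    if tok.startswith('▁' if isinstance(tok, str) else b' ')
}
print(prefix_space_tokens)  # {0, 2} -- the pieces that begin with a space marker
```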
24 changes: 24 additions & 0 deletions tests/test_lmdeploy/test_tokenizer.py
@@ -0,0 +1,24 @@
import pytest

from lmdeploy.tokenizer import HuggingFaceTokenizer


@pytest.mark.parametrize('model_path', [
'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat',
'baichuan-inc/Baichuan-7B', 'codellama/CodeLlama-7b-hf',
'upstage/SOLAR-0-70b-16bit'
])
@pytest.mark.parametrize(
'input', ['hi, this is a test 😆😆! ' * 5, '為什麼我還在用繁體字 😆😆 gg! ' * 5])
def test_tokenizer(model_path, input):
tokenizer = HuggingFaceTokenizer(model_path)
encoded = tokenizer.encode(input)
output = ''
offset = 0
for i in range(1, len(encoded) + 1):
decoded = tokenizer.decode(encoded[:i], offset)
if decoded.endswith('�'):
continue
output += decoded
offset = i
assert input == output, 'input string should equal to output after enc-dec'
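
The test mirrors streaming generation: each step decodes encoded[:i] starting from the last committed offset, and a chunk ending in the replacement character '�' is treated as an incomplete multi-byte UTF-8 sequence and held back until more tokens arrive. Running it locally needs network access to the listed Hugging Face repositories, roughly: pytest tests/test_lmdeploy/test_tokenizer.py (the path matches the file added above).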
