From 186bfd2eb9ee44db129d1dac70dcabe01461227b Mon Sep 17 00:00:00 2001
From: AllentDan <41138331+AllentDan@users.noreply.github.com>
Date: Thu, 19 Oct 2023 15:54:57 +0800
Subject: [PATCH] robust incremental decode for leading space (#581)

* robust incremental decode for leading space

* speed up lookup as prefix_space_tokens is shorter than no_prefix_space_tokens

* add UT and fix qwen stuff
---
 lmdeploy/tokenizer.py                 | 46 ++++++++++++++-------------
 tests/test_lmdeploy/test_tokenizer.py | 24 ++++++++++++++
 2 files changed, 48 insertions(+), 22 deletions(-)
 create mode 100644 tests/test_lmdeploy/test_tokenizer.py

diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py
index 98db9c2b6..138705dfa 100644
--- a/lmdeploy/tokenizer.py
+++ b/lmdeploy/tokenizer.py
@@ -16,7 +16,7 @@ class SentencePieceTokenizer:
     def __init__(self, model_file: str):
         from sentencepiece import SentencePieceProcessor
         self.model = SentencePieceProcessor(model_file=model_file)
-        self._no_prefix_space_tokens = None
+        self._prefix_space_tokens = None
 
     @property
     def vocab_size(self):
@@ -34,19 +34,20 @@ def eos_token_id(self):
         return self.model.eos_id()
 
     @property
-    def no_prefix_space_tokens(self):
+    def prefix_space_tokens(self):
         """tokens without prefix space."""
-        if self._no_prefix_space_tokens is None:
+        if self._prefix_space_tokens is None:
             vocab = self.model.IdToPiece(list(range(self.vocab_size)))
-            self._no_prefix_space_tokens = {
+            self._prefix_space_tokens = {
                 i
-                for i, tok in enumerate(vocab) if not tok.startswith('▁')
+                for i, tok in enumerate(vocab) if tok.startswith('▁')
             }
-        return self._no_prefix_space_tokens
+        return self._prefix_space_tokens
 
     def _maybe_add_prefix_space(self, tokens, decoded):
         """maybe add prefix space for incremental decoding."""
-        if len(tokens) and tokens[0] not in self.no_prefix_space_tokens:
+        if len(tokens) and not decoded.startswith(' ') and\
+                tokens[0] in self.prefix_space_tokens:
             return ' ' + decoded
         else:
             return decoded
@@ -111,8 +112,7 @@ class HuggingFaceTokenizer:
     """
 
     def __init__(self, model_dir: str):
-        from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
-                                  LlamaTokenizerFast)
+        from transformers import AutoTokenizer
         model_file = osp.join(model_dir, 'tokenizer.model')
         backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
         model_file_exists = osp.exists(model_file)
@@ -121,9 +121,7 @@ def __init__(self, model_dir: str):
                 'It may take long time to initialize the tokenizer.')
         self.model = AutoTokenizer.from_pretrained(model_dir,
                                                    trust_remote_code=True)
-        self.need_padding = isinstance(self.model, LlamaTokenizerFast) \
-            or isinstance(self.model, CodeLlamaTokenizerFast)
-        self._no_prefix_space_tokens = None
+        self._prefix_space_tokens = None
         # save tokenizer.json to reuse
         if not osp.exists(backend_tokenizer_file) and model_file_exists:
             if hasattr(self.model, 'backend_tokenizer'):
@@ -132,9 +130,12 @@ def __init__(self, model_dir: str):
         if self.model.eos_token_id is None:
             generation_config_file = osp.join(model_dir,
                                               'generation_config.json')
-            with open(generation_config_file, 'r') as f:
-                cfg = json.load(f)
-                self.model.eos_token_id = cfg['eos_token_id']
+            if osp.exists(generation_config_file):
+                with open(generation_config_file, 'r') as f:
+                    cfg = json.load(f)
+                    self.model.eos_token_id = cfg['eos_token_id']
+            elif hasattr(self.model, 'eod_id'):  # Qwen remote
+                self.model.eos_token_id = self.model.eod_id
 
     @property
     def vocab_size(self):
@@ -152,21 +153,22 @@ def eos_token_id(self):
         return self.model.eos_token_id
 
     @property
-    def no_prefix_space_tokens(self):
+    def prefix_space_tokens(self):
         """tokens without prefix space."""
-        if self._no_prefix_space_tokens is None:
+        if self._prefix_space_tokens is None:
             vocab = self.model.convert_ids_to_tokens(
                 list(range(self.vocab_size)))
-            self._no_prefix_space_tokens = {
+            self._prefix_space_tokens = {
                 i
-                for i, tok in enumerate(vocab) if not tok.startswith('▁')
+                for i, tok in enumerate(vocab)
+                if tok.startswith('▁' if isinstance(tok, str) else b' ')
             }
-        return self._no_prefix_space_tokens
+        return self._prefix_space_tokens
 
     def _maybe_add_prefix_space(self, tokens, decoded):
         """maybe add prefix space for incremental decoding."""
-        if self.need_padding and len(
-                tokens) and tokens[0] not in self.no_prefix_space_tokens:
+        if len(tokens) and not decoded.startswith(' ') and\
+                tokens[0] in self.prefix_space_tokens:
             return ' ' + decoded
         else:
             return decoded
diff --git a/tests/test_lmdeploy/test_tokenizer.py b/tests/test_lmdeploy/test_tokenizer.py
new file mode 100644
index 000000000..ff7d8047b
--- /dev/null
+++ b/tests/test_lmdeploy/test_tokenizer.py
@@ -0,0 +1,24 @@
+import pytest
+
+from lmdeploy.tokenizer import HuggingFaceTokenizer
+
+
+@pytest.mark.parametrize('model_path', [
+    'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat',
+    'baichuan-inc/Baichuan-7B', 'codellama/CodeLlama-7b-hf',
+    'upstage/SOLAR-0-70b-16bit'
+])
+@pytest.mark.parametrize(
+    'input', ['hi, this is a test πŸ˜†πŸ˜†! ' * 5, 'η‚Ίδ»€ιΊΌζˆ‘ι‚„εœ¨η”¨ηΉι«”ε­— πŸ˜†πŸ˜† gg! ' * 5])
+def test_tokenizer(model_path, input):
+    tokenizer = HuggingFaceTokenizer(model_path)
+    encoded = tokenizer.encode(input)
+    output = ''
+    offset = 0
+    for i in range(1, len(encoded) + 1):
+        decoded = tokenizer.decode(encoded[:i], offset)
+        if decoded.endswith('οΏ½'):
+            continue
+        output += decoded
+        offset = i
+    assert input == output, 'input string should equal to output after enc-dec'
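
The streaming pattern this change hardens is the one exercised by the new unit test: decode a growing prefix of token ids, emit only the text produced since the last emitted position, hold back chunks that end in an incomplete multi-byte character, and rely on the prefix-space handling above to restore the leading space that chunked decoding would otherwise drop. A minimal sketch of that flow, using the encode/decode(ids, offset) API as it appears in the test; the model path and prompt below are placeholders, not something this patch prescribes:

    from lmdeploy.tokenizer import HuggingFaceTokenizer

    # Placeholder checkpoint; any HF model directory with a tokenizer works.
    tokenizer = HuggingFaceTokenizer('internlm/internlm-chat-7b')

    prompt = 'hi, this is a test πŸ˜†!'
    token_ids = tokenizer.encode(prompt)

    output, offset = '', 0
    for i in range(1, len(token_ids) + 1):
        # Decode only what the tokens after `offset` contribute; with this
        # patch the chunk gets its leading space back when the first new
        # token is a '▁'-prefixed piece and the decoded text lost it.
        chunk = tokenizer.decode(token_ids[:i], offset)
        # A trailing 'οΏ½' means the prefix ends inside a multi-byte
        # character; wait for more tokens before emitting anything.
        if chunk.endswith('οΏ½'):
            continue
        output += chunk
        offset = i

    assert output == prompt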