From bb1dfa6115f4e5b39c3239922929e5ced5482e17 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Thu, 26 Sep 2024 19:31:08 +0800 Subject: [PATCH] Fix chatglm tokenizer failed when transformers>=4.45.0 (#2520) * Fix chatglm tokenizer failed when transformers>=4.45.0 * fix chatglm2-6b --- lmdeploy/tokenizer.py | 27 +++++++++++++++++++++++++++ tests/test_lmdeploy/test_tokenizer.py | 4 ++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py index 02338a37d..e97700558 100644 --- a/lmdeploy/tokenizer.py +++ b/lmdeploy/tokenizer.py @@ -519,6 +519,15 @@ class ChatGLM4Tokenizer(HuggingFaceTokenizer): def __init__(self, model_path): super(ChatGLM4Tokenizer, self).__init__(model_path) + original_pad = self.model._pad + + def __pad(*args, **kwargs): + if 'padding_side' in kwargs: + kwargs.pop('padding_side') + return original_pad(*args, **kwargs) + + # fix for transformers>=4.45.0 + self.model._pad = __pad def encode(self, s: str, @@ -534,6 +543,22 @@ def encode(self, **kwargs) +class ChatGLMTokenizer(HuggingFaceTokenizer): + """tokenizer of GLM2.""" + + def __init__(self, model_path): + super(ChatGLMTokenizer, self).__init__(model_path) + original_pad = self.model._pad + + def __pad(*args, **kwargs): + if 'padding_side' in kwargs: + kwargs.pop('padding_side') + return original_pad(*args, **kwargs) + + # fix for transformers>=4.45.0 + self.model._pad = __pad + + class Tokenizer: """Tokenize prompts or de-tokenize tokens into texts. 
@@ -563,6 +588,8 @@ def __init__(self, model_file: str): config_tokenizer_class = tokenizer_config.get('tokenizer_class') if config_tokenizer_class == 'ChatGLM4Tokenizer': self.model = ChatGLM4Tokenizer(model_folder) + elif config_tokenizer_class == 'ChatGLMTokenizer': + self.model = ChatGLMTokenizer(model_folder) else: self.model = HuggingFaceTokenizer(model_folder) diff --git a/tests/test_lmdeploy/test_tokenizer.py b/tests/test_lmdeploy/test_tokenizer.py index 6787249e2..38b7dee6b 100644 --- a/tests/test_lmdeploy/test_tokenizer.py +++ b/tests/test_lmdeploy/test_tokenizer.py @@ -2,7 +2,7 @@ import pytest -from lmdeploy.tokenizer import DetokenizeState, HuggingFaceTokenizer +from lmdeploy.tokenizer import DetokenizeState, HuggingFaceTokenizer, Tokenizer @pytest.mark.parametrize('model_path', [ @@ -20,7 +20,7 @@ @pytest.mark.parametrize('skip_special_tokens', [True, False]) def test_tokenizer(model_path, input, interval, add_special_tokens, skip_special_tokens): - tokenizer = HuggingFaceTokenizer(model_path) + tokenizer = Tokenizer(model_path).model encoded = tokenizer.encode(input, False, add_special_tokens=add_special_tokens)