From ba7596e09a79c2555c6302c26a1b42f8de14a2b5 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Fri, 1 Sep 2023 15:35:55 +0800
Subject: [PATCH 1/7] expose stop words

---
 lmdeploy/model.py | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index efbb54dbfc..ab0493ce72 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -23,6 +23,7 @@ def __init__(self,
         self.top_k = top_k
         self.temperature = temperature
         self.repetition_penalty = repetition_penalty
+        self.stop_words = None
 
     @staticmethod
     def get_prompt(prompt, sequence_start=True):
@@ -82,11 +83,6 @@ def messages2prompt(self, messages, sequence_start=True):
             return self.get_prompt(messages)
         # chat history processing in derived classes
 
-    @property
-    def stop_words(self):
-        """Return the stop-words' token ids."""
-        return None
-
 
 @MODELS.register_module(name='vicuna')
 class Vicuna(BaseModel):
@@ -158,6 +154,7 @@ def __init__(self,
                  eoh='<eoh>',
                  eoa='<eoa>',
                  assistant='<|Bot|>',
+                 stop_words=[103027, 103028],
                  **kwargs):
         super().__init__(**kwargs)
         self.system = system
@@ -165,6 +162,7 @@ def __init__(self,
         self.eoh = eoh
         self.eoa = eoa
         self.assistant = assistant
+        self.stop_words = stop_words
 
     def get_prompt(self, prompt, sequence_start=True):
         """Return the prompt that is concatenated with other elements in the
@@ -205,11 +203,6 @@ def messages2prompt(self, messages, sequence_start=True):
             ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:'
         return ret
 
-    @property
-    def stop_words(self):
-        """Return the stop-words' token ids."""
-        return [103027, 103028]
-
 
 @MODELS.register_module(name='internlm-chat-7b-8k')
 class InternLMChat7B8K(InternLMChat7B):
@@ -239,6 +232,7 @@ def __init__(self,
                  eosys='',
                  assistant='<|Assistant|>: ',
                  system='<|System|>: ',
+                 stop_words=[45623],
                  **kwargs):
         super().__init__(**kwargs)
         self.meta_instruction = meta_instruction
@@ -247,6 +241,7 @@ def __init__(self,
         self.eosys = eosys
         self.assistant = assistant
         self.system = system
+        self.stop_words = stop_words
 
     def get_prompt(self, prompt, sequence_start=True):
         if sequence_start:
@@ -256,11 +251,6 @@ def get_prompt(self, prompt, sequence_start=True):
         else:
             return f'\n{self.user}{prompt}{self.eoh}\n{self.assistant}'
 
-    @property
-    def stop_words(self):
-        """Return the stop-words' token ids."""
-        return [45623]
-
 
 @MODELS.register_module(name='llama2')
 class Llama2(BaseModel):
@@ -340,6 +330,7 @@ def __init__(self,
                  im_start='<|im_start|>',
                  im_end='<|im_end|>',
                  system='You are a helpful assistant.',
+                 stop_words=[151645],
                  **kwargs):
         super().__init__(**kwargs)
         self.session_len = session_len
@@ -350,6 +341,7 @@ def __init__(self,
         self.im_start = im_start
         self.im_end = im_end
         self.system = system
+        self.stop_words = stop_words
 
     def get_prompt(self, prompt, sequence_start=True):
         if sequence_start:
@@ -360,11 +352,6 @@ def get_prompt(self, prompt, sequence_start=True):
         return f'\n{self.im_start}user\n{prompt}{self.im_end}' \
                f'\n{self.im_start}assistant\n'
 
-    @property
-    def stop_words(self):
-        """Return the stop-words' token ids."""
-        return [151645]  # <|im_end|>
-
 
 def main(model_name: str = 'test'):
     assert model_name in MODELS.module_dict.keys(), \
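What patch 1 changes, in miniature: `stop_words` moves from a read-only `@property` on each template class to an instance attribute with a per-model default, so a caller can now override it at construction time. A minimal sketch of the pattern (toy classes, not the full lmdeploy templates):

    class BaseModel:

        def __init__(self, **kwargs):
            self.stop_words = None  # base default: no stop words


    class InternLMChat7B(BaseModel):

        def __init__(self, stop_words=[103027, 103028], **kwargs):
            super().__init__(**kwargs)
            self.stop_words = stop_words  # per-model default, now overridable


    # before the patch the @property returned a fixed list; now:
    assert InternLMChat7B().stop_words == [103027, 103028]
    assert InternLMChat7B(stop_words=[103028]).stop_words == [103028]
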
From d92d132e1faf3cb2c36086f7c0b4b92deb834e97 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 11 Sep 2023 17:36:53 +0800
Subject: [PATCH 2/7] support string

---
 lmdeploy/model.py                   | 23 ++++++++++++-----------
 lmdeploy/serve/turbomind/chatbot.py | 10 ++++++++--
 lmdeploy/turbomind/turbomind.py     | 14 ++++++++++----
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index ab0493ce72..c47307afbb 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -154,7 +154,7 @@ def __init__(self,
                  eoh='<eoh>',
                  eoa='<eoa>',
                  assistant='<|Bot|>',
-                 stop_words=[103027, 103028],
+                 stop_words=['<eoa>'],
                  **kwargs):
         super().__init__(**kwargs)
         self.system = system
@@ -225,15 +225,16 @@ class Puyu(BaseModel):
     """Chat template of puyu model.This is only for internal usage in Shanghai
     AI Laboratory."""
 
-    def __init__(self,
-                 meta_instruction='',
-                 user='<|Human|>: ',
-                 eoh='',
-                 eosys='',
-                 assistant='<|Assistant|>: ',
-                 system='<|System|>: ',
-                 stop_words=[45623],
-                 **kwargs):
+    def __init__(
+            self,
+            meta_instruction='',
+            user='<|Human|>: ',
+            eoh='',
+            eosys='',
+            assistant='<|Assistant|>: ',
+            system='<|System|>: ',
+            stop_words: List[str] = None,  # set to None for protection
+            **kwargs):
         super().__init__(**kwargs)
         self.meta_instruction = meta_instruction
         self.user = user
@@ -330,7 +331,7 @@ def __init__(self,
                  im_start='<|im_start|>',
                  im_end='<|im_end|>',
                  system='You are a helpful assistant.',
-                 stop_words=[151645],
+                 stop_words=['<|im_end|>'],
                  **kwargs):
         super().__init__(**kwargs)
         self.session_len = session_len
diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index a8d32825a6..393434c533 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -385,16 +385,22 @@ def _get_eos(self):
         token_ids, _ = self.preprocess('<EOS>')
         return token_ids[0][0]
 
-    def _stop_words(self, stop_words: List[int]):
+    def _stop_words(self, stop_words: List[str]):
         """return stop-words' token ids."""
         if stop_words is None:
             return None
         assert isinstance(stop_words, List) and \
-            all(isinstance(elem, int) for elem in stop_words), \
+            all(isinstance(elem, str) for elem in stop_words), \
             f'stop_words must be a list but got {type(stop_words)}'
         # each id in stop_words represents a stop word
         # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
         # detailed explanation about turbomind's stop_words
+        stop_words = [
+            self.preprocess(stop_word)[0][0][0] for stop_word in stop_words
+        ]
+        assert isinstance(stop_words, List) and \
+            all(isinstance(elem, int) for elem in stop_words), \
+            'invalid stop_words'
         stop_word_offsets = range(1, len(stop_words) + 1)
         stop_words = np.array([[stop_words,
                                 stop_word_offsets]]).astype(np.int32)
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 807bd55c82..c0a67871ae 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -14,6 +14,7 @@
 
 import lmdeploy
 from lmdeploy.model import MODELS
+from lmdeploy.turbomind import Tokenizer
 from lmdeploy.utils import get_logger
 
 # TODO: find another way import _turbomind
@@ -22,14 +23,16 @@
 import _turbomind as _tm  # noqa: E402
 
 
-def _stop_words(stop_words: List[int]):
+def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
    """return list of stop-words to numpy.ndarray."""
    if stop_words is None:
        return None
    assert isinstance(stop_words, List) and \
-        all(isinstance(elem, int) for elem in stop_words), \
+        all(isinstance(elem, str) for elem in stop_words), \
        f'stop_words must be a list but got {type(stop_words)}'
-
+    stop_words = [tokenizer.encode(stop_word)[0] for stop_word in stop_words]
+    assert isinstance(stop_words, List) and all(
+        isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
    # each id in stop_words represents a stop word
    # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
    # detailed explanation about fastertransformer's stop_words
@@ -106,7 +109,10 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
             self.model_name = parser.get(section_name, 'model_name')
             data_type = parser.get(section_name, 'weight_type')
         model = MODELS.get(self.model_name)()
-        self.stop_words = _stop_words(model.stop_words)
+        tokenizer_model_path = osp.join(model_path, 'triton_models',
+                                        'tokenizer')
+        tokenizer = Tokenizer(tokenizer_model_path)
+        self.stop_words = _stop_words(model.stop_words, tokenizer)
 
         # params
         self.node_id = node_id
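Both `_stop_words` helpers introduced in patch 2 share one contract: encode each stop string, keep its first token id, and pack the ids together with one-based offsets into an int32 array of shape (1, 2, n), the layout turbomind/fastertransformer consumes (see the fauxpilot discussion linked in the code). A standalone sketch, with a hypothetical `encode` standing in for the real tokenizer:

    from typing import List

    import numpy as np


    def encode(word: str) -> List[int]:
        # stand-in for Tokenizer.encode; real ids come from the model's tokenizer
        return {'<eoa>': [103028], '<|im_end|>': [151645]}[word]


    def stop_words_tensor(stop_words: List[str]) -> np.ndarray:
        """Pack stop-word token ids and one-based offsets into (1, 2, n) int32."""
        ids = [encode(w)[0] for w in stop_words]  # first id of each stop word
        offsets = list(range(1, len(ids) + 1))    # cumulative end offsets
        return np.array([[ids, offsets]]).astype(np.int32)


    print(stop_words_tensor(['<eoa>', '<|im_end|>']).shape)  # (1, 2, 2)
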
From 01d441c3a80db67ebed56f33136c485a9cdc9551 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 11 Sep 2023 17:40:37 +0800
Subject: [PATCH 3/7] fix

---
 lmdeploy/serve/turbomind/chatbot.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index 393434c533..e2af63ec3b 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -396,7 +396,8 @@ def _stop_words(self, stop_words: List[str]):
         # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
         # detailed explanation about turbomind's stop_words
         stop_words = [
-            self.preprocess(stop_word)[0][0][0] for stop_word in stop_words
+            int(self.preprocess(stop_word)[0][0][0])
+            for stop_word in stop_words
         ]
         assert isinstance(stop_words, List) and \
             all(isinstance(elem, int) for elem in stop_words), \

From cdaf7f26938e8416bd573c1ee18abe14afb0bf3d Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Fri, 22 Sep 2023 15:06:22 +0800
Subject: [PATCH 4/7] remove eoa from chatbot

---
 lmdeploy/model.py                   |  2 +-
 lmdeploy/serve/turbomind/chatbot.py |  5 +++++
 lmdeploy/utils.py                   | 19 ++++++++++++++++++-
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index c3f96df581..0148574e72 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -513,7 +513,7 @@ def __init__(self,
         self.top_p = kwargs.get('top_p', 0.9)
         self.temperature = kwargs.get('temperature', 0.0)
         if self.stop_words is None:
-            self.stop_words = '<EOT>'
+            self.stop_words = ['<EOT>']
 
     def decorate_prompt(self, prompt, sequence_start=True):
         if self.capability == 'infilling':
diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index a60b4df0cf..cc12fcff3b 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -18,6 +18,7 @@
 from lmdeploy.model import MODELS
 from lmdeploy.serve.turbomind.utils import (Postprocessor, Preprocessor,
                                             prepare_tensor)
+from lmdeploy.utils import filter_suffix
 
 
 @dataclass
@@ -157,6 +158,8 @@ def stream_infer(self,
                                           request_output_len, sequence_start,
                                           sequence_end):
+            if status == StatusCode.TRITON_STREAM_END:  # remove stop_words
+                res = filter_suffix(res, self.model.stop_words)
             if status.value < 0:
                 break
             else:
@@ -346,6 +349,8 @@ def infer(self,
                                           sequence_end):
             if status.value < 0:
                 break
+            if status == StatusCode.TRITON_STREAM_END:  # remove stop_words
+                res = filter_suffix(res, self.model.stop_words)
             if status.value == 0:
                 self._session.histories = \
                     self._session.histories + self._session.prompt + \
diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py
index 7b6d51a01a..bac7ee9a19 100644
--- a/lmdeploy/utils.py
+++ b/lmdeploy/utils.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import logging
-from typing import Optional
+from typing import List, Optional
 
 logger_initialized = {}
 
@@ -77,3 +77,20 @@ def get_logger(name: str,
         logger_initialized[name] = True
 
     return logger
+
+
+def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
+    """Filter response with suffixes.
+
+    Args:
+        response (str): generated response by LLMs.
+        suffixes (str): a list of suffixes to be deleted.
+
+    Return:
+        str: a clean response.
+    """
+    if suffixes is None:
+        return response
+    for item in suffixes:
+        response = response.removesuffix(item)
+    return response
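Patches 3 and 4 together move stop-word cleanup to the text level: `_stop_words` now yields plain Python ints, and once a stream reaches TRITON_STREAM_END the chatbot trims any stop string left dangling at the end of the response with `filter_suffix`. The helper only strips a true suffix, so stop-word text in the middle of a response is preserved. A quick usage sketch of the patch-4 definition (its `removesuffix` call needs Python >= 3.9; patch 7 rewrites that later):

    from typing import List, Optional


    def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
        """Strip any of the given stop strings off the end of a response."""
        if suffixes is None:
            return response
        for item in suffixes:
            response = response.removesuffix(item)  # Python >= 3.9
        return response


    print(filter_suffix('Sure, here you go.<eoa>', ['<eoa>']))  # trailing tag removed
    print(filter_suffix('A<eoa>B', ['<eoa>']))                  # mid-string tag kept
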
From 637bfff367a27c5e85bc5b1577b9084f8f858923 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Fri, 22 Sep 2023 15:27:57 +0800
Subject: [PATCH 5/7] remove eoa of turbomind

---
 lmdeploy/turbomind/turbomind.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index bc630430a7..f8a7444546 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -168,6 +168,8 @@ def __init__(self, tm_model, cuda_stream_id=0):
         self.gpu_count = tm_model.gpu_count
 
         self.stop_words = tm_model.stop_words
+        self.stop_tokens = [] if self.stop_words is None else \
+            self.stop_words.flatten().tolist()
         self.eos_id = tm_model.eos_id
         self.session_len = tm_model.session_len
 
@@ -352,6 +354,8 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
                 output, len_ = output, len_.item()
                 if len(output) > 0 and output[-1].item() == self.eos_id:
                     outputs.append((output[:-1], len_ - 1))
+                elif len(output) > 0 and output[-1].item() in self.stop_tokens:
+                    outputs.append((output[:-1], len_))
                 else:
                     outputs.append((output, len_))

From 4f8953c4c13478eddc2335111ed40ff8c1a1b90f Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 25 Sep 2023 14:37:28 +0800
Subject: [PATCH 6/7] fix ut

---
 tests/test_lmdeploy/test_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py
index dcf04d5c28..d07e1f1f73 100644
--- a/tests/test_lmdeploy/test_model.py
+++ b/tests/test_lmdeploy/test_model.py
@@ -133,7 +133,7 @@ def test_codellama_infilling():
     '''
     _prompt = model.get_prompt(prompt)
     assert _prompt.find('<FILL>') == -1
-    assert model.stop_words == [32010]
+    assert model.stop_words == ['<EOT>']
 
     model = MODELS.get('codellama')(capability='infilling', suffix_first=True)
     _prompt = model.get_prompt(prompt)
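Patch 5's trimming rule in isolation: a trailing EOS token is dropped and the reported length shortened, while a trailing stop token is dropped with the length left as generated. A hypothetical `trim_output` helper mirroring the `elif` added above (plain ints stand in for the tensor elements):

    def trim_output(output, length, eos_id, stop_tokens):
        """Drop a trailing eos or stop token from a finished generation."""
        if output and output[-1] == eos_id:
            return output[:-1], length - 1  # eos is not counted as output
        if output and output[-1] in stop_tokens:
            return output[:-1], length      # stop word still counts toward length
        return output, length


    print(trim_output([11, 22, 2], 3, eos_id=2, stop_tokens=[151645]))       # ([11, 22], 2)
    print(trim_output([11, 22, 151645], 3, eos_id=2, stop_tokens=[151645]))  # ([11, 22], 3)
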
From dff66bc9d7153dee70b001164d5da4930493c27e Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 25 Sep 2023 16:24:32 +0800
Subject: [PATCH 7/7] suffix wheel and fix InternLM no system bug

---
 lmdeploy/model.py | 4 ++--
 lmdeploy/utils.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index 6e0ae0bdbe..ffe8dd558e 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -201,7 +201,7 @@ def decorate_prompt(self, prompt, sequence_start=True):
         assert self.capability == 'chat', \
             f'{type(self).__name__} has no capability of {self.capability}'
         if sequence_start:
-            return f'{self.user}:{prompt}{self.eoh}\n' \
+            return f'{self.system}{self.user}:{prompt}{self.eoh}\n' \
                    f'{self.assistant}:'
         else:
             return f'\n{self.user}:{prompt}{self.eoh}\n' \
                    f'{self.assistant}:'
@@ -219,7 +219,7 @@ def messages2prompt(self, messages, sequence_start=True):
         if isinstance(messages, str):
             return self.get_prompt(messages, sequence_start)
         system, users, assistants = self._translate_messages(messages)
-        ret = ''
+        ret = '' + self.system if system is None else '' + system
         for user, assistant in zip(users, assistants):
             if assistant:
                 ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \
diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py
index bac7ee9a19..e284f50075 100644
--- a/lmdeploy/utils.py
+++ b/lmdeploy/utils.py
@@ -92,5 +92,6 @@ def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
     if suffixes is None:
         return response
     for item in suffixes:
-        response = response.removesuffix(item)
+        if response.endswith(item):
+            response = response[:len(response) - len(item)]
     return response
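The utils.py hunk in the last patch swaps `str.removesuffix`, which only exists on Python >= 3.9, for an `endswith` check plus slicing, presumably to keep the published wheel usable on older interpreters (the "suffix wheel" in the subject). The two spellings behave identically; a sketch with a hypothetical `strip_suffix` helper:

    def strip_suffix(text: str, suffix: str) -> str:
        # portable equivalent of text.removesuffix(suffix) (Python >= 3.9)
        if text.endswith(suffix):
            return text[:len(text) - len(suffix)]
        return text


    assert strip_suffix('hello<eoa>', '<eoa>') == 'hello'
    assert strip_suffix('hello', '<eoa>') == 'hello'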