From ba7596e09a79c2555c6302c26a1b42f8de14a2b5 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Fri, 1 Sep 2023 15:35:55 +0800
Subject: [PATCH 1/7] expose stop words

---
 lmdeploy/model.py | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index efbb54dbfc..ab0493ce72 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -23,6 +23,7 @@ def __init__(self,
         self.top_k = top_k
         self.temperature = temperature
         self.repetition_penalty = repetition_penalty
+        self.stop_words = None
 
     @staticmethod
     def get_prompt(prompt, sequence_start=True):
@@ -82,11 +83,6 @@ def messages2prompt(self, messages, sequence_start=True):
             return self.get_prompt(messages)
         # chat history processing in derived classes
 
-    @property
-    def stop_words(self):
-        """Return the stop-words' token ids."""
-        return None
-
 
 @MODELS.register_module(name='vicuna')
 class Vicuna(BaseModel):
@@ -158,6 +154,7 @@ def __init__(self,
                  eoh='<eoh>',
                  eoa='<eoa>',
                  assistant='<|Bot|>',
+                 stop_words=[103027, 103028],
                  **kwargs):
         super().__init__(**kwargs)
         self.system = system
@@ -165,6 +162,7 @@ def __init__(self,
         self.eoh = eoh
         self.eoa = eoa
         self.assistant = assistant
+        self.stop_words = stop_words
 
     def get_prompt(self, prompt, sequence_start=True):
         """Return the prompt that is concatenated with other elements in the
@@ -205,11 +203,6 @@ def messages2prompt(self, messages, sequence_start=True):
             ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:'
         return ret
 
-    @property
-    def stop_words(self):
-        """Return the stop-words' token ids."""
-        return [103027, 103028]
-
 
 @MODELS.register_module(name='internlm-chat-7b-8k')
 class InternLMChat7B8K(InternLMChat7B):
@@ -239,6 +232,7 @@ def __init__(self,
                  eosys='',
                  assistant='<|Assistant|>: ',
                  system='<|System|>: ',
+                 stop_words=[45623],
                  **kwargs):
         super().__init__(**kwargs)
         self.meta_instruction = meta_instruction
@@ -247,6 +241,7 @@ def __init__(self,
         self.eosys = eosys
         self.assistant = assistant
         self.system = system
+        self.stop_words = stop_words
 
     def get_prompt(self, prompt, sequence_start=True):
         if sequence_start:
@@ -256,11 +251,6 @@ def get_prompt(self, prompt, sequence_start=True):
         else:
             return f'\n{self.user}{prompt}{self.eoh}\n{self.assistant}'
 
-    @property
-    def stop_words(self):
-        """Return the stop-words' token ids."""
-        return [45623]
-
 
 @MODELS.register_module(name='llama2')
 class Llama2(BaseModel):
@@ -340,6 +330,7 @@ def __init__(self,
                  im_start='<|im_start|>',
                  im_end='<|im_end|>',
                  system='You are a helpful assistant.',
+                 stop_words=[151645],
                  **kwargs):
         super().__init__(**kwargs)
         self.session_len = session_len
@@ -350,6 +341,7 @@ def __init__(self,
         self.im_start = im_start
         self.im_end = im_end
         self.system = system
+        self.stop_words = stop_words
 
     def get_prompt(self, prompt, sequence_start=True):
         if sequence_start:
@@ -360,11 +352,6 @@ def get_prompt(self, prompt, sequence_start=True):
         return f'\n{self.im_start}user\n{prompt}{self.im_end}' \
                f'\n{self.im_start}assistant\n'
 
-    @property
-    def stop_words(self):
-        """Return the stop-words' token ids."""
-        return [151645]  # <|im_end|>
-
 
 def main(model_name: str = 'test'):
     assert model_name in MODELS.module_dict.keys(), \
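What patch 1 changes, in miniature: `stop_words` moves from a read-only `@property` on each template class to an instance attribute with a per-model default, so a caller can now override it at construction time. A minimal sketch of the pattern (toy classes, not the full lmdeploy templates):

    class BaseModel:

        def __init__(self, **kwargs):
            self.stop_words = None  # base default: no stop words


    class InternLMChat7B(BaseModel):

        def __init__(self, stop_words=[103027, 103028], **kwargs):
            super().__init__(**kwargs)
            self.stop_words = stop_words  # per-model default, now overridable


    # before the patch the @property returned a fixed list; now:
    assert InternLMChat7B().stop_words == [103027, 103028]
    assert InternLMChat7B(stop_words=[103028]).stop_words == [103028]
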
From d92d132e1faf3cb2c36086f7c0b4b92deb834e97 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 11 Sep 2023 17:36:53 +0800
Subject: [PATCH 2/7] support string

---
 lmdeploy/model.py                   | 23 ++++++++++++-----------
 lmdeploy/serve/turbomind/chatbot.py | 10 ++++++++--
 lmdeploy/turbomind/turbomind.py     | 14 ++++++++++----
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index ab0493ce72..c47307afbb 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -154,7 +154,7 @@ def __init__(self,
                  eoh='<eoh>',
                  eoa='<eoa>',
                  assistant='<|Bot|>',
-                 stop_words=[103027, 103028],
+                 stop_words=['<eoa>'],
                  **kwargs):
         super().__init__(**kwargs)
         self.system = system
@@ -225,15 +225,16 @@ class Puyu(BaseModel):
     """Chat template of puyu model.This is only for internal usage in Shanghai
     AI Laboratory."""
 
-    def __init__(self,
-                 meta_instruction='',
-                 user='<|Human|>: ',
-                 eoh='',
-                 eosys='',
-                 assistant='<|Assistant|>: ',
-                 system='<|System|>: ',
-                 stop_words=[45623],
-                 **kwargs):
+    def __init__(
+            self,
+            meta_instruction='',
+            user='<|Human|>: ',
+            eoh='',
+            eosys='',
+            assistant='<|Assistant|>: ',
+            system='<|System|>: ',
+            stop_words: List[str] = None,  # set to None for protection
+            **kwargs):
         super().__init__(**kwargs)
         self.meta_instruction = meta_instruction
         self.user = user
@@ -330,7 +331,7 @@ def __init__(self,
                  im_start='<|im_start|>',
                  im_end='<|im_end|>',
                  system='You are a helpful assistant.',
-                 stop_words=[151645],
+                 stop_words=['<|im_end|>'],
                  **kwargs):
         super().__init__(**kwargs)
         self.session_len = session_len
diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index a8d32825a6..393434c533 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -385,16 +385,22 @@ def _get_eos(self):
         token_ids, _ = self.preprocess('<EOS>')
         return token_ids[0][0]
 
-    def _stop_words(self, stop_words: List[int]):
+    def _stop_words(self, stop_words: List[str]):
         """return stop-words' token ids."""
         if stop_words is None:
             return None
         assert isinstance(stop_words, List) and \
-            all(isinstance(elem, int) for elem in stop_words), \
+            all(isinstance(elem, str) for elem in stop_words), \
             f'stop_words must be a list but got {type(stop_words)}'
         # each id in stop_words represents a stop word
         # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
         # detailed explanation about turbomind's stop_words
+        stop_words = [
+            self.preprocess(stop_word)[0][0][0] for stop_word in stop_words
+        ]
+        assert isinstance(stop_words, List) and \
+            all(isinstance(elem, int) for elem in stop_words), \
+            'invalid stop_words'
         stop_word_offsets = range(1, len(stop_words) + 1)
         stop_words = np.array([[stop_words,
                                 stop_word_offsets]]).astype(np.int32)
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 807bd55c82..c0a67871ae 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -14,6 +14,7 @@
 
 import lmdeploy
 from lmdeploy.model import MODELS
+from lmdeploy.turbomind import Tokenizer
 from lmdeploy.utils import get_logger
 
 # TODO: find another way import _turbomind
@@ -22,14 +23,16 @@
 import _turbomind as _tm  # noqa: E402
 
 
-def _stop_words(stop_words: List[int]):
+def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
    """return list of stop-words to numpy.ndarray."""
    if stop_words is None:
        return None
    assert isinstance(stop_words, List) and \
-        all(isinstance(elem, int) for elem in stop_words), \
+        all(isinstance(elem, str) for elem in stop_words), \
        f'stop_words must be a list but got {type(stop_words)}'
-
+    stop_words = [tokenizer.encode(stop_word)[0] for stop_word in stop_words]
+    assert isinstance(stop_words, List) and all(
+        isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
    # each id in stop_words represents a stop word
    # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
    # detailed explanation about fastertransformer's stop_words
@@ -106,7 +109,10 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
             self.model_name = parser.get(section_name, 'model_name')
             data_type = parser.get(section_name, 'weight_type')
         model = MODELS.get(self.model_name)()
-        self.stop_words = _stop_words(model.stop_words)
+        tokenizer_model_path = osp.join(model_path, 'triton_models',
+                                        'tokenizer')
+        tokenizer = Tokenizer(tokenizer_model_path)
+        self.stop_words = _stop_words(model.stop_words, tokenizer)
 
         # params
         self.node_id = node_id
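Both `_stop_words` helpers introduced in patch 2 share one contract: encode each stop string, keep its first token id, and pack the ids together with one-based offsets into an int32 array of shape (1, 2, n), the layout turbomind/fastertransformer consumes (see the fauxpilot discussion linked in the code). A standalone sketch, with a hypothetical `encode` standing in for the real tokenizer:

    from typing import List

    import numpy as np


    def encode(word: str) -> List[int]:
        # stand-in for Tokenizer.encode; real ids come from the model's tokenizer
        return {'<eoa>': [103028], '<|im_end|>': [151645]}[word]


    def stop_words_tensor(stop_words: List[str]) -> np.ndarray:
        """Pack stop-word token ids and one-based offsets into (1, 2, n) int32."""
        ids = [encode(w)[0] for w in stop_words]  # first id of each stop word
        offsets = list(range(1, len(ids) + 1))    # cumulative end offsets
        return np.array([[ids, offsets]]).astype(np.int32)


    print(stop_words_tensor(['<eoa>', '<|im_end|>']).shape)  # (1, 2, 2)
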
From 01d441c3a80db67ebed56f33136c485a9cdc9551 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 11 Sep 2023 17:40:37 +0800
Subject: [PATCH 3/7] fix

---
 lmdeploy/serve/turbomind/chatbot.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index 393434c533..e2af63ec3b 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -396,7 +396,8 @@ def _stop_words(self, stop_words: List[str]):
         # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
         # detailed explanation about turbomind's stop_words
         stop_words = [
-            self.preprocess(stop_word)[0][0][0] for stop_word in stop_words
+            int(self.preprocess(stop_word)[0][0][0])
+            for stop_word in stop_words
         ]
         assert isinstance(stop_words, List) and \
             all(isinstance(elem, int) for elem in stop_words), \

From cdaf7f26938e8416bd573c1ee18abe14afb0bf3d Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Fri, 22 Sep 2023 15:06:22 +0800
Subject: [PATCH 4/7] remove eoa from chatbot

---
 lmdeploy/model.py                   |  2 +-
 lmdeploy/serve/turbomind/chatbot.py |  5 +++++
 lmdeploy/utils.py                   | 19 ++++++++++++++++++-
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index c3f96df581..0148574e72 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -513,7 +513,7 @@ def __init__(self,
         self.top_p = kwargs.get('top_p', 0.9)
         self.temperature = kwargs.get('temperature', 0.0)
         if self.stop_words is None:
-            self.stop_words = '<EOT>'
+            self.stop_words = ['<EOT>']
 
     def decorate_prompt(self, prompt, sequence_start=True):
         if self.capability == 'infilling':
diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index a60b4df0cf..cc12fcff3b 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -18,6 +18,7 @@
 from lmdeploy.model import MODELS
 from lmdeploy.serve.turbomind.utils import (Postprocessor, Preprocessor,
                                             prepare_tensor)
+from lmdeploy.utils import filter_suffix
 
 
 @dataclass
@@ -157,6 +158,8 @@ def stream_infer(self,
                                           request_output_len, sequence_start,
                                           sequence_end):
+            if status == StatusCode.TRITON_STREAM_END:  # remove stop_words
+                res = filter_suffix(res, self.model.stop_words)
             if status.value < 0:
                 break
             else:
@@ -346,6 +349,8 @@ def infer(self,
                                           sequence_end):
             if status.value < 0:
                 break
+            if status == StatusCode.TRITON_STREAM_END:  # remove stop_words
+                res = filter_suffix(res, self.model.stop_words)
             if status.value == 0:
                 self._session.histories = \
                     self._session.histories + self._session.prompt + \
diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py
index 7b6d51a01a..bac7ee9a19 100644
--- a/lmdeploy/utils.py
+++ b/lmdeploy/utils.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import logging
-from typing import Optional
+from typing import List, Optional
 
 logger_initialized = {}
 
@@ -77,3 +77,20 @@ def get_logger(name: str,
         logger_initialized[name] = True
 
     return logger
+
+
+def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
+    """Filter response with suffixes.
+
+    Args:
+        response (str): generated response by LLMs.
+        suffixes (str): a list of suffixes to be deleted.
+
+    Return:
+        str: a clean response.
+    """
+    if suffixes is None:
+        return response
+    for item in suffixes:
+        response = response.removesuffix(item)
+    return response
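Patches 3 and 4 together move stop-word cleanup to the text level: `_stop_words` now yields plain Python ints, and once a stream reaches TRITON_STREAM_END the chatbot trims any stop string left dangling at the end of the response with `filter_suffix`. The helper only strips a true suffix, so stop-word text in the middle of a response is preserved. A quick usage sketch of the patch-4 definition (its `removesuffix` call needs Python >= 3.9; patch 7 rewrites that later):

    from typing import List, Optional


    def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
        """Strip any of the given stop strings off the end of a response."""
        if suffixes is None:
            return response
        for item in suffixes:
            response = response.removesuffix(item)  # Python >= 3.9
        return response


    print(filter_suffix('Sure, here you go.<eoa>', ['<eoa>']))  # trailing tag removed
    print(filter_suffix('A<eoa>B', ['<eoa>']))                  # mid-string tag kept
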
From 637bfff367a27c5e85bc5b1577b9084f8f858923 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Fri, 22 Sep 2023 15:27:57 +0800
Subject: [PATCH 5/7] remove eoa of turbomind

---
 lmdeploy/turbomind/turbomind.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index bc630430a7..f8a7444546 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -168,6 +168,8 @@ def __init__(self, tm_model, cuda_stream_id=0):
         self.gpu_count = tm_model.gpu_count
 
         self.stop_words = tm_model.stop_words
+        self.stop_tokens = [] if self.stop_words is None else \
+            self.stop_words.flatten().tolist()
         self.eos_id = tm_model.eos_id
         self.session_len = tm_model.session_len
 
@@ -352,6 +354,8 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
                 output, len_ = output, len_.item()
                 if len(output) > 0 and output[-1].item() == self.eos_id:
                     outputs.append((output[:-1], len_ - 1))
+                elif len(output) > 0 and output[-1].item() in self.stop_tokens:
+                    outputs.append((output[:-1], len_))
                 else:
                     outputs.append((output, len_))

From 4f8953c4c13478eddc2335111ed40ff8c1a1b90f Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 25 Sep 2023 14:37:28 +0800
Subject: [PATCH 6/7] fix ut

---
 tests/test_lmdeploy/test_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py
index dcf04d5c28..d07e1f1f73 100644
--- a/tests/test_lmdeploy/test_model.py
+++ b/tests/test_lmdeploy/test_model.py
@@ -133,7 +133,7 @@ def test_codellama_infilling():
     '''
     _prompt = model.get_prompt(prompt)
     assert _prompt.find('<FILL>') == -1
-    assert model.stop_words == [32010]
+    assert model.stop_words == ['<EOT>']
 
     model = MODELS.get('codellama')(capability='infilling', suffix_first=True)
     _prompt = model.get_prompt(prompt)
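Patch 5's trimming rule in isolation: a trailing EOS token is dropped and the reported length shortened, while a trailing stop token is dropped with the length left as generated. A hypothetical `trim_output` helper mirroring the `elif` added above (plain ints stand in for the tensor elements):

    def trim_output(output, length, eos_id, stop_tokens):
        """Drop a trailing eos or stop token from a finished generation."""
        if output and output[-1] == eos_id:
            return output[:-1], length - 1  # eos is not counted as output
        if output and output[-1] in stop_tokens:
            return output[:-1], length      # stop word still counts toward length
        return output, length


    print(trim_output([11, 22, 2], 3, eos_id=2, stop_tokens=[151645]))       # ([11, 22], 2)
    print(trim_output([11, 22, 151645], 3, eos_id=2, stop_tokens=[151645]))  # ([11, 22], 3)
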
From dff66bc9d7153dee70b001164d5da4930493c27e Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 25 Sep 2023 16:24:32 +0800
Subject: [PATCH 7/7] suffix wheel and fix InternLM no system bug

---
 lmdeploy/model.py | 4 ++--
 lmdeploy/utils.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index 6e0ae0bdbe..ffe8dd558e 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -201,7 +201,7 @@ def decorate_prompt(self, prompt, sequence_start=True):
         assert self.capability == 'chat', \
             f'{type(self).__name__} has no capability of {self.capability}'
         if sequence_start:
-            return f'{self.user}:{prompt}{self.eoh}\n' \
+            return f'{self.system}{self.user}:{prompt}{self.eoh}\n' \
                    f'{self.assistant}:'
         else:
             return f'\n{self.user}:{prompt}{self.eoh}\n' \
                    f'{self.assistant}:'
@@ -219,7 +219,7 @@ def messages2prompt(self, messages, sequence_start=True):
         if isinstance(messages, str):
             return self.get_prompt(messages, sequence_start)
         system, users, assistants = self._translate_messages(messages)
-        ret = ''
+        ret = '' + self.system if system is None else '' + system
         for user, assistant in zip(users, assistants):
             if assistant:
                 ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \
diff --git a/lmdeploy/utils.py b/lmdeploy/utils.py
index bac7ee9a19..e284f50075 100644
--- a/lmdeploy/utils.py
+++ b/lmdeploy/utils.py
@@ -92,5 +92,6 @@ def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
     if suffixes is None:
         return response
     for item in suffixes:
-        response = response.removesuffix(item)
+        if response.endswith(item):
+            response = response[:len(response) - len(item)]
     return response
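The utils.py hunk in the last patch swaps `str.removesuffix`, which only exists on Python >= 3.9, for an `endswith` check plus slicing, presumably to keep the published wheel usable on older interpreters (the "suffix wheel" in the subject). The two spellings behave identically; a sketch with a hypothetical `strip_suffix` helper:

    def strip_suffix(text: str, suffix: str) -> str:
        # portable equivalent of text.removesuffix(suffix) (Python >= 3.9)
        if text.endswith(suffix):
            return text[:len(text) - len(suffix)]
        return text


    assert strip_suffix('hello<eoa>', '<eoa>') == 'hello'
    assert strip_suffix('hello', '<eoa>') == 'hello'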