Fix crash and remove sys_instruct from chat.py and client.py (#591)
* fix crash

* update profile_generation.py

* format

* use self.bos_id

* remove sys_instruct
irexyc authored Oct 24, 2023
1 parent af2f072 commit ffe4ba9
Showing 5 changed files with 11 additions and 13 deletions.
benchmark/README.md (2 changes: 1 addition & 1 deletion)

@@ -30,7 +30,7 @@ pip install nvidia-ml-py
 ```bash
 python profile_generation.py \
 --model-path /path/to/your/model \
---concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
+--concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
 ```

 ## profile serving
benchmark/profile_generation.py (6 changes: 4 additions & 2 deletions)

@@ -90,7 +90,7 @@ def _infer(model, session_id):

 def profile_throughput(model_path: str,
                        concurrency: int = 1,
-                       input_seqlen: int = 0,
+                       input_seqlen: int = 1,
                        output_seqlen: int = 512,
                        test_round: int = 10,
                        tp: int = 1):
@@ -99,8 +99,10 @@ def profile_throughput(model_path: str,
     tm_model = TurboMind(model_path=model_path, tp=tp)

     # make up a prompt that can be tokenized into {input_seqlen} tokens
-    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
+    assert input_seqlen > 0, 'input_seqlen should > 0'
+    prompt = 'hi'
     input_ids = tokenizer.encode(prompt)
+    input_ids = input_ids * input_seqlen

     warmup(tm_model, concurrency, input_ids, output_seqlen)
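For reference, the new prompt construction can be read in isolation. This is a minimal sketch of the changed lines, not part of the commit, and it assumes `tokenizer.encode('hi')` yields a single token id (some tokenizers also prepend BOS, in which case the repeated sequence is slightly longer than `input_seqlen`):

```python
# Hypothetical standalone rendering of the new logic in profile_throughput.
def make_input_ids(tokenizer, input_seqlen: int) -> list:
    assert input_seqlen > 0, 'input_seqlen should > 0'
    input_ids = tokenizer.encode('hi')  # ideally one token id, e.g. [123]
    # list repetition yields ~input_seqlen ids (exact if 'hi' is one token)
    return input_ids * input_seqlen
```

This is why the README's `--prompt-tokens` example changed from `0` to `1`: an empty prompt can no longer be profiled.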
lmdeploy/serve/client.py (5 changes: 1 addition & 4 deletions)

@@ -20,7 +20,6 @@ def input_prompt(model_name):
 def main(tritonserver_addr: str,
          session_id: int = 1,
          cap: str = 'chat',
-         sys_instruct: str = None,
          stream_output: bool = True,
          **kwargs):
     """An example to communicate with inference server through the command line
@@ -32,13 +31,11 @@ def main(tritonserver_addr: str,
         session_id (int): the identical id of a session
         cap (str): the capability of a model. For example, codellama has
             the ability among ['completion', 'infill', 'instruct', 'python']
-        sys_instruct (str): the content of 'system' role, which is used by
-            conversational model
         stream_output (bool): indicator for streaming output or not
         **kwargs (dict): other arguments for initializing model's chat template
     """
     log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
-    kwargs.update(capability=cap, system=sys_instruct)
+    kwargs.update(capability=cap)
     chatbot = Chatbot(tritonserver_addr,
                       log_level=log_level,
                       display=stream_output,
lmdeploy/serve/turbomind/chatbot.py (4 changes: 4 additions & 0 deletions)

@@ -459,6 +459,10 @@ def _stream_infer(self,
             session.sequence_length = 0

         input_ids, input_lengths = self.preprocess(prompt)
+        # will crash if last_token_id == eos_id and send empty input_ids
+        if sequence_end and request_output_len == 0:
+            input_ids = np.array([[self.bos_id]], dtype=np.uint32)
+            input_lengths = np.array([[1]], dtype=np.uint32)
         input_tokens = input_lengths.squeeze()
         if self.profile_generation:
             yield StatusCode.TRITON_STREAM_ING, \
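This hunk is the actual crash fix. Extracted from the diff above, the guard reads as a standalone function; the `uint32` dtype and the BOS substitution follow the commit, while the function wrapper itself is only illustrative:

```python
import numpy as np

def guard_empty_input(input_ids, input_lengths, sequence_end: bool,
                      request_output_len: int, bos_id: int):
    """Illustrative wrapper around the added guard: when a session is being
    ended (sequence_end) with request_output_len == 0, substitute a single
    BOS token so the backend never receives empty input_ids."""
    if sequence_end and request_output_len == 0:
        input_ids = np.array([[bos_id]], dtype=np.uint32)
        input_lengths = np.array([[1]], dtype=np.uint32)
    return input_ids, input_lengths
```

Using `self.bos_id` (one of the commit's bullet points) keeps the dummy token model-specific rather than hard-coding an id.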
lmdeploy/turbomind/chat.py (7 changes: 1 addition & 6 deletions)

@@ -73,7 +73,6 @@ def get_gen_param(cap,
 def main(model_path,
          session_id: int = 1,
          cap: str = 'chat',
-         sys_instruct: str = None,
          tp=1,
          stream_output=True,
          **kwargs):
@@ -85,8 +84,6 @@ def main(model_path,
         session_id (int): the identical id of a session
         cap (str): the capability of a model. For example, codellama has
             the ability among ['completion', 'infilling', 'chat', 'python']
-        sys_instruct (str): the content of 'system' role, which is used by
-            conversational model
         tp (int): GPU number used in tensor parallelism
         stream_output (bool): indicator for streaming output or not
         **kwarg (dict): other arguments for initializing model's chat template
@@ -100,9 +97,7 @@ def main(model_path,
     step = 0
     seed = random.getrandbits(64)
     model_name = tm_model.model_name
-    model = MODELS.get(model_name)(capability=cap, **kwargs) \
-        if sys_instruct is None else MODELS.get(model_name)(
-            capability=cap, system=sys_instruct, **kwargs)
+    model = MODELS.get(model_name)(capability=cap, **kwargs)

     print(f'session {session_id}')
     while True:
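With `sys_instruct` removed here (and in `lmdeploy/serve/client.py` above), `**kwargs` becomes the single path into the chat template, since `main` now forwards keyword arguments straight to the model constructor. A hedged usage sketch; the `system` kwarg name is assumed from the `system=sys_instruct` call this commit removed, and whether a given template accepts it depends on the model:

```python
from lmdeploy.turbomind.chat import main

# The system prompt travels via **kwargs and reaches
# MODELS.get(model_name)(capability=cap, **kwargs) unchanged.
main('/path/to/your/model',
     cap='chat',
     system='You are a helpful assistant.')  # assumed template kwarg
```

The upshot is one fewer special-cased parameter: the conditional two-way `MODELS.get(...)` call collapses into a single constructor invocation.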
