Fix wrong eos_id and bos_id obtained through grpc api (#644)
* Fix wrong eos_id and bos_id obtained through grpc api

* fix according to review comments

* update
lvhan028 authored Nov 20, 2023
1 parent 07640a3 commit 65d735b
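What changed, in short: gRPC clients recover the model's bos/eos token ids by round-tripping the literal strings '<BOS>' and '<EOS>' through the preprocessing Triton model. Judging from the commit title and the _create_request diff below, the model used to tokenize those strings as ordinary text, so the ids that came back were wrong; it now intercepts the two sentinels and returns the configured start_id/end_id directly (or -1 when unset). A minimal client-side sketch of that round trip — the server address and the model name 'preprocessing' are illustrative assumptions, not taken from this diff:

import numpy as np
import tritonclient.grpc as grpcclient


def query_special_token_id(tritonserver_addr: str, sentinel: str) -> int:
    """Ask the preprocessing model which token id `sentinel` maps to."""
    data = np.array([[sentinel.encode()]], dtype=object)
    query = grpcclient.InferInput('QUERY', list(data.shape), 'BYTES')
    query.set_data_from_numpy(data)
    with grpcclient.InferenceServerClient(tritonserver_addr) as client:
        result = client.infer('preprocessing', [query])
    return int(result.as_numpy('INPUT_ID')[0, 0])


# bos_id = query_special_token_id('0.0.0.0:33337', '<BOS>')
# eos_id = query_special_token_id('0.0.0.0:33337', '<EOS>')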
Showing 4 changed files with 18 additions and 59 deletions.
2 changes: 1 addition & 1 deletion lmdeploy/serve/turbomind/chatbot.py
@@ -465,7 +465,7 @@ def _stream_infer(self,
             input_lengths = input_lengths - 1
         # will crash if last_token_id == eos_id and send empty input_ids
         if sequence_end and request_output_len == 0:
-            input_ids = np.array([[self.bos_id]], dtype=np.uint32)
+            input_ids = np.array([[1]], dtype=np.uint32)
             input_lengths = np.array([[1]], dtype=np.uint32)
         input_tokens = input_lengths.squeeze()
         if self.profile_generation:
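Why trade self.bos_id for a hard-coded 1 in the end-of-session branch above? A reading, not stated in the commit message: this token only pads a dummy request, and after this fix the bos id reported by the preprocessing model can legitimately be -1 for models with no BOS token (see the _create_request diff below), which a uint32 cast would silently wrap into a nonsense id:

import numpy as np

# -1 ("no BOS token") cast to uint32 wraps around:
print(np.array([[-1]], dtype=np.int32).astype(np.uint32))  # [[4294967295]]
print(np.array([[1]], dtype=np.uint32))                    # [[1]], a safe dummy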
30 changes: 15 additions & 15 deletions lmdeploy/serve/turbomind/triton_models/preprocessing/1/model.py
@@ -42,9 +42,7 @@ def initialize(self, args):
         self.model_config = model_config = json.loads(args['model_config'])

         # Parse model output configs and convert Triton types to numpy types
-        input_names = [
-            'INPUT_ID', 'REQUEST_INPUT_LEN', 'BAD_WORDS_IDS', 'STOP_WORDS_IDS'
-        ]
+        input_names = ['INPUT_ID', 'REQUEST_INPUT_LEN']
         for input_name in input_names:
             setattr(
                 self,
@@ -89,8 +87,6 @@ def execute(self, requests):
             # Get input tensors
             query = pb_utils.get_input_tensor_by_name(request,
                                                       'QUERY').as_numpy()
-            request_output_len = pb_utils.get_input_tensor_by_name(
-                request, 'REQUEST_OUTPUT_LEN').as_numpy()

             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
@@ -104,8 +100,6 @@ def execute(self, requests):
                 'REQUEST_INPUT_LEN',
                 np.array(request_input_len).astype(
                     self.request_input_len_dtype))
-            request_output_len_tensor = pb_utils.Tensor(
-                'REQUEST_OUTPUT_LEN', request_output_len)

             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
@@ -114,10 +108,8 @@ def execute(self, requests):
             #
             # pb_utils.InferenceResponse(
             #    output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(output_tensors=[
-                input_id_tensor, request_input_len_tensor,
-                request_output_len_tensor
-            ])
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[input_id_tensor, request_input_len_tensor])
             responses.append(inference_response)

             # You should return a list of pb_utils.InferenceResponse. Length
@@ -140,10 +132,18 @@ def _create_request(self, query):
         Returns:
             tuple: token ids and their length
         """
-        start_ids = [
-            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
-            for s in query
-        ]
+        start_ids = []
+        for s in query:
+            _s = s[0].decode()
+            if _s == '<BOS>':
+                start_id = [self.start_id
+                            ] if self.start_id is not None else [-1]
+            elif _s == '<EOS>':
+                start_id = [self.end_id] if self.end_id is not None else [-1]
+            else:
+                start_id = self.tokenizer.encode(_s)
+            start_ids.append(torch.IntTensor(start_id))
+
         start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
         start_ids = pad_sequence(start_ids,
                                  batch_first=True,
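The heart of the fix, restated as a self-contained sketch: queries that are exactly '<BOS>' or '<EOS>' short-circuit to the configured ids instead of going through the tokenizer. The toy tokenizer and the default ids below are placeholders, not lmdeploy's values:

import torch
from torch.nn.utils.rnn import pad_sequence


def encode_queries(queries, tokenize, start_id=1, end_id=2):
    """Mirror the branching of the new _create_request, outside Triton."""
    start_ids = []
    for s in queries:
        if s == '<BOS>':
            ids = [start_id] if start_id is not None else [-1]
        elif s == '<EOS>':
            ids = [end_id] if end_id is not None else [-1]
        else:
            ids = tokenize(s)  # ordinary text still goes through the tokenizer
        start_ids.append(torch.IntTensor(ids))
    lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
    return pad_sequence(start_ids, batch_first=True, padding_value=0), lengths


# toy tokenizer: one id per character
ids, lens = encode_queries(['<BOS>', 'hi'], lambda s: [ord(c) for c in s])
# ids -> [[1, 0], [104, 105]]; lens -> [[1], [2]]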
39 changes: 1 addition & 38 deletions lmdeploy/serve/turbomind/triton_models/preprocessing/config.pbtxt
@@ -7,53 +7,16 @@ input [
     name: "QUERY"
     data_type: TYPE_STRING
     dims: [ -1 ]
-  },
-  {
-    name: "BAD_WORDS_DICT"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-    optional: true
-  },
-  {
-    name: "STOP_WORDS_DICT"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-    optional: true
-  },
-  {
-    name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
-    dims: [ -1 ]
   }
 ]
 output [
   {
     name: "INPUT_ID"
     data_type: TYPE_UINT32
     dims: [ -1 ]
   },
   {
-    name: "REQUEST_INPUT_LEN"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-  },
-  {
-    name: "BAD_WORDS_IDS"
-    data_type: TYPE_INT32
-    dims: [ 2, -1 ]
-  },
-  {
-    name: "STOP_WORDS_IDS"
-    data_type: TYPE_INT32
-    dims: [ 2, -1 ]
-  },
-  {
-    name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
-    dims: [ -1 ]
-  },
-  {
-    name: "PROMPT_LEARNING_TASK_NAME_IDS"
+    name: "REQUEST_INPUT_LEN"
     data_type: TYPE_UINT32
     dims: [ 1 ]
   }
6 changes: 1 addition & 5 deletions lmdeploy/serve/turbomind/utils.py
@@ -48,11 +48,7 @@ def infer(self, prompts: Union[str, List[str]]) -> tuple:
             f'{type(prompts)}'

         input0_data = np.array(input0).astype(object)
-        output0_len = np.ones_like(input0).astype(np.uint32)
-        inputs = [
-            prepare_tensor('QUERY', input0_data),
-            prepare_tensor('REQUEST_OUTPUT_LEN', output0_len)
-        ]
+        inputs = [prepare_tensor('QUERY', input0_data)]

         with grpcclient.InferenceServerClient(self.tritonserver_addr) as \
                 client:
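With REQUEST_OUTPUT_LEN gone, the preprocessing client is just a string in, token ids out. A hedged usage sketch, assuming the infer method above belongs to the Preprocessor wrapper defined in this file and that the address below stands in for your tritonserver endpoint:

from lmdeploy.serve.turbomind.utils import Preprocessor

preprocessor = Preprocessor('0.0.0.0:33337')   # example address
input_ids, input_lens = preprocessor.infer('<BOS>')
print(input_ids[0, 0])  # the model's real bos_id, which this commit fixes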
