Fix wrong eos_id and bos_id obtained through grpc api (#644)
* Fix wrong eos_id and bos_id obtained through grpc api

* fix according to review comments

* update
lvhan028 authored Nov 20, 2023
1 parent 07640a3 commit 65d735b
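What changed, in short: gRPC clients recover the model's bos/eos token ids by round-tripping the literal strings '<BOS>' and '<EOS>' through the preprocessing Triton model. Judging from the commit title and the _create_request diff below, the model used to tokenize those strings as ordinary text, so the ids that came back were wrong; it now intercepts the two sentinels and returns the configured start_id/end_id directly (or -1 when unset). A minimal client-side sketch of that round trip — the server address and the model name 'preprocessing' are illustrative assumptions, not taken from this diff:

import numpy as np
import tritonclient.grpc as grpcclient


def query_special_token_id(tritonserver_addr: str, sentinel: str) -> int:
    """Ask the preprocessing model which token id `sentinel` maps to."""
    data = np.array([[sentinel.encode()]], dtype=object)
    query = grpcclient.InferInput('QUERY', list(data.shape), 'BYTES')
    query.set_data_from_numpy(data)
    with grpcclient.InferenceServerClient(tritonserver_addr) as client:
        result = client.infer('preprocessing', [query])
    return int(result.as_numpy('INPUT_ID')[0, 0])


# bos_id = query_special_token_id('0.0.0.0:33337', '<BOS>')
# eos_id = query_special_token_id('0.0.0.0:33337', '<EOS>')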
Showing 4 changed files with 18 additions and 59 deletions.
2 changes: 1 addition & 1 deletion lmdeploy/serve/turbomind/chatbot.py
@@ -465,7 +465,7 @@ def _stream_infer(self,
             input_lengths = input_lengths - 1
         # will crash if last_token_id == eos_id and send empty input_ids
         if sequence_end and request_output_len == 0:
-            input_ids = np.array([[self.bos_id]], dtype=np.uint32)
+            input_ids = np.array([[1]], dtype=np.uint32)
             input_lengths = np.array([[1]], dtype=np.uint32)
         input_tokens = input_lengths.squeeze()
         if self.profile_generation:
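Why trade self.bos_id for a hard-coded 1 in the end-of-session branch above? A reading, not stated in the commit message: this token only pads a dummy request, and after this fix the bos id reported by the preprocessing model can legitimately be -1 for models with no BOS token (see the _create_request diff below), which a uint32 cast would silently wrap into a nonsense id:

import numpy as np

# -1 ("no BOS token") cast to uint32 wraps around:
print(np.array([[-1]], dtype=np.int32).astype(np.uint32))  # [[4294967295]]
print(np.array([[1]], dtype=np.uint32))                    # [[1]], a safe dummy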
30 changes: 15 additions & 15 deletions lmdeploy/serve/turbomind/triton_models/preprocessing/1/model.py
@@ -42,9 +42,7 @@ def initialize(self, args):
         self.model_config = model_config = json.loads(args['model_config'])

         # Parse model output configs and convert Triton types to numpy types
-        input_names = [
-            'INPUT_ID', 'REQUEST_INPUT_LEN', 'BAD_WORDS_IDS', 'STOP_WORDS_IDS'
-        ]
+        input_names = ['INPUT_ID', 'REQUEST_INPUT_LEN']
         for input_name in input_names:
             setattr(
                 self,
@@ -89,8 +87,6 @@ def execute(self, requests):
             # Get input tensors
             query = pb_utils.get_input_tensor_by_name(request,
                                                       'QUERY').as_numpy()
-            request_output_len = pb_utils.get_input_tensor_by_name(
-                request, 'REQUEST_OUTPUT_LEN').as_numpy()

             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
@@ -104,8 +100,6 @@ def execute(self, requests):
                 'REQUEST_INPUT_LEN',
                 np.array(request_input_len).astype(
                     self.request_input_len_dtype))
-            request_output_len_tensor = pb_utils.Tensor(
-                'REQUEST_OUTPUT_LEN', request_output_len)

             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
@@ -114,10 +108,8 @@ def execute(self, requests):
             #
             # pb_utils.InferenceResponse(
             #    output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(output_tensors=[
-                input_id_tensor, request_input_len_tensor,
-                request_output_len_tensor
-            ])
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[input_id_tensor, request_input_len_tensor])
             responses.append(inference_response)

             # You should return a list of pb_utils.InferenceResponse. Length
@@ -140,10 +132,18 @@ def _create_request(self, query):
         Returns:
             tuple: token ids and their length
         """
-        start_ids = [
-            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
-            for s in query
-        ]
+        start_ids = []
+        for s in query:
+            _s = s[0].decode()
+            if _s == '<BOS>':
+                start_id = [self.start_id
+                            ] if self.start_id is not None else [-1]
+            elif _s == '<EOS>':
+                start_id = [self.end_id] if self.end_id is not None else [-1]
+            else:
+                start_id = self.tokenizer.encode(_s)
+            start_ids.append(torch.IntTensor(start_id))
+
         start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
         start_ids = pad_sequence(start_ids,
                                  batch_first=True,
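The heart of the fix, restated as a self-contained sketch: queries that are exactly '<BOS>' or '<EOS>' short-circuit to the configured ids instead of going through the tokenizer. The toy tokenizer and the default ids below are placeholders, not lmdeploy's values:

import torch
from torch.nn.utils.rnn import pad_sequence


def encode_queries(queries, tokenize, start_id=1, end_id=2):
    """Mirror the branching of the new _create_request, outside Triton."""
    start_ids = []
    for s in queries:
        if s == '<BOS>':
            ids = [start_id] if start_id is not None else [-1]
        elif s == '<EOS>':
            ids = [end_id] if end_id is not None else [-1]
        else:
            ids = tokenize(s)  # ordinary text still goes through the tokenizer
        start_ids.append(torch.IntTensor(ids))
    lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
    return pad_sequence(start_ids, batch_first=True, padding_value=0), lengths


# toy tokenizer: one id per character
ids, lens = encode_queries(['<BOS>', 'hi'], lambda s: [ord(c) for c in s])
# ids -> [[1, 0], [104, 105]]; lens -> [[1], [2]]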
39 changes: 1 addition & 38 deletions lmdeploy/serve/turbomind/triton_models/preprocessing/config.pbtxt
@@ -7,53 +7,16 @@ input [
     name: "QUERY"
     data_type: TYPE_STRING
     dims: [ -1 ]
-  },
-  {
-    name: "BAD_WORDS_DICT"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-    optional: true
-  },
-  {
-    name: "STOP_WORDS_DICT"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-    optional: true
-  },
-  {
-    name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
-    dims: [ -1 ]
   }
 ]
 output [
   {
     name: "INPUT_ID"
     data_type: TYPE_UINT32
     dims: [ -1 ]
   },
   {
-    name: "REQUEST_INPUT_LEN"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-  },
-  {
-    name: "BAD_WORDS_IDS"
-    data_type: TYPE_INT32
-    dims: [ 2, -1 ]
-  },
-  {
-    name: "STOP_WORDS_IDS"
-    data_type: TYPE_INT32
-    dims: [ 2, -1 ]
-  },
-  {
-    name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
-    dims: [ -1 ]
-  },
-  {
-    name: "PROMPT_LEARNING_TASK_NAME_IDS"
+    name: "REQUEST_INPUT_LEN"
     data_type: TYPE_UINT32
     dims: [ 1 ]
   }
6 changes: 1 addition & 5 deletions lmdeploy/serve/turbomind/utils.py
@@ -48,11 +48,7 @@ def infer(self, prompts: Union[str, List[str]]) -> tuple:
             f'{type(prompts)}'

         input0_data = np.array(input0).astype(object)
-        output0_len = np.ones_like(input0).astype(np.uint32)
-        inputs = [
-            prepare_tensor('QUERY', input0_data),
-            prepare_tensor('REQUEST_OUTPUT_LEN', output0_len)
-        ]
+        inputs = [prepare_tensor('QUERY', input0_data)]

         with grpcclient.InferenceServerClient(self.tritonserver_addr) as \
                 client:
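With REQUEST_OUTPUT_LEN gone, the preprocessing client is just a string in, token ids out. A hedged usage sketch, assuming the infer method above belongs to the Preprocessor wrapper defined in this file and that the address below stands in for your tritonserver endpoint:

from lmdeploy.serve.turbomind.utils import Preprocessor

preprocessor = Preprocessor('0.0.0.0:33337')   # example address
input_ids, input_lens = preprocessor.infer('<BOS>')
print(input_ids[0, 0])  # the model's real bos_id, which this commit fixes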
