Fix crash and remove sys_instruct from chat.py and client.py (#591)
* fix crash

* update profile_generation.py

* format

* use self.bos_id

* remove sys_instruct
irexyc authored Oct 24, 2023
1 parent af2f072 commit ffe4ba9
Showing 5 changed files with 11 additions and 13 deletions.
benchmark/README.md (2 changes: 1 addition & 1 deletion)

@@ -30,7 +30,7 @@ pip install nvidia-ml-py
 ```bash
 python profile_generation.py \
 --model-path /path/to/your/model \
---concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
+--concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
 ```

 ## profile serving
benchmark/profile_generation.py (6 changes: 4 additions & 2 deletions)

@@ -90,7 +90,7 @@ def _infer(model, session_id):

 def profile_throughput(model_path: str,
                        concurrency: int = 1,
-                       input_seqlen: int = 0,
+                       input_seqlen: int = 1,
                        output_seqlen: int = 512,
                        test_round: int = 10,
                        tp: int = 1):
@@ -99,8 +99,10 @@ def profile_throughput(model_path: str,
     tm_model = TurboMind(model_path=model_path, tp=tp)

     # make up a prompt that can be tokenized into {input_seqlen} tokens
-    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
+    assert input_seqlen > 0, 'input_seqlen should > 0'
+    prompt = 'hi'
     input_ids = tokenizer.encode(prompt)
+    input_ids = input_ids * input_seqlen

     warmup(tm_model, concurrency, input_ids, output_seqlen)
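For reference, the new prompt construction can be read in isolation. This is a minimal sketch of the changed lines, not part of the commit, and it assumes `tokenizer.encode('hi')` yields a single token id (some tokenizers also prepend BOS, in which case the repeated sequence is slightly longer than `input_seqlen`):

```python
# Hypothetical standalone rendering of the new logic in profile_throughput.
def make_input_ids(tokenizer, input_seqlen: int) -> list:
    assert input_seqlen > 0, 'input_seqlen should > 0'
    input_ids = tokenizer.encode('hi')  # ideally one token id, e.g. [123]
    # list repetition yields ~input_seqlen ids (exact if 'hi' is one token)
    return input_ids * input_seqlen
```

This is why the README's `--prompt-tokens` example changed from `0` to `1`: an empty prompt can no longer be profiled.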
lmdeploy/serve/client.py (5 changes: 1 addition & 4 deletions)

@@ -20,7 +20,6 @@ def input_prompt(model_name):
 def main(tritonserver_addr: str,
          session_id: int = 1,
          cap: str = 'chat',
-         sys_instruct: str = None,
          stream_output: bool = True,
          **kwargs):
     """An example to communicate with inference server through the command line
@@ -32,13 +31,11 @@ def main(tritonserver_addr: str,
         session_id (int): the identical id of a session
         cap (str): the capability of a model. For example, codellama has
             the ability among ['completion', 'infill', 'instruct', 'python']
-        sys_instruct (str): the content of 'system' role, which is used by
-            conversational model
         stream_output (bool): indicator for streaming output or not
         **kwargs (dict): other arguments for initializing model's chat template
     """
     log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
-    kwargs.update(capability=cap, system=sys_instruct)
+    kwargs.update(capability=cap)
     chatbot = Chatbot(tritonserver_addr,
                       log_level=log_level,
                       display=stream_output,
lmdeploy/serve/turbomind/chatbot.py (4 changes: 4 additions & 0 deletions)

@@ -459,6 +459,10 @@ def _stream_infer(self,
             session.sequence_length = 0

         input_ids, input_lengths = self.preprocess(prompt)
+        # will crash if last_token_id == eos_id and send empty input_ids
+        if sequence_end and request_output_len == 0:
+            input_ids = np.array([[self.bos_id]], dtype=np.uint32)
+            input_lengths = np.array([[1]], dtype=np.uint32)
         input_tokens = input_lengths.squeeze()
         if self.profile_generation:
             yield StatusCode.TRITON_STREAM_ING, \
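This hunk is the actual crash fix. Extracted from the diff above, the guard reads as a standalone function; the `uint32` dtype and the BOS substitution follow the commit, while the function wrapper itself is only illustrative:

```python
import numpy as np

def guard_empty_input(input_ids, input_lengths, sequence_end: bool,
                      request_output_len: int, bos_id: int):
    """Illustrative wrapper around the added guard: when a session is being
    ended (sequence_end) with request_output_len == 0, substitute a single
    BOS token so the backend never receives empty input_ids."""
    if sequence_end and request_output_len == 0:
        input_ids = np.array([[bos_id]], dtype=np.uint32)
        input_lengths = np.array([[1]], dtype=np.uint32)
    return input_ids, input_lengths
```

Using `self.bos_id` (one of the commit's bullet points) keeps the dummy token model-specific rather than hard-coding an id.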
lmdeploy/turbomind/chat.py (7 changes: 1 addition & 6 deletions)

@@ -73,7 +73,6 @@ def get_gen_param(cap,
 def main(model_path,
          session_id: int = 1,
          cap: str = 'chat',
-         sys_instruct: str = None,
          tp=1,
          stream_output=True,
          **kwargs):
@@ -85,8 +84,6 @@ def main(model_path,
         session_id (int): the identical id of a session
         cap (str): the capability of a model. For example, codellama has
             the ability among ['completion', 'infilling', 'chat', 'python']
-        sys_instruct (str): the content of 'system' role, which is used by
-            conversational model
         tp (int): GPU number used in tensor parallelism
         stream_output (bool): indicator for streaming output or not
         **kwarg (dict): other arguments for initializing model's chat template
@@ -100,9 +97,7 @@ def main(model_path,
     step = 0
     seed = random.getrandbits(64)
     model_name = tm_model.model_name
-    model = MODELS.get(model_name)(capability=cap, **kwargs) \
-        if sys_instruct is None else MODELS.get(model_name)(
-            capability=cap, system=sys_instruct, **kwargs)
+    model = MODELS.get(model_name)(capability=cap, **kwargs)

     print(f'session {session_id}')
     while True:
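With `sys_instruct` removed here (and in `lmdeploy/serve/client.py` above), `**kwargs` becomes the single path into the chat template, since `main` now forwards keyword arguments straight to the model constructor. A hedged usage sketch; the `system` kwarg name is assumed from the `system=sys_instruct` call this commit removed, and whether a given template accepts it depends on the model:

```python
from lmdeploy.turbomind.chat import main

# The system prompt travels via **kwargs and reaches
# MODELS.get(model_name)(capability=cap, **kwargs) unchanged.
main('/path/to/your/model',
     cap='chat',
     system='You are a helpful assistant.')  # assumed template kwarg
```

The upshot is one fewer special-cased parameter: the conditional two-way `MODELS.get(...)` call collapses into a single constructor invocation.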
