diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 1ea18c1d3300e..1dee798858a16 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -146,9 +146,8 @@ async def create_chat_completion(
             sampling_params = request.to_sampling_params(
                 tokenizer, guided_decode_logits_processor)
             if sampling_params.max_tokens is None:
-                sampling_params.max_tokens = (
-                    self.max_model_len -
-                    len(prompt_inputs["prompt_token_ids"]))
+                sampling_params.max_tokens = \
+                    self.max_model_len - len(prompt_inputs["prompt_token_ids"])
 
             self._log_inputs(request_id,
                              prompt_inputs,
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 226cb5e04c5dd..51de2f1826554 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -108,9 +108,8 @@ async def create_completion(self, request: CompletionRequest,
                 sampling_params = request.to_sampling_params(
                     tokenizer, guided_decode_logits_processor)
                 if sampling_params.max_tokens is None:
-                    sampling_params.max_tokens = (
-                        self.max_model_len -
-                        len(prompt_inputs["prompt_token_ids"]))
+                    sampling_params.max_tokens = self.max_model_len - \
+                        len(prompt_inputs["prompt_token_ids"])
 
                 request_id_item = f"{request_id}-{i}"
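
Both hunks only reflow the same expression; the behaviour they touch is that a request without an explicit max_tokens falls back to the remaining context budget (model context length minus prompt length). A minimal standalone sketch of that computation, assuming nothing about the vLLM internals beyond what the diff shows (the function name and parameters below are illustrative, not part of the vLLM API):

```python
def default_max_tokens(max_model_len: int,
                       prompt_token_ids: list[int],
                       requested_max_tokens: int | None = None) -> int:
    """Illustrative sketch: effective max_tokens for a request.

    Mirrors the logic in the diff: an explicit value wins; otherwise the
    budget is whatever room the prompt leaves in the context window.
    """
    if requested_max_tokens is not None:
        return requested_max_tokens
    # Remaining room in the context window after the prompt.
    return max_model_len - len(prompt_token_ids)


# Example: a 4096-token context with a 1000-token prompt and no explicit
# max_tokens allows up to 3096 generated tokens.
assert default_max_tokens(4096, list(range(1000))) == 3096
```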