From 389fd9eced8f7ae253dccdca67d5e9fd13358c60 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Tue, 29 Oct 2024 16:31:53 +0100 Subject: [PATCH 1/7] feat(openai): add new field max_completion_tokens Signed-off-by: Guillaume Calmettes --- vllm/entrypoints/openai/protocol.py | 10 +++++++--- vllm/entrypoints/openai/serving_engine.py | 14 ++++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 7f270a81a7692..565009660f60b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -159,7 +159,9 @@ class ChatCompletionRequest(OpenAIBaseModel): logit_bias: Optional[Dict[str, float]] = None logprobs: Optional[bool] = False top_logprobs: Optional[int] = 0 - max_tokens: Optional[int] = None + # TODO: remove max_tokens when deprecated field max_tokens is removed from OpenAI API + max_tokens: Optional[int] = Field(default=None, deprecated='max_tokens is now deprecated in favor of the max_completion_tokens field') + max_completion_tokens: Optional[int] = None n: Optional[int] = 1 presence_penalty: Optional[float] = 0.0 response_format: Optional[ResponseFormat] = None @@ -295,7 +297,8 @@ class ChatCompletionRequest(OpenAIBaseModel): def to_beam_search_params(self, default_max_tokens: int) -> BeamSearchParams: - max_tokens = self.max_tokens + # TODO: remove self.max_tokens when deprecated field max_tokens is removed from OpenAI API + max_tokens = self.max_completion_tokens or self.max_tokens if max_tokens is None: max_tokens = default_max_tokens @@ -311,7 +314,8 @@ def to_beam_search_params(self, include_stop_str_in_output=self.include_stop_str_in_output) def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: - max_tokens = self.max_tokens + # TODO: remove self.max_tokens when field deprecated max_tokens is removed from OpenAI API + max_tokens = self.max_completion_tokens or self.max_tokens if max_tokens is None: max_tokens = default_max_tokens diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index e6d2ab93d3363..0f522ff5379f9 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -263,20 +263,26 @@ def _validate_input( return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) - if request.max_tokens is None: + # max_tokens is deprecated in favor of max_completion_tokens for the OpenAI chat completion endpoint + if isinstance(request, ChatCompletionRequest): + # TODO: remove self.max_tokens when deprecated field max_tokens is removed from OpenAI API + max_tokens = request.max_completion_tokens or request.max_tokens + else: + max_tokens = request.max_tokens + if max_tokens is None: if token_num >= self.max_model_len: raise ValueError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " f"{token_num} tokens in the messages, " f"Please reduce the length of the messages.") - elif token_num + request.max_tokens > self.max_model_len: + elif token_num + max_tokens > self.max_model_len: raise ValueError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " - f"{request.max_tokens + token_num} tokens " + f"{max_tokens + token_num} tokens " f"({token_num} in the messages, " - f"{request.max_tokens} in the completion). " + f"{max_tokens} in the completion). 
" f"Please reduce the length of the messages or completion.") return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) From c5371927172504d87dc465ca13f9bc9bdaec9398 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 30 Oct 2024 12:03:30 +0100 Subject: [PATCH 2/7] feat: use new max_completion_tokens field for chat_completions calls Signed-off-by: Guillaume Calmettes --- benchmarks/backend_request_func.py | 2 +- docs/source/serving/run_on_sky.rst | 6 +-- examples/offline_inference_openai.md | 8 +-- examples/openai_api_client_for_multimodal.py | 12 ++--- examples/openai_example_batch.jsonl | 4 +- tests/entrypoints/openai/test_audio.py | 14 ++--- tests/entrypoints/openai/test_chat.py | 56 ++++++++++---------- tests/entrypoints/openai/test_vision.py | 20 +++---- tests/tool_use/test_chat_completions.py | 8 +-- tests/tool_use/test_parallel_tool_calls.py | 8 +-- tests/tool_use/test_tool_calls.py | 8 +-- 11 files changed, 73 insertions(+), 73 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 4813fde27f0bc..0a903877f000d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -324,7 +324,7 @@ async def async_request_openai_chat_completions( }, ], "temperature": 0.0, - "max_tokens": request_func_input.output_len, + "max_completion_tokens": request_func_input.output_len, "stream": True, "ignore_eos": request_func_input.ignore_eos, } diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst index 674b14a879bc3..227e6fd2a7818 100644 --- a/docs/source/serving/run_on_sky.rst +++ b/docs/source/serving/run_on_sky.rst @@ -109,7 +109,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 .. raw:: html @@ -129,7 +129,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. @@ -255,7 +255,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica. messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. 
diff --git a/examples/offline_inference_openai.md b/examples/offline_inference_openai.md index ea34374edd3f9..4c64197975534 100644 --- a/examples/offline_inference_openai.md +++ b/examples/offline_inference_openai.md @@ -35,8 +35,8 @@ ``` $ cat openai_example_batch.jsonl -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` ### Step 2: Run the batch @@ -94,8 +94,8 @@ To follow along with this example, you can download the example batch, or create ``` $ cat openai_example_batch.jsonl -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` Now upload your batch file to your S3 bucket. diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_api_client_for_multimodal.py index beb83e494ed0b..0ec4f71dddf93 100644 --- a/examples/openai_api_client_for_multimodal.py +++ b/examples/openai_api_client_for_multimodal.py @@ -53,7 +53,7 @@ def run_text_only() -> None: "content": "What's the capital of France?" 
}], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion.choices[0].message.content @@ -83,7 +83,7 @@ def run_single_image() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -109,7 +109,7 @@ def run_single_image() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_base64.choices[0].message.content @@ -144,7 +144,7 @@ def run_multi_image() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -175,7 +175,7 @@ def run_audio() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -201,7 +201,7 @@ def run_audio() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_base64.choices[0].message.content diff --git a/examples/openai_example_batch.jsonl b/examples/openai_example_batch.jsonl index 5aa7e185c180a..54ac8c813ddb7 100644 --- a/examples/openai_example_batch.jsonl +++ b/examples/openai_example_batch.jsonl @@ -1,2 +1,2 @@ -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index df8a140283fbb..1184731d295aa 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -70,7 +70,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, # test single completion chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -91,7 +91,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -125,7 +125,7 @@ async def test_single_chat_session_audio_base64encoded( # test single completion chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5) assert len(chat_completion.choices) == 1 
@@ -146,7 +146,7 @@ async def test_single_chat_session_audio_base64encoded( chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -178,7 +178,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -188,7 +188,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -242,7 +242,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d1aebbd70d256..60a1367d70462 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -67,7 +67,7 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=5, + max_completion_tokens=5, temperature=0.0, logprobs=False) @@ -92,7 +92,7 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=5, + max_completion_tokens=5, temperature=0.0, logprobs=True, top_logprobs=0) @@ -119,7 +119,7 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=5, + max_completion_tokens=5, temperature=0.0, logprobs=True, top_logprobs=5) @@ -149,7 +149,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, with pytest.raises((openai.BadRequestError, openai.APIError)): stream = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=21, stream=True) @@ -159,7 +159,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, with pytest.raises(openai.BadRequestError): await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=30, stream=False) @@ -167,7 +167,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, # the server should still work afterwards chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, stream=False) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -273,7 +273,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, # test single completion chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5) assert chat_completion.id is not None @@ -294,7 +294,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, 
chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -319,7 +319,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -329,7 +329,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -369,7 +369,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, stream_options={"include_usage": False}) @@ -380,7 +380,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, # "continuous_usage_stats": False}} stream = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, stream_options={ @@ -409,7 +409,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": None}) @@ -419,7 +419,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": True}) @@ -429,7 +429,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(min_tokens=10), temperature=0.0, stream=True, @@ -476,7 +476,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(guided_choice=sample_guided_choice, guided_decoding_backend=guided_decoding_backend)) choice1 = chat_completion.choices[0].message.content @@ -490,7 +490,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(guided_choice=sample_guided_choice, guided_decoding_backend=guided_decoding_backend)) choice2 = chat_completion.choices[0].message.content @@ -517,7 +517,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, extra_body=dict(guided_json=sample_json_schema, guided_decoding_backend=guided_decoding_backend)) message = chat_completion.choices[0].message @@ -535,7 +535,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, 
- max_tokens=1000, + max_completion_tokens=1000, extra_body=dict(guided_json=sample_json_schema, guided_decoding_backend=guided_decoding_backend)) message = chat_completion.choices[0].message @@ -563,7 +563,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=20, + max_completion_tokens=20, extra_body=dict(guided_regex=sample_regex, guided_decoding_backend=guided_decoding_backend)) ip1 = chat_completion.choices[0].message.content @@ -575,7 +575,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=20, + max_completion_tokens=20, extra_body=dict(guided_regex=sample_regex, guided_decoding_backend=guided_decoding_backend)) ip2 = chat_completion.choices[0].message.content @@ -623,7 +623,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5, extra_body=dict(guided_choice=sample_guided_choice, @@ -660,7 +660,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -694,7 +694,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -750,7 +750,7 @@ async def test_required_tool_use_not_yet_supported( await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -765,7 +765,7 @@ async def test_required_tool_use_not_yet_supported( await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -796,7 +796,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, with pytest.raises(openai.BadRequestError): await client.chat.completions.create(model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tool_choice={ "type": "function", "function": { @@ -809,7 +809,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 68804d6833c73..6bc7de03f7133 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -80,7 +80,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, # test single completion chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -101,7 +101,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + 
max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -134,7 +134,7 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, model=model_name, messages=messages, n=2, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5, extra_body=dict(use_beam_search=True)) @@ -171,7 +171,7 @@ async def test_single_chat_session_image_base64encoded( # test single completion chat_completion = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5) assert len(chat_completion.choices) == 1 @@ -192,7 +192,7 @@ async def test_single_chat_session_image_base64encoded( chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -226,7 +226,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( model=model_name, messages=messages, n=2, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(use_beam_search=True)) assert len(chat_completion.choices) == 2 assert chat_completion.choices[ @@ -259,7 +259,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -269,7 +269,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -320,7 +320,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) @@ -337,7 +337,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) message = chat_completion.choices[0].message diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 8e7cb9f5d3d90..75bbfbb766931 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -18,7 +18,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, - max_tokens=150, + max_completion_tokens=150, model=model_name, logprobs=False) choice = chat_completion.choices[0] @@ -38,7 +38,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, - max_tokens=150, + max_completion_tokens=150, model=model_name, logprobs=False, stream=True, @@ -86,7 +86,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, - max_tokens=150, + 
max_completion_tokens=150, model=model_name, tools=[WEATHER_TOOL], logprobs=False) @@ -107,7 +107,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, - max_tokens=150, + max_completion_tokens=150, model=model_name, logprobs=False, tools=[WEATHER_TOOL], diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index cff3c8a556ca4..c294cb04919fa 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -26,7 +26,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, temperature=0, - max_tokens=200, + max_completion_tokens=200, model=model_name, tools=[WEATHER_TOOL, SEARCH_TOOL], logprobs=False) @@ -63,7 +63,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI, model=model_name, messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, temperature=0, - max_tokens=200, + max_completion_tokens=200, tools=[WEATHER_TOOL, SEARCH_TOOL], logprobs=False, stream=True) @@ -154,7 +154,7 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, temperature=0, - max_tokens=200, + max_completion_tokens=200, model=model_name, tools=[WEATHER_TOOL, SEARCH_TOOL], logprobs=False) @@ -172,7 +172,7 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, temperature=0, - max_tokens=200, + max_completion_tokens=200, model=model_name, tools=[WEATHER_TOOL, SEARCH_TOOL], logprobs=False, diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index 9e6d715f44fcf..fe8cb496c9741 100644 --- a/tests/tool_use/test_tool_calls.py +++ b/tests/tool_use/test_tool_calls.py @@ -17,7 +17,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): chat_completion = await client.chat.completions.create( messages=MESSAGES_ASKING_FOR_TOOLS, temperature=0, - max_tokens=100, + max_completion_tokens=100, model=model_name, tools=[WEATHER_TOOL, SEARCH_TOOL], logprobs=False) @@ -61,7 +61,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): model=model_name, messages=MESSAGES_ASKING_FOR_TOOLS, temperature=0, - max_tokens=100, + max_completion_tokens=100, tools=[WEATHER_TOOL, SEARCH_TOOL], logprobs=False, stream=True) @@ -142,7 +142,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI): chat_completion = await client.chat.completions.create( messages=MESSAGES_WITH_TOOL_RESPONSE, temperature=0, - max_tokens=100, + max_completion_tokens=100, model=model_name, tools=[WEATHER_TOOL, SEARCH_TOOL], logprobs=False) @@ -159,7 +159,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI): stream = await client.chat.completions.create( messages=MESSAGES_WITH_TOOL_RESPONSE, temperature=0, - max_tokens=100, + max_completion_tokens=100, model=model_name, tools=[WEATHER_TOOL, SEARCH_TOOL], logprobs=False, From acac142639a4b89f5b4b51d970bf4edf08834846 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 30 Oct 2024 12:16:11 +0100 Subject: [PATCH 3/7] lint: apply formatting Signed-off-by: Guillaume Calmettes --- tests/entrypoints/openai/test_audio.py | 22 +++++---- 
tests/entrypoints/openai/test_chat.py | 57 ++++++++++++----------- tests/entrypoints/openai/test_vision.py | 22 +++++---- vllm/entrypoints/openai/protocol.py | 13 ++++-- vllm/entrypoints/openai/serving_engine.py | 4 +- 5 files changed, 66 insertions(+), 52 deletions(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 1184731d295aa..a74109e2f5120 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -68,11 +68,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -123,11 +124,12 @@ async def test_single_chat_session_audio_base64encoded( }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 60a1367d70462..8d13f64dce01c 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -65,11 +65,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" }] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=5, - temperature=0.0, - logprobs=False) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=False) choice = chat_completion.choices[0] assert choice.logprobs is None @@ -90,12 +91,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" }] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=0) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=0) choice = chat_completion.choices[0] assert choice.logprobs is not None @@ -117,12 +119,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" 
}] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=5) choice = chat_completion.choices[0] assert choice.logprobs is not None @@ -165,10 +168,11 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, stream=False) # the server should still work afterwards - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - stream=False) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + stream=False) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -271,11 +275,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert chat_completion.id is not None assert len(chat_completion.choices) == 1 diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 6bc7de03f7133..157d873a75b4d 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -78,11 +78,12 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -169,11 +170,12 @@ async def test_single_chat_session_image_base64encoded( }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_completion_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 565009660f60b..27a7f61fdd824 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -159,8 +159,12 @@ class ChatCompletionRequest(OpenAIBaseModel): logit_bias: Optional[Dict[str, float]] = None logprobs: Optional[bool] = False top_logprobs: Optional[int] = 0 - # TODO: remove max_tokens when deprecated field max_tokens is removed from OpenAI API - max_tokens: Optional[int] = Field(default=None, deprecated='max_tokens is now deprecated in favor of the max_completion_tokens field') + # TODO: remove max_tokens when field is removed from OpenAI API + max_tokens: Optional[int] = Field( + default=None, + deprecated= + 'max_tokens is deprecated in favor of the max_completion_tokens field' + ) 
max_completion_tokens: Optional[int] = None n: Optional[int] = 1 presence_penalty: Optional[float] = 0.0 @@ -297,7 +301,8 @@ class ChatCompletionRequest(OpenAIBaseModel): def to_beam_search_params(self, default_max_tokens: int) -> BeamSearchParams: - # TODO: remove self.max_tokens when deprecated field max_tokens is removed from OpenAI API + + # TODO: remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens if max_tokens is None: max_tokens = default_max_tokens @@ -314,7 +319,7 @@ def to_beam_search_params(self, include_stop_str_in_output=self.include_stop_str_in_output) def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: - # TODO: remove self.max_tokens when field deprecated max_tokens is removed from OpenAI API + # TODO: remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens if max_tokens is None: max_tokens = default_max_tokens diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 0f522ff5379f9..2ec6b6703ff1c 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -263,9 +263,9 @@ def _validate_input( return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) - # max_tokens is deprecated in favor of max_completion_tokens for the OpenAI chat completion endpoint + # chat completion endpoint supports max_completion_tokens if isinstance(request, ChatCompletionRequest): - # TODO: remove self.max_tokens when deprecated field max_tokens is removed from OpenAI API + # TODO: remove max_tokens when field is removed from OpenAI API max_tokens = request.max_completion_tokens or request.max_tokens else: max_tokens = request.max_tokens From cf5041b92bca25673872dbf746eda19c62e2b25c Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 30 Oct 2024 12:36:24 +0100 Subject: [PATCH 4/7] feat: bump openai dependency to version introducing max_completion_tokens Signed-off-by: Guillaume Calmettes --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index d72cc44762720..7adc87f7b151c 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -10,7 +10,7 @@ protobuf # Required by LlamaTokenizer. 
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' aiohttp -openai >= 1.40.0 # Ensure modern openai package (ensure types module present) +openai >= 1.45.0 # Ensure modern openai package (ensure types module present) uvicorn[standard] pydantic >= 2.9 # Required for fastapi >= 0.113.0 pillow # Required for image processing From b1621bfea7d31baab376295d504b79d761f482c6 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 30 Oct 2024 12:47:15 +0100 Subject: [PATCH 5/7] lint: apply yapf Signed-off-by: Guillaume Calmettes --- vllm/entrypoints/openai/protocol.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 27a7f61fdd824..76bc218138151 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -163,8 +163,7 @@ class ChatCompletionRequest(OpenAIBaseModel): max_tokens: Optional[int] = Field( default=None, deprecated= - 'max_tokens is deprecated in favor of the max_completion_tokens field' - ) + 'max_tokens is deprecated in favor of the max_completion_tokens field') max_completion_tokens: Optional[int] = None n: Optional[int] = 1 presence_penalty: Optional[float] = 0.0 From 520be809917cb59b723b7c3fd0695f2ce260e519 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 30 Oct 2024 12:57:00 +0100 Subject: [PATCH 6/7] chore: reflect max_completion_tokens support for minimal openai lib version Signed-off-by: Guillaume Calmettes --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index 7adc87f7b151c..ef5ed8b645158 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -10,7 +10,7 @@ protobuf # Required by LlamaTokenizer. 
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' aiohttp -openai >= 1.45.0 # Ensure modern openai package (ensure types module present) +openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support) uvicorn[standard] pydantic >= 2.9 # Required for fastapi >= 0.113.0 pillow # Required for image processing From aad5b19ec060ca26dc4e49606d2ab6fa3f34fb32 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 30 Oct 2024 17:40:03 +0100 Subject: [PATCH 7/7] feat: tie TODO comments to relevant issue for better referencing Signed-off-by: Guillaume Calmettes --- vllm/entrypoints/openai/protocol.py | 7 +++---- vllm/entrypoints/openai/serving_engine.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 76bc218138151..60fc5ac8d11d2 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -159,7 +159,7 @@ class ChatCompletionRequest(OpenAIBaseModel): logit_bias: Optional[Dict[str, float]] = None logprobs: Optional[bool] = False top_logprobs: Optional[int] = 0 - # TODO: remove max_tokens when field is removed from OpenAI API + # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens: Optional[int] = Field( default=None, deprecated= @@ -300,8 +300,7 @@ class ChatCompletionRequest(OpenAIBaseModel): def to_beam_search_params(self, default_max_tokens: int) -> BeamSearchParams: - - # TODO: remove max_tokens when field is removed from OpenAI API + # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens if max_tokens is None: max_tokens = default_max_tokens @@ -318,7 +317,7 @@ def to_beam_search_params(self, include_stop_str_in_output=self.include_stop_str_in_output) def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: - # TODO: remove max_tokens when field is removed from OpenAI API + # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens if max_tokens is None: max_tokens = default_max_tokens diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 2ec6b6703ff1c..22a01b3dc4cc0 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -265,7 +265,7 @@ def _validate_input( # chat completion endpoint supports max_completion_tokens if isinstance(request, ChatCompletionRequest): - # TODO: remove max_tokens when field is removed from OpenAI API + # TODO(#9845): remove max_tokens when field dropped from OpenAI API max_tokens = request.max_completion_tokens or request.max_tokens else: max_tokens = request.max_tokens
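
---

For reference, a minimal client-side sketch of how the new field is exercised once this series is applied. It assumes a vLLM OpenAI-compatible server already running at `http://localhost:8000/v1` and uses `meta-llama/Meta-Llama-3-8B-Instruct` as an illustrative model name; the base URL, API key, and model are assumptions, not part of the patches. The `max_completion_tokens` keyword requires `openai >= 1.45.0`, matching the requirements bump above.

```python
# Sketch only: the server URL, api_key, and model name below are assumptions.
from openai import OpenAI  # openai >= 1.45.0 adds max_completion_tokens support

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

messages = [{"role": "user", "content": "Hello! What is your name?"}]

# Preferred: the new max_completion_tokens field introduced by this series.
chat_completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=messages,
    max_completion_tokens=10,
)
print(chat_completion.choices[0].message.content)

# Still accepted: the deprecated max_tokens field. Server-side the request is
# resolved as `max_completion_tokens or max_tokens`, so existing clients keep
# working until the deprecated field is dropped upstream (TODO(#9845)).
chat_completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=messages,
    max_tokens=10,
)
print(chat_completion.choices[0].message.content)
```

One consequence of the `max_completion_tokens or max_tokens` fallback is that a request with an explicit `max_completion_tokens=0` falls through to `max_tokens` (Python truthiness); since a zero-token completion budget is not meaningful, the patches accept that trade-off.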