[Misc][OpenAI] deprecate max_tokens in favor of new max_completion_tokens field for chat completion endpoint #9837

Merged
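In client code, the change is a one-field rename on chat completion requests. A minimal sketch against an OpenAI-compatible vLLM server (base URL, API key, and model name are placeholders for your deployment):

```python
from openai import OpenAI

# Placeholders: adjust the base URL, key, and model to your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello world!"}],
    # Previously: max_tokens=64
    max_completion_tokens=64,
)
print(chat_completion.choices[0].message.content)
```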
2 changes: 1 addition & 1 deletion benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_chat_completions(
},
],
"temperature": 0.0,
"max_tokens": request_func_input.output_len,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
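The benchmark builds this payload and posts it to the chat completions endpoint. A rough sketch of an equivalent request (URL and model are placeholders, not the benchmark's actual configuration):

```python
import asyncio

import aiohttp


async def send_chat_request(api_url: str, output_len: int) -> dict:
    # Mirrors the payload above, with streaming disabled for brevity.
    payload = {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder
        "messages": [{"role": "user", "content": "Hello world!"}],
        "temperature": 0.0,
        "max_completion_tokens": output_len,
        "stream": False,
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(api_url, json=payload) as resp:
            return await resp.json()


# asyncio.run(send_chat_request("http://localhost:8000/v1/chat/completions", 128))
```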
6 changes: 3 additions & 3 deletions docs/source/serving/run_on_sky.rst
@@ -109,7 +109,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in autoscaling
messages:
- role: user
content: Hello! What is your name?
-max_tokens: 1
+max_completion_tokens: 1

.. raw:: html

@@ -129,7 +129,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in autoscaling
messages:
- role: user
content: Hello! What is your name?
-max_tokens: 1
+max_completion_tokens: 1

resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
@@ -255,7 +255,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
messages:
- role: user
content: Hello! What is your name?
-max_tokens: 1
+max_completion_tokens: 1

resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
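The readiness probes updated above boil down to a single-token chat completion against each replica. A rough Python equivalent of what such a probe sends (endpoint path and model are assumptions based on the YAML above):

```python
import requests


def probe_replica(base_url: str) -> bool:
    # Single-token chat completion, mirroring the readiness probe above.
    resp = requests.post(
        f"{base_url}/v1/chat/completions",
        json={
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder
            "messages": [{
                "role": "user",
                "content": "Hello! What is your name?"
            }],
            "max_completion_tokens": 1,
        },
        timeout=10,
    )
    return resp.ok
```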
8 changes: 4 additions & 4 deletions examples/offline_inference_openai.md
@@ -35,8 +35,8 @@

```
$ cat openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```

### Step 2: Run the batch
@@ -94,8 +94,8 @@ To follow along with this example, you can download the example batch, or create

```
$ cat openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```

Now upload your batch file to your S3 bucket.
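For reference, the updated JSONL runs through the batch entrypoint the same way in both the local and S3 variants of this example; the local invocation looks like this (verify the flags against your installed vLLM version):

```
python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
```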
12 changes: 6 additions & 6 deletions examples/openai_api_client_for_multimodal.py
@@ -53,7 +53,7 @@ def run_text_only() -> None:
"content": "What's the capital of France?"
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion.choices[0].message.content
@@ -83,7 +83,7 @@ def run_single_image() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -109,7 +109,7 @@ def run_single_image() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
@@ -144,7 +144,7 @@ def run_multi_image() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -175,7 +175,7 @@ def run_audio() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -201,7 +201,7 @@ def run_audio() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
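The multimodal calls above all share one request shape; a minimal sketch of the single-image variant (image URL and model name are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {
                "type": "image_url",
                # Placeholder image; any reachable URL works.
                "image_url": {"url": "https://example.com/duck.jpg"},
            },
        ],
    }],
    model="llava-hf/llava-1.5-7b-hf",  # placeholder multimodal model
    max_completion_tokens=64,
)
print(chat_completion_from_url.choices[0].message.content)
```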
4 changes: 2 additions & 2 deletions examples/openai_example_batch.jsonl
@@ -1,2 +1,2 @@
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -10,7 +10,7 @@ protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp
-openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
+openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
uvicorn[standard]
pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing
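Because the client-side field only exists in newer releases, a runtime guard can mirror the floor pinned above; a sketch (assumes a plain X.Y.Z version string):

```python
import openai

# max_completion_tokens support arrived in openai 1.45.0 per the pin above.
major, minor = (int(part) for part in openai.__version__.split(".")[:2])
assert (major, minor) >= (1, 45), (
    f"openai {openai.__version__} predates max_completion_tokens")
```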
32 changes: 17 additions & 15 deletions tests/entrypoints/openai/test_audio.py
@@ -68,11 +68,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
}]

# test single completion
-chat_completion = await client.chat.completions.create(model=model_name,
-                                                        messages=messages,
-                                                        max_tokens=10,
-                                                        logprobs=True,
-                                                        top_logprobs=5)
+chat_completion = await client.chat.completions.create(
+    model=model_name,
+    messages=messages,
+    max_completion_tokens=10,
+    logprobs=True,
+    top_logprobs=5)
assert len(chat_completion.choices) == 1

choice = chat_completion.choices[0]
@@ -91,7 +92,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -123,11 +124,12 @@ async def test_single_chat_session_audio_base64encoded(
}]

# test single completion
-chat_completion = await client.chat.completions.create(model=model_name,
-                                                        messages=messages,
-                                                        max_tokens=10,
-                                                        logprobs=True,
-                                                        top_logprobs=5)
+chat_completion = await client.chat.completions.create(
+    model=model_name,
+    messages=messages,
+    max_completion_tokens=10,
+    logprobs=True,
+    top_logprobs=5)
assert len(chat_completion.choices) == 1

choice = chat_completion.choices[0]
@@ -146,7 +148,7 @@ async def test_single_chat_session_audio_base64encoded(
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -178,7 +180,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
@@ -188,7 +190,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
temperature=0.0,
stream=True,
)
@@ -242,7 +244,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
temperature=0.0,
)

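The title says deprecate rather than remove, so clients that still send max_tokens presumably keep working. One way a request schema can express that fallback (an illustrative Pydantic sketch, not necessarily vLLM's actual protocol code):

```python
from typing import Optional

from pydantic import BaseModel, model_validator


class ChatCompletionRequest(BaseModel):
    # Deprecated alias, kept so older clients don't break.
    max_tokens: Optional[int] = None
    max_completion_tokens: Optional[int] = None

    @model_validator(mode="after")
    def _fall_back_to_max_tokens(self) -> "ChatCompletionRequest":
        # Honor the deprecated field when only it was provided.
        if self.max_completion_tokens is None and self.max_tokens is not None:
            self.max_completion_tokens = self.max_tokens
        return self
```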