Commit
[Misc][OpenAI] deprecate max_tokens in favor of new max_completion_tokens field for chat completion endpoint (vllm-project#9837)
gcalmettes authored Oct 31, 2024
1 parent 64384bb commit abbfb61
Showing 14 changed files with 140 additions and 118 deletions.
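The change is a field rename on the OpenAI-compatible chat completions endpoint, mirroring upstream OpenAI's deprecation of max_tokens. A minimal before/after sketch using the official openai Python client; the base URL, API key, and server setup are illustrative assumptions, not part of this commit:

```python
from openai import OpenAI

# Assumed local vLLM OpenAI-compatible server; adjust base_url/api_key as needed.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Before this commit the examples passed max_tokens=...; since the field is
# deprecated rather than removed, that spelling should still be accepted for now.
chat_completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello world!"}],
    max_completion_tokens=64,  # new preferred cap on generated tokens
)
print(chat_completion.choices[0].message.content)
```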
2 changes: 1 addition & 1 deletion benchmarks/backend_request_func.py
@@ -324,7 +324,7 @@ async def async_request_openai_chat_completions(
},
],
"temperature": 0.0,
"max_tokens": request_func_input.output_len,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
6 changes: 3 additions & 3 deletions docs/source/serving/run_on_sky.rst
@@ -109,7 +109,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
messages:
- role: user
content: Hello! What is your name?
-max_tokens: 1
+max_completion_tokens: 1
.. raw:: html

@@ -129,7 +129,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
messages:
- role: user
content: Hello! What is your name?
-max_tokens: 1
+max_completion_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
@@ -255,7 +255,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica.
messages:
- role: user
content: Hello! What is your name?
-max_tokens: 1
+max_completion_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
8 changes: 4 additions & 4 deletions examples/offline_inference_openai.md
@@ -35,8 +35,8 @@

```
$ cat openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```

### Step 2: Run the batch
@@ -94,8 +94,8 @@ To follow along with this example, you can download the example batch, or create

```
$ cat openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```

Now upload your batch file to your S3 bucket.
12 changes: 6 additions & 6 deletions examples/openai_api_client_for_multimodal.py
@@ -53,7 +53,7 @@ def run_text_only() -> None:
"content": "What's the capital of France?"
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion.choices[0].message.content
@@ -83,7 +83,7 @@ def run_single_image() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -109,7 +109,7 @@ def run_single_image() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
@@ -144,7 +144,7 @@ def run_multi_image() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -175,7 +175,7 @@ def run_audio() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
@@ -201,7 +201,7 @@ def run_audio() -> None:
],
}],
model=model,
-max_tokens=64,
+max_completion_tokens=64,
)

result = chat_completion_from_base64.choices[0].message.content
4 changes: 2 additions & 2 deletions examples/openai_example_batch.jsonl
@@ -1,2 +1,2 @@
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -10,7 +10,7 @@ protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp
-openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
+openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
uvicorn[standard]
pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing
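Since max_completion_tokens support only landed in openai 1.45.0 (per the requirements comment above), a quick sanity check can catch stale environments. This is a standalone sketch, not a helper from this repo:

```python
# Verify the installed openai client is >= 1.45.0 (needed for the
# max_completion_tokens request field). Stdlib only, no extra deps.
from importlib.metadata import version

major, minor = (int(part) for part in version("openai").split(".")[:2])
assert (major, minor) >= (1, 45), "upgrade: pip install 'openai>=1.45.0'"
```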
32 changes: 17 additions & 15 deletions tests/entrypoints/openai/test_audio.py
@@ -68,11 +68,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
}]

# test single completion
-chat_completion = await client.chat.completions.create(model=model_name,
-messages=messages,
-max_tokens=10,
-logprobs=True,
-top_logprobs=5)
+chat_completion = await client.chat.completions.create(
+model=model_name,
+messages=messages,
+max_completion_tokens=10,
+logprobs=True,
+top_logprobs=5)
assert len(chat_completion.choices) == 1

choice = chat_completion.choices[0]
@@ -91,7 +92,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -123,11 +124,12 @@ async def test_single_chat_session_audio_base64encoded(
}]

# test single completion
-chat_completion = await client.chat.completions.create(model=model_name,
-messages=messages,
-max_tokens=10,
-logprobs=True,
-top_logprobs=5)
+chat_completion = await client.chat.completions.create(
+model=model_name,
+messages=messages,
+max_completion_tokens=10,
+logprobs=True,
+top_logprobs=5)
assert len(chat_completion.choices) == 1

choice = chat_completion.choices[0]
@@ -146,7 +148,7 @@ async def test_single_chat_session_audio_base64encoded(
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@@ -178,7 +180,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
@@ -188,7 +190,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
temperature=0.0,
stream=True,
)
@@ -242,7 +244,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
await client.chat.completions.create(
model=model_name,
messages=messages,
-max_tokens=10,
+max_completion_tokens=10,
temperature=0.0,
)
