[Frontend] Support scores endpoint in run_batch (vllm-project#12430)
Signed-off-by: Pooya Davoodi <[email protected]>
pooyadavoodi authored Jan 27, 2025
1 parent 28e0750 commit 0cc6b38
Showing 4 changed files with 99 additions and 7 deletions.
33 changes: 32 additions & 1 deletion examples/offline_inference/openai/openai_batch.md
@@ -13,7 +13,7 @@ The OpenAI batch file format consists of a series of json objects on new lines.
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
 
 ```{note}
-We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon).
+We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon).
 ```
 
 ## Pre-requisites
@@ -203,3 +203,34 @@ $ cat results.jsonl
 {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
 ...
 ```
+
+## Example 5: Using score endpoint
+
+### Additional prerequisites
+
+* Ensure you are using `vllm >= 0.7.0`.
+
+### Step 1: Create your batch file
+
+Add score requests to your batch file. The following is an example:
+
+```
+{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+```
+
+You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model).
+
+### Step 2: Run the batch
+
+You can run the batch using the same command as in earlier examples.
+
+### Step 3: Check your results
+
+You can check your results by running `cat results.jsonl`
+
+```
+$ cat results.jsonl
+{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
+{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
+```
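
Taken together, the three steps above amount to a short script. The following is an illustrative sketch, not part of the commit: it assumes `vllm >= 0.7.0` is installed, `batch.jsonl`/`results.jsonl` are placeholder file names, and the invocation mirrors the one used by the new test below.

```python
import json
import subprocess
import sys

# Run the batch runner the same way the docs' earlier examples (and the new
# test) do: input file in, output file out, one model for all requests.
subprocess.run(
    [
        sys.executable, "-m", "vllm.entrypoints.openai.run_batch",
        "-i", "batch.jsonl",
        "-o", "results.jsonl",
        "--model", "BAAI/bge-reranker-v2-m3",
    ],
    check=True,
)

# Each output line is one BatchRequestOutput; per the sample output above,
# the scores live in response.body.data[*].score.
with open("results.jsonl") as f:
    for line in f:
        result = json.loads(line)
        assert result["error"] is None
        for item in result["response"]["body"]["data"]:
            print(result["custom_id"], item["index"], item["score"])
```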
37 changes: 37 additions & 0 deletions tests/entrypoints/openai/test_run_batch.py
@@ -1,3 +1,4 @@
+import json
 import subprocess
 import sys
 import tempfile
@@ -21,6 +22,9 @@
 {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
 {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
 
+INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
+
 
 def test_empty_file():
     with tempfile.NamedTemporaryFile(
@@ -102,3 +106,36 @@ def test_embeddings():
             # Ensure that the output format conforms to the openai api.
             # Validation should throw if the schema is wrong.
             BatchRequestOutput.model_validate_json(line)
+
+
+def test_score():
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(INPUT_SCORE_BATCH)
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.run_batch",
+            "-i",
+            input_file.name,
+            "-o",
+            output_file.name,
+            "--model",
+            "BAAI/bge-reranker-v2-m3",
+        ], )
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+
+        contents = output_file.read()
+        for line in contents.strip().split("\n"):
+            # Ensure that the output format conforms to the openai api.
+            # Validation should throw if the schema is wrong.
+            BatchRequestOutput.model_validate_json(line)
+
+            # Ensure that there is no error in the response.
+            line_dict = json.loads(line)
+            assert isinstance(line_dict, dict)
+            assert line_dict["error"] is None
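
To exercise only the new test, a programmatic pytest run like the following should work (a sketch; it assumes a repo checkout with pytest installed and hardware capable of loading `BAAI/bge-reranker-v2-m3`).

```python
import sys

import pytest

# The -k expression filters to test_score; pytest.main takes a list of CLI
# arguments and returns the usual exit code.
sys.exit(
    pytest.main(
        ["tests/entrypoints/openai/test_run_batch.py", "-k", "test_score"]))
```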
5 changes: 3 additions & 2 deletions vllm/entrypoints/openai/protocol.py
@@ -1283,7 +1283,7 @@ class BatchRequestInput(OpenAIBaseModel):
     url: str
 
     # The parameters of the request.
-    body: Union[ChatCompletionRequest, EmbeddingRequest]
+    body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest]
 
 
 class BatchResponseData(OpenAIBaseModel):
@@ -1294,7 +1294,8 @@
     request_id: str
 
     # The body of the response.
-    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse]] = None
+    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
+                         ScoreResponse]] = None
 
 
 class BatchRequestOutput(OpenAIBaseModel):
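
With the widened unions, a `/v1/score` batch line deserializes with `body` typed as `ScoreRequest`. A minimal sketch of that round trip, not part of the commit, reusing the request from the docs example above (the printed type is the expected pydantic union resolution, since only `ScoreRequest` accepts `text_1`/`text_2`):

```python
from vllm.entrypoints.openai.protocol import BatchRequestInput

line = (
    '{"custom_id": "request-1", "method": "POST", "url": "/v1/score", '
    '"body": {"model": "BAAI/bge-reranker-v2-m3", '
    '"text_1": "What is the capital of France?", '
    '"text_2": ["The capital of Brazil is Brasilia.", '
    '"The capital of France is Paris."]}}'
)

request = BatchRequestInput.model_validate_json(line)
print(type(request.body).__name__)  # expected: ScoreRequest
```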
31 changes: 27 additions & 4 deletions vllm/entrypoints/openai/run_batch.py
@@ -16,12 +16,14 @@
                                               BatchRequestOutput,
                                               BatchResponseData,
                                               ChatCompletionResponse,
-                                              EmbeddingResponse, ErrorResponse)
+                                              EmbeddingResponse, ErrorResponse,
+                                              ScoreResponse)
 # yapf: enable
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                     OpenAIServingModels)
+from vllm.entrypoints.openai.serving_score import OpenAIServingScores
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
@@ -167,7 +169,8 @@ async def run_request(serving_engine_func: Callable,
                       tracker: BatchProgressTracker) -> BatchRequestOutput:
     response = await serving_engine_func(request.body)
 
-    if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)):
+    if isinstance(response,
+                  (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
@@ -239,6 +242,12 @@ async def main(args):
         chat_template=None,
         chat_template_content_format="auto",
     ) if model_config.task == "embed" else None
+    openai_serving_scores = (OpenAIServingScores(
+        engine,
+        model_config,
+        openai_serving_models,
+        request_logger=request_logger,
+    ) if model_config.task == "score" else None)
 
     tracker = BatchProgressTracker()
     logger.info("Reading batch from %s...", args.input_file)
@@ -279,14 +288,28 @@
                 ))
                 continue
 
             response_futures.append(run_request(handler_fn, request, tracker))
             tracker.submitted()
+        elif request.url == "/v1/score":
+            handler_fn = (None if openai_serving_scores is None else
+                          openai_serving_scores.create_score)
+            if handler_fn is None:
+                response_futures.append(
+                    make_async_error_request_output(
+                        request,
+                        error_msg="The model does not support Scores API",
+                    ))
+                continue
+
+            response_futures.append(run_request(handler_fn, request, tracker))
+            tracker.submitted()
         else:
             response_futures.append(
                 make_async_error_request_output(
                     request,
-                    error_msg="Only /v1/chat/completions and "
-                    "/v1/embeddings are supported in the batch endpoint.",
+                    error_msg=
+                    "Only /v1/chat/completions, /v1/embeddings, and /v1/score "
+                    "are supported in the batch endpoint.",
                 ))
 
     with tracker.pbar():
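
The new branch follows the existing dispatch pattern: a handler is only constructed when the loaded model's task matches the endpoint, and every other case becomes an error output. Distilled into a standalone sketch (illustrative only; the real logic lives in the loop above):

```python
from typing import Callable, Optional

SUPPORTED_URLS = ("/v1/chat/completions", "/v1/embeddings", "/v1/score")


def pick_error(url: str, handler: Optional[Callable]) -> Optional[str]:
    """Return the error message run_batch would attach, or None to dispatch."""
    if url not in SUPPORTED_URLS:
        return ("Only /v1/chat/completions, /v1/embeddings, and /v1/score "
                "are supported in the batch endpoint.")
    if handler is None:  # e.g. a /v1/score request against a chat-task model
        return "The model does not support Scores API"
    return None  # caller proceeds with run_request(handler, request, tracker)
```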
