From cce2f16d9e09ee90d8134effd5d382c1311a8e7d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 17 Jul 2024 15:43:21 +0800 Subject: [PATCH] [Doc][CI/Build] Update docs and tests to use `vllm serve` (#6431) Signed-off-by: Alvant --- docs/source/getting_started/quickstart.rst | 7 +-- docs/source/models/adding_model.rst | 4 +- docs/source/models/engine_args.rst | 4 +- docs/source/models/lora.rst | 3 +- docs/source/models/vlm.rst | 4 +- docs/source/serving/deploying_with_dstack.rst | 2 +- docs/source/serving/distributed_serving.rst | 6 +- .../serving/openai_compatible_server.md | 8 +-- examples/api_client.py | 5 +- examples/logging_configuration.md | 12 +--- examples/openai_vision_api_client.py | 4 +- examples/production_monitoring/Otel.md | 6 +- examples/production_monitoring/README.md | 3 +- tests/async_engine/test_openapi_server_ray.py | 22 +++---- tests/distributed/test_pipeline_parallel.py | 6 +- tests/entrypoints/openai/test_chat.py | 42 ++++++------- tests/entrypoints/openai/test_completion.py | 60 +++++++++---------- tests/entrypoints/openai/test_embedding.py | 22 +++---- tests/entrypoints/openai/test_models.py | 42 ++++++------- tests/entrypoints/openai/test_tokenization.py | 24 ++++---- tests/entrypoints/openai/test_vision.py | 22 +++---- tests/tensorizer_loader/test_tensorizer.py | 4 +- tests/utils.py | 18 +++--- 23 files changed, 155 insertions(+), 175 deletions(-) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 7c44a96865a50..89bdc247c5e8e 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -73,16 +73,13 @@ Start the server: .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-125m + $ vllm serve facebook/opt-125m By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument: .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-125m \ - $ --chat-template ./examples/template_chatml.jinja + $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja This server can be queried in the same format as OpenAI API. For example, list the models: diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index 53c19e5829218..5cffb58cafd96 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -114,7 +114,7 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If you are running api server with `python -m vllm.entrypoints.openai.api_server args`, you can wrap the entrypoint with the following code: +If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: .. code-block:: python @@ -124,4 +124,4 @@ If you are running api server with `python -m vllm.entrypoints.openai.api_server import runpy runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') -Save the above code in a file and run it with `python your_file.py args`. +Save the above code in a file and run it with :code:`python your_file.py `. 
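For reference, a minimal sketch of querying a server launched with `vllm serve facebook/opt-125m` (as in the quickstart hunk above), using the official OpenAI Python client. The base URL assumes the default port 8000, and the API key is the `token-abc123` placeholder used elsewhere in this patch; it is only needed when the server was started with `--api-key`.

```python
from openai import OpenAI

# Assumes a server started with: vllm serve facebook/opt-125m
# Default host/port; the API key matches the dummy value used in this patch.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

# List the served models, then request a completion from the base model.
print([model.id for model in client.models.list()])

completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="San Francisco is a",
    max_tokens=16,
)
print(completion.choices[0].text)
```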
diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index bdf566d3ebbd1..e7ce8cdcabe88 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -8,7 +8,7 @@ Below, you can find an explanation of every engine argument for vLLM: .. argparse:: :module: vllm.engine.arg_utils :func: _engine_args_parser - :prog: -m vllm.entrypoints.openai.api_server + :prog: vllm serve :nodefaultconst: Async Engine Arguments @@ -19,5 +19,5 @@ Below are the additional arguments related to the asynchronous engine: .. argparse:: :module: vllm.engine.arg_utils :func: _async_engine_args_parser - :prog: -m vllm.entrypoints.openai.api_server + :prog: vllm serve :nodefaultconst: \ No newline at end of file diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 5cc3076073fbd..f08773fe59d92 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -61,8 +61,7 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server. .. code-block:: bash - python -m vllm.entrypoints.openai.api_server \ - --model meta-llama/Llama-2-7b-hf \ + vllm serve meta-llama/Llama-2-7b-hf \ --enable-lora \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index d488b0fefdf06..92aca168dadf2 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -94,9 +94,7 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with .. code-block:: bash - python -m vllm.entrypoints.openai.api_server \ - --model llava-hf/llava-1.5-7b-hf \ - --chat-template template_llava.jinja + vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja .. important:: We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow diff --git a/docs/source/serving/deploying_with_dstack.rst b/docs/source/serving/deploying_with_dstack.rst index baf87314ca8e4..e1eb45b225d9c 100644 --- a/docs/source/serving/deploying_with_dstack.rst +++ b/docs/source/serving/deploying_with_dstack.rst @@ -40,7 +40,7 @@ Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7 gpu: 24GB commands: - pip install vllm - - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 + - vllm serve $MODEL --port 8000 model: format: openai type: chat diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 2dfb83f168b5d..fa1b04dc3dce5 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -35,16 +35,14 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-13b \ + $ vllm serve facebook/opt-13b \ $ --tensor-parallel-size 4 You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: .. 
code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model gpt2 \ + $ vllm serve gpt2 \ $ --tensor-parallel-size 4 \ $ --pipeline-parallel-size 2 \ $ --distributed-executor-backend ray diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 092c3c6cb9a3d..a06c30d9c48c6 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat You can start the server using Python, or using [Docker](deploying_with_docker.rst): ```bash -python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 +vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the official OpenAI Python client library, or any other HTTP client. @@ -97,9 +97,7 @@ template, or the template in string form. Without a chat template, the server wi and all chat requests will error. ```bash -python -m vllm.entrypoints.openai.api_server \ - --model ... \ - --chat-template ./path-to-chat-template.jinja +vllm serve --chat-template ./path-to-chat-template.jinja ``` vLLM community provides a set of chat templates for popular models. You can find them in the examples @@ -110,7 +108,7 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) ```{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs -:prog: -m vllm.entrypoints.openai.api_server +:prog: vllm serve ``` ## Tool calling in the chat completion API diff --git a/examples/api_client.py b/examples/api_client.py index 5f7daa14d5044..27a2a08b7b0c3 100644 --- a/examples/api_client.py +++ b/examples/api_client.py @@ -1,8 +1,7 @@ -"""Example Python client for vllm.entrypoints.api_server +"""Example Python client for `vllm.entrypoints.api_server` NOTE: The API server is used only for demonstration and simple performance benchmarks. It is not intended for production use. -For production use, we recommend vllm.entrypoints.openai.api_server -and the OpenAI client API +For production use, we recommend `vllm serve` and the OpenAI client API. """ import argparse diff --git a/examples/logging_configuration.md b/examples/logging_configuration.md index 75b4b31a80462..0d278b0392403 100644 --- a/examples/logging_configuration.md +++ b/examples/logging_configuration.md @@ -95,9 +95,7 @@ to the path of the custom logging configuration JSON file: ```bash VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` @@ -152,9 +150,7 @@ to the path of the custom logging configuration JSON file: ```bash VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` @@ -167,9 +163,7 @@ loggers. 
```bash VLLM_CONFIGURE_LOGGING=0 \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index d4d9738a1f7bc..2082c378e267c 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -1,9 +1,7 @@ """An example showing how to use vLLM to serve VLMs. Launch the vLLM server with the following command: -python -m vllm.entrypoints.openai.api_server \ - --model llava-hf/llava-1.5-7b-hf \ - --chat-template template_llava.jinja +vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja """ import base64 diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md index 1449442273c7a..2c7a7caa1bd7c 100644 --- a/examples/production_monitoring/Otel.md +++ b/examples/production_monitoring/Otel.md @@ -36,7 +36,7 @@ ``` export OTEL_SERVICE_NAME="vllm-server" export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true - python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" + vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" ``` 1. In a new shell, send requests with trace context from a dummy client @@ -62,7 +62,7 @@ By default, `grpc` is used. To set `http/protobuf` as the protocol, configure th ``` export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces -python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" +vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" ``` ## Instrumentation of FastAPI @@ -74,7 +74,7 @@ OpenTelemetry allows automatic instrumentation of FastAPI. 1. Run vLLM with `opentelemetry-instrument` ``` - opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" + opentelemetry-instrument vllm serve facebook/opt-125m ``` 1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI. diff --git a/examples/production_monitoring/README.md b/examples/production_monitoring/README.md index 268f2e771018f..807c0470e7b30 100644 --- a/examples/production_monitoring/README.md +++ b/examples/production_monitoring/README.md @@ -10,8 +10,7 @@ Install: Prometheus metric logging is enabled by default in the OpenAI-compatible server. 
Launch via the entrypoint: ```bash -python3 -m vllm.entrypoints.openai.api_server \ - --model mistralai/Mistral-7B-v0.1 \ +vllm serve mistralai/Mistral-7B-v0.1 \ --max-model-len 2048 \ --disable-log-requests ``` diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 575f8f19b8ebe..5ecd770ede836 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -9,17 +9,17 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--max-model-len", - "2048", - "--enforce-eager", - "--engine-use-ray" - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--max-model-len", + "2048", + "--enforce-eager", + "--engine-use-ray" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 7b7475a77167c..52074a93329ea 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -15,8 +15,6 @@ ]) def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): pp_args = [ - "--model", - MODEL_NAME, # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -34,8 +32,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): # schedule all workers in a node other than the head node, # which can cause the test to fail. tp_args = [ - "--model", - MODEL_NAME, # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -53,7 +49,7 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): results = [] for args in [pp_args, tp_args]: - with RemoteOpenAIServer(args) as server: + with RemoteOpenAIServer(MODEL_NAME, args) as server: client = server.get_client() # test models list diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d370c63c0c7ba..8f67dd54edff0 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -27,27 +27,27 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 35af0b02747e9..59151b9c4e99e 100644 --- 
a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -37,36 +37,36 @@ def zephyr_pa_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files, zephyr_pa_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--max-num-seqs", - "128", - "--enforce-eager", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - # pa config - "--enable-prompt-adapter", - "--prompt-adapters", - f"zephyr-pa={zephyr_pa_files}", - f"zephyr-pa2={zephyr_pa_files}", - "--max-prompt-adapters", - "2", - "--max-prompt-adapter-token", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--max-num-seqs", + "128", + "--enforce-eager", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + # pa config + "--enable-prompt-adapter", + "--prompt-adapters", + f"zephyr-pa={zephyr_pa_files}", + f"zephyr-pa2={zephyr_pa_files}", + "--max-prompt-adapters", + "2", + "--max-prompt-adapter-token", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 4a32aadc8c3ae..2ca0c0d63c25c 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,17 +11,17 @@ @pytest.fixture(scope="module") def embedding_server(): - with RemoteOpenAIServer([ - "--model", - EMBEDDING_MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--enforce-eager", - "--max-model-len", - "8192", - "--enforce-eager", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "8192", + "--enforce-eager", + ] + + with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index bf63f9a813f2c..c2cfff228c546 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -19,27 +19,27 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + 
"--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index d33fd222ee150..f32abba225d40 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -12,18 +12,18 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 563b68566bd2c..cc5c8d619183f 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,17 +23,17 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - "--dtype", - "bfloat16", - "--max-model-len", - "4096", - "--enforce-eager", - "--chat-template", - str(LLAVA_CHAT_TEMPLATE), - ]) as remote_server: + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", + "--chat-template", + str(LLAVA_CHAT_TEMPLATE), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index a43f9132585b5..b7030e3cd6d42 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -214,12 +214,12 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Start OpenAI API server openai_args = [ - "--model", model_ref, "--dtype", "float16", "--load-format", + "--dtype", "float16", "--load-format", "tensorizer", "--model-loader-extra-config", json.dumps(model_loader_extra_config), ] - with RemoteOpenAIServer(openai_args) as server: + with RemoteOpenAIServer(model_ref, openai_args) as server: print("Server ready.") client = server.get_client() diff --git a/tests/utils.py b/tests/utils.py index 8780d45a31b29..80e0895c551b2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -49,7 +49,13 @@ class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds - def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: + def __init__( + self, + model: str, + cli_args: List[str], + *, + auto_port: bool = True, + ) -> None: if auto_port: if "-p" in cli_args or "--port" in cli_args: raise ValueError("You have manually specified the port" @@ -68,12 +74,10 @@ def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: # the current process might initialize cuda, # to be safe, we should use spawn method env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - self.proc = subprocess.Popen( - [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + - cli_args, - env=env, - stdout=sys.stdout, - stderr=sys.stderr) + self.proc = subprocess.Popen(["vllm", "serve"] + [model] + cli_args, + env=env, + stdout=sys.stdout, + stderr=sys.stderr) 
self._wait_for_server(url=self.url_for("health"), timeout=self.MAX_SERVER_START_WAIT_S)
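With the updated constructor, test fixtures pass the model name as the first positional argument and keep only the remaining CLI flags in the list, and the helper launches the server via the `vllm serve` console script. A minimal usage sketch in the style of the fixtures above; the import path, model choice, and test body are illustrative assumptions, not part of this patch:

```python
import pytest

# Import path is an assumption; adjust to wherever RemoteOpenAIServer lives
# relative to the test module.
from tests.utils import RemoteOpenAIServer

MODEL_NAME = "facebook/opt-125m"  # illustrative model choice


@pytest.fixture(scope="module")
def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--enforce-eager",
    ]

    # The model is now the first positional argument; under the hood the
    # helper spawns `vllm serve MODEL_NAME` followed by these flags.
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


def test_models_list(server):
    client = server.get_client()
    served = [model.id for model in client.models.list()]
    assert MODEL_NAME in served
```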