From cce2f16d9e09ee90d8134effd5d382c1311a8e7d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 17 Jul 2024 15:43:21 +0800 Subject: [PATCH] [Doc][CI/Build] Update docs and tests to use `vllm serve` (#6431) Signed-off-by: Alvant --- docs/source/getting_started/quickstart.rst | 7 +-- docs/source/models/adding_model.rst | 4 +- docs/source/models/engine_args.rst | 4 +- docs/source/models/lora.rst | 3 +- docs/source/models/vlm.rst | 4 +- docs/source/serving/deploying_with_dstack.rst | 2 +- docs/source/serving/distributed_serving.rst | 6 +- .../serving/openai_compatible_server.md | 8 +-- examples/api_client.py | 5 +- examples/logging_configuration.md | 12 +--- examples/openai_vision_api_client.py | 4 +- examples/production_monitoring/Otel.md | 6 +- examples/production_monitoring/README.md | 3 +- tests/async_engine/test_openapi_server_ray.py | 22 +++---- tests/distributed/test_pipeline_parallel.py | 6 +- tests/entrypoints/openai/test_chat.py | 42 ++++++------- tests/entrypoints/openai/test_completion.py | 60 +++++++++---------- tests/entrypoints/openai/test_embedding.py | 22 +++---- tests/entrypoints/openai/test_models.py | 42 ++++++------- tests/entrypoints/openai/test_tokenization.py | 24 ++++---- tests/entrypoints/openai/test_vision.py | 22 +++---- tests/tensorizer_loader/test_tensorizer.py | 4 +- tests/utils.py | 18 +++--- 23 files changed, 155 insertions(+), 175 deletions(-) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 7c44a96865a50..89bdc247c5e8e 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -73,16 +73,13 @@ Start the server: .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-125m + $ vllm serve facebook/opt-125m By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument: .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-125m \ - $ --chat-template ./examples/template_chatml.jinja + $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja This server can be queried in the same format as OpenAI API. For example, list the models: diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index 53c19e5829218..5cffb58cafd96 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -114,7 +114,7 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If you are running api server with `python -m vllm.entrypoints.openai.api_server args`, you can wrap the entrypoint with the following code: +If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: .. code-block:: python @@ -124,4 +124,4 @@ If you are running api server with `python -m vllm.entrypoints.openai.api_server import runpy runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') -Save the above code in a file and run it with `python your_file.py args`. +Save the above code in a file and run it with :code:`python your_file.py `. 
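For reference, a minimal sketch of querying a server launched with `vllm serve facebook/opt-125m` (as in the quickstart hunk above), using the official OpenAI Python client. The base URL assumes the default port 8000, and the API key is the `token-abc123` placeholder used elsewhere in this patch; it is only needed when the server was started with `--api-key`.

```python
from openai import OpenAI

# Assumes a server started with: vllm serve facebook/opt-125m
# Default host/port; the API key matches the dummy value used in this patch.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

# List the served models, then request a completion from the base model.
print([model.id for model in client.models.list()])

completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="San Francisco is a",
    max_tokens=16,
)
print(completion.choices[0].text)
```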
diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index bdf566d3ebbd1..e7ce8cdcabe88 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -8,7 +8,7 @@ Below, you can find an explanation of every engine argument for vLLM: .. argparse:: :module: vllm.engine.arg_utils :func: _engine_args_parser - :prog: -m vllm.entrypoints.openai.api_server + :prog: vllm serve :nodefaultconst: Async Engine Arguments @@ -19,5 +19,5 @@ Below are the additional arguments related to the asynchronous engine: .. argparse:: :module: vllm.engine.arg_utils :func: _async_engine_args_parser - :prog: -m vllm.entrypoints.openai.api_server + :prog: vllm serve :nodefaultconst: \ No newline at end of file diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 5cc3076073fbd..f08773fe59d92 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -61,8 +61,7 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server. .. code-block:: bash - python -m vllm.entrypoints.openai.api_server \ - --model meta-llama/Llama-2-7b-hf \ + vllm serve meta-llama/Llama-2-7b-hf \ --enable-lora \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index d488b0fefdf06..92aca168dadf2 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -94,9 +94,7 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with .. code-block:: bash - python -m vllm.entrypoints.openai.api_server \ - --model llava-hf/llava-1.5-7b-hf \ - --chat-template template_llava.jinja + vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja .. important:: We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow diff --git a/docs/source/serving/deploying_with_dstack.rst b/docs/source/serving/deploying_with_dstack.rst index baf87314ca8e4..e1eb45b225d9c 100644 --- a/docs/source/serving/deploying_with_dstack.rst +++ b/docs/source/serving/deploying_with_dstack.rst @@ -40,7 +40,7 @@ Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7 gpu: 24GB commands: - pip install vllm - - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 + - vllm serve $MODEL --port 8000 model: format: openai type: chat diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 2dfb83f168b5d..fa1b04dc3dce5 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -35,16 +35,14 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh .. code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-13b \ + $ vllm serve facebook/opt-13b \ $ --tensor-parallel-size 4 You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: .. 
code-block:: console - $ python -m vllm.entrypoints.openai.api_server \ - $ --model gpt2 \ + $ vllm serve gpt2 \ $ --tensor-parallel-size 4 \ $ --pipeline-parallel-size 2 \ $ --distributed-executor-backend ray diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 092c3c6cb9a3d..a06c30d9c48c6 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat You can start the server using Python, or using [Docker](deploying_with_docker.rst): ```bash -python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 +vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the official OpenAI Python client library, or any other HTTP client. @@ -97,9 +97,7 @@ template, or the template in string form. Without a chat template, the server wi and all chat requests will error. ```bash -python -m vllm.entrypoints.openai.api_server \ - --model ... \ - --chat-template ./path-to-chat-template.jinja +vllm serve --chat-template ./path-to-chat-template.jinja ``` vLLM community provides a set of chat templates for popular models. You can find them in the examples @@ -110,7 +108,7 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) ```{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs -:prog: -m vllm.entrypoints.openai.api_server +:prog: vllm serve ``` ## Tool calling in the chat completion API diff --git a/examples/api_client.py b/examples/api_client.py index 5f7daa14d5044..27a2a08b7b0c3 100644 --- a/examples/api_client.py +++ b/examples/api_client.py @@ -1,8 +1,7 @@ -"""Example Python client for vllm.entrypoints.api_server +"""Example Python client for `vllm.entrypoints.api_server` NOTE: The API server is used only for demonstration and simple performance benchmarks. It is not intended for production use. -For production use, we recommend vllm.entrypoints.openai.api_server -and the OpenAI client API +For production use, we recommend `vllm serve` and the OpenAI client API. """ import argparse diff --git a/examples/logging_configuration.md b/examples/logging_configuration.md index 75b4b31a80462..0d278b0392403 100644 --- a/examples/logging_configuration.md +++ b/examples/logging_configuration.md @@ -95,9 +95,7 @@ to the path of the custom logging configuration JSON file: ```bash VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` @@ -152,9 +150,7 @@ to the path of the custom logging configuration JSON file: ```bash VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` @@ -167,9 +163,7 @@ loggers. 
```bash VLLM_CONFIGURE_LOGGING=0 \ - python3 -m vllm.entrypoints.openai.api_server \ - --max-model-len 2048 \ - --model mistralai/Mistral-7B-v0.1 + vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index d4d9738a1f7bc..2082c378e267c 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -1,9 +1,7 @@ """An example showing how to use vLLM to serve VLMs. Launch the vLLM server with the following command: -python -m vllm.entrypoints.openai.api_server \ - --model llava-hf/llava-1.5-7b-hf \ - --chat-template template_llava.jinja +vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja """ import base64 diff --git a/examples/production_monitoring/Otel.md b/examples/production_monitoring/Otel.md index 1449442273c7a..2c7a7caa1bd7c 100644 --- a/examples/production_monitoring/Otel.md +++ b/examples/production_monitoring/Otel.md @@ -36,7 +36,7 @@ ``` export OTEL_SERVICE_NAME="vllm-server" export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true - python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" + vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" ``` 1. In a new shell, send requests with trace context from a dummy client @@ -62,7 +62,7 @@ By default, `grpc` is used. To set `http/protobuf` as the protocol, configure th ``` export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces -python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" +vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" ``` ## Instrumentation of FastAPI @@ -74,7 +74,7 @@ OpenTelemetry allows automatic instrumentation of FastAPI. 1. Run vLLM with `opentelemetry-instrument` ``` - opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" + opentelemetry-instrument vllm serve facebook/opt-125m ``` 1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI. diff --git a/examples/production_monitoring/README.md b/examples/production_monitoring/README.md index 268f2e771018f..807c0470e7b30 100644 --- a/examples/production_monitoring/README.md +++ b/examples/production_monitoring/README.md @@ -10,8 +10,7 @@ Install: Prometheus metric logging is enabled by default in the OpenAI-compatible server. 
Launch via the entrypoint: ```bash -python3 -m vllm.entrypoints.openai.api_server \ - --model mistralai/Mistral-7B-v0.1 \ +vllm serve mistralai/Mistral-7B-v0.1 \ --max-model-len 2048 \ --disable-log-requests ``` diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 575f8f19b8ebe..5ecd770ede836 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -9,17 +9,17 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--max-model-len", - "2048", - "--enforce-eager", - "--engine-use-ray" - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--max-model-len", + "2048", + "--enforce-eager", + "--engine-use-ray" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 7b7475a77167c..52074a93329ea 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -15,8 +15,6 @@ ]) def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): pp_args = [ - "--model", - MODEL_NAME, # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -34,8 +32,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): # schedule all workers in a node other than the head node, # which can cause the test to fail. tp_args = [ - "--model", - MODEL_NAME, # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -53,7 +49,7 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME): results = [] for args in [pp_args, tp_args]: - with RemoteOpenAIServer(args) as server: + with RemoteOpenAIServer(MODEL_NAME, args) as server: client = server.get_client() # test models list diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d370c63c0c7ba..8f67dd54edff0 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -27,27 +27,27 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 35af0b02747e9..59151b9c4e99e 100644 --- 
a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -37,36 +37,36 @@ def zephyr_pa_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files, zephyr_pa_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--max-num-seqs", - "128", - "--enforce-eager", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - # pa config - "--enable-prompt-adapter", - "--prompt-adapters", - f"zephyr-pa={zephyr_pa_files}", - f"zephyr-pa2={zephyr_pa_files}", - "--max-prompt-adapters", - "2", - "--max-prompt-adapter-token", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--max-num-seqs", + "128", + "--enforce-eager", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + # pa config + "--enable-prompt-adapter", + "--prompt-adapters", + f"zephyr-pa={zephyr_pa_files}", + f"zephyr-pa2={zephyr_pa_files}", + "--max-prompt-adapters", + "2", + "--max-prompt-adapter-token", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 4a32aadc8c3ae..2ca0c0d63c25c 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,17 +11,17 @@ @pytest.fixture(scope="module") def embedding_server(): - with RemoteOpenAIServer([ - "--model", - EMBEDDING_MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--enforce-eager", - "--max-model-len", - "8192", - "--enforce-eager", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "8192", + "--enforce-eager", + ] + + with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index bf63f9a813f2c..c2cfff228c546 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -19,27 +19,27 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + 
"--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index d33fd222ee150..f32abba225d40 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -12,18 +12,18 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - "--max-num-seqs", - "128", - ]) as remote_server: + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 563b68566bd2c..cc5c8d619183f 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,17 +23,17 @@ @pytest.fixture(scope="module") def server(): - with RemoteOpenAIServer([ - "--model", - MODEL_NAME, - "--dtype", - "bfloat16", - "--max-model-len", - "4096", - "--enforce-eager", - "--chat-template", - str(LLAVA_CHAT_TEMPLATE), - ]) as remote_server: + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", + "--chat-template", + str(LLAVA_CHAT_TEMPLATE), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index a43f9132585b5..b7030e3cd6d42 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -214,12 +214,12 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Start OpenAI API server openai_args = [ - "--model", model_ref, "--dtype", "float16", "--load-format", + "--dtype", "float16", "--load-format", "tensorizer", "--model-loader-extra-config", json.dumps(model_loader_extra_config), ] - with RemoteOpenAIServer(openai_args) as server: + with RemoteOpenAIServer(model_ref, openai_args) as server: print("Server ready.") client = server.get_client() diff --git a/tests/utils.py b/tests/utils.py index 8780d45a31b29..80e0895c551b2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -49,7 +49,13 @@ class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds - def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: + def __init__( + self, + model: str, + cli_args: List[str], + *, + auto_port: bool = True, + ) -> None: if auto_port: if "-p" in cli_args or "--port" in cli_args: raise ValueError("You have manually specified the port" @@ -68,12 +74,10 @@ def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: # the current process might initialize cuda, # to be safe, we should use spawn method env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - self.proc = subprocess.Popen( - [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + - cli_args, - env=env, - stdout=sys.stdout, - stderr=sys.stderr) + self.proc = subprocess.Popen(["vllm", "serve"] + [model] + cli_args, + env=env, + stdout=sys.stdout, + stderr=sys.stderr) 
self._wait_for_server(url=self.url_for("health"), timeout=self.MAX_SERVER_START_WAIT_S)
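With the updated constructor, test fixtures pass the model name as the first positional argument and keep only the remaining CLI flags in the list, and the helper launches the server via the `vllm serve` console script. A minimal usage sketch in the style of the fixtures above; the import path, model choice, and test body are illustrative assumptions, not part of this patch:

```python
import pytest

# Import path is an assumption; adjust to wherever RemoteOpenAIServer lives
# relative to the test module.
from tests.utils import RemoteOpenAIServer

MODEL_NAME = "facebook/opt-125m"  # illustrative model choice


@pytest.fixture(scope="module")
def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--enforce-eager",
    ]

    # The model is now the first positional argument; under the hood the
    # helper spawns `vllm serve MODEL_NAME` followed by these flags.
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


def test_models_list(server):
    client = server.get_client()
    served = [model.id for model in client.models.list()]
    assert MODEL_NAME in served
```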