diff --git a/docs/Arguments.md b/docs/Arguments.md
index 156eca72..c8397bfa 100644
--- a/docs/Arguments.md
+++ b/docs/Arguments.md
@@ -38,7 +38,7 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
                 [--migration-num-layers MIGRATION_NUM_LAYERS]
                 [--last-stage-max-blocks LAST_STAGE_MAX_BLOCKS]
                 [--max-stages MAX_STAGES]
-                [--enable-pd-disagg ENABLE_PD_DISAGG]
+                [--enable-pd-disagg]
                 [--num-dispatch-instances NUM_DISPATCH_INSTANCES]
                 [--log-request-timestamps]
 
@@ -170,6 +170,12 @@ usage: -m llumnix.entrypoints.vllm.api_server [-h]
 `--log-request-timestamps`
 - Enable logging request timestamps.
 
+`--enable-pd-disagg`
+- Enable prefill decoding disaggregation.
+
+`--num-dispatch-instances`
+- Number of available instances for dispatch.
+
 # Unsupported vLLM feature options
 
 `--device`
diff --git a/llumnix/arg_utils.py b/llumnix/arg_utils.py
index f9a66dc6..dd80276d 100644
--- a/llumnix/arg_utils.py
+++ b/llumnix/arg_utils.py
@@ -306,7 +306,7 @@ def add_cli_args(
                             type=int,
                             help='drop migration if the number of stages > max_stages')
         parser.add_argument('--enable-pd-disagg',
-                            type=bool,
+                            action='store_true',
                             help='enable prefill decoding disaggregation')
         parser.add_argument('--num-dispatch-instances',
                             type=int,
diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py
index 4cee1082..cd382a0b 100644
--- a/tests/e2e_test/test_e2e.py
+++ b/tests/e2e_test/test_e2e.py
@@ -67,7 +67,7 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool
         f"--tensor-parallel-size 1 "
         f"--request-output-queue-port {1234+port} "
         f"{'--enable-pd-disagg ' if enable_pd_disagg else ''} "
-        f"{'--num-dispatch-instances ' if num_dispatch_instances!=math.inf else ''} "
+        f"{f'--num-dispatch-instances {num_dispatch_instances} ' if num_dispatch_instances!=math.inf else ''} "
         f"{'--launch-ray-cluster ' if launch_ray_cluster else ''}"
         f"{'> instance_'+result_filename if len(result_filename)> 0 else ''} 2>&1 &"
     )
diff --git a/tests/e2e_test/test_migration.py b/tests/e2e_test/test_migration.py
index 6c2f41d7..ddf7fb51 100644
--- a/tests/e2e_test/test_migration.py
+++ b/tests/e2e_test/test_migration.py
@@ -66,7 +66,7 @@ def parse_manager_log_file(log_file):
 @pytest.mark.asyncio
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="at least 2 gpus required for migration bench")
 @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B'])
-@pytest.mark.parametrize("migration_backend", ['rpc'])# 'gloo', 'nccl'])
+@pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl'])
 @pytest.mark.parametrize("enable_pd_disagg", [False, True])
 async def test_migration_benchmark(model, migration_backend, enable_pd_disagg):
     base_port = 37037
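
Note (editor's sketch, not part of the patch): the llumnix/arg_utils.py hunk switches --enable-pd-disagg from type=bool to action='store_true'. With type=bool, argparse calls bool() on the raw option string, so any non-empty value, including "False", parses as True; store_true turns the option into a plain on/off flag and also explains the docs/Arguments.md usage change from "[--enable-pd-disagg ENABLE_PD_DISAGG]" to "[--enable-pd-disagg]". A minimal, self-contained illustration assuming only the standard argparse module (the real parser defines many more arguments):

import argparse

# Hypothetical stand-alone parser mirroring only the patched option from
# llumnix/arg_utils.py.
parser = argparse.ArgumentParser()
parser.add_argument('--enable-pd-disagg',
                    action='store_true',
                    help='enable prefill decoding disaggregation')

# With store_true the usage string shows a bare flag with no metavar,
# matching the updated docs/Arguments.md usage block.
print(parser.format_usage())

# The flag defaults to False and flips to True only when passed; the old
# type=bool version evaluated bool('<raw string>'), which is True for any
# non-empty value.
print(parser.parse_args([]).enable_pd_disagg)                      # False
print(parser.parse_args(['--enable-pd-disagg']).enable_pd_disagg)  # True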