Remove pd_disagg test in e2e migration test
s5u13b committed Nov 7, 2024
1 parent 1b4224c commit c12e2c5
Showing 2 changed files with 1 addition and 7 deletions.
3 changes: 0 additions & 3 deletions tests/e2e_test/test_e2e.py
@@ -42,7 +42,6 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool
                             ip: str = "127.0.0.1", port: int = 37000, instances_num = 1, dispatch_policy: str = "load",
                             migration_backend = "gloo", model = "facebook/opt-125m", max_model_len: int = 2048,
                             launch_mode: str = 'eief', log_instance_info: bool = False,
-                            enable_pd_disagg: bool = False, num_dispatch_instances: int = math.inf,
                             request_migration_policy: str = 'SR'):
     disable_init_instance_by_manager, disable_fixed_node_init_instance = parse_launch_mode(launch_mode)
     command = (
@@ -67,8 +66,6 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool
         f"--migration-cache-blocks 32 "
         f"--tensor-parallel-size 1 "
         f"--request-output-queue-port {1234+port} "
-        f"{'--enable-pd-disagg ' if enable_pd_disagg else ''} "
-        f"{f'--num-dispatch-instances {num_dispatch_instances} ' if num_dispatch_instances != math.inf else ''} "
         f"{'--launch-ray-cluster ' if launch_ray_cluster else ''}"
         f"{'> instance_'+result_filename if len(result_filename)> 0 else ''} 2>&1 &"
     )
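For reference, the signature that survives these two hunks, reconstructed from the context lines above. This is a sketch, not the verbatim file: the default value of launch_ray_cluster is truncated in the hunk header, so Ellipsis stands in for it here.

    def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool = ...,  # default truncated in the hunk header
                                ip: str = "127.0.0.1", port: int = 37000, instances_num = 1, dispatch_policy: str = "load",
                                migration_backend = "gloo", model = "facebook/opt-125m", max_model_len: int = 2048,
                                launch_mode: str = 'eief', log_instance_info: bool = False,
                                request_migration_policy: str = 'SR'):
        # Post-change signature: enable_pd_disagg and num_dispatch_instances are gone,
        # so callers can no longer request prefill-decode disaggregation from this helper.
        ...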
5 changes: 1 addition & 4 deletions tests/e2e_test/test_migration.py
@@ -67,9 +67,8 @@ def parse_manager_log_file(log_file):
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="at least 2 gpus required for migration bench")
 @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B'])
 @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl'])
-@pytest.mark.parametrize("enable_pd_disagg", [False, True])
 @pytest.mark.parametrize("migrated_request_status", ['running', 'waiting'])
-async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, migrated_request_status):
+async def test_migration_benchmark(model, migration_backend, migrated_request_status):
     if migrated_request_status == 'waiting' and migration_backend != 'rpc':
         pytest.skip("When the migrated request status is waiting, only test the rpc migration backend.")
 
@@ -79,14 +78,12 @@ async def test_migration_benchmark(model, migration_backend, enable_pd_disagg, migrated_request_status):
     instance_output_logs = []
 
     device_count = torch.cuda.device_count()
-    num_dispatch_instances = device_count//2 if enable_pd_disagg else math.inf
     for i in range(device_count):
         output_log = f"{base_port+i}.out"
         instance_output_logs.append("instance_"+output_log)
         launch_command = generate_launch_command(result_filename=output_log, launch_ray_cluster=False, port=base_port+i,
                                                  model=model, dispatch_policy="flood", migration_backend=migration_backend,
                                                  log_instance_info=True,
-                                                 enable_pd_disagg=enable_pd_disagg, num_dispatch_instances=num_dispatch_instances,
                                                  request_migration_policy=request_migration_policy)
         subprocess.run(launch_command, shell=True, check=True)
         await asyncio.sleep(5)
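With enable_pd_disagg dropped from the parametrize stack, the benchmark collects 6 parameter combinations instead of 12 (1 model x 3 migration backends x 2 request statuses), and the skip guard in the test body still limits 'waiting' to the rpc backend. A minimal sketch of that arithmetic, assuming nothing beyond the values visible in the diff:

    import itertools

    models = ['/mnt/model/Qwen-7B']
    migration_backends = ['rpc', 'gloo', 'nccl']
    migrated_request_statuses = ['running', 'waiting']

    # Collected cases: the full Cartesian product, as stacked pytest.mark.parametrize
    # decorators build it.
    collected = list(itertools.product(models, migration_backends, migrated_request_statuses))

    # Executed cases: the test body skips 'waiting' on any backend other than 'rpc'.
    executed = [(m, b, s) for m, b, s in collected if not (s == 'waiting' and b != 'rpc')]

    print(len(collected), len(executed))  # 6 collected, 4 executed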
