Commit 915bbe3: fix

KuilongCui committed Nov 7, 2024 · 1 parent bd57e3d
Showing 3 changed files with 20 additions and 16 deletions.
tests/e2e_test/test_migration.py (6 changes: 3 additions & 3 deletions)

@@ -67,13 +67,13 @@ def parse_manager_log_file(log_file):
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="at least 2 gpus required for migration bench")
 @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B'])
 @pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl'])
-@pytest.mark.parametrize("enable_pd_disagg", [False, True])
-async def test_migration_benchmark(model, migration_backend, enable_pd_disagg):
+async def test_migration_benchmark(model, migration_backend):
     base_port = 37037
     instance_output_logs = []
 
     device_count = torch.cuda.device_count()
-    num_dispatch_instances = device_count//2 if enable_pd_disagg else math.inf
+    enable_pd_disagg = False
+    num_dispatch_instances = math.inf
     for i in range(device_count):
         output_log = f"{base_port+i}.out"
         instance_output_logs.append("instance_"+output_log)
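Note on the hunk above: dropping the enable_pd_disagg parametrize axis shrinks the benchmark matrix from model x backend x pd_disagg (1 x 3 x 2 = 6 runs) to model x backend (3 runs), with prefill-decode disaggregation now pinned off inside the test body. A minimal standalone sketch of how stacked pytest parametrize decorators multiply cases (hypothetical test name, not code from this repository):

import pytest

# Stacked parametrize decorators form a cross-product: before this commit
# the benchmark generated 3 backends x 2 pd_disagg values = 6 cases.
@pytest.mark.parametrize("migration_backend", ['rpc', 'gloo', 'nccl'])
@pytest.mark.parametrize("enable_pd_disagg", [False, True])
def test_matrix(migration_backend, enable_pd_disagg):
    assert migration_backend in ('rpc', 'gloo', 'nccl')
    assert enable_pd_disagg in (False, True)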
tests/unit_test/backends/vllm/test_migration_backend.py (25 changes: 15 additions & 10 deletions)

@@ -30,12 +30,12 @@ def get_ready_workers(num_worker, num_gpu_blocks, engine_config, migraiton_config):
     workers = []
     worker_ids = []
 
-    for i in range(num_worker):
+    for _ in range(num_worker):
         worker_id = random_uuid()
-        worker = create_worker(rank=0, local_rank=i, engine_config=engine_config,
+        worker = create_worker(rank=0, local_rank=0, engine_config=engine_config,
                                worker_module_name="tests.unit_test.backends.vllm.test_migration_backend",
                                worker_class_name="MockMigrationWorker")
-
+        ray.get(worker.execute_method.remote('init_device'))
         ray.get(worker.execute_method.remote('initialize_cache', num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=0))
         ray.get(worker.execute_method.remote(
             'init_migration',
@@ -73,14 +73,18 @@ def set_gpu_cache(self, data):
 
     def get_gpu_cache(self):
         torch.cuda.synchronize()
-        return self.gpu_cache
+        gpu_data = []
+        for layer_idx in range(self.cache_engine.num_layers):
+            gpu_data.append(self.gpu_cache[layer_idx].clone().cpu())
+        return gpu_data
 
 @pytest.mark.skipif(torch.cuda.device_count() < 3, reason="Need at least 3 GPU to run the test.")
 @pytest.mark.parametrize("backend", ['rpc', 'gloo', 'nccl'])
 def test_one_to_many_migrate_cache(setup_ray_env, backend):
     engine_config = EngineArgs(model='facebook/opt-125m', max_model_len=8, enforce_eager=True).create_engine_config()
+    migration_internal_buffer_num = 2
     migraiton_config = EngineManagerArgs(migration_buffer_blocks=3, migration_num_layers=5,
-                                         migration_internal_buffer_num=2).create_migration_config()
+                                         migration_internal_buffer_num=migration_internal_buffer_num).create_migration_config()
     migraiton_config.migration_backend = backend
 
     num_worker = 3
@@ -117,16 +121,17 @@ def test_one_to_many_migrate_cache(setup_ray_env, backend):
         dst_worker_data = ray.get(workers[worker_idx].execute_method.remote('get_gpu_cache'))
         for layer_idx in range(num_layers):
             for src_idx, dst_idx in src_to_dst.items():
-                assert torch.allclose(worker0_data[layer_idx][0][src_idx].cpu(), dst_worker_data[layer_idx][0][dst_idx].cpu())
-                assert torch.allclose(worker0_data[layer_idx][1][src_idx].cpu(), dst_worker_data[layer_idx][1][dst_idx].cpu())
+                assert torch.allclose(worker0_data[layer_idx][0][src_idx], dst_worker_data[layer_idx][0][dst_idx])
+                assert torch.allclose(worker0_data[layer_idx][1][src_idx], dst_worker_data[layer_idx][1][dst_idx])
         worker_idx += 1
 
 @pytest.mark.skipif(torch.cuda.device_count() < 3, reason="Need at least 3 GPU to run the test.")
 @pytest.mark.parametrize("backend", ['rpc', 'gloo', 'nccl'])
 def test_many_to_one_migrate_cache(setup_ray_env, backend):
     engine_config = EngineArgs(model='facebook/opt-125m', max_model_len=8, enforce_eager=True).create_engine_config()
+    migration_internal_buffer_num = 2
     migraiton_config = EngineManagerArgs(migration_buffer_blocks=3, migration_num_layers=5,
-                                         migration_internal_buffer_num=2).create_migration_config()
+                                         migration_internal_buffer_num=migration_internal_buffer_num).create_migration_config()
     migraiton_config.migration_backend = backend
 
     num_worker = 3
@@ -168,6 +173,6 @@ def test_many_to_one_migrate_cache(setup_ray_env, backend):
 
     for layer_idx in range(num_layers):
         for src_idx, dst_idx in src_to_dst.items():
-            assert torch.allclose(worker_datas[worker_idx][layer_idx][0][src_idx].cpu(), dst_worker_data[layer_idx][0][dst_idx].cpu())
-            assert torch.allclose(worker_datas[worker_idx][layer_idx][1][src_idx].cpu(), dst_worker_data[layer_idx][1][dst_idx].cpu())
+            assert torch.allclose(worker_datas[worker_idx][layer_idx][0][src_idx], dst_worker_data[layer_idx][0][dst_idx])
+            assert torch.allclose(worker_datas[worker_idx][layer_idx][1][src_idx], dst_worker_data[layer_idx][1][dst_idx])
     worker_idx += 1
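Note on the get_gpu_cache hunks above: returning self.gpu_cache handed callers references to live GPU buffers, which a later migration could overwrite before the assertions ran; the patched method instead returns per-layer clone().cpu() snapshots, which is also why the torch.allclose checks dropped their .cpu() calls. A plain-PyTorch sketch of the aliasing pitfall (illustrative only, a CPU tensor standing in for one KV-cache layer, no vLLM types):

import torch

gpu_cache = [torch.ones(4)]             # stand-in for one KV-cache layer

alias = gpu_cache[0]                    # old behavior: a reference to the live buffer
snapshot = gpu_cache[0].clone().cpu()   # new behavior: a detached copy

gpu_cache[0].fill_(2.0)                 # e.g. a later migration rewrites the cache

print(torch.allclose(alias, torch.ones(4)))     # False: alias tracked the mutation
print(torch.allclose(snapshot, torch.ones(4)))  # True: snapshot preserved the data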
tests/unit_test/backends/vllm/test_worker.py (5 changes: 2 additions & 3 deletions)

@@ -38,8 +38,8 @@ def create_worker(rank: int, local_rank: int, engine_config: EngineConfig,
                           worker_class_name=worker_class_name,
                           trust_remote_code=True
                           )
-    cuda_env = {'CUDA_VISIBLE_DEVICES': ",".join([str(i) for i in range(torch.cuda.device_count())])}
-    ray.get(worker.update_environment_variables.remote(cuda_env))
+    # cuda_env = {'CUDA_VISIBLE_DEVICES': ",".join([str(i) for i in range(torch.cuda.device_count())])}
+    # ray.get(worker.update_environment_variables.remote(cuda_env))
     ray.get(worker.init_worker.remote(
         model_config=engine_config.model_config,
         parallel_config=engine_config.parallel_config,
@@ -54,7 +54,6 @@ def create_worker(rank: int, local_rank: int, engine_config: EngineConfig,
         vision_language_config=engine_config.vision_language_config,
         is_driver_worker = False
     ))
-    ray.get(worker.execute_method.remote('init_device'))
 
     return worker

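Note on create_worker: after this commit the helper neither sets CUDA_VISIBLE_DEVICES nor calls init_device itself, so each test initializes the device explicitly, as the test_migration_backend.py hunk above already does. A sketch of the resulting calling convention (identifiers taken from this diff; engine_config and num_gpu_blocks come from the surrounding test setup, elided here):

# The helper now only constructs the Ray actor and runs init_worker;
# device and cache initialization are the caller's responsibility.
worker = create_worker(rank=0, local_rank=0, engine_config=engine_config,
                       worker_module_name="tests.unit_test.backends.vllm.test_migration_backend",
                       worker_class_name="MockMigrationWorker")
ray.get(worker.execute_method.remote('init_device'))
ray.get(worker.execute_method.remote('initialize_cache',
                                     num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=0))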
