From 9bade024fb3ef2a5f6b16a353f8c59ba0032abbf Mon Sep 17 00:00:00 2001 From: Hongpeng Guo Date: Wed, 18 Dec 2024 16:43:04 -0800 Subject: [PATCH 1/2] add env vars to enable AMD devices are shared Signed-off-by: Hongpeng Guo --- python/ray/_private/ray_constants.py | 1 + python/ray/train/_internal/backend_executor.py | 7 +++++++ python/ray/train/constants.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py index 44512e755afd..82adb72d37e1 100644 --- a/python/ray/_private/ray_constants.py +++ b/python/ray/_private/ray_constants.py @@ -434,6 +434,7 @@ def env_set_by_user(key): NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES" CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES" +ROCM_VISIBLE_DEVICES_ENV_VAR = "ROCM_VISIBLE_DEVICES" NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES" TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS" NPU_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES" diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index d1bef8705f43..4682e7001fe4 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -27,6 +27,7 @@ ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, + ENABLE_SHARE_ROCM_VISIBLE_DEVICES_ENV, RAY_TRAIN_ENABLE_STATE_TRACKING, TRAIN_ENABLE_WORKER_SPREAD_ENV, TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, @@ -124,6 +125,12 @@ def __init__( ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, ), + # For AMD GPUs, they are using ROCM_VISIBLE_DEVICES env var. 
+ ResourceConfig( + ray_constants.GPU, + ENABLE_SHARE_ROCM_VISIBLE_DEVICES_ENV, + ray_constants.ROCM_VISIBLE_DEVICES_ENV_VAR, + ), ] # Record the initialization time of BackendExecutor, which is diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index 62611a6060e5..d0f3fa04af7d 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -56,6 +56,10 @@ def _get_ray_train_session_dir() -> str: # Backend.share_cuda_visible_devices. 1 for True, 0 for False. ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_CUDA_VISIBLE_DEVICES" +# Integer value which if set will not share ROCM accelerator visible devices +# across workers. 1 for True (default), 0 for False. +ENABLE_SHARE_ROCM_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ROCM_VISIBLE_DEVICES" + # Integer value which if set will not share neuron-core accelerator visible cores # across workers. 1 for True (default), 0 for False. ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV = ( From be5ca426ce5bf5385676bf3e3a52d2466bbc79d5 Mon Sep 17 00:00:00 2001 From: Hongpeng Guo Date: Thu, 19 Dec 2024 13:35:09 -0800 Subject: [PATCH 2/2] update ROCM to ROCR Signed-off-by: Hongpeng Guo --- python/ray/_private/ray_constants.py | 2 +- python/ray/train/_internal/backend_executor.py | 8 ++++---- python/ray/train/constants.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py index 82adb72d37e1..470436c4008d 100644 --- a/python/ray/_private/ray_constants.py +++ b/python/ray/_private/ray_constants.py @@ -434,7 +434,7 @@ def env_set_by_user(key): NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES" CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES" -ROCM_VISIBLE_DEVICES_ENV_VAR = "ROCM_VISIBLE_DEVICES" +ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES" NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES" TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS" 
NPU_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES" diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index 4682e7001fe4..c3f3a79aaec2 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -27,7 +27,7 @@ ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, - ENABLE_SHARE_ROCM_VISIBLE_DEVICES_ENV, + ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, RAY_TRAIN_ENABLE_STATE_TRACKING, TRAIN_ENABLE_WORKER_SPREAD_ENV, TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, @@ -125,11 +125,11 @@ def __init__( ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, ), - # For AMD GPUs, they are using ROCM_VISIBLE_DEVICES env var. + # AMD GPUs use the ROCR_VISIBLE_DEVICES env var. ResourceConfig( ray_constants.GPU, - ENABLE_SHARE_ROCM_VISIBLE_DEVICES_ENV, - ray_constants.ROCM_VISIBLE_DEVICES_ENV_VAR, + ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, + ray_constants.ROCR_VISIBLE_DEVICES_ENV_VAR, ), ] diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index d0f3fa04af7d..e7f259302821 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -56,9 +56,9 @@ def _get_ray_train_session_dir() -> str: # Backend.share_cuda_visible_devices. 1 for True, 0 for False. ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_CUDA_VISIBLE_DEVICES" -# Integer value which if set will not share ROCM accelerator visible devices +# Integer value controlling whether ROCR accelerator visible devices are shared # across workers. 1 for True (default), 0 for False. -ENABLE_SHARE_ROCM_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ROCM_VISIBLE_DEVICES" +ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ROCR_VISIBLE_DEVICES" # Integer value which if set will not share neuron-core accelerator visible cores # across workers.
1 for True (default), 0 for False.