diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py index 44512e755afd..470436c4008d 100644 --- a/python/ray/_private/ray_constants.py +++ b/python/ray/_private/ray_constants.py @@ -434,6 +434,7 @@ def env_set_by_user(key): NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES" CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES" +ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES" NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES" TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS" NPU_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES" diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index 50eabe77642b..3815f31add40 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -27,6 +27,7 @@ ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, + ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, RAY_TRAIN_ENABLE_STATE_TRACKING, TRAIN_ENABLE_WORKER_SPREAD_ENV, TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, @@ -124,6 +125,12 @@ def __init__( ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, ), + # AMD GPUs use the ROCR_VISIBLE_DEVICES env var. + ResourceConfig( + ray_constants.GPU, + ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, + ray_constants.ROCR_VISIBLE_DEVICES_ENV_VAR, + ), ] # Record the initialization time of BackendExecutor, which is diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index 62611a6060e5..e7f259302821 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -56,6 +56,10 @@ def _get_ray_train_session_dir() -> str: # Backend.share_cuda_visible_devices. 1 for True, 0 for False. 
ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_CUDA_VISIBLE_DEVICES" +# Integer flag controlling whether ROCR accelerator visible devices are shared +# across workers. 1 for True (default), 0 for False. +ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ROCR_VISIBLE_DEVICES" + # Integer value which if set will not share neuron-core accelerator visible cores # across workers. 1 for True (default), 0 for False. ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV = (