Skip to content

Commit

Permalink
[RayCluster] add GPUShare to known custom accelerators
Browse files Browse the repository at this point in the history
Signed-off-by: win5923 <[email protected]>
  • Loading branch information
win5923 committed Dec 10, 2024
1 parent aeba37e commit bde319f
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 0 deletions.
3 changes: 3 additions & 0 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,14 @@ const (
NeuronCoreRayResourceName = "neuron_cores"
TPUContainerResourceName = "google.com/tpu"
TPURayResourceName = "TPU"
GPUShareContainerResourceName = "aliyun.com/gpu-mem"
GPUShareResourceName = "gpu_share"
)

// customAcceleratorToRayResourceMap maps the container resource name of each
// known custom accelerator (for example "google.com/tpu" or
// "aliyun.com/gpu-mem") to the name of the corresponding Ray resource
// (for example "TPU" or "gpu_share").
//
// NOTE(review): GPUShareResourceName does not carry the "Ray" infix used by the
// sibling values NeuronCoreRayResourceName and TPURayResourceName; consider
// aligning the name if the exported constant can still be renamed.
var customAcceleratorToRayResourceMap = map[string]string{
NeuronCoreContainerResourceName: NeuronCoreRayResourceName,
TPUContainerResourceName: TPURayResourceName,
// GPU share: "aliyun.com/gpu-mem" container resource -> "gpu_share" Ray resource.
GPUShareContainerResourceName: GPUShareResourceName,
}

// Get the port required to connect to the Ray cluster by worker nodes and drivers
Expand Down
25 changes: 25 additions & 0 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1244,6 +1244,17 @@ func TestGenerateRayStartCommand(t *testing.T) {
},
expected: `ray start --resources='{"TPU":4}' `,
},
{
name: "WorkerNode with GPU Share",
nodeType: rayv1.WorkerNode,
rayStartParams: map[string]string{},
resource: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
"aliyun.com/gpu-mem": resource.MustParse("4"),
},
},
expected: `ray start --resources='{"gpu_share":4}' `,
},
{
name: "HeadNode with Neuron Cores",
nodeType: rayv1.HeadNode,
Expand Down Expand Up @@ -1273,6 +1284,7 @@ func TestGenerateRayStartCommand(t *testing.T) {
rayStartParams: map[string]string{},
resource: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
"aliyun.com/gpu-mem": resource.MustParse("10"),
"google.com/tpu": resource.MustParse("8"),
"aws.amazon.com/neuroncore": resource.MustParse("4"),
"nvidia.com/gpu": resource.MustParse("1"),
Expand Down Expand Up @@ -1319,6 +1331,19 @@ func TestGenerateRayStartCommand(t *testing.T) {
},
expected: `ray start --head --resources='{"custom_resource":2,"TPU":4}' `,
},
{
name: "HeadNode with existing GPU Share resources",
nodeType: rayv1.HeadNode,
rayStartParams: map[string]string{
"resources": `'{"custom_resource":2,"gpu_share":4}'`,
},
resource: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
"aliyun.com/gpu-mem": resource.MustParse("8"),
},
},
expected: `ray start --head --resources='{"custom_resource":2,"gpu_share":4}' `,
},
{
name: "HeadNode with invalid resources string",
nodeType: rayv1.HeadNode,
Expand Down

0 comments on commit bde319f

Please sign in to comment.