support extended resources for Ray pods #2436

Merged
6 commits merged on Oct 15, 2024
Changes from 3 commits
12 changes: 11 additions & 1 deletion apiserver/pkg/model/converter.go
@@ -385,7 +385,17 @@ func FromKubeToAPIComputeTemplate(configMap *corev1.ConfigMap) *api.ComputeTemplate
runtime.Memory = uint32(memory)
runtime.Gpu = uint32(gpu)
runtime.GpuAccelerator = configMap.Data["gpu_accelerator"]
val, ok := configMap.Data["tolerations"]

val, ok := configMap.Data["extended_resources"]
if ok {
err := json.Unmarshal([]byte(val), &runtime.ExtendedResources)
if err != nil {
klog.Error("failed to unmarshall extended resources for compute template ", runtime.Name, " value ",
runtime.ExtendedResources, " error ", err)
}
}

val, ok = configMap.Data["tolerations"]
if ok {
err := json.Unmarshal([]byte(val), &runtime.Tolerations)
if err != nil {
33 changes: 20 additions & 13 deletions apiserver/pkg/model/converter_test.go
@@ -126,24 +126,26 @@ var headSpecTest = rayv1api.HeadGroupSpec{

var configMapWithoutTolerations = corev1.ConfigMap{
Data: map[string]string{
"cpu": "4",
"gpu": "0",
"gpu_accelerator": "",
"memory": "8",
"name": "head-node-template",
"namespace": "max",
"cpu": "4",
"gpu": "0",
"gpu_accelerator": "",
"memory": "8",
"extended_resources": "{\"vpc.amazonaws.com/efa\": 32}",
Collaborator:

Consider "custom_resources" instead, which is more aligned to Ray terminology: https://docs.ray.io/en/latest/ray-core/scheduling/resources.html#custom-resources

Collaborator:

On second thought, maybe custom_resources is misleading because this is never passed into the --resources flag in ray start. Do you need to include this as a custom resource in Ray, or is it enough to add it as a container resource?

Contributor Author:

This is not for custom_resources; it is the extended resource for the Kubernetes container.
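For context, a minimal sketch (not part of this PR's diff) of what that means at the Kubernetes level: an extended resource such as vpc.amazonaws.com/efa is requested through the container's resource requests and limits, exactly like nvidia.com/gpu, and is independent of Ray's --resources flag. The resource name and quantity below are illustrative.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Illustrative only: set the request and limit for an extended resource on
	// a container, the same way the apiserver sets them for GPU accelerators.
	container := corev1.Container{
		Name: "ray-worker",
		Resources: corev1.ResourceRequirements{
			Requests: corev1.ResourceList{},
			Limits:   corev1.ResourceList{},
		},
	}
	container.Resources.Requests[corev1.ResourceName("vpc.amazonaws.com/efa")] = resource.MustParse("32")
	container.Resources.Limits[corev1.ResourceName("vpc.amazonaws.com/efa")] = resource.MustParse("32")
	fmt.Println(container.Resources.Requests, container.Resources.Limits)
}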

"name": "head-node-template",
"namespace": "max",
},
}

var configMapWithTolerations = corev1.ConfigMap{
Data: map[string]string{
"cpu": "4",
"gpu": "0",
"gpu_accelerator": "",
"memory": "8",
"name": "head-node-template",
"namespace": "max",
"tolerations": "[{\"key\":\"blah1\",\"operator\":\"Exists\",\"effect\":\"NoExecute\"}]",
"cpu": "4",
"gpu": "0",
"gpu_accelerator": "",
"memory": "8",
"extended_resources": "{\"vpc.amazonaws.com/efa\": 32}",
"name": "head-node-template",
"namespace": "max",
"tolerations": "[{\"key\":\"blah1\",\"operator\":\"Exists\",\"effect\":\"NoExecute\"}]",
},
}

@@ -578,6 +580,11 @@ func TestPopulateTemplate(t *testing.T) {
t.Errorf("failed to convert config map, got %v, expected %v", tolerationToString(template.Tolerations[0]),
tolerationToString(&expectedTolerations))
}

assert.Equal(t, uint32(4), template.Cpu, "CPU mismatch")
assert.Equal(t, uint32(8), template.Memory, "Memory mismatch")
assert.Equal(t, uint32(0), template.Gpu, "GPU mismatch")
assert.Equal(t, map[string]uint32{"vpc.amazonaws.com/efa": 32}, template.ExtendedResources, "Extended resources mismatch")
}

func tolerationToString(toleration *api.PodToleration) string {
48 changes: 33 additions & 15 deletions apiserver/pkg/util/cluster.go
@@ -145,6 +145,15 @@ func buildNodeGroupAnnotations(computeTemplate *api.ComputeTemplate, image string
return annotations
}

// Add resource to container
func addResourceToContainer(container *corev1.Container, resourceName string, quantity uint32) {
if quantity > 0 {
Collaborator:

nit:

if quantity == 0 {
  return
}

quantityStr := fmt.Sprint(quantity)
container.Resources.Requests[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr)
container.Resources.Limits[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr)

Contributor Author:

Done, thanks!

quantityStr := fmt.Sprint(quantity)
container.Resources.Requests[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr)
container.Resources.Limits[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr)
}
}
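For reference, a sketch of how the helper might look with the suggested early return applied (the diff above is from commit 3, before that review feedback was addressed); this shows one possible shape, not necessarily the final merged code.

package util

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// addResourceToContainer sets both the request and the limit for the named
// resource on the container, returning early for zero quantities.
func addResourceToContainer(container *corev1.Container, resourceName string, quantity uint32) {
	if quantity == 0 {
		return
	}
	quantityStr := fmt.Sprint(quantity)
	container.Resources.Requests[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr)
	container.Resources.Limits[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr)
}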

// Build head node template
func buildHeadPodTemplate(imageVersion string, envs *api.EnvironmentVariables, spec *api.HeadGroupSpec, computeRuntime *api.ComputeTemplate, enableServeService bool) (*corev1.PodTemplateSpec, error) {
image := constructRayImage(RayClusterDefaultImageRepository, imageVersion)
@@ -232,15 +241,18 @@ func buildHeadPodTemplate(imageVersion string, envs *api.EnvironmentVariables, spec *api.HeadGroupSpec, computeRuntime *api.ComputeTemplate, enableServeService bool) (*corev1.PodTemplateSpec, error) {
// We are filtering container by name `ray-head`. If container with this name does not exist
// (should never happen) we are not adding container specific parameters
if container, index, ok := GetContainerByName(podTemplateSpec.Spec.Containers, "ray-head"); ok {
if computeRuntime.GetGpu() != 0 {
gpu := computeRuntime.GetGpu()
if gpu := computeRuntime.GetGpu(); gpu != 0 {
accelerator := "nvidia.com/gpu"
if len(computeRuntime.GetGpuAccelerator()) != 0 {
accelerator = computeRuntime.GetGpuAccelerator()
}
container.Resources.Requests[corev1.ResourceName(accelerator)] = resource.MustParse(fmt.Sprint(gpu))
container.Resources.Limits[corev1.ResourceName(accelerator)] = resource.MustParse(fmt.Sprint(gpu))
addResourceToContainer(&container, accelerator, gpu)
}

for k, v := range computeRuntime.GetExtendedResources() {
addResourceToContainer(&container, k, v)
}

globalEnv := convertEnvironmentVariables(envs)
if len(globalEnv) > 0 {
container.Env = append(container.Env, globalEnv...)
@@ -528,16 +540,16 @@ func buildWorkerPodTemplate(imageVersion string, envs *api.EnvironmentVariables,
// We are filtering container by name `ray-worker`. If container with this name does not exist
// (should never happen) we are not adding container specific parameters
if container, index, ok := GetContainerByName(podTemplateSpec.Spec.Containers, "ray-worker"); ok {
if computeRuntime.GetGpu() != 0 {
gpu := computeRuntime.GetGpu()
if gpu := computeRuntime.GetGpu(); gpu != 0 {
accelerator := "nvidia.com/gpu"
if len(computeRuntime.GetGpuAccelerator()) != 0 {
accelerator = computeRuntime.GetGpuAccelerator()
}
addResourceToContainer(&container, accelerator, gpu)
}

// need smarter algorithm to filter main container. for example filter by name `ray-worker`
container.Resources.Requests[corev1.ResourceName(accelerator)] = resource.MustParse(fmt.Sprint(gpu))
container.Resources.Limits[corev1.ResourceName(accelerator)] = resource.MustParse(fmt.Sprint(gpu))
for k, v := range computeRuntime.GetExtendedResources() {
addResourceToContainer(&container, k, v)
}

globalEnv := convertEnvironmentVariables(envs)
@@ -800,14 +812,20 @@ func (c *RayCluster) SetAnnotationsToAllTemplates(key string, value string) {

// Build compute template
func NewComputeTemplate(runtime *api.ComputeTemplate) (*corev1.ConfigMap, error) {
extendedResourcesJSON, err := json.Marshal(runtime.ExtendedResources)
if err != nil {
return nil, fmt.Errorf("failed to marshal extended resources: %v", err)
Collaborator:

When we fail to marshal runtime.Tolerations on line 842, we log the error instead and leave tolerations unset. Should we consider something similar for extended resources?

Collaborator:

Returning an error is probably better here; we should consider updating line 841 below to also return an error in a follow-up PR.

Contributor Author:

In this change, I've modified the NewComputeTemplate function to return an error when marshaling runtime.Tolerations fails: #2444
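For reference, a rough sketch of the return-on-error shape being discussed; the helper name and signature below are hypothetical and are not taken from this PR or from #2444.

package util

import (
	"encoding/json"
	"fmt"
)

// marshalOptionalFields is a hypothetical helper: it marshals the optional
// compute-template fields and returns any error to the caller instead of
// logging it and leaving the value unset.
func marshalOptionalFields(extendedResources map[string]uint32, tolerations interface{}) (map[string]string, error) {
	extendedResourcesJSON, err := json.Marshal(extendedResources)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal extended resources: %v", err)
	}
	tolerationsJSON, err := json.Marshal(tolerations)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal tolerations: %v", err)
	}
	return map[string]string{
		"extended_resources": string(extendedResourcesJSON),
		"tolerations":        string(tolerationsJSON),
	}, nil
}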

}

// Create data map
dmap := map[string]string{
"name": runtime.Name,
"namespace": runtime.Namespace,
"cpu": strconv.FormatUint(uint64(runtime.Cpu), 10),
"memory": strconv.FormatUint(uint64(runtime.Memory), 10),
"gpu": strconv.FormatUint(uint64(runtime.Gpu), 10),
"gpu_accelerator": runtime.GpuAccelerator,
"name": runtime.Name,
"namespace": runtime.Namespace,
"cpu": strconv.FormatUint(uint64(runtime.Cpu), 10),
"memory": strconv.FormatUint(uint64(runtime.Memory), 10),
"gpu": strconv.FormatUint(uint64(runtime.Gpu), 10),
"gpu_accelerator": runtime.GpuAccelerator,
"extended_resources": string(extendedResourcesJSON),
}
// Add tolerations in defined
if runtime.Tolerations != nil && len(runtime.Tolerations) > 0 {
67 changes: 41 additions & 26 deletions apiserver/pkg/util/cluster_test.go
@@ -243,6 +243,22 @@ var template = api.ComputeTemplate{
},
}

var templateWorker = api.ComputeTemplate{
Name: "",
Namespace: "",
Cpu: 2,
Memory: 8,
Gpu: 4,
ExtendedResources: map[string]uint32{"vpc.amazonaws.com/efa": 32},
Tolerations: []*api.PodToleration{
{
Key: "blah1",
Operator: "Exists",
Effect: "NoExecute",
},
},
}

var expectedToleration = corev1.Toleration{
Key: "blah1",
Operator: "Exists",
@@ -591,34 +607,33 @@ func TestBuildRayCluster(t *testing.T) {
}

func TestBuilWorkerPodTemplate(t *testing.T) {
podSpec, err := buildWorkerPodTemplate("2.4", &api.EnvironmentVariables{}, &workerGroup, &template)
podSpec, err := buildWorkerPodTemplate("2.4", &api.EnvironmentVariables{}, &workerGroup, &templateWorker)
assert.Nil(t, err)

if podSpec.Spec.ServiceAccountName != "account" {
t.Errorf("failed to propagate service account")
}
if podSpec.Spec.ImagePullSecrets[0].Name != "foo" {
t.Errorf("failed to propagate image pull secret")
}
if (string)(podSpec.Spec.Containers[0].ImagePullPolicy) != "Always" {
t.Errorf("failed to propagate image pull policy")
}
if !containsEnv(podSpec.Spec.Containers[0].Env, "foo", "bar") {
t.Errorf("failed to propagate environment")
}
if len(podSpec.Spec.Tolerations) != 1 {
t.Errorf("failed to propagate tolerations, expected 1, got %d", len(podSpec.Spec.Tolerations))
}
if !reflect.DeepEqual(podSpec.Spec.Tolerations[0], expectedToleration) {
t.Errorf("failed to propagate annotations, got %v, expected %v", tolerationToString(&podSpec.Spec.Tolerations[0]),
tolerationToString(&expectedToleration))
}
if val, exists := podSpec.Annotations["foo"]; !exists || val != "bar" {
t.Errorf("failed to convert annotations")
}
if !reflect.DeepEqual(podSpec.Labels, expectedLabels) {
t.Errorf("failed to convert labels, got %v, expected %v", podSpec.Labels, expectedLabels)
}
assert.Equal(t, "account", podSpec.Spec.ServiceAccountName, "failed to propagate service account")
assert.Equal(t, "foo", podSpec.Spec.ImagePullSecrets[0].Name, "failed to propagate image pull secret")
assert.Equal(t, corev1.PullAlways, podSpec.Spec.Containers[0].ImagePullPolicy, "failed to propagate image pull policy")
assert.True(t, containsEnv(podSpec.Spec.Containers[0].Env, "foo", "bar"), "failed to propagate environment")
assert.Len(t, podSpec.Spec.Tolerations, 1, "failed to propagate tolerations")
assert.Equal(t, expectedToleration, podSpec.Spec.Tolerations[0], "failed to propagate tolerations")
assert.Equal(t, "bar", podSpec.Annotations["foo"], "failed to convert annotations")
assert.Equal(t, expectedLabels, podSpec.Labels, "failed to convert labels")

// Check Resources
container := podSpec.Spec.Containers[0]
resources := container.Resources

assert.Equal(t, resource.MustParse("2"), resources.Limits[corev1.ResourceCPU], "CPU limit doesn't match")
assert.Equal(t, resource.MustParse("2"), resources.Requests[corev1.ResourceCPU], "CPU request doesn't match")

assert.Equal(t, resource.MustParse("8Gi"), resources.Limits[corev1.ResourceMemory], "Memory limit doesn't match")
assert.Equal(t, resource.MustParse("8Gi"), resources.Requests[corev1.ResourceMemory], "Memory request doesn't match")

assert.Equal(t, resource.MustParse("4"), resources.Limits["nvidia.com/gpu"], "GPU limit doesn't match")
assert.Equal(t, resource.MustParse("4"), resources.Requests["nvidia.com/gpu"], "GPU request doesn't match")

assert.Equal(t, resource.MustParse("32"), resources.Limits["vpc.amazonaws.com/efa"], "EFA limit doesn't match")
assert.Equal(t, resource.MustParse("32"), resources.Requests["vpc.amazonaws.com/efa"], "EFA request doesn't match")
}

func containsEnv(envs []corev1.EnvVar, key string, val string) bool {
@@ -92,6 +92,7 @@ class Template:
memory - required, template memory (GB)
gpus - optional, number of GPUs, default 0
gpu_accelerator - optional, if not defined nvidia.com/gpu is assumed
extended_resources - optional, name and number of the extended resources
tolerations - optional, tolerations for pod placing, default none
- to_string() -> str: convert toleration to string for printing
- to_dict() -> dict[str, Any] convert to dict
@@ -106,6 +107,7 @@ def __init__(
memory: int,
gpu: int = 0,
gpu_accelerator: str = None,
extended_resources: dict[str, int] = None,
tolerations: list[Toleration] = None,
):
"""
@@ -116,6 +118,7 @@ def __init__(
:param memory: memory
:param gpu: gpu
:param gpu_accelerator: accelerator type
:param extended_resources: extended resources
:param tolerations: tolerations
"""
self.name = name
@@ -124,6 +127,7 @@ def __init__(
self.memory = memory
self.gpu = gpu
self.gpu_accelerator = gpu_accelerator
self.extended_resources = extended_resources
self.tolerations = tolerations

def to_string(self) -> str:
@@ -136,6 +140,8 @@ def to_string(self) -> str:
val = val + f", gpu {self.gpu}"
if self.gpu_accelerator is not None:
val = val + f", gpu accelerator {self.gpu_accelerator}"
if self.extended_resources is not None:
val = val + f", extended resources {self.extended_resources}"
if self.tolerations is None:
return val
val = val + ", tolerations ["
@@ -158,6 +164,8 @@ def to_dict(self) -> dict[str, Any]:
dct["gpu"] = self.gpu
if self.gpu_accelerator is not None:
dct["gpu accelerator"] = self.gpu_accelerator
if self.extended_resources is not None:
dct["extended resources"] = self.extended_resources
if self.tolerations is not None:
dct["tolerations"] = [tl.to_dict() for tl in self.tolerations]
return dct
@@ -199,6 +207,7 @@ def template_decoder(dct: dict[str, Any]) -> Template:
memory=int(dct.get("memory", "0")),
gpu=int(dct.get("gpu", "0")),
gpu_accelerator=dct.get("gpu_accelerator"),
extended_resources=dct.get("extended_resources"),
tolerations=tolerations,
)

6 changes: 6 additions & 0 deletions clients/python-apiserver-client/test/api_params_test.py
@@ -77,8 +77,14 @@ def test_templates():
tm2_json = json.dumps(temp2.to_dict())
print(f"template 2 JSON: {tm2_json}")

temp3 = Template(name="template3", namespace="namespace", cpu=2, memory=8, gpu=1, extended_resources={"vpc.amazonaws.com/efa": 32})
print(f"template 3: {temp3.to_string()}")
tm3_json = json.dumps(temp3.to_dict())
print(f"template 3 JSON: {tm3_json}")

assert temp1.to_string() == template_decoder(json.loads(tm1_json)).to_string()
assert temp2.to_string() == template_decoder(json.loads(tm2_json)).to_string()
assert temp3.to_string() == template_decoder(json.loads(tm3_json)).to_string()


def test_volumes():
2 changes: 1 addition & 1 deletion clients/python-apiserver-client/test/kuberay_api_test.py
@@ -43,7 +43,7 @@ def test_templates():
_, _ = apis.delete_compute_template(ns="default", name="default-template")
# create
toleration = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute)
template = Template(name="default-template", namespace="default", cpu=2, memory=8, tolerations=[toleration])
template = Template(name="default-template", namespace="default", cpu=2, memory=8, gpu=1, extended_resources={"vpc.amazonaws.com/efa": 32}, tolerations=[toleration])
status, error = apis.create_compute_template(template)
assert status == 200
assert error is None
2 changes: 2 additions & 0 deletions proto/config.proto
@@ -128,6 +128,8 @@ message ComputeTemplate {
string gpu_accelerator = 6;
// Optional pod tolerations
repeated PodToleration tolerations = 7;
// Optional. Name and number of the extended resources
map<string, uint32> extended_resources = 8;
}

// This service is not implemented.