From a56b091b9642024a21004ee3e12232fba194faf8 Mon Sep 17 00:00:00 2001 From: mounchin <106834051+mounchin@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:19:05 -0700 Subject: [PATCH] [Fix] Consistent parsing of custom accelerator resources (#2464) --- ray-operator/controllers/ray/common/pod.go | 19 ++++++++++++++++--- .../controllers/ray/common/pod_test.go | 2 +- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index dba8056e7e1..6cce6a2a9d8 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -795,8 +795,12 @@ func addWellKnownAcceleratorResources(rayStartParams map[string]string, resource // Flag to track if any custom accelerator resource are present/added in rayStartParams resources. isCustomAcceleratorResourceAdded := isCustomAcceleratorPresentInResources(resourcesMap) - for resourceKey, resourceValue := range resourceLimits { - resourceKeyString := string(resourceKey) + // Create a sorted slice of resource keys + // Needed for consistent looping and adding first found custom accelerator resource to ray start params + sortedResourceKeys := getSortedResourceKeys(resourceLimits) + + for _, resourceKeyString := range sortedResourceKeys { + resourceValue := resourceLimits[corev1.ResourceName(resourceKeyString)] // Scan for resource keys ending with "gpu" like "nvidia.com/gpu" if _, ok := rayStartParams["num-gpus"]; !ok { @@ -809,7 +813,7 @@ func addWellKnownAcceleratorResources(rayStartParams map[string]string, resource if !isCustomAcceleratorResourceAdded { if rayResourceName, ok := customAcceleratorToRayResourceMap[resourceKeyString]; ok && !resourceValue.IsZero() { if _, exists := resourcesMap[rayResourceName]; !exists { - resourcesMap[rayResourceName] = float64(resourceValue.Value()) + resourcesMap[rayResourceName] = resourceValue.AsApproximateFloat64() // Update the resources map in the rayStartParams updatedResourcesStr, err := json.Marshal(resourcesMap) @@ -855,6 +859,15 @@ func getResourcesMap(rayStartParams map[string]string) (map[string]float64, erro return resources, nil } +func getSortedResourceKeys(resourceLimits corev1.ResourceList) []string { + sortedResourceKeys := make([]string, 0, len(resourceLimits)) + for resourceKey := range resourceLimits { + sortedResourceKeys = append(sortedResourceKeys, string(resourceKey)) + } + sort.Strings(sortedResourceKeys) + return sortedResourceKeys +} + func convertParamMap(rayStartParams map[string]string) (s string) { // Order rayStartParams keys for consistent ray start command flags generation keys := make([]string, 0, len(rayStartParams)) diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index 46e23f9ffb9..b9890f28b6e 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -1272,7 +1272,7 @@ func TestGenerateRayStartCommand(t *testing.T) { NeuronCoreContainerResourceName: NeuronCoreRayResourceName, "cloud-tpus.google.com/v3": "tpu", }, - expected: `ray start --head --num-gpus=1 --resources='{"tpu":8}' `, + expected: `ray start --head --num-gpus=1 --resources='{"neuron_cores":4}' `, }, { name: "HeadNode with existing resources",