-
Notifications
You must be signed in to change notification settings - Fork 440
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
support extended resources for Ray pods #2436
Changes from 3 commits
2625fa2
2ccc105
e9d4d7a
67c152c
74cb3e5
e77cf81
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -145,6 +145,15 @@ func buildNodeGroupAnnotations(computeTemplate *api.ComputeTemplate, image strin | |
return annotations | ||
} | ||
|
||
// Add resource to container | ||
func addResourceToContainer(container *corev1.Container, resourceName string, quantity uint32) { | ||
if quantity > 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: if quantity == 0 {
return
}
quantityStr := fmt.Sprint(quantity)\
container.Resources.Requests[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr)
container.Resources.Limits[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, thanks! |
||
quantityStr := fmt.Sprint(quantity) | ||
container.Resources.Requests[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr) | ||
container.Resources.Limits[corev1.ResourceName(resourceName)] = resource.MustParse(quantityStr) | ||
} | ||
} | ||
|
||
// Build head node template | ||
func buildHeadPodTemplate(imageVersion string, envs *api.EnvironmentVariables, spec *api.HeadGroupSpec, computeRuntime *api.ComputeTemplate, enableServeService bool) (*corev1.PodTemplateSpec, error) { | ||
image := constructRayImage(RayClusterDefaultImageRepository, imageVersion) | ||
|
@@ -232,15 +241,18 @@ func buildHeadPodTemplate(imageVersion string, envs *api.EnvironmentVariables, s | |
// We are filtering container by name `ray-head`. If container with this name does not exist | ||
// (should never happen) we are not adding container specific parameters | ||
if container, index, ok := GetContainerByName(podTemplateSpec.Spec.Containers, "ray-head"); ok { | ||
if computeRuntime.GetGpu() != 0 { | ||
gpu := computeRuntime.GetGpu() | ||
if gpu := computeRuntime.GetGpu(); gpu != 0 { | ||
accelerator := "nvidia.com/gpu" | ||
if len(computeRuntime.GetGpuAccelerator()) != 0 { | ||
accelerator = computeRuntime.GetGpuAccelerator() | ||
} | ||
container.Resources.Requests[corev1.ResourceName(accelerator)] = resource.MustParse(fmt.Sprint(gpu)) | ||
container.Resources.Limits[corev1.ResourceName(accelerator)] = resource.MustParse(fmt.Sprint(gpu)) | ||
addResourceToContainer(&container, accelerator, gpu) | ||
} | ||
|
||
for k, v := range computeRuntime.GetExtendedResources() { | ||
addResourceToContainer(&container, k, v) | ||
} | ||
|
||
globalEnv := convertEnvironmentVariables(envs) | ||
if len(globalEnv) > 0 { | ||
container.Env = append(container.Env, globalEnv...) | ||
|
@@ -528,16 +540,16 @@ func buildWorkerPodTemplate(imageVersion string, envs *api.EnvironmentVariables, | |
// We are filtering container by name `ray-worker`. If container with this name does not exist | ||
// (should never happen) we are not adding container specific parameters | ||
if container, index, ok := GetContainerByName(podTemplateSpec.Spec.Containers, "ray-worker"); ok { | ||
if computeRuntime.GetGpu() != 0 { | ||
gpu := computeRuntime.GetGpu() | ||
if gpu := computeRuntime.GetGpu(); gpu != 0 { | ||
accelerator := "nvidia.com/gpu" | ||
if len(computeRuntime.GetGpuAccelerator()) != 0 { | ||
accelerator = computeRuntime.GetGpuAccelerator() | ||
} | ||
addResourceToContainer(&container, accelerator, gpu) | ||
} | ||
|
||
// need smarter algorithm to filter main container. for example filter by name `ray-worker` | ||
container.Resources.Requests[corev1.ResourceName(accelerator)] = resource.MustParse(fmt.Sprint(gpu)) | ||
container.Resources.Limits[corev1.ResourceName(accelerator)] = resource.MustParse(fmt.Sprint(gpu)) | ||
for k, v := range computeRuntime.GetExtendedResources() { | ||
addResourceToContainer(&container, k, v) | ||
} | ||
|
||
globalEnv := convertEnvironmentVariables(envs) | ||
|
@@ -800,14 +812,20 @@ func (c *RayCluster) SetAnnotationsToAllTemplates(key string, value string) { | |
|
||
// Build compute template | ||
func NewComputeTemplate(runtime *api.ComputeTemplate) (*corev1.ConfigMap, error) { | ||
extendedResourcesJSON, err := json.Marshal(runtime.ExtendedResources) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to marshal extended resources: %v", err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When we fail to marshal runtime.Tolerations on line 842, we log error instead and leave tolerations unset. Should we consider something similar for extended resources? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Returning is probably better here, we should consider updating line 841 below to also return error in a follow-up PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this change, I've modified the |
||
} | ||
|
||
// Create data map | ||
dmap := map[string]string{ | ||
"name": runtime.Name, | ||
"namespace": runtime.Namespace, | ||
"cpu": strconv.FormatUint(uint64(runtime.Cpu), 10), | ||
"memory": strconv.FormatUint(uint64(runtime.Memory), 10), | ||
"gpu": strconv.FormatUint(uint64(runtime.Gpu), 10), | ||
"gpu_accelerator": runtime.GpuAccelerator, | ||
"name": runtime.Name, | ||
"namespace": runtime.Namespace, | ||
"cpu": strconv.FormatUint(uint64(runtime.Cpu), 10), | ||
"memory": strconv.FormatUint(uint64(runtime.Memory), 10), | ||
"gpu": strconv.FormatUint(uint64(runtime.Gpu), 10), | ||
"gpu_accelerator": runtime.GpuAccelerator, | ||
"extended_resources": string(extendedResourcesJSON), | ||
} | ||
// Add tolerations in defined | ||
if runtime.Tolerations != nil && len(runtime.Tolerations) > 0 { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider "custom_resources" instead, which is more aligned to Ray terminology: https://docs.ray.io/en/latest/ray-core/scheduling/resources.html#custom-resources
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On second thought, maybe custom_resources is misleading because this is never passed into the
--resources
flag inray start
. Do you need to include this as a custom resoruce in Ray or is it enough to add it as a container resource?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not for custom_resources, but the extended resource for kubernetes container.