Skip to content

[RayCluster] add annotation to enable non-login bash #3630

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 27 additions & 7 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ var customAcceleratorToRayResourceMap = map[string]string{
TPUContainerResourceName: TPURayResourceName,
}

var (
containerCommandWithLoginBash = []string{"/bin/bash", "-lc", "--"}
containerCommandWithNonLoginBash = []string{"/bin/bash", "-c", "--"}
)

// Get the port required to connect to the Ray cluster by worker nodes and drivers
// started within the cluster.
// For Ray >= 1.11.0 this is the GCS server port. For Ray < 1.11.0 it is the Redis port.
Expand All @@ -54,6 +59,12 @@ func GetHeadPort(headStartParams map[string]string) string {
return strconv.Itoa(utils.DefaultGcsServerPort)
}

// Check if the Ray cluster is using a non-login bash command.
func isUseNonLoginBashCmd(instance rayv1.RayCluster) bool {
v, ok := instance.Annotations[utils.RayNonLoginBashCmdAnnotationKey]
return ok && strings.ToLower(v) == "true"
}

// Check if overwrites the container command.
func isOverwriteRayContainerCmd(instance rayv1.RayCluster) bool {
v, ok := instance.Annotations[utils.RayOverwriteContainerCmdAnnotationKey]
Expand All @@ -68,6 +79,10 @@ func initTemplateAnnotations(instance rayv1.RayCluster, podTemplate *corev1.PodT
if isOverwriteRayContainerCmd(instance) {
podTemplate.Annotations[utils.RayOverwriteContainerCmdAnnotationKey] = "true"
}

if isUseNonLoginBashCmd(instance) {
podTemplate.Annotations[utils.RayNonLoginBashCmdAnnotationKey] = "true"
}
}

func configureGCSFaultTolerance(podTemplate *corev1.PodTemplateSpec, instance rayv1.RayCluster, rayNodeType rayv1.RayNodeType) {
Expand Down Expand Up @@ -189,6 +204,9 @@ func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, head
autoscalerImage := podTemplate.Spec.Containers[utils.RayContainerIndex].Image
// inject autoscaler container into head pod
autoscalerContainer := BuildAutoscalerContainer(autoscalerImage)
if isUseNonLoginBashCmd(instance) {
autoscalerContainer.Command = containerCommandWithNonLoginBash
}
// Merge the user overrides from autoscalerOptions into the autoscaler container config.
mergeAutoscalerOverrides(&autoscalerContainer, instance.Spec.AutoscalerOptions)
podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, autoscalerContainer)
Expand Down Expand Up @@ -259,7 +277,7 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo
Name: "wait-gcs-ready",
Image: podTemplate.Spec.Containers[utils.RayContainerIndex].Image,
ImagePullPolicy: podTemplate.Spec.Containers[utils.RayContainerIndex].ImagePullPolicy,
Command: []string{"/bin/bash", "-lc", "--"},
Command: containerCommandWithLoginBash,
Args: []string{
fmt.Sprintf(`
SECONDS=0
Expand Down Expand Up @@ -303,6 +321,9 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo
},
},
}
if isUseNonLoginBashCmd(instance) {
initContainer.Command = containerCommandWithNonLoginBash
}
podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, initContainer)
}
// If the replica of workers is more than 1, `ObjectMeta.Name` may cause name conflict errors.
Expand Down Expand Up @@ -465,7 +486,10 @@ func BuildPod(ctx context.Context, podTemplateSpec corev1.PodTemplateSpec, rayNo
generatedCmd := fmt.Sprintf("%s; %s", ulimitCmd, rayStartCmd)
log.Info("BuildPod", "rayNodeType", rayNodeType, "generatedCmd", generatedCmd)
// replacing the old command
pod.Spec.Containers[utils.RayContainerIndex].Command = []string{"/bin/bash", "-lc", "--"}
pod.Spec.Containers[utils.RayContainerIndex].Command = containerCommandWithLoginBash
if v, ok := podTemplateSpec.Annotations[utils.RayNonLoginBashCmdAnnotationKey]; ok && strings.ToLower(v) == "true" {
pod.Spec.Containers[utils.RayContainerIndex].Command = containerCommandWithNonLoginBash
}
if cmd != "" {
// If 'ray start' has --block specified, commands after it will not get executed.
// so we need to put cmd before cont.
Expand Down Expand Up @@ -532,11 +556,7 @@ func BuildAutoscalerContainer(autoscalerImage string) corev1.Container {
Value: "v1",
},
},
Command: []string{
"/bin/bash",
"-lc",
"--",
},
Command: containerCommandWithLoginBash,
Args: []string{
"ray kuberay-autoscaler --cluster-name $(RAY_CLUSTER_NAME) --cluster-namespace $(RAY_CLUSTER_NAMESPACE)",
},
Expand Down
62 changes: 62 additions & 0 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -790,6 +790,34 @@ func TestBuildPod_WithOverwriteCommand(t *testing.T) {
assert.Equal(t, []string{"I am worker again"}, workerContainer.Args)
}

func TestBuildPod_WithNonLoginBashCommand(t *testing.T) {
ctx := context.Background()

cluster := instance.DeepCopy()
cluster.Annotations = map[string]string{
// When the value of the annotation is "true", KubeRay will not generate the command and args for the container.
// Instead, it will use the command and args specified by the use.
utils.RayNonLoginBashCmdAnnotationKey: "true",
}

podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0))
podTemplateSpec := DefaultHeadPodTemplate(ctx, *cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
headPod := BuildPod(ctx, podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", false, utils.GetCRDType(""), "")
headContainer := headPod.Spec.Containers[utils.RayContainerIndex]
assert.Equal(t, []string{"/bin/bash", "-c", "--"}, headContainer.Command)

worker := cluster.Spec.WorkerGroupSpecs[0]
podName = cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0)
fqdnRayIP := utils.GenerateFQDNServiceName(ctx, *cluster, cluster.Namespace)
podTemplateSpec = DefaultWorkerPodTemplate(ctx, *cluster, worker, podName, fqdnRayIP, "6379")
workerPod := BuildPod(ctx, podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", false, utils.GetCRDType(""), fqdnRayIP)
workerContainer := workerPod.Spec.Containers[utils.RayContainerIndex]
assert.Equal(t, []string{"/bin/bash", "-c", "--"}, workerContainer.Command)

initContainer := workerPod.Spec.InitContainers[0]
assert.Equal(t, []string{"/bin/bash", "-c", "--"}, initContainer.Command)
}

func TestBuildPod_WithAutoscalerEnabled(t *testing.T) {
ctx := context.Background()
cluster := instance.DeepCopy()
Expand Down Expand Up @@ -821,6 +849,40 @@ func TestBuildPod_WithAutoscalerEnabled(t *testing.T) {
assert.True(t, reflect.DeepEqual(expectedContainer, actualContainer))
}

func TestBuildPod_WithNonLoginBashCommand_WithAutoscalerEnabled(t *testing.T) {
ctx := context.Background()

cluster := instance.DeepCopy()
cluster.Annotations = map[string]string{
// When the value of the annotation is "true", KubeRay will not generate the command and args for the container.
// Instead, it will use the command and args specified by the use.
utils.RayNonLoginBashCmdAnnotationKey: "true",
}
cluster.Spec.EnableInTreeAutoscaling = &trueFlag

podName := strings.ToLower(cluster.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0))
podTemplateSpec := DefaultHeadPodTemplate(ctx, *cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
headPod := BuildPod(ctx, podTemplateSpec, rayv1.HeadNode, cluster.Spec.HeadGroupSpec.RayStartParams, "6379", false, utils.GetCRDType(""), "")
headContainer := headPod.Spec.Containers[utils.RayContainerIndex]
assert.Equal(t, []string{"/bin/bash", "-c", "--"}, headContainer.Command)

assert.Len(t, headPod.Spec.Containers, 2)
index := getAutoscalerContainerIndex(headPod)
actualContainer := headPod.Spec.Containers[index]
assert.Equal(t, []string{"/bin/bash", "-c", "--"}, actualContainer.Command)

worker := cluster.Spec.WorkerGroupSpecs[0]
podName = cluster.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + worker.GroupName + utils.DashSymbol + utils.FormatInt32(0)
fqdnRayIP := utils.GenerateFQDNServiceName(ctx, *cluster, cluster.Namespace)
podTemplateSpec = DefaultWorkerPodTemplate(ctx, *cluster, worker, podName, fqdnRayIP, "6379")
workerPod := BuildPod(ctx, podTemplateSpec, rayv1.WorkerNode, worker.RayStartParams, "6379", false, utils.GetCRDType(""), fqdnRayIP)
workerContainer := workerPod.Spec.Containers[utils.RayContainerIndex]
assert.Equal(t, []string{"/bin/bash", "-c", "--"}, workerContainer.Command)

initContainer := workerPod.Spec.InitContainers[0]
assert.Equal(t, []string{"/bin/bash", "-c", "--"}, initContainer.Command)
}

func TestBuildPod_WithCreatedByRayService(t *testing.T) {
ctx := context.Background()

Expand Down
4 changes: 4 additions & 0 deletions ray-operator/controllers/ray/utils/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ const (
// `KUBERAY_GEN_RAY_START_CMD`.
RayOverwriteContainerCmdAnnotationKey = "ray.io/overwrite-container-cmd"

// If this annotation is set to "true", the KubeRay operator will use the bash without `-l` in command.
// This is useful for some container images that do not want PATH being overwrite.
RayNonLoginBashCmdAnnotationKey = "ray.io/non-login-bash-cmd"

// Finalizers for GCS fault tolerance
GCSFaultToleranceRedisCleanupFinalizer = "ray.io/gcs-ft-redis-cleanup-finalizer"

Expand Down
Loading