From f458b8453ee7b373bb8c2d4c76e834176f65363c Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Tue, 12 Mar 2024 03:17:10 +0000 Subject: [PATCH] Simplify podAffinity injection --- applications/ray/kuberay-tpu-webhook/main.go | 34 +++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/applications/ray/kuberay-tpu-webhook/main.go b/applications/ray/kuberay-tpu-webhook/main.go index 83be6a88d..ecaa57e7f 100755 --- a/applications/ray/kuberay-tpu-webhook/main.go +++ b/applications/ray/kuberay-tpu-webhook/main.go @@ -177,32 +177,20 @@ func injectMultiHostReplicaLabel(replicaIndex int, workerGroupName string, patch // inject pod affinity and anti-affinity scheduling constraints using multiHostReplica label func injectPodAffinity(replicaIndex int, workerGroupName string, patches *[]patch) { - podAffinityPatch := patch{"op": "add"} - podAffinityPath := "/spec/affinity" - podAntiAffinityPatch := patch{"op": "add"} - - // construct pod affinity value to inject - schedule pods with the same multiHostReplica together key := "multiHostReplica" - value := []string{workerGroupName + strconv.Itoa(replicaIndex)} - affinitySelectorRequirement := metav1.LabelSelectorRequirement{key, metav1.LabelSelectorOpIn, value} + value := workerGroupName + "-" + strconv.Itoa(replicaIndex) + topologyKey := "cloud.google.com/gke-nodepool" + + // construct affinity value to inject - schedule pods with the same multiHostReplica together + podAffinityPatch := patch{"op": "add"} + podAffinityPatch["path"] = "/spec/affinity/podAffinity" + affinitySelectorRequirement := metav1.LabelSelectorRequirement{key, metav1.LabelSelectorOpIn, []string{value}} affinityMatchExpressions := []metav1.LabelSelectorRequirement{affinitySelectorRequirement} affinityLabelSelector := metav1.LabelSelector{MatchExpressions: affinityMatchExpressions} - podAffinityValue := corev1.PodAffinityTerm{LabelSelector: &affinityLabelSelector} - - // construct pod anti-affinity value to inject - our requirement is that we don't - // schedule any other pods with the multi-host replica pods when the multiHostReplica label exists - antiSelectorRequirement := metav1.LabelSelectorRequirement{key, metav1.LabelSelectorOpNotIn, value} - labelExistsRequirement := metav1.LabelSelectorRequirement{key, metav1.LabelSelectorOpExists, value} - antiMatchExpressions := []metav1.LabelSelectorRequirement{antiSelectorRequirement, labelExistsRequirement} - antiLabelSelector := metav1.LabelSelector{MatchExpressions: antiMatchExpressions} - podAntiAffinityValue := corev1.PodAffinityTerm{LabelSelector: &antiLabelSelector} - - podAffinityPatch["path"] = podAffinityPath - podAffinityPatch["value"] = corev1.PodAffinity{RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{podAffinityValue}} - podAntiAffinityPatch["path"] = podAffinityPath - podAntiAffinityPatch["value"] = corev1.PodAntiAffinity{RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{podAntiAffinityValue}} - - *patches = append(*patches, podAffinityPatch, podAntiAffinityPatch) + podAffinityTerms := []corev1.PodAffinityTerm{corev1.PodAffinityTerm{LabelSelector: &affinityLabelSelector, TopologyKey: topologyKey}} + podAffinityPatch["value"] = corev1.PodAffinity{RequiredDuringSchedulingIgnoredDuringExecution: podAffinityTerms} + + *patches = append(*patches, podAffinityPatch) } // check that the # of Ray TPU worker pods equals the # of hosts defined in the topology key