From 9fd2728543f22fb95d3b177573e572ae188b94ac Mon Sep 17 00:00:00 2001 From: Tzu-Hua Lan Date: Fri, 28 Jun 2024 14:46:48 +0800 Subject: [PATCH 1/5] [YUNIKORN-2667] E2E test for Gang app originator pod changes after restart --- .../gang_scheduling/gang_scheduling_test.go | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/test/e2e/gang_scheduling/gang_scheduling_test.go b/test/e2e/gang_scheduling/gang_scheduling_test.go index 7169a45bc..c30d99d04 100644 --- a/test/e2e/gang_scheduling/gang_scheduling_test.go +++ b/test/e2e/gang_scheduling/gang_scheduling_test.go @@ -571,6 +571,72 @@ var _ = Describe("", func() { Ω(appDaoInfo.UsedResource[hugepageKey]).To(Equal(int64(314572800)), "Used huge page resource is not correct") }) + // Test to verify that the gang app originator pod does not change after a restart + // Create a gang app with a real pod that does not have a TaskGroupName set + // A real pod will never be replaced, and the core app's allocations will always contain only the declared number of placeholder pods + // 1. Create an originator pod + // 2. Ensure the originator pod is not in allocations + // 3. Restart YuniKorn + // 4. Ensure the originator pod is still not in allocations after restart + It("Verify_Gang_App_Originator_Pod_Does_Not_Change_After_Restart", func() { + placeholderCount := 5 + + By("Create an originator pod") + podConf := k8s.TestPodConfig{ + Name: "gang-driver-pod" + common.RandSeq(5), + Labels: map[string]string{ + "app": "sleep-" + common.RandSeq(5), + "applicationId": appID, + }, + Annotations: &k8s.PodAnnotation{ + TaskGroups: []cache.TaskGroup{ + {Name: groupA, MinMember: int32(placeholderCount), MinResource: minResource}, + }, + }, + Resources: &v1.ResourceRequirements{ + Requests: v1.ResourceList{"cpu": minResource["cpu"], "memory": minResource["memory"]}, + }, + } + podTest, err := k8s.InitTestPod(podConf) + Ω(err).NotTo(HaveOccurred()) + originator, err := kClient.CreatePod(podTest, ns) + Ω(err).NotTo(HaveOccurred()) + + // Wait for the app to be created + checkAppStatus(appID, yunikorn.States().Application.Running) + + By("Ensure placeholders are allocated") + appDaoInfoBeforeRestart, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) + Ω(appDaoInfoErr).NotTo(HaveOccurred()) + checkPlaceholderData(appDaoInfoBeforeRestart, groupA, placeholderCount, 0, 0) + + By("Ensure originator pod is not in allocations") + for _, alloc := range appDaoInfoBeforeRestart.Allocations { + podName := alloc.AllocationTags["kubernetes.io/meta/podName"] + Ω(originator.Name).NotTo(Equal(podName), "The originator pod should not be in allocations") + Ω(alloc.Placeholder).To(Equal(true), "The allocation should be for a placeholder pod") + } + + By("Restart the scheduler pod") + yunikorn.RestartYunikorn(&kClient) + yunikorn.RestorePortForwarding(&kClient) + + // Wait for the app to be created + checkAppStatus(appID, yunikorn.States().Application.Running) + + By("Ensure placeholders are allocated after restart") + appDaoInfoAfterRestart, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) + Ω(appDaoInfoErr).NotTo(HaveOccurred()) + checkPlaceholderData(appDaoInfoAfterRestart, groupA, placeholderCount, 0, 0) + + By("Ensure the originator pod is still not in allocations after restart") + for _, alloc := range appDaoInfoBeforeRestart.Allocations { + podName := alloc.AllocationTags["kubernetes.io/meta/podName"] + Ω(originator.Name).NotTo(Equal(podName), "The originator pod should not be in allocations") + Ω(alloc.Placeholder).To(Equal(true), "The allocation should be for a placeholder pod") + } + }) + AfterEach(func() { tests.DumpClusterInfoIfSpecFailed(suiteName, []string{ns}) From dc832e957a9858e989b282cfbf0117e07ac657fb Mon Sep 17 00:00:00 2001 From: Tzu-Hua Lan Date: Tue, 2 Jul 2024 18:52:53 +0800 Subject: [PATCH 2/5] Change the method of verifying the originator pod --- .../gang_scheduling/gang_scheduling_test.go | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/test/e2e/gang_scheduling/gang_scheduling_test.go b/test/e2e/gang_scheduling/gang_scheduling_test.go index c30d99d04..39f90b7b6 100644 --- a/test/e2e/gang_scheduling/gang_scheduling_test.go +++ b/test/e2e/gang_scheduling/gang_scheduling_test.go @@ -572,10 +572,8 @@ var _ = Describe("", func() { }) // Test to verify that the gang app originator pod does not change after a restart - // Create a gang app with a real pod that does not have a TaskGroupName set - // A real pod will never be replaced, and the core app's allocations will always contain only the declared number of placeholder pods // 1. Create an originator pod - // 2. Ensure the originator pod is not in allocations + // 2. Ensure the originator pod is not a placeholder pod // 3. Restart YuniKorn // 4. Ensure the originator pod is still not in allocations after restart It("Verify_Gang_App_Originator_Pod_Does_Not_Change_After_Restart", func() { @@ -583,7 +581,7 @@ var _ = Describe("", func() { By("Create an originator pod") podConf := k8s.TestPodConfig{ - Name: "gang-driver-pod" + common.RandSeq(5), + Name: "gang-driver-pod-" + common.RandSeq(5), Labels: map[string]string{ "app": "sleep-" + common.RandSeq(5), "applicationId": appID, @@ -605,16 +603,23 @@ var _ = Describe("", func() { // Wait for the app to be created checkAppStatus(appID, yunikorn.States().Application.Running) + By("Sleeping") + time.Sleep(10 * time.Second) + By("Ensure placeholders are allocated") appDaoInfoBeforeRestart, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) Ω(appDaoInfoErr).NotTo(HaveOccurred()) checkPlaceholderData(appDaoInfoBeforeRestart, groupA, placeholderCount, 0, 0) - By("Ensure originator pod is not in allocations") + By("Ensure the originator pod is not a placeholder pod") + Ω(len(appDaoInfoBeforeRestart.Allocations)).To(Equal(1+placeholderCount), "Amount of allocations is incorrect") for _, alloc := range appDaoInfoBeforeRestart.Allocations { podName := alloc.AllocationTags["kubernetes.io/meta/podName"] - Ω(originator.Name).NotTo(Equal(podName), "The originator pod should not be in allocations") - Ω(alloc.Placeholder).To(Equal(true), "The allocation should be for a placeholder pod") + if podName == originator.Name { + Ω(alloc.Placeholder).To(Equal(false), "Originator pod should not be a placeholder pod") + } else { + Ω(alloc.Placeholder).To(Equal(true), "Placeholder pod should be a placeholder pod") + } } By("Restart the scheduler pod") @@ -624,16 +629,23 @@ var _ = Describe("", func() { // Wait for the app to be created checkAppStatus(appID, yunikorn.States().Application.Running) + By("Sleeping") + time.Sleep(10 * time.Second) + By("Ensure placeholders are allocated after restart") appDaoInfoAfterRestart, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) Ω(appDaoInfoErr).NotTo(HaveOccurred()) checkPlaceholderData(appDaoInfoAfterRestart, groupA, placeholderCount, 0, 0) - By("Ensure the originator pod is still not in allocations after restart") - for _, alloc := range appDaoInfoBeforeRestart.Allocations { + By("Ensure the originator pod is still not a placeholder pod after restart") + Ω(len(appDaoInfoAfterRestart.Allocations)).To(Equal(1+placeholderCount), "Amount of allocations is incorrect") + for _, alloc := range appDaoInfoAfterRestart.Allocations { podName := alloc.AllocationTags["kubernetes.io/meta/podName"] - Ω(originator.Name).NotTo(Equal(podName), "The originator pod should not be in allocations") - Ω(alloc.Placeholder).To(Equal(true), "The allocation should be for a placeholder pod") + if podName == originator.Name { + Ω(alloc.Placeholder).To(Equal(false), "Originator pod should not be a placeholder pod") + } else { + Ω(alloc.Placeholder).To(Equal(true), "Placeholder pod should be a placeholder pod") + } } }) From c38f139e4da89aa037aa29008b84509253da53d9 Mon Sep 17 00:00:00 2001 From: Tzu-Hua Lan Date: Thu, 4 Jul 2024 21:06:54 +0800 Subject: [PATCH 3/5] utils: add `WaitForAllExecPodsAllocated` method --- .../helpers/yunikorn/rest_api_utils.go | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/e2e/framework/helpers/yunikorn/rest_api_utils.go b/test/e2e/framework/helpers/yunikorn/rest_api_utils.go index f33c506b8..a27dae2ab 100644 --- a/test/e2e/framework/helpers/yunikorn/rest_api_utils.go +++ b/test/e2e/framework/helpers/yunikorn/rest_api_utils.go @@ -356,6 +356,26 @@ func (c *RClient) WaitForCompletedAppStateTransition(partition string, appID str return wait.PollUntilContextTimeout(context.TODO(), time.Second, time.Duration(timeout)*time.Second, false, c.isAppInDesiredCompletedState(partition, appID, state).WithContext()) } +func (c *RClient) WaitForAllExecPodsAllocated(partition string, queueName string, appID string, execPodCount int, timeout int) error { + return wait.PollUntilContextTimeout(context.TODO(), time.Second, time.Duration(timeout)*time.Second, false, c.areAllExecPodsAllocated(partition, queueName, appID, execPodCount).WithContext()) +} + +func (c *RClient) areAllExecPodsAllocated(partition string, queueName string, appID string, execPodCount int) wait.ConditionFunc { + return func() (bool, error) { + appInfo, err := c.GetAppInfo(partition, queueName, appID) + if err != nil { + return false, nil // returning nil here for wait & loop + } + if appInfo.Allocations == nil { + return false, nil + } + if len(appInfo.Allocations) >= execPodCount { + return true, nil + } + return false, nil + } +} + func (c *RClient) AreAllExecPodsAllotted(partition string, queueName string, appID string, execPodCount int) wait.ConditionFunc { return func() (bool, error) { appInfo, err := c.GetAppInfo(partition, queueName, appID) From 735ac1adfb7c4f16afbdd8f44725e4240a2fb8cd Mon Sep 17 00:00:00 2001 From: Tzu-Hua Lan Date: Thu, 4 Jul 2024 21:11:56 +0800 Subject: [PATCH 4/5] test: replace `sleep()` by `WaitForAllExecPodsAllocated()` --- .../gang_scheduling/gang_scheduling_test.go | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/test/e2e/gang_scheduling/gang_scheduling_test.go b/test/e2e/gang_scheduling/gang_scheduling_test.go index 39f90b7b6..524d2ba44 100644 --- a/test/e2e/gang_scheduling/gang_scheduling_test.go +++ b/test/e2e/gang_scheduling/gang_scheduling_test.go @@ -573,9 +573,9 @@ var _ = Describe("", func() { // Test to verify that the gang app originator pod does not change after a restart // 1. Create an originator pod - // 2. Ensure the originator pod is not a placeholder pod + // 2. Verify the originator pod is not a placeholder pod // 3. Restart YuniKorn - // 4. Ensure the originator pod is still not in allocations after restart + // 4. Verify the originator pod is not changed after restart It("Verify_Gang_App_Originator_Pod_Does_Not_Change_After_Restart", func() { placeholderCount := 5 @@ -603,21 +603,20 @@ var _ = Describe("", func() { // Wait for the app to be created checkAppStatus(appID, yunikorn.States().Application.Running) - By("Sleeping") - time.Sleep(10 * time.Second) + By("Ensure all pods are allocated") + err = restClient.WaitForAllExecPodsAllocated(configmanager.DefaultPartition, nsQueue, appID, 1+placeholderCount, 30) + Ω(err).NotTo(HaveOccurred()) - By("Ensure placeholders are allocated") - appDaoInfoBeforeRestart, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) + By("Verify the originator pod is not a placeholder pod") + appDaoInfo, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) Ω(appDaoInfoErr).NotTo(HaveOccurred()) - checkPlaceholderData(appDaoInfoBeforeRestart, groupA, placeholderCount, 0, 0) - - By("Ensure the originator pod is not a placeholder pod") - Ω(len(appDaoInfoBeforeRestart.Allocations)).To(Equal(1+placeholderCount), "Amount of allocations is incorrect") - for _, alloc := range appDaoInfoBeforeRestart.Allocations { + for _, alloc := range appDaoInfo.Allocations { podName := alloc.AllocationTags["kubernetes.io/meta/podName"] if podName == originator.Name { + Ω(alloc.Originator).To(Equal(true), "Originator pod should be a originator pod") Ω(alloc.Placeholder).To(Equal(false), "Originator pod should not be a placeholder pod") } else { + Ω(alloc.Originator).To(Equal(false), "Placeholder pod should not be a originator pod") Ω(alloc.Placeholder).To(Equal(true), "Placeholder pod should be a placeholder pod") } } @@ -629,21 +628,20 @@ var _ = Describe("", func() { // Wait for the app to be created checkAppStatus(appID, yunikorn.States().Application.Running) - By("Sleeping") - time.Sleep(10 * time.Second) + By("Ensure all pods are allocated") + err = restClient.WaitForAllExecPodsAllocated(configmanager.DefaultPartition, nsQueue, appID, 1+placeholderCount, 30) + Ω(err).NotTo(HaveOccurred()) - By("Ensure placeholders are allocated after restart") - appDaoInfoAfterRestart, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) + By("Verify the originator pod is not changed after restart") + appDaoInfo, appDaoInfoErr = restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) Ω(appDaoInfoErr).NotTo(HaveOccurred()) - checkPlaceholderData(appDaoInfoAfterRestart, groupA, placeholderCount, 0, 0) - - By("Ensure the originator pod is still not a placeholder pod after restart") - Ω(len(appDaoInfoAfterRestart.Allocations)).To(Equal(1+placeholderCount), "Amount of allocations is incorrect") - for _, alloc := range appDaoInfoAfterRestart.Allocations { + for _, alloc := range appDaoInfo.Allocations { podName := alloc.AllocationTags["kubernetes.io/meta/podName"] if podName == originator.Name { + Ω(alloc.Originator).To(Equal(true), "Originator pod should be a originator pod") Ω(alloc.Placeholder).To(Equal(false), "Originator pod should not be a placeholder pod") } else { + Ω(alloc.Originator).To(Equal(false), "Placeholder pod should not be a originator pod") Ω(alloc.Placeholder).To(Equal(true), "Placeholder pod should be a placeholder pod") } } From d7f1988cae6acd7cf20bca7a49eee2b4b7a32b9a Mon Sep 17 00:00:00 2001 From: Tzu-Hua Lan Date: Sat, 6 Jul 2024 23:44:56 +0800 Subject: [PATCH 5/5] chore: update yunikorn core --- go.mod | 2 +- go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 9766dddd2..2a41192c6 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,7 @@ module github.com/apache/yunikorn-k8shim go 1.21 require ( - github.com/apache/yunikorn-core v0.0.0-20240705110923-108ed0d25768 + github.com/apache/yunikorn-core v0.0.0-20240705144325-135e4fab11f1 github.com/apache/yunikorn-scheduler-interface v0.0.0-20240425182941-07f5695119a1 github.com/google/go-cmp v0.6.0 github.com/google/uuid v1.6.0 diff --git a/go.sum b/go.sum index 31aba37d8..bc421bb1e 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,8 @@ github.com/antlr/antlr4/runtime/Go/antlr/v4 v4.0.0-20230305170008-8188dc5388df h github.com/antlr/antlr4/runtime/Go/antlr/v4 v4.0.0-20230305170008-8188dc5388df/go.mod h1:pSwJ0fSY5KhvocuWSx4fz3BA8OrA1bQn+K1Eli3BRwM= github.com/apache/yunikorn-core v0.0.0-20240705110923-108ed0d25768 h1:RYjrRqr8rumlEAbYRh5Z88FsJe+8LQ4c1mzjNJoSk70= github.com/apache/yunikorn-core v0.0.0-20240705110923-108ed0d25768/go.mod h1:pSi7AFBRiGCGQ7RwQffpD4m6dvA5lc1HuCrg7LpJJqs= +github.com/apache/yunikorn-core v0.0.0-20240705144325-135e4fab11f1 h1:mviSIYQHPE4ZWcH/B5KEYNLDdvInCbR3WJSYMu8otWY= +github.com/apache/yunikorn-core v0.0.0-20240705144325-135e4fab11f1/go.mod h1:pSi7AFBRiGCGQ7RwQffpD4m6dvA5lc1HuCrg7LpJJqs= github.com/apache/yunikorn-scheduler-interface v0.0.0-20240425182941-07f5695119a1 h1:v4J9L3MlW8BQfYnbq6FV2l3uyay3SqMS2Ffpo+SFat4= github.com/apache/yunikorn-scheduler-interface v0.0.0-20240425182941-07f5695119a1/go.mod h1:WuHJpVk34t8N5+1ErYGj/5Qq33/cRzL4YtuoAsbMtWc= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=