From 5c65eb8a43d49b0bf450c6f631bdbe954140db8b Mon Sep 17 00:00:00 2001 From: Craig Condit Date: Fri, 6 Sep 2024 12:50:31 -0500 Subject: [PATCH] [YUNIKORN-2857] Fix flaky gang scheduling e2e test --- .../helpers/yunikorn/rest_api_utils.go | 2 +- .../gang_scheduling/gang_scheduling_test.go | 74 ++++++++++++++----- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/test/e2e/framework/helpers/yunikorn/rest_api_utils.go b/test/e2e/framework/helpers/yunikorn/rest_api_utils.go index 82afbc2b4..91cede918 100644 --- a/test/e2e/framework/helpers/yunikorn/rest_api_utils.go +++ b/test/e2e/framework/helpers/yunikorn/rest_api_utils.go @@ -448,7 +448,7 @@ func compareQueueTS(queuePathStr string, ts string) wait.ConditionFunc { restClient := RClient{} qInfo, err := restClient.GetQueue(DefaultPartition, queuePathStr, false) if err != nil { - return false, err + return false, nil } return qInfo.Properties["timestamp"] == ts, nil diff --git a/test/e2e/gang_scheduling/gang_scheduling_test.go b/test/e2e/gang_scheduling/gang_scheduling_test.go index 18d5a15bb..bcca2b02a 100644 --- a/test/e2e/gang_scheduling/gang_scheduling_test.go +++ b/test/e2e/gang_scheduling/gang_scheduling_test.go @@ -19,6 +19,8 @@ package gangscheduling_test import ( + "context" + "errors" "fmt" "strings" "time" @@ -29,6 +31,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" "github.com/apache/yunikorn-core/pkg/webservice/dao" "github.com/apache/yunikorn-k8shim/pkg/cache" @@ -96,9 +99,8 @@ var _ = Describe("", func() { checkAppStatus(appID, yunikorn.States().Application.Running) // Ensure placeholders are created - appDaoInfo, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) - Ω(appDaoInfoErr).NotTo(HaveOccurred()) - checkPlaceholderData(appDaoInfo, groupA, 5, 0, 0) + phErr := waitForPlaceholderData(nsQueue, appID, groupA, 5, 0, 0, 30) + Ω(phErr).NotTo(HaveOccurred()) // Deploy job, now with 5 pods part of taskGroup By("Deploy second job with 5 real taskGroup pods") @@ -118,9 +120,8 @@ var _ = Describe("", func() { checkAppStatus(appID, yunikorn.States().Application.Running) // Ensure placeholders are replaced - appDaoInfo, appDaoInfoErr = restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) - Ω(appDaoInfoErr).NotTo(HaveOccurred()) - checkPlaceholderData(appDaoInfo, groupA, 5, 5, 0) + phErr = waitForPlaceholderData(nsQueue, appID, groupA, 5, 5, 0, 30) + Ω(phErr).NotTo(HaveOccurred()) }) // Test to verify multiple task group nodes @@ -218,9 +219,10 @@ var _ = Describe("", func() { Ω(phTermErr).NotTo(HaveOccurred()) // Ensure placeholders are replaced and allocations count is correct + phErr := waitForPlaceholderData(nsQueue, appID, groupA, 3, 3, 0, 30) + Ω(phErr).NotTo(HaveOccurred()) appDaoInfo, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) Ω(appDaoInfoErr).NotTo(HaveOccurred()) - checkPlaceholderData(appDaoInfo, groupA, 3, 3, 0) Ω(len(appDaoInfo.Allocations)).To(Equal(int(6)), "Allocations count is not correct") }) @@ -250,11 +252,13 @@ var _ = Describe("", func() { checkAppStatus(appID, yunikorn.States().Application.Running) // Ensure placeholders are timed out and allocations count is correct as app started running normal because of 'soft' gang style + phErr := waitForPlaceholderData(nsQueue, appID, groupA, 3, 0, 3, 30) + Ω(phErr).NotTo(HaveOccurred()) + phErr = waitForPlaceholderData(nsQueue, appID, groupB, 1, 0, 1, 30) + Ω(phErr).NotTo(HaveOccurred()) appDaoInfo, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) Ω(appDaoInfoErr).NotTo(HaveOccurred()) Ω(len(appDaoInfo.PlaceholderData)).To(Equal(2), "Placeholder count is not correct") - checkPlaceholderData(appDaoInfo, groupA, 3, 0, 3) - checkPlaceholderData(appDaoInfo, groupB, 1, 0, 1) Ω(len(appDaoInfo.Allocations)).To(Equal(int(3)), "Allocations count is not correct") for _, alloc := range appDaoInfo.Allocations { Ω(alloc.Placeholder).To(Equal(false), "Allocation should be non placeholder") @@ -289,11 +293,13 @@ var _ = Describe("", func() { checkCompletedAppStatus(appID, yunikorn.States().Application.Failed) // Ensure placeholders are timed out and allocations count is correct as app started running normal because of 'soft' gang style + phErr := waitForPlaceholderData(nsQueue, appID, groupA, 3, 0, 3, 30) + Ω(phErr).NotTo(HaveOccurred()) + phErr = waitForPlaceholderData(nsQueue, appID, groupB, 3, 0, 3, 30) + Ω(phErr).NotTo(HaveOccurred()) appDaoInfo, appDaoInfoErr := restClient.GetCompletedAppInfo(configmanager.DefaultPartition, appID) Ω(appDaoInfoErr).NotTo(HaveOccurred()) Ω(len(appDaoInfo.PlaceholderData)).To(Equal(2), "Placeholder count is not correct") - checkPlaceholderData(appDaoInfo, groupA, 3, 0, 3) - checkPlaceholderData(appDaoInfo, groupB, 3, 0, 3) }) // Test to verify Gang Apps FIFO order @@ -563,10 +569,11 @@ var _ = Describe("", func() { checkAppStatus(appID, yunikorn.States().Application.Running) // Ensure placeholders are replaced and allocations count is correct + phErr := waitForPlaceholderData(nsQueue, appID, groupA, 3, 3, 0, 30) + Ω(phErr).NotTo(HaveOccurred()) appDaoInfo, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) Ω(appDaoInfoErr).NotTo(HaveOccurred()) Ω(len(appDaoInfo.PlaceholderData)).To(Equal(1), "Placeholder count is not correct") - checkPlaceholderData(appDaoInfo, groupA, 3, 3, 0) Ω(len(appDaoInfo.Allocations)).To(Equal(int(3)), "Allocations count is not correct") Ω(appDaoInfo.UsedResource[hugepageKey]).To(Equal(int64(314572800)), "Used huge page resource is not correct") }) @@ -725,18 +732,45 @@ func checkCompletedAppStatus(applicationID, state string) { Ω(timeoutErr).NotTo(HaveOccurred()) } -func checkPlaceholderData(appDaoInfo *dao.ApplicationDAOInfo, tgName string, count, replaced, timeout int) { - verified := false +func waitForPlaceholderData(nsQueue string, appID string, tgName string, count, replaced, timedOut, timeout int) error { + lastOk := false + var lastCount, lastReplaced, lastTimedOut int + + err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, time.Duration(timeout)*time.Second, false, func(c context.Context) (bool, error) { + appDaoInfo, err := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID) + if err != nil { + return false, err + } + lastOk, lastCount, lastReplaced, lastTimedOut = getPlaceholderData(appDaoInfo, tgName) + return lastOk && lastCount == count && lastReplaced == replaced && lastTimedOut == timedOut, nil + }) + if err != nil { + errs := make([]error, 0) + errs = append(errs, err) + if !lastOk { + errs = append(errs, fmt.Errorf("can't find task group %s in app info", tgName)) + } + if lastCount != count { + errs = append(errs, fmt.Errorf("placeholder count is incorrect (expected %d, got %d)", count, lastCount)) + } + if lastReplaced != replaced { + errs = append(errs, fmt.Errorf("placeholder replaced is incorrect (expected %d, got %d)", replaced, lastReplaced)) + } + if lastTimedOut != timedOut { + errs = append(errs, fmt.Errorf("placeholder timedout is incorrect (expected %d, got %d)", timedOut, lastTimedOut)) + } + return errors.Join(errs...) + } + return nil +} + +func getPlaceholderData(appDaoInfo *dao.ApplicationDAOInfo, tgName string) (bool, int, int, int) { for _, placeholderData := range appDaoInfo.PlaceholderData { if tgName == placeholderData.TaskGroupName { - Ω(int(placeholderData.Count)).To(Equal(count), "Placeholder count is not correct") - Ω(int(placeholderData.Replaced)).To(Equal(replaced), "Placeholder replaced is not correct") - Ω(int(placeholderData.TimedOut)).To(Equal(timeout), "Placeholder timeout is not correct") - verified = true - break + return true, int(placeholderData.Count), int(placeholderData.Replaced), int(placeholderData.TimedOut) } } - Ω(verified).To(Equal(true), fmt.Sprintf("Can't find task group %s in app info", tgName)) + return false, 0, 0, 0 } func verifyOriginatorDeletionCase(withOwnerRef bool) {