Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[YUNIKORN-2667] E2E test for Gang app originator pod changes after restart #865

Closed
wants to merge 8 commits into from
20 changes: 20 additions & 0 deletions test/e2e/framework/helpers/yunikorn/rest_api_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,26 @@ func (c *RClient) WaitForCompletedAppStateTransition(partition string, appID str
return wait.PollUntilContextTimeout(context.TODO(), time.Second, time.Duration(timeout)*time.Second, false, c.isAppInDesiredCompletedState(partition, appID, state).WithContext())
}

func (c *RClient) WaitForAllExecPodsAllocated(partition string, queueName string, appID string, execPodCount int, timeout int) error {
return wait.PollUntilContextTimeout(context.TODO(), time.Second, time.Duration(timeout)*time.Second, false, c.areAllExecPodsAllocated(partition, queueName, appID, execPodCount).WithContext())
}

func (c *RClient) areAllExecPodsAllocated(partition string, queueName string, appID string, execPodCount int) wait.ConditionFunc {
return func() (bool, error) {
appInfo, err := c.GetAppInfo(partition, queueName, appID)
if err != nil {
return false, nil // returning nil here for wait & loop
}
if appInfo.Allocations == nil {
return false, nil
}
if len(appInfo.Allocations) >= execPodCount {
return true, nil
}
return false, nil
}
}

func (c *RClient) AreAllExecPodsAllotted(partition string, queueName string, appID string, execPodCount int) wait.ConditionFunc {
return func() (bool, error) {
appInfo, err := c.GetAppInfo(partition, queueName, appID)
Expand Down
76 changes: 76 additions & 0 deletions test/e2e/gang_scheduling/gang_scheduling_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,82 @@ var _ = Describe("", func() {
Ω(appDaoInfo.UsedResource[hugepageKey]).To(Equal(int64(314572800)), "Used huge page resource is not correct")
})

// Test to verify that the gang app originator pod does not change after a restart
// 1. Create an originator pod
// 2. Verify the originator pod is not a placeholder pod
// 3. Restart YuniKorn
// 4. Verify the originator pod is not changed after restart
It("Verify_Gang_App_Originator_Pod_Does_Not_Change_After_Restart", func() {
placeholderCount := 5

By("Create an originator pod")
podConf := k8s.TestPodConfig{
Name: "gang-driver-pod-" + common.RandSeq(5),
Labels: map[string]string{
"app": "sleep-" + common.RandSeq(5),
"applicationId": appID,
},
Annotations: &k8s.PodAnnotation{
TaskGroups: []cache.TaskGroup{
{Name: groupA, MinMember: int32(placeholderCount), MinResource: minResource},
},
},
Resources: &v1.ResourceRequirements{
Requests: v1.ResourceList{"cpu": minResource["cpu"], "memory": minResource["memory"]},
},
}
podTest, err := k8s.InitTestPod(podConf)
Ω(err).NotTo(HaveOccurred())
originator, err := kClient.CreatePod(podTest, ns)
Ω(err).NotTo(HaveOccurred())

// Wait for the app to be created
checkAppStatus(appID, yunikorn.States().Application.Running)

By("Ensure all pods are allocated")
err = restClient.WaitForAllExecPodsAllocated(configmanager.DefaultPartition, nsQueue, appID, 1+placeholderCount, 30)
Ω(err).NotTo(HaveOccurred())

By("Verify the originator pod is not a placeholder pod")
appDaoInfo, appDaoInfoErr := restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID)
Ω(appDaoInfoErr).NotTo(HaveOccurred())
for _, alloc := range appDaoInfo.Allocations {
podName := alloc.AllocationTags["kubernetes.io/meta/podName"]
if podName == originator.Name {
Ω(alloc.Originator).To(Equal(true), "Originator pod should be a originator pod")
Ω(alloc.Placeholder).To(Equal(false), "Originator pod should not be a placeholder pod")
} else {
Ω(alloc.Originator).To(Equal(false), "Placeholder pod should not be a originator pod")
Ω(alloc.Placeholder).To(Equal(true), "Placeholder pod should be a placeholder pod")
}
}

By("Restart the scheduler pod")
yunikorn.RestartYunikorn(&kClient)
yunikorn.RestorePortForwarding(&kClient)

// Wait for the app to be created
checkAppStatus(appID, yunikorn.States().Application.Running)

By("Ensure all pods are allocated")
err = restClient.WaitForAllExecPodsAllocated(configmanager.DefaultPartition, nsQueue, appID, 1+placeholderCount, 30)
Ω(err).NotTo(HaveOccurred())

By("Verify the originator pod is not changed after restart")
appDaoInfo, appDaoInfoErr = restClient.GetAppInfo(configmanager.DefaultPartition, nsQueue, appID)
Ω(appDaoInfoErr).NotTo(HaveOccurred())
for _, alloc := range appDaoInfo.Allocations {
podName := alloc.AllocationTags["kubernetes.io/meta/podName"]
if podName == originator.Name {
Ω(alloc.Originator).To(Equal(true), "Originator pod should be a originator pod")
Ω(alloc.Placeholder).To(Equal(false), "Originator pod should not be a placeholder pod")
} else {
Ω(alloc.Originator).To(Equal(false), "Placeholder pod should not be a originator pod")
Ω(alloc.Placeholder).To(Equal(true), "Placeholder pod should be a placeholder pod")
}
}
})

AfterEach(func() {
tests.DumpClusterInfoIfSpecFailed(suiteName, []string{ns})

Expand Down