Skip to content

Commit

Permalink
Merge pull request #714 from shajmakh/more-ci-improvements
Browse files Browse the repository at this point in the history
serial: wait for NRT to settle part-2
  • Loading branch information
openshift-merge-bot[bot] authored Nov 27, 2023
2 parents 431b08f + 2a3ae5a commit 376317f
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 47 deletions.
3 changes: 3 additions & 0 deletions test/e2e/serial/tests/non_regression.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,9 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
failedPodIds := e2efixture.WaitForPaddingPodsRunning(fxt, paddingPods)
Expect(failedPodIds).To(BeEmpty(), "some padding pods have failed to run")

By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

//prepare the test pod
//get maximum allocatable resources across all numas of the target node
var targetNrtListInitial nrtv1alpha2.NodeResourceTopologyList
Expand Down
5 changes: 4 additions & 1 deletion test/e2e/serial/tests/resource_accounting.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,15 @@ var _ = Describe("[serial][disruptive][scheduler][resacct] numaresources workloa
failedPodIds := e2efixture.WaitForPaddingPodsRunning(fxt, paddingPods)
Expect(failedPodIds).To(BeEmpty(), "some padding pods have failed to run")

By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

var targetNrtBefore *nrtv1alpha2.NodeResourceTopology
var targetNrtListBefore nrtv1alpha2.NodeResourceTopologyList
for idx, zone := range nrtInfo.Zones {
if idx == len(nrtInfo.Zones)-1 {
// store the NRT of the target node before scheduling the last placeholder pod,
// later we'll compare this when we delete of of those pods
// later we'll compare this when we delete of those pods
targetNrtListBefore, err = e2enrt.GetUpdated(fxt.Client, nrtList, 1*time.Minute)
Expect(err).ToNot(HaveOccurred())
targetNrtBefore, err = e2enrt.FindFromList(targetNrtListBefore.Items, targetNodeName)
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/serial/tests/scheduler_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ var _ = Describe("[serial][scheduler][cache][tier1] scheduler cache", Label("sch
// note a compute node can handle exactly 2 pods because how we constructed the requirements.
// scheduling 2 pods right off the bat on the same compute node is actually correct (it will work)
// but it's not the behavior we expect. A conforming scheduler is expected to send first two pods,
// wait for reconciliation, the send the missing two.
// wait for reconciliation, then send the missing two.

klog.Infof("Creating %d pods each requiring %q", desiredPods, e2ereslist.ToString(podRequiredRes))
for _, testPod := range testPods {
Expand Down
83 changes: 44 additions & 39 deletions test/e2e/serial/tests/workload_placement.go
Original file line number Diff line number Diff line change
Expand Up @@ -869,25 +869,27 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
Expect(pod.Spec.NodeName).To(Equal(targetNodeName), "pod landed on %q instead of on %v", pod.Spec.NodeName, targetNodeName)
}

By("Waiting for the NRT data to stabilize")
e2efixture.MustSettleNRT(fxt)

By(fmt.Sprintf("verifying resources allocation correctness for NRT target: %q", targetNodeName))
var nrtAfterRSCreation nrtv1alpha2.NodeResourceTopologyList
Eventually(func() bool {
nrtAfterRSCreation, err := e2enrt.GetUpdated(fxt.Client, nrtInitial, timeout)
Expect(err).ToNot(HaveOccurred())
nrtAfterRSCreation, err = e2enrt.GetUpdated(fxt.Client, nrtInitial, timeout)
Expect(err).ToNot(HaveOccurred())

nrtInitialTarget, err := e2enrt.FindFromList(nrtInitial.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
Expect(nrtInitialTarget.Name).To(Equal(targetNodeName), "expected targetNrt to be equal to %q", targetNodeName)
nrtInitialTarget, err := e2enrt.FindFromList(nrtInitial.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
Expect(nrtInitialTarget.Name).To(Equal(targetNodeName), "expected targetNrt to be equal to %q", targetNodeName)

updatedTargetNrt, err := e2enrt.FindFromList(nrtAfterRSCreation.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
Expect(updatedTargetNrt.Name).To(Equal(targetNodeName), "expected targetNrt to be equal to %q", targetNodeName)
updatedTargetNrt, err := e2enrt.FindFromList(nrtAfterRSCreation.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
Expect(updatedTargetNrt.Name).To(Equal(targetNodeName), "expected targetNrt to be equal to %q", targetNodeName)

rl := e2ereslist.FromReplicaSet(*rs)
rl := e2ereslist.FromReplicaSet(*rs)

_, match := checkNRTConsumedResources(fxt, *nrtInitialTarget, rl, &pods[0])
return match != ""
}).WithTimeout(timeout).WithPolling(10 * time.Second).Should(BeTrue())
match, err := e2enrt.CheckNodeConsumedResourcesAtLeast(*nrtInitialTarget, *updatedTargetNrt, rl, corev1qos.GetPodQOS(&pods[0]))
Expect(err).ToNot(HaveOccurred())
Expect(match).ToNot(BeEmpty(), "inconsistent accounting when checking NRTs consumed resources")

By(fmt.Sprintf("deleting replicaset %s/%s", fxt.Namespace.Name, rsName))
err = fxt.Client.Delete(context.TODO(), rs)
Expand All @@ -900,22 +902,23 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
Expect(err).ToNot(HaveOccurred(), "pod %s/%s still exists", pod.Namespace, pod.Name)
}

Eventually(func() bool {
By(fmt.Sprintf("checking the resources are restored as expected on %q", targetNodeName))
By("Waiting for the NRT data to stabilize")
e2efixture.MustSettleNRT(fxt)

nrtListPostDelete, err := e2enrt.GetUpdated(fxt.Client, nrtAfterRSCreation, 1*time.Minute)
Expect(err).ToNot(HaveOccurred())
By(fmt.Sprintf("checking the resources are restored as expected on %q", targetNodeName))

nrtPostDelete, err := e2enrt.FindFromList(nrtListPostDelete.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
nrtListPostDelete, err := e2enrt.GetUpdated(fxt.Client, nrtAfterRSCreation, 1*time.Minute)
Expect(err).ToNot(HaveOccurred())

nrtInitial, err := e2enrt.FindFromList(nrtInitial.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
nrtPostDelete, err := e2enrt.FindFromList(nrtListPostDelete.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())

ok, err := e2enrt.CheckEqualAvailableResources(*nrtInitial, *nrtPostDelete)
Expect(err).ToNot(HaveOccurred())
return ok
}, time.Minute, time.Second*5).Should(BeTrue(), "resources not restored on %q", targetNodeName)
nrtInitialTarget, err = e2enrt.FindFromList(nrtInitial.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())

ok, err = e2enrt.CheckEqualAvailableResources(*nrtInitialTarget, *nrtPostDelete)
Expect(err).ToNot(HaveOccurred())
Expect(ok).To(BeTrue(), "resources not restored on %q", targetNodeName)

rs = objects.NewTestReplicaSetWithPodSpec(replicaNumber, podLabels, map[string]string{}, fxt.Namespace.Name, rsName, corev1.PodSpec{
SchedulerName: serialconfig.Config.SchedulerName,
Expand Down Expand Up @@ -963,30 +966,32 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
Expect(schedOK).To(BeTrue(), "pod %s/%s not scheduled with expected scheduler %s", pod.Namespace, pod.Name, serialconfig.Config.SchedulerName)
}

By("Waiting for the NRT data to stabilize")
e2efixture.MustSettleNRT(fxt)

By(fmt.Sprintf("verifying resources allocation correctness for NRT target: %q", targetNodeName))
Eventually(func() bool {
nrtAfterDPCreation, err := e2enrt.GetUpdated(fxt.Client, nrtInitial, timeout)
Expect(err).ToNot(HaveOccurred())
nrtAfterDPCreation, err := e2enrt.GetUpdated(fxt.Client, nrtInitial, timeout)
Expect(err).ToNot(HaveOccurred())

nrtInitialTarget, err := e2enrt.FindFromList(nrtInitial.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
Expect(nrtInitialTarget.Name).To(Equal(targetNodeName), "expected targetNrt to be equal to %q", targetNodeName)
nrtInitialTarget, err = e2enrt.FindFromList(nrtInitial.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
Expect(nrtInitialTarget.Name).To(Equal(targetNodeName), "expected targetNrt to be equal to %q", targetNodeName)

updatedTargetNrt, err := e2enrt.FindFromList(nrtAfterDPCreation.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
Expect(updatedTargetNrt.Name).To(Equal(targetNodeName), "expected targetNrt to be equal to %q", targetNodeName)
updatedTargetNrt, err = e2enrt.FindFromList(nrtAfterDPCreation.Items, targetNodeName)
Expect(err).ToNot(HaveOccurred())
Expect(updatedTargetNrt.Name).To(Equal(targetNodeName), "expected targetNrt to be equal to %q", targetNodeName)

rl := e2ereslist.FromReplicaSet(*rs)
rl = e2ereslist.FromReplicaSet(*rs)

_, match := checkNRTConsumedResources(fxt, *nrtInitialTarget, rl, &pods[0])
return match != ""
}).WithTimeout(timeout).WithPolling(10 * time.Second).Should(BeTrue())
match, err = e2enrt.CheckNodeConsumedResourcesAtLeast(*nrtInitialTarget, *updatedTargetNrt, rl, corev1qos.GetPodQOS(&pods[0]))
Expect(err).ToNot(HaveOccurred())
Expect(match).ToNot(BeEmpty(), "inconsistent accounting when checking NRTs consumed resources")

By(fmt.Sprintf("comparing scheduling times between %q and %q", corev1.DefaultSchedulerName, serialconfig.Config.SchedulerName))
diff := int64(math.Abs(float64(schedTimeWithTopologyScheduler.Milliseconds() - schedTimeWithDefaultScheduler.Milliseconds())))
// 2000 milliseconds diff seems reasonable, but can evaluate later if needed.
d := time.Millisecond * 2000
Expect(diff).To(BeNumerically("<", d.Milliseconds()), "expected the difference between scheduling times to be %d at max; actual diff: %d milliseconds", d, diff)
Expect(diff).To(BeNumerically("<", d.Milliseconds()), "expected the difference between scheduling times to be %d at max; actual diff: %d milliseconds", d.Milliseconds(), diff)

By(fmt.Sprintf("deleting deployment %s/%s", fxt.Namespace.Name, rsName))
err = fxt.Client.Delete(context.TODO(), rs)
Expand Down
3 changes: 3 additions & 0 deletions test/e2e/serial/tests/workload_placement_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ func setupNodes(fxt *e2efixture.Fixture, nodesState desiredNodesState) ([]nrtv1a
failedPodIds := e2efixture.WaitForPaddingPodsRunning(fxt, paddingPods)
ExpectWithOffset(1, failedPodIds).To(BeEmpty(), "some padding pods have failed to run")

By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

return nrtCandidates, targetNodeName
}

Expand Down
10 changes: 4 additions & 6 deletions test/e2e/serial/tests/workload_placement_tmpol.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,9 +437,8 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
failedPodIds := e2efixture.WaitForPaddingPodsRunning(fxt, paddingPods)
Expect(failedPodIds).To(BeEmpty(), "some padding pods have failed to run")

// TODO: smarter cooldown
klog.Infof("cooling down")
time.Sleep(18 * time.Second)
By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

for _, unsuitableNodeName := range unsuitableNodeNames {
dumpNRTForNode(fxt.Client, unsuitableNodeName, "unsuitable")
Expand Down Expand Up @@ -1349,9 +1348,8 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
failedPodIds := e2efixture.WaitForPaddingPodsRunning(fxt, paddingPods)
Expect(failedPodIds).To(BeEmpty(), "some padding pods have failed to run")

// TODO: smarter cooldown
klog.Infof("cooling down")
time.Sleep(18 * time.Second)
By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

for _, unsuitableNodeName := range unsuitableNodeNames {
dumpNRTForNode(fxt.Client, unsuitableNodeName, "unsuitable")
Expand Down
13 changes: 13 additions & 0 deletions test/e2e/serial/tests/workload_unschedulable.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload unsched
failedPodIds := e2efixture.WaitForPaddingPodsRunning(fxt, paddingPods)
Expect(failedPodIds).To(BeEmpty(), "some padding pods have failed to run")

By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

_, err := e2enrt.GetUpdated(fxt.Client, nrtList, time.Minute)
Expect(err).ToNot(HaveOccurred())
})
Expand Down Expand Up @@ -375,6 +378,9 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload unsched
failedPodIds := e2efixture.WaitForPaddingPodsRunning(fxt, paddingPods)
Expect(failedPodIds).To(BeEmpty(), "some padding pods have failed to run")

By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

targetNrtListBefore, err := e2enrt.GetUpdated(fxt.Client, nrtList, 1*time.Minute)
Expect(err).ToNot(HaveOccurred())
targetNrtBefore, err := e2enrt.FindFromList(targetNrtListBefore.Items, targetNodeName)
Expand Down Expand Up @@ -412,6 +418,7 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload unsched
podRunningTimeout := 3 * time.Minute
for _, pod := range pods {
if pod.Spec.NodeName == targetNodeName {
By(fmt.Sprintf("verify events for pod %s/%s node: %q", pod.Namespace, pod.Name, pod.Spec.NodeName))
scheduledWithTAS, err := nrosched.CheckPODWasScheduledWith(fxt.K8sClient, pod.Namespace, pod.Name, serialconfig.Config.SchedulerName)
if err != nil {
_ = objects.LogEventsForPod(fxt.K8sClient, pod.Namespace, pod.Name)
Expand Down Expand Up @@ -554,6 +561,9 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload unsched
failedPodIds := e2efixture.WaitForPaddingPodsRunning(fxt, paddingPods)
Expect(failedPodIds).To(BeEmpty(), "some padding pods have failed to run")

By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

targetNrtListBefore, err := e2enrt.GetUpdated(fxt.Client, nrtList, 1*time.Minute)
Expect(err).ToNot(HaveOccurred())
targetNrtBefore, err := e2enrt.FindFromList(targetNrtListBefore.Items, targetNodeName)
Expand Down Expand Up @@ -634,6 +644,9 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload unsched
})
Expect(err).ToNot(HaveOccurred())

By("waiting for the NRT data to settle")
e2efixture.MustSettleNRT(fxt)

nrtInitialList, err := e2enrt.GetUpdated(fxt.Client, nrtv1alpha2.NodeResourceTopologyList{}, time.Second*10)
Expect(err).ToNot(HaveOccurred())

Expand Down

0 comments on commit 376317f

Please sign in to comment.