From d83cc70c8bf61722899c228aa019ccfbe27f5a0b Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Tue, 19 Nov 2024 12:49:19 +0100
Subject: [PATCH] sched: always set the leader election parameters

Because of an oversight, we used to set the leader election params
only if replicas > 1 was requested. This left the key *and default*
corner case of replicas=1 with compiled-in defaults, which are
questionable at best and most likely harmful for our use case.

Make sure to always set sane parameters, taking into account the
replica count the user requested in the spec (Replicas field).

Add more logs to make troubleshooting easier.

Signed-off-by: Francesco Romani
---
 .../numaresourcesscheduler_controller.go      | 46 +++++++++-------
 .../numaresourcesscheduler_controller_test.go | 55 ++++++++++++++++---
 pkg/numaresourcesscheduler/consts.go          | 22 ++++++++
 pkg/status/status.go                          |  4 ++
 4 files changed, 98 insertions(+), 29 deletions(-)
 create mode 100644 pkg/numaresourcesscheduler/consts.go

diff --git a/controllers/numaresourcesscheduler_controller.go b/controllers/numaresourcesscheduler_controller.go
index df53d43c7..5bc00f640 100644
--- a/controllers/numaresourcesscheduler_controller.go
+++ b/controllers/numaresourcesscheduler_controller.go
@@ -45,6 +45,7 @@ import (
 	"github.com/openshift-kni/numaresources-operator/pkg/apply"
 	"github.com/openshift-kni/numaresources-operator/pkg/hash"
 	"github.com/openshift-kni/numaresources-operator/pkg/loglevel"
+	nrosched "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler"
 	schedmanifests "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/manifests/sched"
 	schedstate "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/objectstate/sched"
 	"github.com/openshift-kni/numaresources-operator/pkg/objectnames"
@@ -52,15 +53,6 @@ import (
 	"github.com/openshift-kni/numaresources-operator/pkg/status"
 )

-const (
-	leaderElectionResourceName = "numa-scheduler-leader"
-	schedulerPriorityClassName = "system-node-critical"
-)
-
-const (
-	conditionTypeIncorrectNUMAResourcesSchedulerResourceName = "IncorrectNUMAResourcesSchedulerResourceName"
-)
-
 // NUMAResourcesSchedulerReconciler reconciles a NUMAResourcesScheduler object
 type NUMAResourcesSchedulerReconciler struct {
 	client.Client
@@ -107,7 +99,7 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct

 	if req.Name != objectnames.DefaultNUMAResourcesSchedulerCrName {
 		message := fmt.Sprintf("incorrect NUMAResourcesScheduler resource name: %s", instance.Name)
-		return ctrl.Result{}, r.updateStatus(ctx, instance, status.ConditionDegraded, conditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
+		return ctrl.Result{}, r.updateStatus(ctx, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
 	}

 	result, condition, err := r.reconcileResource(ctx, instance)
@@ -215,7 +207,7 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
 	// should set a degraded state

 	// node-critical so the pod won't be preempted by pods having the most critical priority class
-	r.SchedulerManifests.Deployment.Spec.Template.Spec.PriorityClassName = schedulerPriorityClassName
+	r.SchedulerManifests.Deployment.Spec.Template.Spec.PriorityClassName = nrosched.SchedulerPriorityClassName

 	schedupdate.DeploymentImageSettings(r.SchedulerManifests.Deployment, schedSpec.SchedulerImage)
 	cmHash := hash.ConfigMapData(r.SchedulerManifests.ConfigMap)
@@ -226,7 +218,7 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex

 	schedupdate.DeploymentEnvVarSettings(r.SchedulerManifests.Deployment, schedSpec)

-	k8swgrbacupdate.RoleForLeaderElection(r.SchedulerManifests.Role, r.Namespace, leaderElectionResourceName)
+	k8swgrbacupdate.RoleForLeaderElection(r.SchedulerManifests.Role, r.Namespace, nrosched.LeaderElectionResourceName)

 	existing := schedstate.FromClient(ctx, r.Client, r.SchedulerManifests)
 	for _, objState := range existing.State(r.SchedulerManifests) {
@@ -264,6 +256,10 @@ func unpackAPIResyncPeriod(reconcilePeriod *metav1.Duration) time.Duration {

 func configParamsFromSchedSpec(schedSpec nropv1.NUMAResourcesSchedulerSpec, cacheResyncPeriod time.Duration, namespace string) k8swgmanifests.ConfigParams {
 	resyncPeriod := int64(cacheResyncPeriod.Seconds())
+	// if multiple replicas are not required, leader election is unnecessary,
+	// so we force it off to reduce the background noise.
+	// note: the API validation/normalization layer must ensure this value is != nil
+	leaderElect := (*schedSpec.Replicas > 1)

 	params := k8swgmanifests.ConfigParams{
 		ProfileName: schedSpec.SchedulerName,
@@ -271,16 +267,18 @@ func configParamsFromSchedSpec(schedSpec nropv1.NUMAResourcesSchedulerSpec, cach
 			ResyncPeriodSeconds: &resyncPeriod,
 		},
 		ScoringStrategy: &k8swgmanifests.ScoringStrategyParams{},
-	}
-
-	if schedSpec.Replicas != nil && *schedSpec.Replicas > 1 {
-		params.LeaderElection = &k8swgmanifests.LeaderElectionParams{
-			LeaderElect:       true,
+		LeaderElection: &k8swgmanifests.LeaderElectionParams{
+			// always set the value explicitly, overriding the configmap defaults
+			LeaderElect: leaderElect,
+			// unconditionally set these to make sure
+			// we play nice with the cluster and the main scheduler
 			ResourceNamespace: namespace,
-			ResourceName:      leaderElectionResourceName,
-		}
+			ResourceName:      nrosched.LeaderElectionResourceName,
+		},
 	}

+	klog.V(2).InfoS("setting leader election parameters", dumpLeaderElectionParams(params.LeaderElection)...)
+
 	var foreignPodsDetect string
 	var resyncMethod string = k8swgmanifests.CacheResyncAutodetect
 	var informerMode string
@@ -317,7 +315,7 @@ func configParamsFromSchedSpec(schedSpec nropv1.NUMAResourcesSchedulerSpec, cach
 	params.Cache.ResyncMethod = &resyncMethod
 	params.Cache.ForeignPodsDetectMode = &foreignPodsDetect
 	params.Cache.InformerMode = &informerMode
-	klog.InfoS("setting cache parameters", dumpConfigCacheParams(params.Cache)...)
+	klog.V(2).InfoS("setting cache parameters", dumpConfigCacheParams(params.Cache)...)

 	return params
 }
@@ -331,6 +329,14 @@ func dumpConfigCacheParams(ccp *k8swgmanifests.ConfigCacheParams) []interface{}
 	}
 }

+func dumpLeaderElectionParams(lep *k8swgmanifests.LeaderElectionParams) []interface{} {
+	return []interface{}{
+		"leaderElect", lep.LeaderElect,
+		"resourceNamespace", lep.ResourceNamespace,
+		"resourceName", lep.ResourceName,
+	}
+}
+
 func strInt64Ptr(ip *int64) string {
 	if ip == nil {
 		return "N/A"
diff --git a/controllers/numaresourcesscheduler_controller_test.go b/controllers/numaresourcesscheduler_controller_test.go
index b8c700fc1..ddd35ca05 100644
--- a/controllers/numaresourcesscheduler_controller_test.go
+++ b/controllers/numaresourcesscheduler_controller_test.go
@@ -41,6 +41,7 @@ import (
 	depobjupdate "github.com/k8stopologyawareschedwg/deployer/pkg/objectupdate"
 	nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1"
 	"github.com/openshift-kni/numaresources-operator/pkg/hash"
+	nrosched "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler"
 	schedmanifests "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/manifests/sched"
 	"github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/objectstate/sched"
 	schedupdate "github.com/openshift-kni/numaresources-operator/pkg/objectupdate/sched"
@@ -85,7 +86,7 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 	ginkgo.Context("with unexpected NRS CR name", func() {
 		ginkgo.It("should updated the CR condition to degraded", func() {
 			nrs := testobjs.NewNUMAResourcesScheduler("test", "some/url:latest", testSchedulerName, 9*time.Second)
-			verifyDegradedCondition(nrs, conditionTypeIncorrectNUMAResourcesSchedulerResourceName)
+			verifyDegradedCondition(nrs, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName)
 		})
 	})

@@ -221,7 +222,7 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {

 			dp := &appsv1.Deployment{}
 			gomega.Expect(reconciler.Client.Get(context.TODO(), key, dp)).ToNot(gomega.HaveOccurred())
-			gomega.Expect(dp.Spec.Template.Spec.PriorityClassName).To(gomega.BeEquivalentTo(schedulerPriorityClassName))
+			gomega.Expect(dp.Spec.Template.Spec.PriorityClassName).To(gomega.BeEquivalentTo(nrosched.SchedulerPriorityClassName))
 		})

 		ginkgo.It("should have a config hash annotation under deployment", func() {
@@ -434,7 +435,6 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 			_, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, depmanifests.CacheInformerDedicated)
-
 		})

 		ginkgo.It("should allow to change the informerMode to Shared", func() {
@@ -454,7 +454,6 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())

 			expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, depmanifests.CacheInformerShared)
-
 		})

 		ginkgo.It("should allow to change the informerMode to Dedicated", func() {
@@ -474,7 +473,6 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())

 			expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, depmanifests.CacheInformerDedicated)
-
 		})

 		ginkgo.It("should configure by default the ScoringStrategy to be LeastAllocated", func() {
@@ -504,7 +502,6 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			resources := []depmanifests.ResourceSpecParams{{Name: "cpu", Weight: 10}, {Name: "memory", Weight: 5}}
 			expectScoringStrategyParams(reconciler.Client, depmanifests.ScoringStrategyLeastAllocated, resources)
-
 		})

 		ginkgo.It("should allow to change the ScoringStrategy to BalancedAllocation", func() {
@@ -524,7 +521,6 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			var resources []depmanifests.ResourceSpecParams
 			expectScoringStrategyParams(reconciler.Client, depmanifests.ScoringStrategyBalancedAllocation, resources)
-
 		})

 		ginkgo.It("should allow to change the ScoringStrategy to MostAllocated", func() {
@@ -544,7 +540,6 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			var resources []depmanifests.ResourceSpecParams
 			expectScoringStrategyParams(reconciler.Client, depmanifests.ScoringStrategyMostAllocated, resources)
-
 		})

 		ginkgo.It("should allow to change the ScoringStrategy to BalancedAllocation with resources", func() {
@@ -566,7 +561,6 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			resources := []depmanifests.ResourceSpecParams{{Name: "cpu", Weight: 10}, {Name: "memory", Weight: 5}}
 			expectScoringStrategyParams(reconciler.Client, depmanifests.ScoringStrategyBalancedAllocation, resources)
-
 		})

 		ginkgo.It("should allow to change the ScoringStrategy to MostAllocated with resources", func() {
@@ -588,8 +582,29 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			resources := []depmanifests.ResourceSpecParams{{Name: "cpu", Weight: 10}, {Name: "memory", Weight: 5}}
 			expectScoringStrategyParams(reconciler.Client, depmanifests.ScoringStrategyMostAllocated, resources)
+		})
+
+		ginkgo.It("should set the leader election resource parameters by default", func() {
+			key := client.ObjectKeyFromObject(nrs)
+			_, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
+			gomega.Expect(err).ToNot(gomega.HaveOccurred())
+			expectLeaderElectParams(reconciler.Client, false, testNamespace, nrosched.LeaderElectionResourceName)
 		})
+
+		ginkgo.DescribeTable("should set the leader election resource parameters depending on replica count", func(replicas int32, expectedEnabled bool) {
+			nrs := nrs.DeepCopy()
+			nrs.Spec.Replicas = &replicas
+			gomega.Eventually(func() error { return reconciler.Client.Update(context.TODO(), nrs) }).WithPolling(30 * time.Second).WithTimeout(5 * time.Minute).Should(gomega.Succeed())
+
+			key := client.ObjectKeyFromObject(nrs)
+			_, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
+			gomega.Expect(err).ToNot(gomega.HaveOccurred())
+			expectLeaderElectParams(reconciler.Client, expectedEnabled, testNamespace, nrosched.LeaderElectionResourceName)
+		},
+			ginkgo.Entry("replicas=0", int32(0), false),
+			ginkgo.Entry("replicas=1", int32(1), false),
+			ginkgo.Entry("replicas=3", int32(3), true),
+		)
 	})
 })
@@ -658,3 +673,25 @@ func expectScoringStrategyParams(cli client.Client, ScoringStrategyType string,
 	gomega.Expect(cfg.ScoringStrategy.Type).To(gomega.Equal(ScoringStrategyType))
 	gomega.Expect(cfg.ScoringStrategy.Resources).To(gomega.Equal(resources))
 }
+
+func expectLeaderElectParams(cli client.Client, enabled bool, resourceNamespace, resourceName string) {
+	ginkgo.GinkgoHelper()
+
+	key := client.ObjectKey{
+		Name:      "topo-aware-scheduler-config",
+		Namespace: testNamespace,
+	}
+
+	cm := corev1.ConfigMap{}
+	gomega.Expect(cli.Get(context.TODO(), key, &cm)).To(gomega.Succeed())
+
+	confRaw := cm.Data[sched.SchedulerConfigFileName]
+	cfgs, err := depmanifests.DecodeSchedulerProfilesFromData([]byte(confRaw))
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+	gomega.Expect(cfgs).To(gomega.HaveLen(1), "unexpected config params count: %d", len(cfgs))
+	cfg := cfgs[0]
+
+	gomega.Expect(cfg.LeaderElection.LeaderElect).To(gomega.Equal(enabled))
+	gomega.Expect(cfg.LeaderElection.ResourceNamespace).To(gomega.Equal(resourceNamespace))
+	gomega.Expect(cfg.LeaderElection.ResourceName).To(gomega.Equal(resourceName))
+}
diff --git a/pkg/numaresourcesscheduler/consts.go b/pkg/numaresourcesscheduler/consts.go
new file mode 100644
index 000000000..114b35066
--- /dev/null
+++ b/pkg/numaresourcesscheduler/consts.go
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2021 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package numaresourcesscheduler
+
+const (
+	LeaderElectionResourceName = "numa-scheduler-leader"
+	SchedulerPriorityClassName = "system-node-critical"
+)
diff --git a/pkg/status/status.go b/pkg/status/status.go
index 37637d268..89d9c0fc3 100644
--- a/pkg/status/status.go
+++ b/pkg/status/status.go
@@ -46,6 +46,10 @@ const (
 	ConditionTypeIncorrectNUMAResourcesOperatorResourceName = "IncorrectNUMAResourcesOperatorResourceName"
 )

+const (
+	ConditionTypeIncorrectNUMAResourcesSchedulerResourceName = "IncorrectNUMAResourcesSchedulerResourceName"
+)
+
 func IsUpdatedNUMAResourcesOperator(oldStatus, newStatus *nropv1.NUMAResourcesOperatorStatus) bool {
 	options := []cmp.Option{
 		cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"),
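
To make the new behavior easy to eyeball, here is a minimal, self-contained Go
sketch of the rule the patch encodes: the leader election lock parameters are
now always populated, and the election itself is enabled only when more than
one replica is requested. The leaderElectionParams type and leaderElectionFor
helper below are hypothetical stand-ins for the deployer's
k8swgmanifests.LeaderElectionParams and the operator's
configParamsFromSchedSpec, not the actual APIs.

package main

import "fmt"

// leaderElectionParams is a local stand-in for the deployer's
// k8swgmanifests.LeaderElectionParams struct used in the patch.
type leaderElectionParams struct {
	LeaderElect       bool
	ResourceNamespace string
	ResourceName      string
}

// leaderElectionFor mirrors the patched behavior: the lock namespace and name
// are set unconditionally, and LeaderElect is true only for replicas > 1.
// The nil check stands in for the API normalization layer the patch relies on.
func leaderElectionFor(replicas *int32, namespace string) leaderElectionParams {
	return leaderElectionParams{
		LeaderElect:       replicas != nil && *replicas > 1,
		ResourceNamespace: namespace,
		ResourceName:      "numa-scheduler-leader",
	}
}

func main() {
	// replicas=0 and replicas=1 (the default) disable the election,
	// matching the new DescribeTable entries; replicas=3 enables it.
	// "numaresources" is an illustrative namespace, not the operator's.
	for _, r := range []int32{0, 1, 3} {
		r := r
		fmt.Printf("replicas=%d -> %+v\n", r, leaderElectionFor(&r, "numaresources"))
	}
}

Under the patch, replicas=1 thus renders leaderElect: false in the generated
ConfigMap, with the lock namespace and name still set explicitly instead of
being inherited from compiled-in defaults.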