Implemented UnschedulablePodsCount metric #1698

Open · wants to merge 3 commits into base: main
2 changes: 1 addition & 1 deletion kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
- controller-gen.kubebuilder.io/version: v0.16.2
+ controller-gen.kubebuilder.io/version: v0.16.3
name: kwoknodeclasses.karpenter.kwok.sh
spec:
group: karpenter.kwok.sh
2 changes: 1 addition & 1 deletion kwok/charts/crds/karpenter.sh_nodeclaims.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
- controller-gen.kubebuilder.io/version: v0.16.2
+ controller-gen.kubebuilder.io/version: v0.16.3
name: nodeclaims.karpenter.sh
spec:
group: karpenter.sh
2 changes: 1 addition & 1 deletion kwok/charts/crds/karpenter.sh_nodepools.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
- controller-gen.kubebuilder.io/version: v0.16.2
+ controller-gen.kubebuilder.io/version: v0.16.3
name: nodepools.karpenter.sh
spec:
group: karpenter.sh
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.sh_nodeclaims.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
- controller-gen.kubebuilder.io/version: v0.16.2
+ controller-gen.kubebuilder.io/version: v0.16.3
name: nodeclaims.karpenter.sh
spec:
group: karpenter.sh
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.sh_nodepools.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
- controller-gen.kubebuilder.io/version: v0.16.2
+ controller-gen.kubebuilder.io/version: v0.16.3
name: nodepools.karpenter.sh
spec:
group: karpenter.sh
2 changes: 1 addition & 1 deletion pkg/controllers/disruption/helpers.go
@@ -89,7 +89,7 @@ func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *
return client.ObjectKeyFromObject(p), nil
})

- results := scheduler.Solve(log.IntoContext(ctx, operatorlogging.NopLogger), pods).TruncateInstanceTypes(pscheduling.MaxInstanceTypes)
+ results := scheduler.Solve(log.IntoContext(ctx, operatorlogging.NopLogger), pods, pscheduling.MaxInstanceTypes)
for _, n := range results.ExistingNodes {
// We consider existing nodes for scheduling. When these nodes are unmanaged, their taint logic should
// tell us if we can schedule to them or not; however, if these nodes are managed, we will still schedule to them
2 changes: 1 addition & 1 deletion pkg/controllers/provisioning/provisioner.go
@@ -348,7 +348,7 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) {
}
return scheduler.Results{}, fmt.Errorf("creating scheduler, %w", err)
}
- results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
+ results := s.Solve(ctx, pods, scheduler.MaxInstanceTypes)
if len(results.NewNodeClaims) > 0 {
log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KRef(p.Namespace, p.Name).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)")
}
14 changes: 13 additions & 1 deletion pkg/controllers/provisioning/scheduling/metrics.go
@@ -24,7 +24,7 @@ import (
)

func init() {
- crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth)
+ crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth, UnschedulablePodsCount)
}

const (
@@ -58,4 +58,16 @@ var (
schedulingIDLabel,
},
)
UnschedulablePodsCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: schedulerSubsystem,
Name: "unschedulable_pods_count",
Help: "The number of unschedulable Pods.",
},
[]string{
ControllerLabel,
schedulingIDLabel,
},
)
)
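For context beyond the diff: UnschedulablePodsCount follows the same pattern as the existing QueueDepth collector, a GaugeVec keyed by controller and scheduling id that is registered against the controller-runtime metrics registry. The following is a minimal, self-contained sketch of that pattern; the "karpenter"/"scheduler" namespace and subsystem strings match the metric name asserted in the tests further down, while the "scheduling_id" label key is an assumption and not the unexported Karpenter constant.

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
    crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// unschedulablePods mirrors the shape of UnschedulablePodsCount: a gauge vector
// labeled by the controller that ran the scheduling loop and the scheduling id.
var unschedulablePods = prometheus.NewGaugeVec(
    prometheus.GaugeOpts{
        Namespace: "karpenter",
        Subsystem: "scheduler",
        Name:      "unschedulable_pods_count",
        Help:      "The number of unschedulable Pods.",
    },
    []string{"controller", "scheduling_id"},
)

func main() {
    // Registering with the controller-runtime registry exposes the gauge on the
    // same /metrics endpoint the manager already serves.
    crmetrics.Registry.MustRegister(unschedulablePods)

    // At the end of a scheduling run the scheduler records how many pods it could not place.
    labels := prometheus.Labels{"controller": "provisioner", "scheduling_id": "run-1"}
    unschedulablePods.With(labels).Set(3)

    // testutil.ToFloat64 reads a single child gauge back, which is handy in assertions.
    fmt.Println(testutil.ToFloat64(unschedulablePods.With(labels))) // 3
}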
16 changes: 11 additions & 5 deletions pkg/controllers/provisioning/scheduling/scheduler.go
@@ -174,7 +174,7 @@ func (r Results) NonPendingPodSchedulingErrors() string {

// TruncateInstanceTypes filters the result based on the maximum number of instanceTypes that needs
// to be considered. This filters all instance types generated in NewNodeClaims in the Results
- func (r Results) TruncateInstanceTypes(maxInstanceTypes int) Results {
+ func (r *Results) TruncateInstanceTypes(maxInstanceTypes int) {
var validNewNodeClaims []*NodeClaim
for _, newNodeClaim := range r.NewNodeClaims {
// The InstanceTypeOptions are truncated due to limitations in sending the number of instances to launch API.
@@ -191,10 +191,10 @@ func (r Results) TruncateInstanceTypes(maxInstanceTypes int) Results {
}
}
r.NewNodeClaims = validNewNodeClaims
- return r

}

- func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
+ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTypes int) Results {
defer metrics.Measure(SchedulingDurationSeconds.With(
prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)},
))()
@@ -204,7 +204,8 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
// had 5xA pods and 5xB pods were they have a zonal topology spread, but A can only go in one zone and B in another.
// We need to schedule them alternating, A, B, A, B, .... and this solution also solves that as well.
errors := map[*corev1.Pod]error{}
- QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) // Reset the metric for the controller, so we don't keep old ids around
+ UnschedulablePodsCount.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) // Reset the metric for the controller, so we don't keep old ids around
+ QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) // Reset the metric for the controller, so we don't keep old ids around
q := NewQueue(pods...)
for {
QueueDepth.With(
@@ -240,11 +241,16 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
delete(errors, k)
}
}
- return Results{
+ var results = Results{
NewNodeClaims: s.newNodeClaims,
ExistingNodes: s.existingNodes,
PodErrors: errors,
}
+ results.TruncateInstanceTypes(maxInstanceTypes)
+ UnschedulablePodsCount.With(
+ prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx), schedulingIDLabel: string(s.id)},
+ ).Set(float64(len(results.PodErrors)))
+ return results
}

func (s *Scheduler) add(ctx context.Context, pod *corev1.Pod) error {
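A note on the lifecycle this change adds to Solve: DeletePartialMatch clears every series previously recorded for the current controller before the loop starts, so scheduling ids from older runs do not linger, and the gauge is then set once from the final PodErrors count. Below is a self-contained sketch of that reset-then-set pattern; the label keys and values are illustrative, not the Karpenter constants.

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

var unschedulablePods = prometheus.NewGaugeVec(
    prometheus.GaugeOpts{Name: "unschedulable_pods_count"},
    []string{"controller", "scheduling_id"},
)

// recordRun mimics what Solve now does: drop stale series for this controller,
// then publish the number of pods that could not be scheduled in this run.
func recordRun(controller, schedulingID string, podErrors int) {
    // Only series whose "controller" label matches are deleted; other controllers keep theirs.
    unschedulablePods.DeletePartialMatch(prometheus.Labels{"controller": controller})
    unschedulablePods.With(prometheus.Labels{
        "controller":    controller,
        "scheduling_id": schedulingID,
    }).Set(float64(podErrors))
}

func main() {
    recordRun("provisioner", "run-1", 5)
    recordRun("provisioner", "run-2", 2)

    // Only the run-2 series survives; the run-1 series was removed by the partial match.
    fmt.Println(testutil.CollectAndCount(unschedulablePods)) // 1
}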
@@ -184,7 +184,7 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) {
podsScheduledInRound1 := 0
nodesInRound1 := 0
for i := 0; i < b.N; i++ {
- results := scheduler.Solve(ctx, pods)
+ results := scheduler.Solve(ctx, pods, scheduling.MaxInstanceTypes)
if i == 0 {

minPods := math.MaxInt64
41 changes: 39 additions & 2 deletions pkg/controllers/provisioning/scheduling/suite_test.go
@@ -118,6 +118,7 @@ var _ = AfterEach(func() {
cluster.Reset()
scheduling.QueueDepth.Reset()
scheduling.SchedulingDurationSeconds.Reset()
scheduling.UnschedulablePodsCount.Reset()
})

var _ = Context("Scheduling", func() {
@@ -3679,9 +3680,45 @@ var _ = Context("Scheduling", func() {
g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically(">", 0))
}, time.Second).Should(Succeed())
}()
- s.Solve(injection.WithControllerName(ctx, "provisioner"), pods)
+ s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
wg.Wait()
})
It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
nodePool := test.NodePool()
ExpectApplied(ctx, env.Client, nodePool)
// all of these pods have anti-affinity to each other
labels := map[string]string{
"app": "nginx",
}
pods := test.Pods(1000, test.PodOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: labels,
},
PodAntiRequirements: []corev1.PodAffinityTerm{
{
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
TopologyKey: corev1.LabelHostname,
},
},
}) // Create 1000 pods which should take long enough to schedule that we should be able to read the UnschedulablePodsCount metric with a value
s, err := prov.NewScheduler(ctx, pods, nil)
Expect(err).To(BeNil())

var wg sync.WaitGroup
wg.Add(1)
go func() {
defer GinkgoRecover()
defer wg.Done()
Eventually(func(g Gomega) {
m, ok := FindMetricWithLabelValues("karpenter_scheduler_unschedulable_pods_count", map[string]string{"controller": "provisioner"})
g.Expect(ok).To(BeTrue())
g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically(">=", 0))
}, 10*time.Second).Should(Succeed())
Review comment on lines +3715 to +3716 (Contributor): Does this just test for the existence of the metric or do we expect that it should be greater than 0?
}()
s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
wg.Wait()

})
It("should surface the schedulingDuration metric after executing a scheduling loop", func() {
nodePool := test.NodePool()
ExpectApplied(ctx, env.Client, nodePool)
@@ -3702,7 +3739,7 @@ var _ = Context("Scheduling", func() {
}) // Create 1000 pods which should take long enough to schedule that we should be able to read the queueDepth metric with a value
s, err := prov.NewScheduler(ctx, pods, nil)
Expect(err).To(BeNil())
- s.Solve(injection.WithControllerName(ctx, "provisioner"), pods)
+ s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)

m, ok := FindMetricWithLabelValues("karpenter_scheduler_scheduling_duration_seconds", map[string]string{"controller": "provisioner"})
Expect(ok).To(BeTrue())
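The assertions in these tests go through Karpenter's FindMetricWithLabelValues helper, which is not part of this diff. It can be assumed to behave roughly like the sketch below: gather all metric families from the controller-runtime registry and return the first metric whose labels contain the requested key/value pairs. The function and package names here are hypothetical.

package metricstest

import (
    dto "github.com/prometheus/client_model/go"
    crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

// findMetricWithLabelValues sketches the assumed behavior of the test helper used above.
func findMetricWithLabelValues(name string, labels map[string]string) (*dto.Metric, bool) {
    families, err := crmetrics.Registry.Gather()
    if err != nil {
        return nil, false
    }
    for _, family := range families {
        if family.GetName() != name {
            continue
        }
        for _, metric := range family.Metric {
            have := map[string]string{}
            for _, pair := range metric.Label {
                have[pair.GetName()] = pair.GetValue()
            }
            matched := true
            for key, value := range labels {
                if have[key] != value {
                    matched = false
                    break
                }
            }
            if matched {
                // For a gauge like unschedulable_pods_count, the value is read from metric.Gauge.Value.
                return metric, true
            }
        }
    }
    return nil, false
}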
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
- controller-gen.kubebuilder.io/version: v0.16.2
+ controller-gen.kubebuilder.io/version: v0.16.3
name: testnodeclasses.karpenter.test.sh
spec:
group: karpenter.test.sh