From f329cf41419cc5da6559788aea2816cfcef5f905 Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Thu, 19 Sep 2024 09:37:28 -0700
Subject: [PATCH 01/11] Moved TruncateInstanceTypes to Solve

---
 pkg/controllers/disruption/helpers.go                  |  2 +-
 pkg/controllers/provisioning/provisioner.go            |  2 +-
 pkg/controllers/provisioning/scheduling/scheduler.go   | 10 ++++++----
 .../scheduling/scheduling_benchmark_test.go            |  2 +-
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/pkg/controllers/disruption/helpers.go b/pkg/controllers/disruption/helpers.go
index 8d7b37bba3..a382f3c6c0 100644
--- a/pkg/controllers/disruption/helpers.go
+++ b/pkg/controllers/disruption/helpers.go
@@ -87,7 +87,7 @@ func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *
 		return client.ObjectKeyFromObject(p), nil
 	})
 
-	results := scheduler.Solve(log.IntoContext(ctx, operatorlogging.NopLogger), pods).TruncateInstanceTypes(pscheduling.MaxInstanceTypes)
+	results := scheduler.Solve(log.IntoContext(ctx, operatorlogging.NopLogger), pods, pscheduling.MaxInstanceTypes)
 	for _, n := range results.ExistingNodes {
 		// We consider existing nodes for scheduling. When these nodes are unmanaged, their taint logic should
 		// tell us if we can schedule to them or not; however, if these nodes are managed, we will still schedule to them
diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go
index a51b04a4fd..9cb9cbc512 100644
--- a/pkg/controllers/provisioning/provisioner.go
+++ b/pkg/controllers/provisioning/provisioner.go
@@ -344,7 +344,7 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) {
 		}
 		return scheduler.Results{}, fmt.Errorf("creating scheduler, %w", err)
 	}
-	results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
+	results := s.Solve(ctx, pods, scheduler.MaxInstanceTypes)
 	if len(results.NewNodeClaims) > 0 {
 		log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KRef(p.Namespace, p.Name).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)")
 	}
diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go
index 5f2f65c4a0..46af767b6c 100644
--- a/pkg/controllers/provisioning/scheduling/scheduler.go
+++ b/pkg/controllers/provisioning/scheduling/scheduler.go
@@ -173,7 +173,7 @@ func (r Results) NonPendingPodSchedulingErrors() string {
 
 // TruncateInstanceTypes filters the result based on the maximum number of instanceTypes that needs
 // to be considered. This filters all instance types generated in NewNodeClaims in the Results
-func (r Results) TruncateInstanceTypes(maxInstanceTypes int) Results {
+func (r *Results) TruncateInstanceTypes(maxInstanceTypes int) {
 	var validNewNodeClaims []*NodeClaim
 	for _, newNodeClaim := range r.NewNodeClaims {
 		// The InstanceTypeOptions are truncated due to limitations in sending the number of instances to launch API.
@@ -190,10 +190,10 @@ func (r Results) TruncateInstanceTypes(maxInstanceTypes int) Results {
 		}
 	}
 	r.NewNodeClaims = validNewNodeClaims
-	return r
+
 }
 
-func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
+func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTypes int) Results {
 	defer metrics.Measure(SchedulingDurationSeconds.With(
 		prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)},
 	))()
@@ -239,11 +239,13 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
 			delete(errors, k)
 		}
 	}
-	return Results{
+	var results = Results{
 		NewNodeClaims: s.newNodeClaims,
 		ExistingNodes: s.existingNodes,
 		PodErrors:     errors,
 	}
+	results.TruncateInstanceTypes(MaxInstanceTypes)
+	return results
 }
 
 func (s *Scheduler) add(ctx context.Context, pod *corev1.Pod) error {
diff --git a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
index aeeb80a52d..7867cc387f 100644
--- a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
+++ b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
@@ -183,7 +183,7 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) {
 	podsScheduledInRound1 := 0
 	nodesInRound1 := 0
 	for i := 0; i < b.N; i++ {
-		results := scheduler.Solve(ctx, pods)
+		results := scheduler.Solve(ctx, pods, scheduling.MaxInstanceTypes)
 		if i == 0 {
 
 			minPods := math.MaxInt64

From 5ed3b150152654732698a22f24fb59f03d71e9f4 Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Thu, 19 Sep 2024 09:42:46 -0700
Subject: [PATCH 02/11] Added UnschedulablePodsCount metric

---
 .../provisioning/scheduling/metrics.go        | 14 ++++++-
 .../provisioning/scheduling/scheduler.go      |  6 ++-
 .../provisioning/scheduling/suite_test.go     | 41 ++++++++++++++++++-
 3 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/pkg/controllers/provisioning/scheduling/metrics.go b/pkg/controllers/provisioning/scheduling/metrics.go
index 1c3328761c..d49a7b6139 100644
--- a/pkg/controllers/provisioning/scheduling/metrics.go
+++ b/pkg/controllers/provisioning/scheduling/metrics.go
@@ -24,7 +24,7 @@ import (
 )
 
 func init() {
-	crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth, IgnoredPodCount)
+	crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth, IgnoredPodCount, UnschedulablePodsCount)
 }
 
 const (
@@ -65,4 +65,16 @@ var (
 			Help:      "Number of pods ignored during scheduling by Karpenter",
 		},
 	)
+	UnschedulablePodsCount = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: metrics.Namespace,
+			Subsystem: schedulerSubsystem,
+			Name:      "unschedulable_pods_count",
+			Help:      "The number of unschedulable Pods.",
+		},
+		[]string{
+			ControllerLabel,
+			schedulingIDLabel,
+		},
+	)
 )
diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go
index 46af767b6c..434a6c2ac6 100644
--- a/pkg/controllers/provisioning/scheduling/scheduler.go
+++ b/pkg/controllers/provisioning/scheduling/scheduler.go
@@ -203,7 +203,8 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTy
 	// had 5xA pods and 5xB pods were they have a zonal topology spread, but A can only go in one zone and B in another.
 	// We need to schedule them alternating, A, B, A, B, .... and this solution also solves that as well.
 	errors := map[*corev1.Pod]error{}
-	QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) // Reset the metric for the controller, so we don't keep old ids around
+	UnschedulablePodsCount.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) // Reset the metric for the controller, so we don't keep old ids around
+	QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)})             // Reset the metric for the controller, so we don't keep old ids around
 	q := NewQueue(pods...)
 	for {
 		QueueDepth.With(
@@ -245,6 +246,9 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTy
 		PodErrors:     errors,
 	}
 	results.TruncateInstanceTypes(MaxInstanceTypes)
+	UnschedulablePodsCount.With(
+		prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx), schedulingIDLabel: string(s.id)},
+	).Set(float64(len(results.PodErrors)))
 	return results
 }
 
diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index 0a1016ef61..74623e3862 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -112,6 +112,7 @@ var _ = AfterEach(func() {
 	cluster.Reset()
 	scheduling.QueueDepth.Reset()
 	scheduling.SchedulingDurationSeconds.Reset()
+	scheduling.UnschedulablePodsCount.Reset()
 })
 
 var _ = Context("Scheduling", func() {
@@ -3673,9 +3674,45 @@ var _ = Context("Scheduling", func() {
 					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically(">", 0))
 				}, time.Second).Should(Succeed())
 			}()
-			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods)
+			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
 			wg.Wait()
 		})
+		It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
+			nodePool := test.NodePool()
+			ExpectApplied(ctx, env.Client, nodePool)
+			// all of these pods have anti-affinity to each other
+			labels := map[string]string{
+				"app": "nginx",
+			}
+			pods := test.Pods(1000, test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: labels,
+				},
+				PodAntiRequirements: []corev1.PodAffinityTerm{
+					{
+						LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
+						TopologyKey:   corev1.LabelHostname,
+					},
+				},
+			}) // Create 1000 pods which should take long enough to schedule that we should be able to read the UnschedulablePodsCount metric with a value
+			s, err := prov.NewScheduler(ctx, pods, nil)
+			Expect(err).To(BeNil())
+
+			var wg sync.WaitGroup
+			wg.Add(1)
+			go func() {
+				defer GinkgoRecover()
+				defer wg.Done()
+				Eventually(func(g Gomega) {
+					m, ok := FindMetricWithLabelValues("karpenter_scheduler_unschedulable_pods_count", map[string]string{"controller": "provisioner"})
+					g.Expect(ok).To(BeTrue())
+					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically(">=", 0))
+				}, time.Second).Should(Succeed())
+			}()
+			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
+			wg.Wait()
+
+		})
 		It("should surface the schedulingDuration metric after executing a scheduling loop", func() {
 			nodePool := test.NodePool()
 			ExpectApplied(ctx, env.Client, nodePool)
@@ -3696,7 +3733,7 @@ var _ = Context("Scheduling", func() {
 			}) // Create 1000 pods which should take long enough to schedule that we should be able to read the queueDepth metric with a value
 			s, err := prov.NewScheduler(ctx, pods, nil)
 			Expect(err).To(BeNil())
-			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods)
+			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
 
 			m, ok := FindMetricWithLabelValues("karpenter_scheduler_scheduling_duration_seconds", map[string]string{"controller": "provisioner"})
 			Expect(ok).To(BeTrue())

From 6b6d3833a0599f41b30c48a6a61ee3b766e44df3 Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Thu, 19 Sep 2024 12:17:17 -0700
Subject: [PATCH 03/11] Increased time to succeed for UnschedulablePodsCount

---
 pkg/controllers/provisioning/scheduling/suite_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index 74623e3862..95571fdbaa 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -3707,7 +3707,7 @@ var _ = Context("Scheduling", func() {
 					m, ok := FindMetricWithLabelValues("karpenter_scheduler_unschedulable_pods_count", map[string]string{"controller": "provisioner"})
 					g.Expect(ok).To(BeTrue())
 					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically(">=", 0))
-				}, time.Second).Should(Succeed())
+				}, 10*time.Second).Should(Succeed())
 			}()
 			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
 			wg.Wait()

From d90c1441dd0866205532e083a277bb19276e1c3d Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Mon, 23 Sep 2024 11:24:09 -0700
Subject: [PATCH 04/11] Addressed Feedback and reimplemtented Unschedulable
 PodsCount Test

---
 .../provisioning/scheduling/scheduler.go      |  8 +++---
 .../provisioning/scheduling/suite_test.go     | 25 ++++++++-----------
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go
index 434a6c2ac6..7bfb31de29 100644
--- a/pkg/controllers/provisioning/scheduling/scheduler.go
+++ b/pkg/controllers/provisioning/scheduling/scheduler.go
@@ -190,7 +190,6 @@ func (r *Results) TruncateInstanceTypes(maxInstanceTypes int) {
 		}
 	}
 	r.NewNodeClaims = validNewNodeClaims
-
 }
 
 func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTypes int) Results {
@@ -203,8 +202,9 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTy
 	// had 5xA pods and 5xB pods were they have a zonal topology spread, but A can only go in one zone and B in another.
 	// We need to schedule them alternating, A, B, A, B, .... and this solution also solves that as well.
 	errors := map[*corev1.Pod]error{}
-	UnschedulablePodsCount.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) // Reset the metric for the controller, so we don't keep old ids around
-	QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)})             // Reset the metric for the controller, so we don't keep old ids around
+	// Reset the metric for the controller, so we don't keep old ids around
+	UnschedulablePodsCount.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)})
+	QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)})
 	q := NewQueue(pods...)
 	for {
 		QueueDepth.With(
@@ -240,7 +240,7 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTy
 			delete(errors, k)
 		}
 	}
-	var results = Results{
+	results := Results{
 		NewNodeClaims: s.newNodeClaims,
 		ExistingNodes: s.existingNodes,
 		PodErrors:     errors,
diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index 95571fdbaa..6697195504 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -37,6 +37,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/client-go/tools/record"
 	cloudproviderapi "k8s.io/cloud-provider/api"
 	"k8s.io/csi-translation-lib/plugins"
@@ -3679,22 +3680,16 @@ var _ = Context("Scheduling", func() {
 		})
 		It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
 			nodePool := test.NodePool()
+			nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirementWithMinValues{
+				{NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"default-instance-type"}}}}
 			ExpectApplied(ctx, env.Client, nodePool)
-			// all of these pods have anti-affinity to each other
-			labels := map[string]string{
-				"app": "nginx",
+			//Creates 15 pods, 5 schedulable and 10 unschedulable and adds and UID
+			podsUnschedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "unknown"}}, 10)
+			podsSchedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "default-instance-type"}}, 5)
+			pods := append(podsUnschedulable, podsSchedulable...)
+			for _, i := range pods {
+				i.UID = uuid.NewUUID()
 			}
-			pods := test.Pods(1000, test.PodOptions{
-				ObjectMeta: metav1.ObjectMeta{
-					Labels: labels,
-				},
-				PodAntiRequirements: []corev1.PodAffinityTerm{
-					{
-						LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
-						TopologyKey:   corev1.LabelHostname,
-					},
-				},
-			}) // Create 1000 pods which should take long enough to schedule that we should be able to read the UnschedulablePodsCount metric with a value
 			s, err := prov.NewScheduler(ctx, pods, nil)
 			Expect(err).To(BeNil())
 
@@ -3706,7 +3701,7 @@ var _ = Context("Scheduling", func() {
 				Eventually(func(g Gomega) {
 					m, ok := FindMetricWithLabelValues("karpenter_scheduler_unschedulable_pods_count", map[string]string{"controller": "provisioner"})
 					g.Expect(ok).To(BeTrue())
-					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically(">=", 0))
+					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically("==", 10))
 				}, 10*time.Second).Should(Succeed())
 			}()
 			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)

From b1195ad213f5c99d61c59672551ef49522448473 Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Tue, 24 Sep 2024 11:14:34 -0700
Subject: [PATCH 05/11] Set properties inline through test.Nodepool for
 UnschedulablePodsCount

---
 .../provisioning/scheduling/suite_test.go     | 22 ++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index 6697195504..d72b97c5cf 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -3679,9 +3679,25 @@ var _ = Context("Scheduling", func() {
 			wg.Wait()
 		})
 		It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
-			nodePool := test.NodePool()
-			nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirementWithMinValues{
-				{NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: corev1.LabelInstanceTypeStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"default-instance-type"}}}}
+			nodePool := test.NodePool(v1.NodePool{
+				Spec: v1.NodePoolSpec{
+					Template: v1.NodeClaimTemplate{
+						Spec: v1.NodeClaimTemplateSpec{
+							Requirements: []v1.NodeSelectorRequirementWithMinValues{
+								{
+									NodeSelectorRequirement: corev1.NodeSelectorRequirement{
+										Key:      corev1.LabelInstanceTypeStable,
+										Operator: corev1.NodeSelectorOpIn,
+										Values: []string{
+											"default-instance-type",
+										},
+									},
+								},
+							},
+						},
+					},
+				},
+			})
 			ExpectApplied(ctx, env.Client, nodePool)
 			//Creates 15 pods, 5 schedulable and 10 unschedulable and adds and UID
 			podsUnschedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "unknown"}}, 10)

From cb82999bb2a87e4ba9c3cb6d29b9efa2afcba0bd Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Tue, 24 Sep 2024 11:15:32 -0700
Subject: [PATCH 06/11] Clarified use of UID in Scheduler tests

---
 pkg/controllers/provisioning/scheduling/suite_test.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index d72b97c5cf..5222dd0f0b 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -3699,10 +3699,12 @@ var _ = Context("Scheduling", func() {
 				},
 			})
 			ExpectApplied(ctx, env.Client, nodePool)
-			//Creates 15 pods, 5 schedulable and 10 unschedulable and adds and UID
+			//Creates 15 pods, 5 schedulable and 10 unschedulable
 			podsUnschedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "unknown"}}, 10)
 			podsSchedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "default-instance-type"}}, 5)
 			pods := append(podsUnschedulable, podsSchedulable...)
+			//Adds UID to pods for queue in solve. Solve pushes any unschedulable pod back onto the queue and
+			//then maps the current length of the queue to the pod using the UID
 			for _, i := range pods {
 				i.UID = uuid.NewUUID()
 			}

From dae06dbc76217280fa0e791a1d12c4908a283476 Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Wed, 9 Oct 2024 11:01:02 -0700
Subject: [PATCH 07/11] readd truncateInstance

---
 pkg/controllers/disruption/helpers.go           |  2 +-
 pkg/controllers/provisioning/provisioner.go     |  5 ++++-
 .../provisioning/scheduling/metrics.go          |  1 -
 .../provisioning/scheduling/scheduler.go        |  9 +++------
 .../scheduling/scheduling_benchmark_test.go     |  2 +-
 .../provisioning/scheduling/suite_test.go       | 17 ++++++++---------
 pkg/controllers/provisioning/suite_test.go      |  2 +-
 7 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/pkg/controllers/disruption/helpers.go b/pkg/controllers/disruption/helpers.go
index a382f3c6c0..8d7b37bba3 100644
--- a/pkg/controllers/disruption/helpers.go
+++ b/pkg/controllers/disruption/helpers.go
@@ -87,7 +87,7 @@ func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *
 		return client.ObjectKeyFromObject(p), nil
 	})
 
-	results := scheduler.Solve(log.IntoContext(ctx, operatorlogging.NopLogger), pods, pscheduling.MaxInstanceTypes)
+	results := scheduler.Solve(log.IntoContext(ctx, operatorlogging.NopLogger), pods).TruncateInstanceTypes(pscheduling.MaxInstanceTypes)
 	for _, n := range results.ExistingNodes {
 		// We consider existing nodes for scheduling. When these nodes are unmanaged, their taint logic should
 		// tell us if we can schedule to them or not; however, if these nodes are managed, we will still schedule to them
diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go
index 9cb9cbc512..3114e45498 100644
--- a/pkg/controllers/provisioning/provisioner.go
+++ b/pkg/controllers/provisioning/provisioner.go
@@ -344,7 +344,10 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) {
 		}
 		return scheduler.Results{}, fmt.Errorf("creating scheduler, %w", err)
 	}
-	results := s.Solve(ctx, pods, scheduler.MaxInstanceTypes)
+	results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
+	scheduler.UnschedulablePodsCount.With(
+		prometheus.Labels{scheduler.ControllerLabel: injection.GetControllerName(ctx)},
+	).Set(float64(len(results.PodErrors)))
 	if len(results.NewNodeClaims) > 0 {
 		log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KRef(p.Namespace, p.Name).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)")
 	}
diff --git a/pkg/controllers/provisioning/scheduling/metrics.go b/pkg/controllers/provisioning/scheduling/metrics.go
index d49a7b6139..9f6d9cfd1b 100644
--- a/pkg/controllers/provisioning/scheduling/metrics.go
+++ b/pkg/controllers/provisioning/scheduling/metrics.go
@@ -74,7 +74,6 @@ var (
 		},
 		[]string{
 			ControllerLabel,
-			schedulingIDLabel,
 		},
 	)
 )
diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go
index 7bfb31de29..a2e2792e0d 100644
--- a/pkg/controllers/provisioning/scheduling/scheduler.go
+++ b/pkg/controllers/provisioning/scheduling/scheduler.go
@@ -173,7 +173,7 @@ func (r Results) NonPendingPodSchedulingErrors() string {
 
 // TruncateInstanceTypes filters the result based on the maximum number of instanceTypes that needs
 // to be considered. This filters all instance types generated in NewNodeClaims in the Results
-func (r *Results) TruncateInstanceTypes(maxInstanceTypes int) {
+func (r Results) TruncateInstanceTypes(maxInstanceTypes int) Results {
 	var validNewNodeClaims []*NodeClaim
 	for _, newNodeClaim := range r.NewNodeClaims {
 		// The InstanceTypeOptions are truncated due to limitations in sending the number of instances to launch API.
@@ -190,9 +190,10 @@ func (r *Results) TruncateInstanceTypes(maxInstanceTypes int) {
 		}
 	}
 	r.NewNodeClaims = validNewNodeClaims
+	return r
 }
 
-func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTypes int) Results {
+func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
 	defer metrics.Measure(SchedulingDurationSeconds.With(
 		prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)},
 	))()
@@ -245,10 +246,6 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod, maxInstanceTy
 		ExistingNodes: s.existingNodes,
 		PodErrors:     errors,
 	}
-	results.TruncateInstanceTypes(MaxInstanceTypes)
-	UnschedulablePodsCount.With(
-		prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx), schedulingIDLabel: string(s.id)},
-	).Set(float64(len(results.PodErrors)))
 	return results
 }
 
diff --git a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
index 7867cc387f..63d21dedc6 100644
--- a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
+++ b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
@@ -183,7 +183,7 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) {
 	podsScheduledInRound1 := 0
 	nodesInRound1 := 0
 	for i := 0; i < b.N; i++ {
-		results := scheduler.Solve(ctx, pods, scheduling.MaxInstanceTypes)
+		results := scheduler.Solve(ctx, pods).TruncateInstanceTypes(scheduling.MaxInstanceTypes)
 		if i == 0 {
 
 			minPods := math.MaxInt64
diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index 5222dd0f0b..e2fbd3e936 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -37,7 +37,6 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/client-go/tools/record"
 	cloudproviderapi "k8s.io/cloud-provider/api"
 	"k8s.io/csi-translation-lib/plugins"
@@ -103,6 +102,7 @@ var _ = AfterSuite(func() {
 var _ = BeforeEach(func() {
 	// reset instance types
 	newCP := fake.CloudProvider{}
+	//ctx = options.ToContext(ctx, test.Options())
 	cloudProvider.InstanceTypes, _ = newCP.GetInstanceTypes(ctx, nil)
 	cloudProvider.CreateCalls = nil
 	scheduling.MaxInstanceTypes = 60
@@ -3675,10 +3675,10 @@ var _ = Context("Scheduling", func() {
 					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically(">", 0))
 				}, time.Second).Should(Succeed())
 			}()
-			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
+			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods).TruncateInstanceTypes(scheduling.MaxInstanceTypes)
 			wg.Wait()
 		})
-		It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
+		FIt("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
 			nodePool := test.NodePool(v1.NodePool{
 				Spec: v1.NodePoolSpec{
 					Template: v1.NodeClaimTemplate{
@@ -3703,14 +3703,12 @@ var _ = Context("Scheduling", func() {
 			podsUnschedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "unknown"}}, 10)
 			podsSchedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "default-instance-type"}}, 5)
 			pods := append(podsUnschedulable, podsSchedulable...)
+			ExpectApplied(ctx, env.Client, nodePool)
 			//Adds UID to pods for queue in solve. Solve pushes any unschedulable pod back onto the queue and
 			//then maps the current length of the queue to the pod using the UID
 			for _, i := range pods {
-				i.UID = uuid.NewUUID()
+				ExpectApplied(ctx, env.Client, i)
 			}
-			s, err := prov.NewScheduler(ctx, pods, nil)
-			Expect(err).To(BeNil())
-
 			var wg sync.WaitGroup
 			wg.Add(1)
 			go func() {
@@ -3722,7 +3720,8 @@ var _ = Context("Scheduling", func() {
 					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically("==", 10))
 				}, 10*time.Second).Should(Succeed())
 			}()
-			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
+			_, err := prov.Schedule(injection.WithControllerName(ctx, "provisioner"))
+			Expect(err).To(BeNil())
 			wg.Wait()
 
 		})
@@ -3746,7 +3745,7 @@ var _ = Context("Scheduling", func() {
 			}) // Create 1000 pods which should take long enough to schedule that we should be able to read the queueDepth metric with a value
 			s, err := prov.NewScheduler(ctx, pods, nil)
 			Expect(err).To(BeNil())
-			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods, scheduling.MaxInstanceTypes)
+			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods).TruncateInstanceTypes(scheduling.MaxInstanceTypes)
 
 			m, ok := FindMetricWithLabelValues("karpenter_scheduler_scheduling_duration_seconds", map[string]string{"controller": "provisioner"})
 			Expect(ok).To(BeTrue())
diff --git a/pkg/controllers/provisioning/suite_test.go b/pkg/controllers/provisioning/suite_test.go
index 8e497a542c..367dc52f20 100644
--- a/pkg/controllers/provisioning/suite_test.go
+++ b/pkg/controllers/provisioning/suite_test.go
@@ -250,7 +250,7 @@ var _ = Describe("Provisioning", func() {
 			ExpectScheduled(ctx, env.Client, pod)
 		}
 	})
-	It("should not use a different NodePool hash on the NodeClaim if the NodePool changes during scheduling", func() {
+	FIt("should not use a different NodePool hash on the NodeClaim if the NodePool changes during scheduling", func() {
 		// This test was added since we were generating the NodeClaim's NodePool hash from a NodePool that was re-retrieved
 		// after scheduling had been completed. This could have resulted in the hash not accurately reflecting the actual NodePool
 		// state that it was generated from

From b8f6d5bf1ccb0d47adac674af179e68a6cdf72ce Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Wed, 9 Oct 2024 11:11:40 -0700
Subject: [PATCH 08/11] removed commented out code

---
 pkg/controllers/provisioning/scheduling/suite_test.go | 1 -
 pkg/controllers/provisioning/suite_test.go            | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index e2fbd3e936..4245092ad7 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -102,7 +102,6 @@ var _ = AfterSuite(func() {
 var _ = BeforeEach(func() {
 	// reset instance types
 	newCP := fake.CloudProvider{}
-	//ctx = options.ToContext(ctx, test.Options())
 	cloudProvider.InstanceTypes, _ = newCP.GetInstanceTypes(ctx, nil)
 	cloudProvider.CreateCalls = nil
 	scheduling.MaxInstanceTypes = 60
diff --git a/pkg/controllers/provisioning/suite_test.go b/pkg/controllers/provisioning/suite_test.go
index 367dc52f20..8e497a542c 100644
--- a/pkg/controllers/provisioning/suite_test.go
+++ b/pkg/controllers/provisioning/suite_test.go
@@ -250,7 +250,7 @@ var _ = Describe("Provisioning", func() {
 			ExpectScheduled(ctx, env.Client, pod)
 		}
 	})
-	FIt("should not use a different NodePool hash on the NodeClaim if the NodePool changes during scheduling", func() {
+	It("should not use a different NodePool hash on the NodeClaim if the NodePool changes during scheduling", func() {
 		// This test was added since we were generating the NodeClaim's NodePool hash from a NodePool that was re-retrieved
 		// after scheduling had been completed. This could have resulted in the hash not accurately reflecting the actual NodePool
 		// state that it was generated from

From 8081da32f88dd22ab7a0d210ac00fa9c068fab52 Mon Sep 17 00:00:00 2001
From: Leah Dibble <edibble@amazon.com>
Date: Wed, 9 Oct 2024 11:28:18 -0700
Subject: [PATCH 09/11] fixed presubmit

---
 pkg/controllers/provisioning/scheduling/suite_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index 4245092ad7..ed2287e06d 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -3677,7 +3677,7 @@ var _ = Context("Scheduling", func() {
 			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods).TruncateInstanceTypes(scheduling.MaxInstanceTypes)
 			wg.Wait()
 		})
-		FIt("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
+		It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
 			nodePool := test.NodePool(v1.NodePool{
 				Spec: v1.NodePoolSpec{
 					Template: v1.NodeClaimTemplate{

From 107559a88b3b5891bdb4556cdc75690473ec3031 Mon Sep 17 00:00:00 2001
From: edibble21 <85638465+edibble21@users.noreply.github.com>
Date: Tue, 29 Oct 2024 10:34:02 -0700
Subject: [PATCH 10/11] returned results directly

---
 pkg/controllers/provisioning/scheduling/scheduler.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go
index a2e2792e0d..d19b4de1ed 100644
--- a/pkg/controllers/provisioning/scheduling/scheduler.go
+++ b/pkg/controllers/provisioning/scheduling/scheduler.go
@@ -241,12 +241,11 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
 			delete(errors, k)
 		}
 	}
-	results := Results{
+	return Results{
 		NewNodeClaims: s.newNodeClaims,
 		ExistingNodes: s.existingNodes,
 		PodErrors:     errors,
 	}
-	return results
 }
 
 func (s *Scheduler) add(ctx context.Context, pod *corev1.Pod) error {

From d48b0e3ad6f78a7fbbf16166bba909ba6dcfc7f8 Mon Sep 17 00:00:00 2001
From: edibble21 <85638465+edibble21@users.noreply.github.com>
Date: Wed, 30 Oct 2024 13:02:13 -0700
Subject: [PATCH 11/11] removed truncateinstancetypes and removed eventually

---
 .../scheduling/scheduling_benchmark_test.go   |  2 +-
 .../provisioning/scheduling/suite_test.go     | 20 +++++--------------
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
index 63d21dedc6..aeeb80a52d 100644
--- a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
+++ b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go
@@ -183,7 +183,7 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) {
 	podsScheduledInRound1 := 0
 	nodesInRound1 := 0
 	for i := 0; i < b.N; i++ {
-		results := scheduler.Solve(ctx, pods).TruncateInstanceTypes(scheduling.MaxInstanceTypes)
+		results := scheduler.Solve(ctx, pods)
 		if i == 0 {
 
 			minPods := math.MaxInt64
diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go
index ed2287e06d..0414e1f0a0 100644
--- a/pkg/controllers/provisioning/scheduling/suite_test.go
+++ b/pkg/controllers/provisioning/scheduling/suite_test.go
@@ -3674,7 +3674,7 @@ var _ = Context("Scheduling", func() {
 					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically(">", 0))
 				}, time.Second).Should(Succeed())
 			}()
-			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods).TruncateInstanceTypes(scheduling.MaxInstanceTypes)
+			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods)
 			wg.Wait()
 		})
 		It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
@@ -3708,21 +3708,11 @@ var _ = Context("Scheduling", func() {
 			for _, i := range pods {
 				ExpectApplied(ctx, env.Client, i)
 			}
-			var wg sync.WaitGroup
-			wg.Add(1)
-			go func() {
-				defer GinkgoRecover()
-				defer wg.Done()
-				Eventually(func(g Gomega) {
-					m, ok := FindMetricWithLabelValues("karpenter_scheduler_unschedulable_pods_count", map[string]string{"controller": "provisioner"})
-					g.Expect(ok).To(BeTrue())
-					g.Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically("==", 10))
-				}, 10*time.Second).Should(Succeed())
-			}()
 			_, err := prov.Schedule(injection.WithControllerName(ctx, "provisioner"))
+			m, ok := FindMetricWithLabelValues("karpenter_scheduler_unschedulable_pods_count", map[string]string{"controller": "provisioner"})
+			Expect(ok).To(BeTrue())
+			Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically("==", 10))
 			Expect(err).To(BeNil())
-			wg.Wait()
-
 		})
 		It("should surface the schedulingDuration metric after executing a scheduling loop", func() {
 			nodePool := test.NodePool()
@@ -3744,7 +3734,7 @@ var _ = Context("Scheduling", func() {
 			}) // Create 1000 pods which should take long enough to schedule that we should be able to read the queueDepth metric with a value
 			s, err := prov.NewScheduler(ctx, pods, nil)
 			Expect(err).To(BeNil())
-			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods).TruncateInstanceTypes(scheduling.MaxInstanceTypes)
+			s.Solve(injection.WithControllerName(ctx, "provisioner"), pods)
 
 			m, ok := FindMetricWithLabelValues("karpenter_scheduler_scheduling_duration_seconds", map[string]string{"controller": "provisioner"})
 			Expect(ok).To(BeTrue())