Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented UnschedulablePodsCount metric #1698

Merged
merged 12 commits into from
Oct 30, 2024
3 changes: 3 additions & 0 deletions pkg/controllers/provisioning/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,9 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) {
return scheduler.Results{}, fmt.Errorf("creating scheduler, %w", err)
}
results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
scheduler.UnschedulablePodsCount.With(
prometheus.Labels{scheduler.ControllerLabel: injection.GetControllerName(ctx)},
).Set(float64(len(results.PodErrors)))
if len(results.NewNodeClaims) > 0 {
log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KRef(p.Namespace, p.Name).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)")
}
Expand Down
13 changes: 12 additions & 1 deletion pkg/controllers/provisioning/scheduling/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
)

// init registers the scheduling metrics with the controller-runtime
// metrics registry so they are exposed on the /metrics endpoint.
// Each collector must be registered exactly once: MustRegister panics
// on duplicate registration.
func init() {
	crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth, IgnoredPodCount, UnschedulablePodsCount)
}

const (
Expand Down Expand Up @@ -65,4 +65,15 @@ var (
Help: "Number of pods ignored during scheduling by Karpenter",
},
)
// UnschedulablePodsCount reports, per controller, how many pods failed to
// schedule in the most recent scheduling loop (set from len(results.PodErrors)
// after Solve; stale label sets are cleared via DeletePartialMatch at the
// start of each loop).
UnschedulablePodsCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: schedulerSubsystem,
Name:      "unschedulable_pods_count",
Help:      "The number of unschedulable Pods.",
},
[]string{
// ControllerLabel distinguishes which controller (e.g. provisioner,
// disruption) produced the measurement.
ControllerLabel,
},
)
)
4 changes: 3 additions & 1 deletion pkg/controllers/provisioning/scheduling/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,9 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
// had 5xA pods and 5xB pods were they have a zonal topology spread, but A can only go in one zone and B in another.
// We need to schedule them alternating, A, B, A, B, .... and this solution also solves that as well.
errors := map[*corev1.Pod]error{}
QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) // Reset the metric for the controller, so we don't keep old ids around
// Reset the metric for the controller, so we don't keep old ids around
UnschedulablePodsCount.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)})
QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)})
q := NewQueue(pods...)
for {
QueueDepth.With(
Expand Down
38 changes: 38 additions & 0 deletions pkg/controllers/provisioning/scheduling/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ var _ = AfterEach(func() {
cluster.Reset()
scheduling.QueueDepth.Reset()
scheduling.SchedulingDurationSeconds.Reset()
scheduling.UnschedulablePodsCount.Reset()
})

var _ = Context("Scheduling", func() {
Expand Down Expand Up @@ -3676,6 +3677,43 @@ var _ = Context("Scheduling", func() {
s.Solve(injection.WithControllerName(ctx, "provisioner"), pods)
wg.Wait()
})
It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
	// NodePool restricted to a single known instance type so that pods
	// selecting any other instance type are guaranteed to be unschedulable.
	nodePool := test.NodePool(v1.NodePool{
		Spec: v1.NodePoolSpec{
			Template: v1.NodeClaimTemplate{
				Spec: v1.NodeClaimTemplateSpec{
					Requirements: []v1.NodeSelectorRequirementWithMinValues{
						{
							NodeSelectorRequirement: corev1.NodeSelectorRequirement{
								Key:      corev1.LabelInstanceTypeStable,
								Operator: corev1.NodeSelectorOpIn,
								Values: []string{
									"default-instance-type",
								},
							},
						},
					},
				},
			},
		},
	})
	ExpectApplied(ctx, env.Client, nodePool)
	// Create 15 pods: 5 schedulable (match the NodePool's instance type)
	// and 10 unschedulable (select an unknown instance type).
	podsUnschedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "unknown"}}, 10)
	podsSchedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "default-instance-type"}}, 5)
	pods := append(podsUnschedulable, podsSchedulable...)
	// Apply the pods so they get UIDs; Solve keys its internal queue
	// bookkeeping by pod UID.
	for _, pod := range pods {
		ExpectApplied(ctx, env.Client, pod)
	}
	// Fail fast on a scheduling error before asserting on the metric.
	_, err := prov.Schedule(injection.WithControllerName(ctx, "provisioner"))
	Expect(err).To(BeNil())
	m, ok := FindMetricWithLabelValues("karpenter_scheduler_unschedulable_pods_count", map[string]string{"controller": "provisioner"})
	Expect(ok).To(BeTrue())
	// Only the 10 pods with the unknown instance type should be counted.
	Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically("==", 10))
})
It("should surface the schedulingDuration metric after executing a scheduling loop", func() {
nodePool := test.NodePool()
ExpectApplied(ctx, env.Client, nodePool)
Expand Down
Loading