Skip to content

Commit

Permalink
chore: Additional upstream metrics Part1
Browse files Browse the repository at this point in the history
  • Loading branch information
jigisha620 committed Sep 18, 2024
1 parent 71f7aef commit 82fd92c
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 6 deletions.
21 changes: 18 additions & 3 deletions pkg/controllers/metrics/pod/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ var (
Objectives: metrics.SummaryObjectives(),
},
)
// podBoundDurationSeconds observes, per pod, the latency from pod creation
// until the PodScheduled condition became true (i.e. the pod was bound to a
// node). Labeled with labelNames(); recorded in recordPodStartupMetric.
podBoundDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "karpenter",
Subsystem: metrics.PodSubsystem,
Name: "bound_duration_seconds",
Help: "The time from pod creation until the pod is bound.",
},
labelNames(),
)
)

// Controller for the resource
Expand All @@ -82,7 +91,7 @@ type Controller struct {
}

// init registers this package's pod metrics with the controller-runtime
// metrics registry so they are exposed on the shared /metrics endpoint.
// NOTE(review): the diff rendering showed both the pre- and post-change
// MustRegister lines; registering the same collector twice panics, so only
// the post-change call (which includes podBoundDurationSeconds) is kept.
func init() {
	crmetrics.Registry.MustRegister(podState, podStartupDurationSeconds, podBoundDurationSeconds)
}

func labelNames() []string {
Expand Down Expand Up @@ -132,13 +141,19 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco
Labels: labels,
},
})
c.recordPodStartupMetric(pod)
c.recordPodStartupMetric(pod, labels)
return reconcile.Result{}, nil
}

func (c *Controller) recordPodStartupMetric(pod *corev1.Pod) {
func (c *Controller) recordPodStartupMetric(pod *corev1.Pod, labels prometheus.Labels) {
key := client.ObjectKeyFromObject(pod).String()
if pod.Status.Phase == phasePending {
cond, ok := lo.Find(pod.Status.Conditions, func(c corev1.PodCondition) bool {
return c.Type == corev1.PodScheduled
})
if ok && cond.Status == corev1.ConditionTrue {
podBoundDurationSeconds.With(labels).Observe(cond.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds())
}
c.pendingPods.Insert(key)
return
}
Expand Down
14 changes: 14 additions & 0 deletions pkg/controllers/metrics/pod/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"context"
"testing"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -84,6 +86,18 @@ var _ = Describe("Pod Metrics", func() {
})
Expect(found).To(BeTrue())
})
It("should update the pod bound_duration_seconds metric", func() {
p := test.Pod()
p.Status.Phase = corev1.PodPending
p.Status.Conditions = []corev1.PodCondition{{Type: corev1.PodScheduled, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}}
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p))
_, found := FindMetricWithLabelValues("karpenter_pods_bound_duration_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeTrue())
})
It("should delete the pod state metric on pod delete", func() {
p := test.Pod()
ExpectApplied(ctx, env.Client, p)
Expand Down
3 changes: 3 additions & 0 deletions pkg/controllers/node/termination/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile

return reconcile.Result{RequeueAfter: 1 * time.Second}, nil
}
NodesDrainedTotal.With(prometheus.Labels{
metrics.NodePoolLabel: node.Labels[v1.NodePoolLabelKey],
}).Inc()
// In order for Pods associated with PersistentVolumes to smoothly migrate from the terminating Node, we wait
// for VolumeAttachments of drain-able Pods to be cleaned up before terminating Node and removing its finalizer.
// However, if TerminationGracePeriod is configured for Node, and we are past that period, we will skip waiting.
Expand Down
12 changes: 11 additions & 1 deletion pkg/controllers/node/termination/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ import (
func init() {
crmetrics.Registry.MustRegister(
TerminationDurationSeconds,
NodeLifetimeDurationSeconds)
NodeLifetimeDurationSeconds,
NodesDrainedTotal)
}

const dayDuration = time.Hour * 24
Expand All @@ -44,6 +45,15 @@ var (
},
[]string{metrics.NodePoolLabel},
)
// NodesDrainedTotal counts nodes whose drain completed during termination,
// labeled by the owning NodePool. Incremented once per node in the
// termination controller's finalize path.
NodesDrainedTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metrics.Namespace,
Subsystem: metrics.NodeSubsystem,
Name: "drained_total",
Help: "The total number of nodes drained by Karpenter",
},
[]string{metrics.NodePoolLabel},
)
NodeLifetimeDurationSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: metrics.Namespace,
Expand Down
2 changes: 2 additions & 0 deletions pkg/controllers/node/termination/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ var _ = Describe("Termination", func() {
metrics.NodesTerminatedTotal.Reset()
termination.TerminationDurationSeconds.Reset()
termination.NodeLifetimeDurationSeconds.Reset()
termination.NodesDrainedTotal.Reset()
})

Context("Reconciliation", func() {
Expand Down Expand Up @@ -841,6 +842,7 @@ var _ = Describe("Termination", func() {
node = ExpectNodeExists(ctx, env.Client, node.Name)
// Reconcile twice, once to set the NodeClaim to terminating, another to check the instance termination status (and delete the node).
ExpectObjectReconciled(ctx, env.Client, terminationController, node)
ExpectMetricCounterValue(termination.NodesDrainedTotal, 1, map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]})
ExpectObjectReconciled(ctx, env.Client, terminationController, node)

m, ok := FindMetricWithLabelValues("karpenter_nodes_terminated_total", map[string]string{"nodepool": node.Labels[v1.NodePoolLabelKey]})
Expand Down
3 changes: 2 additions & 1 deletion pkg/controllers/provisioning/provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,14 @@ func (p *Provisioner) GetPendingPods(ctx context.Context) ([]*corev1.Pod, error)
if err != nil {
return nil, fmt.Errorf("listing pods, %w", err)
}
pods = lo.Reject(pods, func(po *corev1.Pod, _ int) bool {
rejectedPods, pods := lo.FilterReject(pods, func(po *corev1.Pod, _ int) bool {
if err := p.Validate(ctx, po); err != nil {
log.FromContext(ctx).WithValues("Pod", klog.KRef(po.Namespace, po.Name)).V(1).Info(fmt.Sprintf("ignoring pod, %s", err))
return true
}
return false
})
metrics.IgnoredPodCount.Set(float64(len(rejectedPods)))
p.consolidationWarnings(ctx, pods)
return pods, nil
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/controllers/provisioning/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package provisioning_test
import (
"context"
"fmt"
"sigs.k8s.io/karpenter/pkg/metrics"
"testing"
"time"

Expand Down Expand Up @@ -97,6 +98,7 @@ var _ = AfterEach(func() {
ExpectCleanedUp(ctx, env.Client)
cloudProvider.Reset()
cluster.Reset()
metrics.IgnoredPodCount.Set(0)
})

var _ = Describe("Provisioning", func() {
Expand Down Expand Up @@ -1338,6 +1340,7 @@ var _ = Describe("Provisioning", func() {
PersistentVolumeClaims: []string{"invalid"},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectMetricGaugeValue(metrics.IgnoredPodCount, 1, nil)
ExpectNotScheduled(ctx, env.Client, pod)
})
It("should schedule with an empty storage class if the pvc is bound", func() {
Expand Down
9 changes: 8 additions & 1 deletion pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,16 @@ var (
NodePoolLabel,
},
)
IgnoredPodCount = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Name: "ignored_pod_count",
Help: "Number of pods ignored during scheduling by Karpenter",
},
)
)

func init() {
crmetrics.Registry.MustRegister(NodeClaimsCreatedTotal, NodeClaimsTerminatedTotal, NodeClaimsDisruptedTotal,
NodesCreatedTotal, NodesTerminatedTotal)
NodesCreatedTotal, NodesTerminatedTotal, IgnoredPodCount)
}

0 comments on commit 82fd92c

Please sign in to comment.