-
Notifications
You must be signed in to change notification settings - Fork 34
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding system metrics support #97
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,7 @@ import ( | |
"sigs.k8s.io/controller-runtime/pkg/reconcile" | ||
|
||
leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1" | ||
"sigs.k8s.io/lws/pkg/metrics" | ||
"sigs.k8s.io/lws/pkg/utils" | ||
podutils "sigs.k8s.io/lws/pkg/utils/pod" | ||
statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset" | ||
|
@@ -348,7 +349,7 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l | |
readyCount := 0 | ||
updatedCount := 0 | ||
templateHash := utils.LeaderWorkerTemplateHash(lws) | ||
|
||
var readyLeaderPods []corev1.Pod | ||
// Iterate through all statefulsets. | ||
for _, sts := range lwssts.Items { | ||
if sts.Name == lws.Name { | ||
|
@@ -367,7 +368,7 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l | |
} | ||
if podutils.PodRunningAndReady(leaderPod) { | ||
readyCount++ | ||
|
||
readyLeaderPods = append(readyLeaderPods, leaderPod) | ||
if sts.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash && leaderPod.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash { | ||
updatedCount++ | ||
} | ||
|
@@ -389,6 +390,9 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l | |
updateCondition := setCondition(lws, condition) | ||
// if condition changed, record events | ||
if updateCondition { | ||
if updatedCount == int(*lws.Spec.Replicas) { | ||
metrics.ReplicaReadyStatus(readyLeaderPods, getLastTransitionTime(string(leaderworkerset.LeaderWorkerSetProgressing), lws)) | ||
} | ||
r.Record.Eventf(lws, corev1.EventTypeNormal, condition.Reason, condition.Message+fmt.Sprintf(", with %d groups ready of total %d groups", readyCount, int(*lws.Spec.Replicas))) | ||
} | ||
return updateStatus || updateCondition, nil | ||
|
@@ -569,3 +573,12 @@ func templateUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerS | |
func replicasUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerSet) bool { | ||
return *sts.Spec.Replicas != *lws.Spec.Replicas | ||
} | ||
|
||
func getLastTransitionTime(conditionType string, lws *leaderworkerset.LeaderWorkerSet) metav1.Time { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe move this to a util file? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was thinking about that as well, can create an lws utils file similar to the one that exists for pods, and move this function, |
||
for _, condition := range lws.Status.Conditions { | ||
if condition.Type == conditionType { | ||
return condition.LastTransitionTime | ||
} | ||
} | ||
return metav1.Now() | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
/* | ||
Copyright 2023. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package metrics | ||
|
||
import ( | ||
"time" | ||
|
||
corev1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
podutils "sigs.k8s.io/lws/pkg/utils/pod" | ||
|
||
"github.com/prometheus/client_golang/prometheus" | ||
"sigs.k8s.io/controller-runtime/pkg/metrics" | ||
) | ||
|
||
var ( | ||
rollingUpdateDuration = prometheus.NewHistogramVec( | ||
prometheus.HistogramOpts{ | ||
Subsystem: "lws", | ||
Name: "rolling_update_duration", | ||
Help: "Duration of rolling updates", | ||
}, []string{"hash"}, | ||
) | ||
|
||
recreateGroupTimes = prometheus.NewCounterVec( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is not called anywhere else right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, adding it to register it, but waiting on #106 to implement it. |
||
prometheus.CounterOpts{ | ||
Subsystem: "lws", | ||
Name: "recreate_group_times", | ||
Help: "number of times a group has been recreated", | ||
}, []string{"leadername"}, | ||
) | ||
|
||
replicaReadyStatusDuration = prometheus.NewHistogramVec( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a comment here about what is the metrics record, like "the latency for a pod group to be ready after creation" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. but overall, I start to think about whether it's necessary to have this metrics for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the help attribute of the object describes what it records, but I can add a comment for it as well There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we need such a metric. The time it takes for the pod to become ready is not a characteristic of the lws controller. |
||
prometheus.HistogramOpts{ | ||
Subsystem: "lws", | ||
Name: "replica_ready_status_duration", | ||
Help: "latency for each replica to be scheduled and become ready", | ||
}, []string{"leadername"}, | ||
) | ||
) | ||
|
||
func RollingUpdate(hash string, duration time.Duration) { | ||
rollingUpdateDuration.WithLabelValues(hash).Observe(duration.Seconds()) | ||
} | ||
|
||
func RecreatingGroup(leaderName string) { | ||
recreateGroupTimes.WithLabelValues(leaderName).Inc() | ||
} | ||
|
||
func ReplicaReadyStatus(readyPods []corev1.Pod, startTime metav1.Time) { | ||
for _, pod := range readyPods { | ||
readyTime := podutils.PodReadyConditionLastTransitionTime(pod).Time | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when the leader pod is ready, its worker statefulset may not be ready right? the latency here calculates the leader pods' latency to be ready There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we may need to pass the worker sts and leader pod as a pair and make sure that latency=worker_sts_ready-leader_pod_created, wdyt There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The LeaderPod is only added into the list that is passed if the worker sts is ready, that is checked here I guess I'm a bit confused with what the replica latency measures, I thought it was from when the lws object was created/edited to when the replica is ready. So is it from when the leader statefulset is created to when the worker sts is ready? |
||
latency := readyTime.Sub(startTime.Time) | ||
replicaReadyStatusDuration.WithLabelValues(pod.Name).Observe(latency.Seconds()) | ||
} | ||
} | ||
|
||
func Register() { | ||
metrics.Registry.MustRegister( | ||
rollingUpdateDuration, | ||
recreateGroupTimes, | ||
replicaReadyStatusDuration, | ||
) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package metrics | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/prometheus/client_golang/prometheus" | ||
"github.com/prometheus/client_golang/prometheus/testutil" | ||
) | ||
|
||
func TestRecreatingGroup(t *testing.T) { | ||
prometheus.MustRegister(recreateGroupTimes) | ||
|
||
RecreatingGroup("lws-sample-0") | ||
RecreatingGroup("lws-sample-1") | ||
RecreatingGroup("lws-sample-0") | ||
|
||
if count := testutil.CollectAndCount(recreateGroupTimes); count != 2 { | ||
t.Errorf("Expecting %d metrics, got: %d", 2, count) | ||
} | ||
|
||
if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-0")); count != float64(2) { | ||
t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-0", 2, count) | ||
} | ||
|
||
if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-1")); count != float64(1) { | ||
t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-1", 1, count) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the start time here is when the lws just progress, there might be a possibility that the leader pod is not scheduled.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Asked a similar question in other comment: what is the replica latency actually measuring? I thought it was from lws creation to when the replica is ready