From 3897103b77a6f28f9c6d26ccf0fd1cb0336f5c20 Mon Sep 17 00:00:00 2001 From: Siyuan Zhang Date: Mon, 16 Oct 2023 15:16:42 -0700 Subject: [PATCH] etcdserver: add metric counters for livez/readyz health checks. Signed-off-by: Siyuan Zhang --- server/etcdserver/api/etcdhttp/health.go | 91 +++++++++++++++---- server/etcdserver/api/etcdhttp/health_test.go | 54 +++++++++++ 2 files changed, 127 insertions(+), 18 deletions(-) diff --git a/server/etcdserver/api/etcdhttp/health.go b/server/etcdserver/api/etcdhttp/health.go index 234580805b4..2791689e7f3 100644 --- a/server/etcdserver/api/etcdhttp/health.go +++ b/server/etcdserver/api/etcdhttp/health.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +// This file defines the http endpoints for etcd health checks. +// The endpoints include /livez, /readyz and /health. + package etcdhttp import ( @@ -34,8 +37,13 @@ import ( ) const ( - PathHealth = "/health" - PathProxyHealth = "/proxy/health" + PathHealth = "/health" + PathProxyHealth = "/proxy/health" + HealthStatusSuccess string = "success" + HealthStatusError string = "error" + checkTypeLivez = "livez" + checkTypeReadyz = "readyz" + checkTypeHealth = "health" ) type ServerHealth interface { @@ -111,11 +119,29 @@ var ( Name: "health_failures", Help: "The total number of failed health checks", }) + healthCheckGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "healthcheck", + Help: "The result of each kind of healthcheck.", + }, + []string{"type", "name"}, + ) + healthCheckCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "healthchecks_total", + Help: "The total number of each kind of healthcheck.", + }, + []string{"type", "name", "status"}, + ) ) func init() { prometheus.MustRegister(healthSuccess) prometheus.MustRegister(healthFailed) + prometheus.MustRegister(healthCheckGauge) + prometheus.MustRegister(healthCheckCounter) } // Health defines etcd server health status. @@ -125,6 +151,12 @@ type Health struct { Reason string `json:"reason"` } +// HealthStatus is used in new /readyz or /livez health checks instead of the Health struct. +type HealthStatus struct { + Reason string `json:"reason"` + Status string `json:"status"` +} + func getQuerySet(r *http.Request, query string) StringSet { querySet := make(map[string]struct{}) qs, found := r.URL.Query()[query] @@ -201,18 +233,18 @@ func checkAPI(ctx context.Context, lg *zap.Logger, srv ServerHealth, serializabl type HealthCheck func(ctx context.Context) error type CheckRegistry struct { - path string - checks map[string]HealthCheck + checkType string + checks map[string]HealthCheck } func installLivezEndpoints(lg *zap.Logger, mux *http.ServeMux, server ServerHealth) { - reg := CheckRegistry{path: "/livez", checks: make(map[string]HealthCheck)} + reg := CheckRegistry{checkType: checkTypeLivez, checks: make(map[string]HealthCheck)} reg.Register("serializable_read", serializableReadCheck(server)) reg.InstallHttpEndpoints(lg, mux) } func installReadyzEndpoints(lg *zap.Logger, mux *http.ServeMux, server ServerHealth) { - reg := CheckRegistry{path: "/readyz", checks: make(map[string]HealthCheck)} + reg := CheckRegistry{checkType: checkTypeReadyz, checks: make(map[string]HealthCheck)} reg.Register("data_corruption", activeAlarmCheck(server, pb.AlarmType_CORRUPT)) reg.Register("serializable_read", serializableReadCheck(server)) reg.InstallHttpEndpoints(lg, mux) @@ -222,6 +254,10 @@ func (reg *CheckRegistry) Register(name string, check HealthCheck) { reg.checks[name] = check } +func (reg *CheckRegistry) RootPath() string { + return "/" + reg.checkType +} + func (reg *CheckRegistry) InstallHttpEndpoints(lg *zap.Logger, mux *http.ServeMux) { checkNames := make([]string, 0, len(reg.checks)) for k := range reg.checks { @@ -229,19 +265,19 @@ func (reg *CheckRegistry) InstallHttpEndpoints(lg *zap.Logger, mux *http.ServeMu } // installs the http handler for the root path. - reg.installRootHttpEndpoint(lg, mux, reg.path, checkNames...) + reg.installRootHttpEndpoint(lg, mux, checkNames...) for _, checkName := range checkNames { // installs the http handler for the individual check sub path. - subpath := path.Join(reg.path, checkName) + subpath := path.Join(reg.RootPath(), checkName) check := checkName - mux.Handle(subpath, newHealthHandler(subpath, lg, func(r *http.Request) Health { + mux.Handle(subpath, newHealthHandler(subpath, lg, func(r *http.Request) HealthStatus { return reg.runHealthChecks(r.Context(), check) })) } } -func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...string) Health { - h := Health{Health: "true"} +func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...string) HealthStatus { + h := HealthStatus{Status: HealthStatusSuccess} var individualCheckOutput bytes.Buffer for _, checkName := range checkNames { check, found := reg.checks[checkName] @@ -250,9 +286,11 @@ func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...str } if err := check(ctx); err != nil { fmt.Fprintf(&individualCheckOutput, "[-]%s failed: %v\n", checkName, err) - h.Health = "false" + h.Status = HealthStatusError + recordMetrics(reg.checkType, checkName, HealthStatusError) } else { fmt.Fprintf(&individualCheckOutput, "[+]%s ok\n", checkName) + recordMetrics(reg.checkType, checkName, HealthStatusSuccess) } } h.Reason = individualCheckOutput.String() @@ -260,19 +298,20 @@ func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...str } // installRootHttpEndpoint installs the http handler for the root path. -func (reg *CheckRegistry) installRootHttpEndpoint(lg *zap.Logger, mux *http.ServeMux, path string, checks ...string) { - hfunc := func(r *http.Request) Health { +func (reg *CheckRegistry) installRootHttpEndpoint(lg *zap.Logger, mux *http.ServeMux, checks ...string) { + hfunc := func(r *http.Request) HealthStatus { // extracts the health check names to be excludeList from the query param excluded := getQuerySet(r, "exclude") filteredCheckNames := filterCheckList(lg, listToStringSet(checks), excluded) - return reg.runHealthChecks(r.Context(), filteredCheckNames...) + h := reg.runHealthChecks(r.Context(), filteredCheckNames...) + return h } - mux.Handle(path, newHealthHandler(path, lg, hfunc)) + mux.Handle(reg.RootPath(), newHealthHandler(reg.RootPath(), lg, hfunc)) } // newHealthHandler generates a http HandlerFunc for a health check function hfunc. -func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) Health) http.HandlerFunc { +func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) HealthStatus) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodGet { w.Header().Set("Allow", http.MethodGet) @@ -282,7 +321,7 @@ func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) Hea } h := hfunc(r) // Always returns detailed reason for failed checks. - if h.Health != "true" { + if h.Status == HealthStatusError { http.Error(w, h.Reason, http.StatusServiceUnavailable) lg.Error("Health check error", zap.String("path", path), zap.String("reason", h.Reason), zap.Int("status-code", http.StatusServiceUnavailable)) return @@ -342,6 +381,22 @@ func listToStringSet(list []string) StringSet { return set } +func recordMetrics(checkType, name string, status string) { + val := 0.0 + if status == HealthStatusSuccess { + val = 1.0 + } + healthCheckGauge.With(prometheus.Labels{ + "type": checkType, + "name": name, + }).Set(val) + healthCheckCounter.With(prometheus.Labels{ + "type": checkType, + "name": name, + "status": status, + }).Inc() +} + // activeAlarmCheck checks if a specific alarm type is active in the server. func activeAlarmCheck(srv ServerHealth, at pb.AlarmType) func(context.Context) error { return func(ctx context.Context) error { diff --git a/server/etcdserver/api/etcdhttp/health_test.go b/server/etcdserver/api/etcdhttp/health_test.go index 122fbf6adcf..e0aafd4c273 100644 --- a/server/etcdserver/api/etcdhttp/health_test.go +++ b/server/etcdserver/api/etcdhttp/health_test.go @@ -23,6 +23,7 @@ import ( "strings" "testing" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap/zaptest" "go.etcd.io/raft/v3" @@ -193,6 +194,7 @@ func TestHttpSubPath(t *testing.T) { ts := httptest.NewServer(mux) defer ts.Close() checkHttpResponse(t, ts, tt.healthCheckURL, tt.expectStatusCode, tt.inResult, tt.notInResult) + checkMetrics(t, tt.healthCheckURL, "", tt.expectStatusCode) }) } } @@ -291,6 +293,7 @@ func TestSerializableReadCheck(t *testing.T) { ts := httptest.NewServer(mux) defer ts.Close() checkHttpResponse(t, ts, tt.healthCheckURL, tt.expectStatusCode, tt.inResult, tt.notInResult) + checkMetrics(t, tt.healthCheckURL, "serializable_read", tt.expectStatusCode) }) } } @@ -323,3 +326,54 @@ func checkHttpResponse(t *testing.T, ts *httptest.Server, url string, expectStat } } } + +func checkMetrics(t *testing.T, url, checkName string, expectStatusCode int) { + defer healthCheckGauge.Reset() + defer healthCheckCounter.Reset() + + typeName := strings.TrimPrefix(strings.Split(url, "?")[0], "/") + if len(checkName) == 0 { + checkName = strings.Split(typeName, "/")[1] + typeName = strings.Split(typeName, "/")[0] + } + + expectedSuccessCount := 1 + expectedErrorCount := 0 + if expectStatusCode != http.StatusOK { + expectedSuccessCount = 0 + expectedErrorCount = 1 + } + + gather, _ := prometheus.DefaultGatherer.Gather() + for _, mf := range gather { + name := *mf.Name + val := 0 + switch name { + case "etcd_server_healthcheck": + val = int(mf.GetMetric()[0].GetGauge().GetValue()) + case "etcd_server_healthcheck_total": + val = int(mf.GetMetric()[0].GetCounter().GetValue()) + default: + continue + } + labelMap := make(map[string]string) + for _, label := range mf.GetMetric()[0].Label { + labelMap[label.GetName()] = label.GetValue() + } + if typeName != labelMap["type"] { + continue + } + if labelMap["name"] != checkName { + continue + } + if statusLabel, found := labelMap["status"]; found && statusLabel == HealthStatusError { + if val != expectedErrorCount { + t.Fatalf("%s got errorCount %d, wanted %d\n", name, val, expectedErrorCount) + } + } else { + if val != expectedSuccessCount { + t.Fatalf("%s got expectedSuccessCount %d, wanted %d\n", name, val, expectedSuccessCount) + } + } + } +}