diff --git a/pkg/exporters/drift/drift.go b/pkg/exporters/drift/drift.go
index b3b7a47..bcc46ff 100644
--- a/pkg/exporters/drift/drift.go
+++ b/pkg/exporters/drift/drift.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"github.com/01builders/ev-metrics/pkg/metrics"
+	"github.com/01builders/ev-metrics/pkg/utils"
 	"github.com/ethereum/go-ethereum/ethclient"
 	"github.com/rs/zerolog"
 	"time"
@@ -53,9 +54,12 @@ func (e exporter) ExportMetrics(ctx context.Context, m *metrics.Metrics) error {
 			refHeight, err := getBlockHeight(ctx, e.referenceNode)
 			if err != nil {
 				e.logger.Error().Err(err).Str("endpoint", e.referenceNode).Msg("failed to get reference node block height")
+				m.RecordEndpointAvailability(e.chainID, e.referenceNode, false)
+				m.RecordEndpointError(e.chainID, e.referenceNode, utils.CategorizeError(err))
 				continue
 			}
 
+			m.RecordEndpointAvailability(e.chainID, e.referenceNode, true)
 			m.RecordReferenceBlockHeight(e.chainID, e.referenceNode, refHeight)
 			e.logger.Info().Uint64("height", refHeight).Str("endpoint", e.referenceNode).Msg("recorded reference node height")
 
@@ -64,9 +68,12 @@ func (e exporter) ExportMetrics(ctx context.Context, m *metrics.Metrics) error {
 				currentHeight, err := getBlockHeight(ctx, fullNode)
 				if err != nil {
 					e.logger.Error().Err(err).Str("endpoint", fullNode).Msg("failed to get full node block height")
+					m.RecordEndpointAvailability(e.chainID, fullNode, false)
+					m.RecordEndpointError(e.chainID, fullNode, utils.CategorizeError(err))
 					continue
 				}
 
+				m.RecordEndpointAvailability(e.chainID, fullNode, true)
 				m.RecordCurrentBlockHeight(e.chainID, fullNode, currentHeight)
 				m.RecordBlockHeightDrift(e.chainID, fullNode, refHeight, currentHeight)
 
diff --git a/pkg/exporters/jsonrpc/json_rpc.go b/pkg/exporters/jsonrpc/json_rpc.go
index e15c1f5..64abce4 100644
--- a/pkg/exporters/jsonrpc/json_rpc.go
+++ b/pkg/exporters/jsonrpc/json_rpc.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"github.com/01builders/ev-metrics/internal/clients/evm"
 	"github.com/01builders/ev-metrics/pkg/metrics"
+	"github.com/01builders/ev-metrics/pkg/utils"
 	"time"
 
 	"github.com/rs/zerolog"
@@ -67,9 +68,12 @@ func performHealthCheck(
 ) error {
 	duration, err := evmClient.HealthCheckRequest(ctx)
 	if err != nil {
+		m.RecordEndpointAvailability(chainID, "jsonrpc", false)
+		m.RecordEndpointError(chainID, "jsonrpc", utils.CategorizeError(err))
 		return err
 	}
 
+	m.RecordEndpointAvailability(chainID, "jsonrpc", true)
 	m.RecordJsonRpcRequestDuration(chainID, duration)
 
 	logger.Info().
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 0e0b8a2..1967880 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -34,6 +34,10 @@ type Metrics struct {
 	JsonRpcRequestDuration *prometheus.HistogramVec
 	// JsonRpcRequestSloSeconds exports constant SLO thresholds for JSON-RPC requests.
 	JsonRpcRequestSloSeconds *prometheus.GaugeVec
+	// EndpointAvailability tracks whether an endpoint is reachable (1.0 = available, 0.0 = unavailable).
+	EndpointAvailability *prometheus.GaugeVec
+	// EndpointErrors tracks endpoint connection errors by type.
+	EndpointErrors *prometheus.CounterVec
 
 	// internal tracking to ensure we only record increasing DA heights
 	latestHeaderDaHeight uint64
@@ -164,6 +168,22 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
 			},
 			[]string{"chain_id", "percentile"},
 		),
+		EndpointAvailability: factory.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "endpoint_availability",
+				Help:      "endpoint availability status (1.0 = available, 0.0 = unavailable)",
+			},
+			[]string{"chain_id", "endpoint"},
+		),
+		EndpointErrors: factory.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: namespace,
+				Name:      "endpoint_errors_total",
+				Help:      "total number of endpoint connection errors by type",
+			},
+			[]string{"chain_id", "endpoint", "error_type"},
+		),
 		ranges:               make(map[string][]*blockRange),
 		lastBlockArrivalTime: make(map[string]time.Time),
 	}
@@ -431,3 +451,18 @@ func (m *Metrics) InitializeJsonRpcSloThresholds(chainID string) {
 	m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p95").Set(0.4)
 	m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p99").Set(0.5)
 }
+
+// RecordEndpointAvailability sets the endpoint availability gauge for the given
+// chain and endpoint to 1.0 when available is true and to 0.0 otherwise.
+func (m *Metrics) RecordEndpointAvailability(chainID, endpoint string, available bool) {
+	value := 0.0
+	if available {
+		value = 1.0
+	}
+	m.EndpointAvailability.WithLabelValues(chainID, endpoint).Set(value)
+}
+
+// RecordEndpointError increments the endpoint error counter for the given chain, endpoint and error type.
+func (m *Metrics) RecordEndpointError(chainID, endpoint, errorType string) {
+	m.EndpointErrors.WithLabelValues(chainID, endpoint, errorType).Inc()
+}
diff --git a/pkg/utils/errors.go b/pkg/utils/errors.go
new file mode 100644
index 0000000..52e4598
--- /dev/null
+++ b/pkg/utils/errors.go
@@ -0,0 +1,78 @@
+package utils
+
+import (
+	"context"
+	"errors"
+	"net"
+	"strings"
+)
+
+// CategorizeError maps err to a short, fixed category name suitable for use as
+// a metrics label value. It returns "none" for a nil error and "unknown" when
+// no category applies.
+//
+// The error message is matched against known substrings first; the first match
+// wins. Only when no substring matches is the error chain inspected by type, so
+// errors whose message is already recognized keep their category.
+func CategorizeError(err error) string {
+	if err == nil {
+		return "none"
+	}
+
+	errStr := err.Error()
+
+	// Check for common error patterns.
+	switch {
+	case strings.Contains(errStr, "connection refused"):
+		return "connection_refused"
+	case strings.Contains(errStr, "timeout"):
+		return "timeout"
+	case strings.Contains(errStr, "no such host"):
+		return "dns_error"
+	case strings.Contains(errStr, "context canceled"):
+		return "context_canceled"
+	case strings.Contains(errStr, "context deadline exceeded"):
+		return "context_deadline"
+	case strings.Contains(errStr, "failed to connect"):
+		return "connection_failed"
+	case strings.Contains(errStr, "failed to get block number"):
+		return "rpc_error"
+	case strings.Contains(errStr, "unexpected status code"):
+		return "http_error"
+	case strings.Contains(errStr, "failed to send request"):
+		return "request_failed"
+	case strings.Contains(errStr, "failed to read response"):
+		return "response_read_failed"
+	case strings.Contains(errStr, "failed to marshal"):
+		return "marshal_error"
+	case strings.Contains(errStr, "failed to create request"):
+		return "request_creation_failed"
+	default:
+		// The message text is not recognized (for example net/http's
+		// "Client.Timeout exceeded", whose capital T escapes the "timeout"
+		// match above), so fall back to inspecting the wrapped error chain.
+		return categorizeByType(err)
+	}
+}
+
+// categorizeByType classifies err by inspecting its wrapped error chain instead
+// of its message text. It returns "unknown" when nothing matches.
+func categorizeByType(err error) string {
+	var (
+		netErr net.Error
+		dnsErr *net.DNSError
+	)
+
+	switch {
+	case errors.Is(err, context.Canceled):
+		return "context_canceled"
+	case errors.Is(err, context.DeadlineExceeded):
+		return "context_deadline"
+	case errors.As(err, &netErr) && netErr.Timeout():
+		return "timeout"
+	case errors.As(err, &dnsErr):
+		return "dns_error"
+	default:
+		return "unknown"
+	}
+}