Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pkg/exporters/drift/drift.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"github.com/01builders/ev-metrics/pkg/metrics"
"github.com/01builders/ev-metrics/pkg/utils"
"github.com/ethereum/go-ethereum/ethclient"
"github.com/rs/zerolog"
"time"
Expand Down Expand Up @@ -53,9 +54,12 @@ func (e exporter) ExportMetrics(ctx context.Context, m *metrics.Metrics) error {
refHeight, err := getBlockHeight(ctx, e.referenceNode)
if err != nil {
e.logger.Error().Err(err).Str("endpoint", e.referenceNode).Msg("failed to get reference node block height")
m.RecordEndpointAvailability(e.chainID, e.referenceNode, false)
m.RecordEndpointError(e.chainID, e.referenceNode, utils.CategorizeError(err))
continue
}

m.RecordEndpointAvailability(e.chainID, e.referenceNode, true)
m.RecordReferenceBlockHeight(e.chainID, e.referenceNode, refHeight)
e.logger.Info().Uint64("height", refHeight).Str("endpoint", e.referenceNode).Msg("recorded reference node height")

Expand All @@ -64,9 +68,12 @@ func (e exporter) ExportMetrics(ctx context.Context, m *metrics.Metrics) error {
currentHeight, err := getBlockHeight(ctx, fullNode)
if err != nil {
e.logger.Error().Err(err).Str("endpoint", fullNode).Msg("failed to get full node block height")
m.RecordEndpointAvailability(e.chainID, fullNode, false)
m.RecordEndpointError(e.chainID, fullNode, utils.CategorizeError(err))
continue
}

m.RecordEndpointAvailability(e.chainID, fullNode, true)
m.RecordCurrentBlockHeight(e.chainID, fullNode, currentHeight)
m.RecordBlockHeightDrift(e.chainID, fullNode, refHeight, currentHeight)

Expand Down
4 changes: 4 additions & 0 deletions pkg/exporters/jsonrpc/json_rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"github.com/01builders/ev-metrics/internal/clients/evm"
"github.com/01builders/ev-metrics/pkg/metrics"
"github.com/01builders/ev-metrics/pkg/utils"
"time"

"github.com/rs/zerolog"
Expand Down Expand Up @@ -67,9 +68,12 @@ func performHealthCheck(
) error {
duration, err := evmClient.HealthCheckRequest(ctx)
if err != nil {
m.RecordEndpointAvailability(chainID, "jsonrpc", false)
m.RecordEndpointError(chainID, "jsonrpc", utils.CategorizeError(err))
return err
}

m.RecordEndpointAvailability(chainID, "jsonrpc", true)
m.RecordJsonRpcRequestDuration(chainID, duration)

logger.Info().
Expand Down
35 changes: 35 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ type Metrics struct {
JsonRpcRequestDuration *prometheus.HistogramVec
// JsonRpcRequestSloSeconds exports constant SLO thresholds for JSON-RPC requests.
JsonRpcRequestSloSeconds *prometheus.GaugeVec
// EndpointAvailability tracks whether an endpoint is reachable (1.0 = available, 0.0 = unavailable).
EndpointAvailability *prometheus.GaugeVec
// EndpointErrors tracks endpoint connection errors by type.
EndpointErrors *prometheus.CounterVec

// internal tracking to ensure we only record increasing DA heights
latestHeaderDaHeight uint64
Expand Down Expand Up @@ -164,6 +168,22 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
},
[]string{"chain_id", "percentile"},
),
EndpointAvailability: factory.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "endpoint_availability",
Help: "endpoint availability status (1.0 = available, 0.0 = unavailable)",
},
[]string{"chain_id", "endpoint"},
),
EndpointErrors: factory.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Name: "endpoint_errors_total",
Help: "total number of endpoint connection errors by type",
},
[]string{"chain_id", "endpoint", "error_type"},
),
ranges: make(map[string][]*blockRange),
lastBlockArrivalTime: make(map[string]time.Time),
}
Expand Down Expand Up @@ -431,3 +451,18 @@ func (m *Metrics) InitializeJsonRpcSloThresholds(chainID string) {
m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p95").Set(0.4)
m.JsonRpcRequestSloSeconds.WithLabelValues(chainID, "p99").Set(0.5)
}

// RecordEndpointAvailability publishes the reachability of an endpoint on the
// EndpointAvailability gauge, labelled by chainID and endpoint.
// The gauge is set to 1.0 when available is true and to 0.0 when it is false.
func (m *Metrics) RecordEndpointAvailability(chainID, endpoint string, available bool) {
	gauge := m.EndpointAvailability.WithLabelValues(chainID, endpoint)
	if available {
		gauge.Set(1.0)
		return
	}
	gauge.Set(0.0)
}

// RecordEndpointError increments the EndpointErrors counter by one for the
// series identified by chainID, endpoint and errorType.
func (m *Metrics) RecordEndpointError(chainID, endpoint, errorType string) {
	counter := m.EndpointErrors.WithLabelValues(chainID, endpoint, errorType)
	counter.Inc()
}
Comment on lines +455 to +468

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The new functions RecordEndpointAvailability and RecordEndpointError are defined but are not called anywhere in the pull request's changes. This means the new metrics (endpoint_availability and endpoint_errors_total) will be registered but will never be populated with data. This appears to be an incomplete implementation. Was the code that uses these functions intended to be part of this PR?

43 changes: 43 additions & 0 deletions pkg/utils/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package utils

import (
	"context"
	"errors"
	"strings"
)

// CategorizeError maps err to a short, fixed label suitable for use as a
// metrics label value.
//
// A nil error yields "none". Otherwise the error text is matched against a
// list of known substrings, checked in order, and the first match decides the
// label. When no substring matches, the wrapped error chain is inspected for
// context.Canceled and context.DeadlineExceeded so that errors hiding those
// sentinels behind a custom message are still classified. Anything else
// yields "unknown".
func CategorizeError(err error) string {
	if err == nil {
		return "none"
	}

	errStr := err.Error()

	// Text patterns are checked first and in their original order, so every
	// error that was recognised before keeps exactly the same label.
	switch {
	case strings.Contains(errStr, "connection refused"):
		return "connection_refused"
	case strings.Contains(errStr, "timeout"):
		return "timeout"
	case strings.Contains(errStr, "no such host"):
		return "dns_error"
	case strings.Contains(errStr, "context canceled"):
		return "context_canceled"
	case strings.Contains(errStr, "context deadline exceeded"):
		return "context_deadline"
	case strings.Contains(errStr, "failed to connect"):
		return "connection_failed"
	case strings.Contains(errStr, "failed to get block number"):
		return "rpc_error"
	case strings.Contains(errStr, "unexpected status code"):
		return "http_error"
	case strings.Contains(errStr, "failed to send request"):
		return "request_failed"
	case strings.Contains(errStr, "failed to read response"):
		return "response_read_failed"
	case strings.Contains(errStr, "failed to marshal"):
		return "marshal_error"
	case strings.Contains(errStr, "failed to create request"):
		return "request_creation_failed"
	}

	// Fallback: an error type may wrap a context sentinel without repeating
	// its text in Error(); errors.Is walks the Unwrap chain to find it.
	switch {
	case errors.Is(err, context.Canceled):
		return "context_canceled"
	case errors.Is(err, context.DeadlineExceeded):
		return "context_deadline"
	}

	return "unknown"
}