Merge #131017
131017: crosscluster: remove some unused or less useful metrics r=dt a=dt

See commits.

Co-authored-by: David Taylor <[email protected]>
craig[bot] and dt committed Sep 25, 2024
2 parents 611d44d + 7ed7389 commit e49c98b
Showing 10 changed files with 2 additions and 223 deletions.
9 changes: 0 additions & 9 deletions docs/generated/metrics/metrics.html
@@ -1453,12 +1453,7 @@
<tr><td>APPLICATION</td><td>logical_replication.events_initial_success</td><td>Successful applications of an incoming row update</td><td>Failures</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.events_retry_failure</td><td>Failed re-attempts to apply a row update</td><td>Failures</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.events_retry_success</td><td>Row update events applied after one or more retries</td><td>Failures</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.flush_bytes</td><td>Number of bytes in a given flush</td><td>Logical bytes</td><td>HISTOGRAM</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.flush_hist_nanos</td><td>Time spent flushing messages across all replication streams</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.flush_row_count</td><td>Number of rows in a given flush</td><td>Rows</td><td>HISTOGRAM</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.kv_write_fallback_count</td><td>Total number of times the kv write path could not handle a row update and fell back to SQL instead</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.logical_bytes</td><td>Logical bytes (sum of keys + values) received by all replication jobs</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.optimistic_insert_conflict_count</td><td>Total number of times the optimistic insert encountered a conflict</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.replan_count</td><td>Total number of dist sql replanning events</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.replicated_time_seconds</td><td>The replicated time of the logical replication stream in seconds since the unix epoch.</td><td>Seconds</td><td>GAUGE</td><td>SECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.retry_queue_bytes</td><td>The size in bytes of the retry queue of the logical replication stream.</td><td>Bytes</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
@@ -1468,17 +1463,13 @@
<tr><td>APPLICATION</td><td>physical_replication.commit_latency</td><td>Event commit latency: a difference between event MVCC timestamp and the time it was flushed into disk. If we batch events, then the difference between the oldest event in the batch and flush is recorded</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.cutover_progress</td><td>The number of ranges left to revert in order to complete an inflight cutover</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.distsql_replan_count</td><td>Total number of dist sql replanning events</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.earliest_data_checkpoint_span</td><td>The earliest timestamp of the last checkpoint forwarded by an ingestion data processor</td><td>Timestamp</td><td>GAUGE</td><td>TIMESTAMP_NS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.events_ingested</td><td>Events ingested by all replication jobs</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.flush_hist_nanos</td><td>Time spent flushing messages across all replication streams</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.flushes</td><td>Total flushes across all replication jobs</td><td>Flushes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.job_progress_updates</td><td>Total number of updates to the ingestion job progress</td><td>Job Updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.latest_data_checkpoint_span</td><td>The latest timestamp of the last checkpoint forwarded by an ingestion data processor</td><td>Timestamp</td><td>GAUGE</td><td>TIMESTAMP_NS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.logical_bytes</td><td>Logical bytes (sum of keys + values) ingested by all replication jobs</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.replicated_time_seconds</td><td>The replicated time of the physical replication stream in seconds since the unix epoch.</td><td>Seconds</td><td>GAUGE</td><td>SECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.resolved_events_ingested</td><td>Resolved events ingested by all replication jobs</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.running</td><td>Number of currently running replication streams</td><td>Replication Streams</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.sst_bytes</td><td>SST bytes (compressed) sent to KV by all replication jobs</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>requests.slow.distsender</td><td>Number of range-bound RPCs currently stuck or retrying for a long time.<br/><br/>Note that this is not a good signal for KV health. The remote side of the<br/>RPCs tracked here may experience contention, so an end user can easily<br/>cause values for this metric to be emitted by leaving a transaction open<br/>for a long time and contending with it using a second transaction.</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>round-trip-latency</td><td>Distribution of round-trip latencies with other nodes.<br/><br/>This only reflects successful heartbeats and measures gRPC overhead as well as<br/>possible head-of-line blocking. Elevated values in this metric may hint at<br/>network issues and/or saturation, but they are no proof of them. CPU overload<br/>can similarly elevate this metric. The operator should look towards OS-level<br/>metrics such as packet loss, retransmits, etc, to conclusively diagnose network<br/>issues. Heartbeats are not very frequent (~seconds), so they may not capture<br/>rare or short-lived degradations.<br/></td><td>Round-trip time</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>rpc.client.bytes.egress</td><td>Counter of TCP bytes sent via gRPC on connections we initiated.</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
@@ -670,8 +670,6 @@ func (lrw *logicalReplicationWriterProcessor) flushBuffer(
return err
}
perChunkStats[worker] = s
lrw.metrics.OptimisticInsertConflictCount.Inc(s.optimisticInsertConflicts)
lrw.metrics.KVWriteFallbackCount.Inc(s.kvWriteFallbacks)
return nil
})
}
@@ -707,9 +705,6 @@ func (lrw *logicalReplicationWriterProcessor) flushBuffer(
} else {
lrw.metrics.InitialApplySuccesses.Inc(stats.processed.success)
lrw.metrics.InitialApplyFailures.Inc(stats.notProcessed.count + stats.processed.dlq)
lrw.metrics.StreamBatchNanosHist.RecordValue(flushTime)
lrw.metrics.StreamBatchRowsHist.RecordValue(int64(len(kvs)))
lrw.metrics.StreamBatchBytesHist.RecordValue(stats.processed.bytes + stats.notProcessed.bytes)
lrw.metrics.ReceivedLogicalBytes.Inc(stats.processed.bytes + stats.notProcessed.bytes)
}
return notProcessed, stats.notProcessed.bytes, nil
60 changes: 2 additions & 58 deletions pkg/ccl/crosscluster/logical/metrics.go
@@ -119,36 +119,6 @@ var (
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
metaStreamBatchRowsHist = metric.Metadata{
Name: "logical_replication.flush_row_count",
Help: "Number of rows in a given flush",
Measurement: "Rows",
Unit: metric.Unit_COUNT,
}
metaStreamBatchBytesHist = metric.Metadata{
Name: "logical_replication.flush_bytes",
Help: "Number of bytes in a given flush",
Measurement: "Logical bytes",
Unit: metric.Unit_BYTES,
}
metaStreamBatchNanosHist = metric.Metadata{
Name: "logical_replication.flush_hist_nanos",
Help: "Time spent flushing messages across all replication streams",
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
metaOptimisticInsertConflictCount = metric.Metadata{
Name: "logical_replication.optimistic_insert_conflict_count",
Help: "Total number of times the optimistic insert encountered a conflict",
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
metaKVWriteFallbackCount = metric.Metadata{
Name: "logical_replication.kv_write_fallback_count",
Help: "Total number of times the kv write path could not handle a row update and fell back to SQL instead",
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
metaDistSQLReplanCount = metric.Metadata{
Name: "logical_replication.replan_count",
Help: "Total number of dist sql replanning events",
@@ -186,13 +156,7 @@ type Metrics struct {
// Internal numbers that are useful for determining why a stream is behaving
// a specific way.
CheckpointEvents *metric.Counter
// TODO(dt): are these stream batch size or latency numbers useful?
StreamBatchRowsHist metric.IHistogram
StreamBatchBytesHist metric.IHistogram
StreamBatchNanosHist metric.IHistogram
OptimisticInsertConflictCount *metric.Counter
KVWriteFallbackCount *metric.Counter
ReplanCount *metric.Counter
}

// MetricStruct implements the metric.Struct interface.
@@ -228,26 +192,6 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
RetriedApplySuccesses: metric.NewCounter(metaRetriedApplySuccesses),
RetriedApplyFailures: metric.NewCounter(metaRetriedApplyFailures),
CheckpointEvents: metric.NewCounter(metaCheckpointEvents),
StreamBatchRowsHist: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metaStreamBatchRowsHist,
Duration: histogramWindow,
BucketConfig: metric.DataCount16MBuckets,
}),
StreamBatchBytesHist: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metaStreamBatchBytesHist,
Duration: histogramWindow,
BucketConfig: metric.MemoryUsage64MBBuckets,
}),
StreamBatchNanosHist: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metaStreamBatchNanosHist,
Duration: histogramWindow,
BucketConfig: metric.IOLatencyBuckets,
}),
OptimisticInsertConflictCount: metric.NewCounter(metaOptimisticInsertConflictCount),
KVWriteFallbackCount: metric.NewCounter(metaKVWriteFallbackCount),
ReplanCount: metric.NewCounter(metaDistSQLReplanCount),
}
}
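For readers unfamiliar with this codebase, the retained metrics in pkg/ccl/crosscluster/logical/metrics.go follow a three-part pattern: a package-level metric.Metadata value, a typed field on the Metrics struct, and construction inside MakeMetrics. Below is a minimal sketch of that pattern under assumed names; the example_events metric, the ExampleMetrics struct, and the package name are hypothetical and not part of this change.

package logicalmetricsketch

import (
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// metaExampleEvents is a hypothetical counter declared in the same shape as
// the metadata values kept by this change.
var metaExampleEvents = metric.Metadata{
	Name:        "logical_replication.example_events",
	Help:        "Example events counted by this sketch",
	Measurement: "Events",
	Unit:        metric.Unit_COUNT,
}

// ExampleMetrics mirrors the retained Metrics struct in miniature.
type ExampleMetrics struct {
	ExampleEvents *metric.Counter
}

// MetricStruct implements the metric.Struct interface.
func (*ExampleMetrics) MetricStruct() {}

// MakeExampleMetrics mirrors MakeMetrics: each retained field is built from
// its metadata. The histogram window is only needed when histograms are
// constructed, so it is unused in this counter-only sketch.
func MakeExampleMetrics(histogramWindow time.Duration) metric.Struct {
	_ = histogramWindow
	return &ExampleMetrics{
		ExampleEvents: metric.NewCounter(metaExampleEvents),
	}
}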
32 changes: 0 additions & 32 deletions pkg/ccl/crosscluster/physical/metrics.go
@@ -40,12 +40,6 @@ var (
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaReplicationSSTBytes = metric.Metadata{
Name: "physical_replication.sst_bytes",
Help: "SST bytes (compressed) sent to KV by all replication jobs",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaReplicationFlushes = metric.Metadata{
Name: "physical_replication.flushes",
Help: "Total flushes across all replication jobs",
@@ -80,31 +74,13 @@ var (
Measurement: "Replication Streams",
Unit: metric.Unit_COUNT,
}
metaEarliestDataCheckpointSpan = metric.Metadata{
Name: "physical_replication.earliest_data_checkpoint_span",
Help: "The earliest timestamp of the last checkpoint forwarded by an ingestion data processor",
Measurement: "Timestamp",
Unit: metric.Unit_TIMESTAMP_NS,
}
metaLatestDataCheckpointSpan = metric.Metadata{
Name: "physical_replication.latest_data_checkpoint_span",
Help: "The latest timestamp of the last checkpoint forwarded by an ingestion data processor",
Measurement: "Timestamp",
Unit: metric.Unit_TIMESTAMP_NS,
}

metaReplicatedTimeSeconds = metric.Metadata{
Name: "physical_replication.replicated_time_seconds",
Help: "The replicated time of the physical replication stream in seconds since the unix epoch.",
Measurement: "Seconds",
Unit: metric.Unit_SECONDS,
}
metaJobProgressUpdates = metric.Metadata{
Name: "physical_replication.job_progress_updates",
Help: "Total number of updates to the ingestion job progress",
Measurement: "Job Updates",
Unit: metric.Unit_COUNT,
}
// This metric would be 0 until cutover begins, and then it will be updated to
// the total number of ranges that need to be reverted, and then gradually go
// down to 0 again. NB: that the number of ranges is the total number of
type Metrics struct {
IngestedEvents *metric.Counter
IngestedLogicalBytes *metric.Counter
IngestedSSTBytes *metric.Counter
Flushes *metric.Counter
JobProgressUpdates *metric.Counter
ResolvedEvents *metric.Counter
ReplanCount *metric.Counter
FlushHistNanos metric.IHistogram
CommitLatency metric.IHistogram
AdmitLatency metric.IHistogram
RunningCount *metric.Gauge
EarliestDataCheckpointSpan *metric.Gauge
LatestDataCheckpointSpan *metric.Gauge
ReplicatedTimeSeconds *metric.Gauge
ReplicationCutoverProgress *metric.Gauge
}
m := &Metrics{
IngestedEvents: metric.NewCounter(metaReplicationEventsIngested),
IngestedLogicalBytes: metric.NewCounter(metaReplicationIngestedBytes),
IngestedSSTBytes: metric.NewCounter(metaReplicationSSTBytes),
Flushes: metric.NewCounter(metaReplicationFlushes),
ResolvedEvents: metric.NewCounter(metaReplicationResolvedEventsIngested),
JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates),
ReplanCount: metric.NewCounter(metaDistSQLReplanCount),
FlushHistNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaReplicationFlushHistNanos,
Expand All @@ -178,8 +148,6 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
SigFigs: 1,
}),
RunningCount: metric.NewGauge(metaStreamsRunning),
EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan),
LatestDataCheckpointSpan: metric.NewGauge(metaLatestDataCheckpointSpan),
ReplicatedTimeSeconds: metric.NewGauge(metaReplicatedTimeSeconds),
ReplicationCutoverProgress: metric.NewGauge(metaReplicationCutoverProgress),
}
@@ -392,7 +392,6 @@ func (sf *streamIngestionFrontier) maybeUpdateProgress() error {
return err
}

sf.metrics.JobProgressUpdates.Inc(1)
sf.persistedReplicatedTime = f.Frontier()
sf.metrics.ReplicatedTimeSeconds.Update(sf.persistedReplicatedTime.GoTime().Unix())
return nil
4 changes: 0 additions & 4 deletions pkg/ccl/crosscluster/physical/stream_ingestion_processor.go
@@ -665,7 +665,6 @@ func (sip *streamIngestionProcessor) flushLoop(_ context.Context) error {

func (sip *streamIngestionProcessor) onFlushUpdateMetricUpdate(batchSummary kvpb.BulkOpSummary) {
sip.metrics.IngestedLogicalBytes.Inc(batchSummary.DataSize)
sip.metrics.IngestedSSTBytes.Inc(batchSummary.SSTDataSize)
}

// consumeEvents handles processing events on the merged event queue and returns
@@ -952,9 +951,6 @@ func (sip *streamIngestionProcessor) bufferCheckpoint(event PartitionEvent) error {
return errors.Wrap(err, "unable to forward checkpoint frontier")
}
}

sip.metrics.EarliestDataCheckpointSpan.Update(lowestTimestamp.GoTime().UnixNano())
sip.metrics.LatestDataCheckpointSpan.Update(highestTimestamp.GoTime().UnixNano())
sip.metrics.ResolvedEvents.Inc(1)
return nil
}
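The processor-side half of the pattern is how flush and checkpoint code updates the metrics that survive this cleanup: counters via Inc, gauges via Update, and histograms via RecordValue, the same calls visible in the retained lines above. A minimal sketch with assumed field, function, and package names:

package ingestionsketch

import (
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// exampleMetrics holds placeholders for the kinds of metrics the ingestion
// processors still update after this change.
type exampleMetrics struct {
	ResolvedEvents        *metric.Counter
	ReplicatedTimeSeconds *metric.Gauge
	FlushHistNanos        metric.IHistogram
}

// recordCheckpointAndFlush shows the update calls: one resolved event per
// checkpoint, the replicated time as unix seconds, and the flush latency as
// a histogram sample in nanoseconds.
func recordCheckpointAndFlush(m *exampleMetrics, replicatedTime time.Time, flushDuration time.Duration) {
	m.ResolvedEvents.Inc(1)
	m.ReplicatedTimeSeconds.Update(replicatedTime.Unix())
	m.FlushHistNanos.RecordValue(flushDuration.Nanoseconds())
}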
93 changes: 0 additions & 93 deletions pkg/cmd/roachprod/grafana/configs/crosscluster_replication.json
@@ -212,99 +212,6 @@
"align": false
}
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 1,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "physical_replication_sst_bytes{job=\"cockroachdb\",cluster=\"$cluster\",instance=~\"$node\"}",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Replication SST Bytes",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
3 changes: 0 additions & 3 deletions pkg/cmd/roachtest/tests/cluster_to_cluster.go
@@ -98,9 +98,6 @@ var c2cPromMetrics = map[string]clusterstats.ClusterStat{
"LogicalMegabytes": {
LabelName: "node",
Query: "physical_replication_logical_bytes / 1e6"},
"PhysicalMegabytes": {
LabelName: "node",
Query: "physical_replication_sst_bytes / 1e6"},
"PhysicalReplicatedMegabytes": {
LabelName: "node",
Query: "capacity_used / 1e6"},
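After dropping PhysicalMegabytes, a roachtest stat over one of the metrics this change keeps would use the same clusterstats.ClusterStat map-entry shape. The sketch below is an assumption for illustration: the ReplicatedTimeSeconds key, its Prometheus-style metric name, the package name, and the import path are not taken from this diff.

package roachteststatsketch

import "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/clusterstats"

// exampleC2CStats mirrors the shape of c2cPromMetrics: the surviving
// logical-bytes stat plus a hypothetical stat over the replicated-time gauge.
var exampleC2CStats = map[string]clusterstats.ClusterStat{
	"LogicalMegabytes": {
		LabelName: "node",
		Query:     "physical_replication_logical_bytes / 1e6"},
	"ReplicatedTimeSeconds": {
		LabelName: "node",
		Query:     "physical_replication_replicated_time_seconds"},
}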