Merge #131017
131017: crosscluster: remove some unused or less useful metrics r=dt a=dt

See commits.

Co-authored-by: David Taylor <[email protected]>
craig[bot] and dt committed Sep 25, 2024
2 parents 611d44d + 7ed7389 commit e49c98b
Showing 10 changed files with 2 additions and 223 deletions.
9 changes: 0 additions & 9 deletions docs/generated/metrics/metrics.html
@@ -1453,12 +1453,7 @@
<tr><td>APPLICATION</td><td>logical_replication.events_initial_success</td><td>Successful applications of an incoming row update</td><td>Failures</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.events_retry_failure</td><td>Failed re-attempts to apply a row update</td><td>Failures</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.events_retry_success</td><td>Row update events applied after one or more retries</td><td>Failures</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.flush_bytes</td><td>Number of bytes in a given flush</td><td>Logical bytes</td><td>HISTOGRAM</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.flush_hist_nanos</td><td>Time spent flushing messages across all replication streams</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.flush_row_count</td><td>Number of rows in a given flush</td><td>Rows</td><td>HISTOGRAM</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.kv_write_fallback_count</td><td>Total number of times the kv write path could not handle a row update and fell back to SQL instead</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.logical_bytes</td><td>Logical bytes (sum of keys + values) received by all replication jobs</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.optimistic_insert_conflict_count</td><td>Total number of times the optimistic insert encountered a conflict</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.replan_count</td><td>Total number of dist sql replanning events</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.replicated_time_seconds</td><td>The replicated time of the logical replication stream in seconds since the unix epoch.</td><td>Seconds</td><td>GAUGE</td><td>SECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>logical_replication.retry_queue_bytes</td><td>The size in bytes of the retry queue of the logical replication stream.</td><td>Bytes</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
@@ -1468,17 +1463,13 @@
<tr><td>APPLICATION</td><td>physical_replication.commit_latency</td><td>Event commit latency: a difference between event MVCC timestamp and the time it was flushed into disk. If we batch events, then the difference between the oldest event in the batch and flush is recorded</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.cutover_progress</td><td>The number of ranges left to revert in order to complete an inflight cutover</td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.distsql_replan_count</td><td>Total number of dist sql replanning events</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.earliest_data_checkpoint_span</td><td>The earliest timestamp of the last checkpoint forwarded by an ingestion data processor</td><td>Timestamp</td><td>GAUGE</td><td>TIMESTAMP_NS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.events_ingested</td><td>Events ingested by all replication jobs</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.flush_hist_nanos</td><td>Time spent flushing messages across all replication streams</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.flushes</td><td>Total flushes across all replication jobs</td><td>Flushes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.job_progress_updates</td><td>Total number of updates to the ingestion job progress</td><td>Job Updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.latest_data_checkpoint_span</td><td>The latest timestamp of the last checkpoint forwarded by an ingestion data processor</td><td>Timestamp</td><td>GAUGE</td><td>TIMESTAMP_NS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.logical_bytes</td><td>Logical bytes (sum of keys + values) ingested by all replication jobs</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.replicated_time_seconds</td><td>The replicated time of the physical replication stream in seconds since the unix epoch.</td><td>Seconds</td><td>GAUGE</td><td>SECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.resolved_events_ingested</td><td>Resolved events ingested by all replication jobs</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.running</td><td>Number of currently running replication streams</td><td>Replication Streams</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.sst_bytes</td><td>SST bytes (compressed) sent to KV by all replication jobs</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>requests.slow.distsender</td><td>Number of range-bound RPCs currently stuck or retrying for a long time.<br/><br/>Note that this is not a good signal for KV health. The remote side of the<br/>RPCs tracked here may experience contention, so an end user can easily<br/>cause values for this metric to be emitted by leaving a transaction open<br/>for a long time and contending with it using a second transaction.</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>round-trip-latency</td><td>Distribution of round-trip latencies with other nodes.<br/><br/>This only reflects successful heartbeats and measures gRPC overhead as well as<br/>possible head-of-line blocking. Elevated values in this metric may hint at<br/>network issues and/or saturation, but they are no proof of them. CPU overload<br/>can similarly elevate this metric. The operator should look towards OS-level<br/>metrics such as packet loss, retransmits, etc, to conclusively diagnose network<br/>issues. Heartbeats are not very frequent (~seconds), so they may not capture<br/>rare or short-lived degradations.<br/></td><td>Round-trip time</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>rpc.client.bytes.egress</td><td>Counter of TCP bytes sent via gRPC on connections we initiated.</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
@@ -670,8 +670,6 @@ func (lrw *logicalReplicationWriterProcessor) flushBuffer(
return err
}
perChunkStats[worker] = s
lrw.metrics.OptimisticInsertConflictCount.Inc(s.optimisticInsertConflicts)
lrw.metrics.KVWriteFallbackCount.Inc(s.kvWriteFallbacks)
return nil
})
}
@@ -707,9 +705,6 @@ func (lrw *logicalReplicationWriterProcessor) flushBuffer(
} else {
lrw.metrics.InitialApplySuccesses.Inc(stats.processed.success)
lrw.metrics.InitialApplyFailures.Inc(stats.notProcessed.count + stats.processed.dlq)
lrw.metrics.StreamBatchNanosHist.RecordValue(flushTime)
lrw.metrics.StreamBatchRowsHist.RecordValue(int64(len(kvs)))
lrw.metrics.StreamBatchBytesHist.RecordValue(stats.processed.bytes + stats.notProcessed.bytes)
lrw.metrics.ReceivedLogicalBytes.Inc(stats.processed.bytes + stats.notProcessed.bytes)
}
return notProcessed, stats.notProcessed.bytes, nil
60 changes: 2 additions & 58 deletions pkg/ccl/crosscluster/logical/metrics.go
@@ -119,36 +119,6 @@ var (
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
metaStreamBatchRowsHist = metric.Metadata{
Name: "logical_replication.flush_row_count",
Help: "Number of rows in a given flush",
Measurement: "Rows",
Unit: metric.Unit_COUNT,
}
metaStreamBatchBytesHist = metric.Metadata{
Name: "logical_replication.flush_bytes",
Help: "Number of bytes in a given flush",
Measurement: "Logical bytes",
Unit: metric.Unit_BYTES,
}
metaStreamBatchNanosHist = metric.Metadata{
Name: "logical_replication.flush_hist_nanos",
Help: "Time spent flushing messages across all replication streams",
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
metaOptimisticInsertConflictCount = metric.Metadata{
Name: "logical_replication.optimistic_insert_conflict_count",
Help: "Total number of times the optimistic insert encountered a conflict",
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
metaKVWriteFallbackCount = metric.Metadata{
Name: "logical_replication.kv_write_fallback_count",
Help: "Total number of times the kv write path could not handle a row update and fell back to SQL instead",
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
metaDistSQLReplanCount = metric.Metadata{
Name: "logical_replication.replan_count",
Help: "Total number of dist sql replanning events",
@@ -186,13 +156,7 @@ type Metrics struct {
// Internal numbers that are useful for determining why a stream is behaving
// a specific way.
CheckpointEvents *metric.Counter
// TODO(dt): are these stream batch size or latency numbers useful?
StreamBatchRowsHist metric.IHistogram
StreamBatchBytesHist metric.IHistogram
StreamBatchNanosHist metric.IHistogram
OptimisticInsertConflictCount *metric.Counter
KVWriteFallbackCount *metric.Counter
ReplanCount *metric.Counter
}

// MetricStruct implements the metric.Struct interface.
@@ -228,26 +192,6 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
RetriedApplySuccesses: metric.NewCounter(metaRetriedApplySuccesses),
RetriedApplyFailures: metric.NewCounter(metaRetriedApplyFailures),
CheckpointEvents: metric.NewCounter(metaCheckpointEvents),
StreamBatchRowsHist: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metaStreamBatchRowsHist,
Duration: histogramWindow,
BucketConfig: metric.DataCount16MBuckets,
}),
StreamBatchBytesHist: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metaStreamBatchBytesHist,
Duration: histogramWindow,
BucketConfig: metric.MemoryUsage64MBBuckets,
}),
StreamBatchNanosHist: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metaStreamBatchNanosHist,
Duration: histogramWindow,
BucketConfig: metric.IOLatencyBuckets,
}),
OptimisticInsertConflictCount: metric.NewCounter(metaOptimisticInsertConflictCount),
KVWriteFallbackCount: metric.NewCounter(metaKVWriteFallbackCount),
ReplanCount: metric.NewCounter(metaDistSQLReplanCount),
}
}
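For readers unfamiliar with this codebase, the retained metrics in pkg/ccl/crosscluster/logical/metrics.go follow a three-part pattern: a package-level metric.Metadata value, a typed field on the Metrics struct, and construction inside MakeMetrics. Below is a minimal sketch of that pattern under assumed names; the example_events metric, the ExampleMetrics struct, and the package name are hypothetical and not part of this change.

package logicalmetricsketch

import (
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// metaExampleEvents is a hypothetical counter declared in the same shape as
// the metadata values kept by this change.
var metaExampleEvents = metric.Metadata{
	Name:        "logical_replication.example_events",
	Help:        "Example events counted by this sketch",
	Measurement: "Events",
	Unit:        metric.Unit_COUNT,
}

// ExampleMetrics mirrors the retained Metrics struct in miniature.
type ExampleMetrics struct {
	ExampleEvents *metric.Counter
}

// MetricStruct implements the metric.Struct interface.
func (*ExampleMetrics) MetricStruct() {}

// MakeExampleMetrics mirrors MakeMetrics: each retained field is built from
// its metadata. The histogram window is only needed when histograms are
// constructed, so it is unused in this counter-only sketch.
func MakeExampleMetrics(histogramWindow time.Duration) metric.Struct {
	_ = histogramWindow
	return &ExampleMetrics{
		ExampleEvents: metric.NewCounter(metaExampleEvents),
	}
}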
32 changes: 0 additions & 32 deletions pkg/ccl/crosscluster/physical/metrics.go
@@ -40,12 +40,6 @@ var (
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaReplicationSSTBytes = metric.Metadata{
Name: "physical_replication.sst_bytes",
Help: "SST bytes (compressed) sent to KV by all replication jobs",
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
metaReplicationFlushes = metric.Metadata{
Name: "physical_replication.flushes",
Help: "Total flushes across all replication jobs",
@@ -80,31 +74,13 @@ var (
Measurement: "Replication Streams",
Unit: metric.Unit_COUNT,
}
metaEarliestDataCheckpointSpan = metric.Metadata{
Name: "physical_replication.earliest_data_checkpoint_span",
Help: "The earliest timestamp of the last checkpoint forwarded by an ingestion data processor",
Measurement: "Timestamp",
Unit: metric.Unit_TIMESTAMP_NS,
}
metaLatestDataCheckpointSpan = metric.Metadata{
Name: "physical_replication.latest_data_checkpoint_span",
Help: "The latest timestamp of the last checkpoint forwarded by an ingestion data processor",
Measurement: "Timestamp",
Unit: metric.Unit_TIMESTAMP_NS,
}

metaReplicatedTimeSeconds = metric.Metadata{
Name: "physical_replication.replicated_time_seconds",
Help: "The replicated time of the physical replication stream in seconds since the unix epoch.",
Measurement: "Seconds",
Unit: metric.Unit_SECONDS,
}
metaJobProgressUpdates = metric.Metadata{
Name: "physical_replication.job_progress_updates",
Help: "Total number of updates to the ingestion job progress",
Measurement: "Job Updates",
Unit: metric.Unit_COUNT,
}
// This metric would be 0 until cutover begins, and then it will be updated to
// the total number of ranges that need to be reverted, and then gradually go
// down to 0 again. NB: that the number of ranges is the total number of
type Metrics struct {
IngestedEvents *metric.Counter
IngestedLogicalBytes *metric.Counter
IngestedSSTBytes *metric.Counter
Flushes *metric.Counter
JobProgressUpdates *metric.Counter
ResolvedEvents *metric.Counter
ReplanCount *metric.Counter
FlushHistNanos metric.IHistogram
CommitLatency metric.IHistogram
AdmitLatency metric.IHistogram
RunningCount *metric.Gauge
EarliestDataCheckpointSpan *metric.Gauge
LatestDataCheckpointSpan *metric.Gauge
ReplicatedTimeSeconds *metric.Gauge
ReplicationCutoverProgress *metric.Gauge
}
m := &Metrics{
IngestedEvents: metric.NewCounter(metaReplicationEventsIngested),
IngestedLogicalBytes: metric.NewCounter(metaReplicationIngestedBytes),
IngestedSSTBytes: metric.NewCounter(metaReplicationSSTBytes),
Flushes: metric.NewCounter(metaReplicationFlushes),
ResolvedEvents: metric.NewCounter(metaReplicationResolvedEventsIngested),
JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates),
ReplanCount: metric.NewCounter(metaDistSQLReplanCount),
FlushHistNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaReplicationFlushHistNanos,
Expand All @@ -178,8 +148,6 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
SigFigs: 1,
}),
RunningCount: metric.NewGauge(metaStreamsRunning),
EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan),
LatestDataCheckpointSpan: metric.NewGauge(metaLatestDataCheckpointSpan),
ReplicatedTimeSeconds: metric.NewGauge(metaReplicatedTimeSeconds),
ReplicationCutoverProgress: metric.NewGauge(metaReplicationCutoverProgress),
}
@@ -392,7 +392,6 @@ func (sf *streamIngestionFrontier) maybeUpdateProgress() error {
return err
}

sf.metrics.JobProgressUpdates.Inc(1)
sf.persistedReplicatedTime = f.Frontier()
sf.metrics.ReplicatedTimeSeconds.Update(sf.persistedReplicatedTime.GoTime().Unix())
return nil
4 changes: 0 additions & 4 deletions pkg/ccl/crosscluster/physical/stream_ingestion_processor.go
@@ -665,7 +665,6 @@ func (sip *streamIngestionProcessor) flushLoop(_ context.Context) error {

func (sip *streamIngestionProcessor) onFlushUpdateMetricUpdate(batchSummary kvpb.BulkOpSummary) {
sip.metrics.IngestedLogicalBytes.Inc(batchSummary.DataSize)
sip.metrics.IngestedSSTBytes.Inc(batchSummary.SSTDataSize)
}

// consumeEvents handles processing events on the merged event queue and returns
@@ -952,9 +951,6 @@ func (sip *streamIngestionProcessor) bufferCheckpoint(event PartitionEvent) error {
return errors.Wrap(err, "unable to forward checkpoint frontier")
}
}

sip.metrics.EarliestDataCheckpointSpan.Update(lowestTimestamp.GoTime().UnixNano())
sip.metrics.LatestDataCheckpointSpan.Update(highestTimestamp.GoTime().UnixNano())
sip.metrics.ResolvedEvents.Inc(1)
return nil
}
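The processor-side half of the pattern is how flush and checkpoint code updates the metrics that survive this cleanup: counters via Inc, gauges via Update, and histograms via RecordValue, the same calls visible in the retained lines above. A minimal sketch with assumed field, function, and package names:

package ingestionsketch

import (
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// exampleMetrics holds placeholders for the kinds of metrics the ingestion
// processors still update after this change.
type exampleMetrics struct {
	ResolvedEvents        *metric.Counter
	ReplicatedTimeSeconds *metric.Gauge
	FlushHistNanos        metric.IHistogram
}

// recordCheckpointAndFlush shows the update calls: one resolved event per
// checkpoint, the replicated time as unix seconds, and the flush latency as
// a histogram sample in nanoseconds.
func recordCheckpointAndFlush(m *exampleMetrics, replicatedTime time.Time, flushDuration time.Duration) {
	m.ResolvedEvents.Inc(1)
	m.ReplicatedTimeSeconds.Update(replicatedTime.Unix())
	m.FlushHistNanos.RecordValue(flushDuration.Nanoseconds())
}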
93 changes: 0 additions & 93 deletions pkg/cmd/roachprod/grafana/configs/crosscluster_replication.json
@@ -212,99 +212,6 @@
"align": false
}
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 1,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "physical_replication_sst_bytes{job=\"cockroachdb\",cluster=\"$cluster\",instance=~\"$node\"}",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Replication SST Bytes",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
3 changes: 0 additions & 3 deletions pkg/cmd/roachtest/tests/cluster_to_cluster.go
@@ -98,9 +98,6 @@ var c2cPromMetrics = map[string]clusterstats.ClusterStat{
"LogicalMegabytes": {
LabelName: "node",
Query: "physical_replication_logical_bytes / 1e6"},
"PhysicalMegabytes": {
LabelName: "node",
Query: "physical_replication_sst_bytes / 1e6"},
"PhysicalReplicatedMegabytes": {
LabelName: "node",
Query: "capacity_used / 1e6"},
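After dropping PhysicalMegabytes, a roachtest stat over one of the metrics this change keeps would use the same clusterstats.ClusterStat map-entry shape. The sketch below is an assumption for illustration: the ReplicatedTimeSeconds key, its Prometheus-style metric name, the package name, and the import path are not taken from this diff.

package roachteststatsketch

import "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/clusterstats"

// exampleC2CStats mirrors the shape of c2cPromMetrics: the surviving
// logical-bytes stat plus a hypothetical stat over the replicated-time gauge.
var exampleC2CStats = map[string]clusterstats.ClusterStat{
	"LogicalMegabytes": {
		LabelName: "node",
		Query:     "physical_replication_logical_bytes / 1e6"},
	"ReplicatedTimeSeconds": {
		LabelName: "node",
		Query:     "physical_replication_replicated_time_seconds"},
}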