From beec0c579bca23e51394408b34aaaeb7fc025843 Mon Sep 17 00:00:00 2001
From: Tobias Grieger <tobias.b.grieger@gmail.com>
Date: Wed, 17 Aug 2022 09:50:32 +0200
Subject: [PATCH 1/6] storage: skip BenchmarkMVCCExportToSST under short

Release note: None
---
 pkg/storage/bench_test.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go
index 6f67d9fcf316..5b6dacfeaa05 100644
--- a/pkg/storage/bench_test.go
+++ b/pkg/storage/bench_test.go
@@ -108,6 +108,7 @@ func BenchmarkMVCCGarbageCollect(b *testing.B) {
 }
 
 func BenchmarkMVCCExportToSST(b *testing.B) {
+	skip.UnderShort(b)
 	defer log.Scope(b).Close(b)
 
 	numKeys := []int{64, 512, 1024, 8192, 65536}

From 68237edb5167498e18ef27ddb7ccf4e178226a92 Mon Sep 17 00:00:00 2001
From: Tobias Grieger <tobias.b.grieger@gmail.com>
Date: Wed, 17 Aug 2022 08:26:38 +0200
Subject: [PATCH 2/6] storage: fix accidental no-op in runMVCCExportToSST

The keys were accidentally prefixed by 16 null bytes, and these sort
before `keys.LocalMax`. As a result, the SST export in this benchmark
never returned any kv pairs, rendering the benchmark useless.

Release note: None
---
 pkg/storage/bench_test.go | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go
index 5b6dacfeaa05..ff4b821bddd1 100644
--- a/pkg/storage/bench_test.go
+++ b/pkg/storage/bench_test.go
@@ -1616,10 +1616,11 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 	ctx := context.Background()
 	st := cluster.MakeTestingClusterSettings()
 
-	batch := engine.NewUnindexedBatch(true /* writeOnly */)
+	batch := engine.NewBatch()
 	for i := 0; i < numKeys; i++ {
-		key := make([]byte, 16)
-		key = append(key, 'a', 'a', 'a')
+		var key []byte
+		key = append(key, keys.LocalMax...)
+		key = append(key, bytes.Repeat([]byte{'a'}, 19)...)
 		key = encoding.EncodeUint32Ascending(key, uint32(i))
 
 		for j := 0; j < numRevisions; j++ {

From d20aae427919e7957932153778f8e5a581762e72 Mon Sep 17 00:00:00 2001
From: Tobias Grieger <tobias.b.grieger@gmail.com>
Date: Wed, 17 Aug 2022 08:51:24 +0200
Subject: [PATCH 3/6] storage: don't throw away output in runMVCCExportToSST

That'll allow us to check it (more on that later).

Release note: None
---
 pkg/storage/bench_test.go | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go
index ff4b821bddd1..db7f16f1e3b5 100644
--- a/pkg/storage/bench_test.go
+++ b/pkg/storage/bench_test.go
@@ -1640,8 +1640,14 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 		b.Fatal(err)
 	}
 
+	var buf bytes.Buffer
+	buf.Grow(1 << 20)
 	b.ResetTimer()
+	b.StopTimer()
+	var assertLen int // buf.Len shouldn't change between runs
 	for i := 0; i < b.N; i++ {
+		buf.Reset()
+		b.StartTimer()
 		startTS := hlc.Timestamp{WallTime: int64(numRevisions / 2)}
 		endTS := hlc.Timestamp{WallTime: int64(numRevisions + 2)}
 		_, _, err := MVCCExportToSST(ctx, st, engine, MVCCExportOptions{
@@ -1653,18 +1659,20 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 			TargetSize:         0,
 			MaxSize:            0,
 			StopMidKey:         false,
-		}, noopWriter{})
+		}, &buf)
 		if err != nil {
 			b.Fatal(err)
 		}
-	}
-	b.StopTimer()
-}
+		b.StopTimer()
 
-type noopWriter struct{}
+		if i == 0 {
+			require.NotZero(b, buf.Len())
+			assertLen = buf.Len()
+		}
 
-func (noopWriter) Close() error                { return nil }
-func (noopWriter) Write(p []byte) (int, error) { return len(p), nil }
+		require.Equal(b, assertLen, buf.Len())
+	}
+}
 
 func runCheckSSTConflicts(
 	b *testing.B, numEngineKeys, numVersions, numSstKeys int, overlap, usePrefixSeek bool,

From e6882fba9cae8f067b51f6a63ad1cb522ff0497e Mon Sep 17 00:00:00 2001
From: Tobias Grieger <tobias.b.grieger@gmail.com>
Date: Wed, 17 Aug 2022 08:30:21 +0200
Subject: [PATCH 4/6] storage: minor refactor in runMVCCExportToSST

We want to slide MVCC range deletions under the kvs in a follow-up
commit, and this helps make that easier.

Release note: None
---
 pkg/storage/bench_test.go | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go
index db7f16f1e3b5..cb47939ce5f5 100644
--- a/pkg/storage/bench_test.go
+++ b/pkg/storage/bench_test.go
@@ -1616,15 +1616,24 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 	ctx := context.Background()
 	st := cluster.MakeTestingClusterSettings()
 
-	batch := engine.NewBatch()
-	for i := 0; i < numKeys; i++ {
+	mkKey := func(i int) roachpb.Key {
 		var key []byte
 		key = append(key, keys.LocalMax...)
 		key = append(key, bytes.Repeat([]byte{'a'}, 19)...)
 		key = encoding.EncodeUint32Ascending(key, uint32(i))
+		return key
+	}
+
+	mkWall := func(j int) int64 {
+		return int64(j + 1)
+	}
+
+	batch := engine.NewBatch()
+	for i := 0; i < numKeys; i++ {
+		key := mkKey(i)
 
 		for j := 0; j < numRevisions; j++ {
-			mvccKey := MVCCKey{Key: key, Timestamp: hlc.Timestamp{WallTime: int64(j + 1), Logical: 0}}
+			mvccKey := MVCCKey{Key: key, Timestamp: hlc.Timestamp{WallTime: mkWall(j), Logical: 0}}
 			mvccValue := MVCCValue{Value: roachpb.MakeValueFromString("foobar")}
 			err := batch.PutMVCC(mvccKey, mvccValue)
 			if err != nil {
@@ -1648,8 +1657,8 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 	for i := 0; i < b.N; i++ {
 		buf.Reset()
 		b.StartTimer()
-		startTS := hlc.Timestamp{WallTime: int64(numRevisions / 2)}
-		endTS := hlc.Timestamp{WallTime: int64(numRevisions + 2)}
+		startTS := hlc.Timestamp{WallTime: mkWall(numRevisions/2 - 1)}
+		endTS := hlc.Timestamp{WallTime: mkWall(numRevisions + 1)}
 		_, _, err := MVCCExportToSST(ctx, st, engine, MVCCExportOptions{
 			StartKey:           MVCCKey{Key: keys.LocalMax},
 			EndKey:             roachpb.KeyMax,

From 752faa116cb23ae058067eda8e95753ba3646375 Mon Sep 17 00:00:00 2001
From: Tobias Grieger <tobias.b.grieger@gmail.com>
Date: Wed, 17 Aug 2022 08:36:14 +0200
Subject: [PATCH 5/6] storage: add mvcc rangedels to BenchmarkMVCCExportToSST

Also adds a sanity check that we're not emitting the range dels. This
wasn't actually true, and required a small tweak to the start timestamp
so that the rangedels would actually be excluded.

Closes https://github.com/cockroachdb/cockroach/issues/86140.
---
 pkg/storage/bench_test.go | 114 +++++++++++++++++++++++++++++++++++---
 1 file changed, 106 insertions(+), 8 deletions(-)

diff --git a/pkg/storage/bench_test.go b/pkg/storage/bench_test.go
index cb47939ce5f5..77f53b29abde 100644
--- a/pkg/storage/bench_test.go
+++ b/pkg/storage/bench_test.go
@@ -111,6 +111,13 @@ func BenchmarkMVCCExportToSST(b *testing.B) {
 	skip.UnderShort(b)
 	defer log.Scope(b).Close(b)
 
+	// To run and compare on range keys:
+	//
+	//     go test ./pkg/storage -run - -count 5 -bench BenchmarkMVCCExportToSST -timeout 500m 2>&1 | tee bench.txt
+	//     for flavor in numRangeKeys=0 numRangeKeys=1 numRangeKeys=100; do grep -E "${flavor}[^0-9]+" bench.txt | sed -E "s/${flavor}+/X/" > $flavor.txt; done
+	//     benchstat numRangeKeys\={0,1}.txt
+	//     benchstat numRangeKeys\={0,100}.txt
+
 	numKeys := []int{64, 512, 1024, 8192, 65536}
 	numRevisions := []int{1, 10, 100}
 	exportAllRevisions := []bool{false, true}
@@ -121,7 +128,17 @@ func BenchmarkMVCCExportToSST(b *testing.B) {
 				b.Run(fmt.Sprintf("numRevisions=%d", numRevision), func(b *testing.B) {
 					for _, exportAllRevisionsVal := range exportAllRevisions {
 						b.Run(fmt.Sprintf("exportAllRevisions=%t", exportAllRevisionsVal), func(b *testing.B) {
-							runMVCCExportToSST(b, numKey, numRevision, exportAllRevisionsVal)
+							for _, numRangeKeys := range []int{0, 1, 100} {
+								b.Run(fmt.Sprintf("numRangeKeys=%d", numRangeKeys), func(b *testing.B) {
+									opts := mvccExportToSSTOpts{
+										numKeys:            numKey,
+										numRevisions:       numRevision,
+										exportAllRevisions: exportAllRevisionsVal,
+										numRangeKeys:       numRangeKeys,
+									}
+									runMVCCExportToSST(b, opts)
+								})
+							}
 						})
 					}
 				})
@@ -1607,7 +1624,12 @@ func runBatchApplyBatchRepr(
 	b.StopTimer()
 }
 
-func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRevisions bool) {
+type mvccExportToSSTOpts struct {
+	numKeys, numRevisions, numRangeKeys int
+	exportAllRevisions                  bool
+}
+
+func runMVCCExportToSST(b *testing.B, opts mvccExportToSSTOpts) {
 	dir, cleanup := testutils.TempDir(b)
 	defer cleanup()
 	engine := setupMVCCPebble(b, dir)
@@ -1625,14 +1647,40 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 	}
 
 	mkWall := func(j int) int64 {
-		return int64(j + 1)
+		wt := int64(j + 2)
+		return wt
 	}
 
+	// Write range keys.
+	func() {
+		rng := rand.New(rand.NewSource(12345))
+		batch := engine.NewBatch()
+		defer batch.Close()
+		for i := 0; i < opts.numRangeKeys; i++ {
+			// NB: regular keys are written at ts 2+, so this is below any of the
+			// regular writes and thus won't delete anything.
+			ts := hlc.Timestamp{WallTime: 1, Logical: int32(i + 1)}
+			start := rng.Intn(opts.numKeys)
+			end := start + rng.Intn(opts.numKeys-start) + 1
+			// As a special case, if we're only writing one range key, write it across
+			// the entire span.
+			if opts.numRangeKeys == 1 {
+				start = 0
+				end = opts.numKeys + 1
+			}
+			startKey := mkKey(start)
+			endKey := mkKey(end)
+			require.NoError(b, MVCCDeleteRangeUsingTombstone(
+				ctx, batch, nil, startKey, endKey, ts, hlc.ClockTimestamp{}, nil, nil, false, 0, nil))
+		}
+		require.NoError(b, batch.Commit(false /* sync */))
+	}()
+
 	batch := engine.NewBatch()
-	for i := 0; i < numKeys; i++ {
+	for i := 0; i < opts.numKeys; i++ {
 		key := mkKey(i)
 
-		for j := 0; j < numRevisions; j++ {
+		for j := 0; j < opts.numRevisions; j++ {
 			mvccKey := MVCCKey{Key: key, Timestamp: hlc.Timestamp{WallTime: mkWall(j), Logical: 0}}
 			mvccValue := MVCCValue{Value: roachpb.MakeValueFromString("foobar")}
 			err := batch.PutMVCC(mvccKey, mvccValue)
@@ -1649,6 +1697,25 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 		b.Fatal(err)
 	}
 
+	startWall := mkWall(opts.numRevisions/2 - 1) // exclusive, at least 1, so never see rangedels
+	endWall := mkWall(opts.numRevisions + 1)     // see latest revision for every key
+	var expKVsInSST int
+	if opts.exportAllRevisions {
+		// First, compute how many revisions are visible for each key.
+		// Could probably use a closed formula for this but this works.
+		for i := 0; i < opts.numRevisions; i++ {
+			wall := mkWall(i)
+			if wall > startWall && wall <= endWall {
+				expKVsInSST++
+			}
+		}
+		// Then compute the total.
+		expKVsInSST *= opts.numKeys
+	} else {
+		// See one revision per key.
+		expKVsInSST = opts.numKeys
+	}
+
 	var buf bytes.Buffer
 	buf.Grow(1 << 20)
 	b.ResetTimer()
@@ -1657,14 +1724,14 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 	for i := 0; i < b.N; i++ {
 		buf.Reset()
 		b.StartTimer()
-		startTS := hlc.Timestamp{WallTime: mkWall(numRevisions/2 - 1)}
-		endTS := hlc.Timestamp{WallTime: mkWall(numRevisions + 1)}
+		startTS := hlc.Timestamp{WallTime: startWall, Logical: math.MaxInt32} // use 1.infinity to avoid rangedels at 1.<n>
+		endTS := hlc.Timestamp{WallTime: endWall}
 		_, _, err := MVCCExportToSST(ctx, st, engine, MVCCExportOptions{
 			StartKey:           MVCCKey{Key: keys.LocalMax},
 			EndKey:             roachpb.KeyMax,
 			StartTS:            startTS,
 			EndTS:              endTS,
-			ExportAllRevisions: exportAllRevisions,
+			ExportAllRevisions: opts.exportAllRevisions,
 			TargetSize:         0,
 			MaxSize:            0,
 			StopMidKey:         false,
@@ -1681,6 +1748,37 @@ func runMVCCExportToSST(b *testing.B, numKeys int, numRevisions int, exportAllRe
 
 		require.Equal(b, assertLen, buf.Len())
 	}
+
+	// Run sanity checks on last produced SST.
+	it, err := NewPebbleMemSSTIterator(
+		buf.Bytes(), true /* verify */, IterOptions{
+			LowerBound: keys.LocalMax,
+			UpperBound: roachpb.KeyMax,
+			KeyTypes:   IterKeyTypePointsAndRanges,
+		},
+	)
+	it.SeekGE(MakeMVCCMetadataKey(roachpb.LocalMax))
+	require.NoError(b, err)
+	var n int // points
+	var r int // range key stacks
+	for {
+		ok, err := it.Valid()
+		require.NoError(b, err)
+		if !ok {
+			break
+		}
+		hasPoint, hasRange := it.HasPointAndRange()
+		if hasPoint {
+			n++
+		}
+		if hasRange && it.RangeKeyChanged() {
+			r++
+		}
+		it.Next()
+	}
+	require.Equal(b, expKVsInSST, n)
+	// Should not see any rangedel stacks.
+	require.Zero(b, r)
 }
 
 func runCheckSSTConflicts(

From ff6d72d1aee70d4809a3b62eccf69b296336aa84 Mon Sep 17 00:00:00 2001
From: Tobias Grieger <tobias.b.grieger@gmail.com>
Date: Wed, 17 Aug 2022 11:26:43 +0200
Subject: [PATCH 6/6] storage: touch up comment on MVCCExportToSST

Release note: None
---
 pkg/storage/mvcc.go | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go
index cf67cf9578f4..3ef46d122415 100644
--- a/pkg/storage/mvcc.go
+++ b/pkg/storage/mvcc.go
@@ -5712,12 +5712,24 @@ func MVCCIsSpanEmpty(
 
 // MVCCExportToSST exports changes to the keyrange [StartKey, EndKey) over the
 // interval (StartTS, EndTS] as a Pebble SST. See MVCCExportOptions for options.
+// StartTS may be zero.
 //
-// Tombstones are included if all revisions are requested (all tombstones) or if
-// the StartTS is non-zero (latest tombstone), including both MVCC point
-// tombstones and MVCC range tombstones. Intents within the time interval will
-// return a WriteIntentError, while intents outside the time interval are
-// ignored.
+// This comes in two principal flavors: all revisions or latest revision only.
+// In all-revisions mode, exports everything matching the span and time bounds,
+// i.e. extracts contiguous blocks of MVCC history. In latest-revision mode,
+// extracts just the changes necessary to transform an MVCC snapshot at StartTS
+// into one equivalent to the data at EndTS, but without including all
+// intermediate revisions not visible at EndTS. The latter mode is used for
+// incremental backups that can only be restored to EndTS, the former allows
+// restoring to any intermediate timestamp.
+//
+// Tombstones (both point and MVCC range tombstones) are treated like revisions.
+// That is, if all revisions are requested, all tombstones in (StartTS, EndTS]
+// and overlapping [StartKey, EndKey) are returned. If only the latest revision
+// is requested, only the most recent matching tombstone is returned.
+//
+// Intents within the time and span bounds will return a WriteIntentError, while
+// intents outside are ignored.
 //
 // Returns an export summary and a resume key that allows resuming the export if
 // it reached a limit. Data is written to dest as it is collected. If an error