From a55d4b0d85f97166bcbbf341dfc18503302fe815 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 13 Aug 2022 12:00:19 +0000 Subject: [PATCH 1/4] storage: fix slice extention in `MVCCRangeKeyStack.CloneInto` This patch makes `MVCCRangeKeyStack.CloneInto` extend the existing slice to its full capacity before growing it. Previously, this could build a slice that was too small and panic. Release note: None --- pkg/storage/mvcc_key.go | 2 +- pkg/storage/mvcc_key_test.go | 74 +++++++++++++++++++----------------- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/pkg/storage/mvcc_key.go b/pkg/storage/mvcc_key.go index e0ef43019a11..189e154171b7 100644 --- a/pkg/storage/mvcc_key.go +++ b/pkg/storage/mvcc_key.go @@ -664,7 +664,7 @@ func (v MVCCRangeKeyVersions) CloneInto(c *MVCCRangeKeyVersions) { if length, capacity := len(v), cap(*c); length > capacity { // Extend the slice, keeping the existing versions to reuse their Value byte // slices. The compiler optimizes away the intermediate, appended slice. - (*c) = append(*c, make(MVCCRangeKeyVersions, length-capacity)...) + *c = append((*c)[:capacity], make(MVCCRangeKeyVersions, length-capacity)...) } else { *c = (*c)[:length] } diff --git a/pkg/storage/mvcc_key_test.go b/pkg/storage/mvcc_key_test.go index 22ceafdf3df3..a2d2cbb225cc 100644 --- a/pkg/storage/mvcc_key_test.go +++ b/pkg/storage/mvcc_key_test.go @@ -22,6 +22,7 @@ import ( "testing/quick" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -457,46 +458,51 @@ func TestMVCCRangeKeyCloneInto(t *testing.T) { } for name, tc := range testcases { t.Run(name, func(t *testing.T) { - clone := tc.target - orig.CloneInto(&clone) - - // We don't discard empty byte slices when cloning a nil value, so we have - // to normalize these back to nil for the purpose of comparison. - for i := range clone.Versions { - if orig.Versions[i].Value == nil && len(clone.Versions[i].Value) == 0 { - clone.Versions[i].Value = nil + testutils.RunTrueAndFalse(t, "cleared", func(t *testing.T, cleared bool) { + clone := tc.target + if cleared { + clone.Clear() } - } - require.Equal(t, orig, clone) - - requireSliceIdentity := func(t *testing.T, a, b []byte, expectSame bool) { - t.Helper() - a, b = a[:cap(a)], b[:cap(b)] - if len(a) > 0 { - if expectSame { - require.Same(t, &a[0], &b[0]) - } else { - require.NotSame(t, &a[0], &b[0]) + orig.CloneInto(&clone) + + // We don't discard empty byte slices when cloning a nil value, so we have + // to normalize these back to nil for the purpose of comparison. + for i := range clone.Versions { + if orig.Versions[i].Value == nil && len(clone.Versions[i].Value) == 0 { + clone.Versions[i].Value = nil + } + } + require.Equal(t, orig, clone) + + requireSliceIdentity := func(t *testing.T, a, b []byte, expectSame bool) { + t.Helper() + a, b = a[:cap(a)], b[:cap(b)] + if len(a) > 0 { + if expectSame { + require.Same(t, &a[0], &b[0]) + } else { + require.NotSame(t, &a[0], &b[0]) + } } } - } - // Assert that slices are actual clones, by asserting the address of the - // backing array at [0]. - requireSliceIdentity(t, orig.Bounds.Key, clone.Bounds.Key, false) - requireSliceIdentity(t, orig.Bounds.EndKey, clone.Bounds.EndKey, false) - for i := range orig.Versions { - requireSliceIdentity(t, orig.Versions[i].Value, clone.Versions[i].Value, false) - } + // Assert that slices are actual clones, by asserting the address of the + // backing array at [0]. + requireSliceIdentity(t, orig.Bounds.Key, clone.Bounds.Key, false) + requireSliceIdentity(t, orig.Bounds.EndKey, clone.Bounds.EndKey, false) + for i := range orig.Versions { + requireSliceIdentity(t, orig.Versions[i].Value, clone.Versions[i].Value, false) + } - // Assert whether the clone is reusing byte slices from the target. - requireSliceIdentity(t, tc.target.Bounds.Key, clone.Bounds.Key, tc.expectReused) - requireSliceIdentity(t, tc.target.Bounds.EndKey, clone.Bounds.EndKey, tc.expectReused) - for i := range tc.target.Versions { - if i < len(clone.Versions) { - requireSliceIdentity(t, tc.target.Versions[i].Value, clone.Versions[i].Value, tc.expectReused) + // Assert whether the clone is reusing byte slices from the target. + requireSliceIdentity(t, tc.target.Bounds.Key, clone.Bounds.Key, tc.expectReused) + requireSliceIdentity(t, tc.target.Bounds.EndKey, clone.Bounds.EndKey, tc.expectReused) + for i := range tc.target.Versions { + if i < len(clone.Versions) { + requireSliceIdentity(t, tc.target.Versions[i].Value, clone.Versions[i].Value, tc.expectReused) + } } - } + }) }) } } From e507b0cae5f663cbc63dfe68a92a55c35d233e12 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 14 Aug 2022 17:56:42 +0000 Subject: [PATCH 2/4] storage: add `MVCCIterator.IsPrefix()` This method allows detecting whether an iterator is a prefix iterator, which comes in handy for wrapping iterators to apply optimizations. Release note: None --- pkg/kv/kvserver/spanset/batch.go | 5 +++++ pkg/storage/engine.go | 3 +++ pkg/storage/intent_interleaving_iter.go | 5 +++++ pkg/storage/mvcc_history_test.go | 10 +++++++--- pkg/storage/pebble_iterator.go | 5 +++++ pkg/storage/point_synthesizing_iter.go | 5 +++++ 6 files changed, 30 insertions(+), 3 deletions(-) diff --git a/pkg/kv/kvserver/spanset/batch.go b/pkg/kv/kvserver/spanset/batch.go index 240397a45793..c6be42aea240 100644 --- a/pkg/kv/kvserver/spanset/batch.go +++ b/pkg/kv/kvserver/spanset/batch.go @@ -215,6 +215,11 @@ func (i *MVCCIterator) Stats() storage.IteratorStats { return i.i.Stats() } +// IsPrefix is part of the storage.MVCCIterator interface. +func (i *MVCCIterator) IsPrefix() bool { + return i.i.IsPrefix() +} + // SupportsPrev is part of the storage.MVCCIterator interface. func (i *MVCCIterator) SupportsPrev() bool { return i.i.SupportsPrev() diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 6324b34582bd..647dda6d2631 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -267,6 +267,9 @@ type MVCCIterator interface { FindSplitKey(start, end, minSplitKey roachpb.Key, targetSize int64) (MVCCKey, error) // Stats returns statistics about the iterator. Stats() IteratorStats + // IsPrefix returns true if the MVCCIterator is a prefix iterator, i.e. + // created with IterOptions.Prefix enabled. + IsPrefix() bool // SupportsPrev returns true if MVCCIterator implementation supports reverse // iteration with Prev() or SeekLT(). SupportsPrev() bool diff --git a/pkg/storage/intent_interleaving_iter.go b/pkg/storage/intent_interleaving_iter.go index be0ba49a1bbd..253d1398c073 100644 --- a/pkg/storage/intent_interleaving_iter.go +++ b/pkg/storage/intent_interleaving_iter.go @@ -1236,6 +1236,11 @@ func (i *intentInterleavingIter) Stats() IteratorStats { return stats } +// IsPrefix implements the MVCCIterator interface. +func (i *intentInterleavingIter) IsPrefix() bool { + return i.prefix +} + func (i *intentInterleavingIter) SupportsPrev() bool { return true } diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index 8de94151a0e3..9ddab23d606b 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -1433,11 +1433,15 @@ func cmdIterNew(e *evalCtx) error { } r, closer := metamorphicReader(e) - e.iter = &iterWithCloser{r.NewMVCCIterator(kind, opts), closer} - + iter := r.NewMVCCIterator(kind, opts) if e.hasArg("pointSynthesis") { - e.iter = newPointSynthesizingIter(e.mvccIter(), e.hasArg("emitOnSeekGE")) + iter = newPointSynthesizingIter(iter, e.hasArg("emitOnSeekGE")) + } + if opts.Prefix != iter.IsPrefix() { + return errors.Errorf("prefix iterator returned IsPrefix=false") } + + e.iter = &iterWithCloser{iter, closer} e.iterRangeKeys.Clear() return nil } diff --git a/pkg/storage/pebble_iterator.go b/pkg/storage/pebble_iterator.go index 652de8b4b5bc..0d50ce1b58e1 100644 --- a/pkg/storage/pebble_iterator.go +++ b/pkg/storage/pebble_iterator.go @@ -863,6 +863,11 @@ func (p *pebbleIterator) Stats() IteratorStats { } } +// IsPrefix implements the MVCCIterator interface. +func (p *pebbleIterator) IsPrefix() bool { + return p.prefix +} + // SupportsPrev implements the MVCCIterator interface. func (p *pebbleIterator) SupportsPrev() bool { return true diff --git a/pkg/storage/point_synthesizing_iter.go b/pkg/storage/point_synthesizing_iter.go index d6a3e3c7ad15..592391f82582 100644 --- a/pkg/storage/point_synthesizing_iter.go +++ b/pkg/storage/point_synthesizing_iter.go @@ -652,6 +652,11 @@ func (i *pointSynthesizingIter) Stats() IteratorStats { return i.iter.Stats() } +// IsPrefix implements the MVCCIterator interface. +func (i *pointSynthesizingIter) IsPrefix() bool { + return i.iter.IsPrefix() +} + // SupportsPrev implements MVCCIterator. func (i *pointSynthesizingIter) SupportsPrev() bool { return i.iter.SupportsPrev() From 27aedb997f40cbadda63543db6aadfca9d74de1c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 13 Aug 2022 11:07:18 +0000 Subject: [PATCH 3/4] storage: reuse `pointSynthesizingIter` range key slice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` name old time/op new time/op delta MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=0-24 3.70µs ± 1% 3.67µs ± 1% -0.89% (p=0.002 n=10+10) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=1-24 7.37µs ± 0% 7.25µs ± 0% -1.70% (p=0.000 n=9+10) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=10-24 14.7µs ± 1% 14.4µs ± 1% -2.21% (p=0.000 n=10+10) ``` Release note: None --- pkg/storage/point_synthesizing_iter.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/storage/point_synthesizing_iter.go b/pkg/storage/point_synthesizing_iter.go index 592391f82582..e672de900354 100644 --- a/pkg/storage/point_synthesizing_iter.go +++ b/pkg/storage/point_synthesizing_iter.go @@ -133,6 +133,7 @@ func newPointSynthesizingIter(parent MVCCIterator, emitOnSeekGE bool) *pointSynt iter: parent, emitOnSeekGE: emitOnSeekGE, // Reuse pooled byte slices. + rangeKeys: iter.rangeKeys, rangeKeysPos: iter.rangeKeysPos, rangeKeysStart: iter.rangeKeysStart, } @@ -162,7 +163,8 @@ func (i *pointSynthesizingIter) Close() { // release releases the iterator back into the pool. func (i *pointSynthesizingIter) release() { *i = pointSynthesizingIter{ - // Reuse byte slices. + // Reuse slices. + rangeKeys: i.rangeKeys[:0], rangeKeysPos: i.rangeKeysPos[:0], rangeKeysStart: i.rangeKeysStart[:0], } @@ -200,7 +202,7 @@ func (i *pointSynthesizingIter) updateRangeKeys() { i.rangeKeysPos = append(i.rangeKeysPos[:0], i.iter.UnsafeKey().Key...) if rangeStart := i.iter.RangeBounds().Key; !rangeStart.Equal(i.rangeKeysStart) { i.rangeKeysStart = append(i.rangeKeysStart[:0], rangeStart...) - i.rangeKeys = i.iter.RangeKeys().Versions.Clone() + i.iter.RangeKeys().Versions.CloneInto(&i.rangeKeys) } if i.rangeKeysPos.Equal(i.rangeKeysStart) { i.rangeKeysEnd = len(i.rangeKeys) From 290e9baeef4aa178d8bb1365ba259ec579cb2f31 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 13 Aug 2022 13:38:21 +0000 Subject: [PATCH 4/4] storage: add `prefix` mode for `pointSynthesizingIter` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a `prefix` mode for `pointSynthesizingIter`, detected from the parent's `IsPrefix()` method. When enabled, this allows omitting key cloning and comparisons. This replaces the previous `emitOnSeekGE` parameter. ``` name old time/op new time/op delta MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=0-24 3.67µs ± 1% 3.67µs ± 1% ~ (p=0.517 n=10+10) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=1-24 7.25µs ± 0% 7.07µs ± 0% -2.45% (p=0.000 n=10+9) MVCCGet_Pebble/batch=true/versions=1/valueSize=8/numRangeKeys=10-24 14.4µs ± 1% 14.2µs ± 1% -1.10% (p=0.000 n=10+10) ``` Release note: None --- pkg/storage/mvcc.go | 4 + pkg/storage/mvcc_history_test.go | 4 +- pkg/storage/pebble_mvcc_scanner.go | 2 +- pkg/storage/point_synthesizing_iter.go | 165 ++++++++++-------- .../mvcc_histories/range_key_point_synthesis | 82 ++------- 5 files changed, 110 insertions(+), 147 deletions(-) diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 5cf08f5f00e2..449e6cd26468 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -28,6 +28,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/iterutil" @@ -969,6 +970,9 @@ func mvccGet( if timestamp.WallTime < 0 { return optionalValue{}, nil, errors.Errorf("cannot write to %q at timestamp %s", key, timestamp) } + if util.RaceEnabled && !iter.IsPrefix() { + return optionalValue{}, nil, errors.AssertionFailedf("mvccGet called with non-prefix iterator") + } if err := opts.validate(); err != nil { return optionalValue{}, nil, err } diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index 9ddab23d606b..7b0778122161 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -92,7 +92,7 @@ var ( // scan [t=] [ts=[,]] [resolve [status=]] k= [end=] [inconsistent] [skipLocked] [tombstones] [reverse] [failOnMoreRecent] [localUncertaintyLimit=[,]] [globalUncertaintyLimit=[,]] [max=] [targetbytes=] [allowEmpty] // export [k=] [end=] [ts=[,]] [kTs=[,]] [startTs=[,]] [maxIntents=] [allRevisions] [targetSize=] [maxSize=] [stopMidKey] // -// iter_new [k=] [end=] [prefix] [kind=key|keyAndIntents] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [pointSynthesis [emitOnSeekGE]] [maskBelow=[,]] +// iter_new [k=] [end=] [prefix] [kind=key|keyAndIntents] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [pointSynthesis] [maskBelow=[,]] // iter_new_incremental [k=] [end=] [startTs=[,]] [endTs=[,]] [types=pointsOnly|pointsWithRanges|pointsAndRanges|rangesOnly] [maskBelow=[,]] [intents=error|aggregate|emit] // iter_seek_ge k= [ts=[,]] // iter_seek_lt k= [ts=[,]] @@ -1435,7 +1435,7 @@ func cmdIterNew(e *evalCtx) error { r, closer := metamorphicReader(e) iter := r.NewMVCCIterator(kind, opts) if e.hasArg("pointSynthesis") { - iter = newPointSynthesizingIter(iter, e.hasArg("emitOnSeekGE")) + iter = newPointSynthesizingIter(iter) } if opts.Prefix != iter.IsPrefix() { return errors.Errorf("prefix iterator returned IsPrefix=false") diff --git a/pkg/storage/pebble_mvcc_scanner.go b/pkg/storage/pebble_mvcc_scanner.go index e24046415f1c..46597aa91027 100644 --- a/pkg/storage/pebble_mvcc_scanner.go +++ b/pkg/storage/pebble_mvcc_scanner.go @@ -1184,7 +1184,7 @@ func (p *pebbleMVCCScanner) updateCurrent() bool { // iterator was valid when called and returns true if there is no change. func (p *pebbleMVCCScanner) maybeEnablePointSynthesis() bool { if _, hasRange := p.parent.HasPointAndRange(); hasRange { - p.pointIter = newPointSynthesizingIterAtParent(p.parent, p.isGet) + p.pointIter = newPointSynthesizingIterAtParent(p.parent) p.parent = p.pointIter return p.iterValid() } diff --git a/pkg/storage/point_synthesizing_iter.go b/pkg/storage/point_synthesizing_iter.go index e672de900354..8a861f1b8a1f 100644 --- a/pkg/storage/point_synthesizing_iter.go +++ b/pkg/storage/point_synthesizing_iter.go @@ -31,12 +31,9 @@ var pointSynthesizingIterPool = sync.Pool{ // pointSynthesizingIter wraps an MVCCIterator, and synthesizes MVCC point keys // for MVCC range keys above existing point keys (not below), and at the start -// of range keys (truncated to iterator bounds). If emitOnSeekGE is set, it will -// also unconditionally synthesize point keys for all MVCC range keys that -// overlap the seek key. -// -// It does not emit MVCC range keys at all, since these would appear to conflict -// with the synthesized point keys. +// of range keys (truncated to iterator bounds). It does not emit MVCC range +// keys at all, since these would appear to conflict with the synthesized point +// keys. // // During iteration, any range keys overlapping the current iterator position // are kept in rangeKeys. When atPoint is true, the iterator is positioned on a @@ -71,6 +68,10 @@ type pointSynthesizingIter struct { // position, for which points will be synthesized. rangeKeys MVCCRangeKeyVersions + // rangeKeysBuf is a reusable buffer for rangeKeys. Non-prefix iterators use + // it as a MVCCRangeKeyVersions.CloneInto() target and set rangeKeys to it. + rangeKeysBuf MVCCRangeKeyVersions + // rangeKeysPos is the current key (along the rangeKeys span) that points will // be synthesized for. It is only set if rangeKeys is non-empty, and may // differ from the underlying iterator position. @@ -92,30 +93,15 @@ type pointSynthesizingIter struct { // key in the underlying iterator. See struct comment for details. atPoint bool + // If prefix is true, then the underlying iterator is a prefix iterator which + // can only be on a single key position. This allows omitting key cloning and + // comparisons in many cases. + prefix bool + // reverse is true when the current iterator direction is in reverse, i.e. // following a SeekLT or Prev call. reverse bool - // emitOnSeekGE will synthesize point keys for all range keys that overlap the - // SeekGE seek key, regardless of whether a point key exists there. The - // primary use-case is to synthesize point keys for e.g. an MVCCGet that does - // not match a point key but overlaps a range key, which is necessary for - // conflict checks. - // - // This is optional, because e.g. pebbleMVCCScanner often uses seeks as an - // optimization to skip over old versions of a key, and we don't want to keep - // synthesizing point keys every time it skips ahead. - // - // Note that these synthesized points are not stable: if the iterator leaves - // the seek key prefix and then reverses direction, points will be synthesized - // according to the normal policy: above existing point keys and at the start - // key of range keys. This parameter is primarily for use with prefix - // iterators where this is not an issue. - // - // TODO(erikgrinaker): This could instead check for prefix iterators, or a - // separate SeekPrefixGE() method, but we don't currently have APIs for it. - emitOnSeekGE bool - // iterValid is true if the underlying iterator is valid. iterValid bool @@ -127,13 +113,13 @@ var _ MVCCIterator = new(pointSynthesizingIter) // newPointSynthesizingIter creates a new pointSynthesizingIter, or gets one // from the pool. -func newPointSynthesizingIter(parent MVCCIterator, emitOnSeekGE bool) *pointSynthesizingIter { +func newPointSynthesizingIter(parent MVCCIterator) *pointSynthesizingIter { iter := pointSynthesizingIterPool.Get().(*pointSynthesizingIter) *iter = pointSynthesizingIter{ - iter: parent, - emitOnSeekGE: emitOnSeekGE, + iter: parent, + prefix: parent.IsPrefix(), // Reuse pooled byte slices. - rangeKeys: iter.rangeKeys, + rangeKeysBuf: iter.rangeKeysBuf, rangeKeysPos: iter.rangeKeysPos, rangeKeysStart: iter.rangeKeysStart, } @@ -142,10 +128,8 @@ func newPointSynthesizingIter(parent MVCCIterator, emitOnSeekGE bool) *pointSynt // newPointSynthesizingIterAtParent creates a new pointSynthesizingIter and // loads the position from the parent iterator (which must be valid). -func newPointSynthesizingIterAtParent( - parent MVCCIterator, emitOnSeekGE bool, -) *pointSynthesizingIter { - iter := newPointSynthesizingIter(parent, emitOnSeekGE) +func newPointSynthesizingIterAtParent(parent MVCCIterator) *pointSynthesizingIter { + iter := newPointSynthesizingIter(parent) iter.iterValid = true iter.updateSeekGEPosition(parent.UnsafeKey()) return iter @@ -164,7 +148,7 @@ func (i *pointSynthesizingIter) Close() { func (i *pointSynthesizingIter) release() { *i = pointSynthesizingIter{ // Reuse slices. - rangeKeys: i.rangeKeys[:0], + rangeKeysBuf: i.rangeKeysBuf[:0], rangeKeysPos: i.rangeKeysPos[:0], rangeKeysStart: i.rangeKeysStart[:0], } @@ -197,26 +181,35 @@ func (i *pointSynthesizingIter) updateValid() (bool, error) { func (i *pointSynthesizingIter) updateRangeKeys() { if !i.iterValid { i.clearRangeKeys() - } else if _, hasRange := i.iter.HasPointAndRange(); hasRange { - // TODO(erikgrinaker): Optimize this. + } else if _, hasRange := i.iter.HasPointAndRange(); !hasRange { + i.clearRangeKeys() + } else { i.rangeKeysPos = append(i.rangeKeysPos[:0], i.iter.UnsafeKey().Key...) - if rangeStart := i.iter.RangeBounds().Key; !rangeStart.Equal(i.rangeKeysStart) { - i.rangeKeysStart = append(i.rangeKeysStart[:0], rangeStart...) - i.iter.RangeKeys().Versions.CloneInto(&i.rangeKeys) - } - if i.rangeKeysPos.Equal(i.rangeKeysStart) { + if i.prefix { + // A prefix iterator will always be at the start bound of the range key, + // and never move onto a different range key, so we can omit the cloning + // and other processing. + i.rangeKeys = i.iter.RangeKeys().Versions i.rangeKeysEnd = len(i.rangeKeys) } else { - i.rangeKeysEnd = 0 - i.extendRangeKeysEnd() + // TODO(erikgrinaker): Use RangeKeyChanged() to detect this. + if rangeStart := i.iter.RangeBounds().Key; !rangeStart.Equal(i.rangeKeysStart) { + i.rangeKeysStart = append(i.rangeKeysStart[:0], rangeStart...) + i.iter.RangeKeys().Versions.CloneInto(&i.rangeKeysBuf) + i.rangeKeys = i.rangeKeysBuf + } + if i.rangeKeysPos.Equal(i.rangeKeysStart) { + i.rangeKeysEnd = len(i.rangeKeys) + } else { + i.rangeKeysEnd = 0 + i.extendRangeKeysEnd() + } } if !i.reverse { i.rangeKeysIdx = 0 } else { i.rangeKeysIdx = i.rangeKeysEnd - 1 // NB: -1 is correct with no range keys } - } else { - i.clearRangeKeys() } } @@ -225,7 +218,8 @@ func (i *pointSynthesizingIter) updateRangeKeys() { func (i *pointSynthesizingIter) extendRangeKeysEnd() { if i.iterValid { if hasPoint, _ := i.iter.HasPointAndRange(); hasPoint { - if p := i.iter.UnsafeKey(); p.Key.Equal(i.rangeKeysPos) && !p.Timestamp.IsEmpty() { + p := i.iter.UnsafeKey() + if (i.prefix || p.Key.Equal(i.rangeKeysPos)) && !p.Timestamp.IsEmpty() { if end := sort.Search(len(i.rangeKeys), func(idx int) bool { return i.rangeKeys[idx].Timestamp.Less(p.Timestamp) }); end > i.rangeKeysEnd { @@ -246,7 +240,7 @@ func (i *pointSynthesizingIter) updateAtPoint() { i.atPoint = false } else if len(i.rangeKeys) == 0 { i.atPoint = true - } else if point := i.iter.UnsafeKey(); !point.Key.Equal(i.rangeKeysPos) { + } else if point := i.iter.UnsafeKey(); !i.prefix && !point.Key.Equal(i.rangeKeysPos) { i.atPoint = false } else if !i.reverse { i.atPoint = i.rangeKeysIdx >= i.rangeKeysEnd || !point.Timestamp.IsSet() || @@ -349,8 +343,9 @@ func (i *pointSynthesizingIter) updateSeekGEPosition(seekKey MVCCKey) { } // If we land in the middle of a bare range key then skip over it to the next - // point/range key unless emitOnSeekGE is enabled. - if !i.emitOnSeekGE && hasRange && !hasPoint && + // point/range key. If prefix is enabled, we must be at its start key, so we + // can omit the comparison. + if hasRange && !hasPoint && !i.prefix && !i.iter.RangeBounds().Key.Equal(i.iter.UnsafeKey().Key) { if ok, _ := i.iterNext(); !ok { i.updatePosition() @@ -361,10 +356,8 @@ func (i *pointSynthesizingIter) updateSeekGEPosition(seekKey MVCCKey) { i.updateRangeKeys() - // If we're now at a bare range key, we must either be at the start of it, - // or in the middle with emitOnSeekGE enabled. In either case, we want to - // move the iterator ahead to look for a point key with the same key as the - // start/seek key in order to interleave it. + // If we're still at a bare range key, we must be at its start key. Move the + // iterator ahead to look for a point key at the same key. if hasRange && !hasPoint { if _, err := i.iterNext(); err != nil { return @@ -372,13 +365,9 @@ func (i *pointSynthesizingIter) updateSeekGEPosition(seekKey MVCCKey) { i.extendRangeKeysEnd() } - // If emitOnSeekGE, always expose all range keys at the current position. - if hasRange && i.emitOnSeekGE { - i.rangeKeysEnd = len(i.rangeKeys) - } - // If we're seeking to a specific version, skip newer range keys. - if len(i.rangeKeys) > 0 && seekKey.Timestamp.IsSet() && seekKey.Key.Equal(i.rangeKeysPos) { + if len(i.rangeKeys) > 0 && seekKey.Timestamp.IsSet() && + (i.prefix || seekKey.Key.Equal(i.rangeKeysPos)) { i.rangeKeysIdx = sort.Search(i.rangeKeysEnd, func(idx int) bool { return i.rangeKeys[idx].Timestamp.LessEq(seekKey.Timestamp) }) @@ -445,7 +434,7 @@ func (i *pointSynthesizingIter) NextKey() { } } // Don't call NextKey() if the underlying iterator is already on the next key. - if i.atPoint || i.rangeKeysPos.Equal(i.iter.UnsafeKey().Key) { + if i.atPoint || i.prefix || i.rangeKeysPos.Equal(i.iter.UnsafeKey().Key) { i.iter.NextKey() if _, err := i.updateValid(); err != nil { return @@ -656,7 +645,7 @@ func (i *pointSynthesizingIter) Stats() IteratorStats { // IsPrefix implements the MVCCIterator interface. func (i *pointSynthesizingIter) IsPrefix() bool { - return i.iter.IsPrefix() + return i.prefix } // SupportsPrev implements MVCCIterator. @@ -676,6 +665,11 @@ func (i *pointSynthesizingIter) assertInvariants() error { return nil } + // In prefix mode, the iterator must never be used in reverse. + if i.prefix && i.reverse { + return errors.AssertionFailedf("prefix iterator used in reverse") + } + // When atPoint is true, the underlying iterator must be valid and on a point. if i.atPoint { if ok, _ := i.iter.Valid(); !ok { @@ -717,14 +711,25 @@ func (i *pointSynthesizingIter) assertInvariants() error { return nil } - // rangeKeysStart must be set, and rangeKeysPos must be at or after it. This - // implies that rangeKeysPos must also be set. - if len(i.rangeKeysStart) == 0 { - return errors.AssertionFailedf("no rangeKeysStart at %s", i.iter.UnsafeKey()) + // rangeKeysPos must be set when range keys are present. + if len(i.rangeKeysPos) == 0 { + return errors.AssertionFailedf("rangeKeysPos not set") } - if i.rangeKeysPos.Compare(i.rangeKeysStart) < 0 { - return errors.AssertionFailedf("rangeKeysPos %s not after rangeKeysStart %s", - i.rangeKeysPos, i.rangeKeysStart) + + // rangeKeysStart must be set, and rangeKeysPos must be at or after it. + // prefix iterators do not set rangeKeysStart. + if !i.prefix { + if len(i.rangeKeysStart) == 0 { + return errors.AssertionFailedf("no rangeKeysStart at %s", i.iter.UnsafeKey()) + } + if i.rangeKeysPos.Compare(i.rangeKeysStart) < 0 { + return errors.AssertionFailedf("rangeKeysPos %s not after rangeKeysStart %s", + i.rangeKeysPos, i.rangeKeysStart) + } + } else { + if len(i.rangeKeysStart) != 0 { + return errors.AssertionFailedf("rangeKeysStart set to %s for prefix iterator", i.rangeKeysStart) + } } // rangeKeysIdx must be valid if we're not on a point. @@ -739,10 +744,24 @@ func (i *pointSynthesizingIter) assertInvariants() error { return nil } - // We now have range keys and a non-exhausted iterator. Check their relative - // positioning as minimum and maximum iter keys (in MVCC order). We can assume - // that overlapping range keys and point keys don't have the same timestamp, - // since this is enforced by MVCC mutations. + // We now have range keys and a non-exhausted iterator. + // + // prefix iterators must have range key bounds [key, key.Next), and be + // positioned on key. + if _, hasRange := i.iter.HasPointAndRange(); i.prefix && hasRange { + expect := roachpb.Span{Key: i.rangeKeysPos, EndKey: i.rangeKeysPos.Next()} + if bounds := i.iter.RangeBounds(); !bounds.Equal(expect) { + return errors.AssertionFailedf("unexpected range bounds %s with prefix, expected %s", + bounds, expect) + + } else if key := i.iter.UnsafeKey().Key; !key.Equal(bounds.Key) { + return errors.AssertionFailedf("iter not on prefix position %s, got %s", bounds, key) + } + } + + // Check the relative positioning as minimum and maximum iter keys (in MVCC + // order). We can assume that overlapping range keys and point keys don't have + // the same timestamp, since this is enforced by MVCC mutations. var minKey, maxKey MVCCKey // The iterator should never lag behind the range key position. diff --git a/pkg/storage/testdata/mvcc_histories/range_key_point_synthesis b/pkg/storage/testdata/mvcc_histories/range_key_point_synthesis index 737c81014399..6b885f705e24 100644 --- a/pkg/storage/testdata/mvcc_histories/range_key_point_synthesis +++ b/pkg/storage/testdata/mvcc_histories/range_key_point_synthesis @@ -220,7 +220,7 @@ iter_seek_ge: "n"/5.000000000,0=/ iter_seek_ge: . run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE +iter_new types=pointsAndRanges pointSynthesis prefix iter_seek_ge k=a iter_seek_ge k=b iter_seek_ge k=c @@ -322,7 +322,7 @@ iter_seek_intent_ge: "n"/5.000000000,0=/ iter_seek_intent_ge: . run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE +iter_new types=pointsAndRanges pointSynthesis prefix iter_seek_intent_ge k=a txn=A iter_seek_intent_ge k=b txn=A iter_seek_intent_ge k=c txn=A @@ -560,9 +560,9 @@ iter_seek_ge: . iter_seek_ge: . iter_seek_ge: . -# Versioned seeks with emitOnSeekGE. +# Versioned prefix seeks. run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE +iter_new types=pointsAndRanges pointSynthesis prefix iter_seek_ge k=e ts=6 iter_seek_ge k=e ts=5 iter_seek_ge k=e ts=4 @@ -578,7 +578,7 @@ iter_seek_ge: "e"/1.000000000,0=/ iter_seek_ge: "e"/1.000000000,0=/ run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE +iter_new types=pointsAndRanges pointSynthesis prefix iter_seek_ge k=j ts=8 iter_seek_ge k=j ts=7 iter_seek_ge k=j ts=6 @@ -590,27 +590,27 @@ iter_seek_ge: "j"/1.000000000,0=/ iter_seek_ge: "j"/1.000000000,0=/ run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE +iter_new types=pointsAndRanges pointSynthesis prefix iter_seek_ge k=l ts=6 iter_seek_ge k=l ts=5 iter_seek_ge k=l ts=4 ---- iter_seek_ge: "l"/5.000000000,0=/ iter_seek_ge: "l"/5.000000000,0=/ -iter_seek_ge: "n"/5.000000000,0=/ +iter_seek_ge: . run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE +iter_new types=pointsAndRanges pointSynthesis prefix iter_seek_ge k=m ts=6 iter_seek_ge k=m ts=5 iter_seek_ge k=m ts=4 ---- iter_seek_ge: "m"/5.000000000,0=/ iter_seek_ge: "m"/5.000000000,0=/ -iter_seek_ge: "n"/5.000000000,0=/ +iter_seek_ge: . run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE +iter_new types=pointsAndRanges pointSynthesis prefix iter_seek_ge k=n ts=6 iter_seek_ge k=n ts=5 iter_seek_ge k=n ts=4 @@ -622,7 +622,7 @@ iter_seek_ge: "n"/3.000000000,0=/ iter_seek_ge: "n"/3.000000000,0=/ run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE +iter_new types=pointsAndRanges pointSynthesis prefix iter_seek_ge k=o ts=6 iter_seek_ge k=o ts=5 iter_seek_ge k=o ts=4 @@ -631,66 +631,6 @@ iter_seek_ge: . iter_seek_ge: . iter_seek_ge: . -# Next after emitOnSeekGE also emits tombstones below points, but these are not -# stable following a reversal from a different key prefix. -run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE -iter_seek_ge k=e -iter_next -iter_next -iter_next -iter_prev -iter_prev -iter_next -iter_next ----- -iter_seek_ge: "e"/5.000000000,0=/ -iter_next: "e"/3.000000000,0=/BYTES/e3 -iter_next: "e"/1.000000000,0=/ -iter_next: "f"/6.000000000,0=/BYTES/f6 -iter_prev: "e"/3.000000000,0=/BYTES/e3 -iter_prev: "e"/5.000000000,0=/ -iter_next: "e"/3.000000000,0=/BYTES/e3 -iter_next: "f"/6.000000000,0=/BYTES/f6 - -run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE -iter_seek_ge k=j -iter_next -iter_next -iter_next -iter_prev -iter_prev -iter_next -iter_next ----- -iter_seek_ge: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true -iter_next: "j"/7.000000000,0=/BYTES/j7 -iter_next: "j"/1.000000000,0=/ -iter_next: "k"/5.000000000,0=/BYTES/k5 -iter_prev: "j"/7.000000000,0=/BYTES/j7 -iter_prev: "j"/0,0=txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true -iter_next: "j"/7.000000000,0=/BYTES/j7 -iter_next: "k"/5.000000000,0=/BYTES/k5 - -run ok -iter_new types=pointsAndRanges pointSynthesis emitOnSeekGE -iter_seek_ge k=nnn -iter_next -iter_next -iter_prev -iter_prev -iter_next -iter_next ----- -iter_seek_ge: "nnn"/5.000000000,0=/ -iter_next: "nnn"/3.000000000,0=/ -iter_next: . -iter_prev: "n"/3.000000000,0=/ -iter_prev: "n"/5.000000000,0=/ -iter_next: "n"/3.000000000,0=/ -iter_next: . - # Versioned reverse seeks. run ok iter_new types=pointsAndRanges pointSynthesis