
Commit 78edad0

craig[bot] and pav-kv committed
Merge #155143
155143: kvserver: fix race in split application r=tbg,arulajmani a=pav-kv

This PR fixes a replica storage race that may occur when applying a split in the rare case that there is a concurrent creation of a higher-`ReplicaID` RHS. The race is described in #152199, and stems from the fact that a higher-`ReplicaID` uninitialized replica is not locked for the entirety of `splitPreApply`, so it can be created and make some limited progress concurrently. We remove the clearing/rewriting of the unreplicated state which belongs to that RHS, letting it progress untouched. This also removes a blocker on the way to raft/state-machine storage separation: we don't want to touch the raft state of the RHS in the same batch as the state machine updates.

Fixes #152199

Release note (bug fix): Fixed a race in range splits that could result in a regressed raft state of a post-split range. The race conditions are very rare / nearly impossible, and we have not seen them in the wild.

Co-authored-by: Pavel Kalinnikov <[email protected]>
2 parents 5433e4f + 5b7ceb3 commit 78edad0
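As a rough illustration of the failure mode described above (a standalone sketch, not CockroachDB code; all names below are made up), the following Go program shows how a read-clear-rewrite sequence that does not hold a lock across all three steps can clobber a concurrent update to the same state. This is the shape of the HardState regression that the old splitPreApply path allowed when a higher-ReplicaID RHS was created concurrently:

// sketch_split_race.go: standalone illustration, NOT CockroachDB code.
package main

import (
	"fmt"
	"sync"
)

func main() {
	var mu sync.Mutex
	// state stands in for the RHS replica's unreplicated keyspace.
	state := map[string]int{"HardState.Term": 5}

	var wg sync.WaitGroup
	wg.Add(2)

	// "Split applier": reads the current value, clears it, and writes the saved
	// value back. The lock is NOT held across the three steps, mirroring how the
	// old code read and rewrote the RHS HardState without holding that replica's
	// raftMu for the entirety of splitPreApply.
	go func() {
		defer wg.Done()
		mu.Lock()
		saved := state["HardState.Term"]
		mu.Unlock()

		mu.Lock()
		delete(state, "HardState.Term")
		mu.Unlock()

		mu.Lock()
		state["HardState.Term"] = saved // rewrites a possibly stale value
		mu.Unlock()
	}()

	// "Newer RHS replica": created concurrently, casts a vote at a higher term.
	go func() {
		defer wg.Done()
		mu.Lock()
		if state["HardState.Term"] < 7 {
			state["HardState.Term"] = 7
		}
		mu.Unlock()
	}()

	wg.Wait()
	// Depending on interleaving, this can print 5: the vote at term 7 was lost,
	// i.e. the raft state regressed.
	fmt.Println("final term:", state["HardState.Term"])
}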

7 files changed: +130 -105 lines changed


pkg/kv/kvserver/kvstorage/destroy.go

Lines changed: 4 additions & 16 deletions
@@ -199,9 +199,6 @@ func SubsumeReplica(
 // is used in a situation when the RHS replica is already known to have been
 // removed from our store, so any pending writes that were supposed to
 // initialize the RHS replica should be dropped from the write batch.
-//
-// TODO(#152199): do not remove the unreplicated state which can belong to a
-// newer (uninitialized) replica.
 func RemoveStaleRHSFromSplit(
 	ctx context.Context,
 	reader storage.Reader,
@@ -216,19 +213,10 @@ func RemoveStaleRHSFromSplit(
 		// staged in the batch.
 		ReplicatedByRangeID: true,
 		Ranged:              rditer.SelectAllRanged(keys),
-		// TODO(tbg): we don't actually want to touch the raft state of the RHS
-		// replica since it's absent or a more recent one than in the split. Now
-		// that we have a bool targeting unreplicated RangeID-local keys, we can set
-		// it to false and remove the HardState+ReplicaID write-back in the caller.
-		// However, there can be historical split proposals with the
-		// RaftTruncatedState key set in splitTriggerHelper[^1]. We must first make
-		// sure that such proposals no longer exist, e.g. with a below-raft
-		// migration.
-		//
-		// [^1]: https://github.com/cockroachdb/cockroach/blob/f263a765d750e41f2701da0a923a6e92d09159fa/pkg/kv/kvserver/batcheval/cmd_end_transaction.go#L1109-L1149
-		//
-		// See also: https://github.com/cockroachdb/cockroach/issues/94933
-		UnreplicatedByRangeID: true,
+		// Leave the unreplicated keys intact. The unreplicated space either belongs
+		// to a newer (uninitialized) replica, or is empty and only contains a
+		// RangeTombstone with a higher ReplicaID than the RHS in the split trigger.
+		UnreplicatedByRangeID: false,
 	}) {
 		if err := storage.ClearRangeWithHeuristic(
 			ctx, reader, writer, span.Key, span.EndKey, ClearRangeThresholdPointKeys(),
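As a loose sketch of what the flag flip above achieves (stand-in types only, not the real rditer API), setting the unreplicated RangeID-local selector to false keeps the clear loop away from the RHS's raft keyspace, which may belong to a newer replica:

// Hypothetical, simplified sketch of span selection; names are illustrative.
package main

import "fmt"

type span struct{ desc string }

// selectOpts mirrors the shape of the options shown in the diff above.
type selectOpts struct {
	ReplicatedByRangeID   bool
	UnreplicatedByRangeID bool
	Ranged                bool
}

// selectSpans returns the spans a caller would clear for the given options.
func selectSpans(o selectOpts) []span {
	var out []span
	if o.ReplicatedByRangeID {
		out = append(out, span{"RangeID-local replicated state"})
	}
	if o.UnreplicatedByRangeID {
		out = append(out, span{"RangeID-local unreplicated state (HardState, RaftReplicaID, raft log)"})
	}
	if o.Ranged {
		out = append(out, span{"user keyspace the RHS inherited from the LHS"})
	}
	return out
}

func main() {
	opts := selectOpts{ReplicatedByRangeID: true, UnreplicatedByRangeID: false, Ranged: true}
	for _, s := range selectSpans(opts) {
		fmt.Println("clear:", s.desc) // the unreplicated span is never listed
	}
}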

pkg/kv/kvserver/logstore/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ go_test(
         "logstore_bench_test.go",
         "logstore_test.go",
         "sideload_test.go",
+        "stateloader_test.go",
         "sync_waiter_test.go",
     ],
     data = glob(["testdata/**"]),

pkg/kv/kvserver/logstore/stateloader.go

Lines changed: 5 additions & 0 deletions
@@ -135,6 +135,11 @@ func (sl StateLoader) SetRaftTruncatedState(
 	)
 }
 
+// ClearRaftTruncatedState clears the RaftTruncatedState.
+func (sl StateLoader) ClearRaftTruncatedState(writer storage.Writer) error {
+	return writer.ClearUnversioned(sl.RaftTruncatedStateKey(), storage.ClearOptions{})
+}
+
 // LoadHardState loads the HardState.
 func (sl StateLoader) LoadHardState(
 	ctx context.Context, reader storage.Reader,
pkg/kv/kvserver/logstore/stateloader_test.go

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+// Copyright 2025 The Cockroach Authors.
+//
+// Use of this software is governed by the CockroachDB Software License
+// included in the /LICENSE file.
+
+package logstore
+
+import (
+	"context"
+	"testing"
+
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
+	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/storage"
+	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
+	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/stretchr/testify/require"
+)
+
+// TestStateLoader tests the various methods of StateLoader.
+//
+// TODO(pav-kv): extend it to test keys other than RaftTruncatedState.
+func TestStateLoader(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	ctx := context.Background()
+	eng := storage.NewDefaultInMemForTesting()
+	defer eng.Close()
+
+	const rangeID = roachpb.RangeID(123)
+	sl := NewStateLoader(rangeID)
+
+	// Test that RaftTruncatedState can be read after it is written.
+	ts := kvserverpb.RaftTruncatedState{Index: 100, Term: 10}
+	require.NoError(t, sl.SetRaftTruncatedState(ctx, eng, &ts))
+	got, err := sl.LoadRaftTruncatedState(ctx, eng)
+	require.NoError(t, err)
+	require.Equal(t, ts, got)
+	// Test that RaftTruncatedState is correctly cleared.
+	require.NoError(t, sl.ClearRaftTruncatedState(eng))
+	got, err = sl.LoadRaftTruncatedState(ctx, eng)
+	require.NoError(t, err)
+	require.Equal(t, kvserverpb.RaftTruncatedState{}, got)
+}

pkg/kv/kvserver/replica.go

Lines changed: 7 additions & 33 deletions
@@ -2461,44 +2461,18 @@ func shouldWaitForPendingMerge(
 	return &kvpb.MergeInProgressError{}
 }
 
-// isNewerThanSplit is a helper used in split(Pre|Post)Apply to
-// determine whether the Replica on the right hand side of the split must
-// have been removed from this store after the split.
+// isNewerThanSplit is a helper used in split(Pre|Post)Apply to determine
+// whether the RHS replica of the split must have been removed from this store
+// after the split, and the given Replica has a higher ID.
 //
-// TODO(tbg): the below is true as of 22.2: we persist any Replica's ReplicaID
-// under RaftReplicaIDKey, so the below caveats should be addressed now.
-//
-// TODO(ajwerner): There is one false negative where false will be returned but
-// the hard state may be due to a newer replica which is outlined below. It
-// should be safe.
-// Ideally if this store had ever learned that the replica created by the split
-// were removed it would not forget that fact. There exists one edge case where
-// the store may learn that it should house a replica of the same range with a
-// higher replica ID and then forget. If the first raft message this store ever
-// receives for the this range contains a replica ID higher than the replica ID
-// in the split trigger then an in-memory replica at that higher replica ID will
-// be created and no tombstone at a lower replica ID will be written. If the
-// server then crashes it will forget that it had ever been the higher replica
-// ID. The server may then proceed to process the split and initialize a replica
-// at the replica ID implied by the split. This is potentially problematic as
-// the replica may have voted as this higher replica ID and when it rediscovers
-// the higher replica ID it will delete all of the state corresponding to the
-// older replica ID including its hard state which may have been synthesized
-// with votes as the newer replica ID. This case tends to be handled safely in
-// practice because the replica should only be receiving messages as the newer
-// replica ID after it has been added to the range as a learner.
-//
-// Despite the safety due to the change replicas protocol explained above it'd
-// be good to know for sure that a replica ID for a range on a store is always
-// monotonically increasing, even across restarts.
+// NB: from v22.2, we persist any Replica's ID under RaftReplicaIDKey. A
+// complementary mechanism, RangeTombstone, ensures that replica deletions are
+// persistent as well. As a result, the ReplicaID existence and non-existence is
+// monotonic and survives restarts.
 //
 // See TestProcessSplitAfterRightHandSideHasBeenRemoved.
 func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool {
 	rightDesc, _ := split.RightDesc.GetReplicaDescriptor(r.StoreID())
-	// If the first raft message we received for the RHS range was for a replica
-	// ID which is above the replica ID of the split then we would not have
-	// written a tombstone but we will have a replica ID that will exceed the
-	// split replica ID.
 	return r.replicaID > rightDesc.ReplicaID
 }
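A hypothetical sketch of the monotonicity rule the rewritten comment relies on (illustrative names, not the real kvstorage API): once a removal is recorded in a RangeTombstone, the store refuses to recreate a replica at any lower ReplicaID, so ReplicaIDs on a store only move forward, even across restarts:

// Illustrative only; the real tombstone check lives in the kvstorage package.
package main

import "fmt"

type rangeTombstone struct{ NextReplicaID int }

// canCreateReplica applies the tombstone check: IDs below NextReplicaID are
// known to have been removed and can never come back.
func canCreateReplica(ts rangeTombstone, replicaID int) bool {
	return replicaID >= ts.NextReplicaID
}

func main() {
	ts := rangeTombstone{NextReplicaID: 4} // replica 3 was removed
	fmt.Println(canCreateReplica(ts, 3))   // false: stale split-trigger replica
	fmt.Println(canCreateReplica(ts, 5))   // true: newer replica
}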

pkg/kv/kvserver/replica_app_batch.go

Lines changed: 11 additions & 10 deletions
@@ -309,15 +309,18 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly(
 	}
 
 	if res.Split != nil {
-		// Splits require a new HardState to be written to the new RHS
-		// range (and this needs to be atomic with the main batch). This
-		// cannot be constructed at evaluation time because it differs
-		// on each replica (votes may have already been cast on the
-		// uninitialized replica). Write this new hardstate to the batch too.
+		// Splits require a new HardState to be written for the new RHS replica,
+		// atomically with the main batch. This cannot be constructed at evaluation
+		// time because it differs on each replica (votes may have already been cast
+		// on the uninitialized replica). Write this new HardState to the batch too.
 		// See https://github.com/cockroachdb/cockroach/issues/20629.
 		//
-		// Alternatively if we discover that the RHS has already been removed
-		// from this store, clean up its data.
+		// Alternatively if we discover that the RHS has already been removed from
+		// this store, clean up its data.
+		//
+		// NB: another reason why we shouldn't write HardState at evaluation time is
+		// that it belongs to the log engine, whereas the evaluated batch must
+		// contain only state machine updates.
 		splitPreApply(ctx, b.r, b.batch, res.Split.SplitTrigger, cmd.Cmd.ClosedTimestamp)
 
 		// The rangefeed processor will no longer be provided logical ops for
@@ -330,9 +333,7 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly(
 		if res.Split.SplitTrigger.ManualSplit {
 			reason = kvpb.RangeFeedRetryError_REASON_MANUAL_RANGE_SPLIT
 		}
-		b.r.disconnectRangefeedWithReason(
-			reason,
-		)
+		b.r.disconnectRangefeedWithReason(reason)
 	}
 
 	if merge := res.Merge; merge != nil {
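To illustrate the NB added above (hypothetical stand-in types, not the kvserver engine API): the evaluated batch is destined for the state-machine engine, while HardState belongs with the raft log, so under storage separation the applier would have to write it separately rather than bundle it into the evaluated batch:

// Illustrative sketch only; "engine" is a stand-in for a key-value engine.
package main

import "fmt"

type engine map[string]string

// applySplit mimics the application-time step: the state-machine batch carries
// only replicated keys, and the raft-specific HardState is written separately.
func applySplit(stateMachine, raftLog engine, batch map[string]string, hardState string) {
	for k, v := range batch {
		stateMachine[k] = v // replicated state machine updates
	}
	raftLog["HardState"] = hardState // raft state, kept out of the evaluated batch
}

func main() {
	sm, lg := engine{}, engine{}
	applySplit(sm, lg, map[string]string{"RangeDescriptor/rhs": "..."}, "Term:7 Vote:3 Commit:10")
	fmt.Println(sm, lg)
}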

pkg/kv/kvserver/store_split.go

Lines changed: 57 additions & 46 deletions
@@ -12,7 +12,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/load"
-	"github.com/cockroachdb/cockroach/pkg/raft/raftpb"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/storage"
 	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
@@ -51,82 +50,94 @@ func splitPreApply(
 	// raftMu is locked.
 	//
 	// In the less common case, the ReplicaID is already removed from this Store,
-	// and rightRepl is either nil or an uninitialized replica with a higher
-	// ReplicaID. Its raftMu is not locked.
+	// and rightRepl is either nil (though the replica may be created concurrently
+	// after we got this nil), or an uninitialized replica with a higher
+	// ReplicaID. Its raftMu is not locked, so this replica might as well be in
+	// the process of destruction or being replaced with another higher-ReplicaID
+	// uninitialized replica.
+	//
+	// In any case, the RHS, if exists (one or multiple replicas, throughout this
+	// splitPreApply call), is uninitialized. This is due to the Store invariant
+	// that all initialized replicas don't overlap, thus the RHS one can only be
+	// uninitialized while we still own the RHS part of the keyspace.
+	//
+	// Uninitialized replicas don't have replicated state, and only have non-empty
+	// RaftReplicaID and RaftHardState keys in storage (at the time of writing),
+	// or, more generally, only unreplicated keys. As a rule of thumb, all
+	// unreplicated keys belong to the *current* ReplicaID in the store, rather
+	// than the ReplicaID in the split trigger (which can be stale).
 	rightRepl := r.store.GetReplicaIfExists(split.RightDesc.RangeID)
+
+	rsl := kvstorage.MakeStateLoader(split.RightDesc.RangeID)
+	// After PR #149620, the split trigger batch may only contain replicated state
+	// machine keys, and never contains unreplicated / raft keys. One exception:
+	// there can still be historical split proposals that write the initial
+	// RaftTruncatedState of the RHS. Remove this key (if exists), and set it
+	// below only if necessary.
+	//
+	// Note that if the RHS range is already present or being created concurrently
+	// on this Store, it doesn't have a RaftTruncatedState (which only initialized
+	// replicas can have), so this deletion will not conflict with or corrupt it.
+	//
+	// TODO(#152847): remove this workaround when there are no historical
+	// proposals with RaftTruncatedState, e.g. after a below-raft migration.
+	if ts, err := rsl.LoadRaftTruncatedState(ctx, readWriter); err != nil {
+		log.KvExec.Fatalf(ctx, "cannot load RaftTruncatedState: %v", err)
+	} else if ts == (kvserverpb.RaftTruncatedState{}) {
+		// Common case. Do nothing.
+	} else if err := rsl.ClearRaftTruncatedState(readWriter); err != nil {
+		log.KvExec.Fatalf(ctx, "cannot clear RaftTruncatedState: %v", err)
+	}
+
 	// Check to see if we know that the RHS has already been removed from this
 	// store at the replica ID implied by the split.
 	if rightRepl == nil || rightRepl.isNewerThanSplit(&split) {
-		// We're in the rare case where we know that the RHS has been removed
-		// and re-added with a higher replica ID (and then maybe removed again).
+		// We're in the rare case where we know that the RHS has been removed or
+		// re-added with a higher replica ID (one or more times).
 		//
 		// If rightRepl is not nil, we are *not* holding raftMu.
 		//
 		// To apply the split, we need to "throw away" the data that would belong to
 		// the RHS, i.e. we clear the user data the RHS would have inherited from
-		// the LHS due to the split and additionally clear all of the range ID local
-		// state that the split trigger writes into the RHS. At the time of writing,
-		// unfortunately that means that we'll also delete any data that might
-		// already be present in the RHS: the HardState and RaftReplicaID. It is
-		// important to preserve the HardState because we might however have already
-		// voted at a higher term. In general this shouldn't happen because we add
-		// learners and then promote them only after they apply a snapshot but we're
-		// going to be extra careful in case future versions of cockroach somehow
-		// promote replicas without ensuring that a snapshot has been received. So
-		// we write it back (and the RaftReplicaID too, since it's an invariant that
-		// it's always present).
-		var hs raftpb.HardState
+		// the LHS due to the split.
+		//
+		// Leave the RangeID-local state intact, since it either belongs to a newer
+		// replica or does not exist. At the time of writing, it can be a non-empty
+		// HardState and RaftReplicaID. It is important to preserve the HardState
+		// because the replica might have already voted at a higher term. In general
+		// this shouldn't happen because we add learners and then promote them only
+		// after they apply a snapshot, but we're going to be extra careful in case
+		// future versions of cockroach somehow promote replicas without ensuring
+		// that a snapshot has been received.
+		//
+		// NB: the rightRepl == nil condition is flaky, in a sense that the RHS
+		// replica can be created concurrently here, one or more times. But we only
+		// use it for a best effort assertion, so this is not critical.
 		if rightRepl != nil {
-			// TODO(pav-kv): rightRepl could have been destroyed by the time we get to
-			// lock it here. The HardState read-then-write appears risky in this case.
-			rightRepl.raftMu.Lock()
-			defer rightRepl.raftMu.Unlock()
 			// Assert that the rightRepl is not initialized. We're about to clear out
 			// the data of the RHS of the split; we cannot have already accepted a
 			// snapshot to initialize this newer RHS.
 			if rightRepl.IsInitialized() {
 				log.KvExec.Fatalf(ctx, "unexpectedly found initialized newer RHS of split: %v", rightRepl.Desc())
 			}
-			var err error
-			hs, err = rightRepl.raftMu.stateLoader.LoadHardState(ctx, readWriter)
-			if err != nil {
-				log.KvExec.Fatalf(ctx, "failed to load hard state for removed rhs: %v", err)
-			}
 		}
-		// TODO(#152199): the rightRepl == nil condition is flaky. There can be a
-		// racing replica creation for a higher ReplicaID, and it can subsequently
-		// update its HardState. Here, we can accidentally clear the HardState of
-		// that new replica.
 		if err := kvstorage.RemoveStaleRHSFromSplit(
 			ctx, readWriter, readWriter, split.RightDesc.RangeID, split.RightDesc.RSpan(),
 		); err != nil {
 			log.KvExec.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
 		}
-		if rightRepl != nil {
-			// Cleared the HardState and RaftReplicaID, so rewrite them to the current
-			// values. NB: rightRepl.raftMu is still locked since HardState was read,
-			// so it can't have been rewritten in the meantime (fixed in #75918).
-			if err := rightRepl.raftMu.stateLoader.SetHardState(ctx, readWriter, hs); err != nil {
-				log.KvExec.Fatalf(ctx, "failed to set hard state with 0 commit index for removed rhs: %v", err)
-			}
-			if err := rightRepl.raftMu.stateLoader.SetRaftReplicaID(
-				ctx, readWriter, rightRepl.ReplicaID()); err != nil {
-				log.KvExec.Fatalf(ctx, "failed to set RaftReplicaID for removed rhs: %v", err)
-			}
-		}
 		return
 	}
 
 	// The RHS replica exists and is uninitialized. We are initializing it here.
 	// This is the common case.
 	//
-	// Update the raft HardState with the new Commit index (taken from the
-	// applied state in the write batch), and use existing[*] or default Term
-	// and Vote. Also write the initial RaftTruncatedState.
+	// Update the raft HardState with the new Commit index (taken from the applied
+	// state in the write batch), and use existing[*] or default Term and Vote.
+	// Also write the initial RaftTruncatedState.
 	//
 	// [*] Note that uninitialized replicas may cast votes, and if they have, we
 	// can't load the default Term and Vote values.
-	rsl := kvstorage.MakeStateLoader(split.RightDesc.RangeID)
 	if err := rsl.SynthesizeRaftState(ctx, readWriter, kvstorage.TODORaft(readWriter)); err != nil {
 		log.KvExec.Fatalf(ctx, "%v", err)
 	}
