@@ -12,7 +12,6 @@ import (
1212 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
1313 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
1414 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/load"
15- "github.com/cockroachdb/cockroach/pkg/raft/raftpb"
1615 "github.com/cockroachdb/cockroach/pkg/roachpb"
1716 "github.com/cockroachdb/cockroach/pkg/storage"
1817 "github.com/cockroachdb/cockroach/pkg/storage/enginepb"
@@ -51,82 +50,94 @@ func splitPreApply(
5150 // raftMu is locked.
5251 //
5352 // In the less common case, the ReplicaID is already removed from this Store,
54- // and rightRepl is either nil or an uninitialized replica with a higher
55- // ReplicaID. Its raftMu is not locked.
53+ // and rightRepl is either nil (though the replica may be created concurrently
54+ // after we got this nil), or an uninitialized replica with a higher
55+ // ReplicaID. Its raftMu is not locked, so this replica might as well be in
56+ // the process of destruction or being replaced with another higher-ReplicaID
57+ // uninitialized replica.
58+ //
59+ // In any case, the RHS, if exists (one or multiple replicas, throughout this
60+ // splitPreApply call), is uninitialized. This is due to the Store invariant
61+ // that all initialized replicas don't overlap, thus the RHS one can only be
62+ // uninitialized while we still own the RHS part of the keyspace.
63+ //
64+ // Uninitialized replicas don't have replicated state, and only have non-empty
65+ // RaftReplicaID and RaftHardState keys in storage (at the time of writing),
66+ // or, more generally, only unreplicated keys. As a rule of thumb, all
67+ // unreplicated keys belong to the *current* ReplicaID in the store, rather
68+ // than the ReplicaID in the split trigger (which can be stale).
5669 rightRepl := r .store .GetReplicaIfExists (split .RightDesc .RangeID )
70+
71+ rsl := kvstorage .MakeStateLoader (split .RightDesc .RangeID )
72+ // After PR #149620, the split trigger batch may only contain replicated state
73+ // machine keys, and never contains unreplicated / raft keys. One exception:
74+ // there can still be historical split proposals that write the initial
75+ // RaftTruncatedState of the RHS. Remove this key (if exists), and set it
76+ // below only if necessary.
77+ //
78+ // Note that if the RHS range is already present or being created concurrently
79+ // on this Store, it doesn't have a RaftTruncatedState (which only initialized
80+ // replicas can have), so this deletion will not conflict with or corrupt it.
81+ //
82+ // TODO(#152847): remove this workaround when there are no historical
83+ // proposals with RaftTruncatedState, e.g. after a below-raft migration.
84+ if ts , err := rsl .LoadRaftTruncatedState (ctx , readWriter ); err != nil {
85+ log .KvExec .Fatalf (ctx , "cannot load RaftTruncatedState: %v" , err )
86+ } else if ts == (kvserverpb.RaftTruncatedState {}) {
87+ // Common case. Do nothing.
88+ } else if err := rsl .ClearRaftTruncatedState (readWriter ); err != nil {
89+ log .KvExec .Fatalf (ctx , "cannot clear RaftTruncatedState: %v" , err )
90+ }
91+
5792 // Check to see if we know that the RHS has already been removed from this
5893 // store at the replica ID implied by the split.
5994 if rightRepl == nil || rightRepl .isNewerThanSplit (& split ) {
60- // We're in the rare case where we know that the RHS has been removed
61- // and re-added with a higher replica ID (and then maybe removed again ).
95+ // We're in the rare case where we know that the RHS has been removed or
96+ // re-added with a higher replica ID (one or more times ).
6297 //
6398 // If rightRepl is not nil, we are *not* holding raftMu.
6499 //
65100 // To apply the split, we need to "throw away" the data that would belong to
66101 // the RHS, i.e. we clear the user data the RHS would have inherited from
67- // the LHS due to the split and additionally clear all of the range ID local
68- // state that the split trigger writes into the RHS. At the time of writing,
69- // unfortunately that means that we'll also delete any data that might
70- // already be present in the RHS: the HardState and RaftReplicaID. It is
71- // important to preserve the HardState because we might however have already
72- // voted at a higher term. In general this shouldn't happen because we add
73- // learners and then promote them only after they apply a snapshot but we're
74- // going to be extra careful in case future versions of cockroach somehow
75- // promote replicas without ensuring that a snapshot has been received. So
76- // we write it back (and the RaftReplicaID too, since it's an invariant that
77- // it's always present).
78- var hs raftpb.HardState
102+ // the LHS due to the split.
103+ //
104+ // Leave the RangeID-local state intact, since it either belongs to a newer
105+ // replica or does not exist. At the time of writing, it can be a non-empty
106+ // HardState and RaftReplicaID. It is important to preserve the HardState
107+ // because the replica might have already voted at a higher term. In general
108+ // this shouldn't happen because we add learners and then promote them only
109+ // after they apply a snapshot, but we're going to be extra careful in case
110+ // future versions of cockroach somehow promote replicas without ensuring
111+ // that a snapshot has been received.
112+ //
113+ // NB: the rightRepl == nil condition is flaky, in a sense that the RHS
114+ // replica can be created concurrently here, one or more times. But we only
115+ // use it for a best effort assertion, so this is not critical.
79116 if rightRepl != nil {
80- // TODO(pav-kv): rightRepl could have been destroyed by the time we get to
81- // lock it here. The HardState read-then-write appears risky in this case.
82- rightRepl .raftMu .Lock ()
83- defer rightRepl .raftMu .Unlock ()
84117 // Assert that the rightRepl is not initialized. We're about to clear out
85118 // the data of the RHS of the split; we cannot have already accepted a
86119 // snapshot to initialize this newer RHS.
87120 if rightRepl .IsInitialized () {
88121 log .KvExec .Fatalf (ctx , "unexpectedly found initialized newer RHS of split: %v" , rightRepl .Desc ())
89122 }
90- var err error
91- hs , err = rightRepl .raftMu .stateLoader .LoadHardState (ctx , readWriter )
92- if err != nil {
93- log .KvExec .Fatalf (ctx , "failed to load hard state for removed rhs: %v" , err )
94- }
95123 }
96- // TODO(#152199): the rightRepl == nil condition is flaky. There can be a
97- // racing replica creation for a higher ReplicaID, and it can subsequently
98- // update its HardState. Here, we can accidentally clear the HardState of
99- // that new replica.
100124 if err := kvstorage .RemoveStaleRHSFromSplit (
101125 ctx , readWriter , readWriter , split .RightDesc .RangeID , split .RightDesc .RSpan (),
102126 ); err != nil {
103127 log .KvExec .Fatalf (ctx , "failed to clear range data for removed rhs: %v" , err )
104128 }
105- if rightRepl != nil {
106- // Cleared the HardState and RaftReplicaID, so rewrite them to the current
107- // values. NB: rightRepl.raftMu is still locked since HardState was read,
108- // so it can't have been rewritten in the meantime (fixed in #75918).
109- if err := rightRepl .raftMu .stateLoader .SetHardState (ctx , readWriter , hs ); err != nil {
110- log .KvExec .Fatalf (ctx , "failed to set hard state with 0 commit index for removed rhs: %v" , err )
111- }
112- if err := rightRepl .raftMu .stateLoader .SetRaftReplicaID (
113- ctx , readWriter , rightRepl .ReplicaID ()); err != nil {
114- log .KvExec .Fatalf (ctx , "failed to set RaftReplicaID for removed rhs: %v" , err )
115- }
116- }
117129 return
118130 }
119131
120132 // The RHS replica exists and is uninitialized. We are initializing it here.
121133 // This is the common case.
122134 //
123- // Update the raft HardState with the new Commit index (taken from the
124- // applied state in the write batch), and use existing[*] or default Term
125- // and Vote. Also write the initial RaftTruncatedState.
135+ // Update the raft HardState with the new Commit index (taken from the applied
136+ // state in the write batch), and use existing[*] or default Term and Vote.
137+ // Also write the initial RaftTruncatedState.
126138 //
127139 // [*] Note that uninitialized replicas may cast votes, and if they have, we
128140 // can't load the default Term and Vote values.
129- rsl := kvstorage .MakeStateLoader (split .RightDesc .RangeID )
130141 if err := rsl .SynthesizeRaftState (ctx , readWriter , kvstorage .TODORaft (readWriter )); err != nil {
131142 log .KvExec .Fatalf (ctx , "%v" , err )
132143 }
0 commit comments