Skip to content

Commit

Permalink
Merge pull request #125600 from kvoli/backport-release-23.2.6-rc-125276
Browse files Browse the repository at this point in the history
release-23.2.6-rc: kvserver: constrain store gossip chatter
  • Loading branch information
kvoli authored Jun 13, 2024
2 parents 5235eda + 8f933f2 commit fa33ff1
Show file tree
Hide file tree
Showing 17 changed files with 403 additions and 112 deletions.
2 changes: 2 additions & 0 deletions pkg/kv/kvserver/asim/gossip/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ go_library(
"//pkg/kv/kvserver/asim/config",
"//pkg/kv/kvserver/asim/state",
"//pkg/roachpb",
"//pkg/settings/cluster",
"//pkg/util/hlc",
"//pkg/util/protoutil",
"//pkg/util/timeutil",
],
)

Expand Down
10 changes: 7 additions & 3 deletions pkg/kv/kvserver/asim/gossip/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/config"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/state"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

// Gossip collects and updates the storepools of the cluster upon capacity
Expand Down Expand Up @@ -52,15 +54,17 @@ type storeGossiper struct {
addingStore bool
}

func newStoreGossiper(descriptorGetter func(cached bool) roachpb.StoreDescriptor) *storeGossiper {
func newStoreGossiper(
descriptorGetter func(cached bool) roachpb.StoreDescriptor, clock timeutil.TimeSource,
) *storeGossiper {
sg := &storeGossiper{
lastIntervalGossip: time.Time{},
descriptorGetter: descriptorGetter,
}

desc := sg.descriptorGetter(false /* cached */)
knobs := kvserver.StoreGossipTestingKnobs{AsyncDisabled: true}
sg.local = kvserver.NewStoreGossip(sg, sg, knobs)
sg.local = kvserver.NewStoreGossip(sg, sg, knobs, &cluster.MakeTestingClusterSettings().SV, clock)
sg.local.Ident = roachpb.StoreIdent{StoreID: desc.StoreID, NodeID: desc.Node.NodeID}

return sg
Expand Down Expand Up @@ -123,7 +127,7 @@ func (g *gossip) addStoreToGossip(s state.State, storeID state.StoreID) {
g.storeGossip[storeID] = &storeGossiper{addingStore: true}
g.storeGossip[storeID] = newStoreGossiper(func(cached bool) roachpb.StoreDescriptor {
return s.StoreDescriptors(cached, storeID)[0]
})
}, s.Clock())
}

// Tick checks for completed gossip updates and triggers new gossip
Expand Down
13 changes: 12 additions & 1 deletion pkg/kv/kvserver/asim/gossip/gossip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ package gossip
import (
"context"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/storepool"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/asim/config"
Expand Down Expand Up @@ -87,11 +88,17 @@ func TestGossip(t *testing.T) {
require.Len(t, gossip.exchange.pending, 0)
assertStorePool(assertSameFn)

// Tick state by a large duration to ensure the below capacity changes don't
// run into the max gossip frequency limit.
storeTick := tick

// Update the usage info leases for s1 and s2, so that it exceeds the delta
// required to trigger a gossip update. We do this by transferring every
// lease to s2.
for _, rng := range s.Ranges() {
s.TransferLease(rng.RangeID(), 2)
storeTick = storeTick.Add(3 * time.Second)
s.TickClock(storeTick)
}
gossip.Tick(ctx, tick, s)
// There should be just store 1 and 2 pending gossip updates in the exchanger.
Expand All @@ -108,6 +115,10 @@ func TestGossip(t *testing.T) {
// Assert that the lease counts are as expected after transferring all of
// the leases to s2.
require.Equal(t, int32(0), (*details[1])[1].Desc.Capacity.LeaseCount)
require.Equal(t, int32(100), (*details[1])[2].Desc.Capacity.LeaseCount)
// Depending on the capacity delta threshold, s2 may not have gossiped
// exactly when it reached 100 leases, as it earlier gossiped at 90+ leases,
// so 100 may be < lastGossip * capacityDeltaThreshold, not triggering
// gossip. Assert that the lease count gossiped is at least 90.
require.Greater(t, (*details[1])[2].Desc.Capacity.LeaseCount, int32(90))
require.Equal(t, int32(0), (*details[1])[3].Desc.Capacity.LeaseCount)
}
5 changes: 5 additions & 0 deletions pkg/kv/kvserver/asim/state/impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/spanconfig"
"github.com/cockroachdb/cockroach/pkg/spanconfig/spanconfigreporter"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/google/btree"
"go.etcd.io/raft/v3"
"go.etcd.io/raft/v3/tracker"
Expand Down Expand Up @@ -1058,6 +1059,10 @@ func (s *state) TickClock(tick time.Time) {
s.clock.Set(tick.UnixNano())
}

func (s *state) Clock() timeutil.TimeSource {
return s.clock
}

// UpdateStorePool modifies the state of the StorePool for the Store with
// ID StoreID.
func (s *state) UpdateStorePool(
Expand Down
16 changes: 16 additions & 0 deletions pkg/kv/kvserver/asim/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ type State interface {
ClusterUsageInfo() *ClusterUsageInfo
// TickClock modifies the state Clock time to Tick.
TickClock(time.Time)
// Clock returns the state Clock.
Clock() timeutil.TimeSource
// UpdateStorePool modifies the state of the StorePool for the Store with
// ID StoreID.
UpdateStorePool(StoreID, map[roachpb.StoreID]*storepool.StoreDetail)
Expand Down Expand Up @@ -290,6 +292,8 @@ type ManualSimClock struct {
nanos int64
}

var _ timeutil.TimeSource = &ManualSimClock{}

// Now returns the current time.
func (m *ManualSimClock) Now() time.Time {
return timeutil.Unix(0, m.nanos)
Expand All @@ -300,6 +304,18 @@ func (m *ManualSimClock) Set(tsNanos int64) {
m.nanos = tsNanos
}

func (m *ManualSimClock) Since(t time.Time) time.Duration {
return m.Now().Sub(t)
}

func (m *ManualSimClock) NewTimer() timeutil.TimerI {
panic("unimplemented")
}

func (m *ManualSimClock) NewTicker(duration time.Duration) timeutil.TickerI {
panic("unimplemented")
}

// Keys in the simulator are 64 bit integers. They are mapped to Keys in
// cockroach as the decimal representation, with 0 padding such that they are
// lexicographically ordered as strings. The simplification to limit keys to
Expand Down
10 changes: 5 additions & 5 deletions pkg/kv/kvserver/asim/tests/testdata/non_rand/example_add_node
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,14 @@ plot stat=replicas sample=1
261 ┤ ╭╭──╯
241 ┤ ╭╭─╯
221 ┤ ╭───╯
201 ┤ ╭─╯
181 ┤ ╭──╯
161 ┤ ╭──╯
201 ┤ ╭─╯
181 ┤ ╭──
161 ┤ ╭──╯
140 ┤ ╭──╯╯
120 ┤ ╭─╯╯
100 ┤ ╭──╯
80 ┤ ─╯╯
60 ┤ ╭─
80 ┤ ╭──╯╯
60 ┤ ╭─
40 ┤ ╭─╯
20 ┤ ╭──╯
0 ┼─╯
Expand Down
51 changes: 26 additions & 25 deletions pkg/kv/kvserver/asim/tests/testdata/non_rand/example_fulldisk
Original file line number Diff line number Diff line change
Expand Up @@ -21,42 +21,43 @@ plot stat=replicas
----
----

336╭╮ ╭╭╮╭╮─╭╮╭╭╮ ╭╮╭──╮╮╭╭─
325╭╮╭────╮╭────────╯╰╯╰─╯╰─╯╰──────────────╮╭╯─╯╰──╯╯
314╭╭╭╮╭─╭──╯╰╯╯╰╯╰╰ ╰╯ ╰╯ ╰╯╰╯╰╯ ╰╯
302 ┼────────────────────────
291 ┤ ╰───
280 ╰╮ ╭
269 ┤ ╰─╯╰╮
258╰╮
246 ╰──╮
235
224 ┤
213╰╮╭────╮ ╭
202╰╯ ╰╮ ╭──────────╯╰───
190 ┤ ╰─────╭───╮ ╭╯
179 ╰───╯ ╰─╯ ╰─╮ ╭──╮
168 ╰─╯
342╮╭╮ ╭╮╭
330 ╭╭╮╭╭╭───────────╯╰───╯╰──────╮╭────────╮╭─╮╭───
318╮╭──────╯╰──╯─╯╰─╯╰╯ ╰╯ ╰╯ ╰╯╰─╯╰╯╯╰╯╰╯ ╰╯
307 ┤╭────────────────────╭─╯╰╯╯╰
295 ┼─────────────────╮───╯╰╯
283╰──╮╭─
271 ┤ ╰╰╮
259
248─╮
236╰─
224 ┤ ╰──
212───╮
200 ┤ ╰── ╭──╮ ╭
189 ┤ ╰───╮ ╭╮│ ╰─────╮│╰───╮ ╭
177 ┤ ╰────╮╭────╮ ╭───╯╰╯ ╰╯ ╰─╯
165╰╯ ╰─╯
replicas
----
----


# Plot the % of disk storage capacity used. We should see s5 hovering right
# around 92.5-95% (the storage capacity threshold value).
plot stat=disk_fraction_used
----
----

0.98╭─╮ ╭╮ ╭╮╭─╮╭──╮ ╭──────╮╭─╮ ╭───╮ ╭╮ ╭╮╭─╮ ╭─── ╭─╮
0.91 ┤ ╭───────╯ ╰─╯╰──╯╰╯ ╰╯ ╰──╯ ╰╯ ╰────╯ ╰────╯╰──╯╰╯ ╰──╯ ╰───╯ ╰
0.85 ┼──────╯
0.78 ┤
0.72
0.65
0.59
0.52
0.99 ╭─╮ ╭╮ ╭╮ ╭─╮
0.93 ╭────────────╯ ───────────────────────────╯╰──────╮╭────────╮│╰──╯ ╰
0.86 ┼╮╭───────╯ ╰╯ ╰
0.79 ┤╰╯
0.73
0.66
0.60
0.53
0.46 ┤
0.39
0.40
0.33 ┤
0.26 ┤
0.20 ┤
Expand Down
22 changes: 11 additions & 11 deletions pkg/kv/kvserver/asim/tests/testdata/non_rand/example_multi_store
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,20 @@ plot stat=leases

14.00 ┼╮
13.07 ┤╰╮
12.13 ┤ ╰╮
11.20 ┤
12.13 ┤
11.20 ┤ ╰╮
10.27 ┤ │
9.33 ┤ │
8.40 ┤ ╰╮
7.47 ┤ ╰╮
6.53 ┤
5.60 ┤
4.67 ┤
3.73 ┤ ╰───────────╮
2.80 ┤
1.87 ┤ ───────────╮──────────────╮
0.93 ┤╭╭╭────────────────────────────────────────────────────────────────────────────
0.00 ┼──╯─────────────╯──────────────╯
7.47 ┤
6.53 ┤ │
5.60 ┤ │
4.67 ┤ ╰╮
3.73 ┤
2.80 ┤ ╰───────────╮
1.87 ┤ ╭─────────────╮──────────────╮
0.93 ┤╭╭╮╭───────────────────────────────────────────────────────────────────────────
0.00 ┼─╯╰╯╯───────────╯──────────────╯
leases
----
----
67 changes: 39 additions & 28 deletions pkg/kv/kvserver/asim/tests/testdata/non_rand/example_rebalancing
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ setting gossip_delay=20s
# assertion is reached due to how the system reacts to the long gossip delays.
eval duration=5m samples=2 seed=42
----
failed assertion sample 1
balance stat=qps threshold=(<1.15) ticks=6
max/mean=1.17 tick=0
max/mean=1.17 tick=1
max/mean=1.17 tick=2
max/mean=1.17 tick=3
max/mean=1.40 tick=4
max/mean=1.41 tick=5
steady state stat=qps threshold=(<0.05) ticks=6
store=1 min/mean=1.00 max/mean=6.00
store=4 min/mean=1.00 max/mean=0.76
failed assertion sample 2
balance stat=qps threshold=(<1.15) ticks=6
max/mean=2.00 tick=0
Expand All @@ -95,18 +106,18 @@ plot stat=qps sample=3
6533 ┤ │ │
6067 ┤ │ │
5600 ┤ │ ╰╮
5133 ┤ │ │ ╭───╮
4667 ┤ │ │ │ │
4200 ┤ │ │ ╭───╮ ╭──
3733 ┤ │ │ │ │ │ │
3267 ┤ │ │ │ │ │ │
2800 ┤ │ │╭╭─────╮╭╯ ╭╯ ╰╮ ╭──╮ ─────╮
2333 ┤ │ │││ ││ │ │ │╭╯
1867 ┤ │ ╰││ │╰╮╭───╮╮│ ││╭──╰╮╭───────────╮╮╭─────────────╮
1400 ┤ │ ││ ││╭╯ │││ │││ ╭╯│ │ ││
933 ┤ │ ╭╭╯ ╰╮│╭──────────────────────────────────────────────────────────────
467 ┤ │ ││ │││ ││ │││ ││
0 ┼────────────────╯────────────────────────────────╯───────────────╯
5133 ┤ │ │
4667 ┤ │ │
4200 ┤ │ │ ╭───╮ ╭──╮ ╭─────╮
3733 ┤ │ │ │
3267 ┤ │ │ │
2800 ┤ │ │╭╭─────╮ ╭─╮ ╭───── │ ╰───────╮
2333 ┤ │ │││ ││╭╯ ╰╮ │ │
1867 ┤ │ ╰││ ╰╰╮ ╭╭─────────╮───────────────╮ │ ╭──────╰╮╭──╮ ╭─────╮
1400 ┤ │ ││ ││ ││ ││ │╰╮│ │ │ │ │││ │ │
933 ┤ │ ╭╭╯ ││╭──────────────────────────────╮╮│──╮──╭──────────────────────
467 ┤ │ ││ │││ ││ │││ │ │││ │╰╮│ │ │ │
0 ┼────────────────╯──────────────────────────────╰─────────────────╯────╰───────╯
qps
----
----
Expand All @@ -118,22 +129,22 @@ plot stat=replica_moves sample=3
----
----

22.00 ┤ ╭──────────────────────────────────────
20.53 ┤ ╭───────────────────────────────────────────
19.07╭╯
17.60 ┤ ╭─────╭──────────╯
16.13╭╯
14.67╭╯ ╭╯ ╭────────────────────────────────────────────
13.20 │ │ │
11.73 ╭╯
10.27 ┤ ╭───────────╭───
8.80 ┤ ╭─────╯ ╭╯
7.33 ┤ ╭─────╭╭────────────╯ ╭─────────────────────────────────────
5.87 ┤ ╭╯ │ ╭────────
4.40 ┤ │ ╭───╭╯ ╭───────────────╯ ╭─────────────────────────────
2.93 ┤ ╭╯ ╭╯ ╭╯ ╭─╯ ╭╯ ╭─
1.47 ┤ │╭─╭───╯ ╭╭──────╯───────────────────╯╭──────────────────────────────
0.00 ┼────────────────────╯───────────────────────────╯
23.00 ┤
21.47 ╭─────────────────
19.93 ╭──────╯
18.40 ╭────────────╯
16.87 ╭╯
15.33─────────────────
13.80╭─╯
12.27╭─╯ ╭──────────────────────────╭──────────
10.73 ┤ ╭─────────────────────────╯ ╭─────────────────────╯ ╭╭─
9.20 ┤ ╭╯╭╯ ╭───────────── ╭╭───────╭─
7.67 ┤ ╭─────╭─╯ ╭──────╯ ╭─────────────────────────────
6.13 ┤ ╭╯ │ ╭─────────╯ ╭─────╭╯╯────╯ │
4.60 ┤ │ ╭───╭╯ ╭─╭──────────────╯ ╭──────
3.07 ┤ ╭╯ ╭╯ ╭╯ ╭─────
1.53 ┤ │╭─╭────────╯╯ │
0.00 ┼───────────────────╯─╯───────────────────────────────╯
replica_moves
----
----
Expand Down
Loading

0 comments on commit fa33ff1

Please sign in to comment.