Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport 13856 - resharding error fix #432

Merged
merged 3 commits into from
Jul 1, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
VTGate Buffering: Use a more accurate heuristic for determining if we…
…'re doing a reshard (vitessio#13856)

Signed-off-by: Matt Lord <[email protected]>
mattlord authored and tanjinx committed Jun 28, 2024
commit 31bb813516d50bd9874e2e9f26a966228943fb8c
1 change: 1 addition & 0 deletions examples/local/scripts/vtgate-up.sh
Original file line number Diff line number Diff line change
@@ -39,6 +39,7 @@ vtgate \
--tablet_types_to_wait PRIMARY,REPLICA \
--service_map 'grpc-vtgateservice' \
--pid_file $VTDATAROOT/tmp/vtgate.pid \
--enable_buffer \
--mysql_auth_server_impl none \
> $VTDATAROOT/tmp/vtgate.out 2>&1 &

37 changes: 25 additions & 12 deletions go/vt/discovery/keyspace_events.go
Original file line number Diff line number Diff line change
@@ -23,12 +23,14 @@ import (

"google.golang.org/protobuf/proto"

"vitess.io/vitess/go/vt/key"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/srvtopo"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

// KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents
@@ -65,7 +67,7 @@ type KeyspaceEvent struct {

type ShardEvent struct {
Tablet *topodatapb.TabletAlias
Target *query.Target
Target *querypb.Target
Serving bool
}

@@ -124,17 +126,27 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
kss.mu.Lock()
defer kss.mu.Unlock()

// if the keyspace is gone, or if it has no known availability events, the keyspace
// cannot be in the middle of a resharding operation
// If the keyspace is gone, has no known availability events, or is in the middle of a
// MoveTables then the keyspace cannot be in the middle of a resharding operation.
if kss.deleted || kss.consistent {
return false
}

// for all the known shards, try to find a primary shard besides the one we're trying to access
// and which is currently healthy. if there are other healthy primaries in the keyspace, it means
// we're in the middle of a resharding operation
// If there are unequal and overlapping shards in the keyspace and any of them are
// currently serving then we assume that we are in the middle of a Reshard.
_, ckr, err := topo.ValidateShardName(currentShard)
if err != nil || ckr == nil { // Assume not and avoid potential panic
return false
}
for shard, sstate := range kss.shards {
if shard != currentShard && sstate.serving {
if !sstate.serving || shard == currentShard {
continue
}
_, skr, err := topo.ValidateShardName(shard)
if err != nil || skr == nil { // Assume not and avoid potential panic
return false
}
if key.KeyRangesIntersect(ckr, skr) {
return true
}
}
@@ -143,7 +155,7 @@ func (kss *keyspaceState) beingResharded(currentShard string) bool {
}

type shardState struct {
target *query.Target
target *querypb.Target
serving bool
externallyReparented int64
currentPrimary *topodatapb.TabletAlias
@@ -426,7 +438,7 @@ func (kew *KeyspaceEventWatcher) getKeyspaceStatus(keyspace string) *keyspaceSta
// This is not a fully accurate heuristic, but it's good enough that we'd want to buffer the
// request for the given target under the assumption that the reason why it cannot be completed
// right now is transitory.
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bool {
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *querypb.Target) bool {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return false
}
@@ -446,7 +458,8 @@ func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bo
// The shard state keeps track of the current primary and the last externally reparented time, which we can use
// to determine that there was a serving primary which now became non serving. This is only possible in a DemotePrimary
// RPC which are only called from ERS and PRS. So buffering will stop when these operations succeed.
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *query.Target) bool {
// We return the tablet alias of the primary if it is serving.
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *querypb.Target) bool {
if target.TabletType != topodatapb.TabletType_PRIMARY {
return false
}
Loading