Skip to content

Commit

Permalink
Monitor and heal master replica connections (#43)
Browse files Browse the repository at this point in the history
When a RedisFailover's "master" Redis node is being replicated to a
"slave" Redis node that is NOT part of the RedisFailover the
redis-operator resets the sentinels indefinitely.

Consider this scenario, the RedisFailover is being replicated
asynchronously to a warm standby Redis cluster in a different data
center to handle primary data site outages. Usually we'd configure the
secondary site to replicate from the "slave" nodes of the Primary site's
RedisFailover. However, if a failover occurs in the primary data site,
it's possible that the "slave" to which the secondary site is connected
to and replicating from is promoated to the Primary site's new master.
When this happens, sentinel picks up the secondary site's replication
connections and adds them to the list of replicas to consider for leader
election. Thankfully, the operator prevents the sentinels from
communicating with any pods that it ought NOT consider for leader
election, so failovers still behave as expected. However, this causes
the redis-operator to detect that the sentinels are trying to monitor
replicas that they shouldn't and calls `SENTINEL RESET` to clear any
stale replica entries form the sentinel. The secondary site is still
replicating from the newly promoted master so the secondary site's
replication connections are added back to the sentinel replicas list
when the sentinel calls `INFO` on the primary site's "master"; repeating
the reset cycle indefinitely.

This change assumes that any replication not immediately meant to be
managed by the RedisFailover should connect via the RedisFailover's
"slave" nodes; the operator provides services to reach these nodes. When
the operator detects that the master node has replication connections
that would otherwise confuse the sentinel's leader election, it attempts
to clean stale replication connections by resetting them; forcing
replication clients to re-establish connections to a "slave" node in the
primary site rather than the master.
  • Loading branch information
indiebrain authored Feb 13, 2024
1 parent 55611bc commit 75824b3
Show file tree
Hide file tree
Showing 12 changed files with 250 additions and 15 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ Also check this project's [releases](https://github.com/powerhome/redis-operator

## Unreleased

### Fixed

- Operator detects and attempts to heal excessive replication connections on the master node. This prevents excessive sentinel resets from the operator when extra-RedisFailvoer replication connnections are present on the "slave" nodes. #43

## [v2.0.1] - 2024-02-09

### Fixed
Expand All @@ -25,7 +29,7 @@ This update modifies how the operator generates network policies. In version v2.

Update notes:

This release will change the labels of the HAProxy deployment resource.
This release will change the labels of the HAProxy deployment resource.
It's important to note that in API version apps/v1, a Deployment's label selector [cannot be changed once it's created](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#label-selector-updates). Therefore, any existing HAProxy deployment placed by an <v2.0.0 version of the redis-operator MUST be deleted so the new deployment with the correct labels and selectors can be recreated by redis-operator v2.0.0+

## [v1.8.0] - 2024-01-16
Expand Down
2 changes: 2 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const (
MISC = "MISC_ERROR"
SENTINEL_NUMBER_IN_MEMORY_MISMATCH = "SENTINEL_NUMBER_IN_MEMORY_MISMATCH"
REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH = "REDIS_SLAVES_NUMBER_IN_MEMORY_MISMATCH"
REDIS_SLAVES_NUMBER_CONNECTED_MISMATCH = "REDIS_SLAVES_NUMBER_CONNECTED_MISMATCH"
// redis connection related errors
WRONG_PASSWORD_USED = "WRONG_PASSWORD_USED"
NOAUTH = "AUTH_CREDENTIALS_NOT_PROVIDED"
Expand Down Expand Up @@ -69,6 +70,7 @@ const (
GET_SENTINEL_MONITOR = "SENTINEL_GET_MASTER_INSTANCE"
CHECK_SENTINEL_QUORUM = "SENTINEL_CKQUORUM"
SLAVE_IS_READY = "CHECK_IF_SLAVE_IS_READY"
RESET_REPLICA_CONNECTIONS = "RESET_REPLICA_CONNECTIONS"
)

var ( // used for grabage collection of metrics
Expand Down
14 changes: 14 additions & 0 deletions mocks/operator/redisfailover/service/RedisFailoverCheck.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions mocks/operator/redisfailover/service/RedisFailoverHeal.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 38 additions & 0 deletions mocks/service/redis/Client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions operator/redisfailover/checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,15 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
}
}

err = r.rfChecker.CheckNumberRedisConnectedSlaves(master, rf)
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.REDIS_SLAVES_NUMBER_CONNECTED_MISMATCH, metrics.NOT_APPLICABLE, err)
if err != nil {
r.logger.WithField("redisfailover", rf.ObjectMeta.Name).WithField("namespace", rf.ObjectMeta.Namespace).Warningf("Master has wrong number of slaves: %s", err.Error())
if err = r.rfHealer.ResetReplicaConnections(master, rf); err != nil {
return err
}
}

err = r.applyRedisCustomConfig(rf)
setRedisCheckerMetrics(r.mClient, "redis", rf.Namespace, rf.Name, metrics.APPLY_REDIS_CONFIG, metrics.NOT_APPLICABLE, err)
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions operator/redisfailover/checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,11 @@ func TestCheckAndHeal(t *testing.T) {
mrfc.On("GetMasterIP", rf).Twice().Return(master, nil)
if test.slavesOK {
mrfc.On("CheckAllSlavesFromMaster", master, rf).Once().Return(nil)
mrfc.On("CheckNumberRedisConnectedSlaves", master, rf).Once().Return(nil)
} else {
mrfc.On("CheckAllSlavesFromMaster", master, rf).Once().Return(errors.New(""))
mrfc.On("CheckNumberRedisConnectedSlaves", master, rf).Once().Return(errors.New(""))
mrfh.On("ResetReplicaConnections", master, rf).Once().Return(nil)
if test.redisSetMasterOnAllOK {
mrfh.On("SetMasterOnAll", master, rf).Once().Return(nil)
} else {
Expand Down
13 changes: 13 additions & 0 deletions operator/redisfailover/service/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ type RedisFailoverCheck interface {
CheckSentinelNumber(rFailover *redisfailoverv1.RedisFailover) error
CheckAllSlavesFromMaster(master string, rFailover *redisfailoverv1.RedisFailover) error
CheckSentinelNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error
CheckNumberRedisConnectedSlaves(masterIP string, rFailover *redisfailoverv1.RedisFailover) error
CheckSentinelSlavesNumberInMemory(sentinel string, rFailover *redisfailoverv1.RedisFailover) error
CheckSentinelQuorum(rFailover *redisfailoverv1.RedisFailover) (int, error)
CheckIfMasterLocalhost(rFailover *redisfailoverv1.RedisFailover) (bool, error)
Expand Down Expand Up @@ -248,7 +249,19 @@ func (r *RedisFailoverChecker) CheckSentinelSlavesNumberInMemory(sentinel string
}
}
return nil
}

func (r *RedisFailoverChecker) CheckNumberRedisConnectedSlaves(masterIP string, rf *redisfailoverv1.RedisFailover) error {
portString := rf.Spec.Redis.Port.ToString()
nSlaves, err := r.redisClient.GetNumberRedisConnectedSlaves(masterIP, portString)
if err != nil {
return err
} else {
if nSlaves != rf.Spec.Redis.Replicas-1 {
return errors.New("redis number of slaves mismatch")
}
}
return nil
}

// CheckSentinelMonitor controls if the sentinels are monitoring the expected master
Expand Down
44 changes: 44 additions & 0 deletions operator/redisfailover/service/check_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,50 @@ func TestCheckSentinelMonitorWithPortIPMismatch(t *testing.T) {
assert.Error(err)
}

func TestCheckNumberRedisConnectedSlavesGeConnectedSlavesNumberError(t *testing.T) {
assert := assert.New(t)

rf := generateRF()

ms := &mK8SService.Services{}
mr := &mRedisService.Client{}
mr.On("GetNumberRedisConnectedSlaves", "1.1.1.1", "0").Once().Return(int32(0), errors.New("expected error"))

checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy)

err := checker.CheckNumberRedisConnectedSlaves("1.1.1.1", rf)
assert.Error(err)
}

func TestCheckNumberRedisConnectedSlavesGeConnectedSlavesNumberMismatch(t *testing.T) {
assert := assert.New(t)

rf := generateRF()

ms := &mK8SService.Services{}
mr := &mRedisService.Client{}
mr.On("GetNumberRedisConnectedSlaves", "1.1.1.1", "0").Once().Return(int32(rf.Spec.Redis.Replicas+1), nil)

checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy)

err := checker.CheckNumberRedisConnectedSlaves("1.1.1.1", rf)
assert.Error(err)
}

func TestCheckNumberRedisConnectedSlaves(t *testing.T) {
assert := assert.New(t)
rf := generateRF()

ms := &mK8SService.Services{}
mr := &mRedisService.Client{}
mr.On("GetNumberRedisConnectedSlaves", "1.1.1.1", "0").Once().Return(rf.Spec.Redis.Replicas-1, nil)

checker := rfservice.NewRedisFailoverChecker(ms, mr, log.DummyLogger{}, metrics.Dummy)

err := checker.CheckNumberRedisConnectedSlaves("1.1.1.1", rf)
assert.NoError(err)
}

func TestGetMasterIPGetStatefulSetPodsError(t *testing.T) {
assert := assert.New(t)

Expand Down
16 changes: 16 additions & 0 deletions operator/redisfailover/service/heal.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
// RedisFailoverHeal defines the interface able to fix the problems on the redis failovers
type RedisFailoverHeal interface {
MakeMaster(ip string, rFailover *redisfailoverv1.RedisFailover) error
ResetReplicaConnections(ip string, rFailover *redisfailoverv1.RedisFailover) error
SetOldestAsMaster(rFailover *redisfailoverv1.RedisFailover) error
SetMasterOnAll(masterIP string, rFailover *redisfailoverv1.RedisFailover) error
SetExternalMasterOnAll(masterIP string, masterPort string, rFailover *redisfailoverv1.RedisFailover) error
Expand Down Expand Up @@ -85,6 +86,21 @@ func (r *RedisFailoverHealer) MakeMaster(ip string, rf *redisfailoverv1.RedisFai
return nil
}

func (r *RedisFailoverHealer) ResetReplicaConnections(ip string, rf *redisfailoverv1.RedisFailover) error {
password, err := k8s.GetRedisPassword(r.k8sService, rf)
if err != nil {
return err
}

port := rf.Spec.Redis.Port.ToString()
err = r.redisClient.ResetReplicaConnections(ip, port, password)
if err != nil {
return err
}

return nil
}

// SetOldestAsMaster puts all redis to the same master, choosen by order of appearance
func (r *RedisFailoverHealer) SetOldestAsMaster(rf *redisfailoverv1.RedisFailover) error {
ssp, err := r.k8sService.GetStatefulSetPods(rf.Namespace, GetRedisName(rf))
Expand Down
28 changes: 28 additions & 0 deletions operator/redisfailover/service/heal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,34 @@ import (
rfservice "github.com/spotahome/redis-operator/operator/redisfailover/service"
)

func TestResetReplicaConnectionsError(t *testing.T) {
assert := assert.New(t)
rf := generateRF()

ms := &mK8SService.Services{}
mr := &mRedisService.Client{}
mr.On("ResetReplicaConnections", "0.0.0.0", "0", "").Once().Return(errors.New(""))

healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{})

err := healer.ResetReplicaConnections("0.0.0.0", rf)
assert.Error(err)
}

func TestResetReplicaConnections(t *testing.T) {
assert := assert.New(t)
rf := generateRF()

ms := &mK8SService.Services{}
mr := &mRedisService.Client{}
mr.On("ResetReplicaConnections", "0.0.0.0", "0", "").Once().Return(nil)

healer := rfservice.NewRedisFailoverHealer(ms, mr, log.DummyLogger{})

err := healer.ResetReplicaConnections("0.0.0.0", rf)
assert.NoError(err)
}

func TestSetOldestAsMasterNewMasterError(t *testing.T) {
assert := assert.New(t)

Expand Down
Loading

0 comments on commit 75824b3

Please sign in to comment.