diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 440737bd696..8817e20e029 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -499,6 +499,8 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ hc.mu.Lock() defer hc.mu.Unlock() + log.Infof("updating tablet health: trivialUpdate: %v, up: %v, target: %v; tablet: %v; serving: %v", trivialUpdate, up, th.Target, th.Tablet, th.Serving) + tabletAlias := tabletAliasString(topoproto.TabletAliasString(th.Tablet.Alias)) // let's be sure that this tablet hasn't been deleted from the authoritative map // so that we're not racing to update it and in effect re-adding a copy of the @@ -516,6 +518,9 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ // keyspace and shard are not expected to change, but just in case ... // move this tabletHealthCheck to the correct map oldTargetKey := KeyFromTarget(prevTarget) + + log.Infof("deleting tablet %v from health stats", th.Tablet) + delete(hc.healthData[oldTargetKey], tabletAlias) _, ok := hc.healthData[targetKey] if !ok { @@ -554,6 +559,7 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ alias := tabletAliasString(topoproto.TabletAliasString(healthy[0].Tablet.Alias)) // Clear healthy list for primary if the existing tablet is down if alias == tabletAlias { + log.Warningf("Removing tablet %v from the healthy map.", tabletAlias) hc.healthy[targetKey] = []*TabletHealth{} } } @@ -563,6 +569,7 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ // We re-sort the healthy tablet list whenever we get a health update for tablets we can route to. // Tablets from other cells for non-primary targets should not trigger a re-sort; // they should also be excluded from healthy list. + log.Infof("Recomputing tablet healthy stats for %v", th.Tablet) if th.Target.TabletType != topodata.TabletType_PRIMARY && hc.isIncluded(th.Target.TabletType, th.Tablet.Alias) { hc.recomputeHealthy(targetKey) } diff --git a/go/vt/discovery/tablet_health_check.go b/go/vt/discovery/tablet_health_check.go index 05ab47dee05..cd14cc6df88 100644 --- a/go/vt/discovery/tablet_health_check.go +++ b/go/vt/discovery/tablet_health_check.go @@ -224,6 +224,8 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St } thc.setServingState(serving, reason) + log.Infof("healthcheck update for tablet %v: serving: %v, reason: %s", thc.Tablet, thc.Serving, reason) + // notify downstream for primary change hc.updateHealth(thc.SimpleCopy(), prevTarget, trivialUpdate, thc.Serving) return nil @@ -326,6 +328,7 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { } // trivialUpdate = false because this is an error // up = false because we did not get a healthy response + log.Errorf("healthcheck got error for tablet %v : %v", thc.Tablet, err.Error()) hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false) } // If there was a timeout send an error. We do this after stream has returned. @@ -337,6 +340,7 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) { hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1) // trivialUpdate = false because this is an error // up = false because we did not get a healthy response within the timeout + log.Warningf("healthcheck timed out for tablet %v", thc.Tablet) hc.updateHealth(thc.SimpleCopy(), thc.Target, false, false) }