From 6d5fd6df49e6c001ce9930ee631936b2619f0d59 Mon Sep 17 00:00:00 2001 From: Marco Iorio Date: Mon, 18 Nov 2024 15:24:24 +0100 Subject: [PATCH] status: don't treat kvstore as ready during initialization Currently, the kvstore status is considered ready even during the initialization phase, when the connection has not been established yet. This behavior is potentially confusing, as the agents/operator eventually turn ready, breaking out of the startup probe phase, even if an essential subsystem is not ready at all, only to start crashing a few minutes later due to other timeouts kicking in. Let's modify this behavior so that the kvstore subsystem, and in turn the Cilium agents, are marked ready only after successfully establishing the connection to etcd. However, we enable this behavior only if support for running etcd in pod network is disabled. Otherwise, we need to preserve the previous behavior, to break the chicken-and-egg dependency during startup. Signed-off-by: Marco Iorio --- daemon/cmd/status.go | 8 ++++++++ operator/api/health.go | 8 +++++++- pkg/kvstore/etcd.go | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/daemon/cmd/status.go b/daemon/cmd/status.go index 4ceeddf473f55..df76d29d058ae 100644 --- a/daemon/cmd/status.go +++ b/daemon/cmd/status.go @@ -596,6 +596,14 @@ func (d *Daemon) startStatusCollector(cleaner *daemonCleanup) { } if kvstore, ok := status.Data.(*models.Status); ok { + if kvstore.State == models.StatusStateWarning && option.Config.KVstorePodNetworkSupport { + // Don't treat warnings as errors when the support for running + // etcd in pod network is enabled. This is necessary to allow + // Cilium turning ready even before connecting to the kvstore, + // and break the chicken-and-egg dependency during startup. 
+ kvstore.State = models.StatusStateOk + } + d.statusResponse.Kvstore = kvstore } }, diff --git a/operator/api/health.go b/operator/api/health.go index 8ce23cd108a57..e4d74b792dc90 100644 --- a/operator/api/health.go +++ b/operator/api/health.go @@ -16,6 +16,7 @@ import ( k8sClient "github.com/cilium/cilium/pkg/k8s/client" "github.com/cilium/cilium/pkg/kvstore" "github.com/cilium/cilium/pkg/logging/logfields" + "github.com/cilium/cilium/pkg/option" ) type kvstoreEnabledFunc func() bool @@ -86,7 +87,12 @@ func (h *healthHandler) checkStatus() error { } status := client.Status() - if status.State != models.StatusStateOk { + if status.State != models.StatusStateOk && + // Don't treat warnings as errors when the support for running + // etcd in pod network is enabled. This is necessary to allow + // Cilium turning ready even before connecting to the kvstore, + // and break the chicken-and-egg dependency during startup. + !(status.State == models.StatusStateWarning && option.Config.KVstorePodNetworkSupport) { return errors.New(status.Msg) } } diff --git a/pkg/kvstore/etcd.go b/pkg/kvstore/etcd.go index 2fd8ef0469015..75dde84877d80 100644 --- a/pkg/kvstore/etcd.go +++ b/pkg/kvstore/etcd.go @@ -544,7 +544,7 @@ func connectEtcdClient(ctx context.Context, config *client.Config, cfgPath strin configPath: cfgPath, firstSession: make(chan struct{}), status: models.Status{ - State: models.StatusStateOk, + State: models.StatusStateWarning, Msg: "Waiting for initial connection to be established", }, stopStatusChecker: make(chan struct{}),