From f7efa40696457a49958d53edcc57a062fee9a220 Mon Sep 17 00:00:00 2001
From: Krisztian Litkey
Date: Thu, 26 Oct 2023 01:35:00 +0300
Subject: [PATCH] config-manager: add kludge/workaround for CRI-O.

It looks like in the case of CRI-O we need to give it some time after we have been started up but before we kick it in the head to restart it over D-Bus. Otherwise it will always report a 255 (-1) exit status for us. Since we run as an init-container, a non-zero exit status would prevent other containers in our pod from ever starting up.

It would be good to try and find out what is the exact reason why this behavior is exhibited by CRI-O but not containerd, and if it could be fixed in CRI-O. Until then... this.

Signed-off-by: Krisztian Litkey
(cherry picked from commit 8146d2f5204b59e69f799805c29806f942efbaad)
---
 cmd/config-manager/main.go | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/cmd/config-manager/main.go b/cmd/config-manager/main.go
index 0a2d93aaf..a8b1df630 100644
--- a/cmd/config-manager/main.go
+++ b/cmd/config-manager/main.go
@@ -22,6 +22,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"time"
 
 	"github.com/coreos/go-systemd/v22/dbus"
 	tomlv2 "github.com/pelletier/go-toml/v2"
@@ -62,6 +63,19 @@ func main() {
 		log.Fatalf("error enabling NRI: %v", err)
 	}
 
+	//
+	// TODO(klihub): Kludge warning...
+	// If the runtime is CRI-O, it looks like we need to cut it some
+	// slack, after we've been started up by it but before we restart
+	// it. Otherwise it always reports our exit status as -1 (255).
+	// We are an init-container so a non-zero exit status would prevent
+	// other containers in our pod from ever starting...
+	//
+
+	if unit == crioUnit {
+		time.Sleep(3 * time.Second)
+	}
+
 	if err = restartSystemdUnit(conn, unit); err != nil {
 		log.Fatalf("failed to restart %q unit: %v", unit, err)
 	}