Skip to content

Commit

Permalink
Fix transient secret sync error handling (#4310)
Browse files Browse the repository at this point in the history
* Extra logging

Signed-off-by: Thomas Newton <[email protected]>

* Hopefully fix it

Signed-off-by: Thomas Newton <[email protected]>

* Remove debug prints

Signed-off-by: Thomas Newton <[email protected]>

* Reduce duplication

Signed-off-by: Thomas Newton <[email protected]>

* Tidy

Signed-off-by: Thomas Newton <[email protected]>

* Add a test

Signed-off-by: Thomas Newton <[email protected]>

* Use a CreateContainerConfigErrorGracePeriod

Signed-off-by: Thomas Newton <[email protected]>

* Add grace period to error message

Signed-off-by: Thomas Newton <[email protected]>

* Retryable failure with cleanup

Signed-off-by: Thomas Newton <[email protected]>

* Longer grace period

Signed-off-by: Thomas Newton <[email protected]>

* Fix test

Signed-off-by: Thomas Newton <[email protected]>

* Set default grace period to 0 to avoid changing default behaviour

Signed-off-by: Thomas Newton <[email protected]>

* Update comment

Signed-off-by: Thomas Newton <[email protected]>

* Use permanent error with clean up

Signed-off-by: Thomas Newton <[email protected]>

* Don't use cleanup

Signed-off-by: Thomas Newton <[email protected]>

* Fix test

Signed-off-by: Thomas Newton <[email protected]>

---------

Signed-off-by: Thomas Newton <[email protected]>
Co-authored-by: Dan Rammer <[email protected]>
  • Loading branch information
2 people authored and pvditt committed Dec 13, 2023
1 parent bcf8f9d commit a4b9424
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ var (
CreateContainerErrorGracePeriod: config2.Duration{
Duration: time.Minute * 3,
},
CreateContainerConfigErrorGracePeriod: config2.Duration{
Duration: time.Minute * 0,
},
ImagePullBackoffGracePeriod: config2.Duration{
Duration: time.Minute * 3,
},
Expand Down Expand Up @@ -139,6 +142,11 @@ type K8sPluginConfig struct {
// one, and the corresponding task marked as failed
CreateContainerErrorGracePeriod config2.Duration `json:"create-container-error-grace-period" pflag:"-,Time to wait for transient CreateContainerError errors to be resolved."`

// Time to wait for transient CreateContainerConfigError errors to be resolved. If the
// error persists past this grace period, it will be inferred to be a permanent error.
// The pod will be deleted, and the corresponding task marked as failed.
CreateContainerConfigErrorGracePeriod config2.Duration `json:"create-container-config-error-grace-period" pflag:"-,Time to wait for transient CreateContainerConfigError errors to be resolved."`

// Time to wait for transient ImagePullBackoff errors to be resolved. If the
// error persists past this grace period, it will be inferred to be a permanent
// one, and the corresponding task marked as failed
Expand Down
30 changes: 25 additions & 5 deletions flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -683,8 +683,9 @@ func demystifyPendingHelper(status v1.PodStatus) (pluginsCore.PhaseInfo, time.Ti
// synced, and therefore, only provides an
// approximation of the elapsed time since the last
// transition.
if time.Since(t) >= config.GetK8sPluginConfig().CreateContainerErrorGracePeriod.Duration {
return pluginsCore.PhaseInfoFailure(finalReason, finalMessage, &pluginsCore.TaskInfo{
gracePeriod := config.GetK8sPluginConfig().CreateContainerErrorGracePeriod.Duration
if time.Since(t) >= gracePeriod {
return pluginsCore.PhaseInfoFailure(finalReason, GetMessageAfterGracePeriod(finalMessage, gracePeriod), &pluginsCore.TaskInfo{
OccurredAt: &t,
}), t, nil
}
Expand All @@ -695,14 +696,29 @@ func demystifyPendingHelper(status v1.PodStatus) (pluginsCore.PhaseInfo, time.Ti
&pluginsCore.TaskInfo{OccurredAt: &t},
), t, nil

case "CreateContainerConfigError", "InvalidImageName":
case "CreateContainerConfigError":
gracePeriod := config.GetK8sPluginConfig().CreateContainerConfigErrorGracePeriod.Duration
if time.Since(t) >= gracePeriod {
return pluginsCore.PhaseInfoFailure(finalReason, GetMessageAfterGracePeriod(finalMessage, gracePeriod), &pluginsCore.TaskInfo{
OccurredAt: &t,
}), t, nil
}
return pluginsCore.PhaseInfoInitializing(
t,
pluginsCore.DefaultPhaseVersion,
fmt.Sprintf("[%s]: %s", finalReason, finalMessage),
&pluginsCore.TaskInfo{OccurredAt: &t},
), t, nil

case "InvalidImageName":
return pluginsCore.PhaseInfoFailure(finalReason, finalMessage, &pluginsCore.TaskInfo{
OccurredAt: &t,
}), t, nil

case "ImagePullBackOff":
if time.Since(t) >= config.GetK8sPluginConfig().ImagePullBackoffGracePeriod.Duration {
return pluginsCore.PhaseInfoRetryableFailureWithCleanup(finalReason, finalMessage, &pluginsCore.TaskInfo{
gracePeriod := config.GetK8sPluginConfig().ImagePullBackoffGracePeriod.Duration
if time.Since(t) >= gracePeriod {
return pluginsCore.PhaseInfoRetryableFailureWithCleanup(finalReason, GetMessageAfterGracePeriod(finalMessage, gracePeriod), &pluginsCore.TaskInfo{
OccurredAt: &t,
}), t, nil
}
Expand Down Expand Up @@ -734,6 +750,10 @@ func demystifyPendingHelper(status v1.PodStatus) (pluginsCore.PhaseInfo, time.Ti
return phaseInfo, t, nil
}

func GetMessageAfterGracePeriod(message string, gracePeriod time.Duration) string {
return fmt.Sprintf("Grace period [%s] exceeded|%s", gracePeriod, message)
}

func DemystifySuccess(status v1.PodStatus, info pluginsCore.TaskInfo) (pluginsCore.PhaseInfo, error) {
for _, status := range append(
append(status.InitContainerStatuses, status.ContainerStatuses...), status.EphemeralContainerStatuses...) {
Expand Down
34 changes: 29 additions & 5 deletions flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1175,6 +1175,9 @@ func TestDemystifyPending(t *testing.T) {
CreateContainerErrorGracePeriod: config1.Duration{
Duration: time.Minute * 3,
},
CreateContainerConfigErrorGracePeriod: config1.Duration{
Duration: time.Minute * 4,
},
ImagePullBackoffGracePeriod: config1.Duration{
Duration: time.Minute * 3,
},
Expand Down Expand Up @@ -1401,19 +1404,40 @@ func TestDemystifyPending(t *testing.T) {
assert.Equal(t, pluginsCore.PhaseRetryableFailure, taskStatus.Phase())
})

t.Run("CreateContainerConfigError", func(t *testing.T) {
s.ContainerStatuses = []v1.ContainerStatus{
t.Run("CreateContainerConfigErrorWithinGracePeriod", func(t *testing.T) {
s2 := *s.DeepCopy()
s2.Conditions[0].LastTransitionTime = metav1.Now()
s2.ContainerStatuses = []v1.ContainerStatus{
{
Ready: false,
State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{
Reason: "CreateContainerConfigError",
Message: "this an error",
Message: "this is a transient error",
},
},
},
}
taskStatus, err := DemystifyPending(s)
taskStatus, err := DemystifyPending(s2)
assert.NoError(t, err)
assert.Equal(t, pluginsCore.PhaseInitializing, taskStatus.Phase())
})

t.Run("CreateContainerConfigErrorOutsideGracePeriod", func(t *testing.T) {
s2 := *s.DeepCopy()
s2.Conditions[0].LastTransitionTime.Time = metav1.Now().Add(-config.GetK8sPluginConfig().CreateContainerConfigErrorGracePeriod.Duration)
s2.ContainerStatuses = []v1.ContainerStatus{
{
Ready: false,
State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{
Reason: "CreateContainerConfigError",
Message: "this a permanent error",
},
},
},
}
taskStatus, err := DemystifyPending(s2)
assert.NoError(t, err)
assert.Equal(t, pluginsCore.PhasePermanentFailure, taskStatus.Phase())
})
Expand Down Expand Up @@ -1618,7 +1642,7 @@ func TestDemystifyPending_testcases(t *testing.T) {
errCode string
message string
}{
{"ImagePullBackOff", "imagepull-failurepod.json", false, "ContainersNotReady|ImagePullBackOff", "containers with unready status: [fdf98e4ed2b524dc3bf7-get-flyte-id-task-0]|Back-off pulling image \"image\""},
{"ImagePullBackOff", "imagepull-failurepod.json", false, "ContainersNotReady|ImagePullBackOff", "Grace period [3m0s] exceeded|containers with unready status: [fdf98e4ed2b524dc3bf7-get-flyte-id-task-0]|Back-off pulling image \"image\""},
}

for _, tt := range tests {
Expand Down

0 comments on commit a4b9424

Please sign in to comment.