From 4ff99afa1d38c079a57b5a5922c27c1d1afae886 Mon Sep 17 00:00:00 2001 From: qingliu Date: Tue, 17 Sep 2024 10:48:42 +0800 Subject: [PATCH] fix(TaskRun): fixed the issue where some step statuses might not be correctly updated in failed TaskRun Previously, if the Pod cache is not updated in time, a `TaskRun` would be immediately marked as failed after a certain step fails. When the pod status changes to failed, the TaskRun at that time still has a completed status and will not enter the reconcile logic to update the status of each step. Currently, TaskRun will only fail prematurely in the event of an OOM. If a specific step exits abnormally, it will wait for the Pod status to ultimately change to failed before synchronizing the status of each step. --- pkg/pod/status.go | 26 ++++++++++++++++++++- pkg/pod/status_test.go | 52 +++++++++++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/pkg/pod/status.go b/pkg/pod/status.go index dbb87f420c8..7381b216df4 100644 --- a/pkg/pod/status.go +++ b/pkg/pod/status.go @@ -125,7 +125,7 @@ func MakeTaskRunStatus(ctx context.Context, logger *zap.SugaredLogger, tr v1.Tas sortPodContainerStatuses(pod.Status.ContainerStatuses, pod.Spec.Containers) - complete := areContainersCompleted(ctx, pod) || pod.Status.Phase == corev1.PodSucceeded || DidTaskRunFail(pod) + complete := areContainersCompleted(ctx, pod) || isPodCompleted(pod) if complete { onError, ok := tr.Annotations[v1.PipelineTaskOnErrorAnnotation] @@ -636,6 +636,30 @@ func updateIncompleteTaskRunStatus(trs *v1.TaskRunStatus, pod *corev1.Pod) { } } +// isPodCompleted checks if the given pod is completed. +// A pod is considered completed if its phase is either "Succeeded" or "Failed". +// +// If it is foreseeable that the pod will eventually be in a failed state, +// but it remains in a Running status for a visible period of time, it should be considered completed in advance. +// +// For example, when certain steps encounter OOM, only the pods that have timed out will change to a failed state, +// we should consider them completed in advance. +func isPodCompleted(pod *corev1.Pod) bool { + if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed { + return true + } + for _, s := range pod.Status.ContainerStatuses { + if IsContainerStep(s.Name) { + if s.State.Terminated != nil { + if isOOMKilled(s) { + return true + } + } + } + } + return false +} + // DidTaskRunFail check the status of pod to decide if related taskrun is failed func DidTaskRunFail(pod *corev1.Pod) bool { if pod.Status.Phase == corev1.PodFailed { diff --git a/pkg/pod/status_test.go b/pkg/pod/status_test.go index a963cf6aa92..eb5a4695a33 100644 --- a/pkg/pod/status_test.go +++ b/pkg/pod/status_test.go @@ -847,7 +847,7 @@ func TestMakeTaskRunStatus(t *testing.T) { }}, }, want: v1.TaskRunStatus{ - Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-state-name\" exited with code 123"), + Status: statusRunning(), TaskRunStatusFields: v1.TaskRunStatusFields{ Steps: []v1.StepState{{ ContainerState: corev1.ContainerState{ @@ -859,9 +859,7 @@ func TestMakeTaskRunStatus(t *testing.T) { Container: "step-state-name", }}, Sidecars: []v1.SidecarState{}, - Artifacts: &v1.Artifacts{}, - // We don't actually care about the time, just that it's not nil - CompletionTime: &metav1.Time{Time: time.Now()}, + Artifacts: nil, }, }, }, { @@ -885,7 +883,7 @@ func TestMakeTaskRunStatus(t *testing.T) { }}, }, want: v1.TaskRunStatus{ - Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-state-name\" exited with code 123"), + Status: statusRunning(), TaskRunStatusFields: v1.TaskRunStatusFields{ Steps: []v1.StepState{{ ContainerState: corev1.ContainerState{ @@ -898,9 +896,7 @@ func TestMakeTaskRunStatus(t *testing.T) { ImageID: "image-id", }}, Sidecars: []v1.SidecarState{}, - Artifacts: &v1.Artifacts{}, - // We don't actually care about the time, just that it's not nil - CompletionTime: &metav1.Time{Time: time.Now()}, + Artifacts: nil, }, }, }, { @@ -1454,6 +1450,46 @@ func TestMakeTaskRunStatus(t *testing.T) { CompletionTime: &metav1.Time{Time: time.Now()}, }, }, + }, { + desc: "oom occurred in the pod", + podStatus: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{{ + Name: "step-one", + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + Reason: oomKilled, + ExitCode: 137, + }, + }, + }, { + Name: "step-two", + State: corev1.ContainerState{}, + }}, + }, + want: v1.TaskRunStatus{ + Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-one\" exited with code 137"), + TaskRunStatusFields: v1.TaskRunStatusFields{ + Steps: []v1.StepState{{ + ContainerState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + Reason: oomKilled, + ExitCode: 137, + }, + }, + Name: "one", + Container: "step-one", + }, { + ContainerState: corev1.ContainerState{}, + Name: "two", + Container: "step-two", + }}, + Sidecars: []v1.SidecarState{}, + Artifacts: &v1.Artifacts{}, + // We don't actually care about the time, just that it's not nil + CompletionTime: &metav1.Time{Time: time.Now()}, + }, + }, }, { desc: "the failed task show task results", podStatus: corev1.PodStatus{