Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(TaskRun): fixed the issue where some step statuses might not be correctly updated in failed TaskRun #8270

Merged
merged 1 commit into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion pkg/pod/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func MakeTaskRunStatus(ctx context.Context, logger *zap.SugaredLogger, tr v1.Tas

sortPodContainerStatuses(pod.Status.ContainerStatuses, pod.Spec.Containers)

complete := areContainersCompleted(ctx, pod) || pod.Status.Phase == corev1.PodSucceeded || DidTaskRunFail(pod)
complete := areContainersCompleted(ctx, pod) || isPodCompleted(pod)

if complete {
onError, ok := tr.Annotations[v1.PipelineTaskOnErrorAnnotation]
Expand Down Expand Up @@ -636,6 +636,30 @@ func updateIncompleteTaskRunStatus(trs *v1.TaskRunStatus, pod *corev1.Pod) {
}
}

// isPodCompleted checks if the given pod is completed.
// A pod is considered completed if its phase is either "Succeeded" or "Failed".
//
// If it is foreseeable that the pod will eventually be in a failed state,
// but it remains in a Running status for a visible period of time, it should be considered completed in advance.
//
// For example, when certain steps encounter OOM, only the pods that have timed out will change to a failed state,
// we should consider them completed in advance.
func isPodCompleted(pod *corev1.Pod) bool {
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
return true
}
for _, s := range pod.Status.ContainerStatuses {
if IsContainerStep(s.Name) {
if s.State.Terminated != nil {
if isOOMKilled(s) {
return true
}
}
}
}
return false
}

// DidTaskRunFail check the status of pod to decide if related taskrun is failed
func DidTaskRunFail(pod *corev1.Pod) bool {
if pod.Status.Phase == corev1.PodFailed {
Expand Down
52 changes: 44 additions & 8 deletions pkg/pod/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,7 @@ func TestMakeTaskRunStatus(t *testing.T) {
}},
},
want: v1.TaskRunStatus{
Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-state-name\" exited with code 123"),
Status: statusRunning(),
TaskRunStatusFields: v1.TaskRunStatusFields{
Steps: []v1.StepState{{
ContainerState: corev1.ContainerState{
Expand All @@ -859,9 +859,7 @@ func TestMakeTaskRunStatus(t *testing.T) {
Container: "step-state-name",
}},
Sidecars: []v1.SidecarState{},
Artifacts: &v1.Artifacts{},
// We don't actually care about the time, just that it's not nil
CompletionTime: &metav1.Time{Time: time.Now()},
Artifacts: nil,
},
},
}, {
Expand All @@ -885,7 +883,7 @@ func TestMakeTaskRunStatus(t *testing.T) {
}},
},
want: v1.TaskRunStatus{
Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-state-name\" exited with code 123"),
Status: statusRunning(),
TaskRunStatusFields: v1.TaskRunStatusFields{
Steps: []v1.StepState{{
ContainerState: corev1.ContainerState{
Expand All @@ -898,9 +896,7 @@ func TestMakeTaskRunStatus(t *testing.T) {
ImageID: "image-id",
}},
Sidecars: []v1.SidecarState{},
Artifacts: &v1.Artifacts{},
// We don't actually care about the time, just that it's not nil
CompletionTime: &metav1.Time{Time: time.Now()},
Artifacts: nil,
},
},
}, {
Expand Down Expand Up @@ -1454,6 +1450,46 @@ func TestMakeTaskRunStatus(t *testing.T) {
CompletionTime: &metav1.Time{Time: time.Now()},
},
},
}, {
desc: "oom occurred in the pod",
podStatus: corev1.PodStatus{
Phase: corev1.PodRunning,
ContainerStatuses: []corev1.ContainerStatus{{
Name: "step-one",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
Reason: oomKilled,
ExitCode: 137,
},
},
}, {
Name: "step-two",
State: corev1.ContainerState{},
}},
},
want: v1.TaskRunStatus{
Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-one\" exited with code 137"),
TaskRunStatusFields: v1.TaskRunStatusFields{
Steps: []v1.StepState{{
ContainerState: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
Reason: oomKilled,
ExitCode: 137,
},
},
Name: "one",
Container: "step-one",
}, {
ContainerState: corev1.ContainerState{},
Name: "two",
Container: "step-two",
}},
Sidecars: []v1.SidecarState{},
Artifacts: &v1.Artifacts{},
// We don't actually care about the time, just that it's not nil
CompletionTime: &metav1.Time{Time: time.Now()},
},
},
}, {
desc: "the failed task show task results",
podStatus: corev1.PodStatus{
Expand Down