Skip to content

Commit

Permalink
fix(TaskRun): fixed the issue where some step statuses might not be c…
Browse files Browse the repository at this point in the history
…orrectly updated in failed TaskRun

Previously, if the Pod cache is not updated in time, a `TaskRun` would be
immediately marked as failed after a certain step fails. When the pod status
changes to failed, the TaskRun at that time still has a completed status and
will not enter the reconcile logic to update the status of each step.

Currently, TaskRun will only fail prematurely in the event of an OOM.
If a specific step exits abnormally, it will wait for the Pod status to
ultimately change to failed before synchronizing the status of each step.
  • Loading branch information
l-qing committed Sep 17, 2024
1 parent 1597151 commit 5d66369
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 9 deletions.
26 changes: 25 additions & 1 deletion pkg/pod/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func MakeTaskRunStatus(ctx context.Context, logger *zap.SugaredLogger, tr v1.Tas

sortPodContainerStatuses(pod.Status.ContainerStatuses, pod.Spec.Containers)

complete := areContainersCompleted(ctx, pod) || pod.Status.Phase == corev1.PodSucceeded || DidTaskRunFail(pod)
complete := areContainersCompleted(ctx, pod) || isPodCompleted(pod)

if complete {
onError, ok := tr.Annotations[v1.PipelineTaskOnErrorAnnotation]
Expand Down Expand Up @@ -636,6 +636,30 @@ func updateIncompleteTaskRunStatus(trs *v1.TaskRunStatus, pod *corev1.Pod) {
}
}

// isPodCompleted checks if the given pod is completed.
// A pod is considered completed if its phase is either "Succeeded" or "Failed".
//
// If it is foreseeable that the pod will eventually be in a failed state,
// but it remains in a Running status for a visible period of time, it should be considered completed in advance.
//
// For example, when certain steps encounter OOM, only the pods that have timed out will change to a failed state,
// we should consider them completed in advance.
func isPodCompleted(pod *corev1.Pod) bool {
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
return true
}
for _, s := range pod.Status.ContainerStatuses {
if IsContainerStep(s.Name) {
if s.State.Terminated != nil {
if isOOMKilled(s) {
return true
}
}
}
}
return false
}

// DidTaskRunFail check the status of pod to decide if related taskrun is failed
func DidTaskRunFail(pod *corev1.Pod) bool {
if pod.Status.Phase == corev1.PodFailed {
Expand Down
52 changes: 44 additions & 8 deletions pkg/pod/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,7 @@ func TestMakeTaskRunStatus(t *testing.T) {
}},
},
want: v1.TaskRunStatus{
Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-state-name\" exited with code 123"),
Status: statusRunning(),
TaskRunStatusFields: v1.TaskRunStatusFields{
Steps: []v1.StepState{{
ContainerState: corev1.ContainerState{
Expand All @@ -859,9 +859,7 @@ func TestMakeTaskRunStatus(t *testing.T) {
Container: "step-state-name",
}},
Sidecars: []v1.SidecarState{},
Artifacts: &v1.Artifacts{},
// We don't actually care about the time, just that it's not nil
CompletionTime: &metav1.Time{Time: time.Now()},
Artifacts: nil,
},
},
}, {
Expand All @@ -885,7 +883,7 @@ func TestMakeTaskRunStatus(t *testing.T) {
}},
},
want: v1.TaskRunStatus{
Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-state-name\" exited with code 123"),
Status: statusRunning(),
TaskRunStatusFields: v1.TaskRunStatusFields{
Steps: []v1.StepState{{
ContainerState: corev1.ContainerState{
Expand All @@ -898,9 +896,7 @@ func TestMakeTaskRunStatus(t *testing.T) {
ImageID: "image-id",
}},
Sidecars: []v1.SidecarState{},
Artifacts: &v1.Artifacts{},
// We don't actually care about the time, just that it's not nil
CompletionTime: &metav1.Time{Time: time.Now()},
Artifacts: nil,
},
},
}, {
Expand Down Expand Up @@ -1454,6 +1450,46 @@ func TestMakeTaskRunStatus(t *testing.T) {
CompletionTime: &metav1.Time{Time: time.Now()},
},
},
}, {
desc: "oom occurred in the pod",
podStatus: corev1.PodStatus{
Phase: corev1.PodRunning,
ContainerStatuses: []corev1.ContainerStatus{{
Name: "step-one",
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
Reason: oomKilled,
ExitCode: 137,
},
},
}, {
Name: "step-two",
State: corev1.ContainerState{},
}},
},
want: v1.TaskRunStatus{
Status: statusFailure(v1.TaskRunReasonFailed.String(), "\"step-one\" exited with code 137"),
TaskRunStatusFields: v1.TaskRunStatusFields{
Steps: []v1.StepState{{
ContainerState: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
Reason: oomKilled,
ExitCode: 137,
},
},
Name: "one",
Container: "step-one",
}, {
ContainerState: corev1.ContainerState{},
Name: "two",
Container: "step-two",
}},
Sidecars: []v1.SidecarState{},
Artifacts: &v1.Artifacts{},
// We don't actually care about the time, just that it's not nil
CompletionTime: &metav1.Time{Time: time.Now()},
},
},
}, {
desc: "the failed task show task results",
podStatus: corev1.PodStatus{
Expand Down

0 comments on commit 5d66369

Please sign in to comment.