From cfe2bb791d0e769fbe0536bc7042b4c7ff33f987 Mon Sep 17 00:00:00 2001 From: shuangkun tian <72060326+shuangkun@users.noreply.github.com> Date: Mon, 25 Mar 2024 15:19:07 +0800 Subject: [PATCH] fix(containerSet): mark container deleted when pod deleted. Fixes: #12210 (#12756) Signed-off-by: shuangkun --- workflow/controller/operator.go | 13 +++- workflow/controller/operator_test.go | 89 ++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 4d8af7777ccd..8edda5a5f561 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1223,7 +1223,18 @@ func (woc *wfOperationCtx) podReconciliation(ctx context.Context) (error, bool) node.Daemoned = nil woc.updated = true } - woc.markNodePhase(node.Name, wfv1.NodeError, "pod deleted") + woc.markNodeError(node.Name, errors.New("", "pod deleted")) + // Set pod's child(container) error if pod deleted + for _, childNodeID := range node.Children { + childNode, err := woc.wf.Status.Nodes.Get(childNodeID) + if err != nil { + woc.log.Errorf("was unable to obtain node for %s", childNodeID) + continue + } + if childNode.Type == wfv1.NodeTypeContainer { + woc.markNodeError(childNode.Name, errors.New("", "container deleted")) + } + } } } return nil, !taskResultIncomplete diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index c9c7227322fc..3d27e3b7782a 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -10974,3 +10974,92 @@ status: woc.operate(ctx) assert.Equal(t, wfv1.WorkflowSucceeded, woc.wf.Status.Phase) } + +var wfHasContainerSet = ` +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + name: lovely-rhino +spec: + templates: + - name: init + dag: + tasks: + - name: A + template: run + arguments: {} + - name: run + containerSet: + containers: + - name: main + image: alpine:latest + command: + - /bin/sh + args: + - '-c' + - sleep 9000 + resources: {} + - name: main2 + image: alpine:latest + command: + - /bin/sh + args: + - '-c' + - sleep 9000 + resources: {} + entrypoint: init + arguments: {} + ttlStrategy: + secondsAfterCompletion: 300 + podGC: + strategy: OnPodCompletion` + +// TestContainerSetDeleteContainerWhenPodDeleted test whether a workflow has ContainerSet error when pod deleted. +func TestContainerSetDeleteContainerWhenPodDeleted(t *testing.T) { + cancel, controller := newController() + defer cancel() + ctx := context.Background() + wfcset := controller.wfclientset.ArgoprojV1alpha1().Workflows("") + wf := wfv1.MustUnmarshalWorkflow(wfHasContainerSet) + wf, err := wfcset.Create(ctx, wf, metav1.CreateOptions{}) + assert.Nil(t, err) + wf, err = wfcset.Get(ctx, wf.ObjectMeta.Name, metav1.GetOptions{}) + assert.Nil(t, err) + woc := newWorkflowOperationCtx(wf, controller) + woc.operate(ctx) + pods, err := listPods(woc) + assert.Nil(t, err) + assert.Equal(t, 1, len(pods.Items)) + + // mark pod Running + makePodsPhase(ctx, woc, apiv1.PodRunning) + woc = newWorkflowOperationCtx(woc.wf, controller) + woc.operate(ctx) + for _, node := range woc.wf.Status.Nodes { + if node.Type == wfv1.NodeTypePod { + assert.Equal(t, wfv1.NodeRunning, node.Phase) + } + } + + // TODO: Refactor to use local-scoped env vars in test to avoid long wait. See https://github.com/argoproj/argo-workflows/pull/12756#discussion_r1530245007 + // delete pod + time.Sleep(10 * time.Second) + deletePods(ctx, woc) + pods, err = listPods(woc) + assert.Nil(t, err) + assert.Equal(t, 0, len(pods.Items)) + + // reconcile + woc = newWorkflowOperationCtx(woc.wf, controller) + woc.operate(ctx) + assert.Equal(t, wfv1.WorkflowError, woc.wf.Status.Phase) + for _, node := range woc.wf.Status.Nodes { + assert.Equal(t, wfv1.NodeError, node.Phase) + if node.Type == wfv1.NodeTypePod { + assert.Equal(t, "pod deleted", node.Message) + } + if node.Type == wfv1.NodeTypeContainer { + assert.Equal(t, "container deleted", node.Message) + } + } +}