From 0acb4356d01813193b6f38195ba0c551698e7fde Mon Sep 17 00:00:00 2001 From: shuangkun tian <72060326+shuangkun@users.noreply.github.com> Date: Wed, 14 Feb 2024 09:42:26 +0800 Subject: [PATCH] fix: retry node with expression status Running -> Pending (#12637) Signed-off-by: shuangkun --- workflow/controller/operator.go | 2 +- workflow/controller/operator_test.go | 80 ++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 2ef0c7222ad9..f6d5e8954e6f 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -960,7 +960,7 @@ func (woc *wfOperationCtx) processNodeRetries(node *wfv1.NodeStatus, retryStrate if err != nil { return nil, false, err } - if !shouldContinue { + if !shouldContinue && lastChildNode.Fulfilled() { return woc.markNodePhase(node.Name, lastChildNode.Phase, "retryStrategy.expression evaluated to false"), true, nil } } diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index ca00a4854e8b..3c0e5d07bb54 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -876,6 +876,86 @@ func TestProcessNodeRetriesWithExponentialBackoff(t *testing.T) { require.Equal(wfv1.NodeSucceeded, n.Phase) } +// TestProcessNodeRetries tests retrying with Expression +func TestProcessNodeRetriesWithExpression(t *testing.T) { + cancel, controller := newController() + defer cancel() + assert.NotNil(t, controller) + wf := wfv1.MustUnmarshalWorkflow(helloWorldWf) + assert.NotNil(t, wf) + woc := newWorkflowOperationCtx(wf, controller) + assert.NotNil(t, woc) + // Verify that there are no nodes in the wf status. + assert.Zero(t, len(woc.wf.Status.Nodes)) + + // Add the parent node for retries. + nodeName := "test-node" + nodeID := woc.wf.NodeID(nodeName) + node := woc.initializeNode(nodeName, wfv1.NodeTypeRetry, "", &wfv1.WorkflowStep{}, "", wfv1.NodeRunning, &wfv1.NodeFlag{}) + retries := wfv1.RetryStrategy{} + retries.Expression = "false" + retries.Limit = intstrutil.ParsePtr("2") + retries.RetryPolicy = wfv1.RetryPolicyAlways + woc.wf.Status.Nodes[nodeID] = *node + + assert.Equal(t, node.Phase, wfv1.NodeRunning) + + // Ensure there are no child nodes yet. + lastChild := getChildNodeIndex(node, woc.wf.Status.Nodes, -1) + assert.Nil(t, lastChild) + + // Add child nodes. + for i := 0; i < 2; i++ { + childNode := fmt.Sprintf("%s(%d)", nodeName, i) + woc.initializeNode(childNode, wfv1.NodeTypePod, "", &wfv1.WorkflowStep{}, "", wfv1.NodeRunning, &wfv1.NodeFlag{Retried: true}) + woc.addChildNode(nodeName, childNode) + } + + n, err := woc.wf.GetNodeByName(nodeName) + assert.NoError(t, err) + lastChild = getChildNodeIndex(n, woc.wf.Status.Nodes, -1) + assert.NotNil(t, lastChild) + + // Last child is still running. processNodeRetries() should return false since + // there should be no retries at this point. + n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{}) + assert.NoError(t, err) + assert.Equal(t, n.Phase, wfv1.NodeRunning) + + // Mark lastChild Pending. + woc.markNodePhase(lastChild.Name, wfv1.NodePending) + n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{}) + assert.Nil(t, err) + assert.Equal(t, n.Phase, wfv1.NodeRunning) + + // Mark lastChild as successful. + woc.markNodePhase(lastChild.Name, wfv1.NodeSucceeded) + n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{}) + assert.Nil(t, err) + // The parent node also gets marked as Succeeded. + assert.Equal(t, n.Phase, wfv1.NodeSucceeded) + + // Mark the parent node as running again and the lastChild as errored. + n = woc.markNodePhase(n.Name, wfv1.NodeRunning) + woc.markNodePhase(lastChild.Name, wfv1.NodeError) + _, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{}) + assert.NoError(t, err) + n, err = woc.wf.GetNodeByName(nodeName) + assert.NoError(t, err) + assert.Equal(t, n.Phase, wfv1.NodeError) + + // Add a third node that has failed. + woc.markNodePhase(n.Name, wfv1.NodeRunning) + childNode := fmt.Sprintf("%s(%d)", nodeName, 3) + woc.initializeNode(childNode, wfv1.NodeTypePod, "", &wfv1.WorkflowStep{}, "", wfv1.NodeFailed, &wfv1.NodeFlag{Retried: true}) + woc.addChildNode(nodeName, childNode) + n, err = woc.wf.GetNodeByName(nodeName) + assert.NoError(t, err) + n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{}) + assert.NoError(t, err) + assert.Equal(t, n.Phase, wfv1.NodeFailed) +} + func parseRetryMessage(message string) (int, error) { pattern := regexp.MustCompile(`Backoff for (\d+) minutes (\d+) seconds`) matches := pattern.FindStringSubmatch(message)