-
Notifications
You must be signed in to change notification settings - Fork 220
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Don't replay commands from non-completed task #1750
base: master
Are you sure you want to change the base?
Changes from all commits
a3df5df
6241536
027e581
f30fecf
c316076
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,7 @@ import ( | |
"fmt" | ||
"math" | ||
"reflect" | ||
"strconv" | ||
"strings" | ||
"sync" | ||
"time" | ||
|
@@ -1023,6 +1024,8 @@ func (w *workflowExecutionContextImpl) ProcessWorkflowTask(workflowTask *workflo | |
return !wfPanicked && isInReplayer | ||
} | ||
|
||
var wftCompletedIndex int64 | ||
|
||
metricsHandler := w.wth.metricsHandler.WithTags(metrics.WorkflowTags(task.WorkflowType.GetName())) | ||
start := time.Now() | ||
// This is set to nil once recorded | ||
|
@@ -1045,10 +1048,26 @@ ProcessEvents: | |
binaryChecksum := nextTask.binaryChecksum | ||
nextTaskBuildId := nextTask.buildID | ||
admittedUpdates := nextTask.admittedMsgs | ||
// Only replay up to the last completed event | ||
if len(reorderedEvents) > 0 { | ||
if reorderedEvents[len(reorderedEvents)-1].GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_TASK_STARTED { | ||
wftCompletedIndex = reorderedEvents[len(reorderedEvents)-2].EventId | ||
} else if reorderedEvents[len(reorderedEvents)-1].GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_TASK_COMPLETED || reorderedEvents[len(reorderedEvents)-1].GetEventType() == enumspb.EVENT_TYPE_ACTIVITY_TASK_COMPLETED { | ||
wftCompletedIndex = reorderedEvents[len(reorderedEvents)-1].EventId | ||
} else | ||
// If completed + task scheduled, that's a WFT heartbeat, does not complete sequence | ||
if (len(reorderedEvents) == 1 && reorderedEvents[0].GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_TASK_COMPLETED) || (len(reorderedEvents) > 1 && reorderedEvents[0].GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_TASK_COMPLETED && reorderedEvents[1].GetEventType() != enumspb.EVENT_TYPE_WORKFLOW_TASK_SCHEDULED) { | ||
wftCompletedIndex = reorderedEvents[0].EventId | ||
} | ||
} | ||
|
||
// Check if we are replaying so we know if we should use the messages in the WFT or the history | ||
isReplay := len(reorderedEvents) > 0 && reorderedHistory.IsReplayEvent(reorderedEvents[len(reorderedEvents)-1]) | ||
var msgs *eventMsgIndex | ||
if isReplay { | ||
//if reorderedEvents[0].GetEventType() == enumspb.EVENT_TYPE_WORKFLOW_TASK_COMPLETED { | ||
// completedTaskReplayCommandIndex = len(replayCommands) | ||
//} | ||
admittedUpdatesByID := make(map[string]*protocolpb.Message, len(admittedUpdates)) | ||
for _, admittedUpdate := range admittedUpdates { | ||
admittedUpdatesByID[admittedUpdate.GetProtocolInstanceId()] = admittedUpdate | ||
|
@@ -1181,6 +1200,9 @@ ProcessEvents: | |
} | ||
} | ||
if isReplay { | ||
// TODO: Why does this have commands that aren't a part of history? | ||
// looks like both replayCommands and respondEvents sometimes hold events/commands that aren't a part of | ||
// the replay history | ||
eventCommands := eventHandler.commandsHelper.getCommands(true) | ||
if !skipReplayCheck { | ||
replayCommands = append(replayCommands, eventCommands...) | ||
|
@@ -1195,6 +1217,23 @@ ProcessEvents: | |
metricsTimer = nil | ||
} | ||
|
||
// We do not want to run non-determinism checks on a task start that | ||
// doesn't have a corresponding completed task. | ||
for i, cmd := range replayCommands { | ||
activityId, err := strconv.ParseInt(cmd.GetScheduleActivityTaskCommandAttributes().ActivityId, 10, 64) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if activityId > wftCompletedIndex { | ||
replayCommands = replayCommands[:i] | ||
} | ||
} | ||
for i, event := range respondEvents { | ||
if event.EventId > wftCompletedIndex { | ||
respondEvents = respondEvents[:i] | ||
} | ||
} | ||
|
||
// Non-deterministic error could happen in 2 different places: | ||
// 1) the replay commands does not match to history events. This is usually due to non backwards compatible code | ||
// change to workflow logic. For example, change calling one activity to a different activity. | ||
|
@@ -2339,3 +2378,14 @@ func traceLog(fn func()) { | |
fn() | ||
} | ||
} | ||
|
||
//func trimEventsToCompletedWFT(commands []*commandpb.Command) []*commandpb.Command { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reminder to self: remove |
||
// lastIndex := 0 | ||
// sawAnyCommandEvent := false | ||
// // wftStartedEventIdToIndex | ||
// for ix, command := range commands { | ||
// last_index := ix | ||
// | ||
// if | ||
// } | ||
//} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6904,3 +6904,38 @@ func (ts *InvalidUTF8Suite) TestBasic() { | |
// Go's JSON coding stack will replace invalid bytes with the unicode substitute char U+FFFD | ||
ts.Equal("\n�\x01\n\x0ejunk\x12data", response) | ||
} | ||
|
||
func (ts *IntegrationTestSuite) TestPartialHistoryReplay() { | ||
ctx, cancel := context.WithCancel(context.Background()) | ||
defer cancel() | ||
|
||
// Run the workflow | ||
run, err := ts.client.ExecuteWorkflow(ctx, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this workflow misses some key features that we have struggle with replaying in the past. It only executes activities serially. I would like to see a workflow that has some more events like version markers, signals, and local activities. It also only has 2 WFT. Maybe we can add a small fuzzing workflow that picks from a few commands in a deterministic , but random manner and does a few iterations of that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To make sure I understand, you're suggesting writing a workflow that takes in a deterministic seed, and then "randomly" selects different commands to call? As in something along the lines of
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah could generate the random number in a |
||
ts.startWorkflowOptions("test-partial-history-replay"), ts.workflows.Basic) | ||
ts.NotNil(run) | ||
ts.NoError(err) | ||
ts.NoError(run.Get(ctx, nil)) | ||
|
||
// Obtain history | ||
var history historypb.History | ||
iter := ts.client.GetWorkflowHistory(ctx, run.GetID(), run.GetRunID(), false, | ||
enumspb.HISTORY_EVENT_FILTER_TYPE_ALL_EVENT) | ||
for iter.HasNext() { | ||
event, err := iter.Next() | ||
ts.NoError(err) | ||
history.Events = append(history.Events, event) | ||
} | ||
|
||
for _ = range history.Events { | ||
if len(history.Events) >= 3 { | ||
fmt.Println("\n[REPLAY]") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove before merging |
||
for i, event := range history.Events { | ||
fmt.Println("\t[event],", i, event) | ||
} | ||
replayer := worker.NewWorkflowReplayer() | ||
replayer.RegisterWorkflow(ts.workflows.Basic) | ||
ts.NoError(replayer.ReplayWorkflowHistory(nil, &history)) | ||
history.Events = history.Events[:len(history.Events)-1] | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Quinn-With-Two-Ns, when working through the integration test I added, it appeared that both replayCommands and respondEvents sometimes held events/commands with
event id
's that didn't exist in the history yet. I'm pretty sure this only happens when the history partial (I assume getCommands is eagerly generating the next commands, even if the partial history doesn't have the corresponding event)Let me know if this assumption is wrong
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure I understand what you mean, do you have an example? What do you mean by
with event id's that didn't exist in the history yet
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When replaying this history
replayCommands contains
but that activity_id doesn't exist in the history we're replaying
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like respondEvents properly removes the entry with event_id 11 when we take that out of history
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand that history you shared, is that intended to be the whole history? Since if that is the history then that history should contain an activity scheduled event or throw a non determinism exception.