restore: fix retry revert stuck in reverting #156149
```diff
@@ -116,6 +116,10 @@ const (
 	// be _exceeded_ before we no longer fast fail the restore job after hitting the
 	// maxRestoreRetryFastFail threshold.
 	restoreRetryProgressThreshold = 0
+
+	// droppedDescsOnFailKey is an info key that is set for a restore job when it
+	// has finished dropping its descriptors on failure.
+	droppedDescsOnFailKey = "dropped_descs_on_fail"
 )

 var restoreStatsInsertionConcurrency = settings.RegisterIntSetting(
```
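The new `droppedDescsOnFailKey` constant names a per-job info key whose value is never read back; only the key's presence matters. Below is a minimal sketch of that pattern, assuming a hypothetical `InfoStorage` interface and in-memory implementation in place of the transactional `jobs.InfoStorage` the patch actually uses:

```go
package main

import (
	"context"
	"fmt"
)

const droppedDescsOnFailKey = "dropped_descs_on_fail"

// InfoStorage is a hypothetical stand-in for per-job key/value info storage:
// Get reports whether a key exists, Write persists a key with a value.
type InfoStorage interface {
	Get(ctx context.Context, opName, key string) (value []byte, found bool, err error)
	Write(ctx context.Context, key string, value []byte) error
}

// memStorage is an in-memory implementation used only for this sketch.
type memStorage map[string][]byte

var _ InfoStorage = memStorage{}

func (m memStorage) Get(_ context.Context, _ string, key string) ([]byte, bool, error) {
	v, ok := m[key]
	return v, ok, nil
}

func (m memStorage) Write(_ context.Context, key string, value []byte) error {
	m[key] = value
	return nil
}

func main() {
	ctx := context.Background()
	info := memStorage{}

	// The stored value is an empty byte slice: only the key's presence is meaningful.
	_ = info.Write(ctx, droppedDescsOnFailKey, []byte{})

	_, hasDropped, _ := info.Get(ctx, "get-restore-dropped-descs-on-fail-key", droppedDescsOnFailKey)
	fmt.Println("already dropped:", hasDropped) // true
}
```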
```diff
@@ -2820,6 +2824,13 @@ func (r *restoreResumer) OnFailOrCancel(
 		return err
 	}

+	testingKnobs := execCfg.BackupRestoreTestingKnobs
+	if testingKnobs != nil && testingKnobs.AfterRevertRestoreDropDescriptors != nil {
+		if err := testingKnobs.AfterRevertRestoreDropDescriptors(); err != nil {
+			return err
+		}
+	}
+
 	if details.DescriptorCoverage == tree.AllDescriptors {
 		// The temporary system table descriptors should already have been dropped
 		// in `dropDescriptors` but we still need to drop the temporary system db.
```
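The `AfterRevertRestoreDropDescriptors` knob gives tests a hook to return an error right after the cleanup work in `OnFailOrCancel`, which forces the job to retry that cleanup and so exercises the new checkpoint. A rough sketch of how such a knob could be driven, with a hypothetical `testingKnobs` struct and `runCleanup` helper standing in for the real resumer code:

```go
package main

import (
	"errors"
	"fmt"
)

// testingKnobs mirrors the shape of the knob added in this patch: an optional
// callback invoked after descriptors are dropped during cleanup. (Hypothetical
// stand-in for the field on BackupRestoreTestingKnobs.)
type testingKnobs struct {
	AfterRevertRestoreDropDescriptors func() error
}

// runCleanup is a stand-in for the cleanup path: drop descriptors, then give a
// test the chance to inject a failure so the whole cleanup is retried.
func runCleanup(knobs *testingKnobs, dropDescriptors func() error) error {
	if err := dropDescriptors(); err != nil {
		return err
	}
	if knobs != nil && knobs.AfterRevertRestoreDropDescriptors != nil {
		if err := knobs.AfterRevertRestoreDropDescriptors(); err != nil {
			return err
		}
	}
	return nil
}

func main() {
	drops := 0
	drop := func() error { drops++; return nil }

	// Fail only the first attempt, the way a test would provoke a retry.
	failures := 1
	knobs := &testingKnobs{AfterRevertRestoreDropDescriptors: func() error {
		if failures > 0 {
			failures--
			return errors.New("injected failure after dropping descriptors")
		}
		return nil
	}}

	fmt.Println(runCleanup(knobs, drop)) // injected failure: the job would retry
	fmt.Println(runCleanup(knobs, drop)) // retry succeeds
	fmt.Println("drop attempts:", drops) // 2 without a checkpoint guarding the drop
}
```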
```diff
@@ -2865,6 +2876,19 @@ func (r *restoreResumer) dropDescriptors(
 		return nil
 	}

+	jobInfo := jobs.InfoStorageForJob(txn, r.job.ID())
+
+	_, hasDropped, err := jobInfo.Get(
+		ctx, "get-restore-dropped-descs-on-fail-key", droppedDescsOnFailKey,
+	)
+	if err != nil {
+		return err
+	}
+	if hasDropped {
+		// Descriptors have already been dropped once before, this is a retry of the
+		// cleanup.
+		return nil
+	}
+
 	b := txn.KV().NewBatch()
 	const kvTrace = false
 	// Collect the tables into mutable versions.
```

Collaborator

Instead of adding a checkpoint, how difficult would it be to make the cleanup logic idempotent so it is safe to run twice? Making the code naturally idempotent is my preferred solution for job retries.

Contributor (Author)

My original implementation was to make the code naturally idempotent. It required adding […]. Since it required more reasoning to determine where to place these checks, it felt more susceptible to bugs if we were to ever come back and add more cleanup logic, which is why I went with the checkpoint approach. I'm open to being convinced otherwise, though.

Collaborator

Looking through the code, I think you are right. This would probably need to be rewritten to be cleanly idempotent.
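The thread weighs two ways of making retried cleanup safe: persist a checkpoint once the work has run (the approach taken here), or make each step naturally idempotent so rerunning the whole sequence is harmless. A toy comparison with hypothetical helpers, not the restore code itself:

```go
package main

import "fmt"

// cleanupWithCheckpoint runs every step once and records a marker, so a retry
// short-circuits. In the real patch the marker is persisted in job info storage.
func cleanupWithCheckpoint(done *bool, steps []func()) {
	if *done {
		return // a previous attempt already finished the cleanup
	}
	for _, step := range steps {
		step()
	}
	*done = true
}

// cleanupIdempotent makes each step check its own precondition ("drop the
// table only if it still exists"), so rerunning needs no extra bookkeeping.
func cleanupIdempotent(tablesExist map[string]bool) {
	for name, exists := range tablesExist {
		if !exists {
			continue // already dropped on an earlier attempt
		}
		fmt.Println("dropping", name)
		tablesExist[name] = false
	}
}

func main() {
	done := false
	steps := []func(){func() { fmt.Println("drop descriptors") }}
	cleanupWithCheckpoint(&done, steps)
	cleanupWithCheckpoint(&done, steps) // retry is a no-op

	tables := map[string]bool{"restored_table": true}
	cleanupIdempotent(tables)
	cleanupIdempotent(tables) // retry is a no-op
}
```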
|
```diff
@@ -3186,7 +3210,10 @@ func (r *restoreResumer) dropDescriptors(
 		return errors.Wrap(err, "dropping tables created at the start of restore caused by fail/cancel")
 	}

-	return nil
+	return errors.Wrap(
+		jobInfo.Write(ctx, droppedDescsOnFailKey, []byte{}),
+		"checkpointing dropped descs on fail",
+	)
 }

 // removeExistingTypeBackReferences removes back references from types that
```
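On the success path the checkpoint key is written in the same transaction as the descriptor drops, and the result of `jobInfo.Write` is passed straight to `errors.Wrap`. With `github.com/cockroachdb/errors` (like `pkg/errors`), wrapping a nil error yields nil, so the function still returns nil when the write succeeds. A tiny demonstration with a stand-in `write` function:

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/errors"
)

// write is a stand-in for jobInfo.Write: it either succeeds (nil) or fails.
func write(fail bool) error {
	if fail {
		return errors.New("txn aborted")
	}
	return nil
}

func main() {
	// Wrapping a nil error returns nil, so the happy path is unchanged.
	fmt.Println(errors.Wrap(write(false), "checkpointing dropped descs on fail")) // <nil>
	// A failed write comes back annotated with the checkpoint message.
	fmt.Println(errors.Wrap(write(true), "checkpointing dropped descs on fail"))
}
```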