Skip to content

Commit a82786a

Browse files
committed
restore: fix retry revert stuck in reverting
Previously, if a reverting restore job was paused after dropping its descriptors and then resumed, the restore job would get stuck in a retry loop due to missing descriptors. This commit adds a field in the restore details that can be set once descriptors have been dropped during restore cleanup. This allows us to avoid attempting to drop descriptors if they have already been dropped. Fixes: #156019 Release note: Restore no longer gets stuck in a retry loop when reverts are attempted twice.
1 parent 901677f commit a82786a

File tree

4 files changed

+74
-2
lines changed

4 files changed

+74
-2
lines changed

pkg/backup/restore_job.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2820,6 +2820,13 @@ func (r *restoreResumer) OnFailOrCancel(
28202820
return err
28212821
}
28222822

2823+
testingKnobs := execCfg.BackupRestoreTestingKnobs
2824+
if testingKnobs != nil && testingKnobs.AfterRevertRestoreDropDescriptors != nil {
2825+
if err := testingKnobs.AfterRevertRestoreDropDescriptors(); err != nil {
2826+
return err
2827+
}
2828+
}
2829+
28232830
if details.DescriptorCoverage == tree.AllDescriptors {
28242831
// The temporary system table descriptors should already have been dropped
28252832
// in `dropDescriptors` but we still need to drop the temporary system db.
@@ -2865,6 +2872,12 @@ func (r *restoreResumer) dropDescriptors(
28652872
return nil
28662873
}
28672874

2875+
// Descriptors have already been dropped once before, this is a retry of the
2876+
// cleanup.
2877+
if details.DroppedDescsOnFail {
2878+
return nil
2879+
}
2880+
28682881
b := txn.KV().NewBatch()
28692882
const kvTrace = false
28702883
// Collect the tables into mutable versions.
@@ -3186,7 +3199,8 @@ func (r *restoreResumer) dropDescriptors(
31863199
return errors.Wrap(err, "dropping tables created at the start of restore caused by fail/cancel")
31873200
}
31883201

3189-
return nil
3202+
details.DroppedDescsOnFail = true
3203+
return errors.Wrap(r.job.WithTxn(txn).SetDetails(ctx, details), "checkpointing dropped descs on fail")
31903204
}
31913205

31923206
// removeExistingTypeBackReferences removes back references from types that

pkg/backup/restore_test.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,3 +318,53 @@ func TestRestoreDuplicateTempTables(t *testing.T) {
318318
result := sqlDB.QueryStr(t, `SELECT table_name FROM [SHOW TABLES] ORDER BY table_name`)
319319
require.Equal(t, [][]string{{"permanent_table"}}, result)
320320
}
321+
322+
func TestRestoreRetryRevert(t *testing.T) {
323+
defer leaktest.AfterTest(t)()
324+
defer log.Scope(t).Close(t)
325+
326+
droppedDescs := make(chan struct{})
327+
testKnobs := &sql.BackupRestoreTestingKnobs{
328+
AfterRevertRestoreDropDescriptors: func() error {
329+
close(droppedDescs)
330+
return errors.New("injected error to test pause during revert")
331+
},
332+
}
333+
var params base.TestClusterArgs
334+
params.ServerArgs.Knobs.BackupRestore = testKnobs
335+
336+
const numAccounts = 1
337+
_, sqlDB, _, cleanupFn := backuptestutils.StartBackupRestoreTestCluster(
338+
t, singleNode, backuptestutils.WithParams(params),
339+
)
340+
defer cleanupFn()
341+
342+
// We create a variety of descriptors to ensure that missing descriptors
343+
// during drop do not break revert.
344+
sqlDB.Exec(t, "CREATE DATABASE foo")
345+
sqlDB.Exec(t, "USE foo")
346+
sqlDB.Exec(t, "CREATE OR REPLACE FUNCTION bar(a INT) RETURNS INT AS 'SELECT a*a' LANGUAGE SQL;")
347+
sqlDB.Exec(t, "CREATE TYPE baz AS ENUM ('a', 'b', 'c')")
348+
sqlDB.Exec(t, "CREATE TABLE qux (x INT)")
349+
sqlDB.Exec(t, "BACKUP DATABASE foo INTO 'nodelocal://1/backup'")
350+
351+
// We need restore to publish descriptors so that they will be cleaned up
352+
// during restore.
353+
sqlDB.Exec(t, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.after_publishing_descriptors'")
354+
355+
var restoreID jobspb.JobID
356+
sqlDB.QueryRow(
357+
t, "RESTORE DATABASE foo FROM LATEST IN 'nodelocal://1/backup' WITH detached, new_db_name='foo_restored'",
358+
).Scan(&restoreID)
359+
360+
jobutils.WaitForJobToPause(t, sqlDB, restoreID)
361+
362+
sqlDB.Exec(t, "CANCEL JOB $1", restoreID)
363+
<-droppedDescs
364+
sqlDB.Exec(t, "PAUSE JOB $1", restoreID)
365+
jobutils.WaitForJobToPause(t, sqlDB, restoreID)
366+
testKnobs.AfterRevertRestoreDropDescriptors = nil
367+
368+
sqlDB.Exec(t, "RESUME JOB $1", restoreID)
369+
jobutils.WaitForJobToCancel(t, sqlDB, restoreID)
370+
}

pkg/jobs/jobspb/jobs.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,11 @@ message RestoreDetails {
656656

657657
bool experimental_copy = 37;
658658

659-
// NEXT ID: 38.
659+
// DroppedDescsOnFail is set once the restore job has cleaned up descriptors
660+
// on cancel/failure.
661+
bool dropped_descs_on_fail = 38;
662+
663+
// NEXT ID: 39.
660664
}
661665

662666

pkg/sql/exec_util_backup.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ type BackupRestoreTestingKnobs struct {
8686
// RunBeforeDownloadCleanup is called before we cleanup after all external
8787
// files have been download.
8888
RunBeforeDownloadCleanup func() error
89+
90+
// AfterRevertRestoreDropDescriptors is called after a reverting restore
91+
// drops its descriptors.
92+
AfterRevertRestoreDropDescriptors func() error
8993
}
9094

9195
var _ base.ModuleTestingKnobs = &BackupRestoreTestingKnobs{}

0 commit comments

Comments
 (0)