Skip to content

Commit f09f957

Browse files
committed
restore: fix retry revert stuck in reverting
Previously, if a reverting restore job was paused after dropping its descriptors and then resumed, the restore job would get stuck in a retry loop due to missing descriptors. This commit adds a field in the restore details that can be set once descriptors have been dropped during restore cleanup. This allows us to avoid attempting to drop descriptors if they have already been dropped. Fixes: #156019 Release note: Restore no longer gets stuck in a retry loop when reverts are attempted twice.
1 parent 75b649c commit f09f957

File tree

3 files changed

+82
-1
lines changed

3 files changed

+82
-1
lines changed

pkg/backup/restore_job.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ const (
116116
// be _exceeded_ before we no longer fast fail the restore job after hitting the
117117
// maxRestoreRetryFastFail threshold.
118118
restoreRetryProgressThreshold = 0
119+
120+
// droppedDescsOnFailKey is an info key that is set for a restore job when it
121+
// has finished dropping its descriptors on failure.
122+
droppedDescsOnFailKey = "dropped_descs_on_fail"
119123
)
120124

121125
var restoreStatsInsertionConcurrency = settings.RegisterIntSetting(
@@ -2820,6 +2824,13 @@ func (r *restoreResumer) OnFailOrCancel(
28202824
return err
28212825
}
28222826

2827+
testingKnobs := execCfg.BackupRestoreTestingKnobs
2828+
if testingKnobs != nil && testingKnobs.AfterRevertRestoreDropDescriptors != nil {
2829+
if err := testingKnobs.AfterRevertRestoreDropDescriptors(); err != nil {
2830+
return err
2831+
}
2832+
}
2833+
28232834
if details.DescriptorCoverage == tree.AllDescriptors {
28242835
// The temporary system table descriptors should already have been dropped
28252836
// in `dropDescriptors` but we still need to drop the temporary system db.
@@ -2865,6 +2876,17 @@ func (r *restoreResumer) dropDescriptors(
28652876
return nil
28662877
}
28672878

2879+
jobInfo := jobs.InfoStorageForJob(txn, r.job.ID())
2880+
if _, hasDropped, err := jobInfo.Get(
2881+
ctx, "get-restore-dropped-descs-on-fail-key", droppedDescsOnFailKey,
2882+
); err != nil {
2883+
return err
2884+
} else if hasDropped {
2885+
// Descriptors have already been dropped once before, this is a retry of the
2886+
// cleanup.
2887+
return nil
2888+
}
2889+
28682890
b := txn.KV().NewBatch()
28692891
const kvTrace = false
28702892
// Collect the tables into mutable versions.
@@ -3186,7 +3208,10 @@ func (r *restoreResumer) dropDescriptors(
31863208
return errors.Wrap(err, "dropping tables created at the start of restore caused by fail/cancel")
31873209
}
31883210

3189-
return nil
3211+
return errors.Wrap(
3212+
jobInfo.Write(ctx, droppedDescsOnFailKey, []byte{}),
3213+
"checkpointing dropped descs on fail",
3214+
)
31903215
}
31913216

31923217
// removeExistingTypeBackReferences removes back references from types that

pkg/backup/restore_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,3 +318,55 @@ func TestRestoreDuplicateTempTables(t *testing.T) {
318318
result := sqlDB.QueryStr(t, `SELECT table_name FROM [SHOW TABLES] ORDER BY table_name`)
319319
require.Equal(t, [][]string{{"permanent_table"}}, result)
320320
}
321+
322+
func TestRestoreRetryRevert(t *testing.T) {
323+
defer leaktest.AfterTest(t)()
324+
defer log.Scope(t).Close(t)
325+
326+
droppedDescs := make(chan struct{})
327+
jobPaused := make(chan struct{})
328+
testKnobs := &sql.BackupRestoreTestingKnobs{
329+
AfterRevertRestoreDropDescriptors: func() error {
330+
close(droppedDescs)
331+
<-jobPaused
332+
return nil
333+
},
334+
}
335+
var params base.TestClusterArgs
336+
params.ServerArgs.Knobs.BackupRestore = testKnobs
337+
338+
_, sqlDB, _, cleanupFn := backuptestutils.StartBackupRestoreTestCluster(
339+
t, singleNode, backuptestutils.WithParams(params),
340+
)
341+
defer cleanupFn()
342+
343+
// We create a variety of descriptors to ensure that missing descriptors
344+
// during drop do not break revert.
345+
sqlDB.Exec(t, "CREATE DATABASE foo")
346+
sqlDB.Exec(t, "USE foo")
347+
sqlDB.Exec(t, "CREATE OR REPLACE FUNCTION bar(a INT) RETURNS INT AS 'SELECT a*a' LANGUAGE SQL;")
348+
sqlDB.Exec(t, "CREATE TYPE baz AS ENUM ('a', 'b', 'c')")
349+
sqlDB.Exec(t, "CREATE TABLE qux (x INT)")
350+
sqlDB.Exec(t, "BACKUP DATABASE foo INTO 'nodelocal://1/backup'")
351+
352+
// We need restore to publish descriptors so that they will be cleaned up
353+
// during restore.
354+
sqlDB.Exec(t, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.after_publishing_descriptors'")
355+
356+
var restoreID jobspb.JobID
357+
sqlDB.QueryRow(
358+
t, "RESTORE DATABASE foo FROM LATEST IN 'nodelocal://1/backup' WITH detached, new_db_name='foo_restored'",
359+
).Scan(&restoreID)
360+
361+
jobutils.WaitForJobToPause(t, sqlDB, restoreID)
362+
363+
sqlDB.Exec(t, "CANCEL JOB $1", restoreID)
364+
<-droppedDescs
365+
sqlDB.Exec(t, "PAUSE JOB $1", restoreID)
366+
jobutils.WaitForJobToPause(t, sqlDB, restoreID)
367+
close(jobPaused)
368+
testKnobs.AfterRevertRestoreDropDescriptors = nil
369+
370+
sqlDB.Exec(t, "RESUME JOB $1", restoreID)
371+
jobutils.WaitForJobToCancel(t, sqlDB, restoreID)
372+
}

pkg/sql/exec_util_backup.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ type BackupRestoreTestingKnobs struct {
8686
// RunBeforeDownloadCleanup is called before we cleanup after all external
8787
// files have been download.
8888
RunBeforeDownloadCleanup func() error
89+
90+
// AfterRevertRestoreDropDescriptors is called after a reverting restore
91+
// drops its descriptors.
92+
AfterRevertRestoreDropDescriptors func() error
8993
}
9094

9195
var _ base.ModuleTestingKnobs = &BackupRestoreTestingKnobs{}

0 commit comments

Comments
 (0)