diff --git a/pkg/backup/restore_job.go b/pkg/backup/restore_job.go index 936fbb5e68b9..644c6e99f6e6 100644 --- a/pkg/backup/restore_job.go +++ b/pkg/backup/restore_job.go @@ -116,6 +116,10 @@ const ( // be _exceeded_ before we no longer fast fail the restore job after hitting the // maxRestoreRetryFastFail threshold. restoreRetryProgressThreshold = 0 + + // droppedDescsOnFailKey is an info key that is set for a restore job when it + // has finished dropping its descriptors on failure. + droppedDescsOnFailKey = "dropped_descs_on_fail" ) var restoreStatsInsertionConcurrency = settings.RegisterIntSetting( @@ -2820,6 +2824,13 @@ func (r *restoreResumer) OnFailOrCancel( return err } + testingKnobs := execCfg.BackupRestoreTestingKnobs + if testingKnobs != nil && testingKnobs.AfterRevertRestoreDropDescriptors != nil { + if err := testingKnobs.AfterRevertRestoreDropDescriptors(); err != nil { + return err + } + } + if details.DescriptorCoverage == tree.AllDescriptors { // The temporary system table descriptors should already have been dropped // in `dropDescriptors` but we still need to drop the temporary system db. @@ -2865,6 +2876,19 @@ func (r *restoreResumer) dropDescriptors( return nil } + jobInfo := jobs.InfoStorageForJob(txn, r.job.ID()) + _, hasDropped, err := jobInfo.Get( + ctx, "get-restore-dropped-descs-on-fail-key", droppedDescsOnFailKey, + ) + if err != nil { + return err + } + if hasDropped { + // Descriptors have already been dropped once before, this is a retry of the + // cleanup. + return nil + } + b := txn.KV().NewBatch() const kvTrace = false // Collect the tables into mutable versions. @@ -3186,7 +3210,10 @@ func (r *restoreResumer) dropDescriptors( return errors.Wrap(err, "dropping tables created at the start of restore caused by fail/cancel") } - return nil + return errors.Wrap( + jobInfo.Write(ctx, droppedDescsOnFailKey, []byte{}), + "checkpointing dropped descs on fail", + ) } // removeExistingTypeBackReferences removes back references from types that diff --git a/pkg/backup/restore_test.go b/pkg/backup/restore_test.go index ea511f4ff4ed..2c71d3fff043 100644 --- a/pkg/backup/restore_test.go +++ b/pkg/backup/restore_test.go @@ -318,3 +318,55 @@ func TestRestoreDuplicateTempTables(t *testing.T) { result := sqlDB.QueryStr(t, `SELECT table_name FROM [SHOW TABLES] ORDER BY table_name`) require.Equal(t, [][]string{{"permanent_table"}}, result) } + +func TestRestoreRetryRevert(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + droppedDescs := make(chan struct{}) + jobPaused := make(chan struct{}) + testKnobs := &sql.BackupRestoreTestingKnobs{ + AfterRevertRestoreDropDescriptors: func() error { + close(droppedDescs) + <-jobPaused + return nil + }, + } + var params base.TestClusterArgs + params.ServerArgs.Knobs.BackupRestore = testKnobs + + _, sqlDB, _, cleanupFn := backuptestutils.StartBackupRestoreTestCluster( + t, singleNode, backuptestutils.WithParams(params), + ) + defer cleanupFn() + + // We create a variety of descriptors to ensure that missing descriptors + // during drop do not break revert. + sqlDB.Exec(t, "CREATE DATABASE foo") + sqlDB.Exec(t, "USE foo") + sqlDB.Exec(t, "CREATE OR REPLACE FUNCTION bar(a INT) RETURNS INT AS 'SELECT a*a' LANGUAGE SQL;") + sqlDB.Exec(t, "CREATE TYPE baz AS ENUM ('a', 'b', 'c')") + sqlDB.Exec(t, "CREATE TABLE qux (x INT)") + sqlDB.Exec(t, "BACKUP DATABASE foo INTO 'nodelocal://1/backup'") + + // We need restore to publish descriptors so that they will be cleaned up + // during restore. + sqlDB.Exec(t, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.after_publishing_descriptors'") + + var restoreID jobspb.JobID + sqlDB.QueryRow( + t, "RESTORE DATABASE foo FROM LATEST IN 'nodelocal://1/backup' WITH detached, new_db_name='foo_restored'", + ).Scan(&restoreID) + + jobutils.WaitForJobToPause(t, sqlDB, restoreID) + + sqlDB.Exec(t, "CANCEL JOB $1", restoreID) + <-droppedDescs + sqlDB.Exec(t, "PAUSE JOB $1", restoreID) + jobutils.WaitForJobToPause(t, sqlDB, restoreID) + close(jobPaused) + testKnobs.AfterRevertRestoreDropDescriptors = nil + + sqlDB.Exec(t, "RESUME JOB $1", restoreID) + jobutils.WaitForJobToCancel(t, sqlDB, restoreID) +} diff --git a/pkg/sql/exec_util_backup.go b/pkg/sql/exec_util_backup.go index 08247fba7ead..b50e3fd5a59d 100644 --- a/pkg/sql/exec_util_backup.go +++ b/pkg/sql/exec_util_backup.go @@ -90,6 +90,10 @@ type BackupRestoreTestingKnobs struct { // AfterAddRemoteSST is called after a remote SST is linked to pebble during // the link phase of online restore. AfterAddRemoteSST func() error + + // AfterRevertRestoreDropDescriptors is called after a reverting restore + // drops its descriptors. + AfterRevertRestoreDropDescriptors func() error } var _ base.ModuleTestingKnobs = &BackupRestoreTestingKnobs{}