Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion pkg/backup/restore_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ const (
// be _exceeded_ before we no longer fast fail the restore job after hitting the
// maxRestoreRetryFastFail threshold.
restoreRetryProgressThreshold = 0

// droppedDescsOnFailKey is an info key that is set for a restore job when it
// has finished dropping its descriptors on failure.
droppedDescsOnFailKey = "dropped_descs_on_fail"
)

var restoreStatsInsertionConcurrency = settings.RegisterIntSetting(
Expand Down Expand Up @@ -2820,6 +2824,13 @@ func (r *restoreResumer) OnFailOrCancel(
return err
}

testingKnobs := execCfg.BackupRestoreTestingKnobs
if testingKnobs != nil && testingKnobs.AfterRevertRestoreDropDescriptors != nil {
if err := testingKnobs.AfterRevertRestoreDropDescriptors(); err != nil {
return err
}
}

if details.DescriptorCoverage == tree.AllDescriptors {
// The temporary system table descriptors should already have been dropped
// in `dropDescriptors` but we still need to drop the temporary system db.
Expand Down Expand Up @@ -2865,6 +2876,19 @@ func (r *restoreResumer) dropDescriptors(
return nil
}

jobInfo := jobs.InfoStorageForJob(txn, r.job.ID())
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of adding a checkpoint, how difficult would it be to make the cleanup logic idempotent so it is safe to run twice? Making the code naturally idempotent is my preferred solution for job retries.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My original implementation was to make the code naturally idempotent. It required adding if pgerr.GetPGCode(err) == pgcode.UndefinedX checks in 4 or 5 places.

Since it required more reasoning to determine where to place these checks, it felt more susceptible to bugs if we were to ever come back and add more cleanup logic, which is why I went with the checkpoint approach.

I'm open to being convinced otherwise though.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking through the code I think you are right. This would probably need to be rewritten to be cleanly idempotent.

_, hasDropped, err := jobInfo.Get(
ctx, "get-restore-dropped-descs-on-fail-key", droppedDescsOnFailKey,
)
if err != nil {
return err
}
if hasDropped {
// Descriptors have already been dropped once before, this is a retry of the
// cleanup.
return nil
}

b := txn.KV().NewBatch()
const kvTrace = false
// Collect the tables into mutable versions.
Expand Down Expand Up @@ -3186,7 +3210,10 @@ func (r *restoreResumer) dropDescriptors(
return errors.Wrap(err, "dropping tables created at the start of restore caused by fail/cancel")
}

return nil
return errors.Wrap(
jobInfo.Write(ctx, droppedDescsOnFailKey, []byte{}),
"checkpointing dropped descs on fail",
)
}

// removeExistingTypeBackReferences removes back references from types that
Expand Down
52 changes: 52 additions & 0 deletions pkg/backup/restore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,3 +318,55 @@ func TestRestoreDuplicateTempTables(t *testing.T) {
result := sqlDB.QueryStr(t, `SELECT table_name FROM [SHOW TABLES] ORDER BY table_name`)
require.Equal(t, [][]string{{"permanent_table"}}, result)
}

// TestRestoreRetryRevert verifies that a restore job's on-failure cleanup
// (dropping the descriptors it published) survives being interrupted and
// retried: the cleanup is paused mid-revert after descriptors have been
// dropped, then resumed, and the job must still reach the canceled state
// without tripping over the already-dropped descriptors.
func TestRestoreRetryRevert(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	// descsDropped signals that the revert path finished dropping descriptors;
	// pauseDone releases the knob once the job has been paused from the test.
	descsDropped := make(chan struct{})
	pauseDone := make(chan struct{})
	knobs := &sql.BackupRestoreTestingKnobs{
		AfterRevertRestoreDropDescriptors: func() error {
			close(descsDropped)
			<-pauseDone
			return nil
		},
	}
	var clusterArgs base.TestClusterArgs
	clusterArgs.ServerArgs.Knobs.BackupRestore = knobs

	_, sqlDB, _, cleanupFn := backuptestutils.StartBackupRestoreTestCluster(
		t, singleNode, backuptestutils.WithParams(clusterArgs),
	)
	defer cleanupFn()

	// We create a variety of descriptors to ensure that missing descriptors
	// during drop do not break revert.
	sqlDB.Exec(t, "CREATE DATABASE foo")
	sqlDB.Exec(t, "USE foo")
	sqlDB.Exec(t, "CREATE OR REPLACE FUNCTION bar(a INT) RETURNS INT AS 'SELECT a*a' LANGUAGE SQL;")
	sqlDB.Exec(t, "CREATE TYPE baz AS ENUM ('a', 'b', 'c')")
	sqlDB.Exec(t, "CREATE TABLE qux (x INT)")
	sqlDB.Exec(t, "BACKUP DATABASE foo INTO 'nodelocal://1/backup'")

	// We need restore to publish descriptors so that they will be cleaned up
	// during restore.
	sqlDB.Exec(t, "SET CLUSTER SETTING jobs.debug.pausepoints = 'restore.after_publishing_descriptors'")

	var restoreJobID jobspb.JobID
	sqlDB.QueryRow(
		t, "RESTORE DATABASE foo FROM LATEST IN 'nodelocal://1/backup' WITH detached, new_db_name='foo_restored'",
	).Scan(&restoreJobID)

	jobutils.WaitForJobToPause(t, sqlDB, restoreJobID)

	// Cancel the job, wait for the revert to drop its descriptors, then pause
	// it mid-cleanup so the cleanup will be retried on resume.
	sqlDB.Exec(t, "CANCEL JOB $1", restoreJobID)
	<-descsDropped
	sqlDB.Exec(t, "PAUSE JOB $1", restoreJobID)
	jobutils.WaitForJobToPause(t, sqlDB, restoreJobID)
	close(pauseDone)
	// Clear the knob while the job is paused so the retried cleanup does not
	// re-enter it (which would close descsDropped a second time).
	knobs.AfterRevertRestoreDropDescriptors = nil

	sqlDB.Exec(t, "RESUME JOB $1", restoreJobID)
	jobutils.WaitForJobToCancel(t, sqlDB, restoreJobID)
}
4 changes: 4 additions & 0 deletions pkg/sql/exec_util_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ type BackupRestoreTestingKnobs struct {
// AfterAddRemoteSST is called after a remote SST is linked to pebble during
// the link phase of online restore.
AfterAddRemoteSST func() error

// AfterRevertRestoreDropDescriptors is called after a reverting restore
// drops its descriptors.
AfterRevertRestoreDropDescriptors func() error
}

var _ base.ModuleTestingKnobs = &BackupRestoreTestingKnobs{}
Expand Down
Loading