diff --git a/DEPS.bzl b/DEPS.bzl index bf4ee0e36d695..947c79bbfb1b1 100644 --- a/DEPS.bzl +++ b/DEPS.bzl @@ -5867,13 +5867,13 @@ def go_deps(): name = "com_github_pingcap_kvproto", build_file_proto_mode = "disable_global", importpath = "github.com/pingcap/kvproto", - sha256 = "92a67bcc499c06fd3d76cc153362540b22eaf1b09c4bda62a1599ce876b8ed78", - strip_prefix = "github.com/pingcap/kvproto@v0.0.0-20241120071417-b5b7843d9037", + sha256 = "db08607b0c90f3909b66577e9c568d0cbd6b2825d287d7b5caab86ea6e4b60ad", + strip_prefix = "github.com/pingcap/kvproto@v0.0.0-20250108041715-3b77f2c65c63", urls = [ - "http://bazel-cache.pingcap.net:8080/gomod/github.com/pingcap/kvproto/com_github_pingcap_kvproto-v0.0.0-20241120071417-b5b7843d9037.zip", - "http://ats.apps.svc/gomod/github.com/pingcap/kvproto/com_github_pingcap_kvproto-v0.0.0-20241120071417-b5b7843d9037.zip", - "https://cache.hawkingrei.com/gomod/github.com/pingcap/kvproto/com_github_pingcap_kvproto-v0.0.0-20241120071417-b5b7843d9037.zip", - "https://storage.googleapis.com/pingcapmirror/gomod/github.com/pingcap/kvproto/com_github_pingcap_kvproto-v0.0.0-20241120071417-b5b7843d9037.zip", + "http://bazel-cache.pingcap.net:8080/gomod/github.com/pingcap/kvproto/com_github_pingcap_kvproto-v0.0.0-20250108041715-3b77f2c65c63.zip", + "http://ats.apps.svc/gomod/github.com/pingcap/kvproto/com_github_pingcap_kvproto-v0.0.0-20250108041715-3b77f2c65c63.zip", + "https://cache.hawkingrei.com/gomod/github.com/pingcap/kvproto/com_github_pingcap_kvproto-v0.0.0-20250108041715-3b77f2c65c63.zip", + "https://storage.googleapis.com/pingcapmirror/gomod/github.com/pingcap/kvproto/com_github_pingcap_kvproto-v0.0.0-20250108041715-3b77f2c65c63.zip", ], ) go_repository( diff --git a/br/cmd/br/operator.go b/br/cmd/br/operator.go index 4e41adeab329f..abd0156a5457b 100644 --- a/br/cmd/br/operator.go +++ b/br/cmd/br/operator.go @@ -35,6 +35,8 @@ func newOperatorCommand() *cobra.Command { cmd.AddCommand(newBase64ifyCommand()) cmd.AddCommand(newListMigrationsCommand()) cmd.AddCommand(newMigrateToCommand()) + cmd.AddCommand(newForceFlushCommand()) + cmd.AddCommand(newChecksumCommand()) return cmd } @@ -109,3 +111,43 @@ func newMigrateToCommand() *cobra.Command { operator.DefineFlagsForMigrateToConfig(cmd.Flags()) return cmd } + +func newChecksumCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "checksum-as", + Short: "calculate the checksum with rewrite rules", + Long: "Calculate the checksum of the current cluster (specified by `-u`) " + + "with applying the rewrite rules generated from a backup (specified by `-s`). 
" + + "This can be used when you have the checksum of upstream elsewhere.", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + cfg := operator.ChecksumWithRewriteRulesConfig{} + if err := cfg.ParseFromFlags(cmd.Flags()); err != nil { + return err + } + ctx := GetDefaultContext() + return operator.RunChecksumTable(ctx, tidbGlue, cfg) + }, + } + task.DefineFilterFlags(cmd, []string{"!*.*"}, false) + operator.DefineFlagsForChecksumTableConfig(cmd.Flags()) + return cmd +} + +func newForceFlushCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "force-flush", + Short: "force a log backup task to flush", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + cfg := operator.ForceFlushConfig{} + if err := cfg.ParseFromFlags(cmd.Flags()); err != nil { + return err + } + ctx := GetDefaultContext() + return operator.RunForceFlush(ctx, &cfg) + }, + } + operator.DefineFlagsForForceFlushConfig(cmd.Flags()) + return cmd +} diff --git a/br/pkg/checkpoint/restore.go b/br/pkg/checkpoint/restore.go index 32ee7e02d4392..8dd9d3c5405ff 100644 --- a/br/pkg/checkpoint/restore.go +++ b/br/pkg/checkpoint/restore.go @@ -19,6 +19,7 @@ import ( "encoding/json" "time" + "github.com/google/uuid" "github.com/pingcap/errors" "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/br/pkg/pdutil" @@ -137,6 +138,8 @@ type CheckpointMetadataForSnapshotRestore struct { UpstreamClusterID uint64 `json:"upstream-cluster-id"` RestoredTS uint64 `json:"restored-ts"` SchedulersConfig *pdutil.ClusterConfig `json:"schedulers-config"` + + RestoreUUID uuid.UUID `json:"restore-uuid"` } func LoadCheckpointMetadataForSnapshotRestore( diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index c68d9b41ff506..5fdad07b84c1f 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -32,6 +32,11 @@ import ( "go.uber.org/zap" ) +type hookedOnFlush struct { + checkpointStorage + cb func(ctx context.Context) error +} + type checkpointStorage interface { flushCheckpointData(ctx context.Context, data []byte) error flushCheckpointChecksum(ctx context.Context, data []byte) error diff --git a/br/pkg/errors/errors.go b/br/pkg/errors/errors.go index 6a9449eff95d1..2db9ece1e1735 100644 --- a/br/pkg/errors/errors.go +++ b/br/pkg/errors/errors.go @@ -40,6 +40,7 @@ var ( ErrEnvNotSpecified = errors.Normalize("environment variable not found", errors.RFCCodeText("BR:Common:ErrEnvNotSpecified")) ErrUnsupportedOperation = errors.Normalize("the operation is not supported", errors.RFCCodeText("BR:Common:ErrUnsupportedOperation")) ErrInvalidRange = errors.Normalize("invalid restore range", errors.RFCCodeText("BR:Common:ErrInvalidRange")) + ErrMigrationNotFound = errors.Normalize("no migrtion found", errors.RFCCodeText("BR:Common:ErrMigrationNotFound")) ErrMigrationVersionNotSupported = errors.Normalize("the migration version isn't supported", errors.RFCCodeText("BR:Common:ErrMigrationVersionNotSupported")) ErrPDUpdateFailed = errors.Normalize("failed to update PD", errors.RFCCodeText("BR:PD:ErrPDUpdateFailed")) diff --git a/br/pkg/glue/glue.go b/br/pkg/glue/glue.go index 751a2acb10164..73f916fdff9ea 100644 --- a/br/pkg/glue/glue.go +++ b/br/pkg/glue/glue.go @@ -4,6 +4,7 @@ package glue import ( "context" + "sync/atomic" "github.com/pingcap/tidb/pkg/ddl" "github.com/pingcap/tidb/pkg/domain" @@ -82,3 +83,22 @@ type Progress interface { // called. 
Close() } + +type CounterProgress struct { + Counter atomic.Int64 +} + +func (c *CounterProgress) Inc() { + c.Counter.Add(1) +} + +func (c *CounterProgress) IncBy(cnt int64) { + c.Counter.Add(cnt) +} + +func (c *CounterProgress) GetCurrent() int64 { + return c.Counter.Load() +} + +func (c *CounterProgress) Close() { +} diff --git a/br/pkg/gluetidb/glue.go b/br/pkg/gluetidb/glue.go index e81752c0c9079..7f0ba1bd0a515 100644 --- a/br/pkg/gluetidb/glue.go +++ b/br/pkg/gluetidb/glue.go @@ -55,6 +55,10 @@ type Glue struct { startDomainMu *sync.Mutex } +func WrapSession(se sessiontypes.Session) glue.Session { + return &tidbSession{se: se} +} + type tidbSession struct { se sessiontypes.Session } diff --git a/br/pkg/logutil/logging.go b/br/pkg/logutil/logging.go index 353ca6622e896..22480e7b12d48 100644 --- a/br/pkg/logutil/logging.go +++ b/br/pkg/logutil/logging.go @@ -15,7 +15,9 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/log" "github.com/pingcap/tidb/pkg/kv" + "github.com/pingcap/tidb/pkg/lightning/metric" "github.com/pingcap/tidb/pkg/util/redact" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" "go.uber.org/zap/zapcore" ) @@ -356,3 +358,25 @@ func (b HexBytes) String() string { func (b HexBytes) MarshalJSON() ([]byte, error) { return json.Marshal(hex.EncodeToString(b)) } + +func MarshalHistogram(m prometheus.Histogram) zapcore.ObjectMarshaler { + return zapcore.ObjectMarshalerFunc(func(mal zapcore.ObjectEncoder) error { + if m == nil { + return nil + } + + met := metric.ReadHistogram(m) + if met == nil || met.Histogram == nil { + return nil + } + + hist := met.Histogram + for _, b := range hist.GetBucket() { + key := fmt.Sprintf("lt_%f", b.GetUpperBound()) + mal.AddUint64(key, b.GetCumulativeCount()) + } + mal.AddUint64("count", hist.GetSampleCount()) + mal.AddFloat64("total", hist.GetSampleSum()) + return nil + }) +} diff --git a/br/pkg/logutil/rate.go b/br/pkg/logutil/rate.go index db7df537a81b4..f7d4d2e79c8a0 100644 --- a/br/pkg/logutil/rate.go +++ b/br/pkg/logutil/rate.go @@ -4,6 +4,7 @@ package logutil import ( "fmt" + "math" "time" "github.com/pingcap/log" @@ -12,14 +13,6 @@ import ( "go.uber.org/zap" ) -// MetricTableCreatedCounter counts how many tables created. -// TODO: when br decided to introduce Prometheus, move this to its metric package. -var MetricTableCreatedCounter = prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: "BR", - Name: "table_created", - Help: "The count of tables have been created.", -}) - // RateTracer is a trivial rate tracer based on a prometheus counter. // It traces the average speed from it was created. type RateTracer struct { @@ -46,6 +39,9 @@ func (r *RateTracer) Rate() float64 { // RateAt returns the rate until some instant. This function is mainly for testing. // WARN: the counter value for calculating is still its CURRENT VALUE. func (r *RateTracer) RateAt(instant time.Time) float64 { + if r.Counter == nil { + return math.NaN() + } return (metric.ReadCounter(r.Counter) - r.base) / instant.Sub(r.start).Seconds() } diff --git a/br/pkg/restore/import_mode_switcher.go b/br/pkg/restore/import_mode_switcher.go index 0bec6a4d1e384..33552ca0734bd 100644 --- a/br/pkg/restore/import_mode_switcher.go +++ b/br/pkg/restore/import_mode_switcher.go @@ -139,7 +139,7 @@ func (switcher *ImportModeSwitcher) GoSwitchToImportMode( } // Create a new context for the goroutine - ctx, cancel := context.WithCancel(context.Background()) + ctx, cancel := context.WithCancel(ctx) switcher.cancel = cancel // [important!] 
switch tikv mode into import at the beginning diff --git a/br/pkg/restore/log_client/BUILD.bazel b/br/pkg/restore/log_client/BUILD.bazel index 7fb781e7ad0ef..13bfa3bac9334 100644 --- a/br/pkg/restore/log_client/BUILD.bazel +++ b/br/pkg/restore/log_client/BUILD.bazel @@ -11,6 +11,7 @@ go_library( "log_file_map.go", "log_split_strategy.go", "migration.go", + "ssts.go", ], importpath = "github.com/pingcap/tidb/br/pkg/restore/log_client", visibility = ["//visibility:public"], @@ -43,11 +44,13 @@ go_library( "//pkg/kv", "//pkg/meta", "//pkg/meta/model", + "//pkg/tablecodec", "//pkg/util", "//pkg/util/codec", "//pkg/util/redact", "//pkg/util/sqlexec", "//pkg/util/table-filter", + "@com_github_docker_go_units//:go-units", "@com_github_fatih_color//:color", "@com_github_gogo_protobuf//proto", "@com_github_opentracing_opentracing_go//:opentracing-go", @@ -90,7 +93,7 @@ go_test( ], embed = [":log_client"], flaky = True, - shard_count = 45, + shard_count = 50, deps = [ "//br/pkg/errors", "//br/pkg/glue", @@ -119,6 +122,7 @@ go_test( "//pkg/util/sqlexec", "//pkg/util/table-filter", "@com_github_docker_go_units//:go-units", + "@com_github_google_uuid//:uuid", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/brpb", diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 474578a05e8dc..c3c442a6b282d 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -25,13 +25,16 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" + "github.com/docker/go-units" "github.com/fatih/color" "github.com/gogo/protobuf/proto" "github.com/opentracing/opentracing-go" "github.com/pingcap/errors" "github.com/pingcap/failpoint" + backup "github.com/pingcap/kvproto/pkg/brpb" backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/kvproto/pkg/encryptionpb" "github.com/pingcap/log" @@ -40,6 +43,7 @@ import ( "github.com/pingcap/tidb/br/pkg/conn" "github.com/pingcap/tidb/br/pkg/conn/util" "github.com/pingcap/tidb/br/pkg/encryption" + berrors "github.com/pingcap/tidb/br/pkg/errors" "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/br/pkg/logutil" "github.com/pingcap/tidb/br/pkg/metautil" @@ -178,6 +182,7 @@ func NewSstRestoreManager( type LogClient struct { *LogFileManager + logRestoreManager *LogRestoreManager sstRestoreManager *SstRestoreManager @@ -209,6 +214,16 @@ type LogClient struct { // checkpoint information for log restore useCheckpoint bool + + logFilesStat logFilesStatistic + restoreStat restoreStatistic +} + +type restoreStatistic struct { + restoreSSTKVSize uint64 + restoreSSTKVCount uint64 + restoreSSTPhySize uint64 + restoreSSTTakes uint64 } // NewRestoreClient returns a new RestoreClient. 
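Note on the restoreStatistic fields added above: the later hunk in RestoreCompactedSstFiles updates them with sync/atomic from concurrent restore paths and reads them once in RestoreSSTStatisticFields. Below is a minimal, self-contained sketch of that counting pattern; it is not part of the patch, and the names (restoreStat, kvCount, kvSize, takes) are illustrative only.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

type restoreStat struct {
	kvCount uint64
	kvSize  uint64
	takes   uint64 // nanoseconds, accumulated the same way restoreSSTTakes is
}

func main() {
	var st restoreStat
	begin := time.Now()
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Each concurrent "restore batch" reports what it imported.
			atomic.AddUint64(&st.kvCount, 100)
			atomic.AddUint64(&st.kvSize, 64<<20)
		}()
	}
	wg.Wait()
	atomic.AddUint64(&st.takes, uint64(time.Since(begin)))

	takes := time.Duration(atomic.LoadUint64(&st.takes))
	fmt.Printf("kv=%d size=%dB take=%s speed=%.0fB/s\n",
		atomic.LoadUint64(&st.kvCount),
		atomic.LoadUint64(&st.kvSize),
		takes,
		float64(atomic.LoadUint64(&st.kvSize))/takes.Seconds(),
	)
}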
@@ -250,13 +265,34 @@ func (rc *LogClient) Close(ctx context.Context) { log.Info("Restore client closed") } +func rewriteRewriteRuleBy(sst SSTs, rules *restoreutils.RewriteRules) (*restoreutils.RewriteRules, error) { + if r, ok := sst.(RewrittenSST); ok { + rewritten := r.RewrittenTo() + if rewritten > 0 && rewritten != sst.TableID() { + rewriteRules := rules.Clone() + if !rewriteRules.RewriteSourceTableID(rewritten, sst.TableID()) { + return nil, errors.Annotatef( + berrors.ErrUnknown, + "table rewritten from a table id (%d) to (%d) which doesn't exist in the stream", + rewritten, + sst.TableID(), + ) + } + log.Info("Rewritten rewrite rules.", zap.Stringer("rules", rewriteRules), zap.Int64("table_id", sst.TableID()), zap.Int64("rewritten_to", rewritten)) + return rewriteRules, nil + } + } + return rules, nil +} + func (rc *LogClient) RestoreCompactedSstFiles( ctx context.Context, - compactionsIter iter.TryNextor[*backuppb.LogFileSubcompaction], + compactionsIter iter.TryNextor[SSTs], rules map[int64]*restoreutils.RewriteRules, importModeSwitcher *restore.ImportModeSwitcher, onProgress func(int64), ) error { + begin := time.Now() backupFileSets := make([]restore.BackupFileSet, 0, 8) // Collect all items from the iterator in advance to avoid blocking during restoration. // This approach ensures that we have all necessary data ready for processing, @@ -267,15 +303,25 @@ func (rc *LogClient) RestoreCompactedSstFiles( return r.Err } i := r.Item - rewriteRules, ok := rules[i.Meta.TableId] + + tid := i.TableID() + if r, ok := i.(RewrittenSST); ok && r.RewrittenTo() > 0 { + tid = r.RewrittenTo() + } + rewriteRules, ok := rules[tid] if !ok { - log.Warn("[Compacted SST Restore] Skipping excluded table during restore.", zap.Int64("table_id", i.Meta.TableId)) + log.Warn("[Compacted SST Restore] Skipping excluded table during restore.", zap.Int64("table_id", i.TableID())) continue } + newRules, err := rewriteRewriteRuleBy(i, rewriteRules) + if err != nil { + return err + } + set := restore.BackupFileSet{ - TableID: i.Meta.TableId, - SSTFiles: i.SstOutputs, - RewriteRules: rewriteRules, + TableID: i.TableID(), + SSTFiles: i.GetSSTs(), + RewriteRules: newRules, } backupFileSets = append(backupFileSets, set) } @@ -311,7 +357,30 @@ func (rc *LogClient) RestoreCompactedSstFiles( if err != nil { return errors.Trace(err) } - return rc.sstRestoreManager.restorer.WaitUntilFinish() + err = rc.sstRestoreManager.restorer.WaitUntilFinish() + + for _, files := range backupFileSets { + for _, f := range files.SSTFiles { + log.Info("Collected file.", zap.Uint64("total_kv", f.TotalKvs), zap.Uint64("total_bytes", f.TotalBytes), zap.Uint64("size", f.Size_)) + atomic.AddUint64(&rc.restoreStat.restoreSSTKVCount, f.TotalKvs) + atomic.AddUint64(&rc.restoreStat.restoreSSTKVSize, f.TotalBytes) + atomic.AddUint64(&rc.restoreStat.restoreSSTPhySize, f.Size_) + } + } + atomic.AddUint64(&rc.restoreStat.restoreSSTTakes, uint64(time.Since(begin))) + return err +} + +func (rc *LogClient) RestoreSSTStatisticFields(pushTo *[]zapcore.Field) { + takes := time.Duration(rc.restoreStat.restoreSSTTakes) + fields := []zapcore.Field{ + zap.Uint64("restore-sst-kv-count", rc.restoreStat.restoreSSTKVCount), + zap.Uint64("restore-sst-kv-size", rc.restoreStat.restoreSSTKVSize), + zap.Uint64("restore-sst-physical-size (after compression)", rc.restoreStat.restoreSSTPhySize), + zap.Duration("restore-sst-total-take", takes), + zap.String("average-speed (sst)", units.HumanSize(float64(rc.restoreStat.restoreSSTKVSize)/takes.Seconds())+"/s"), + } + 
*pushTo = append(*pushTo, fields...) } func (rc *LogClient) SetRawKVBatchClient( @@ -516,13 +585,29 @@ func (rc *LogClient) InitCheckpointMetadataForLogRestore( return gcRatio, nil } -func (rc *LogClient) GetMigrations(ctx context.Context) ([]*backuppb.Migration, error) { - ext := stream.MigerationExtension(rc.storage) +type LockedMigrations struct { + Migs []*backup.Migration + ReadLock storage.RemoteLock +} + +func (rc *LogClient) GetMigrations(ctx context.Context) (*LockedMigrations, error) { + ext := stream.MigrationExtension(rc.storage) migs, err := ext.Load(ctx) if err != nil { return nil, errors.Trace(err) } - return migs.ListAll(), nil + + ms := migs.ListAll() + readLock, err := ext.GetReadLock(ctx, "restore stream") + if err != nil { + return nil, err + } + + lms := &LockedMigrations{ + Migs: ms, + ReadLock: readLock, + } + return lms, nil } func (rc *LogClient) InstallLogFileManager(ctx context.Context, startTS, restoreTS uint64, metadataDownloadBatchSize uint, @@ -544,6 +629,8 @@ func (rc *LogClient) InstallLogFileManager(ctx context.Context, startTS, restore if err != nil { return err } + rc.logFilesStat = logFilesStatistic{} + rc.LogFileManager.Stats = &rc.logFilesStat return nil } @@ -1509,15 +1596,15 @@ func (rc *LogClient) UpdateSchemaVersion(ctx context.Context) error { // It uses a region splitter to handle the splitting logic based on the provided rules and checkpoint sets. func (rc *LogClient) WrapCompactedFilesIterWithSplitHelper( ctx context.Context, - compactedIter iter.TryNextor[*backuppb.LogFileSubcompaction], + compactedIter iter.TryNextor[SSTs], rules map[int64]*restoreutils.RewriteRules, checkpointSets map[string]struct{}, updateStatsFn func(uint64, uint64), splitSize uint64, splitKeys int64, -) (iter.TryNextor[*backuppb.LogFileSubcompaction], error) { +) (iter.TryNextor[SSTs], error) { client := split.NewClient(rc.pdClient, rc.pdHTTPClient, rc.tlsConf, maxSplitKeysOnce, 3) - wrapper := restore.PipelineRestorerWrapper[*backuppb.LogFileSubcompaction]{ + wrapper := restore.PipelineRestorerWrapper[SSTs]{ PipelineRegionsSplitter: split.NewPipelineRegionsSplitter(client, splitSize, splitKeys), } strategy := NewCompactedFileSplitStrategy(rules, checkpointSets, updateStatsFn) diff --git a/br/pkg/restore/log_client/client_test.go b/br/pkg/restore/log_client/client_test.go index 1b16b25ecfa46..58a34dcfa61bb 100644 --- a/br/pkg/restore/log_client/client_test.go +++ b/br/pkg/restore/log_client/client_test.go @@ -1659,11 +1659,11 @@ func TestCompactedSplitStrategy(t *testing.T) { } cases := []struct { - MockSubcompationIter iter.TryNextor[*backuppb.LogFileSubcompaction] + MockSubcompationIter iter.TryNextor[logclient.SSTs] ExpectRegionEndKeys [][]byte }{ { - iter.FromSlice([]*backuppb.LogFileSubcompaction{ + iter.FromSlice([]logclient.SSTs{ fakeSubCompactionWithOneSst(1, 100, 16*units.MiB, 100), fakeSubCompactionWithOneSst(1, 200, 32*units.MiB, 200), fakeSubCompactionWithOneSst(2, 100, 48*units.MiB, 300), @@ -1678,7 +1678,7 @@ func TestCompactedSplitStrategy(t *testing.T) { }, }, { - iter.FromSlice([]*backuppb.LogFileSubcompaction{ + iter.FromSlice([]logclient.SSTs{ fakeSubCompactionWithOneSst(1, 100, 16*units.MiB, 100), fakeSubCompactionWithOneSst(1, 200, 32*units.MiB, 200), fakeSubCompactionWithOneSst(1, 100, 32*units.MiB, 10), @@ -1694,7 +1694,7 @@ func TestCompactedSplitStrategy(t *testing.T) { }, }, { - iter.FromSlice([]*backuppb.LogFileSubcompaction{ + iter.FromSlice([]logclient.SSTs{ fakeSubCompactionWithOneSst(1, 100, 16*units.MiB, 100), 
fakeSubCompactionWithOneSst(1, 200, 32*units.MiB, 200), fakeSubCompactionWithOneSst(2, 100, 32*units.MiB, 300), @@ -1719,7 +1719,7 @@ func TestCompactedSplitStrategy(t *testing.T) { mockPDCli.SetRegions(oriRegions) client := split.NewClient(mockPDCli, nil, nil, 100, 4) - wrapper := restore.PipelineRestorerWrapper[*backuppb.LogFileSubcompaction]{ + wrapper := restore.PipelineRestorerWrapper[logclient.SSTs]{ PipelineRegionsSplitter: split.NewPipelineRegionsSplitter(client, 4*units.MB, 400), } @@ -1774,14 +1774,14 @@ func TestCompactedSplitStrategyWithCheckpoint(t *testing.T) { } cases := []struct { - MockSubcompationIter iter.TryNextor[*backuppb.LogFileSubcompaction] + MockSubcompationIter iter.TryNextor[logclient.SSTs] CheckpointSet map[string]struct{} ProcessedKVCount int ProcessedSize int ExpectRegionEndKeys [][]byte }{ { - iter.FromSlice([]*backuppb.LogFileSubcompaction{ + iter.FromSlice([]logclient.SSTs{ fakeSubCompactionWithOneSst(1, 100, 16*units.MiB, 100), fakeSubCompactionWithOneSst(1, 200, 32*units.MiB, 200), fakeSubCompactionWithOneSst(2, 100, 48*units.MiB, 300), @@ -1801,7 +1801,7 @@ func TestCompactedSplitStrategyWithCheckpoint(t *testing.T) { }, }, { - iter.FromSlice([]*backuppb.LogFileSubcompaction{ + iter.FromSlice([]logclient.SSTs{ fakeSubCompactionWithOneSst(1, 100, 16*units.MiB, 100), fakeSubCompactionWithOneSst(1, 200, 32*units.MiB, 200), fakeSubCompactionWithOneSst(1, 100, 32*units.MiB, 10), @@ -1820,7 +1820,7 @@ func TestCompactedSplitStrategyWithCheckpoint(t *testing.T) { }, }, { - iter.FromSlice([]*backuppb.LogFileSubcompaction{ + iter.FromSlice([]logclient.SSTs{ fakeSubCompactionWithOneSst(1, 100, 16*units.MiB, 100), fakeSubCompactionWithOneSst(1, 200, 32*units.MiB, 200), fakeSubCompactionWithOneSst(2, 100, 32*units.MiB, 300), @@ -1843,7 +1843,7 @@ func TestCompactedSplitStrategyWithCheckpoint(t *testing.T) { }, }, { - iter.FromSlice([]*backuppb.LogFileSubcompaction{ + iter.FromSlice([]logclient.SSTs{ fakeSubCompactionWithOneSst(1, 100, 16*units.MiB, 100), fakeSubCompactionWithOneSst(1, 200, 32*units.MiB, 200), fakeSubCompactionWithOneSst(2, 100, 32*units.MiB, 300), @@ -1866,7 +1866,7 @@ func TestCompactedSplitStrategyWithCheckpoint(t *testing.T) { }, }, { - iter.FromSlice([]*backuppb.LogFileSubcompaction{ + iter.FromSlice([]logclient.SSTs{ fakeSubCompactionWithOneSst(1, 100, 16*units.MiB, 100), fakeSubCompactionWithMultiSsts(1, 200, 32*units.MiB, 200), fakeSubCompactionWithOneSst(2, 100, 32*units.MiB, 300), @@ -1897,7 +1897,7 @@ func TestCompactedSplitStrategyWithCheckpoint(t *testing.T) { mockPDCli.SetRegions(oriRegions) client := split.NewClient(mockPDCli, nil, nil, 100, 4) - wrapper := restore.PipelineRestorerWrapper[*backuppb.LogFileSubcompaction]{ + wrapper := restore.PipelineRestorerWrapper[logclient.SSTs]{ PipelineRegionsSplitter: split.NewPipelineRegionsSplitter(client, 4*units.MB, 400), } totalSize := 0 @@ -1929,8 +1929,8 @@ func TestCompactedSplitStrategyWithCheckpoint(t *testing.T) { } } -func fakeSubCompactionWithMultiSsts(tableID, rowID int64, length uint64, num uint64) *backuppb.LogFileSubcompaction { - return &backuppb.LogFileSubcompaction{ +func fakeSubCompactionWithMultiSsts(tableID, rowID int64, length uint64, num uint64) logclient.SSTs { + return &logclient.CompactedSSTs{&backuppb.LogFileSubcompaction{ Meta: &backuppb.LogFileSubcompactionMeta{ TableId: tableID, }, @@ -1950,10 +1950,10 @@ func fakeSubCompactionWithMultiSsts(tableID, rowID int64, length uint64, num uin TotalKvs: num, }, }, - } + }} } -func fakeSubCompactionWithOneSst(tableID, rowID 
int64, length uint64, num uint64) *backuppb.LogFileSubcompaction { - return &backuppb.LogFileSubcompaction{ +func fakeSubCompactionWithOneSst(tableID, rowID int64, length uint64, num uint64) logclient.SSTs { + return &logclient.CompactedSSTs{&backuppb.LogFileSubcompaction{ Meta: &backuppb.LogFileSubcompactionMeta{ TableId: tableID, }, @@ -1966,7 +1966,7 @@ func fakeSubCompactionWithOneSst(tableID, rowID int64, length uint64, num uint64 TotalKvs: num, }, }, - } + }} } func fakeFile(tableID, rowID int64, length uint64, num int64) *backuppb.DataFileInfo { diff --git a/br/pkg/restore/log_client/compacted_file_strategy.go b/br/pkg/restore/log_client/compacted_file_strategy.go index 9637cf2e529b6..97ef09e59ca48 100644 --- a/br/pkg/restore/log_client/compacted_file_strategy.go +++ b/br/pkg/restore/log_client/compacted_file_strategy.go @@ -7,9 +7,10 @@ import ( backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/logutil" "github.com/pingcap/tidb/br/pkg/restore/split" + "github.com/pingcap/tidb/br/pkg/restore/utils" restoreutils "github.com/pingcap/tidb/br/pkg/restore/utils" - "github.com/pingcap/tidb/pkg/util/codec" "go.uber.org/zap" ) @@ -23,7 +24,7 @@ type CompactedFileSplitStrategy struct { checkpointFileProgressFn func(uint64, uint64) } -var _ split.SplitStrategy[*backuppb.LogFileSubcompaction] = &CompactedFileSplitStrategy{} +var _ split.SplitStrategy[SSTs] = &CompactedFileSplitStrategy{} func NewCompactedFileSplitStrategy( rules map[int64]*restoreutils.RewriteRules, @@ -37,19 +38,50 @@ func NewCompactedFileSplitStrategy( } } -func (cs *CompactedFileSplitStrategy) Accumulate(subCompaction *backuppb.LogFileSubcompaction) { - splitHelper, exist := cs.TableSplitter[subCompaction.Meta.TableId] +type sstIdentity struct { + EffectiveID int64 + RewriteBoundary *restoreutils.RewriteRules +} + +func (cs *CompactedFileSplitStrategy) inspect(ssts SSTs) sstIdentity { + r, ok := ssts.(RewrittenSST) + if !ok { + return sstIdentity{ + EffectiveID: ssts.TableID(), + RewriteBoundary: nil, + } + } + + rule := restoreutils.GetRewriteRuleOfTable(ssts.TableID(), r.RewrittenTo(), 0, map[int64]int64{}, false) + + return sstIdentity{ + EffectiveID: r.RewrittenTo(), + RewriteBoundary: rule, + } +} + +func (cs *CompactedFileSplitStrategy) Accumulate(ssts SSTs) { + identity := cs.inspect(ssts) + + splitHelper, exist := cs.TableSplitter[identity.EffectiveID] if !exist { splitHelper = split.NewSplitHelper() - cs.TableSplitter[subCompaction.Meta.TableId] = splitHelper + log.Info("Initialized splitter for table.", + zap.Int64("table-id", ssts.TableID()), zap.Int64("effective-id", identity.EffectiveID), zap.Stringer("rewrite-boundary", identity.RewriteBoundary)) + cs.TableSplitter[identity.EffectiveID] = splitHelper } - for _, f := range subCompaction.SstOutputs { - startKey := codec.EncodeBytes(nil, f.StartKey) - endKey := codec.EncodeBytes(nil, f.EndKey) + for _, f := range ssts.GetSSTs() { + startKey, endKey, err := utils.GetRewriteRawKeys(f, identity.RewriteBoundary) + if err != nil { + log.Panic("[unreachable] the rewrite rule doesn't match the SST file, this shouldn't happen...", + logutil.ShortError(err), zap.Stringer("rule", identity.RewriteBoundary), zap.Int64("effective-id", identity.EffectiveID), + zap.Stringer("file", f), + ) + } cs.AccumulateCount += 1 if f.TotalKvs == 0 || f.Size_ == 0 { - log.Error("No key-value pairs in subcompaction", zap.String("name", f.Name)) + log.Warn("No key-value pairs in subcompaction", zap.String("name", f.Name)) continue } // 
The number of MVCC entries in the compacted SST files can be excessive. @@ -82,14 +114,27 @@ func (cs *CompactedFileSplitStrategy) ShouldSplit() bool { return cs.AccumulateCount > (4096 / impactFactor) } -func (cs *CompactedFileSplitStrategy) ShouldSkip(subCompaction *backuppb.LogFileSubcompaction) bool { - _, exist := cs.Rules[subCompaction.Meta.TableId] - if !exist { - log.Info("skip for no rule files", zap.Int64("tableID", subCompaction.Meta.TableId)) +func hasARule[T any](ssts SSTs, rules map[int64]T) bool { + if _, exist := rules[ssts.TableID()]; exist { + return true + } + + if r, ok := ssts.(RewrittenSST); ok { + if _, exist := rules[r.RewrittenTo()]; exist { + return true + } + } + + return false +} + +func (cs *CompactedFileSplitStrategy) ShouldSkip(ssts SSTs) bool { + if !hasARule(ssts, cs.Rules) { + log.Warn("skip for no rule files", zap.Int64("tableID", ssts.TableID()), zap.Any("ssts", ssts)) return true } - sstOutputs := make([]*backuppb.File, 0, len(subCompaction.SstOutputs)) - for _, sst := range subCompaction.SstOutputs { + sstOutputs := make([]*backuppb.File, 0, len(ssts.GetSSTs())) + for _, sst := range ssts.GetSSTs() { if _, ok := cs.checkpointSets[sst.Name]; !ok { sstOutputs = append(sstOutputs, sst) } else { @@ -103,9 +148,12 @@ func (cs *CompactedFileSplitStrategy) ShouldSkip(subCompaction *backuppb.LogFile log.Info("all files in sub compaction skipped") return true } - if len(sstOutputs) != len(subCompaction.SstOutputs) { - log.Info("partial files in sub compaction skipped due to checkpoint") - subCompaction.SstOutputs = sstOutputs + if len(sstOutputs) != len(ssts.GetSSTs()) { + log.Info( + "partial files in sub compaction skipped due to checkpoint", + zap.Int("origin", len(ssts.GetSSTs())), zap.Int("output", len(sstOutputs)), + ) + ssts.SetSSTs(sstOutputs) return false } return false diff --git a/br/pkg/restore/log_client/export_test.go b/br/pkg/restore/log_client/export_test.go index f78a54bf50c8a..d477b8b8cf897 100644 --- a/br/pkg/restore/log_client/export_test.go +++ b/br/pkg/restore/log_client/export_test.go @@ -127,3 +127,11 @@ func (helper *FakeStreamMetadataHelper) ReadFile( ) ([]byte, error) { return helper.Data[offset : offset+length], nil } + +func (w *WithMigrations) AddIngestedSSTs(extPath string) { + w.fullBackups = append(w.fullBackups, extPath) +} + +func (w *WithMigrations) SetRestoredTS(ts uint64) { + w.restoredTS = ts +} diff --git a/br/pkg/restore/log_client/log_file_manager.go b/br/pkg/restore/log_client/log_file_manager.go index 4c2992467a2ab..c77c1ebbab0ba 100644 --- a/br/pkg/restore/log_client/log_file_manager.go +++ b/br/pkg/restore/log_client/log_file_manager.go @@ -9,9 +9,11 @@ import ( "fmt" "strings" "sync" + "sync/atomic" "time" "github.com/pingcap/errors" + backup "github.com/pingcap/kvproto/pkg/brpb" backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/kvproto/pkg/encryptionpb" "github.com/pingcap/log" @@ -26,8 +28,6 @@ import ( "go.uber.org/zap" ) -var TotalEntryCount int64 - // MetaIter is the type of iterator of metadata files' content. type MetaIter = iter.TryNextor[*backuppb.Metadata] @@ -86,6 +86,12 @@ type streamMetadataHelper interface { ParseToMetadata(rawMetaData []byte) (*backuppb.Metadata, error) } +type logFilesStatistic struct { + NumEntries int64 + NumFiles uint64 + Size uint64 +} + // LogFileManager is the manager for log files of a certain restoration, // which supports read / filter from the log backup archive with static start TS / restore TS. 
type LogFileManager struct { @@ -107,6 +113,10 @@ type LogFileManager struct { withMigrations *WithMigrations metadataDownloadBatchSize uint + + // The output channel for statistics. + // This will be collected when reading the metadata. + Stats *logFilesStatistic } // LogFileManagerInit is the config needed for initializing the log file manager. @@ -310,6 +320,18 @@ func (rc *LogFileManager) LoadDDLFilesAndCountDMLFiles(ctx context.Context) ([]L return rc.collectDDLFilesAndPrepareCache(ctx, mg) } +type loadDMLFilesConfig struct { + Statistic *logFilesStatistic +} + +type loadDMLFilesOption func(*loadDMLFilesConfig) + +func lDOptWithStatistics(s *logFilesStatistic) loadDMLFilesOption { + return func(c *loadDMLFilesConfig) { + c.Statistic = s + } +} + // LoadDMLFiles loads all DML files needs to be restored in the restoration. // This function returns a stream, because there are usually many DML files need to be restored. func (rc *LogFileManager) LoadDMLFiles(ctx context.Context) (LogIter, error) { @@ -334,7 +356,11 @@ func (rc *LogFileManager) FilterMetaFiles(ms MetaNameIter) MetaGroupIter { return true } // count the progress - TotalEntryCount += d.NumberOfEntries + if rc.Stats != nil { + atomic.AddInt64(&rc.Stats.NumEntries, d.NumberOfEntries) + atomic.AddUint64(&rc.Stats.NumFiles, 1) + atomic.AddUint64(&rc.Stats.Size, d.Length) + } return !d.IsMeta }) return DDLMetaGroup{ @@ -347,8 +373,43 @@ func (rc *LogFileManager) FilterMetaFiles(ms MetaNameIter) MetaGroupIter { } // Fetch compactions that may contain file less than the TS. -func (rc *LogFileManager) GetCompactionIter(ctx context.Context) iter.TryNextor[*backuppb.LogFileSubcompaction] { - return rc.withMigrations.Compactions(ctx, rc.storage) +func (rc *LogFileManager) GetCompactionIter(ctx context.Context) iter.TryNextor[SSTs] { + return iter.Map(rc.withMigrations.Compactions(ctx, rc.storage), func(c *backup.LogFileSubcompaction) SSTs { + return &CompactedSSTs{c} + }) +} + +func (rc *LogFileManager) GetIngestedSSTsSSTs(ctx context.Context) iter.TryNextor[SSTs] { + return iter.FlatMap(rc.withMigrations.IngestedSSTss(ctx, rc.storage), func(c *backup.IngestedSSTs) iter.TryNextor[SSTs] { + remap := map[int64]int64{} + for _, r := range c.RewrittenTables { + remap[r.AncestorUpstream] = r.Upstream + } + return iter.TryMap(iter.FromSlice(c.Files), func(f *backup.File) (SSTs, error) { + sst := &AddedSSTs{File: f} + if id, ok := remap[sst.TableID()]; ok && id != sst.TableID() { + sst.Rewritten = backuppb.RewrittenTableID{ + AncestorUpstream: sst.TableID(), + Upstream: id, + } + } + return sst, nil + }) + }) +} + +func (rc *LogFileManager) CountExtraSSTTotalKVs(ctx context.Context) (int64, error) { + count := int64(0) + ssts := iter.ConcatAll(rc.GetCompactionIter(ctx), rc.GetIngestedSSTsSSTs(ctx)) + for err, ssts := range iter.AsSeq(ctx, ssts) { + if err != nil { + return 0, errors.Trace(err) + } + for _, sst := range ssts.GetSSTs() { + count += int64(sst.TotalKvs) + } + } + return count, nil } // the kv entry with ts, the ts is decoded from entry. 
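For reference, the core of GetIngestedSSTsSSTs above is the RewrittenTables remapping: build an AncestorUpstream -> Upstream map per IngestedSSTs batch, then tag every file whose original table id was remapped. The standalone sketch below shows only that decision logic with simplified local types (ingestedFile, taggedSST); it does not use the real kvproto structs or the AddedSSTs type from this patch.

package main

import "fmt"

// rewrittenTableID mirrors the shape of the RewrittenTableID mapping
// (AncestorUpstream -> Upstream), simplified for this sketch.
type rewrittenTableID struct{ ancestorUpstream, upstream int64 }

type ingestedFile struct {
	name    string
	tableID int64 // table id decoded from the SST's key range
}

type taggedSST struct {
	file      ingestedFile
	rewritten *rewrittenTableID // nil when the file needs no extra rewrite
}

// tagRewritten builds the remap once per batch and tags every file whose
// original table id was remapped to a different id.
func tagRewritten(files []ingestedFile, rewrites []rewrittenTableID) []taggedSST {
	remap := make(map[int64]int64, len(rewrites))
	for _, r := range rewrites {
		remap[r.ancestorUpstream] = r.upstream
	}
	out := make([]taggedSST, 0, len(files))
	for _, f := range files {
		t := taggedSST{file: f}
		if id, ok := remap[f.tableID]; ok && id != f.tableID {
			t.rewritten = &rewrittenTableID{ancestorUpstream: f.tableID, upstream: id}
		}
		out = append(out, t)
	}
	return out
}

func main() {
	got := tagRewritten(
		[]ingestedFile{{"a.sst", 1}, {"b.sst", 2}},
		[]rewrittenTableID{{ancestorUpstream: 1, upstream: 10}},
	)
	for _, t := range got {
		fmt.Printf("%s rewritten=%v\n", t.file.name, t.rewritten != nil)
	}
}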
diff --git a/br/pkg/restore/log_client/migration.go b/br/pkg/restore/log_client/migration.go index a7b4307e0f568..19bcbc2747d8c 100644 --- a/br/pkg/restore/log_client/migration.go +++ b/br/pkg/restore/log_client/migration.go @@ -19,6 +19,7 @@ import ( backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/br/pkg/stream" "github.com/pingcap/tidb/br/pkg/utils/iter" ) @@ -144,6 +145,7 @@ func (builder *WithMigrationsBuilder) coarseGrainedFilter(mig *backuppb.Migratio func (builder *WithMigrationsBuilder) Build(migs []*backuppb.Migration) WithMigrations { skipmap := make(metaSkipMap) compactionDirs := make([]string, 0, 8) + fullBackups := make([]string, 0, 8) for _, mig := range migs { // TODO: deal with TruncatedTo and DestructPrefix @@ -155,10 +157,15 @@ func (builder *WithMigrationsBuilder) Build(migs []*backuppb.Migration) WithMigr for _, c := range mig.Compactions { compactionDirs = append(compactionDirs, c.Artifacts) } + + fullBackups = append(fullBackups, mig.IngestedSstPaths...) } withMigrations := WithMigrations{ skipmap: skipmap, compactionDirs: compactionDirs, + fullBackups: fullBackups, + restoredTS: builder.restoredTS, + startTS: builder.startTS, } return withMigrations } @@ -210,6 +217,9 @@ func (mwm *MetaWithMigrations) Physicals(groupIndexIter GroupIndexIter) Physical type WithMigrations struct { skipmap metaSkipMap compactionDirs []string + fullBackups []string + restoredTS uint64 + startTS uint64 } func (wm *WithMigrations) Metas(metaNameIter MetaNameIter) MetaMigrationsIter { @@ -238,3 +248,15 @@ func (wm *WithMigrations) Compactions(ctx context.Context, s storage.ExternalSto return Subcompactions(ctx, name, s) }) } + +func (wm *WithMigrations) IngestedSSTss(ctx context.Context, s storage.ExternalStorage) iter.TryNextor[*backuppb.IngestedSSTs] { + filteredOut := iter.FilterOut(stream.LoadIngestedSSTss(ctx, s, wm.fullBackups), func(ebk stream.IngestedSSTss) bool { + gts := ebk.GroupTS() + return !ebk.GroupFinished() || gts < wm.startTS || gts >= wm.restoredTS + }) + return iter.FlatMap(filteredOut, func(ebk stream.IngestedSSTss) iter.TryNextor[*backuppb.IngestedSSTs] { + return iter.Map(iter.FromSlice(ebk), func(p stream.PathedIngestedSSTs) *backuppb.IngestedSSTs { + return p.IngestedSSTs + }) + }) +} diff --git a/br/pkg/restore/log_client/migration_test.go b/br/pkg/restore/log_client/migration_test.go index 5368d7416dadf..48ed70ebbee4b 100644 --- a/br/pkg/restore/log_client/migration_test.go +++ b/br/pkg/restore/log_client/migration_test.go @@ -19,8 +19,10 @@ import ( "fmt" "testing" + "github.com/google/uuid" backuppb "github.com/pingcap/kvproto/pkg/brpb" logclient "github.com/pingcap/tidb/br/pkg/restore/log_client" + "github.com/pingcap/tidb/br/pkg/storage" "github.com/pingcap/tidb/br/pkg/utils/iter" "github.com/stretchr/testify/require" ) @@ -350,3 +352,154 @@ func TestMigrations(t *testing.T) { } } } + +type efOP func(*backuppb.IngestedSSTs) + +func extFullBkup(ops ...efOP) *backuppb.IngestedSSTs { + ef := &backuppb.IngestedSSTs{} + for _, op := range ops { + op(ef) + } + return ef +} + +func finished() efOP { + return func(ef *backuppb.IngestedSSTs) { + ef.Finished = true + } +} + +func makeID() efOP { + id := uuid.New() + return func(ef *backuppb.IngestedSSTs) { + ef.BackupUuid = id[:] + } +} + +func prefix(pfx string) efOP { + return func(ef *backuppb.IngestedSSTs) { + ef.FilesPrefixHint = pfx + } +} + +func asIfTS(ts uint64) efOP { + return func(ef *backuppb.IngestedSSTs) { + ef.AsIfTs = ts + } +} + +func 
pef(t *testing.T, fb *backuppb.IngestedSSTs, sn int, s storage.ExternalStorage) string { + path := fmt.Sprintf("extbackupmeta_%08d", sn) + bs, err := fb.Marshal() + if err != nil { + require.NoError(t, err) + } + + err = s.WriteFile(context.Background(), path, bs) + require.NoError(t, err) + return path +} + +// tmp creates a temporary storage. +func tmp(t *testing.T) *storage.LocalStorage { + tmpDir := t.TempDir() + s, err := storage.NewLocalStorage(tmpDir) + require.NoError(t, err) + s.IgnoreEnoentForDelete = true + return s +} + +func assertFullBackupPfxs(t *testing.T, it iter.TryNextor[*backuppb.IngestedSSTs], items ...string) { + i := 0 + for res := it.TryNext(context.Background()); !res.Finished; res = it.TryNext(context.Background()) { + require.NoError(t, res.Err) + require.Equal(t, items[i], res.Item.FilesPrefixHint, + "item %d not match, wants %s, got %s", i, items[i], res.Item.FilesPrefixHint) + i++ + } + require.Equal(t, i, len(items), "not exceeded: %#v, i = %d", items, i) +} + +func TestNotRestoreIncomplete(t *testing.T) { + ctx := context.Background() + strg := tmp(t) + ebk := extFullBkup(prefix("001"), asIfTS(90), makeID()) + wm := new(logclient.WithMigrations) + wm.AddIngestedSSTs(pef(t, ebk, 0, strg)) + wm.SetRestoredTS(91) + + assertFullBackupPfxs(t, wm.IngestedSSTss(ctx, strg)) +} + +func TestRestoreSegmented(t *testing.T) { + ctx := context.Background() + strg := tmp(t) + id := makeID() + ebk1 := extFullBkup(prefix("001"), id) + ebk2 := extFullBkup(prefix("002"), asIfTS(90), finished(), id) + wm := new(logclient.WithMigrations) + wm.AddIngestedSSTs(pef(t, ebk1, 0, strg)) + wm.AddIngestedSSTs(pef(t, ebk2, 1, strg)) + wm.SetRestoredTS(91) + + assertFullBackupPfxs(t, wm.IngestedSSTss(ctx, strg), "001", "002") +} + +func TestFilteredOut(t *testing.T) { + ctx := context.Background() + strg := tmp(t) + id := makeID() + ebk1 := extFullBkup(prefix("001"), id) + ebk2 := extFullBkup(prefix("002"), asIfTS(90), finished(), id) + ebk3 := extFullBkup(prefix("003"), asIfTS(10), finished(), makeID()) + wm := new(logclient.WithMigrations) + wm.AddIngestedSSTs(pef(t, ebk1, 0, strg)) + wm.AddIngestedSSTs(pef(t, ebk2, 1, strg)) + wm.AddIngestedSSTs(pef(t, ebk3, 2, strg)) + wm.SetRestoredTS(89) + wm.SetStartTS(42) + + assertFullBackupPfxs(t, wm.IngestedSSTss(ctx, strg)) +} + +func TestMultiRestores(t *testing.T) { + ctx := context.Background() + strg := tmp(t) + id := makeID() + id2 := makeID() + + ebka1 := extFullBkup(prefix("001"), id) + ebkb1 := extFullBkup(prefix("101"), id2) + ebkb2 := extFullBkup(prefix("102"), asIfTS(88), finished(), id2) + ebka2 := extFullBkup(prefix("002"), asIfTS(90), finished(), id) + + wm := new(logclient.WithMigrations) + wm.AddIngestedSSTs(pef(t, ebka1, 0, strg)) + wm.AddIngestedSSTs(pef(t, ebkb1, 2, strg)) + wm.AddIngestedSSTs(pef(t, ebkb2, 3, strg)) + wm.AddIngestedSSTs(pef(t, ebka2, 4, strg)) + wm.SetRestoredTS(91) + + assertFullBackupPfxs(t, wm.IngestedSSTss(ctx, strg), "101", "102", "001", "002") +} + +func TestMultiFilteredOutOne(t *testing.T) { + ctx := context.Background() + strg := tmp(t) + id := makeID() + id2 := makeID() + + ebka1 := extFullBkup(prefix("001"), id) + ebkb1 := extFullBkup(prefix("101"), id2) + ebkb2 := extFullBkup(prefix("102"), asIfTS(88), finished(), id2) + ebka2 := extFullBkup(prefix("002"), asIfTS(90), finished(), id) + + wm := new(logclient.WithMigrations) + wm.AddIngestedSSTs(pef(t, ebka1, 0, strg)) + wm.AddIngestedSSTs(pef(t, ebkb1, 2, strg)) + wm.AddIngestedSSTs(pef(t, ebkb2, 3, strg)) + wm.AddIngestedSSTs(pef(t, ebka2, 4, 
strg)) + wm.SetRestoredTS(89) + + assertFullBackupPfxs(t, wm.IngestedSSTss(ctx, strg), "101", "102") +} diff --git a/br/pkg/restore/log_client/ssts.go b/br/pkg/restore/log_client/ssts.go new file mode 100644 index 0000000000000..f8d2a155c9f6e --- /dev/null +++ b/br/pkg/restore/log_client/ssts.go @@ -0,0 +1,109 @@ +// Copyright 2024 PingCAP, Inc. Licensed under Apache-2.0. + +package logclient + +import ( + "encoding/hex" + "fmt" + "log" + "sync/atomic" + + backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/tidb/pkg/tablecodec" + "go.uber.org/zap" +) + +var ( + _ RewrittenSST = &AddedSSTs{} +) + +// RewrittenSST is an extension to the `SSTs` that needs extra key rewriting. +// This allows a SST being restored "as if" it in another table. +// +// The name "rewritten" means that the SST has already been rewritten somewhere else -- +// before importing it, we need "replay" the rewrite on it. +// +// For example, if a SST contains content of table `1`. And `RewrittenTo` returns `10`, +// the downstream wants to rewrite table `10` to `100`: +// - When searching for rewrite rules for the SSTs, we will use the table ID `10`(`RewrittenTo()`). +// - When importing the SST, we will use the rewrite rule `1`(`TableID()`) -> `100`(RewriteRule). +type RewrittenSST interface { + // RewrittenTo returns the table ID that the SST should be treated as. + RewrittenTo() int64 +} + +// SSTs is an interface that represents a collection of SST files. +type SSTs interface { + // TableID returns the ID of the table associated with the SST files. + TableID() int64 + // GetSSTs returns a slice of pointers to backuppb.File, representing the SST files. + GetSSTs() []*backuppb.File + // SetSSTs allows the user to override the internal SSTs to be restored. + // The input SST set should already be a subset of `GetSSTs.` + SetSSTs([]*backuppb.File) +} + +type CompactedSSTs struct { + *backuppb.LogFileSubcompaction +} + +func (s *CompactedSSTs) TableID() int64 { + return s.Meta.TableId +} + +func (s *CompactedSSTs) GetSSTs() []*backuppb.File { + return s.SstOutputs +} + +func (s *CompactedSSTs) SetSSTs(files []*backuppb.File) { + s.SstOutputs = files +} + +type AddedSSTs struct { + File *backuppb.File + Rewritten backuppb.RewrittenTableID + + cachedTableID atomic.Int64 +} + +func (s *AddedSSTs) TableID() int64 { + cached := s.cachedTableID.Load() + if cached == 0 { + id := tablecodec.DecodeTableID(s.File.StartKey) + id2 := tablecodec.DecodeTableID(s.File.EndKey) + if id != id2 { + panic(fmt.Sprintf( + "yet restoring a SST with two adjacent tables not supported, they are %d and %d (start key = %s; end key = %s)", + id, + id2, + hex.EncodeToString(s.File.StartKey), + hex.EncodeToString(s.File.EndKey), + )) + } + s.cachedTableID.Store(id) + return id + } + + return cached +} + +func (s *AddedSSTs) GetSSTs() []*backuppb.File { + if s.File == nil { + return nil + } + return []*backuppb.File{s.File} +} + +func (s *AddedSSTs) SetSSTs(fs []*backuppb.File) { + if len(fs) == 0 { + s.File = nil + } + if len(fs) == 1 { + s.File = fs[0] + } + log.Panic("Too many files passed to AddedSSTs.SetSSTs.", zap.Any("input", fs)) +} + +func (s *AddedSSTs) RewrittenTo() int64 { + return s.Rewritten.Upstream +} diff --git a/br/pkg/restore/restorer.go b/br/pkg/restore/restorer.go index 9d999af9c09fc..c3e9034ce2001 100644 --- a/br/pkg/restore/restorer.go +++ b/br/pkg/restore/restorer.go @@ -276,7 +276,7 @@ func (m *MultiTablesRestorer) GoRestore(onProgress func(int64), batchFileSets .. 
m.ectx = opentracing.ContextWithSpan(m.ectx, span1) } - for _, batchFileSet := range batchFileSets { + for i, batchFileSet := range batchFileSets { if m.ectx.Err() != nil { log.Warn("Restoring encountered error and already stopped, give up remained files.", logutil.ShortError(m.ectx.Err())) @@ -287,15 +287,16 @@ func (m *MultiTablesRestorer) GoRestore(onProgress func(int64), batchFileSets .. } filesReplica := batchFileSet m.fileImporter.PauseForBackpressure() + cx := logutil.ContextWithField(m.ectx, zap.Int("sn", i)) m.workerPool.ApplyOnErrorGroup(m.eg, func() (restoreErr error) { fileStart := time.Now() defer func() { if restoreErr == nil { - log.Info("import files done", zap.Duration("take", time.Since(fileStart))) + logutil.CL(cx).Info("import files done", zap.Duration("take", time.Since(fileStart))) onProgress(int64(len(filesReplica))) } }() - if importErr := m.fileImporter.Import(m.ectx, filesReplica...); importErr != nil { + if importErr := m.fileImporter.Import(cx, filesReplica...); importErr != nil { return errors.Trace(importErr) } diff --git a/br/pkg/restore/snap_client/BUILD.bazel b/br/pkg/restore/snap_client/BUILD.bazel index 2df9df140d94f..84ac42c7698b5 100644 --- a/br/pkg/restore/snap_client/BUILD.bazel +++ b/br/pkg/restore/snap_client/BUILD.bazel @@ -6,6 +6,7 @@ go_library( "client.go", "import.go", "pipeline_items.go", + "pitr_collector.go", "placement_rule_manager.go", "systable_restore.go", "tikv_sender.go", @@ -29,6 +30,8 @@ go_library( "//br/pkg/restore/split", "//br/pkg/restore/utils", "//br/pkg/storage", + "//br/pkg/stream", + "//br/pkg/streamhelper", "//br/pkg/summary", "//br/pkg/utils", "//br/pkg/version", @@ -38,6 +41,7 @@ go_library( "//pkg/kv", "//pkg/meta", "//pkg/meta/model", + "//pkg/metrics", "//pkg/parser/ast", "//pkg/parser/mysql", "//pkg/tablecodec", @@ -55,9 +59,11 @@ go_library( "@com_github_pingcap_kvproto//pkg/kvrpcpb", "@com_github_pingcap_kvproto//pkg/metapb", "@com_github_pingcap_log//:log", + "@com_github_tikv_client_go_v2//oracle", "@com_github_tikv_client_go_v2//util", "@com_github_tikv_pd_client//:client", "@com_github_tikv_pd_client//http", + "@io_etcd_go_etcd_client_v3//:client", "@org_golang_google_grpc//codes", "@org_golang_google_grpc//keepalive", "@org_golang_google_grpc//status", @@ -76,13 +82,14 @@ go_test( "export_test.go", "import_test.go", "main_test.go", + "pitr_collector_test.go", "placement_rule_manager_test.go", "systable_restore_test.go", "tikv_sender_test.go", ], embed = [":snap_client"], flaky = True, - shard_count = 19, + shard_count = 23, deps = [ "//br/pkg/errors", "//br/pkg/glue", @@ -93,6 +100,8 @@ go_test( "//br/pkg/restore/internal/import_client", "//br/pkg/restore/split", "//br/pkg/restore/utils", + "//br/pkg/storage", + "//br/pkg/stream", "//br/pkg/utils", "//pkg/domain", "//pkg/kv", @@ -105,6 +114,7 @@ go_test( "//pkg/types", "//pkg/util", "//pkg/util/codec", + "@com_github_google_uuid//:uuid", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/brpb", diff --git a/br/pkg/restore/snap_client/client.go b/br/pkg/restore/snap_client/client.go index ae878b0e9e0ca..ab3e257b259d9 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -25,6 +25,7 @@ import ( "sync" "time" + "github.com/google/uuid" "github.com/opentracing/opentracing-go" "github.com/pingcap/errors" "github.com/pingcap/failpoint" @@ -53,6 +54,7 @@ import ( "github.com/pingcap/tidb/pkg/kv" "github.com/pingcap/tidb/pkg/meta" 
"github.com/pingcap/tidb/pkg/meta/model" + "github.com/pingcap/tidb/pkg/metrics" tidbutil "github.com/pingcap/tidb/pkg/util" "github.com/pingcap/tidb/pkg/util/redact" kvutil "github.com/tikv/client-go/v2/util" @@ -76,6 +78,7 @@ const minBatchDdlSize = 1 type SnapClient struct { restorer restore.SstRestorer + importer *SnapFileImporter // Use a closure to lazy load checkpoint runner getRestorerFn func(*checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer // Tool clients used by SnapClient @@ -153,6 +156,10 @@ type SnapClient struct { checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType] checkpointChecksum map[int64]*checkpoint.ChecksumItem + + // restoreUUID is the UUID of this restore. + // restore from a checkpoint inherits the same restoreUUID. + restoreUUID uuid.UUID } // NewRestoreClient returns a new RestoreClient. @@ -326,6 +333,7 @@ func (rc *SnapClient) InitCheckpoint( if err != nil { return checkpointSetWithTableID, nil, errors.Trace(err) } + rc.restoreUUID = meta.RestoreUUID if meta.UpstreamClusterID != rc.backupMeta.ClusterId { return checkpointSetWithTableID, nil, errors.Errorf( @@ -377,10 +385,13 @@ func (rc *SnapClient) InitCheckpoint( } } else { // initialize the checkpoint metadata since it is the first time to restore. + restoreID := uuid.New() meta := &checkpoint.CheckpointMetadataForSnapshotRestore{ UpstreamClusterID: rc.backupMeta.ClusterId, RestoredTS: rc.backupMeta.EndVersion, + RestoreUUID: restoreID, } + rc.restoreUUID = restoreID // a nil config means undo function if config != nil { meta.SchedulersConfig = &pdutil.ClusterConfig{Schedulers: config.Schedulers, ScheduleCfg: config.ScheduleCfg} @@ -422,6 +433,35 @@ func makeDBPool(size uint, dbFactory func() (*tidallocdb.DB, error)) ([]*tidallo return dbPool, nil } +func (rc *SnapClient) InstallPiTRSupport(ctx context.Context, deps PiTRCollDep) error { + collector, err := newPiTRColl(ctx, deps) + if err != nil { + return errors.Trace(err) + } + if !collector.enabled { + return nil + } + if rc.IsIncremental() { + // Even there were an error, don't return it to confuse the user... + _ = collector.close() + return errors.Annotatef(berrors.ErrStreamLogTaskExist, "it seems there is a log backup task exists, "+ + "if an incremental restore were performed to such cluster, log backup cannot properly handle this, "+ + "the restore will be aborted, you may stop the log backup task, then restore, finally restart the task") + } + + collector.restoreUUID = rc.restoreUUID + if collector.restoreUUID == (uuid.UUID{}) { + collector.restoreUUID = uuid.New() + log.Warn("UUID not found(checkpoint not enabled?), generating a new UUID for backup.", + zap.Stringer("uuid", collector.restoreUUID)) + } + rc.importer.beforeIngestCallbacks = append(rc.importer.beforeIngestCallbacks, collector.onBatch) + rc.importer.closeCallbacks = append(rc.importer.closeCallbacks, func(sfi *SnapFileImporter) error { + return collector.close() + }) + return nil +} + // Init create db connection and domain for storage. func (rc *SnapClient) Init(g glue.Glue, store kv.Storage) error { // setDB must happen after set PolicyMode. @@ -532,7 +572,6 @@ func (rc *SnapClient) initClients(ctx context.Context, backend *backuppb.Storage metaClient := split.NewClient(rc.pdClient, rc.pdHTTPClient, rc.tlsConf, maxSplitKeysOnce, rc.storeCount+1, splitClientOpts...) 
importCli := importclient.NewImportClient(metaClient, rc.tlsConf, rc.keepaliveConf) - var fileImporter *SnapFileImporter opt := NewSnapFileImporterOptions( rc.cipher, metaClient, importCli, backend, rc.rewriteMode, stores, rc.concurrencyPerStore, createCallBacks, closeCallBacks, @@ -543,23 +582,23 @@ func (rc *SnapClient) initClients(ctx context.Context, backend *backuppb.Storage mode = Txn } // for raw/txn mode. use backupMeta.ApiVersion to create fileImporter - fileImporter, err = NewSnapFileImporter(ctx, rc.backupMeta.ApiVersion, mode, opt) + rc.importer, err = NewSnapFileImporter(ctx, rc.backupMeta.ApiVersion, mode, opt) if err != nil { return errors.Trace(err) } // Raw/Txn restore are not support checkpoint for now rc.getRestorerFn = func(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { - return restore.NewSimpleSstRestorer(ctx, fileImporter, rc.workerPool, nil) + return restore.NewSimpleSstRestorer(ctx, rc.importer, rc.workerPool, nil) } } else { // or create a fileImporter with the cluster API version - fileImporter, err = NewSnapFileImporter( + rc.importer, err = NewSnapFileImporter( ctx, rc.dom.Store().GetCodec().GetAPIVersion(), TiDBFull, opt) if err != nil { return errors.Trace(err) } rc.getRestorerFn = func(checkpointRunner *checkpoint.CheckpointRunner[checkpoint.RestoreKeyType, checkpoint.RestoreValueType]) restore.SstRestorer { - return restore.NewMultiTablesRestorer(ctx, fileImporter, rc.workerPool, checkpointRunner) + return restore.NewMultiTablesRestorer(ctx, rc.importer, rc.workerPool, checkpointRunner) } } return nil @@ -864,7 +903,7 @@ func (rc *SnapClient) createTables( func (rc *SnapClient) createTablesBatch(ctx context.Context, tables []*metautil.Table, newTS uint64) ([]*CreatedTable, error) { eg, ectx := errgroup.WithContext(ctx) - rater := logutil.TraceRateOver(logutil.MetricTableCreatedCounter) + rater := logutil.TraceRateOver(metrics.RestoreTableCreatedCount) workers := tidbutil.NewWorkerPool(uint(len(rc.dbPool)), "Create Tables Worker") numOfTables := len(tables) createdTables := struct { @@ -948,7 +987,7 @@ func (rc *SnapClient) createTablesSingle( ) ([]*CreatedTable, error) { eg, ectx := errgroup.WithContext(ctx) workers := tidbutil.NewWorkerPool(uint(len(dbPool)), "DDL workers") - rater := logutil.TraceRateOver(logutil.MetricTableCreatedCounter) + rater := logutil.TraceRateOver(metrics.RestoreTableCreatedCount) createdTables := struct { sync.Mutex tables []*CreatedTable diff --git a/br/pkg/restore/snap_client/import.go b/br/pkg/restore/snap_client/import.go index 3db134fddf1e0..eabb5be7ede0f 100644 --- a/br/pkg/restore/snap_client/import.go +++ b/br/pkg/restore/snap_client/import.go @@ -41,6 +41,7 @@ import ( "github.com/pingcap/tidb/br/pkg/summary" "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/pkg/kv" + "github.com/pingcap/tidb/pkg/metrics" "github.com/pingcap/tidb/pkg/util/codec" kvutil "github.com/tikv/client-go/v2/util" "go.uber.org/zap" @@ -141,7 +142,8 @@ type SnapFileImporter struct { downloadTokensMap *storeTokenChannelMap ingestTokensMap *storeTokenChannelMap - closeCallbacks []func(*SnapFileImporter) error + closeCallbacks []func(*SnapFileImporter) error + beforeIngestCallbacks []func(context.Context, restore.BatchBackupFileSet) (afterIngest func() error, err error) concurrencyPerStore uint @@ -372,6 +374,18 @@ func (importer *SnapFileImporter) Import( ctx context.Context, backupFileSets ...restore.BackupFileSet, ) error { + delayCbs := []func() error{} + 
for i, cb := range importer.beforeIngestCallbacks { + d, err := cb(ctx, backupFileSets) + if err != nil { + return errors.Annotatef(err, "failed to executing the callback #%d", i) + } + if d != nil { + delayCbs = append(delayCbs, d) + } + } + + importBegin := time.Now() // Rewrite the start key and end key of file to scan regions startKey, endKey, err := importer.getKeyRangeForFiles(backupFileSets) if err != nil { @@ -386,7 +400,7 @@ func (importer *SnapFileImporter) Import( return errors.Trace(errScanRegion) } - log.Debug("scan regions", logutil.Key("start key", startKey), logutil.Key("end key", endKey), zap.Int("count", len(regionInfos))) + logutil.CL(ctx).Debug("scan regions", logutil.Key("start key", startKey), logutil.Key("end key", endKey), zap.Int("count", len(regionInfos))) start := time.Now() // Try to download and ingest the file in every region for _, regionInfo := range regionInfos { @@ -394,18 +408,18 @@ func (importer *SnapFileImporter) Import( // Try to download file. downloadMetas, errDownload := importer.download(ctx, info, backupFileSets, importer.cipher, importer.apiVersion) if errDownload != nil { - log.Warn("download file failed, retry later", + logutil.CL(ctx).Warn("download file failed, retry later", logutil.Region(info.Region), logutil.Key("startKey", startKey), logutil.Key("endKey", endKey), logutil.ShortError(errDownload)) return errors.Trace(errDownload) } - log.Debug("download file done", zap.Stringer("take", time.Since(start)), + logutil.CL(ctx).Debug("download file done", zap.Stringer("take", time.Since(start)), logutil.Key("start", startKey), logutil.Key("end", endKey)) start = time.Now() if errIngest := importer.ingest(ctx, info, downloadMetas); errIngest != nil { - log.Warn("ingest file failed, retry later", + logutil.CL(ctx).Warn("ingest file failed, retry later", logutil.Key("start", startKey), logutil.Key("end", endKey), logutil.SSTMetas(downloadMetas), @@ -413,14 +427,22 @@ func (importer *SnapFileImporter) Import( zap.Error(errIngest)) return errors.Trace(errIngest) } - log.Debug("ingest file done", logutil.Key("start", startKey), logutil.Key("end", endKey), zap.Stringer("take", time.Since(start))) + logutil.CL(ctx).Debug("ingest file done", logutil.Key("start", startKey), logutil.Key("end", endKey), zap.Stringer("take", time.Since(start))) } return nil }, utils.NewImportSSTBackoffStrategy()) if err != nil { - log.Error("import sst file failed after retry, stop the whole progress", restore.ZapBatchBackupFileSet(backupFileSets), zap.Error(err)) + logutil.CL(ctx).Error("import sst file failed after retry, stop the whole progress", restore.ZapBatchBackupFileSet(backupFileSets), zap.Error(err)) return errors.Trace(err) } + metrics.RestoreImportFileSeconds.Observe(time.Since(importBegin).Seconds()) + + for i, cb := range delayCbs { + if err := cb(); err != nil { + return errors.Annotatef(err, "failed to execute the delaied callback #%d", i) + } + } + for _, files := range backupFileSets { for _, f := range files.SSTFiles { summary.CollectSuccessUnit(summary.TotalKV, 1, f.TotalKvs) @@ -456,7 +478,7 @@ func getSSTMetaFromFile( } // Get the column family of the file by the file name. 
- var cfName string + cfName := file.GetCf() if strings.Contains(file.GetName(), restoreutils.DefaultCFName) { cfName = restoreutils.DefaultCFName } else if strings.Contains(file.GetName(), restoreutils.WriteCFName) { @@ -531,15 +553,15 @@ func (importer *SnapFileImporter) download( failpoint.Inject("restore-storage-error", func(val failpoint.Value) { msg := val.(string) - log.Debug("failpoint restore-storage-error injected.", zap.String("msg", msg)) + logutil.CL(ctx).Debug("failpoint restore-storage-error injected.", zap.String("msg", msg)) e = errors.Annotate(e, msg) }) failpoint.Inject("restore-gRPC-error", func(_ failpoint.Value) { - log.Warn("the connection to TiKV has been cut by a neko, meow :3") + logutil.CL(ctx).Warn("the connection to TiKV has been cut by a neko, meow :3") e = status.Error(codes.Unavailable, "the connection to TiKV has been cut by a neko, meow :3") }) if isDecryptSstErr(e) { - log.Info("fail to decrypt when download sst, try again with no-crypt") + logutil.CL(ctx).Info("fail to decrypt when download sst, try again with no-crypt") if importer.kvMode == Raw || importer.kvMode == Txn { downloadMetas, e = importer.downloadRawKVSST(ctx, regionInfo, filesGroup, nil, apiVersion) } else { @@ -840,7 +862,7 @@ func (importer *SnapFileImporter) ingest( break } // do not get region info, wait a second and GetRegion() again. - log.Warn("ingest get region by key return nil", logutil.Region(info.Region), + logutil.CL(ctx).Warn("ingest get region by key return nil", logutil.Region(info.Region), logutil.SSTMetas(downloadMetas), ) time.Sleep(time.Second) @@ -850,7 +872,7 @@ func (importer *SnapFileImporter) ingest( if !split.CheckRegionEpoch(newInfo, info) { return errors.Trace(berrors.ErrKVEpochNotMatch) } - log.Debug("ingest sst returns not leader error, retry it", + logutil.CL(ctx).Debug("ingest sst returns not leader error, retry it", logutil.SSTMetas(downloadMetas), logutil.Region(info.Region), zap.Stringer("newLeader", newInfo.Leader)) @@ -893,7 +915,7 @@ func (importer *SnapFileImporter) ingestSSTs( Context: reqCtx, Ssts: sstMetas, } - log.Debug("ingest SSTs", logutil.SSTMetas(sstMetas), logutil.Leader(leader)) + logutil.CL(ctx).Debug("ingest SSTs", logutil.SSTMetas(sstMetas), logutil.Leader(leader)) resp, err := importer.importClient.MultiIngest(ctx, leader.GetStoreId(), req) return resp, errors.Trace(err) } diff --git a/br/pkg/restore/snap_client/pitr_collector.go b/br/pkg/restore/snap_client/pitr_collector.go new file mode 100644 index 0000000000000..7a5c6e9cca781 --- /dev/null +++ b/br/pkg/restore/snap_client/pitr_collector.go @@ -0,0 +1,474 @@ +package snapclient + +import ( + "context" + "fmt" + "path/filepath" + "sync" + "time" + + "github.com/google/uuid" + "github.com/pingcap/errors" + pb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/log" + berrors "github.com/pingcap/tidb/br/pkg/errors" + "github.com/pingcap/tidb/br/pkg/logutil" + "github.com/pingcap/tidb/br/pkg/restore" + "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/br/pkg/stream" + "github.com/pingcap/tidb/br/pkg/streamhelper" + "github.com/pingcap/tidb/br/pkg/summary" + "github.com/pingcap/tidb/pkg/metrics" + "github.com/pingcap/tidb/pkg/util" + "github.com/tikv/client-go/v2/oracle" + pd "github.com/tikv/pd/client" + clientv3 "go.etcd.io/etcd/client/v3" + "go.uber.org/zap" + "golang.org/x/sync/errgroup" +) + +type persistCall struct { + cx context.Context + cb func(error) +} + +// persisterHandle is a handle to the background writer persisting the metadata. 
+type persisterHandle struct { + hnd chan<- persistCall +} + +// close releases the handle. +func (w persisterHandle) close() { + close(w.hnd) +} + +// write starts a request to persist the current metadata to the external storage. +// +// all modification before the `write` call will be persisted in the external storage +// after this returns. +func (w persisterHandle) write(ctx context.Context) error { + // A buffer here is necessrary. + // Or once the writerCall finished too fastly, it calls the callback before the `select` + // block entered, we may lose the response. + ch := make(chan error, 1) + w.hnd <- persistCall{ + cx: ctx, + cb: func(err error) { + select { + case ch <- err: + default: + log.Warn("Blocked when sending to a oneshot channel, dropping the message.", + logutil.AShortError("dropped-result", err), zap.StackSkip("caller", 1)) + } + }, + } + + select { + case err, ok := <-ch: + if !ok { + // Though the channel is never closed, we can still gracefully exit + // by canceling the context. + log.Panic("[unreachable] A channel excepted to be never closed was closed.") + } + return err + case <-ctx.Done(): + return ctx.Err() + } +} + +// goPersister spawns the background centeralized persister. +// +// this would be the sole goroutine that writes to `c.metaPath()`. +func (c *pitrCollector) goPersister() { + hnd := make(chan persistCall, 2048) + exhaust := func(f func(persistCall)) { + collect: + for { + select { + case cb, ok := <-hnd: + if !ok { + log.Warn("Early channel close. Should not happen.") + return + } + f(cb) + default: + break collect + } + } + } + + go func() { + for newCall := range hnd { + cs := []persistCall{newCall} + // Consuming all pending writes. + exhaust(func(newCall persistCall) { + cs = append(cs, newCall) + }) + + err := c.doPersistExtraBackupMeta(cs[0].cx) + + for _, c := range cs { + c.cb(err) + } + } + }() + + c.writerRoutine = persisterHandle{ + hnd: hnd, + } +} + +type pitrCollector struct { + // Immutable state. + taskStorage storage.ExternalStorage + restoreStorage storage.ExternalStorage + name string + enabled bool + restoreUUID uuid.UUID + + // Mutable state. + extraBackupMeta ingestedSSTsMeta + extraBackupMetaLock sync.Mutex + putMigOnce sync.Once + + writerRoutine persisterHandle + + // Delegates. + tso func(ctx context.Context) (uint64, error) + restoreSuccess func() bool +} + +// ingestedSSTsMeta is state of already imported SSTs. +// +// This and only this will be fully persisted to the +// ingested ssts meta in the external storage. +type ingestedSSTsMeta struct { + msg pb.IngestedSSTs + rewrites map[int64]int64 +} + +// genMsg generates the protocol buffer message to persist. 
+func (c *ingestedSSTsMeta) genMsg() *pb.IngestedSSTs { + msg := util.ProtoV1Clone(&c.msg) + for old, new := range c.rewrites { + msg.RewrittenTables = append(msg.RewrittenTables, &pb.RewrittenTableID{AncestorUpstream: old, Upstream: new}) + } + return msg +} + +func (c *pitrCollector) close() error { + if !c.enabled { + return nil + } + + defer c.writerRoutine.close() + + cx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + if !c.restoreSuccess() { + log.Warn("Backup not success, put a half-finished metadata to the log backup.", + zap.Stringer("uuid", c.restoreUUID)) + return errors.Annotatef(c.persistExtraBackupMeta(cx), "failed to persist the meta") + } + + commitTS, err := c.commit(cx) + if err != nil { + return errors.Annotate(err, "failed to commit pitrCollector") + } + log.Info("Log backup SSTs are committed.", + zap.Uint64("commitTS", commitTS), zap.String("committedTo", c.outputPath())) + + return nil +} + +func (C *pitrCollector) verifyCompatibilityFor(fileset *restore.BackupFileSet) error { + if len(fileset.RewriteRules.NewKeyspace) > 0 { + return errors.Annotate(berrors.ErrUnsupportedOperation, "keyspace rewriting isn't supported when log backup enabled") + } + for i, r := range fileset.RewriteRules.Data { + if r.NewTimestamp > 0 { + return errors.Annotatef(berrors.ErrUnsupportedOperation, + "rewrite rule #%d: rewrite timestamp isn't supported when log backup enabled", i) + } + if r.IgnoreAfterTimestamp > 0 || r.IgnoreBeforeTimestamp > 0 { + return errors.Annotatef(berrors.ErrUnsupportedOperation, + "rewrite rule #%d: truncating timestamp isn't supported when log backup enabled", i) + } + } + return nil +} + +func (c *pitrCollector) onBatch(ctx context.Context, fileSets restore.BatchBackupFileSet) (func() error, error) { + if !c.enabled { + return nil, nil + } + + if err := c.prepareMigIfNeeded(ctx); err != nil { + return nil, err + } + + begin := time.Now() + eg, ectx := errgroup.WithContext(ctx) + fileCount := 0 + for _, fileSet := range fileSets { + if err := c.verifyCompatibilityFor(&fileSet); err != nil { + return nil, err + } + + for _, file := range fileSet.SSTFiles { + file := file + fileCount += 1 + eg.Go(func() error { + if err := c.putSST(ectx, file); err != nil { + return errors.Annotatef(err, "failed to put sst %s", file.GetName()) + } + return nil + }) + } + for _, hint := range fileSet.RewriteRules.TableIDRemapHint { + hint := hint + eg.Go(func() error { + if err := c.putRewriteRule(ectx, hint.Origin, hint.Rewritten); err != nil { + return errors.Annotatef(err, "failed to put rewrite rule of %v", fileSet.RewriteRules) + } + return nil + }) + } + } + + waitDone := func() error { + err := eg.Wait() + if err != nil { + logutil.CL(ctx).Warn("Failed to upload SSTs for future PiTR.", logutil.ShortError(err)) + return err + } + + logutil.CL(ctx).Info("Uploaded a batch of SSTs for future PiTR.", + zap.Duration("take", time.Since(begin)), zap.Int("file-count", fileCount)) + + err = c.persistExtraBackupMeta(ctx) + if err != nil { + return errors.Annotatef(err, "failed to persist backup meta when finishing batch") + } + return nil + } + return waitDone, nil +} + +func (c *pitrCollector) doWithMetaLock(f func()) { + c.extraBackupMetaLock.Lock() + f() + c.extraBackupMetaLock.Unlock() +} + +// outputPath constructs the path by a relative path for outputting. +func (c *pitrCollector) outputPath(segs ...string) string { + return filepath.Join(append([]string{"v1", "ext_backups", c.name}, segs...)...) 
+} + +func (c *pitrCollector) metaPath() string { + return c.outputPath("extbackupmeta") +} + +func (c *pitrCollector) sstPath(name string) string { + return c.outputPath("sst_files", name) +} + +// putSST records an SST file. +func (c *pitrCollector) putSST(ctx context.Context, f *pb.File) error { + if !c.enabled { + return nil + } + + begin := time.Now() + + f = util.ProtoV1Clone(f) + out := c.sstPath(f.Name) + + copier, ok := c.taskStorage.(storage.Copier) + if !ok { + return errors.Annotatef(berrors.ErrInvalidArgument, "storage %T does not support copying", c.taskStorage) + } + spec := storage.CopySpec{ + From: f.GetName(), + To: out, + } + + copyStart := time.Now() + if err := copier.CopyFrom(ctx, c.restoreStorage, spec); err != nil { + return errors.Annotatef(err, "failed to copy sst file %s to %s, "+ + "you may check whether permissions are granted in both %s and %s, "+ + "and the two storages are provided by the same cloud vendor", + spec.From, spec.To, c.restoreStorage.URI(), c.taskStorage.URI()) + } + log.Info("Copy SST to log backup storage success.", zap.String("file", f.Name), zap.Stringer("takes", time.Since(copyStart))) + + f.Name = out + c.doWithMetaLock(func() { c.extraBackupMeta.msg.Files = append(c.extraBackupMeta.msg.Files, f) }) + + metrics.RestoreUploadSSTForPiTRSeconds.Observe(time.Since(begin).Seconds()) + return nil +} + +// putRewriteRule records a rewrite rule. +func (c *pitrCollector) putRewriteRule(_ context.Context, oldID int64, newID int64) error { + if !c.enabled { + return nil + } + var err error + c.doWithMetaLock(func() { + if oldVal, ok := c.extraBackupMeta.rewrites[oldID]; ok && oldVal != newID { + err = errors.Annotatef( + berrors.ErrInvalidArgument, + "pitr coll rewrite rule conflict: we had %v -> %v, but you want rewrite to %v", + oldID, + oldVal, + newID, + ) + return + } + c.extraBackupMeta.rewrites[oldID] = newID + }) + return err +} + +// doPersistExtraBackupMeta writes the current content of extra backup meta to the external storage. +// This isn't goroutine-safe. Please don't call it concurrently. +func (c *pitrCollector) doPersistExtraBackupMeta(ctx context.Context) (err error) { + var bs []byte + begin := time.Now() + c.doWithMetaLock(func() { + msg := c.extraBackupMeta.genMsg() + // Here, after generating a snapshot of the current message then we can continue. + // This requires only a single active writer at anytime. + // (i.e. concurrent call to `doPersistExtraBackupMeta` may cause data race.) + // If there are many writers, the writer gets a stale snapshot may overwrite + // the latest persisted file. + bs, err = msg.Marshal() + }) + + if err != nil { + return errors.Annotate(err, "failed to marsal the committing message") + } + logutil.CL(ctx).Info("Persisting extra backup meta.", + zap.Stringer("uuid", c.restoreUUID), zap.String("path", c.metaPath()), zap.Stringer("takes", time.Since(begin))) + + err = c.taskStorage.WriteFile(ctx, c.metaPath(), bs) + if err != nil { + return errors.Annotatef(err, "failed to put content to meta to %s", c.metaPath()) + } + + metrics.RestoreUploadSSTMetaForPiTRSeconds.Observe(time.Since(begin).Seconds()) + logutil.CL(ctx).Debug("Persisting extra backup meta.", + zap.Stringer("uuid", c.restoreUUID), zap.String("path", c.metaPath()), zap.Stringer("takes", time.Since(begin))) + return nil +} + +func (c *pitrCollector) persistExtraBackupMeta(ctx context.Context) (err error) { + return c.writerRoutine.write(ctx) +} + +// Commit commits the collected SSTs to a migration. 
+func (c *pitrCollector) prepareMig(ctx context.Context) error { + if !c.enabled { + return nil + } + + est := stream.MigrationExtension(c.taskStorage) + + m := stream.NewMigration() + m.IngestedSstPaths = append(m.IngestedSstPaths, c.metaPath()) + + _, err := est.AppendMigration(ctx, m) + if err != nil { + return errors.Annotatef(err, "failed to add the extra backup at path %s", c.metaPath()) + } + + c.doWithMetaLock(func() { + c.resetCommitting() + }) + // Persist the metadata in case of SSTs were uploaded but the meta wasn't, + // which leads to a leakage. + return c.persistExtraBackupMeta(ctx) +} + +func (c *pitrCollector) prepareMigIfNeeded(ctx context.Context) (err error) { + c.putMigOnce.Do(func() { + err = c.prepareMig(ctx) + }) + return +} + +func (c *pitrCollector) commit(ctx context.Context) (uint64, error) { + c.extraBackupMeta.msg.Finished = true + ts, err := c.tso(ctx) + if err != nil { + return 0, err + } + c.extraBackupMeta.msg.AsIfTs = ts + return ts, c.persistExtraBackupMeta(ctx) +} + +func (c *pitrCollector) resetCommitting() { + c.extraBackupMeta = ingestedSSTsMeta{ + rewrites: map[int64]int64{}, + } + c.extraBackupMeta.msg.FilesPrefixHint = c.sstPath("") + c.extraBackupMeta.msg.Finished = false + c.extraBackupMeta.msg.BackupUuid = c.restoreUUID[:] +} + +// PiTRCollDep is the dependencies of a PiTR collector. +type PiTRCollDep struct { + PDCli pd.Client + EtcdCli *clientv3.Client + Storage *pb.StorageBackend +} + +// newPiTRColl creates a new PiTR collector. +func newPiTRColl(ctx context.Context, deps PiTRCollDep) (*pitrCollector, error) { + mcli := streamhelper.NewMetaDataClient(deps.EtcdCli) + ts, err := mcli.GetAllTasks(ctx) + if err != nil { + return nil, errors.Trace(err) + } + if len(ts) > 1 { + return nil, errors.Annotatef(berrors.ErrInvalidArgument, "more than one task found, pitr collector doesn't support that") + } + if len(ts) == 0 { + return &pitrCollector{}, nil + } + + coll := &pitrCollector{ + enabled: true, + } + + strg, err := storage.Create(ctx, ts[0].Info.Storage, false) + if err != nil { + return nil, errors.Trace(err) + } + coll.taskStorage = strg + + tso := func(ctx context.Context) (uint64, error) { + l, o, err := deps.PDCli.GetTS(ctx) + return oracle.ComposeTS(l, o), err + } + coll.tso = tso + + t, err := tso(ctx) + if err != nil { + return nil, errors.Trace(err) + } + coll.name = fmt.Sprintf("backup-%016X", t) + + restoreStrg, err := storage.Create(ctx, deps.Storage, false) + if err != nil { + return nil, errors.Trace(err) + } + coll.restoreStorage = restoreStrg + coll.restoreSuccess = summary.Succeed + coll.goPersister() + coll.resetCommitting() + return coll, nil +} diff --git a/br/pkg/restore/snap_client/pitr_collector_test.go b/br/pkg/restore/snap_client/pitr_collector_test.go new file mode 100644 index 0000000000000..50854e902f3e1 --- /dev/null +++ b/br/pkg/restore/snap_client/pitr_collector_test.go @@ -0,0 +1,261 @@ +package snapclient + +import ( + "context" + "fmt" + "sync/atomic" + "testing" + + "github.com/google/uuid" + backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/tidb/br/pkg/restore" + "github.com/pingcap/tidb/br/pkg/restore/utils" + "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/br/pkg/stream" + "github.com/stretchr/testify/require" +) + +func tmp(t *testing.T) *storage.LocalStorage { + tmpDir := t.TempDir() + s, err := storage.NewLocalStorage(tmpDir) + require.NoError(t, err) + s.IgnoreEnoentForDelete = true + return s +} + +type pitrCollectorT struct { + t *testing.T + coll 
*pitrCollector + tsoCnt *atomic.Uint64 + success *atomic.Bool + cx context.Context +} + +func (p pitrCollectorT) RestoreAFile(fs restore.BatchBackupFileSet) func() error { + for _, b := range fs { + for _, file := range b.SSTFiles { + require.NoError(p.t, p.coll.restoreStorage.WriteFile(p.cx, file.Name, []byte("something"))) + } + } + + res, err := p.coll.onBatch(p.cx, fs) + require.NoError(p.t, err) + return res +} + +func (p pitrCollectorT) Done() { + require.NoError(p.t, p.coll.close()) +} + +func (p pitrCollectorT) ExtFullBkups() []backuppb.IngestedSSTs { + est := stream.MigrationExtension(p.coll.taskStorage) + migs, err := est.Load(p.cx) + require.NoError(p.t, err) + res := []backuppb.IngestedSSTs{} + for _, m := range migs.ListAll() { + for _, pth := range m.IngestedSstPaths { + content, err := p.coll.taskStorage.ReadFile(p.cx, pth) + require.NoError(p.t, err) + var sst backuppb.IngestedSSTs + require.NoError(p.t, sst.Unmarshal(content)) + res = append(res, sst) + } + } + return res +} + +func (p *pitrCollectorT) MarkSuccess() { + p.success.Store(true) +} + +func (p *pitrCollectorT) Reopen() { + newColl := &pitrCollector{ + enabled: p.coll.enabled, + taskStorage: p.coll.taskStorage, + restoreStorage: p.coll.restoreStorage, + name: fmt.Sprintf("test-%s-%d", p.t.Name(), p.tsoCnt.Add(1)), + restoreUUID: p.coll.restoreUUID, + tso: p.coll.tso, + restoreSuccess: p.coll.restoreSuccess, + } + p.success.Store(false) + p.coll = newColl +} + +func (p pitrCollectorT) RequireCopied(extBk backuppb.IngestedSSTs, files ...string) { + extFiles := make([]string, 0) + for _, f := range extBk.Files { + extFiles = append(extFiles, f.Name) + } + + locatedFiles := make([]string, 0) + for _, f := range files { + locatedFiles = append(locatedFiles, p.coll.sstPath(f)) + } + + require.ElementsMatch(p.t, extFiles, locatedFiles) +} + +func (p pitrCollectorT) RequireRewrite(extBk backuppb.IngestedSSTs, rules ...utils.TableIDRemap) { + rulesInExtBk := []utils.TableIDRemap{} + for _, f := range extBk.RewrittenTables { + rulesInExtBk = append(rulesInExtBk, utils.TableIDRemap{ + Origin: f.AncestorUpstream, + Rewritten: f.Upstream, + }) + } + require.ElementsMatch(p.t, rulesInExtBk, rules) +} + +func newPiTRCollForTest(t *testing.T) pitrCollectorT { + taskStorage := tmp(t) + restoreStorage := tmp(t) + + coll := &pitrCollector{ + enabled: true, + taskStorage: taskStorage, + restoreStorage: restoreStorage, + name: "test-" + t.Name(), + restoreUUID: uuid.New(), + } + tsoCnt := new(atomic.Uint64) + restoreSuccess := new(atomic.Bool) + coll.tso = func(ctx context.Context) (uint64, error) { + return tsoCnt.Add(1), nil + } + coll.restoreSuccess = restoreSuccess.Load + + return pitrCollectorT{ + t: t, + coll: coll, + tsoCnt: tsoCnt, + success: restoreSuccess, + cx: context.Background(), + } +} + +type backupFileSetOp func(*restore.BackupFileSet) + +func backupFileSet(ops ...backupFileSetOp) restore.BackupFileSet { + set := restore.BackupFileSet{ + RewriteRules: new(utils.RewriteRules), + } + for _, op := range ops { + op(&set) + } + return set +} + +func nameFile(n string) *backuppb.File { + return &backuppb.File{ + Name: n, + } +} + +func withFile(f *backuppb.File) backupFileSetOp { + return func(set *restore.BackupFileSet) { + set.SSTFiles = append(set.SSTFiles, f) + } +} + +func remap(from, to int64) utils.TableIDRemap { + return utils.TableIDRemap{Origin: from, Rewritten: to} +} + +func withRewriteRule(hints ...utils.TableIDRemap) backupFileSetOp { + return func(set *restore.BackupFileSet) { + 
set.RewriteRules.TableIDRemapHint = append(set.RewriteRules.TableIDRemapHint, hints...) + } +} + +func TestCollAFile(t *testing.T) { + coll := newPiTRCollForTest(t) + batch := restore.BatchBackupFileSet{backupFileSet(withFile(nameFile("foo.txt")))} + + require.NoError(t, coll.RestoreAFile(batch)()) + coll.MarkSuccess() + coll.Done() + + exts := coll.ExtFullBkups() + require.Len(t, exts, 1) + e := exts[0] + coll.RequireCopied(e, "foo.txt") + require.True(t, e.Finished, "%v", e) + require.Equal(t, coll.coll.restoreUUID[:], e.BackupUuid) +} + +func TestCollManyFileAndRewriteRules(t *testing.T) { + coll := newPiTRCollForTest(t) + batch := restore.BatchBackupFileSet{ + backupFileSet(withFile(nameFile("foo.txt"))), + backupFileSet(withFile(nameFile("bar.txt")), withRewriteRule(remap(1, 10))), + backupFileSet(withFile(nameFile("baz.txt")), withRewriteRule(remap(2, 20))), + backupFileSet(withFile(nameFile("quux.txt")), withRewriteRule(remap(3, 21))), + } + + require.NoError(t, coll.RestoreAFile(batch)()) + coll.MarkSuccess() + coll.Done() + + exts := coll.ExtFullBkups() + require.Len(t, exts, 1) + e := exts[0] + coll.RequireCopied(e, "foo.txt", "bar.txt", "baz.txt", "quux.txt") + coll.RequireRewrite(e, remap(1, 10), remap(2, 20), remap(3, 21)) + require.True(t, e.Finished, "%v", e) + require.Equal(t, coll.coll.restoreUUID[:], e.BackupUuid) +} + +func TestReopen(t *testing.T) { + coll := newPiTRCollForTest(t) + batch1 := restore.BatchBackupFileSet{ + backupFileSet(withFile(nameFile("foo.txt"))), + backupFileSet(withFile(nameFile("bar.txt")), withRewriteRule(remap(1, 10)))} + batch2 := restore.BatchBackupFileSet{backupFileSet(withFile(nameFile("baz.txt")), withRewriteRule(remap(2, 20)))} + batch3 := restore.BatchBackupFileSet{backupFileSet(withFile(nameFile("quux.txt")), withRewriteRule(remap(3, 21)))} + + require.NoError(t, coll.RestoreAFile(batch1)()) + coll.Done() + exts := coll.ExtFullBkups() + require.Len(t, exts, 1) + e := exts[0] + coll.RequireCopied(e, "foo.txt", "bar.txt") + coll.RequireRewrite(e, remap(1, 10)) + require.False(t, e.Finished, "%v", e) + require.Equal(t, coll.coll.restoreUUID[:], e.BackupUuid) + + coll.Reopen() + require.NoError(t, coll.RestoreAFile(batch2)()) + exts = coll.ExtFullBkups() + require.Len(t, exts, 2) + e = exts[1] + coll.RequireCopied(e, "baz.txt") + coll.RequireRewrite(e, remap(2, 20)) + require.False(t, e.Finished, "%v", e) + require.Equal(t, coll.coll.restoreUUID[:], e.BackupUuid) + + coll.Reopen() + require.NoError(t, coll.RestoreAFile(batch3)()) + coll.MarkSuccess() + coll.Done() + exts = coll.ExtFullBkups() + require.Len(t, exts, 3) + e = exts[2] + coll.RequireCopied(e, "quux.txt") + coll.RequireRewrite(e, remap(3, 21)) + require.True(t, e.Finished, "%v", e) + require.Equal(t, coll.coll.restoreUUID[:], e.BackupUuid) +} + +func TestConflict(t *testing.T) { + coll := newPiTRCollForTest(t) + batch := restore.BatchBackupFileSet{ + backupFileSet(withFile(nameFile("foo.txt")), withRewriteRule(remap(1, 10))), + backupFileSet(withFile(nameFile("foo.txt")), withRewriteRule(remap(1, 11))), + } + + cb, err := coll.coll.onBatch(coll.cx, batch) + // NOTE: An error here is also acceptable. 
+ require.NoError(t, err) + require.Error(t, cb()) +} diff --git a/br/pkg/restore/utils/BUILD.bazel b/br/pkg/restore/utils/BUILD.bazel index ef0d8355b9ded..d463e9640338f 100644 --- a/br/pkg/restore/utils/BUILD.bazel +++ b/br/pkg/restore/utils/BUILD.bazel @@ -15,6 +15,7 @@ go_library( "//br/pkg/rtree", "//pkg/meta/model", "//pkg/tablecodec", + "//pkg/util", "//pkg/util/codec", "//pkg/util/redact", "@com_github_pingcap_errors//:errors", diff --git a/br/pkg/restore/utils/rewrite_rule.go b/br/pkg/restore/utils/rewrite_rule.go index a664d97a5f11d..67fe7cf312675 100644 --- a/br/pkg/restore/utils/rewrite_rule.go +++ b/br/pkg/restore/utils/rewrite_rule.go @@ -27,6 +27,7 @@ import ( "github.com/pingcap/tidb/br/pkg/rtree" "github.com/pingcap/tidb/pkg/meta/model" "github.com/pingcap/tidb/pkg/tablecodec" + "github.com/pingcap/tidb/pkg/util" "github.com/pingcap/tidb/pkg/util/codec" "github.com/pingcap/tidb/pkg/util/redact" "go.uber.org/zap" @@ -47,6 +48,44 @@ type RewriteRules struct { NewKeyspace []byte // used to record checkpoint data NewTableID int64 + // used to record backup files to pitr. + // note: should NewTableID merged with this? + TableIDRemapHint []TableIDRemap +} + +func (r *RewriteRules) RewriteSourceTableID(from, to int64) (rewritten bool) { + toPrefix := tablecodec.EncodeTablePrefix(to) + fromPrefix := tablecodec.EncodeTablePrefix(from) + for _, rule := range r.Data { + if bytes.HasPrefix(rule.OldKeyPrefix, fromPrefix) { + rule.OldKeyPrefix = append(toPrefix, rule.OldKeyPrefix[len(toPrefix):]...) + rewritten = true + } + } + return +} + +func (r *RewriteRules) Clone() *RewriteRules { + data := make([]*import_sstpb.RewriteRule, len(r.Data)) + for i, rule := range r.Data { + data[i] = util.ProtoV1Clone(rule) + } + remap := make([]TableIDRemap, len(r.TableIDRemapHint)) + copy(remap, r.TableIDRemapHint) + + return &RewriteRules{ + Data: data, + TableIDRemapHint: remap, + OldKeyspace: r.OldKeyspace, + NewKeyspace: r.NewKeyspace, + NewTableID: r.NewTableID, + } +} + +// TableIDRemap presents a remapping of table id during rewriting. +type TableIDRemap struct { + Origin int64 + Rewritten int64 } // Append append its argument to this rewrite rules. 
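The hunk above introduces `RewriteRules.TableIDRemapHint`, `RewriteSourceTableID`, and `Clone`. A minimal sketch of how they are meant to interact (the `main` wrapper and table IDs are made up for illustration; only the `utils` API shown in this patch is assumed):

```go
package main

import (
	"fmt"

	"github.com/pingcap/kvproto/pkg/import_sstpb"
	"github.com/pingcap/tidb/br/pkg/restore/utils"
	"github.com/pingcap/tidb/pkg/tablecodec"
)

func main() {
	// A rule set rewriting table 80 (upstream) to table 100 (downstream),
	// with the matching remap hint that the PiTR collector will persist.
	rules := &utils.RewriteRules{
		Data: []*import_sstpb.RewriteRule{{
			OldKeyPrefix: tablecodec.GenTableRecordPrefix(80),
			NewKeyPrefix: tablecodec.GenTableRecordPrefix(100),
		}},
		TableIDRemapHint: []utils.TableIDRemap{{Origin: 80, Rewritten: 100}},
	}

	// Clone deep-copies the data rules and copies the remap hints, so the
	// copy can be mutated without touching the original.
	cloned := rules.Clone()

	// Re-point the source side of the cloned rules from table 80 to table 81;
	// any data rule whose OldKeyPrefix starts with table 80's prefix is updated.
	rewritten := cloned.RewriteSourceTableID(80, 81)
	fmt.Println(rewritten, len(rules.Data), len(cloned.TableIDRemapHint))
}
```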
@@ -75,9 +114,11 @@ func GetRewriteRules( ) *RewriteRules { tableIDs := GetTableIDMap(newTable, oldTable) indexIDs := GetIndexIDMap(newTable, oldTable) + remaps := make([]TableIDRemap, 0) dataRules := make([]*import_sstpb.RewriteRule, 0) for oldTableID, newTableID := range tableIDs { + remaps = append(remaps, TableIDRemap{Origin: oldTableID, Rewritten: newTableID}) if getDetailRule { dataRules = append(dataRules, &import_sstpb.RewriteRule{ OldKeyPrefix: tablecodec.GenTableRecordPrefix(oldTableID), @@ -101,7 +142,8 @@ func GetRewriteRules( } return &RewriteRules{ - Data: dataRules, + Data: dataRules, + TableIDRemapHint: remaps, } } @@ -112,8 +154,10 @@ func GetRewriteRulesMap( tableIDs := GetTableIDMap(newTable, oldTable) indexIDs := GetIndexIDMap(newTable, oldTable) + remaps := make([]TableIDRemap, 0) for oldTableID, newTableID := range tableIDs { + remaps = append(remaps, TableIDRemap{Origin: oldTableID, Rewritten: newTableID}) dataRules := make([]*import_sstpb.RewriteRule, 0) if getDetailRule { dataRules = append(dataRules, &import_sstpb.RewriteRule{ @@ -137,7 +181,8 @@ func GetRewriteRulesMap( } rules[oldTableID] = &RewriteRules{ - Data: dataRules, + Data: dataRules, + TableIDRemapHint: remaps, } } @@ -152,7 +197,7 @@ func GetRewriteRuleOfTable( getDetailRule bool, ) *RewriteRules { dataRules := make([]*import_sstpb.RewriteRule, 0) - + remaps := []TableIDRemap{{Origin: oldTableID, Rewritten: newTableID}} if getDetailRule { dataRules = append(dataRules, &import_sstpb.RewriteRule{ OldKeyPrefix: tablecodec.GenTableRecordPrefix(oldTableID), @@ -174,7 +219,7 @@ func GetRewriteRuleOfTable( }) } - return &RewriteRules{Data: dataRules, NewTableID: newTableID} + return &RewriteRules{Data: dataRules, NewTableID: newTableID, TableIDRemapHint: remaps} } // ValidateFileRewriteRule uses rewrite rules to validate the ranges of a file. 
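For context on where the remap hints produced by `GetRewriteRules`/`GetRewriteRuleOfTable` end up: the PiTR collector later in this patch turns them into `RewrittenTableID` records inside the persisted `IngestedSSTs` message. A small sketch of that conversion (mirroring `ingestedSSTsMeta.genMsg`; it assumes the pinned kvproto exposes these messages as used elsewhere in this patch, and the IDs are illustrative):

```go
package main

import (
	"fmt"

	pb "github.com/pingcap/kvproto/pkg/brpb"
)

func main() {
	// Origin -> Rewritten table IDs, as collected from TableIDRemapHint.
	remaps := map[int64]int64{80: 100, 81: 101}

	msg := &pb.IngestedSSTs{}
	for origin, rewritten := range remaps {
		msg.RewrittenTables = append(msg.RewrittenTables, &pb.RewrittenTableID{
			AncestorUpstream: origin,
			Upstream:         rewritten,
		})
	}
	fmt.Println(len(msg.RewrittenTables)) // 2
}
```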
@@ -286,6 +331,10 @@ func FindMatchedRewriteRule(file AppliedFile, rules *RewriteRules) *import_sstpb } func (r *RewriteRules) String() string { + if r == nil { + return "[]" + } + var out strings.Builder out.WriteRune('[') if len(r.OldKeyspace) != 0 { @@ -340,12 +389,14 @@ func GetRewriteEncodedKeys(file AppliedFile, rewriteRules *RewriteRules) (startK if startID == endID { startKey, rule = rewriteEncodedKey(file.GetStartKey(), rewriteRules) if rewriteRules != nil && rule == nil { - err = errors.Annotatef(berrors.ErrRestoreInvalidRewrite, "cannot find encode rewrite rule for start key, startKey: %s", redact.Key(file.GetStartKey())) + err = errors.Annotatef(berrors.ErrRestoreInvalidRewrite, "cannot find encode rewrite rule for start key, startKey: %s; rewrite rules: %s", + redact.Key(file.GetStartKey()), rewriteRules) return } endKey, rule = rewriteEncodedKey(file.GetEndKey(), rewriteRules) if rewriteRules != nil && rule == nil { - err = errors.Annotatef(berrors.ErrRestoreInvalidRewrite, "cannot find encode rewrite rule for end key, endKey: %s", redact.Key(file.GetEndKey())) + err = errors.Annotatef(berrors.ErrRestoreInvalidRewrite, "cannot find encode rewrite rule for end key, endKey: %s; rewrite rules: %s", + redact.Key(file.GetEndKey()), rewriteRules) return } } else { diff --git a/br/pkg/storage/BUILD.bazel b/br/pkg/storage/BUILD.bazel index 60c587893af9f..ee28627832715 100644 --- a/br/pkg/storage/BUILD.bazel +++ b/br/pkg/storage/BUILD.bazel @@ -28,6 +28,7 @@ go_library( deps = [ "//br/pkg/errors", "//br/pkg/logutil", + "//br/pkg/utils", "//br/pkg/utils/iter", "//pkg/lightning/log", "//pkg/sessionctx/variable", diff --git a/br/pkg/storage/ks3.go b/br/pkg/storage/ks3.go index 919da5e3aa760..24aaa88e81b0d 100644 --- a/br/pkg/storage/ks3.go +++ b/br/pkg/storage/ks3.go @@ -38,6 +38,10 @@ import ( "go.uber.org/zap" ) +var ( + _ Copier = &KS3Storage{} +) + const ( // ks3 sdk does not expose context, we use hardcoded timeout for network request ks3SDKProvider = "ks3-sdk" @@ -734,3 +738,45 @@ func (rs *KS3Storage) Rename(ctx context.Context, oldFileName, newFileName strin // Close implements ExternalStorage interface. func (*KS3Storage) Close() {} + +func maybeObjectAlreadyExists(err awserr.Error) bool { + // Some versions of server did return the error code "ObjectAlreayExists"... + return err.Code() == "ObjectAlreayExists" || err.Code() == "ObjectAlreadyExists" +} + +// CopyFrom implements Copier. +func (rs *KS3Storage) CopyFrom(ctx context.Context, e ExternalStorage, spec CopySpec) error { + s, ok := e.(*KS3Storage) + if !ok { + return errors.Annotatef(berrors.ErrStorageInvalidConfig, "S3Storage.CopyFrom supports S3 storage only, get %T", e) + } + + copyInput := &s3.CopyObjectInput{ + Bucket: aws.String(rs.options.Bucket), + // NOTE: Perhaps we need to allow copy cross regions / accounts. + CopySource: aws.String(path.Join(s.options.Bucket, s.options.Prefix, spec.From)), + Key: aws.String(rs.options.Prefix + spec.To), + } + + // NOTE: Maybe check whether the Go SDK will handle 200 OK errors. + // https://repost.aws/knowledge-center/s3-resolve-200-internalerror + _, err := s.svc.CopyObjectWithContext(ctx, copyInput) + if err != nil { + aErr, ok := err.(awserr.Error) + if !ok { + return err + } + // KS3 reports an error when copying an object to an existing path. + // AWS S3 will directly override the target. Simulating its behavior. + // Glitch: this isn't an atomic operation. So it is possible left nothing to `spec.To`... 
+ if maybeObjectAlreadyExists(aErr) { + log.Warn("The object of `spec.To` already exists, will delete it and retry", zap.String("object", spec.To), logutil.ShortError(err)) + if err := rs.DeleteFile(ctx, spec.To); err != nil { + return errors.Annotate(err, "during deleting an exist object for making place for copy") + } + + return rs.CopyFrom(ctx, e, spec) + } + } + return nil +} diff --git a/br/pkg/storage/local.go b/br/pkg/storage/local.go index 54f011afb3dd9..2626add835fb0 100644 --- a/br/pkg/storage/local.go +++ b/br/pkg/storage/local.go @@ -14,6 +14,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/failpoint" "github.com/pingcap/log" + berrors "github.com/pingcap/tidb/br/pkg/errors" "github.com/pingcap/tidb/br/pkg/logutil" "go.uber.org/zap" ) @@ -274,6 +275,19 @@ func (l *LocalStorage) Rename(_ context.Context, oldFileName, newFileName string // Close implements ExternalStorage interface. func (*LocalStorage) Close() {} +func (l *LocalStorage) CopyFrom(ctx context.Context, e ExternalStorage, spec CopySpec) error { + sl, ok := e.(*LocalStorage) + if !ok { + return errors.Annotatef(berrors.ErrInvalidArgument, "expect source to be LocalStorage, got %T", e) + } + from := filepath.Join(sl.base, spec.From) + to := filepath.Join(l.base, spec.To) + if err := mkdirAll(filepath.Dir(to)); err != nil { + return errors.Trace(err) + } + return os.Link(from, to) +} + func pathExists(_path string) (bool, error) { _, err := os.Stat(_path) if err != nil { diff --git a/br/pkg/storage/locking.go b/br/pkg/storage/locking.go index 9a5ce3a26cd39..84ab25817ca9a 100644 --- a/br/pkg/storage/locking.go +++ b/br/pkg/storage/locking.go @@ -8,6 +8,7 @@ import ( "encoding/hex" "encoding/json" "fmt" + "math" "math/rand" "os" "path" @@ -18,6 +19,7 @@ import ( "github.com/pingcap/failpoint" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/logutil" + "github.com/pingcap/tidb/br/pkg/utils" "go.uber.org/multierr" "go.uber.org/zap" ) @@ -262,6 +264,35 @@ func newReadLockName(path string) string { return fmt.Sprintf("%s.READ.%016x", path, readID) } +type Locker = func(ctx context.Context, storage ExternalStorage, path, hint string) (lock RemoteLock, err error) + +func LockWith(ctx context.Context, locker Locker, storage ExternalStorage, path, hint string) (lock RemoteLock, err error) { + const JitterMs = 5000 + + retry := utils.InitialRetryState(math.MaxInt, 1*time.Second, 60*time.Second) + jitter := time.Duration(rand.Uint32()%JitterMs+(JitterMs/2)) * time.Millisecond + for { + lock, err = locker(ctx, storage, path, hint) + if err == nil { + return lock, nil + } + retryAfter := retry.ExponentialBackoff() + jitter + log.Info( + "Encountered lock, will retry then.", + logutil.ShortError(err), + zap.String("path", path), + zap.Duration("retry-after", retryAfter), + ) + + select { + case <-ctx.Done(): + err = ctx.Err() + return + case <-time.After(retryAfter): + } + } +} + func TryLockRemoteWrite(ctx context.Context, storage ExternalStorage, path, hint string) (lock RemoteLock, err error) { target := writeLockName(path) writer := conditionalPut{ diff --git a/br/pkg/storage/s3.go b/br/pkg/storage/s3.go index 00a90f55c3400..5c5c03c911063 100644 --- a/br/pkg/storage/s3.go +++ b/br/pkg/storage/s3.go @@ -97,6 +97,25 @@ func (rs *S3Storage) GetOptions() *backuppb.S3 { return rs.options } +func (rs *S3Storage) CopyFrom(ctx context.Context, e ExternalStorage, spec CopySpec) error { + s, ok := e.(*S3Storage) + if !ok { + return errors.Annotatef(berrors.ErrStorageInvalidConfig, "S3Storage.CopyFrom supports S3 
storage only, get %T", e) + } + + copyInput := &s3.CopyObjectInput{ + Bucket: aws.String(rs.options.Bucket), + // NOTE: Perhaps we need to allow copy cross regions / accounts. + CopySource: aws.String(path.Join(s.options.Bucket, s.options.Prefix, spec.From)), + Key: aws.String(rs.options.Prefix + spec.To), + } + + // NOTE: Maybe check whether the Go SDK will handle 200 OK errors. + // https://repost.aws/knowledge-center/s3-resolve-200-internalerror + _, err := s.svc.CopyObjectWithContext(ctx, copyInput) + return err +} + // S3Uploader does multi-part upload to s3. type S3Uploader struct { svc s3iface.S3API diff --git a/br/pkg/storage/storage.go b/br/pkg/storage/storage.go index 0ad8cdc20aeac..2d54b49a57464 100644 --- a/br/pkg/storage/storage.go +++ b/br/pkg/storage/storage.go @@ -116,6 +116,16 @@ type ReaderOption struct { PrefetchSize int } +type Copier interface { + // CopyFrom copies a object to the current external storage by the specification. + CopyFrom(ctx context.Context, e ExternalStorage, spec CopySpec) error +} + +type CopySpec struct { + From string + To string +} + // ExternalStorage represents a kind of file system storage. type ExternalStorage interface { // WriteFile writes a complete file to storage, similar to os.WriteFile, but WriteFile should be atomic diff --git a/br/pkg/stream/BUILD.bazel b/br/pkg/stream/BUILD.bazel index 8d5eb63becacf..d5a1361e83cb7 100644 --- a/br/pkg/stream/BUILD.bazel +++ b/br/pkg/stream/BUILD.bazel @@ -38,6 +38,7 @@ go_library( "//pkg/util/versioninfo", "@com_github_docker_go_units//:go-units", "@com_github_fatih_color//:color", + "@com_github_google_uuid//:uuid", "@com_github_klauspost_compress//zstd", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_kvproto//pkg/brpb", @@ -67,7 +68,7 @@ go_test( ], embed = [":stream"], flaky = True, - shard_count = 47, + shard_count = 48, deps = [ "//br/pkg/storage", "//br/pkg/streamhelper", @@ -82,6 +83,7 @@ go_test( "//pkg/util/intest", "//pkg/util/table-filter", "@com_github_fsouza_fake_gcs_server//fakestorage", + "@com_github_google_uuid//:uuid", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/brpb", diff --git a/br/pkg/stream/stream_metas.go b/br/pkg/stream/stream_metas.go index 6801035ce7214..4797e10e951fb 100644 --- a/br/pkg/stream/stream_metas.go +++ b/br/pkg/stream/stream_metas.go @@ -5,8 +5,10 @@ package stream import ( "context" "encoding/binary" + "encoding/hex" "fmt" "hash/crc64" + "maps" "math" "path" "slices" @@ -14,9 +16,11 @@ import ( "strconv" "strings" "sync" + "time" "github.com/docker/go-units" "github.com/fatih/color" + "github.com/google/uuid" "github.com/pingcap/errors" pb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" @@ -28,6 +32,7 @@ import ( "github.com/pingcap/tidb/pkg/util" "github.com/pingcap/tidb/pkg/util/mathutil" "github.com/pingcap/tidb/pkg/util/versioninfo" + "github.com/tikv/client-go/v2/oracle" "go.uber.org/multierr" "go.uber.org/zap" "golang.org/x/sync/errgroup" @@ -39,13 +44,14 @@ const ( baseTmp = "BASE_TMP" metaSuffix = ".meta" migrationPrefix = "v1/migrations" + lockPrefix = "v1/LOCK" - SupportedMigVersion = pb.MigrationVersion_M1 + SupportedMigVersion = pb.MigrationVersion_M2 ) func NewMigration() *pb.Migration { return &pb.Migration{ - Version: pb.MigrationVersion_M1, + Version: pb.MigrationVersion_M2, Creator: fmt.Sprintf("br;commit=%s;branch=%s", versioninfo.TiDBGitHash, versioninfo.TiDBGitBranch), } } @@ -204,7 +210,7 @@ func (ms *StreamMetadataSet) 
RemoveDataFilesAndUpdateMetadataInBatch( updateFn func(num int64), ) ([]string, error) { hst := ms.hook(st) - est := MigerationExtension(hst) + est := MigrationExtension(hst) est.Hooks = updateFnHook{updateFn: updateFn} res := MigratedTo{NewBase: NewMigration()} est.doTruncateLogs(ctx, ms, from, &res) @@ -342,45 +348,113 @@ func ReplaceMetadata(meta *pb.Metadata, filegroups []*pb.DataFileGroup) { updateMetadataInternalStat(meta) } -func AddMigrationToTable(m *pb.Migration, table *glue.Table) { - rd := color.New(color.FgHiRed).Sprint - for i, c := range m.Compactions { - addCompactionToTable(c, table, i) +type marshalMigrationContext struct { + context.Context + est MigrationExt + + output *glue.Table + keyspace []string +} + +func (m *marshalMigrationContext) emit(key, value string) { + bold := color.New(color.Bold).Sprintf + ks := new(strings.Builder) + for _, k := range m.keyspace { + ks.WriteString(k) + ks.WriteString("/") } + ks.WriteString(key) - if len(m.EditMeta) > 0 { - totalDeletePhyFile := 0 - totalDeleteLgcFile := 0 - for _, edit := range m.EditMeta { - totalDeletePhyFile += len(edit.DeletePhysicalFiles) - for _, dl := range edit.DeleteLogicalFiles { - totalDeleteLgcFile += len(dl.Spans) - } + finalValue := bold(value) + m.output.Add(ks.String(), finalValue) +} + +func (m *marshalMigrationContext) keyspaced(key []string, f func()) { + m.keyspace = append(m.keyspace, key...) + defer func() { + m.keyspace = m.keyspace[:len(m.keyspace)-len(key)] + }() + + f() +} + +func (m *marshalMigrationContext) addCompaction(c *pb.LogFileCompaction) { + m.emit("name", c.Name) + m.emit("time", fmt.Sprintf("%d ~ %d", c.CompactionFromTs, c.CompactionUntilTs)) + m.emit("file", fmt.Sprintf("[%q, %q]", c.Artifacts, c.GeneratedFiles)) +} + +func (m *marshalMigrationContext) addMetaEdits(em []*pb.MetaEdit) { + if len(em) == 0 { + return + } + + totalDeletePhyFile := 0 + totalDeleteLgcFile := 0 + for _, edit := range em { + totalDeletePhyFile += len(edit.DeletePhysicalFiles) + for _, dl := range edit.DeleteLogicalFiles { + totalDeleteLgcFile += len(dl.Spans) } - table.Add( - "edit-meta-files", - fmt.Sprintf("%s meta files will be edited.", rd(len(m.EditMeta))), - ) - table.Add( - "delete-physical-file", - fmt.Sprintf("%s physical files will be deleted.", rd(totalDeletePhyFile)), - ) - table.Add( - "delete-logical-file", - fmt.Sprintf("%s logical segments may be deleted, if possible.", rd(totalDeleteLgcFile)), - ) } - for i, c := range m.DestructPrefix { - table.Add(fmt.Sprintf("destruct-prefix[%02d]", i), rd(c)) + m.emit("edit_meta_files", strconv.Itoa(len(em))) + m.emit("delete_physical_file", strconv.Itoa(totalDeletePhyFile)) + m.emit("delete_logical_file", strconv.Itoa(totalDeleteLgcFile)) +} + +func (m *marshalMigrationContext) addTruncatedTo(tso uint64) { + if tso == 0 { + return + } + m.emit("truncated_to", strconv.FormatUint(tso, 10)) + t := oracle.GetTimeFromTS(tso) + m.emit("truncated_to_in_rfc3339", t.Format(time.RFC3339)) +} + +func (m *marshalMigrationContext) addMigration(mig *pb.Migration) { + m.addTruncatedTo(mig.TruncatedTo) + for i, c := range mig.Compactions { + m.keyspaced([]string{"compactions", strconv.Itoa(i)}, func() { + m.addCompaction(c) + }) + } + m.keyspaced([]string{"meta_edit"}, func() { + m.addMetaEdits(mig.EditMeta) + }) + for i, d := range mig.DestructPrefix { + m.keyspaced([]string{"destruct_prefix", strconv.Itoa(i)}, func() { + m.emit("value", d) + }) + } + for i, p := range mig.IngestedSstPaths { + m.keyspaced([]string{"extra_full_backup", strconv.Itoa(i)}, func() { + 
m.addIngestedSSTss(p) + }) } - table.Add("truncate-to", rd(m.TruncatedTo)) } -func addCompactionToTable(m *pb.LogFileCompaction, table *glue.Table, idx int) { - withIdx := func(s string) string { return fmt.Sprintf("compactions[%d].%s", idx, s) } - table.Add(withIdx("name"), m.Name) - table.Add(withIdx("time"), fmt.Sprintf("%d ~ %d", m.CompactionFromTs, m.CompactionUntilTs)) - table.Add(withIdx("file"), fmt.Sprintf("[%q, %q]", m.Artifacts, m.GeneratedFiles)) +func (m *marshalMigrationContext) addIngestedSSTss(path string) { + fullbk, err := readIngestedSSTs(m.Context, path, m.est.s) + if err != nil { + m.emit("err_during_reading", err.Error()) + m.emit("meta_path", path) + return + } + + m.emit("as_if_ts", strconv.FormatUint(fullbk.AsIfTs, 10)) + m.emit("backup_uuid", hex.EncodeToString(fullbk.GetBackupUuid())) + m.emit("files_count", strconv.Itoa(len(fullbk.Files))) + m.emit("files_position", fullbk.FilesPrefixHint) +} + +func (m MigrationExt) AddMigrationToTable(ctx context.Context, mig *pb.Migration, table *glue.Table) { + cx := marshalMigrationContext{ + Context: ctx, + est: m, + output: table, + } + + cx.addMigration(mig) } // MigrationExt is an extension to the `ExternalStorage` type. @@ -516,7 +590,7 @@ func (NoHooks) HandledAMetaEdit(*pb.MetaEdit) func (NoHooks) HandingMetaEditDone() {} // MigrateionExtnsion installs the extension methods to an `ExternalStorage`. -func MigerationExtension(s storage.ExternalStorage) MigrationExt { +func MigrationExtension(s storage.ExternalStorage) MigrationExt { return MigrationExt{ s: s, prefix: migrationPrefix, @@ -534,6 +608,7 @@ func MergeMigrations(m1 *pb.Migration, m2 *pb.Migration) *pb.Migration { out.TruncatedTo = max(m1.GetTruncatedTo(), m2.GetTruncatedTo()) out.DestructPrefix = append(out.DestructPrefix, m1.GetDestructPrefix()...) out.DestructPrefix = append(out.DestructPrefix, m2.GetDestructPrefix()...) + out.IngestedSstPaths = append(out.IngestedSstPaths, m1.GetIngestedSstPaths()...) return out } @@ -551,12 +626,16 @@ type MergeAndMigratedTo struct { // The term "migrate to" means, try to performance all possible operations // from a migration to the storage. type MigratedTo struct { - // Errors happen during executing the migration. + // Non-fatal errors happen during executing the migration. Warnings []error // The new BASE migration after the operation. NewBase *pb.Migration } +func (m *MigratedTo) Warn(err error) { + m.Warnings = append(m.Warnings, err) +} + // Migrations represents living migrations from the storage. type Migrations struct { // The BASE migration. @@ -566,6 +645,11 @@ type Migrations struct { Layers []*OrderedMigration `json:"layers"` } +// GetReadLock locks the storage and make sure there won't be other one modify this backup. +func (m *MigrationExt) GetReadLock(ctx context.Context, hint string) (storage.RemoteLock, error) { + return storage.LockWith(ctx, storage.TryLockRemoteRead, m.s, lockPrefix, hint) +} + // OrderedMigration is a migration with its path and sequence number. type OrderedMigration struct { SeqNum int `json:"seq_num"` @@ -591,8 +675,25 @@ func (o *OrderedMigration) unmarshalContent(b []byte) error { return nil } +type LoadOptions func(*loadConfig) + +type loadConfig struct { + notFoundIsErr bool +} + +func MLNotFoundIsErr() LoadOptions { + return func(c *loadConfig) { + c.notFoundIsErr = true + } +} + // Load loads the current living migrations from the storage. 
-func (m MigrationExt) Load(ctx context.Context) (Migrations, error) { +func (m MigrationExt) Load(ctx context.Context, opts ...LoadOptions) (Migrations, error) { + cfg := loadConfig{} + for _, o := range opts { + o(&cfg) + } + opt := &storage.WalkOption{ SubDir: m.prefix, } @@ -624,6 +725,9 @@ func (m MigrationExt) Load(ctx context.Context) (Migrations, error) { if collected.Err != nil { return Migrations{}, collected.Err } + if len(collected.Item) == 0 && cfg.notFoundIsErr { + return Migrations{}, errors.Annotatef(berrors.ErrMigrationNotFound, "in the storage %s", m.s.URI()) + } sort.Slice(collected.Item, func(i, j int) bool { return collected.Item[i].SeqNum < collected.Item[j].SeqNum }) @@ -656,11 +760,20 @@ func (m MigrationExt) DryRun(f func(MigrationExt)) []storage.Effect { } func (m MigrationExt) AppendMigration(ctx context.Context, mig *pb.Migration) (int, error) { + lock, err := storage.LockWith(ctx, storage.TryLockRemoteWrite, m.s, lockPrefix, "AppendMigration") + if err != nil { + return 0, err + } + defer lock.Unlock(ctx) + migs, err := m.Load(ctx) if err != nil { return 0, err } - newSN := migs.Layers[len(migs.Layers)-1].SeqNum + 1 + newSN := 1 + if len(migs.Layers) > 0 { + newSN = migs.Layers[len(migs.Layers)-1].SeqNum + 1 + } name := path.Join(migrationPrefix, nameOf(mig, newSN)) data, err := mig.Marshal() if err != nil { @@ -738,6 +851,16 @@ func (m MigrationExt) MergeAndMigrateTo( targetSpec int, opts ...MergeAndMigrateToOpt, ) (result MergeAndMigratedTo) { + lock, err := storage.LockWith(ctx, storage.TryLockRemoteWrite, m.s, lockPrefix, "AppendMigration") + if err != nil { + result.MigratedTo = MigratedTo{ + Warnings: []error{ + errors.Annotate(err, "failed to get the lock, nothing will happen"), + }} + return + } + defer lock.Unlock(ctx) + config := mergeAndMigrateToConfig{} for _, o := range opts { o(&config) @@ -814,7 +937,7 @@ func (m MigrationExt) MergeAndMigrateTo( } } } - result.MigratedTo = m.MigrateTo(ctx, newBase, MTMaybeSkipTruncateLog(!config.alwaysRunTruncate && canSkipTruncate)) + result.MigratedTo = m.migrateTo(ctx, newBase, MTMaybeSkipTruncateLog(!config.alwaysRunTruncate && canSkipTruncate)) // Put the final BASE. err = m.writeBase(ctx, result.NewBase) @@ -824,7 +947,7 @@ func (m MigrationExt) MergeAndMigrateTo( return } -type MigrateToOpt func(*migToOpt) +type migrateToOpt func(*migToOpt) type migToOpt struct { skipTruncateLog bool @@ -834,17 +957,17 @@ func MTSkipTruncateLog(o *migToOpt) { o.skipTruncateLog = true } -func MTMaybeSkipTruncateLog(cond bool) MigrateToOpt { +func MTMaybeSkipTruncateLog(cond bool) migrateToOpt { if cond { return MTSkipTruncateLog } return func(*migToOpt) {} } -// MigrateTo migrates to a migration. +// migrateTo migrates to a migration. // If encountered some error during executing some operation, the operation will be put // to the new BASE, which can be retryed then. -func (m MigrationExt) MigrateTo(ctx context.Context, mig *pb.Migration, opts ...MigrateToOpt) MigratedTo { +func (m MigrationExt) migrateTo(ctx context.Context, mig *pb.Migration, opts ...migrateToOpt) MigratedTo { opt := migToOpt{} for _, o := range opts { o(&opt) @@ -853,18 +976,16 @@ func (m MigrationExt) MigrateTo(ctx context.Context, mig *pb.Migration, opts ... result := MigratedTo{ NewBase: NewMigration(), } - // Fills: TruncatedTo, Compactions, DesctructPrefix. 
+ m.processCompactions(ctx, mig, &result) + m.processDestroyPrefixes(ctx, mig, &result) + m.processExtFullBackup(ctx, mig, &result) if !opt.skipTruncateLog { - m.doTruncating(ctx, mig, &result) - } else { - // Fast path: `truncate_to` wasn't updated, just copy the compactions and truncated to. - result.NewBase.Compactions = mig.Compactions - result.NewBase.TruncatedTo = mig.TruncatedTo + m.processTruncatedTo(ctx, mig, &result) } // We do skip truncate log first, so metas removed by truncating can be removed in this execution. // Fills: EditMeta for new Base. - m.doMetaEdits(ctx, mig, &result) + m.processMetaEdits(ctx, mig, &result) return result } @@ -881,9 +1002,8 @@ func (m MigrationExt) writeBase(ctx context.Context, mig *pb.Migration) error { return m.s.Rename(ctx, path.Join(m.prefix, baseTmp), path.Join(m.prefix, baseMigrationName)) } -// doMetaEdits applies the modification to the meta files in the storage. -// This will delete data files firstly. Make sure the new BASE was persisted before calling this. -func (m MigrationExt) doMetaEdits(ctx context.Context, mig *pb.Migration, out *MigratedTo) { +// processMetaEdits applies the modification to the meta files in the storage. +func (m MigrationExt) processMetaEdits(ctx context.Context, mig *pb.Migration, out *MigratedTo) { m.Hooks.StartHandlingMetaEdits(mig.EditMeta) handleAMetaEdit := func(medit *pb.MetaEdit) { @@ -1068,10 +1188,29 @@ func (m MigrationExt) tryRemovePrefix(ctx context.Context, pfx string, out *Migr } } -// doTruncating tries to remove outdated compaction, filling the not-yet removed compactions to the new migration. -func (m MigrationExt) doTruncating(ctx context.Context, mig *pb.Migration, result *MigratedTo) { - // NOTE: Execution of truncation wasn't implemented here. - // If we are going to truncate some files, for now we still need to use `br log truncate`. +// processTruncatedTo tries to remove outdated compaction, filling the not-yet removed compactions to the new migration. +func (m MigrationExt) processTruncatedTo(ctx context.Context, mig *pb.Migration, result *MigratedTo) { + result.NewBase.TruncatedTo = mig.TruncatedTo + m.Hooks.StartLoadingMetaForTruncating() + mdSet := new(StreamMetadataSet) + mdSet.MetadataDownloadBatchSize = 128 + shiftTS, err := mdSet.LoadUntilAndCalculateShiftTS(ctx, m.s, mig.TruncatedTo) + if err != nil { + result.Warnings = append(result.Warnings, errors.Annotatef(err, "failed to open meta storage")) + return + } + m.Hooks.EndLoadingMetaForTruncating() + + m.doTruncateLogs(ctx, mdSet, shiftTS, result) +} + +func (m MigrationExt) processDestroyPrefixes(ctx context.Context, mig *pb.Migration, result *MigratedTo) { + for _, pfx := range mig.DestructPrefix { + m.tryRemovePrefix(ctx, pfx, result) + } +} + +func (m MigrationExt) processCompactions(ctx context.Context, mig *pb.Migration, result *MigratedTo) { for _, compaction := range mig.Compactions { // Can we also remove the compaction when `until-ts` is equal to `truncated-to`...? 
if compaction.CompactionUntilTs > mig.TruncatedTo { @@ -1081,23 +1220,134 @@ func (m MigrationExt) doTruncating(ctx context.Context, mig *pb.Migration, resul m.tryRemovePrefix(ctx, compaction.GeneratedFiles, result) } } - for _, pfx := range mig.DestructPrefix { - m.tryRemovePrefix(ctx, pfx, result) +} + +func (m MigrationExt) processExtFullBackup(ctx context.Context, mig *pb.Migration, result *MigratedTo) { + groups := LoadIngestedSSTss(ctx, m.s, mig.IngestedSstPaths) + processGroup := func(outErr error, e IngestedSSTss) (copyToNewMig bool, err error) { + if outErr != nil { + return true, outErr + } + + if !e.GroupFinished() { + return true, nil + } + + if e.GroupTS() >= mig.TruncatedTo { + return true, nil + } + + for _, b := range e { + m.tryRemovePrefix(ctx, b.FilesPrefixHint, result) + } + return false, nil } + for err, item := range iter.AsSeq(ctx, groups) { + copyToNewMig, err := processGroup(err, item) + if err != nil { + result.Warn(err) + } + if copyToNewMig { + for _, exb := range item { + result.NewBase.IngestedSstPaths = append(result.NewBase.IngestedSstPaths, exb.path) + } + } + } +} - result.NewBase.TruncatedTo = mig.TruncatedTo +type PathedIngestedSSTs struct { + *pb.IngestedSSTs + path string +} - m.Hooks.StartLoadingMetaForTruncating() - mdSet := new(StreamMetadataSet) - mdSet.MetadataDownloadBatchSize = 128 - shiftTS, err := mdSet.LoadUntilAndCalculateShiftTS(ctx, m.s, mig.TruncatedTo) +type IngestedSSTss []PathedIngestedSSTs + +func (ebs IngestedSSTss) GroupFinished() bool { + for _, b := range ebs { + if b.Finished { + return true + } + } + return false +} + +func (ebs IngestedSSTss) GroupTS() uint64 { + for _, b := range ebs { + if b.Finished { + return b.AsIfTs + } + } + return math.MaxUint64 +} + +func LoadIngestedSSTss(ctx context.Context, s storage.ExternalStorage, paths []string) iter.TryNextor[IngestedSSTss] { + + fullBackupDirIter := iter.FromSlice(paths) + backups := iter.TryMap(fullBackupDirIter, func(name string) (PathedIngestedSSTs, error) { + // name is the absolute path in external storage. 
+ bkup, err := readIngestedSSTs(ctx, name, s) + if err != nil { + return PathedIngestedSSTs{}, errors.Annotatef(err, "failed to read backup at %s", name) + } + return PathedIngestedSSTs{IngestedSSTs: bkup, path: name}, nil + }) + extBackups, err := groupExtraBackups(ctx, backups) if err != nil { - result.Warnings = append(result.Warnings, errors.Annotatef(err, "failed to open meta storage")) - return + return iter.Fail[IngestedSSTss](err) } - m.Hooks.EndLoadingMetaForTruncating() + return iter.FromSlice(extBackups) +} - m.doTruncateLogs(ctx, mdSet, shiftTS, result) +func groupExtraBackups(ctx context.Context, i iter.TryNextor[PathedIngestedSSTs]) ([]IngestedSSTss, error) { + var ( + collected = map[uuid.UUID]IngestedSSTss{} + finished = map[uuid.UUID]struct{}{} + ) + + for { + res := i.TryNext(ctx) + if res.FinishedOrError() { + res := make([]IngestedSSTss, 0, len(collected)) + for v := range maps.Values(collected) { + res = append(res, v) + } + return res, nil + } + + fbk := res.Item + if len(fbk.BackupUuid) != len(uuid.UUID{}) { + return nil, errors.Annotatef(berrors.ErrInvalidArgument, "the full backup UUID has bad length(%d)", len(fbk.BackupUuid)) + } + uid := uuid.UUID(fbk.BackupUuid) + log.Info("Collecting extra full backup", zap.Stringer("UUID", uid), zap.String("path", fbk.FilesPrefixHint), zap.Bool("finished", fbk.Finished)) + + if _, ok := finished[uid]; ok { + log.Warn("Encountered a finished full backup.", zap.Stringer("UUID", uid), zap.String("path", fbk.FilesPrefixHint)) + return nil, errors.Annotatef( + berrors.ErrInvalidArgument, + "the extra full backup group %s at %s encounters an extra full backup meta after a finished one", + uid, fbk.FilesPrefixHint, + ) + } + + collected[uid] = append(collected[uid], fbk) + if fbk.Finished { + finished[uid] = struct{}{} + } + } +} + +func readIngestedSSTs(ctx context.Context, name string, s storage.ExternalStorage) (*pb.IngestedSSTs, error) { + reader, err := s.ReadFile(ctx, name) + if err != nil { + return nil, err + } + + var backup pb.IngestedSSTs + if err := backup.Unmarshal(reader); err != nil { + return nil, err + } + return &backup, nil } func (m MigrationExt) loadFilesOfPrefix(ctx context.Context, prefix string) (out []string, err error) { @@ -1342,15 +1592,33 @@ func isEmptyMetadata(md *pb.Metadata) bool { return len(md.FileGroups) == 0 && len(md.Files) == 0 } +/* Below are hash algorithms for hashing a component of the migration. + * Sadly there isn't a document describes the behavior of the algorithms. + * Perhaps we can standardlize them in the future. + * Maybe by defining a ordering-insensitive object hash algorithm for protocol buffer. + * + * Note: For now, the canon of the hash algorithm for a message should follow the following rules: + * - If a hash algorithm for a message exists both in TiKV and BR and conflicting, we + * follow the implementation at where the message firstly creates (say, for compactions, + * TiKV will be the canonical implementation. while for extra full backups, BR is canonical.). + * - For commonly used fields, follow the implementation in BR. + * + * Another note: nowadays, the hash of a migration is mainly used for detecting duplicated works, + * so the difference between hash algorithms won't result in something too bad... 
+ */ + func hashMigration(m *pb.Migration) uint64 { - var crc64 uint64 = 0 + var crc64Res uint64 = 0 for _, compaction := range m.Compactions { - crc64 ^= compaction.ArtifactsHash + crc64Res ^= compaction.ArtifactsHash } for _, metaEdit := range m.EditMeta { - crc64 ^= hashMetaEdit(metaEdit) + crc64Res ^= hashMetaEdit(metaEdit) + } + for _, extBkup := range m.IngestedSstPaths { + crc64Res ^= crc64.Checksum([]byte(extBkup), crc64.MakeTable(crc64.ISO)) } - return crc64 ^ m.TruncatedTo + return crc64Res ^ m.TruncatedTo } func hashMetaEdit(metaEdit *pb.MetaEdit) uint64 { @@ -1381,3 +1649,11 @@ func hashMetaEdit(metaEdit *pb.MetaEdit) uint64 { func nameOf(mig *pb.Migration, sn int) string { return fmt.Sprintf("%08d_%016X.mgrt", sn, hashMigration(mig)) } + +func isEmptyMigration(mig *pb.Migration) bool { + return len(mig.Compactions) == 0 && + len(mig.EditMeta) == 0 && + len(mig.IngestedSstPaths) == 0 && + len(mig.DestructPrefix) == 0 && + mig.TruncatedTo == 0 +} diff --git a/br/pkg/stream/stream_metas_test.go b/br/pkg/stream/stream_metas_test.go index c0fcbbae623ce..e36eedfa82b9b 100644 --- a/br/pkg/stream/stream_metas_test.go +++ b/br/pkg/stream/stream_metas_test.go @@ -17,6 +17,7 @@ import ( "testing" "github.com/fsouza/fake-gcs-server/fakestorage" + "github.com/google/uuid" "github.com/pingcap/errors" "github.com/pingcap/failpoint" backuppb "github.com/pingcap/kvproto/pkg/brpb" @@ -435,6 +436,53 @@ func TestReplaceMetadataTs(t *testing.T) { require.Equal(t, m.MaxTs, uint64(4)) } +func pef(t *testing.T, fb *backuppb.IngestedSSTs, sn int, s storage.ExternalStorage) string { + path := fmt.Sprintf("extbackupmeta_%08d", sn) + bs, err := fb.Marshal() + if err != nil { + require.NoError(t, err) + } + + err = s.WriteFile(context.Background(), path, bs) + require.NoError(t, err) + return path +} + +type efOP func(*backuppb.IngestedSSTs) + +func extFullBkup(ops ...efOP) *backuppb.IngestedSSTs { + ef := &backuppb.IngestedSSTs{} + for _, op := range ops { + op(ef) + } + return ef +} + +func finished() efOP { + return func(ef *backuppb.IngestedSSTs) { + ef.Finished = true + } +} + +func makeID() efOP { + id := uuid.New() + return func(ef *backuppb.IngestedSSTs) { + ef.BackupUuid = id[:] + } +} + +func prefix(pfx string) efOP { + return func(ef *backuppb.IngestedSSTs) { + ef.FilesPrefixHint = pfx + } +} + +func asIfTS(ts uint64) efOP { + return func(ef *backuppb.IngestedSSTs) { + ef.AsIfTs = ts + } +} + func m(storeId int64, minTS, maxTS uint64) *backuppb.Metadata { return &backuppb.Metadata{ StoreId: storeId, @@ -446,6 +494,12 @@ func m(storeId int64, minTS, maxTS uint64) *backuppb.Metadata { type migOP func(*backuppb.Migration) +func mExtFullBackup(path ...string) migOP { + return func(m *backuppb.Migration) { + m.IngestedSstPaths = append(m.IngestedSstPaths, path...) + } +} + func mDstrPfx(path ...string) migOP { return func(m *backuppb.Migration) { m.DestructPrefix = append(m.DestructPrefix, path...) 
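The `hashMigration` change above folds each ingested-SST path into the migration hash. A standalone sketch of the IngestedSstPaths/TruncatedTo part of that scheme, useful for seeing why the result is insensitive to path ordering (the helper name, sample path, and `TruncatedTo` value are illustrative, not part of the patch):

```go
package main

import (
	"fmt"
	"hash/crc64"
)

// migrationHashSketch mirrors the IngestedSstPaths/TruncatedTo portion of hashMigration:
// each path is CRC64(ISO)-checksummed and XORed into the accumulator, which is finally
// XORed with TruncatedTo. XOR makes the result independent of path ordering.
func migrationHashSketch(ingestedSSTPaths []string, truncatedTo uint64) uint64 {
	var acc uint64
	table := crc64.MakeTable(crc64.ISO)
	for _, p := range ingestedSSTPaths {
		acc ^= crc64.Checksum([]byte(p), table)
	}
	return acc ^ truncatedTo
}

func main() {
	paths := []string{"v1/ext_backups/backup-0000000000000001/extbackupmeta"}
	fmt.Printf("%016X\n", migrationHashSketch(paths, 42))
}
```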
@@ -625,7 +679,7 @@ func tmp(t *testing.T) *storage.LocalStorage { } func mig(ops ...migOP) *backuppb.Migration { - mig := &backuppb.Migration{} + mig := NewMigration() for _, op := range ops { op(mig) } @@ -2442,7 +2496,7 @@ func TestBasicMigration(t *testing.T) { ) bs := storage.Batch(s) - est := MigerationExtension(bs) + est := MigrationExtension(bs) res := MergeMigrations(mig1, mig2) resE := mig( @@ -2460,7 +2514,7 @@ func TestBasicMigration(t *testing.T) { requireMigrationsEqual(t, resE, res) ctx := context.Background() - mg := est.MigrateTo(ctx, res) + mg := est.migrateTo(ctx, res) newBaseE := mig(mLogDel("00002.meta", spans("00001.log", 1024, sp(0, 42), sp(42, 18)))) require.Empty(t, mg.Warnings) @@ -2476,7 +2530,7 @@ func TestBasicMigration(t *testing.T) { delRem := mig(mLogDel("00002.meta", spans("00001.log", 1024, sp(60, 1024-60)))) newNewBase := MergeMigrations(mg.NewBase, delRem) - mg = est.MigrateTo(ctx, newNewBase) + mg = est.migrateTo(ctx, newNewBase) require.Empty(t, mg.Warnings) requireMigrationsEqual(t, mg.NewBase, mig()) } @@ -2508,7 +2562,7 @@ func TestMergeAndMigrateTo(t *testing.T) { mig3p := pmig(s, 3, mig3) bs := storage.Batch(s) - est := MigerationExtension(bs) + est := MigrationExtension(bs) ctx := context.Background() migs, err := est.Load(ctx) @@ -2576,7 +2630,7 @@ func TestRemoveCompaction(t *testing.T) { mTruncatedTo(20), ) bs := storage.Batch(s) - est := MigerationExtension(bs) + est := MigrationExtension(bs) merged := MergeMigrations(mig1, mig2) requireMigrationsEqual(t, merged, mig( @@ -2587,7 +2641,7 @@ func TestRemoveCompaction(t *testing.T) { mTruncatedTo(30), )) - mg := est.MigrateTo(ctx, merged) + mg := est.migrateTo(ctx, merged) requireMigrationsEqual(t, mg.NewBase, mig( mCompaction(cDir(1), aDir(1), 10, 40), mCompaction(cDir(2), aDir(2), 35, 50), @@ -2615,7 +2669,7 @@ func TestRetry(t *testing.T) { require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/br/pkg/storage/local_write_file_err", `1*return("this disk remembers nothing")`)) ctx := context.Background() - est := MigerationExtension(s) + est := MigrationExtension(s) mg := est.MergeAndMigrateTo(ctx, 2) require.Len(t, mg.Warnings, 1) require.Error(t, mg.Warnings[0], "this disk remembers nothing") @@ -2646,8 +2700,8 @@ func TestRetryRemoveCompaction(t *testing.T) { ) require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/br/pkg/storage/local_delete_file_err", `1*return("this disk will never forget")`)) - est := MigerationExtension(s) - mg := est.MigrateTo(ctx, mig1) + est := MigrationExtension(s) + mg := est.migrateTo(ctx, mig1) require.Len(t, mg.Warnings, 1) require.Error(t, mg.Warnings[0], "this disk will never forget") requireMigrationsEqual(t, mg.NewBase, mig( @@ -2656,7 +2710,7 @@ func TestRetryRemoveCompaction(t *testing.T) { mDstrPfx(cDir(1), aDir(1)), )) - mg = est.MigrateTo(ctx, mg.NewBase) + mg = est.migrateTo(ctx, mg.NewBase) require.Empty(t, mg.Warnings) requireMigrationsEqual(t, mg.NewBase, mig( mCompaction(placeholder(cDir(2)), placeholder(aDir(2)), 28, 32), @@ -2690,10 +2744,10 @@ func TestWithSimpleTruncate(t *testing.T) { }, })) - est := MigerationExtension(s) + est := MigrationExtension(s) m := mig(mTruncatedTo(65)) var res MigratedTo - effs := est.DryRun(func(me MigrationExt) { res = me.MigrateTo(ctx, m) }) + effs := est.DryRun(func(me MigrationExt) { res = me.migrateTo(ctx, m) }) require.Empty(t, res.Warnings) for _, eff := range effs { @@ -2748,7 +2802,7 @@ func TestAppendingMigs(t *testing.T) { asp(fi(80, 85, WriteCF, 72), sp(34, 5)), }, }), lN(2)) - est := 
MigerationExtension(s) + est := MigrationExtension(s) cDir := func(n uint64) string { return fmt.Sprintf("%05d/output", n) } aDir := func(n uint64) string { return fmt.Sprintf("%05d/metas", n) } @@ -2781,7 +2835,7 @@ func TestUserAbort(t *testing.T) { pmig(s, 0, mig(mTruncatedTo(42))) pmig(s, 1, mig(mTruncatedTo(96))) - est := MigerationExtension(s) + est := MigrationExtension(s) var res MergeAndMigratedTo effs := est.DryRun(func(me MigrationExt) { res = me.MergeAndMigrateTo(ctx, 1, MMOptInteractiveCheck(func(ctx context.Context, m *backuppb.Migration) bool { @@ -2798,7 +2852,7 @@ func TestUnsupportedVersion(t *testing.T) { m := mig(mVersion(backuppb.MigrationVersion(65535))) pmig(s, 1, m) - est := MigerationExtension(s) + est := MigrationExtension(s) ctx := context.Background() _, err := est.Load(ctx) require.Error(t, err) @@ -2810,3 +2864,129 @@ func TestCreator(t *testing.T) { require.Contains(t, mig.Creator, "br") require.Equal(t, mig.Version, SupportedMigVersion) } + +func TestGroupedExtFullBackup(t *testing.T) { + ctx := context.Background() + s := tmp(t) + placeholder := func(pfx string) string { + path := path.Join(pfx, "monolith") + require.NoError(t, s.WriteFile(ctx, path, []byte("🪨"))) + return path + } + idx := 0 + somewhere := func() string { + idx += 1 + return placeholder(fmt.Sprintf("%06d", idx)) + } + + type Case struct { + InputGroups []*backuppb.IngestedSSTs + TruncatedTo uint64 + + RequireRem []int + } + + cases := []Case{ + { + InputGroups: []*backuppb.IngestedSSTs{ + extFullBkup(prefix(somewhere()), asIfTS(10), makeID(), finished()), + extFullBkup(prefix(somewhere()), asIfTS(12), makeID(), finished()), + }, + TruncatedTo: 11, + RequireRem: []int{1}, + }, + { + InputGroups: []*backuppb.IngestedSSTs{ + extFullBkup(prefix(somewhere()), asIfTS(10), makeID(), finished()), + extFullBkup(prefix(somewhere()), asIfTS(12), makeID(), finished()), + }, + TruncatedTo: 13, + RequireRem: []int{}, + }, + { + InputGroups: []*backuppb.IngestedSSTs{ + extFullBkup(prefix(somewhere()), asIfTS(10), makeID(), finished()), + extFullBkup(prefix(somewhere()), asIfTS(12), makeID(), finished()), + }, + TruncatedTo: 10, + RequireRem: []int{0, 1}, + }, + { + InputGroups: func() []*backuppb.IngestedSSTs { + id := makeID() + return []*backuppb.IngestedSSTs{ + extFullBkup(prefix(somewhere()), id), + extFullBkup(prefix(somewhere()), asIfTS(10), id, finished()), + extFullBkup(prefix(somewhere()), asIfTS(12), makeID(), finished()), + } + }(), + TruncatedTo: 11, + RequireRem: []int{2}, + }, + { + InputGroups: func() []*backuppb.IngestedSSTs { + id := makeID() + return []*backuppb.IngestedSSTs{ + extFullBkup(prefix(somewhere()), id), + extFullBkup(prefix(somewhere()), asIfTS(12), id, finished()), + extFullBkup(prefix(somewhere()), asIfTS(10), makeID(), finished()), + } + }(), + TruncatedTo: 11, + RequireRem: []int{0, 1}, + }, + { + InputGroups: func() []*backuppb.IngestedSSTs { + id := makeID() + return []*backuppb.IngestedSSTs{ + extFullBkup(prefix(somewhere()), asIfTS(999), id), + extFullBkup(prefix(somewhere()), asIfTS(10), id, finished()), + extFullBkup(prefix(somewhere()), asIfTS(12), makeID(), finished()), + } + }(), + TruncatedTo: 11, + RequireRem: []int{2}, + }, + { + InputGroups: []*backuppb.IngestedSSTs{ + extFullBkup(prefix(somewhere()), asIfTS(10), makeID()), + extFullBkup(prefix(somewhere()), asIfTS(12), makeID()), + extFullBkup(prefix(somewhere()), asIfTS(14), makeID()), + }, + TruncatedTo: 11, + RequireRem: []int{0, 1, 2}, + }, + } + + for i, c := range cases { + t.Run(fmt.Sprintf("#%d", 
i), func(t *testing.T) { + m := mig() + paths := []PathedIngestedSSTs{} + for i, input := range c.InputGroups { + p := pef(t, input, i, s) + paths = append(paths, PathedIngestedSSTs{ + path: p, + IngestedSSTs: input, + }) + mExtFullBackup(p)(m) + require.FileExists(t, path.Join(s.Base(), input.FilesPrefixHint)) + } + mTruncatedTo(c.TruncatedTo)(m) + est := MigrationExtension(s) + res := est.migrateTo(ctx, m) + require.NoError(t, multierr.Combine(res.Warnings...)) + chosen := []string{} + nonChosen := []PathedIngestedSSTs{} + forgottenIdx := 0 + for _, i := range c.RequireRem { + chosen = append(chosen, paths[i].path) + nonChosen = append(nonChosen, paths[forgottenIdx:i]...) + forgottenIdx = i + 1 + } + require.ElementsMatch(t, chosen, res.NewBase.IngestedSstPaths) + for _, p := range nonChosen { + require.NoFileExists(t, path.Join(s.Base(), p.FilesPrefixHint, "monolith")) + } + }) + } +} diff --git a/br/pkg/streamhelper/basic_lib_for_test.go b/br/pkg/streamhelper/basic_lib_for_test.go index 22fa031854fbe..bbc74b0d44b85 100644 --- a/br/pkg/streamhelper/basic_lib_for_test.go +++ b/br/pkg/streamhelper/basic_lib_for_test.go @@ -183,6 +183,11 @@ func (f *fakeStore) GetID() uint64 { return f.id } +func (f *fakeStore) FlushNow(ctx context.Context, in *logbackup.FlushNowRequest, opts ...grpc.CallOption) (*logbackup.FlushNowResponse, error) { + f.flush() + return &logbackup.FlushNowResponse{Results: []*logbackup.FlushResult{{TaskName: "Universe", Success: true}}}, nil +} + func (f *fakeStore) SubscribeFlushEvent(ctx context.Context, in *logbackup.SubscribeFlushEventRequest, opts ...grpc.CallOption) (logbackup.LogBackup_SubscribeFlushEventClient, error) { f.clientMu.Lock() defer f.clientMu.Unlock() diff --git a/br/pkg/summary/summary.go b/br/pkg/summary/summary.go index 3642410bdcd16..0a5b7f7fcd255 100644 --- a/br/pkg/summary/summary.go +++ b/br/pkg/summary/summary.go @@ -3,11 +3,16 @@ package summary import ( + "sync/atomic" "time" "go.uber.org/zap" ) +var ( + lastStatus atomic.Bool +) + // SetUnit set unit "backup/restore" for summary log. func SetUnit(unit string) { collector.SetUnit(unit) @@ -40,9 +45,15 @@ func CollectUint(name string, t uint64) { // SetSuccessStatus sets final success status. func SetSuccessStatus(success bool) { + lastStatus.Store(success) collector.SetSuccessStatus(success) } +// Succeed returns whether the last call to `SetSuccessStatus` passes `true`. 
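// The flag is package-global, so a harness that drives several BR tasks in
// one process (like the realtikvtest kit later in this patch) should reset it
// before each run and read it afterwards. Illustrative usage, not part of
// this patch:
//
//	summary.SetSuccessStatus(false)
//	err := task.RunBackup(ctx, g, "backup full", &cfg)
//	ok := err == nil && summary.Succeed()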
+func Succeed() bool { + return lastStatus.Load() +} + // NowDureTime returns the duration between start time and current time func NowDureTime() time.Duration { return collector.NowDureTime() diff --git a/br/pkg/task/BUILD.bazel b/br/pkg/task/BUILD.bazel index d6d4aaaf0291e..2fde9e8981389 100644 --- a/br/pkg/task/BUILD.bazel +++ b/br/pkg/task/BUILD.bazel @@ -50,6 +50,7 @@ go_library( "//br/pkg/streamhelper/daemon", "//br/pkg/summary", "//br/pkg/utils", + "//br/pkg/utils/iter", "//br/pkg/version", "//pkg/config", "//pkg/ddl", @@ -58,6 +59,7 @@ go_library( "//pkg/infoschema/context", "//pkg/kv", "//pkg/meta/model", + "//pkg/metrics", "//pkg/parser/ast", "//pkg/parser/mysql", "//pkg/sessionctx/stmtctx", @@ -98,6 +100,7 @@ go_library( "@org_golang_x_sync//errgroup", "@org_uber_go_multierr//:multierr", "@org_uber_go_zap//:zap", + "@org_uber_go_zap//zapcore", ], ) diff --git a/br/pkg/task/common.go b/br/pkg/task/common.go index 1813741634609..c95bee08c59d8 100644 --- a/br/pkg/task/common.go +++ b/br/pkg/task/common.go @@ -596,6 +596,10 @@ func (cfg *Config) normalizePDURLs() error { return nil } +func (cfg *Config) UserFiltered() bool { + return len(cfg.Schemas) != 0 || len(cfg.Tables) != 0 || len(cfg.FilterStr) != 0 +} + // ParseFromFlags parses the config from the flag set. func (cfg *Config) ParseFromFlags(flags *pflag.FlagSet) error { var err error diff --git a/br/pkg/task/operator/BUILD.bazel b/br/pkg/task/operator/BUILD.bazel index 14760027a49b8..6d232d6c36bf0 100644 --- a/br/pkg/task/operator/BUILD.bazel +++ b/br/pkg/task/operator/BUILD.bazel @@ -4,7 +4,9 @@ go_library( name = "operator", srcs = [ "base64ify.go", + "checksum_table.go", "config.go", + "force_flush.go", "list_migration.go", "migrate_to.go", "prepare_snap.go", @@ -12,22 +14,37 @@ go_library( importpath = "github.com/pingcap/tidb/br/pkg/task/operator", visibility = ["//visibility:public"], deps = [ + "//br/pkg/backup", "//br/pkg/backup/prepare_snap", + "//br/pkg/checksum", + "//br/pkg/conn", "//br/pkg/errors", "//br/pkg/glue", "//br/pkg/logutil", + "//br/pkg/metautil", "//br/pkg/pdutil", "//br/pkg/storage", "//br/pkg/stream", "//br/pkg/task", "//br/pkg/utils", + "//pkg/domain", + "//pkg/meta/model", + "//pkg/util", + "//pkg/util/engine", "@com_github_fatih_color//:color", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/brpb", + "@com_github_pingcap_kvproto//pkg/logbackuppb", + "@com_github_pingcap_kvproto//pkg/metapb", "@com_github_pingcap_log//:log", "@com_github_spf13_pflag//:pflag", + "@com_github_tikv_client_go_v2//oracle", "@com_github_tikv_client_go_v2//tikv", + "@com_github_tikv_client_go_v2//util", + "@com_github_tikv_pd_client//:client", + "@com_github_tikv_pd_client//opt", + "@org_golang_google_grpc//:grpc", "@org_golang_google_grpc//keepalive", "@org_golang_x_sync//errgroup", "@org_uber_go_multierr//:multierr", diff --git a/br/pkg/task/operator/checksum_table.go b/br/pkg/task/operator/checksum_table.go new file mode 100644 index 0000000000000..dbf9d751bd370 --- /dev/null +++ b/br/pkg/task/operator/checksum_table.go @@ -0,0 +1,271 @@ +package operator + +import ( + "context" + "encoding/json" + "os" + "sync" + "sync/atomic" + + kvutil "github.com/tikv/client-go/v2/util" + "golang.org/x/sync/errgroup" + + "github.com/pingcap/errors" + backup "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/checksum" + "github.com/pingcap/tidb/br/pkg/conn" + "github.com/pingcap/tidb/br/pkg/glue" + 
"github.com/pingcap/tidb/br/pkg/metautil" + "github.com/pingcap/tidb/br/pkg/task" + "github.com/pingcap/tidb/pkg/domain" + "github.com/pingcap/tidb/pkg/meta/model" + "github.com/pingcap/tidb/pkg/util" + "github.com/tikv/client-go/v2/oracle" + "go.uber.org/zap" +) + +type checksumTableCtx struct { + cfg ChecksumWithRewriteRulesConfig + + mgr *conn.Mgr + dom *domain.Domain +} + +type tableInDB struct { + info *model.TableInfo + dbName string +} + +func RunChecksumTable(ctx context.Context, g glue.Glue, cfg ChecksumWithRewriteRulesConfig) error { + c := &checksumTableCtx{cfg: cfg} + + if err := c.init(ctx, g); err != nil { + return errors.Trace(err) + } + + curr, err := c.getTables(ctx) + if err != nil { + return errors.Trace(err) + } + + old, err := c.loadOldTableIDs(ctx) + if err != nil { + return errors.Trace(err) + } + + reqs, err := c.genRequests(ctx, old, curr) + if err != nil { + return errors.Trace(err) + } + + results, err := c.runChecksum(ctx, reqs) + if err != nil { + return errors.Trace(err) + } + + for _, result := range results { + log.Info("Checksum result", zap.String("db", result.DBName), zap.String("table", result.TableName), zap.Uint64("checksum", result.Checksum), + zap.Uint64("total_bytes", result.TotalBytes), zap.Uint64("total_kvs", result.TotalKVs)) + } + + return json.NewEncoder(os.Stdout).Encode(results) +} + +func (c *checksumTableCtx) init(ctx context.Context, g glue.Glue) error { + cfg := c.cfg + var err error + c.mgr, err = task.NewMgr(ctx, g, cfg.PD, cfg.TLS, task.GetKeepalive(&cfg.Config), cfg.CheckRequirements, true, conn.NormalVersionChecker) + if err != nil { + return err + } + + c.dom, err = g.GetDomain(c.mgr.GetStorage()) + if err != nil { + return err + } + return nil +} + +func (c *checksumTableCtx) getTables(ctx context.Context) (res []tableInDB, err error) { + sch := c.dom.InfoSchema() + dbs := sch.AllSchemas() + for _, db := range dbs { + if !c.cfg.TableFilter.MatchSchema(db.Name.L) { + continue + } + + tbls, err := sch.SchemaTableInfos(ctx, db.Name) + if err != nil { + return nil, errors.Annotatef(err, "failed to load data for db %s", db.Name) + } + for _, tbl := range tbls { + if !c.cfg.TableFilter.MatchTable(db.Name.L, tbl.Name.L) { + continue + } + log.Info("Added table from cluster.", zap.String("db", db.Name.L), zap.String("table", tbl.Name.L)) + res = append(res, tableInDB{ + info: tbl, + dbName: db.Name.L, + }) + } + } + + return +} + +func (c *checksumTableCtx) loadOldTableIDs(ctx context.Context) (res []*metautil.Table, err error) { + _, strg, err := task.GetStorage(ctx, c.cfg.Storage, &c.cfg.Config) + if err != nil { + return nil, errors.Annotate(err, "failed to create storage") + } + + mPath := metautil.MetaFile + metaContent, err := strg.ReadFile(ctx, mPath) + if err != nil { + return nil, errors.Annotatef(err, "failed to open metafile %s", mPath) + } + + var backupMeta backup.BackupMeta + if err := backupMeta.Unmarshal(metaContent); err != nil { + return nil, errors.Annotate(err, "failed to parse backupmeta") + } + + metaReader := metautil.NewMetaReader(&backupMeta, strg, &c.cfg.CipherInfo) + + tblCh := make(chan *metautil.Table, 1024) + errCh := make(chan error, 1) + go func() { + if err := metaReader.ReadSchemasFiles(ctx, tblCh, metautil.SkipFiles, metautil.SkipStats); err != nil { + errCh <- errors.Annotate(err, "failed to read schema files") + } + close(tblCh) + }() + + for { + select { + case err := <-errCh: + return nil, err + case tbl, ok := <-tblCh: + if !ok { + return + } + if !c.cfg.TableFilter.MatchTable(tbl.DB.Name.L, 
tbl.Info.Name.L) { + continue + } + log.Info("Added table from backup data.", zap.String("db", tbl.DB.Name.L), zap.String("table", tbl.Info.Name.L)) + res = append(res, tbl) + case <-ctx.Done(): + return nil, ctx.Err() + } + } +} + +type request struct { + copReq *checksum.Executor + tableName string + dbName string +} + +func (c *checksumTableCtx) genRequests(ctx context.Context, bkup []*metautil.Table, curr []tableInDB) (reqs []request, err error) { + phy, logi, err := c.mgr.GetPDClient().GetTS(ctx) + if err != nil { + return nil, errors.Annotate(err, "failed to get TSO for checksumming") + } + tso := oracle.ComposeTS(phy, logi) + + bkupTbls := map[string]map[string]*metautil.Table{} + for _, t := range bkup { + m, ok := bkupTbls[t.DB.Name.L] + if !ok { + m = make(map[string]*metautil.Table) + bkupTbls[t.DB.Name.L] = m + } + + m[t.Info.Name.L] = t + } + + for _, t := range curr { + rb := checksum.NewExecutorBuilder(t.info, tso) + rb.SetConcurrency(c.cfg.ChecksumConcurrency) + oldDB, ok := bkupTbls[t.dbName] + if !ok { + log.Warn("db not found, will skip", zap.String("db", t.dbName)) + continue + } + oldTable, ok := oldDB[t.info.Name.L] + if !ok { + log.Warn("table not found, will skip", zap.String("db", t.dbName), zap.String("table", t.info.Name.L)) + continue + } + + rb.SetOldTable(oldTable) + rb.SetExplicitRequestSourceType(kvutil.ExplicitTypeBR) + req, err := rb.Build() + if err != nil { + return nil, errors.Annotatef(err, "failed to build checksum builder for table %s.%s", t.dbName, t.info.Name.L) + } + reqs = append(reqs, request{ + copReq: req, + dbName: t.dbName, + tableName: t.info.Name.L, + }) + } + + return +} + +type ChecksumResult struct { + DBName string `json:"db_name"` + TableName string `json:"table_name"` + + Checksum uint64 `json:"checksum"` + TotalBytes uint64 `json:"total_bytes"` + TotalKVs uint64 `json:"total_kvs"` +} + +func (c *checksumTableCtx) runChecksum(ctx context.Context, reqs []request) ([]ChecksumResult, error) { + wkPool := util.NewWorkerPool(c.cfg.TableConcurrency, "checksum") + eg, ectx := errgroup.WithContext(ctx) + results := make([]ChecksumResult, 0, len(reqs)) + resultsMu := new(sync.Mutex) + + for _, req := range reqs { + req := req + wkPool.ApplyOnErrorGroup(eg, func() error { + total := req.copReq.Len() + finished := new(atomic.Int64) + resp, err := req.copReq.Execute(ectx, c.mgr.GetStorage().GetClient(), func() { + finished.Add(1) + log.Info( + "Finish one request of a table.", + zap.String("db", req.dbName), + zap.String("table", req.tableName), + zap.Int64("finished", finished.Load()), + zap.Int64("total", int64(total)), + ) + }) + if err != nil { + return err + } + res := ChecksumResult{ + DBName: req.dbName, + TableName: req.tableName, + + Checksum: resp.Checksum, + TotalBytes: resp.TotalBytes, + TotalKVs: resp.TotalKvs, + } + resultsMu.Lock() + results = append(results, res) + resultsMu.Unlock() + return nil + }) + } + + if err := eg.Wait(); err != nil { + return nil, err + } + + return results, nil +} diff --git a/br/pkg/task/operator/config.go b/br/pkg/task/operator/config.go index c42382abe504d..03996beed3011 100644 --- a/br/pkg/task/operator/config.go +++ b/br/pkg/task/operator/config.go @@ -3,15 +3,32 @@ package operator import ( + "regexp" "time" "github.com/pingcap/errors" + "github.com/pingcap/tidb/br/pkg/backup" berrors "github.com/pingcap/tidb/br/pkg/errors" "github.com/pingcap/tidb/br/pkg/storage" "github.com/pingcap/tidb/br/pkg/task" "github.com/spf13/pflag" ) +const ( + flagTableConcurrency = "table-concurrency" + 
flagStorePatterns = "stores" + flagTTL = "ttl" + flagSafePoint = "safepoint" + flagStorage = "storage" + flagLoadCreds = "load-creds" + flagJSON = "json" + flagRecent = "recent" + flagTo = "to" + flagBase = "base" + flagYes = "yes" + flagDryRun = "dry-run" +) + type PauseGcConfig struct { task.Config @@ -23,8 +40,8 @@ type PauseGcConfig struct { } func DefineFlagsForPrepareSnapBackup(f *pflag.FlagSet) { - _ = f.DurationP("ttl", "i", 2*time.Minute, "The time-to-live of the safepoint.") - _ = f.Uint64P("safepoint", "t", 0, "The GC safepoint to be kept.") + _ = f.DurationP(flagTTL, "i", 2*time.Minute, "The time-to-live of the safepoint.") + _ = f.Uint64P(flagSafePoint, "t", 0, "The GC safepoint to be kept.") } // ParseFromFlags fills the config via the flags. @@ -34,11 +51,11 @@ func (cfg *PauseGcConfig) ParseFromFlags(flags *pflag.FlagSet) error { } var err error - cfg.SafePoint, err = flags.GetUint64("safepoint") + cfg.SafePoint, err = flags.GetUint64(flagSafePoint) if err != nil { return err } - cfg.TTL, err = flags.GetDuration("ttl") + cfg.TTL, err = flags.GetDuration(flagTTL) if err != nil { return err } @@ -54,8 +71,8 @@ type Base64ifyConfig struct { func DefineFlagsForBase64ifyConfig(flags *pflag.FlagSet) { storage.DefineFlags(flags) - flags.StringP("storage", "s", "", "The external storage input.") - flags.Bool("load-creds", false, "whether loading the credientials from current environment and marshal them to the base64 string. [!]") + flags.StringP(flagStorage, "s", "", "The external storage input.") + flags.Bool(flagLoadCreds, false, "whether loading the credientials from current environment and marshal them to the base64 string. [!]") } func (cfg *Base64ifyConfig) ParseFromFlags(flags *pflag.FlagSet) error { @@ -64,11 +81,11 @@ func (cfg *Base64ifyConfig) ParseFromFlags(flags *pflag.FlagSet) error { if err != nil { return err } - cfg.StorageURI, err = flags.GetString("storage") + cfg.StorageURI, err = flags.GetString(flagStorage) if err != nil { return err } - cfg.LoadCerd, err = flags.GetBool("load-creds") + cfg.LoadCerd, err = flags.GetBool(flagLoadCreds) if err != nil { return err } @@ -83,8 +100,8 @@ type ListMigrationConfig struct { func DefineFlagsForListMigrationConfig(flags *pflag.FlagSet) { storage.DefineFlags(flags) - flags.StringP("storage", "s", "", "the external storage input.") - flags.Bool("json", false, "output the result in json format.") + flags.StringP(flagStorage, "s", "", "the external storage input.") + flags.Bool(flagJSON, false, "output the result in json format.") } func (cfg *ListMigrationConfig) ParseFromFlags(flags *pflag.FlagSet) error { @@ -93,11 +110,11 @@ func (cfg *ListMigrationConfig) ParseFromFlags(flags *pflag.FlagSet) error { if err != nil { return err } - cfg.StorageURI, err = flags.GetString("storage") + cfg.StorageURI, err = flags.GetString(flagStorage) if err != nil { return err } - cfg.JSONOutput, err = flags.GetBool("json") + cfg.JSONOutput, err = flags.GetBool(flagJSON) if err != nil { return err } @@ -115,15 +132,6 @@ type MigrateToConfig struct { DryRun bool } -const ( - flagStorage = "storage" - flagRecent = "recent" - flagTo = "to" - flagBase = "base" - flagYes = "yes" - flagDryRun = "dry-run" -) - func DefineFlagsForMigrateToConfig(flags *pflag.FlagSet) { storage.DefineFlags(flags) flags.StringP(flagStorage, "s", "", "the external storage input.") @@ -180,3 +188,43 @@ func (cfg *MigrateToConfig) Verify() error { } return nil } + +type ForceFlushConfig struct { + task.Config + + StoresPattern *regexp.Regexp +} + +func 
DefineFlagsForForceFlushConfig(f *pflag.FlagSet) { + f.String(flagStorePatterns, ".*", "The regexp to match the store peer address to be force flushed.") +} + +func (cfg *ForceFlushConfig) ParseFromFlags(flags *pflag.FlagSet) (err error) { + storePat, err := flags.GetString(flagStorePatterns) + if err != nil { + return err + } + cfg.StoresPattern, err = regexp.Compile(storePat) + if err != nil { + return errors.Annotatef(err, "invalid expression in --%s", flagStorePatterns) + } + + return cfg.Config.ParseFromFlags(flags) +} + +type ChecksumWithRewriteRulesConfig struct { + task.Config +} + +func DefineFlagsForChecksumTableConfig(f *pflag.FlagSet) { + f.Uint(flagTableConcurrency, backup.DefaultSchemaConcurrency, "The size of a BR thread pool used for backup table metas, "+ + "including tableInfo/checksum and stats.") +} + +func (cfg *ChecksumWithRewriteRulesConfig) ParseFromFlags(flags *pflag.FlagSet) (err error) { + cfg.TableConcurrency, err = flags.GetUint(flagTableConcurrency) + if err != nil { + return + } + return cfg.Config.ParseFromFlags(flags) +} diff --git a/br/pkg/task/operator/force_flush.go b/br/pkg/task/operator/force_flush.go new file mode 100644 index 0000000000000..4f9f622812c5b --- /dev/null +++ b/br/pkg/task/operator/force_flush.go @@ -0,0 +1,76 @@ +package operator + +import ( + "context" + "slices" + + "github.com/pingcap/errors" + logbackup "github.com/pingcap/kvproto/pkg/logbackuppb" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/log" + "github.com/pingcap/tidb/pkg/util/engine" + pd "github.com/tikv/pd/client" + "github.com/tikv/pd/client/opt" + "go.uber.org/zap" + "golang.org/x/sync/errgroup" + "google.golang.org/grpc" +) + +func getAllTiKVs(ctx context.Context, p pd.Client) ([]*metapb.Store, error) { + stores, err := p.GetAllStores(ctx, opt.WithExcludeTombstone()) + if err != nil { + return nil, err + } + withoutTiFlash := slices.DeleteFunc(stores, engine.IsTiFlash) + return withoutTiFlash, err +} + +func RunForceFlush(ctx context.Context, cfg *ForceFlushConfig) error { + pdMgr, err := dialPD(ctx, &cfg.Config) + if err != nil { + return err + } + defer pdMgr.Close() + + stores, err := createStoreManager(pdMgr.GetPDClient(), &cfg.Config) + if err != nil { + return err + } + defer stores.Close() + + tikvs, err := getAllTiKVs(ctx, pdMgr.GetPDClient()) + if err != nil { + return err + } + eg, ectx := errgroup.WithContext(ctx) + log.Info("About to start force flushing.", zap.Stringer("stores-pattern", cfg.StoresPattern)) + for _, s := range tikvs { + s := s + if !cfg.StoresPattern.MatchString(s.Address) { + log.Info("Skipping not matched TiKV.", zap.Uint64("store", s.GetId()), zap.String("addr", s.Address)) + } + log.Info("Starting force flush TiKV.", zap.Uint64("store", s.GetId()), zap.String("addr", s.Address)) + eg.Go(func() error { + var logBackupCli logbackup.LogBackupClient + err := stores.WithConn(ectx, s.GetId(), func(cc *grpc.ClientConn) { + logBackupCli = logbackup.NewLogBackupClient(cc) + }) + if err != nil { + return err + } + + resp, err := logBackupCli.FlushNow(ectx, &logbackup.FlushNowRequest{}) + if err != nil { + return errors.Annotatef(err, "failed to flush store %d", s.GetId()) + } + for _, res := range resp.Results { + if !res.Success { + return errors.Errorf("failed to flush task %s at store %d: %s", res.TaskName, s.GetId(), res.ErrorMessage) + } + log.Info("Force flushed task of TiKV store.", zap.Uint64("store", s.Id), zap.String("task", res.TaskName)) + } + return nil + }) + } + return eg.Wait() +} diff --git 
a/br/pkg/task/operator/list_migration.go b/br/pkg/task/operator/list_migration.go index d6c7efd57197a..1e030d7e0f3d8 100644 --- a/br/pkg/task/operator/list_migration.go +++ b/br/pkg/task/operator/list_migration.go @@ -26,8 +26,8 @@ func RunListMigrations(ctx context.Context, cfg ListMigrationConfig) error { if err != nil { return err } - ext := stream.MigerationExtension(st) - migs, err := ext.Load(ctx) + ext := stream.MigrationExtension(st) + migs, err := ext.Load(ctx, stream.MLNotFoundIsErr()) if err != nil { return err } @@ -40,12 +40,12 @@ func RunListMigrations(ctx context.Context, cfg ListMigrationConfig) error { console.Println(statusOK(fmt.Sprintf("Total %d Migrations.", len(migs.Layers)+1))) console.Printf("> BASE <\n") tbl := console.CreateTable() - stream.AddMigrationToTable(migs.Base, tbl) + ext.AddMigrationToTable(ctx, migs.Base, tbl) tbl.Print() for _, t := range migs.Layers { console.Printf("> %08d <\n", t.SeqNum) tbl := console.CreateTable() - stream.AddMigrationToTable(&t.Content, tbl) + ext.AddMigrationToTable(ctx, &t.Content, tbl) tbl.Print() } } diff --git a/br/pkg/task/operator/migrate_to.go b/br/pkg/task/operator/migrate_to.go index 282e82784ecb9..2a086b9868db1 100644 --- a/br/pkg/task/operator/migrate_to.go +++ b/br/pkg/task/operator/migrate_to.go @@ -5,7 +5,7 @@ import ( "github.com/fatih/color" "github.com/pingcap/errors" - backuppb "github.com/pingcap/kvproto/pkg/brpb" + backup "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/br/pkg/storage" "github.com/pingcap/tidb/br/pkg/stream" @@ -39,16 +39,16 @@ func (cx migrateToCtx) printErr(errs []error, msg string) { } } -func (cx migrateToCtx) askForContinue(targetMig *backuppb.Migration) bool { +func (cx migrateToCtx) askForContinue(ctx context.Context, targetMig *backup.Migration) bool { tbl := cx.console.CreateTable() - stream.AddMigrationToTable(targetMig, tbl) + cx.est.AddMigrationToTable(ctx, targetMig, tbl) cx.console.Println("The migration going to be executed will be like: ") tbl.Print() return cx.console.PromptBool("Continue? 
") } -func (cx migrateToCtx) dryRun(f func(stream.MigrationExt) stream.MergeAndMigratedTo) error { +func (cx migrateToCtx) dryRun(ctx context.Context, f func(stream.MigrationExt) stream.MergeAndMigratedTo) error { var ( est = cx.est console = cx.console @@ -60,7 +60,7 @@ func (cx migrateToCtx) dryRun(f func(stream.MigrationExt) stream.MergeAndMigrate }) tbl := console.CreateTable() - stream.AddMigrationToTable(estBase.NewBase, tbl) + cx.est.AddMigrationToTable(ctx, estBase.NewBase, tbl) console.Println("The new BASE migration will be like: ") tbl.Print() file, err := storage.SaveJSONEffectsToTmp(effects) @@ -90,7 +90,7 @@ func RunMigrateTo(ctx context.Context, cfg MigrateToConfig) error { console := glue.ConsoleOperations{ConsoleGlue: glue.StdIOGlue{}} - est := stream.MigerationExtension(st) + est := stream.MigrationExtension(st) est.Hooks = stream.NewProgressBarHooks(console) migs, err := est.Load(ctx) if err != nil { @@ -120,12 +120,14 @@ func RunMigrateTo(ctx context.Context, cfg MigrateToConfig) error { return nil } if cfg.DryRun { - run = cx.dryRun + run = func(f func(stream.MigrationExt) stream.MergeAndMigratedTo) error { + return cx.dryRun(ctx, f) + } } return run(func(est stream.MigrationExt) stream.MergeAndMigratedTo { - return est.MergeAndMigrateTo(ctx, targetVersion, stream.MMOptInteractiveCheck(func(ctx context.Context, m *backuppb.Migration) bool { - return cfg.Yes || cx.askForContinue(m) + return est.MergeAndMigrateTo(ctx, targetVersion, stream.MMOptInteractiveCheck(func(ctx context.Context, m *backup.Migration) bool { + return cfg.Yes || cx.askForContinue(ctx, m) })) }) } diff --git a/br/pkg/task/operator/prepare_snap.go b/br/pkg/task/operator/prepare_snap.go index 2f846e2ac9dc2..4bf6ed5b1b8e9 100644 --- a/br/pkg/task/operator/prepare_snap.go +++ b/br/pkg/task/operator/prepare_snap.go @@ -19,12 +19,32 @@ import ( "github.com/pingcap/tidb/br/pkg/task" "github.com/pingcap/tidb/br/pkg/utils" "github.com/tikv/client-go/v2/tikv" + pd "github.com/tikv/pd/client" "go.uber.org/multierr" "go.uber.org/zap" "golang.org/x/sync/errgroup" "google.golang.org/grpc/keepalive" ) +func createStoreManager(pd pd.Client, cfg *task.Config) (*utils.StoreManager, error) { + var ( + tconf *tls.Config + err error + ) + + if cfg.TLS.IsEnabled() { + tconf, err = cfg.TLS.ToTLSConfig() + if err != nil { + return nil, errors.Annotate(err, "invalid tls config") + } + } + kvMgr := utils.NewStoreManager(pd, keepalive.ClientParameters{ + Time: cfg.GRPCKeepaliveTime, + Timeout: cfg.GRPCKeepaliveTimeout, + }, tconf) + return kvMgr, nil +} + func dialPD(ctx context.Context, cfg *task.Config) (*pdutil.PdController, error) { var tc *tls.Config if cfg.TLS.IsEnabled() { diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index 1cb3fd9e92fe7..9b575bcea6c4b 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -15,7 +15,9 @@ import ( "github.com/google/uuid" "github.com/opentracing/opentracing-go" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/kvproto/pkg/encryptionpb" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/checkpoint" pconfig "github.com/pingcap/tidb/br/pkg/config" @@ -38,7 +40,9 @@ import ( "github.com/pingcap/tidb/pkg/infoschema" "github.com/pingcap/tidb/pkg/kv" "github.com/pingcap/tidb/pkg/meta/model" + "github.com/pingcap/tidb/pkg/metrics" "github.com/pingcap/tidb/pkg/parser/ast" + "github.com/pingcap/tidb/pkg/util" "github.com/pingcap/tidb/pkg/util/collate" 
"github.com/pingcap/tidb/pkg/util/engine" "github.com/spf13/cobra" @@ -276,6 +280,10 @@ type RestoreConfig struct { UseFSR bool `json:"use-fsr" toml:"use-fsr"` } +func (r *RestoreConfig) LocalEncryptionEnabled() bool { + return r.CipherInfo.CipherType != encryptionpb.EncryptionMethod_PLAINTEXT +} + // DefineRestoreFlags defines common flags for the restore tidb command. func DefineRestoreFlags(flags *pflag.FlagSet) { flags.Bool(flagNoSchema, false, "skip creating schemas and tables, reuse existing empty ones") @@ -665,6 +673,12 @@ func DefaultRestoreConfig(commonConfig Config) RestoreConfig { return cfg } +func printRestoreMetrics() { + log.Info("Metric: import_file_seconds", zap.Object("metric", logutil.MarshalHistogram(metrics.RestoreImportFileSeconds))) + log.Info("Metric: upload_sst_for_pitr_seconds", zap.Object("metric", logutil.MarshalHistogram(metrics.RestoreUploadSSTForPiTRSeconds))) + log.Info("Metric: upload_sst_meta_for_pitr_seconds", zap.Object("metric", logutil.MarshalHistogram(metrics.RestoreUploadSSTMetaForPiTRSeconds))) +} + // RunRestore starts a restore task inside the current goroutine. func RunRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConfig) error { etcdCLI, err := dialEtcdWithCfg(c, cfg.Config) @@ -676,7 +690,7 @@ func RunRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf log.Error("failed to close the etcd client", zap.Error(err)) } }() - if err := checkTaskExists(c, cfg, etcdCLI); err != nil { + if err := checkConflictingLogBackup(c, cfg, etcdCLI); err != nil { return errors.Annotate(err, "failed to check task exists") } closeF, err := registerTaskToPD(c, etcdCLI) @@ -698,6 +712,8 @@ func RunRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf } defer mgr.Close() + defer printRestoreMetrics() + var restoreError error if IsStreamRestore(cmdName) { if err := version.CheckClusterVersion(c, mgr.GetPDClient(), version.CheckVersionForBRPiTR); err != nil { @@ -786,14 +802,15 @@ func runSnapshotRestore(c context.Context, mgr *conn.Mgr, g glue.Glue, cmdName s // Init DB connection sessions err = client.Init(g, mgr.GetStorage()) defer client.Close() - if err != nil { return errors.Trace(err) } + u, s, backupMeta, err := ReadBackupMeta(ctx, metautil.MetaFile, &cfg.Config) if err != nil { return errors.Trace(err) } + if cfg.CheckRequirements { err := checkIncompatibleChangefeed(ctx, backupMeta.EndVersion, mgr.GetDomain().GetEtcdClient()) log.Info("Checking incompatible TiCDC changefeeds before restoring.", @@ -933,6 +950,15 @@ func runSnapshotRestore(c context.Context, mgr *conn.Mgr, g glue.Glue, cmdName s }() } + err = client.InstallPiTRSupport(ctx, snapclient.PiTRCollDep{ + PDCli: mgr.GetPDClient(), + EtcdCli: mgr.GetDomain().GetEtcdClient(), + Storage: util.ProtoV1Clone(u), + }) + if err != nil { + return errors.Trace(err) + } + sp := utils.BRServiceSafePoint{ BackupTS: restoreTS, TTL: utils.DefaultBRGCSafePointTTL, @@ -1161,6 +1187,11 @@ func runSnapshotRestore(c context.Context, mgr *conn.Mgr, g glue.Glue, cmdName s return errors.Trace(err) } + failpoint.InjectCall("run-snapshot-restore-about-to-finish", &err) + if err != nil { + return err + } + schedulersRemovable = true // Set task summary to success status. 
diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 0c22db685ce16..2ee1713f4f6c3 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -54,13 +54,16 @@ import ( "github.com/pingcap/tidb/br/pkg/streamhelper/daemon" "github.com/pingcap/tidb/br/pkg/summary" "github.com/pingcap/tidb/br/pkg/utils" + "github.com/pingcap/tidb/br/pkg/utils/iter" "github.com/pingcap/tidb/pkg/kv" "github.com/pingcap/tidb/pkg/meta/model" "github.com/pingcap/tidb/pkg/util/cdcutil" "github.com/spf13/pflag" "github.com/tikv/client-go/v2/oracle" clientv3 "go.etcd.io/etcd/client/v3" + "go.uber.org/multierr" "go.uber.org/zap" + "go.uber.org/zap/zapcore" ) const ( @@ -138,6 +141,18 @@ type StreamConfig struct { AdvancerCfg advancercfg.Config `json:"advancer-config" toml:"advancer-config"` } +func DefaultStreamConfig(flagsDef func(*pflag.FlagSet)) StreamConfig { + fs := pflag.NewFlagSet("dummy", pflag.ContinueOnError) + flagsDef(fs) + DefineCommonFlags(fs) + cfg := StreamConfig{} + err := cfg.ParseFromFlags(fs) + if err != nil { + log.Panic("failed to parse backup flags to config", zap.Error(err)) + } + return cfg +} + func (cfg *StreamConfig) makeStorage(ctx context.Context) (storage.ExternalStorage, error) { u, err := storage.ParseBackend(cfg.Storage, &cfg.BackendOptions) if err != nil { @@ -1090,13 +1105,13 @@ func RunStreamTruncate(c context.Context, g glue.Glue, cmdName string, cfg *Stre } if cfg.CleanUpCompactions { - est := stream.MigerationExtension(extStorage) + est := stream.MigrationExtension(extStorage) est.Hooks = stream.NewProgressBarHooks(console) newSN := math.MaxInt optPrompt := stream.MMOptInteractiveCheck(func(ctx context.Context, m *backuppb.Migration) bool { console.Println("We are going to do the following: ") tbl := console.CreateTable() - stream.AddMigrationToTable(m, tbl) + est.AddMigrationToTable(ctx, m, tbl) tbl.Print() return console.PromptBool("Continue? ") }) @@ -1190,9 +1205,9 @@ func RunStreamTruncate(c context.Context, g glue.Glue, cmdName string, cfg *Stre return nil } -// checkTaskExists checks whether there is a log backup task running. +// checkConflictingLogBackup checks whether there is a log backup task running. // If so, return an error. -func checkTaskExists(ctx context.Context, cfg *RestoreConfig, etcdCLI *clientv3.Client) error { +func checkConflictingLogBackup(ctx context.Context, cfg *RestoreConfig, etcdCLI *clientv3.Client) error { if err := checkConfigForStatus(cfg.PD); err != nil { return err } @@ -1203,15 +1218,37 @@ func checkTaskExists(ctx context.Context, cfg *RestoreConfig, etcdCLI *clientv3. if err != nil { return err } - if len(tasks) > 0 { - return errors.Errorf("log backup task is running: %s, "+ - "please stop the task before restore, and after PITR operation finished, "+ - "create log-backup task again and create a full backup on this cluster", tasks[0].Info.Name) + for _, task := range tasks { + if err := checkTaskCompat(cfg, task); err != nil { + return err + } } return nil } +func checkTaskCompat(cfg *RestoreConfig, task streamhelper.Task) error { + baseErr := errors.Errorf("log backup task is running: %s, and isn't compatible with your restore."+ + "You may check the extra information to get rid of this. 
If that doesn't work, you may "+ + "stop the task before restore, and after the restore operation finished, "+ + "create log-backup task again and create a full backup on this cluster.", task.Info.Name) + if len(cfg.FullBackupStorage) > 0 { + return errors.Annotate(baseErr, "you want to do point in time restore, which isn't compatible with an enabled log backup task yet") + } + if !cfg.UserFiltered() { + return errors.Annotate(baseErr, + "you want to restore a whole cluster, you may use `-f` or `restore table|database` to "+ + "specify the tables to restore to continue") + } + if cfg.LocalEncryptionEnabled() { + return errors.Annotate(baseErr, "the data you want to restore is encrypted, they cannot be copied to the log storage") + } + if task.Info.GetSecurityConfig().GetEncryption() != nil { + return errors.Annotate(baseErr, "the running log backup task is encrypted, the data copied to the log storage cannot work") + } + return nil +} + func checkIncompatibleChangefeed(ctx context.Context, backupTS uint64, etcdCLI *clientv3.Client) error { nameSet, err := cdcutil.GetIncompatibleChangefeedsWithSafeTS(ctx, etcdCLI, backupTS) if err != nil { @@ -1332,6 +1369,7 @@ func restoreStream( checkpointTotalKVCount uint64 checkpointTotalSize uint64 currentTS uint64 + extraFields []zapcore.Field mu sync.Mutex startTime = time.Now() ) @@ -1340,18 +1378,20 @@ func restoreStream( summary.Log("restore log failed summary", zap.Error(err)) } else { totalDureTime := time.Since(startTime) - summary.Log("restore log success summary", zap.Duration("total-take", totalDureTime), - zap.Uint64("source-start-point", cfg.StartTS), - zap.Uint64("source-end-point", cfg.RestoreTS), - zap.Uint64("target-end-point", currentTS), - zap.String("source-start", stream.FormatDate(oracle.GetTimeFromTS(cfg.StartTS))), - zap.String("source-end", stream.FormatDate(oracle.GetTimeFromTS(cfg.RestoreTS))), - zap.String("target-end", stream.FormatDate(oracle.GetTimeFromTS(currentTS))), - zap.Uint64("total-kv-count", totalKVCount), - zap.Uint64("skipped-kv-count-by-checkpoint", checkpointTotalKVCount), - zap.String("total-size", units.HumanSize(float64(totalSize))), - zap.String("skipped-size-by-checkpoint", units.HumanSize(float64(checkpointTotalSize))), - zap.String("average-speed", units.HumanSize(float64(totalSize)/totalDureTime.Seconds())+"/s"), + summary.Log("restore log success summary", + append([]zapcore.Field{zap.Duration("total-take", totalDureTime), + zap.Uint64("source-start-point", cfg.StartTS), + zap.Uint64("source-end-point", cfg.RestoreTS), + zap.Uint64("target-end-point", currentTS), + zap.String("source-start", stream.FormatDate(oracle.GetTimeFromTS(cfg.StartTS))), + zap.String("source-end", stream.FormatDate(oracle.GetTimeFromTS(cfg.RestoreTS))), + zap.String("target-end", stream.FormatDate(oracle.GetTimeFromTS(currentTS))), + zap.Uint64("total-kv-count", totalKVCount), + zap.Uint64("skipped-kv-count-by-checkpoint", checkpointTotalKVCount), + zap.String("total-size", units.HumanSize(float64(totalSize))), + zap.String("skipped-size-by-checkpoint", units.HumanSize(float64(checkpointTotalSize))), + zap.String("average-speed (log)", units.HumanSize(float64(totalSize)/totalDureTime.Seconds())+"/s")}, + extraFields...)..., ) } }() @@ -1376,6 +1416,7 @@ func restoreStream( return errors.Annotate(err, "failed to create restore client") } defer client.Close(ctx) + defer client.RestoreSSTStatisticFields(&extraFields) if taskInfo != nil && taskInfo.Metadata != nil { // reuse the task's rewrite ts @@ -1452,7 +1493,8 @@ func 
restoreStream( if err != nil { return errors.Trace(err) } - client.BuildMigrations(migs) + client.BuildMigrations(migs.Migs) + defer cleanUpWithRetErr(&err, migs.ReadLock.Unlock) // get full backup meta storage to generate rewrite rules. fullBackupStorage, err := parseFullBackupTablesStorage(cfg) @@ -1524,7 +1566,10 @@ func restoreStream( return errors.Trace(err) } - compactionIter := client.LogFileManager.GetCompactionIter(ctx) + numberOfKVsInSST, err := client.LogFileManager.CountExtraSSTTotalKVs(ctx) + if err != nil { + return err + } se, err := g.CreateSession(mgr.GetStorage()) if err != nil { @@ -1534,7 +1579,12 @@ func restoreStream( splitSize, splitKeys := utils.GetRegionSplitInfo(execCtx) log.Info("[Log Restore] get split threshold from tikv config", zap.Uint64("split-size", splitSize), zap.Int64("split-keys", splitKeys)) - pd := g.StartProgress(ctx, "Restore Files(SST + KV)", logclient.TotalEntryCount, !cfg.LogProgress) + addedSSTsIter := client.LogFileManager.GetIngestedSSTsSSTs(ctx) + compactionIter := client.LogFileManager.GetCompactionIter(ctx) + sstsIter := iter.ConcatAll(addedSSTsIter, compactionIter) + + totalWorkUnits := numberOfKVsInSST + int64(client.Stats.NumEntries) + pd := g.StartProgress(ctx, "Restore Files(SST + Log)", totalWorkUnits, !cfg.LogProgress) err = withProgress(pd, func(p glue.Progress) (pErr error) { updateStatsWithCheckpoint := func(kvCount, size uint64) { mu.Lock() @@ -1547,7 +1597,7 @@ func restoreStream( p.IncBy(int64(kvCount)) } compactedSplitIter, err := client.WrapCompactedFilesIterWithSplitHelper( - ctx, compactionIter, rewriteRules, sstCheckpointSets, + ctx, sstsIter, rewriteRules, sstCheckpointSets, updateStatsWithCheckpoint, splitSize, splitKeys, ) if err != nil { @@ -1999,6 +2049,15 @@ func checkPiTRTaskInfo( return checkInfo, nil } +func cleanUpWithRetErr(errOut *error, f func(ctx context.Context) error) { + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + err := f(ctx) + if errOut != nil { + *errOut = multierr.Combine(*errOut, err) + } +} + func waitUntilSchemaReload(ctx context.Context, client *logclient.LogClient) error { log.Info("waiting for schema info finishes reloading") reloadStart := time.Now() diff --git a/br/pkg/utils/iter/iter.go b/br/pkg/utils/iter/iter.go index 6f8f280905c2c..f1207cdec3b3b 100644 --- a/br/pkg/utils/iter/iter.go +++ b/br/pkg/utils/iter/iter.go @@ -5,6 +5,7 @@ package iter import ( "context" "fmt" + goiter "iter" ) // IterResult is the result of try to advancing an impure iterator. @@ -121,3 +122,23 @@ func Tap[T any](i TryNextor[T], with func(T)) TryNextor[T] { tapper: with, } } + +// AsSeq wraps an `TryNextor` to a Seq2. 
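// The error is yielded as the first element of the pair, so with Go 1.23
// range-over-func a caller can drain the iterator like this (illustrative
// only; `it` and `use` are placeholders):
//
//	for err, item := range AsSeq(ctx, it) {
//		if err != nil {
//			return err
//		}
//		use(item)
//	}
//
// Iteration stops when the underlying iterator reports Finished or when the
// loop body breaks or returns.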
+func AsSeq[T any](ctx context.Context, i TryNextor[T]) goiter.Seq2[error, T] { + return func(yield func(error, T) bool) { + for { + res := i.TryNext(ctx) + cont := true + if res.Err != nil { + cont = yield(res.Err, *new(T)) + } else if res.Finished { + cont = false + } else { + cont = yield(nil, res.Item) + } + if !cont { + break + } + } + } +} diff --git a/errors.toml b/errors.toml index b5a728cb2d7e8..c67de89bf7bb7 100644 --- a/errors.toml +++ b/errors.toml @@ -56,6 +56,11 @@ error = ''' invalid restore range ''' +["BR:Common:ErrMigrationNotFound"] +error = ''' +no migrtion found +''' + ["BR:Common:ErrMigrationVersionNotSupported"] error = ''' the migration version isn't supported diff --git a/go.mod b/go.mod index 7e9a0ac1a907d..012e598961546 100644 --- a/go.mod +++ b/go.mod @@ -87,7 +87,7 @@ require ( github.com/pingcap/errors v0.11.5-0.20240318064555-6bd07397691f github.com/pingcap/failpoint v0.0.0-20240528011301-b51a646c7c86 github.com/pingcap/fn v1.0.0 - github.com/pingcap/kvproto v0.0.0-20241120071417-b5b7843d9037 + github.com/pingcap/kvproto v0.0.0-20250108041715-3b77f2c65c63 github.com/pingcap/log v1.1.1-0.20241212030209-7e3ff8601a2a github.com/pingcap/sysutil v1.0.1-0.20240311050922-ae81ee01f3a5 github.com/pingcap/tidb/pkg/parser v0.0.0-20211011031125-9b13dc409c5e diff --git a/go.sum b/go.sum index 0e36eb7a11528..939b987305752 100644 --- a/go.sum +++ b/go.sum @@ -676,8 +676,8 @@ github.com/pingcap/fn v1.0.0/go.mod h1:u9WZ1ZiOD1RpNhcI42RucFh/lBuzTu6rw88a+oF2Z github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989 h1:surzm05a8C9dN8dIUmo4Be2+pMRb6f55i+UIYrluu2E= github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989/go.mod h1:O17XtbryoCJhkKGbT62+L2OlrniwqiGLSqrmdHCMzZw= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20241120071417-b5b7843d9037 h1:xYNSJjYNur4Dr5bV+9BXK9n5E0T1zlcAN25XX68+mOg= -github.com/pingcap/kvproto v0.0.0-20241120071417-b5b7843d9037/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= +github.com/pingcap/kvproto v0.0.0-20250108041715-3b77f2c65c63 h1:ThJ7ddLJVk96Iai2HDeyJGuuhrcBtc3HwYKJfuKPLsI= +github.com/pingcap/kvproto v0.0.0-20250108041715-3b77f2c65c63/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.0/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/log v1.1.1-0.20241212030209-7e3ff8601a2a h1:WIhmJBlNGmnCWH6TLMdZfNEDaiU8cFpZe3iaqDbQ0M8= diff --git a/pkg/executor/brie.go b/pkg/executor/brie.go index 07341ed5e0812..7076f8a8075fb 100644 --- a/pkg/executor/brie.go +++ b/pkg/executor/brie.go @@ -371,11 +371,16 @@ func (b *executorBuilder) buildBRIE(s *ast.BRIEStmt, schema *expression.Schema) case len(s.Tables) != 0: tables := make([]filter.Table, 0, len(s.Tables)) for _, tbl := range s.Tables { - tables = append(tables, filter.Table{Name: tbl.Name.O, Schema: tbl.Schema.O}) + table := filter.Table{Name: tbl.Name.O, Schema: tbl.Schema.O} + tables = append(tables, table) + cfg.FilterStr = append(cfg.FilterStr, table.String()) } cfg.TableFilter = filter.NewTablesFilter(tables...) case len(s.Schemas) != 0: cfg.TableFilter = filter.NewSchemasFilter(s.Schemas...) 
+ for _, schema := range s.Schemas { + cfg.FilterStr = append(cfg.FilterStr, fmt.Sprintf("`%s`.*", schema)) + } default: cfg.TableFilter = filter.All() } diff --git a/pkg/metrics/BUILD.bazel b/pkg/metrics/BUILD.bazel index 643972fbb6ce8..fdead9a11fb01 100644 --- a/pkg/metrics/BUILD.bazel +++ b/pkg/metrics/BUILD.bazel @@ -4,6 +4,7 @@ go_library( name = "metrics", srcs = [ "bindinfo.go", + "br.go", "ddl.go", "distsql.go", "disttask.go", diff --git a/pkg/metrics/br.go b/pkg/metrics/br.go new file mode 100644 index 0000000000000..e174f0185900e --- /dev/null +++ b/pkg/metrics/br.go @@ -0,0 +1,49 @@ +package metrics + +import "github.com/prometheus/client_golang/prometheus" + +var ( + RestoreImportFileSeconds prometheus.Histogram + RestoreUploadSSTForPiTRSeconds prometheus.Histogram + RestoreUploadSSTMetaForPiTRSeconds prometheus.Histogram + + // RestoreTableCreatedCount counts how many tables created. + RestoreTableCreatedCount prometheus.Counter +) + +func InitBRMetrics() { + RestoreTableCreatedCount = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "BR", + Name: "table_created", + Help: "The count of tables have been created.", + }) + + RestoreImportFileSeconds = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "tidb", + Subsystem: "br", + Name: "restore_import_file_seconds", + + Help: "The time cost for importing a file. (including the time costed in queuing)", + + Buckets: prometheus.ExponentialBuckets(0.01, 4, 14), + }) + + RestoreUploadSSTForPiTRSeconds = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "tidb", + Subsystem: "br", + Name: "restore_upload_sst_for_pitr_seconds", + + Help: "The time cost for uploading SST files for point-in-time recovery", + + Buckets: prometheus.DefBuckets, + }) + + RestoreUploadSSTMetaForPiTRSeconds = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "tidb", + Subsystem: "br", + Name: "restore_upload_sst_meta_for_pitr_seconds", + + Help: "The time cost for uploading SST metadata for point-in-time recovery", + Buckets: prometheus.ExponentialBuckets(0.01, 2, 14), + }) +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 17670041c2820..d74596a04acec 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -98,6 +98,10 @@ func InitMetrics() { InitInfoSchemaV2Metrics() timermetrics.InitTimerMetrics() + // For now, those metrics are initialized but not registered. + // They will be printed to log during restoring... 
+ InitBRMetrics() + PanicCounter = NewCounterVec( prometheus.CounterOpts{ Namespace: "tidb", diff --git a/pkg/util/BUILD.bazel b/pkg/util/BUILD.bazel index 79b9e1d58c546..5d84b0a9b441c 100644 --- a/pkg/util/BUILD.bazel +++ b/pkg/util/BUILD.bazel @@ -59,6 +59,8 @@ go_library( "@io_etcd_go_etcd_client_v3//:client", "@io_etcd_go_etcd_client_v3//concurrency", "@org_golang_google_grpc//:grpc", + "@org_golang_google_protobuf//proto", + "@org_golang_google_protobuf//protoadapt", "@org_golang_x_sync//errgroup", "@org_uber_go_atomic//:atomic", "@org_uber_go_zap//:zap", @@ -102,6 +104,7 @@ go_test( "//pkg/util/memory", "@com_github_ngaut_pools//:pools", "@com_github_pingcap_errors//:errors", + "@com_github_pingcap_kvproto//pkg/brpb", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", "@org_uber_go_atomic//:atomic", diff --git a/pkg/util/util.go b/pkg/util/util.go index e8f056b4da0a0..cc123e087ade8 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -32,6 +32,8 @@ import ( "github.com/pingcap/tidb/pkg/parser" "go.uber.org/atomic" "go.uber.org/zap" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/protoadapt" ) // SliceToMap converts slice to map @@ -292,6 +294,11 @@ func GetRecoverError(r any) error { return errors.Errorf("%v", r) } +// ProtoV1Clone clones a V1 proto message. +func ProtoV1Clone[T protoadapt.MessageV1](p T) T { + return protoadapt.MessageV1Of(proto.Clone(protoadapt.MessageV2Of(p))).(T) +} + // CheckIfSameCluster reads PD addresses registered in etcd from two sources, to // check if there are common addresses in both sources. If there are common // addresses, the first return value is true which means we have confidence that diff --git a/pkg/util/util_test.go b/pkg/util/util_test.go index bb7a467a5e203..3955130549165 100644 --- a/pkg/util/util_test.go +++ b/pkg/util/util_test.go @@ -22,6 +22,7 @@ import ( "time" "github.com/pingcap/errors" + pb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/tidb/pkg/sessionctx/stmtctx" "github.com/pingcap/tidb/pkg/util/memory" "github.com/stretchr/testify/assert" @@ -120,3 +121,18 @@ func TestIsInCorrectIdentifierName(t *testing.T) { require.Equalf(t, tc.correct, got, "IsInCorrectIdentifierName(%v) != %v", tc.name, tc.correct) } } + +func TestDupProto(t *testing.T) { + p := &pb.StorageBackend{ + Backend: &pb.StorageBackend_S3{ + S3: &pb.S3{ + Endpoint: "127.0.0.1", + }, + }, + } + + p2 := ProtoV1Clone(p) + require.Equal(t, p2.Backend.(*pb.StorageBackend_S3).S3.Endpoint, "127.0.0.1") + p2.Backend.(*pb.StorageBackend_S3).S3.Endpoint = "127.0.0.2" + require.Equal(t, p.Backend.(*pb.StorageBackend_S3).S3.Endpoint, "127.0.0.1") +} diff --git a/tests/realtikvtest/brietest/BUILD.bazel b/tests/realtikvtest/brietest/BUILD.bazel index d480c17fc9874..5b9612e08a466 100644 --- a/tests/realtikvtest/brietest/BUILD.bazel +++ b/tests/realtikvtest/brietest/BUILD.bazel @@ -9,31 +9,45 @@ go_test( "brie_test.go", "main_test.go", "operator_test.go", + "pitr_test.go", ], flaky = True, race = "on", deps = [ + "//br/pkg/glue", + "//br/pkg/gluetidb", + "//br/pkg/logutil", + "//br/pkg/streamhelper", + "//br/pkg/summary", "//br/pkg/task", "//br/pkg/task/operator", "//pkg/config", + "//pkg/domain", "//pkg/executor", + "//pkg/kv", "//pkg/parser/mysql", "//pkg/session", "//pkg/testkit", "//pkg/testkit/testsetup", + "//pkg/util/printer", + "//pkg/util/table-filter", "//tests/realtikvtest", "@com_github_google_uuid//:uuid", "@com_github_pingcap_failpoint//:failpoint", + "@com_github_pingcap_kvproto//pkg/brpb", + 
"@com_github_pingcap_kvproto//pkg/encryptionpb", "@com_github_pingcap_kvproto//pkg/import_sstpb", "@com_github_pingcap_kvproto//pkg/kvrpcpb", "@com_github_pingcap_log//:log", "@com_github_stretchr_testify//require", "@com_github_tikv_client_go_v2//oracle", + "@com_github_tikv_client_go_v2//tikv", "@com_github_tikv_pd_client//:client", "@com_github_tikv_pd_client//pkg/caller", "@org_golang_google_grpc//:grpc", "@org_golang_google_grpc//credentials/insecure", "@org_uber_go_goleak//:goleak", + "@org_uber_go_zap//:zap", "@org_uber_go_zap//zapcore", ], ) diff --git a/tests/realtikvtest/brietest/main_test.go b/tests/realtikvtest/brietest/main_test.go index c8117a7615e96..1b8f820204249 100644 --- a/tests/realtikvtest/brietest/main_test.go +++ b/tests/realtikvtest/brietest/main_test.go @@ -29,7 +29,8 @@ func TestMain(m *testing.M) { goleak.IgnoreTopFunction("github.com/bazelbuild/rules_go/go/tools/bzltestutil.RegisterTimeoutHandler.func1"), goleak.IgnoreTopFunction("github.com/lestrrat-go/httprc.runFetchWorker"), goleak.IgnoreTopFunction("google.golang.org/grpc.(*ccBalancerWrapper).watcher"), - goleak.IgnoreTopFunction("google.golang.org/grpc/internal/transport.(*http2Client).keepalive"), + // The top function now is `sync.runtime_notifyListWait`... + goleak.IgnoreAnyFunction("google.golang.org/grpc/internal/transport.(*http2Client).keepalive"), goleak.IgnoreTopFunction("google.golang.org/grpc/internal/transport.(*controlBuffer).get"), goleak.IgnoreTopFunction("google.golang.org/grpc/internal/grpcsync.(*CallbackSerializer).run"), goleak.IgnoreTopFunction("net/http.(*persistConn).writeLoop"), diff --git a/tests/realtikvtest/brietest/pitr_test.go b/tests/realtikvtest/brietest/pitr_test.go new file mode 100644 index 0000000000000..095e60f6badba --- /dev/null +++ b/tests/realtikvtest/brietest/pitr_test.go @@ -0,0 +1,545 @@ +package brietest + +import ( + "context" + "encoding/hex" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "regexp" + "strings" + "testing" + "time" + + "github.com/pingcap/failpoint" + backup "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/kvproto/pkg/encryptionpb" + "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/glue" + "github.com/pingcap/tidb/br/pkg/gluetidb" + "github.com/pingcap/tidb/br/pkg/logutil" + "github.com/pingcap/tidb/br/pkg/streamhelper" + "github.com/pingcap/tidb/br/pkg/summary" + "github.com/pingcap/tidb/br/pkg/task" + "github.com/pingcap/tidb/br/pkg/task/operator" + "github.com/pingcap/tidb/pkg/config" + "github.com/pingcap/tidb/pkg/domain" + "github.com/pingcap/tidb/pkg/kv" + "github.com/pingcap/tidb/pkg/testkit" + "github.com/pingcap/tidb/pkg/util/printer" + filter "github.com/pingcap/tidb/pkg/util/table-filter" + "github.com/stretchr/testify/require" + "github.com/tikv/client-go/v2/oracle" + "github.com/tikv/client-go/v2/tikv" + pd "github.com/tikv/pd/client" + "go.uber.org/zap" +) + +type TestKitGlue struct { + tk *testkit.TestKit +} + +func (tk TestKitGlue) GetDomain(_ kv.Storage) (*domain.Domain, error) { + return domain.GetDomain(tk.tk.Session()), nil +} + +func (tk TestKitGlue) CreateSession(_ kv.Storage) (glue.Session, error) { + return gluetidb.WrapSession(tk.tk.Session()), nil +} + +func (tk TestKitGlue) Open(path string, option pd.SecurityOption) (kv.Storage, error) { + return tk.tk.Session().GetStore(), nil +} + +// OwnsStorage returns whether the storage returned by Open() is owned +// If this method returns false, the connection manager will never close the storage. 
+func (tk TestKitGlue) OwnsStorage() bool {
+	return false
+}
+
+func (tk TestKitGlue) StartProgress(ctx context.Context, cmdName string, total int64, redirectLog bool) glue.Progress {
+	return &glue.CounterProgress{}
+}
+
+// Record records some information useful for log-less summary.
+func (tk TestKitGlue) Record(name string, value uint64) {}
+
+// GetVersion gets the BR package version used to run the backup/restore job.
+func (tk TestKitGlue) GetVersion() string {
+	return "In Test\n" + printer.GetTiDBInfo()
+}
+
+// UseOneShotSession creates a temporary session from the store when running a backup job,
+// because we don't have to own the domain/session during the whole backup and
+// can close the domain as soon as possible.
+// For SQL-driven backup jobs we must reuse the existing session and never close it.
+func (tk TestKitGlue) UseOneShotSession(_ kv.Storage, _ bool, fn func(se glue.Session) error) error {
+	return fn(gluetidb.WrapSession(tk.tk.Session()))
+}
+
+// GetClient returns the client type of the glue.
+func (tk TestKitGlue) GetClient() glue.GlueClient {
+	return glue.ClientSql
+}
+
+type LogBackupKit struct {
+	t       *testing.T
+	tk      *testkit.TestKit
+	metaCli *streamhelper.MetaDataClient
+	base    string
+
+	checkerF func(err error)
+}
+
+func NewLogBackupKit(t *testing.T) *LogBackupKit {
+	tk := initTestKit(t)
+	metaCli := streamhelper.NewMetaDataClient(domain.GetDomain(tk.Session()).EtcdClient())
+	begin := time.Now()
+	// So the cases can finish faster...
+	tk.MustExec("set config tikv `log-backup.max-flush-interval` = '30s';")
+	t.Cleanup(func() {
+		if !t.Failed() {
+			log.Info("[TEST.LogBackupKit] success", zap.String("case", t.Name()), zap.Stringer("takes", time.Since(begin)))
+		}
+	})
+	return &LogBackupKit{
+		tk:      tk,
+		t:       t,
+		metaCli: metaCli,
+		base:    t.TempDir(),
+		checkerF: func(err error) {
+			require.NoError(t, err)
+		},
+	}
+}
+
+func (kit *LogBackupKit) tempFile(name string, content []byte) string {
+	path := filepath.Join(kit.t.TempDir(), name)
+	require.NoError(kit.t, os.WriteFile(path, content, 0o666))
+	return path
+}
+
+func (kit *LogBackupKit) RunFullRestore(extConfig func(*task.RestoreConfig)) {
+	kit.runAndCheck(func(ctx context.Context) error {
+		cfg := task.DefaultRestoreConfig(task.DefaultConfig())
+		cfg.Storage = kit.LocalURI("full")
+		cfg.FilterStr = []string{"test.*"}
+		var err error
+		cfg.TableFilter, err = filter.Parse(cfg.FilterStr)
+		cfg.CheckRequirements = false
+		cfg.WithSysTable = false
+		require.NoError(kit.t, err)
+		cfg.UseCheckpoint = false
+
+		extConfig(&cfg)
+		return task.RunRestore(ctx, kit.Glue(), task.FullRestoreCmd, &cfg)
+	})
+}
+
+func (kit *LogBackupKit) RunStreamRestore(extConfig func(*task.RestoreConfig)) {
+	kit.runAndCheck(func(ctx context.Context) error {
+		cfg := task.DefaultRestoreConfig(task.DefaultConfig())
+		cfg.Storage = kit.LocalURI("incr")
+		cfg.FullBackupStorage = kit.LocalURI("full")
+		cfg.CheckRequirements = false
+		cfg.UseCheckpoint = false
+		cfg.WithSysTable = false
+
+		extConfig(&cfg)
+		return task.RunRestore(ctx, kit.Glue(), task.PointRestoreCmd, &cfg)
+	})
+}
+
+func (kit *LogBackupKit) SetFilter(cfg *task.Config, f ...string) {
+	var err error
+	cfg.TableFilter, err = filter.Parse(f)
+	require.NoError(kit.t, err)
+	cfg.FilterStr = f
+	cfg.ExplicitFilter = true
+}
+
+func (kit *LogBackupKit) RunFullBackup(extConfig func(*task.BackupConfig)) {
+	kit.runAndCheck(func(ctx context.Context) error {
+		cfg := task.DefaultBackupConfig(task.DefaultConfig())
+		cfg.Storage = kit.LocalURI("full")
+
+		extConfig(&cfg)
+		return task.RunBackup(ctx, kit.Glue(),
"backup full[intest]", &cfg) + }) +} + +func (kit *LogBackupKit) StopTaskIfExists(taskName string) { + kit.runAndCheck(func(ctx context.Context) error { + cfg := task.DefaultStreamConfig(task.DefineStreamCommonFlags) + cfg.TaskName = taskName + err := task.RunStreamStop(ctx, kit.Glue(), "stream stop[intest]", &cfg) + if err != nil && strings.Contains(err.Error(), "task not found") { + return nil + } + return err + }) +} + +func (kit *LogBackupKit) RunLogStart(taskName string, extConfig func(*task.StreamConfig)) { + kit.runAndCheck(func(ctx context.Context) error { + cfg := task.DefaultStreamConfig(task.DefineStreamStartFlags) + cfg.Storage = kit.LocalURI("incr") + cfg.TaskName = taskName + cfg.EndTS = math.MaxUint64 + cfg.TableFilter = filter.All() + cfg.FilterStr = []string{"*.*"} + extConfig(&cfg) + err := task.RunStreamStart(ctx, kit.Glue(), "stream start[intest]", &cfg) + return err + }) + kit.t.Cleanup(func() { kit.StopTaskIfExists(taskName) }) +} + +func (kit *LogBackupKit) ctx() context.Context { + return context.Background() +} + +func (kit *LogBackupKit) TSO() uint64 { + ts, err := kit.tk.Session().GetStore().(tikv.Storage).GetOracle().GetTimestamp(kit.ctx(), &oracle.Option{}) + require.NoError(kit.t, err) + return ts +} + +func (kit *LogBackupKit) LocalURI(rel ...string) string { + return "local://" + kit.base + "/" + filepath.Join(rel...) +} + +func (kit *LogBackupKit) CheckpointTSOf(taskName string) uint64 { + task, err := kit.metaCli.GetTask(kit.ctx(), taskName) + require.NoError(kit.t, err) + ts, err := task.GetGlobalCheckPointTS(kit.ctx()) + require.NoError(kit.t, err) + return ts +} + +func (kit *LogBackupKit) Glue() glue.Glue { + return &TestKitGlue{tk: kit.tk} +} + +func (kit *LogBackupKit) WithChecker(checker func(v error), f func()) { + oldExpected := kit.checkerF + defer func() { + kit.checkerF = oldExpected + }() + kit.checkerF = checker + + f() +} + +func (kit *LogBackupKit) runAndCheck(f func(context.Context) error) { + ctx, cancel := context.WithCancel(context.Background()) + begin := time.Now() + summary.SetSuccessStatus(false) + err := f(ctx) + cancel() + kit.checkerF(err) + log.Info("[TEST.runAndCheck] A task finished.", zap.StackSkip("caller", 1), zap.Stringer("take", time.Since(begin))) +} + +func (kit *LogBackupKit) forceFlush() { + kit.runAndCheck(func(ctx context.Context) error { + cfg := task.DefaultConfig() + cfg.PD = append(cfg.PD, config.GetGlobalConfig().Path) + err := operator.RunForceFlush(ctx, &operator.ForceFlushConfig{ + Config: cfg, + StoresPattern: regexp.MustCompile(".*"), + }) + if err != nil { + log.Warn("[TEST.forceFlush] It seems this version of TiKV doesn't support force flush, the test may be much more slower.", + logutil.ShortError(err)) + } + return nil + }) +} + +func (kit *LogBackupKit) forceFlushAndWait(taskName string) { + ts := kit.TSO() + start := time.Now() + kit.forceFlush() + require.Eventually(kit.t, func() bool { + ckpt := kit.CheckpointTSOf(taskName) + log.Info("[TEST.forceFlushAndWait] checkpoint", zap.Uint64("checkpoint", ckpt), zap.Uint64("ts", ts)) + return ckpt >= ts + }, 300*time.Second, 1*time.Second) + time.Sleep(6 * time.Second) // Wait the storage checkpoint uploaded... 
+ log.Info("[TEST.forceFlushAndWait] done", zap.Stringer("take", time.Since(start))) +} + +func (kit *LogBackupKit) simpleWorkload() simpleWorkload { + return simpleWorkload{ + tbl: kit.t.Name(), + } +} + +type simpleWorkload struct { + tbl string +} + +func (s simpleWorkload) createSimpleTableWithData(kit *LogBackupKit) { + kit.tk.MustExec(fmt.Sprintf("DROP TABLE IF EXISTs test.%s", s.tbl)) + kit.tk.MustExec(fmt.Sprintf("CREATE TABLE test.%s(t text)", s.tbl)) + kit.tk.MustExec(fmt.Sprintf("INSERT INTO test.%s VALUES ('Ear'), ('Eye'), ('Nose')", s.tbl)) +} + +func (s simpleWorkload) insertSimpleIncreaseData(kit *LogBackupKit) { + kit.tk.MustExec(fmt.Sprintf("INSERT INTO test.%s VALUES ('Body')", s.tbl)) + kit.tk.MustExec(fmt.Sprintf("INSERT INTO test.%s VALUES ('Mind')", s.tbl)) +} + +func (s simpleWorkload) verifySimpleData(kit *LogBackupKit) { + kit.tk.MustQuery(fmt.Sprintf("SELECT * FROM test.%s", s.tbl)).Check([][]any{{"Ear"}, {"Eye"}, {"Nose"}, {"Body"}, {"Mind"}}) +} + +func (s simpleWorkload) cleanSimpleData(kit *LogBackupKit) { + kit.tk.MustExec(fmt.Sprintf("DROP TABLE IF EXISTS test.%s", s.tbl)) +} + +func TestPiTRAndBackupInSQL(t *testing.T) { + kit := NewLogBackupKit(t) + s := kit.simpleWorkload() + s.createSimpleTableWithData(kit) + s.insertSimpleIncreaseData(kit) + + taskName := t.Name() + kit.RunFullBackup(func(bc *task.BackupConfig) {}) + s.cleanSimpleData(kit) + + ts := kit.TSO() + kit.RunFullBackup(func(bc *task.BackupConfig) { + bc.Storage = kit.LocalURI("full2") + bc.BackupTS = ts + }) + kit.RunLogStart(taskName, func(sc *task.StreamConfig) { + sc.StartTS = ts + }) + _ = kit.tk.MustQuery(fmt.Sprintf("RESTORE TABLE test.%s FROM '%s'", t.Name(), kit.LocalURI("full"))) + s.verifySimpleData(kit) + kit.forceFlushAndWait(taskName) + + s.cleanSimpleData(kit) + kit.StopTaskIfExists(taskName) + kit.RunStreamRestore(func(rc *task.RestoreConfig) { + rc.FullBackupStorage = kit.LocalURI("full2") + }) + s.verifySimpleData(kit) +} + +func TestPiTRAndManyBackups(t *testing.T) { + kit := NewLogBackupKit(t) + s := kit.simpleWorkload() + s.createSimpleTableWithData(kit) + s.insertSimpleIncreaseData(kit) + + taskName := t.Name() + + kit.RunFullBackup(func(bc *task.BackupConfig) { + kit.SetFilter(&bc.Config, fmt.Sprintf("test.%s", s.tbl)) + bc.Storage = kit.LocalURI("fulla") + }) + s.cleanSimpleData(kit) + + s2 := kit.simpleWorkload() + s2.tbl += "2" + s2.createSimpleTableWithData(kit) + s2.insertSimpleIncreaseData(kit) + kit.RunFullBackup(func(bc *task.BackupConfig) { + kit.SetFilter(&bc.Config, fmt.Sprintf("test.%s", s2.tbl)) + bc.Storage = kit.LocalURI("fullb") + }) + s2.cleanSimpleData(kit) + + ts := kit.TSO() + kit.RunFullBackup(func(bc *task.BackupConfig) { + bc.Storage = kit.LocalURI("pitr_base") + bc.BackupTS = ts + }) + kit.RunLogStart(taskName, func(sc *task.StreamConfig) { + sc.StartTS = ts + }) + kit.RunFullRestore(func(rc *task.RestoreConfig) { + rc.Storage = kit.LocalURI("fulla") + kit.SetFilter(&rc.Config, fmt.Sprintf("test.%s", s.tbl)) + }) + kit.RunFullRestore(func(rc *task.RestoreConfig) { + rc.Storage = kit.LocalURI("fullb") + kit.SetFilter(&rc.Config, fmt.Sprintf("test.%s", s2.tbl)) + }) + + kit.forceFlushAndWait(taskName) + s.cleanSimpleData(kit) + s2.cleanSimpleData(kit) + kit.StopTaskIfExists(taskName) + kit.RunStreamRestore(func(rc *task.RestoreConfig) { + rc.FullBackupStorage = kit.LocalURI("pitr_base") + }) + s.verifySimpleData(kit) + s2.verifySimpleData(kit) +} + +func TestPiTRAndEncryptedFullBackup(t *testing.T) { + kit := NewLogBackupKit(t) + s := 
kit.simpleWorkload() + s.createSimpleTableWithData(kit) + keyContent, err := hex.DecodeString("9d4cf8f268514d2c38836197008eded1050a5806afa632f7ab1e313bb6697da2") + require.NoError(t, err) + + kit.RunFullBackup(func(bc *task.BackupConfig) { + bc.CipherInfo = backup.CipherInfo{ + CipherType: encryptionpb.EncryptionMethod_AES256_CTR, + CipherKey: keyContent, + } + }) + + s.cleanSimpleData(kit) + kit.RunLogStart(t.Name(), func(sc *task.StreamConfig) {}) + chk := func(err error) { require.ErrorContains(t, err, "the data you want to restore is encrypted") } + kit.WithChecker(chk, func() { + kit.RunFullRestore(func(rc *task.RestoreConfig) { + rc.CipherInfo = backup.CipherInfo{ + CipherType: encryptionpb.EncryptionMethod_AES256_CTR, + CipherKey: keyContent, + } + }) + }) +} + +func TestPiTRAndEncryptedLogBackup(t *testing.T) { + kit := NewLogBackupKit(t) + s := kit.simpleWorkload() + s.createSimpleTableWithData(kit) + + keyContent, err := hex.DecodeString("0ae31c060ff933cabe842430e1716185cc9c6b5cdde8e56976afaff41b92528f") + require.NoError(t, err) + keyFile := kit.tempFile("KEY", keyContent) + + kit.RunFullBackup(func(bc *task.BackupConfig) {}) + s.cleanSimpleData(kit) + + kit.RunLogStart(t.Name(), func(sc *task.StreamConfig) { + sc.MasterKeyConfig.EncryptionType = encryptionpb.EncryptionMethod_AES256_CTR + sc.MasterKeyConfig.MasterKeys = append(sc.MasterKeyConfig.MasterKeys, &encryptionpb.MasterKey{ + Backend: &encryptionpb.MasterKey_File{ + File: &encryptionpb.MasterKeyFile{ + Path: keyFile, + }, + }, + }) + }) + + chk := func(err error) { require.ErrorContains(t, err, "the running log backup task is encrypted") } + kit.WithChecker(chk, func() { + kit.RunFullRestore(func(rc *task.RestoreConfig) {}) + }) +} + +func TestPiTRAndBothEncrypted(t *testing.T) { + kit := NewLogBackupKit(t) + s := kit.simpleWorkload() + s.createSimpleTableWithData(kit) + + keyContent, err := hex.DecodeString("319b4a104651746f1bf1ad67c9ba7d635d8c4769b03f3e5c63f1da93891ce4f9") + require.NoError(t, err) + keyFile := kit.tempFile("KEY", keyContent) + + kit.RunFullBackup(func(bc *task.BackupConfig) { + bc.CipherInfo = backup.CipherInfo{ + CipherType: encryptionpb.EncryptionMethod_AES256_CTR, + CipherKey: keyContent, + } + }) + s.cleanSimpleData(kit) + + kit.RunLogStart(t.Name(), func(sc *task.StreamConfig) { + sc.MasterKeyConfig.EncryptionType = encryptionpb.EncryptionMethod_AES256_CTR + sc.MasterKeyConfig.MasterKeys = append(sc.MasterKeyConfig.MasterKeys, &encryptionpb.MasterKey{ + Backend: &encryptionpb.MasterKey_File{ + File: &encryptionpb.MasterKeyFile{ + Path: keyFile, + }, + }, + }) + }) + + chk := func(err error) { require.ErrorContains(t, err, "encrypted") } + kit.WithChecker(chk, func() { + kit.RunFullRestore(func(rc *task.RestoreConfig) { + rc.CipherInfo = backup.CipherInfo{ + CipherType: encryptionpb.EncryptionMethod_AES256_CTR, + CipherKey: keyContent, + } + }) + }) +} + +func TestPiTRAndFailureRestore(t *testing.T) { + kit := NewLogBackupKit(t) + s := kit.simpleWorkload() + s.createSimpleTableWithData(kit) + s.insertSimpleIncreaseData(kit) + + taskName := t.Name() + kit.RunFullBackup(func(bc *task.BackupConfig) {}) + s.cleanSimpleData(kit) + + ts := kit.TSO() + kit.RunFullBackup(func(bc *task.BackupConfig) { + bc.Storage = kit.LocalURI("full2") + bc.BackupTS = ts + }) + kit.RunLogStart(taskName, func(sc *task.StreamConfig) { + sc.StartTS = ts + }) + require.NoError(t, failpoint.EnableCall("github.com/pingcap/tidb/br/pkg/task/run-snapshot-restore-about-to-finish", func(e *error) { + *e = errors.New("not my fault") 
+ })) + checker := func(e error) { require.Error(t, e) } + kit.WithChecker(checker, func() { + kit.RunFullRestore(func(rc *task.RestoreConfig) { + rc.UseCheckpoint = false + }) + }) + kit.forceFlushAndWait(taskName) + + s.cleanSimpleData(kit) + require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/br/pkg/task/run-snapshot-restore-about-to-finish")) + + kit.StopTaskIfExists(taskName) + kit.RunStreamRestore(func(rc *task.RestoreConfig) { + rc.FullBackupStorage = kit.LocalURI("full2") + }) + res := kit.tk.MustQuery(fmt.Sprintf("SELECT COUNT(*) FROM test.%s", t.Name())) + res.Check([][]any{{"0"}}) +} + +func TestPiTRAndIncrementalRestore(t *testing.T) { + kit := NewLogBackupKit(t) + s := kit.simpleWorkload() + s.createSimpleTableWithData(kit) + kit.RunFullBackup(func(bc *task.BackupConfig) { + kit.SetFilter(&bc.Config, fmt.Sprintf("test.%s", s.tbl)) + }) + s.insertSimpleIncreaseData(kit) + ts := kit.TSO() + kit.RunFullBackup(func(bc *task.BackupConfig) { + kit.SetFilter(&bc.Config, fmt.Sprintf("test.%s", s.tbl)) + bc.Storage = kit.LocalURI("incr-legacy") + bc.LastBackupTS = ts + }) + s.cleanSimpleData(kit) + + kit.RunLogStart("dummy", func(sc *task.StreamConfig) {}) + kit.RunFullRestore(func(rc *task.RestoreConfig) {}) + chk := func(err error) { require.ErrorContains(t, err, "BR:Stream:ErrStreamLogTaskExist") } + kit.WithChecker(chk, func() { + kit.RunFullRestore(func(rc *task.RestoreConfig) { + rc.Storage = kit.LocalURI("incr-legacy") + }) + }) +}