From c438fcbafde8021da0a4ad5234bcbea532e953b6 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Tue, 9 Apr 2024 23:34:30 +0800 Subject: [PATCH] storage/backend: fatal if there is panic during defrag We should exit as soon as possible if there is panic during defrag. Because that tx might be closed. The inflight request might use invalid tx and then panic as well. However, the real panic might be shadowed by new panic. It's related to goroutine schedule. So, we should fatal here. How to reproduce bbolt#715: ```diff diff --git a/server/etcdserver/api/v3rpc/maintenance.go b/server/etcdserver/api/v3rpc/maintenance.go index 4f55c7c74..0a91b4225 100644 --- a/server/etcdserver/api/v3rpc/maintenance.go +++ b/server/etcdserver/api/v3rpc/maintenance.go @@ -89,7 +89,13 @@ func NewMaintenanceServer(s *etcdserver.EtcdServer, healthNotifier notifier) pb. func (ms *maintenanceServer) Defragment(ctx context.Context, sr *pb.DefragmentRequest) (*pb.DefragmentResponse, error) { ms.lg.Info("starting defragment") ms.healthNotifier.defragStarted() - defer ms.healthNotifier.defragFinished() + defer func() { + ms.healthNotifier.defragFinished() + if rerr := recover(); rerr != nil { + time.Sleep(30 * time.Second) + panic(rerr) + } + }() err := ms.bg.Backend().Defrag() if err != nil { ms.lg.Warn("failed to defragment", zap.Error(err)) ``` ```bash $ make gofail-enable $ make $ GOFAIL_HTTP="127.0.0.1:1234" bin/etcd $ New Terminal $ curl http://127.0.0.1:1234/defragBeforeRename -XPUT -d'panic()' $ bin/etcdctl defrag $ Old Terminal {"level":"info","ts":"2024-04-09T23:28:54.084434+0800","caller":"v3rpc/maintenance.go:90","msg":"starting defragment"} {"level":"info","ts":"2024-04-09T23:28:54.087363+0800","caller":"backend/backend.go:509","msg":"defragmenting","path":"default.etcd/member/snap/db","current-db-size-bytes":20480,"current-db-size":"20 kB","current-db-size-in-use-bytes":16384,"current-db-size-in-use":"16 kB"} {"level":"info","ts":"2024-04-09T23:28:54.091229+0800","caller":"v3rpc/health.go:62","msg":"grpc service status changed","service":"","status":"SERVING"} panic: runtime error: invalid memory address or nil pointer dereference [signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0xbb2553] goroutine 156 [running]: go.etcd.io/bbolt.(*Tx).Bucket(...) go.etcd.io/bbolt@v1.4.0-alpha.0/tx.go:112 go.etcd.io/etcd/server/v3/storage/backend.(*baseReadTx).UnsafeRange(0xc00061ac80, {0x12b7780, 0x1a73420}, {0x1a113e8, 0xe, 0xe}, {0x0, 0x0, 0x0}, 0x1) go.etcd.io/etcd/server/v3/storage/backend/read_tx.go:103 +0x233 go.etcd.io/etcd/server/v3/storage/schema.UnsafeReadStorageVersion({0x7f6280613618?, 0xc00061ac80?}) go.etcd.io/etcd/server/v3/storage/schema/version.go:35 +0x5d go.etcd.io/etcd/server/v3/storage/schema.UnsafeDetectSchemaVersion(0xc000334b80, {0x7f6280613618, 0xc00061ac80}) go.etcd.io/etcd/server/v3/storage/schema/schema.go:94 +0x47 go.etcd.io/etcd/server/v3/storage/schema.DetectSchemaVersion(0xc000334b80, {0x12b77f0, 0xc00061ac80}) go.etcd.io/etcd/server/v3/storage/schema/schema.go:89 +0xf2 go.etcd.io/etcd/server/v3/etcdserver.(*EtcdServer).StorageVersion(0xc000393c08) go.etcd.io/etcd/server/v3/etcdserver/server.go:2216 +0x105 go.etcd.io/etcd/server/v3/etcdserver.(*serverVersionAdapter).GetStorageVersion(0x0?) go.etcd.io/etcd/server/v3/etcdserver/adapters.go:77 +0x16 go.etcd.io/etcd/server/v3/etcdserver/version.(*Monitor).UpdateStorageVersionIfNeeded(0xc00002df70) go.etcd.io/etcd/server/v3/etcdserver/version/monitor.go:112 +0x5d go.etcd.io/etcd/server/v3/etcdserver.(*EtcdServer).monitorStorageVersion(0xc000393c08) go.etcd.io/etcd/server/v3/etcdserver/server.go:2259 +0xa8 go.etcd.io/etcd/server/v3/etcdserver.(*EtcdServer).GoAttach.func1() go.etcd.io/etcd/server/v3/etcdserver/server.go:2440 +0x59 created by go.etcd.io/etcd/server/v3/etcdserver.(*EtcdServer).GoAttach in goroutine 1 go.etcd.io/etcd/server/v3/etcdserver/server.go:2438 +0xf9 ``` REF: https://github.com/etcd-io/bbolt/issues/715 Signed-off-by: Wei Fu --- server/storage/backend/backend.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/storage/backend/backend.go b/server/storage/backend/backend.go index c29993ab43b..773bcb3a4b8 100644 --- a/server/storage/backend/backend.go +++ b/server/storage/backend/backend.go @@ -515,6 +515,16 @@ func (b *backend) defrag() error { ) } + defer func() { + // NOTE: We should exit as soon as possible because that tx + // might be closed. The inflight request might use invalid + // tx and then panic as well. The real panic reason might be + // shadowed by new panic. So, we should fatal here with lock. + if rerr := recover(); rerr != nil { + b.lg.Fatal("unexpected panic during defrag", zap.Any("panic", rerr)) + } + }() + // Commit/stop and then reset current transactions (including the readTx) b.batchTx.unsafeCommit(true) b.batchTx.tx = nil