Skip to content

Commit

Permalink
storage/backend: fatal if there is panic during defrag
Browse files Browse the repository at this point in the history
We should exit as soon as possible if there is panic during defrag.
Because that tx might be closed. The inflight request might use invalid
tx and then panic as well. However, the real panic might be shadowed by
new panic. It's related to goroutine schedule. So, we should fatal here.

How to reproduce bbolt#715:

```diff
diff --git a/server/etcdserver/api/v3rpc/maintenance.go b/server/etcdserver/api/v3rpc/maintenance.go
index 4f55c7c74..0a91b4225 100644
--- a/server/etcdserver/api/v3rpc/maintenance.go
+++ b/server/etcdserver/api/v3rpc/maintenance.go
@@ -89,7 +89,13 @@ func NewMaintenanceServer(s *etcdserver.EtcdServer, healthNotifier notifier) pb.
 func (ms *maintenanceServer) Defragment(ctx context.Context, sr *pb.DefragmentRequest) (*pb.DefragmentResponse, error) {
        ms.lg.Info("starting defragment")
        ms.healthNotifier.defragStarted()
-       defer ms.healthNotifier.defragFinished()
+       defer func() {
+               ms.healthNotifier.defragFinished()
+               if rerr := recover(); rerr != nil {
+                       time.Sleep(30 * time.Second)
+                       panic(rerr)
+               }
+       }()
        err := ms.bg.Backend().Defrag()
        if err != nil {
                ms.lg.Warn("failed to defragment", zap.Error(err))
```

```bash
$ make gofail-enable
$ make
$ GOFAIL_HTTP="127.0.0.1:1234" bin/etcd

$ New Terminal
$ curl http://127.0.0.1:1234/defragBeforeRename -XPUT -d'panic()'
$ bin/etcdctl defrag

$ Old Terminal
{"level":"info","ts":"2024-04-09T23:28:54.084434+0800","caller":"v3rpc/maintenance.go:90","msg":"starting defragment"}
{"level":"info","ts":"2024-04-09T23:28:54.087363+0800","caller":"backend/backend.go:509","msg":"defragmenting","path":"default.etcd/member/snap/db","current-db-size-bytes":20480,"current-db-size":"20 kB","current-db-size-in-use-bytes":16384,"current-db-size-in-use":"16 kB"}
{"level":"info","ts":"2024-04-09T23:28:54.091229+0800","caller":"v3rpc/health.go:62","msg":"grpc service status changed","service":"","status":"SERVING"}
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0xbb2553]

goroutine 156 [running]:
go.etcd.io/bbolt.(*Tx).Bucket(...)
        go.etcd.io/[email protected]/tx.go:112
go.etcd.io/etcd/server/v3/storage/backend.(*baseReadTx).UnsafeRange(0xc00061ac80, {0x12b7780, 0x1a73420}, {0x1a113e8, 0xe, 0xe}, {0x0, 0x0, 0x0}, 0x1)
        go.etcd.io/etcd/server/v3/storage/backend/read_tx.go:103 +0x233
go.etcd.io/etcd/server/v3/storage/schema.UnsafeReadStorageVersion({0x7f6280613618?, 0xc00061ac80?})
        go.etcd.io/etcd/server/v3/storage/schema/version.go:35 +0x5d
go.etcd.io/etcd/server/v3/storage/schema.UnsafeDetectSchemaVersion(0xc000334b80, {0x7f6280613618, 0xc00061ac80})
        go.etcd.io/etcd/server/v3/storage/schema/schema.go:94 +0x47
go.etcd.io/etcd/server/v3/storage/schema.DetectSchemaVersion(0xc000334b80, {0x12b77f0, 0xc00061ac80})
        go.etcd.io/etcd/server/v3/storage/schema/schema.go:89 +0xf2
go.etcd.io/etcd/server/v3/etcdserver.(*EtcdServer).StorageVersion(0xc000393c08)
        go.etcd.io/etcd/server/v3/etcdserver/server.go:2216 +0x105
go.etcd.io/etcd/server/v3/etcdserver.(*serverVersionAdapter).GetStorageVersion(0x0?)
        go.etcd.io/etcd/server/v3/etcdserver/adapters.go:77 +0x16
go.etcd.io/etcd/server/v3/etcdserver/version.(*Monitor).UpdateStorageVersionIfNeeded(0xc00002df70)
        go.etcd.io/etcd/server/v3/etcdserver/version/monitor.go:112 +0x5d
go.etcd.io/etcd/server/v3/etcdserver.(*EtcdServer).monitorStorageVersion(0xc000393c08)
        go.etcd.io/etcd/server/v3/etcdserver/server.go:2259 +0xa8
go.etcd.io/etcd/server/v3/etcdserver.(*EtcdServer).GoAttach.func1()
        go.etcd.io/etcd/server/v3/etcdserver/server.go:2440 +0x59
created by go.etcd.io/etcd/server/v3/etcdserver.(*EtcdServer).GoAttach in goroutine 1
        go.etcd.io/etcd/server/v3/etcdserver/server.go:2438 +0xf9
```

REF: etcd-io/bbolt#715

Signed-off-by: Wei Fu <[email protected]>
  • Loading branch information
fuweid committed Nov 6, 2024
1 parent 75274b2 commit c438fcb
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions server/storage/backend/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,16 @@ func (b *backend) defrag() error {
)
}

defer func() {
// NOTE: We should exit as soon as possible because that tx
// might be closed. The inflight request might use invalid
// tx and then panic as well. The real panic reason might be
// shadowed by new panic. So, we should fatal here with lock.
if rerr := recover(); rerr != nil {
b.lg.Fatal("unexpected panic during defrag", zap.Any("panic", rerr))
}
}()

// Commit/stop and then reset current transactions (including the readTx)
b.batchTx.unsafeCommit(true)
b.batchTx.tx = nil
Expand Down

0 comments on commit c438fcb

Please sign in to comment.