Skip to content

Commit

Permalink
fix(defrag): handle defragdb failure
Browse files (browse the repository at this point in the history)
Signed-off-by: Thomas Gosteli <[email protected]>
  • Loading branch information
ghouscht committed Nov 6, 2024
1 parent 35cab80 commit 04c042c
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 22 deletions.
12 changes: 10 additions & 2 deletions server/storage/backend/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -490,8 +490,8 @@ func (b *backend) defrag() error {
options = *boltOpenOptions
}
options.OpenFile = func(_ string, _ int, _ os.FileMode) (file *os.File, err error) {
// gofail: var defragNoSpace string
// return nil, fmt.Errorf(defragNoSpace)
// gofail: var defragOpenFileError string
// return nil, fmt.Errorf(defragOpenFileError)
return temp, nil
}
// Don't load tmp db into memory regardless of opening options
Expand Down Expand Up @@ -526,6 +526,11 @@ func (b *backend) defrag() error {
if rmErr := os.RemoveAll(tmpdb.Path()); rmErr != nil {
b.lg.Error("failed to remove db.tmp after defragmentation completed", zap.Error(rmErr))
}

// restore the bbolt transactions if defragmentation fails
b.batchTx.tx = b.unsafeBegin(true)
b.readTx.tx = b.unsafeBegin(false)

return err
}

Expand Down Expand Up @@ -578,6 +583,9 @@ func (b *backend) defrag() error {
}

func defragdb(odb, tmpdb *bolt.DB, limit int) error {
// gofail: var defragdbFail string
// return fmt.Errorf(defragdbFail)

// open a tx on tmpdb for writes
tmptx, err := tmpdb.Begin(true)
if err != nil {
Expand Down
62 changes: 42 additions & 20 deletions tests/e2e/defrag_no_space_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package e2e

import (
"context"
"fmt"
"testing"
"time"

Expand All @@ -26,24 +27,45 @@ import (
)

// TestDefragNoSpace verifies that a single-member cluster keeps serving
// requests after a defragmentation attempt fails.
//
// Each table entry names one failpoint and the error text it is configured to
// return. The subtest starts a fresh cluster with gofail enabled, arms the
// failpoint, expects Defragment to fail with that text, and then performs a
// Put/Get round trip to confirm the member is still usable.
func TestDefragNoSpace(t *testing.T) {
	tests := []struct {
		name      string
		failpoint string
		err       string
	}{
		{
			name:      "no space (#18810) - can't open/create new bbolt db",
			failpoint: "defragOpenFileError",
			err:       "no space",
		},
		{
			name:      "defragdb failure",
			failpoint: "defragdbFail",
			err:       "some random error",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			e2e.BeforeTest(t)

			// Every case gets its own cluster so an armed failpoint cannot
			// leak into another case.
			clus, err := e2e.NewEtcdProcessCluster(context.TODO(), t,
				e2e.WithClusterSize(1),
				e2e.WithGoFailEnabled(true),
			)
			require.NoError(t, err)
			t.Cleanup(func() { clus.Stop() })

			member := clus.Procs[0]

			// Arm the failpoint so that it returns tc.err, then expect the
			// defragmentation request to report that same text.
			require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), tc.failpoint, fmt.Sprintf(`return("%s")`, tc.err)))
			require.ErrorContains(t, member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}), tc.err)

			// Make sure etcd continues to run even after the failed defrag attempt
			require.NoError(t, member.Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{}))
			value, err := member.Etcdctl().Get(context.Background(), "foo", config.GetOptions{})
			require.NoError(t, err)
			require.Len(t, value.Kvs, 1)
			require.Equal(t, "bar", string(value.Kvs[0].Value))
		})
	}
}

0 comments on commit 04c042c

Please sign in to comment.