Commit 1c5289d

Merge pull request etcd-io#16677 from serathius/revert-13525
Revert "etcd server shouldn't wait for the ready notification infinitely on startup"
serathius authored Oct 4, 2023
2 parents 19a6baf + e31de5e commit 1c5289d
Showing 11 changed files with 23 additions and 90 deletions.
CHANGELOG/CHANGELOG-3.6.md (1 change: 0 additions & 1 deletion)

@@ -65,7 +65,6 @@ See [code changes](https://github.com/etcd-io/etcd/compare/v3.5.0...v3.6.0).
 - Add [`etcd --experimental-max-learners`](https://github.com/etcd-io/etcd/pull/13377) flag to allow configuration of learner max membership.
 - Add [`etcd --experimental-enable-lease-checkpoint-persist`](https://github.com/etcd-io/etcd/pull/13508) flag to handle upgrade from v3.5.2 clusters with this feature enabled.
 - Add [`etcdctl make-mirror --rev`](https://github.com/etcd-io/etcd/pull/13519) flag to support incremental mirror.
-- Add [`etcd --experimental-wait-cluster-ready-timeout`](https://github.com/etcd-io/etcd/pull/13525) flag to wait for cluster to be ready before serving client requests.
 - Add [v3 discovery](https://github.com/etcd-io/etcd/pull/13635) to bootstrap a new etcd cluster.
 - Add [field `storage`](https://github.com/etcd-io/etcd/pull/13772) into the response body of endpoint `/version`.
 - Add [`etcd --max-concurrent-streams`](https://github.com/etcd-io/etcd/pull/14169) flag to configure the max concurrent streams each client can open at a time, and defaults to math.MaxUint32.

server/config/config.go (4 changes: 0 additions & 4 deletions)

@@ -83,10 +83,6 @@ type ServerConfig struct {
     TickMs        uint
     ElectionTicks int
 
-    // WaitClusterReadyTimeout is the maximum time to wait for the
-    // cluster to be ready on startup before serving client requests.
-    WaitClusterReadyTimeout time.Duration
-
     // InitialElectionTickAdvance is true, then local member fast-forwards
     // election ticks to speed up "initial" leader election trigger. This
     // benefits the case of larger election ticks. For instance, cross

server/embed/config.go (14 changes: 5 additions & 9 deletions)

@@ -68,7 +68,6 @@ const (
     DefaultGRPCKeepAliveInterval   = 2 * time.Hour
     DefaultGRPCKeepAliveTimeout    = 20 * time.Second
     DefaultDowngradeCheckTime      = 5 * time.Second
-    DefaultWaitClusterReadyTimeout = 5 * time.Second
     DefaultAutoCompactionMode      = "periodic"
 
     DefaultDiscoveryDialTimeout = 2 * time.Second

@@ -242,10 +241,9 @@ type Config struct {
     Durl         string                      `json:"discovery"`
     DiscoveryCfg v3discovery.DiscoveryConfig `json:"discovery-config"`
 
-    InitialCluster                      string        `json:"initial-cluster"`
-    InitialClusterToken                 string        `json:"initial-cluster-token"`
-    StrictReconfigCheck                 bool          `json:"strict-reconfig-check"`
-    ExperimentalWaitClusterReadyTimeout time.Duration `json:"wait-cluster-ready-timeout"`
+    InitialCluster      string `json:"initial-cluster"`
+    InitialClusterToken string `json:"initial-cluster-token"`
+    StrictReconfigCheck bool   `json:"strict-reconfig-check"`
 
     // AutoCompactionMode is either 'periodic' or 'revision'.
     AutoCompactionMode string `json:"auto-compaction-mode"`

@@ -502,9 +500,8 @@ func NewConfig() *Config {
         AdvertisePeerUrls:   []url.URL{*apurl},
         AdvertiseClientUrls: []url.URL{*acurl},
 
-        ClusterState:                        ClusterStateFlagNew,
-        InitialClusterToken:                 "etcd-cluster",
-        ExperimentalWaitClusterReadyTimeout: DefaultWaitClusterReadyTimeout,
+        ClusterState:        ClusterStateFlagNew,
+        InitialClusterToken: "etcd-cluster",
 
         StrictReconfigCheck: DefaultStrictReconfigCheck,
         Metrics:             "basic",

@@ -728,7 +725,6 @@ func (cfg *Config) AddFlags(fs *flag.FlagSet) {
     fs.BoolVar(&cfg.ExperimentalTxnModeWriteWithSharedBuffer, "experimental-txn-mode-write-with-shared-buffer", true, "Enable the write transaction to use a shared buffer in its readonly check operations.")
     fs.UintVar(&cfg.ExperimentalBootstrapDefragThresholdMegabytes, "experimental-bootstrap-defrag-threshold-megabytes", 0, "Enable the defrag during etcd server bootstrap on condition that it will free at least the provided threshold of disk space. Needs to be set to non-zero value to take effect.")
     fs.IntVar(&cfg.ExperimentalMaxLearners, "experimental-max-learners", membership.DefaultMaxLearners, "Sets the maximum number of learners that can be available in the cluster membership.")
-    fs.DurationVar(&cfg.ExperimentalWaitClusterReadyTimeout, "experimental-wait-cluster-ready-timeout", cfg.ExperimentalWaitClusterReadyTimeout, "Maximum duration to wait for the cluster to be ready.")
     fs.Uint64Var(&cfg.SnapshotCatchUpEntries, "experimental-snapshot-catchup-entries", cfg.SnapshotCatchUpEntries, "Number of entries for a slow follower to catch up after compacting the raft storage entries.")
 
     // unsafe

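The deleted fs.DurationVar call above is ordinary flag registration from the standard library. As a rough illustration only (the flag itself no longer exists after this revert), here is a self-contained sketch of how such a duration flag is bound and parsed:

package main

import (
    "flag"
    "fmt"
    "time"
)

func main() {
    fs := flag.NewFlagSet("etcd", flag.ExitOnError)

    // Bind a duration flag to a variable, with a default value and help text,
    // the same way embed.Config.AddFlags registers its options.
    var waitReady time.Duration
    fs.DurationVar(&waitReady, "experimental-wait-cluster-ready-timeout",
        5*time.Second, "Maximum duration to wait for the cluster to be ready.")

    // Parsing overrides the default; without the argument, waitReady stays 5s.
    if err := fs.Parse([]string{"--experimental-wait-cluster-ready-timeout=10s"}); err != nil {
        panic(err)
    }
    fmt.Println(waitReady) // prints 10s
}
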
server/embed/etcd.go (2 changes: 0 additions & 2 deletions)

@@ -185,7 +185,6 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
         PeerTLSInfo: cfg.PeerTLSInfo,
         TickMs: cfg.TickMs,
         ElectionTicks: cfg.ElectionTicks(),
-        WaitClusterReadyTimeout: cfg.ExperimentalWaitClusterReadyTimeout,
         InitialElectionTickAdvance: cfg.InitialElectionTickAdvance,
         AutoCompactionRetention: autoCompactionRetention,
         AutoCompactionMode: cfg.AutoCompactionMode,

@@ -328,7 +327,6 @@ func print(lg *zap.Logger, ec Config, sc config.ServerConfig, memberInitialized
         zap.Bool("force-new-cluster", sc.ForceNewCluster),
         zap.String("heartbeat-interval", fmt.Sprintf("%v", time.Duration(sc.TickMs)*time.Millisecond)),
         zap.String("election-timeout", fmt.Sprintf("%v", time.Duration(sc.ElectionTicks*int(sc.TickMs))*time.Millisecond)),
-        zap.String("wait-cluster-ready-timeout", sc.WaitClusterReadyTimeout.String()),
         zap.Bool("initial-election-tick-advance", sc.InitialElectionTickAdvance),
         zap.Uint64("snapshot-count", sc.SnapshotCount),
         zap.Uint("max-wals", sc.MaxWALFiles),

server/embed/serve.go (11 changes: 1 addition & 10 deletions)

@@ -22,7 +22,6 @@ import (
     "net"
     "net/http"
     "strings"
-    "time"
 
     gw "github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
     "github.com/soheilhy/cmux"

@@ -100,15 +99,7 @@ func (sctx *serveCtx) serve(
     splitHttp bool,
     gopts ...grpc.ServerOption) (err error) {
     logger := defaultLog.New(io.Discard, "etcdhttp", 0)
-
-    // When the quorum isn't satisfied, then etcd server will be blocked
-    // on <-s.ReadyNotify(). Set a timeout here so that the etcd server
-    // can continue to serve serializable read request.
-    select {
-    case <-time.After(s.Cfg.WaitClusterReadyTimeout):
-        sctx.lg.Warn("timed out waiting for the ready notification")
-    case <-s.ReadyNotify():
-    }
+    <-s.ReadyNotify()
 
     sctx.lg.Info("ready to serve client requests")
 

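This hunk is the heart of the revert: serve previously waited at most WaitClusterReadyTimeout for the ready notification, and now blocks until it arrives. A minimal sketch of the two behaviors, assuming only a ready channel like the one returned by ReadyNotify():

package main

import (
    "fmt"
    "time"
)

// waitReadyBounded mirrors the removed code path: stop waiting after timeout
// so the server could still serve serializable reads without quorum.
func waitReadyBounded(ready <-chan struct{}, timeout time.Duration) bool {
    select {
    case <-ready:
        return true
    case <-time.After(timeout):
        return false // timed out waiting for the ready notification
    }
}

// waitReadyForever mirrors the restored code path: block indefinitely until
// the cluster reports readiness.
func waitReadyForever(ready <-chan struct{}) {
    <-ready
}

func main() {
    ready := make(chan struct{})
    go func() {
        time.Sleep(100 * time.Millisecond) // pretend quorum forms eventually
        close(ready)
    }()

    fmt.Println("bounded wait succeeded:", waitReadyBounded(ready, time.Second))
    waitReadyForever(ready) // returns immediately once ready is closed
    fmt.Println("unbounded wait returned")
}

With the timeout arm gone, serve() simply does not proceed until the ready notification arrives, which is the pre-13525 behavior this revert restores.
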
server/etcdmain/etcd.go (3 changes: 0 additions & 3 deletions)

@@ -19,7 +19,6 @@ import (
     "os"
     "runtime"
     "strings"
-    "time"
 
     "go.uber.org/zap"
     "google.golang.org/grpc"

@@ -211,8 +210,6 @@ func startEtcd(cfg *embed.Config) (<-chan struct{}, <-chan error, error) {
     select {
     case <-e.Server.ReadyNotify(): // wait for e.Server to join the cluster
     case <-e.Server.StopNotify(): // publish aborted from 'ErrStopped'
-    case <-time.After(cfg.ExperimentalWaitClusterReadyTimeout):
-        e.GetLogger().Warn("startEtcd: timed out waiting for the ready notification")
     }
     return e.Server.StopNotify(), e.Err(), nil
 }

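After the revert, startEtcd selects only over the ready and stop notifications, with no timeout arm. A standalone sketch of that control flow; the channels stand in for e.Server.ReadyNotify() and e.Server.StopNotify():

package main

import (
    "fmt"
    "time"
)

func main() {
    ready := make(chan struct{}) // stand-in for e.Server.ReadyNotify()
    stop := make(chan struct{})  // stand-in for e.Server.StopNotify()

    go func() {
        time.Sleep(50 * time.Millisecond)
        close(ready) // the member joined the cluster
    }()

    // Block until one of the two notifications fires; there is no timeout case.
    select {
    case <-ready:
        fmt.Println("server joined the cluster and is ready")
    case <-stop:
        fmt.Println("publish aborted; server stopped before becoming ready")
    }
}
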
server/etcdmain/help.go (2 changes: 0 additions & 2 deletions)

@@ -293,8 +293,6 @@ Experimental feature:
     Set time duration after which a warning is generated if a unary request takes more than this duration. It's deprecated, and will be decommissioned in v3.7. Use --warning-unary-request-duration instead.
   --experimental-max-learners '1'
     Set the max number of learner members allowed in the cluster membership.
-  --experimental-wait-cluster-ready-timeout '5s'
-    Set the maximum time duration to wait for the cluster to be ready.
   --experimental-snapshot-catch-up-entries '5000'
     Number of entries for a slow follower to catch up after compacting the raft storage entries.
   --experimental-compaction-sleep-interval

tests/e2e/corrupt_test.go (23 changes: 17 additions & 6 deletions)

@@ -12,11 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+//go:build !cluster_proxy
+
 package e2e
 
 import (
     "context"
     "fmt"
+    "sync"
     "testing"
     "time"
 

@@ -155,22 +158,30 @@ func TestInPlaceRecovery(t *testing.T) {
     assert.NoError(t, err)
 
     // Rolling recovery of the servers.
+    wg := sync.WaitGroup{}
     t.Log("rolling updating servers in place...")
-    for i, newProc := range epcNew.Procs {
+    for i := range epcNew.Procs {
         oldProc := epcOld.Procs[i]
         err = oldProc.Close()
         if err != nil {
             t.Fatalf("could not stop etcd process (%v)", err)
         }
         t.Logf("old cluster server %d: %s stopped.", i, oldProc.Config().Name)
-        err = newProc.Start(ctx)
-        if err != nil {
-            t.Fatalf("could not start etcd process (%v)", err)
-        }
-        t.Logf("new cluster server %d: %s started in-place with blank db.", i, newProc.Config().Name)
+        wg.Add(1)
+        // Start servers in background to avoid blocking on server start.
+        // EtcdProcess.Start waits until etcd becomes healthy, which will not happen here until we restart at least 2 members.
+        go func(proc e2e.EtcdProcess) {
+            defer wg.Done()
+            err = proc.Start(ctx)
+            if err != nil {
+                t.Errorf("could not start etcd process (%v)", err)
+            }
+            t.Logf("new cluster server: %s started in-place with blank db.", proc.Config().Name)
+        }(epcNew.Procs[i])
         t.Log("sleeping 5 sec to let nodes do periodical check...")
         time.Sleep(5 * time.Second)
     }
+    wg.Wait()
     t.Log("new cluster started.")
 
     alarmResponse, err := newCc.AlarmList(ctx)

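The reworked test leans on a standard WaitGroup-plus-goroutine pattern so that a Start call which blocks until the member is healthy cannot stall the rolling restart. A stripped-down sketch of the same pattern; the member names and the fake start function below are illustrative, not part of the e2e framework:

package main

import (
    "fmt"
    "sync"
    "time"
)

// start stands in for EtcdProcess.Start, which blocks until the member reports
// healthy; with blank databases that only happens once a quorum is back up.
func start(name string) error {
    time.Sleep(200 * time.Millisecond)
    return nil
}

func main() {
    members := []string{"m0", "m1", "m2"}
    var wg sync.WaitGroup

    for _, name := range members {
        wg.Add(1)
        // Launch each restart in the background so one blocking call cannot
        // hold up stopping and restarting the remaining members.
        go func(n string) {
            defer wg.Done()
            if err := start(n); err != nil {
                fmt.Printf("could not start %s: %v\n", n, err)
                return
            }
            fmt.Printf("%s restarted with a blank db\n", n)
        }(name)
    }

    wg.Wait()
    fmt.Println("all members restarted")
}
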
tests/e2e/ctl_v3_member_test.go (1 change: 0 additions & 1 deletion)

@@ -69,7 +69,6 @@ func TestCtlV3ConsistentMemberList(t *testing.T) {
 
     epc, err := e2e.NewEtcdProcessCluster(ctx, t,
         e2e.WithClusterSize(1),
-        e2e.WithWaitClusterReadyTimeout(1*time.Nanosecond),
         e2e.WithEnvVars(map[string]string{"GOFAIL_FAILPOINTS": `beforeApplyOneConfChange=sleep("2s")`}),
     )
     require.NoError(t, err, "failed to start etcd cluster: %v", err)

tests/e2e/no_quorum_ready_test.go (45 changes: 0 additions & 45 deletions)

This file was deleted.

tests/framework/e2e/cluster.go (7 changes: 0 additions & 7 deletions)

@@ -354,10 +354,6 @@ func WithWatchProcessNotifyInterval(interval time.Duration) EPClusterOption {
     return func(c *EtcdProcessClusterConfig) { c.ServerConfig.ExperimentalWatchProgressNotifyInterval = interval }
 }
 
-func WithWaitClusterReadyTimeout(readyTimeout time.Duration) EPClusterOption {
-    return func(c *EtcdProcessClusterConfig) { c.ServerConfig.ExperimentalWaitClusterReadyTimeout = readyTimeout }
-}
-
 func WithEnvVars(ev map[string]string) EPClusterOption {
     return func(c *EtcdProcessClusterConfig) { c.EnvVars = ev }
 }

@@ -601,9 +597,6 @@ func (cfg *EtcdProcessClusterConfig) EtcdServerProcessConfig(tb testing.TB, i in
     if cfg.ServerConfig.ExperimentalWatchProgressNotifyInterval != 0 {
         args = append(args, "--experimental-watch-progress-notify-interval", cfg.ServerConfig.ExperimentalWatchProgressNotifyInterval.String())
     }
-    if cfg.ServerConfig.ExperimentalWaitClusterReadyTimeout != 0 {
-        args = append(args, "--experimental-wait-cluster-ready-timeout", cfg.ServerConfig.ExperimentalWaitClusterReadyTimeout.String())
-    }
     if cfg.ServerConfig.SnapshotCatchUpEntries != etcdserver.DefaultSnapshotCatchUpEntries {
         if cfg.Version == CurrentVersion || (cfg.Version == MinorityLastVersion && i <= cfg.ClusterSize/2) || (cfg.Version == QuorumLastVersion && i > cfg.ClusterSize/2) {
             args = append(args, "--experimental-snapshot-catchup-entries", fmt.Sprintf("%d", cfg.ServerConfig.SnapshotCatchUpEntries))

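WithWaitClusterReadyTimeout was one of the framework's functional options (EPClusterOption), the same shape as the WithEnvVars helper kept above. A generic sketch of that option pattern; the Config and NewCluster names here are illustrative stand-ins, not the real e2e API:

package main

import "fmt"

// Config is a stand-in for EtcdProcessClusterConfig.
type Config struct {
    ClusterSize int
    EnvVars     map[string]string
}

// Option mirrors EPClusterOption: a function that mutates the config.
type Option func(*Config)

func WithClusterSize(n int) Option {
    return func(c *Config) { c.ClusterSize = n }
}

func WithEnvVars(ev map[string]string) Option {
    return func(c *Config) { c.EnvVars = ev }
}

// NewCluster applies the options over defaults, the way
// NewEtcdProcessCluster applies its EPClusterOptions.
func NewCluster(opts ...Option) *Config {
    cfg := &Config{ClusterSize: 3}
    for _, opt := range opts {
        opt(cfg)
    }
    return cfg
}

func main() {
    cfg := NewCluster(
        WithClusterSize(1),
        WithEnvVars(map[string]string{"GOFAIL_FAILPOINTS": `beforeApplyOneConfChange=sleep("2s")`}),
    )
    fmt.Printf("%+v\n", *cfg)
}
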
