Skip to content

Commit

Permalink
Added watchdog support for a Multi-Kill threshold. (envoyproxy#12108)
Browse files Browse the repository at this point in the history
The watchdog will now kill the process if max(2, ceil(registered_threads * Fraction(multikill_threshold))) threads have been nonresponsive for longer than multikill_timeout.

Signed-off-by: Kevin Baichoo <[email protected]>
  • Loading branch information
KBaichoo authored Jul 21, 2020
1 parent ffd8a6e commit 7f78581
Show file tree
Hide file tree
Showing 17 changed files with 154 additions and 30 deletions.
1 change: 1 addition & 0 deletions api/envoy/config/bootstrap/v3/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ api_proto_package(
"//envoy/config/overload/v3:pkg",
"//envoy/config/trace/v3:pkg",
"//envoy/extensions/transport_sockets/tls/v3:pkg",
"//envoy/type/v3:pkg",
"@com_github_cncf_udpa//udpa/annotations:pkg",
"@com_github_cncf_udpa//udpa/core/v1:pkg",
],
Expand Down
14 changes: 11 additions & 3 deletions api/envoy/config/bootstrap/v3/bootstrap.proto
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import "envoy/config/metrics/v3/stats.proto";
import "envoy/config/overload/v3/overload.proto";
import "envoy/config/trace/v3/http_tracer.proto";
import "envoy/extensions/transport_sockets/tls/v3/secret.proto";
import "envoy/type/v3/percent.proto";

import "google/protobuf/duration.proto";
import "google/protobuf/struct.proto";
Expand Down Expand Up @@ -297,6 +298,7 @@ message ClusterManager {
// Envoy process watchdog configuration. When configured, this monitors for
// nonresponsive threads and kills the process after the configured thresholds.
// See the :ref:`watchdog documentation <operations_performance_watchdog>` for more information.
// [#next-free-field: 6]
message Watchdog {
option (udpa.annotations.versioning).previous_message_type = "envoy.config.bootstrap.v2.Watchdog";

Expand All @@ -314,10 +316,16 @@ message Watchdog {
// kill behavior. If not specified the default is 0 (disabled).
google.protobuf.Duration kill_timeout = 3;

// If at least two watched threads have been nonresponsive for at least this
// duration assume a true deadlock and kill the entire Envoy process. Set to 0
// to disable this behavior. If not specified the default is 0 (disabled).
// If max(2, ceil(registered_threads * Fraction(*multikill_threshold*)))
// threads have been nonresponsive for at least this duration kill the entire
// Envoy process. Set to 0 to disable this behavior. If not specified the
// default is 0 (disabled).
google.protobuf.Duration multikill_timeout = 4;

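// Sets the threshold for *multikill_timeout* as the percentage of registered
// threads that must be nonresponsive for at least *multikill_timeout* before
// the process is killed. A minimum of two nonresponsive threads is always
// required. If not specified the default is 0.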
type.v3.Percent multikill_threshold = 5;
}

// Runtime :ref:`configuration overview <config_runtime>` (deprecated).
Expand Down
1 change: 1 addition & 0 deletions api/envoy/config/bootstrap/v4alpha/BUILD

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 11 additions & 3 deletions api/envoy/config/bootstrap/v4alpha/bootstrap.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions generated_api_shadow/envoy/config/bootstrap/v3/BUILD

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 11 additions & 3 deletions generated_api_shadow/envoy/config/bootstrap/v3/bootstrap.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions generated_api_shadow/envoy/config/bootstrap/v4alpha/BUILD

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions include/envoy/server/configuration.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ class Main {
* multiple nonresponsive threads.
*/
virtual std::chrono::milliseconds wdMultiKillTimeout() const PURE;

/**
* @return double the percentage of threads that need to meet the MultiKillTimeout before we
* kill the process. This is used in the calculation
* Max(2, ceil(registered_threads * Fraction(MultiKillThreshold)))
* which computes the number of threads that need to be nonresponsive
* for at least MultiKillTimeout before we kill the process.
*/
virtual double wdMultiKillThreshold() const PURE;
};

/**
Expand Down
2 changes: 2 additions & 0 deletions source/server/configuration_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ void MainImpl::initialize(const envoy::config::bootstrap::v3::Bootstrap& bootstr
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(watchdog, kill_timeout, 0));
watchdog_multikill_timeout_ =
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(watchdog, multikill_timeout, 0));
watchdog_multikill_threshold_ =
PROTOBUF_PERCENT_TO_DOUBLE_OR_DEFAULT(watchdog, multikill_threshold, 0.0);

initializeStatsSinks(bootstrap, server);
}
Expand Down
3 changes: 3 additions & 0 deletions source/server/configuration_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ class MainImpl : Logger::Loggable<Logger::Id::config>, public Main {
return watchdog_multikill_timeout_;
}

// Returns the stored watchdog multikill threshold (watchdog_multikill_threshold_).
double wdMultiKillThreshold() const override {
  return watchdog_multikill_threshold_;
}

private:
/**
* Initialize tracers and corresponding sinks.
Expand All @@ -126,6 +128,7 @@ class MainImpl : Logger::Loggable<Logger::Id::config>, public Main {
std::chrono::milliseconds watchdog_megamiss_timeout_;
std::chrono::milliseconds watchdog_kill_timeout_;
std::chrono::milliseconds watchdog_multikill_timeout_;
double watchdog_multikill_threshold_;
};

/**
Expand Down
22 changes: 14 additions & 8 deletions source/server/guarddog_impl.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "server/guarddog_impl.h"

#include <sys/types.h>

#include <chrono>
#include <memory>

Expand All @@ -23,6 +25,7 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio
time_source_(api.timeSource()), miss_timeout_(config.wdMissTimeout()),
megamiss_timeout_(config.wdMegaMissTimeout()), kill_timeout_(config.wdKillTimeout()),
multi_kill_timeout_(config.wdMultiKillTimeout()),
multi_kill_fraction_(config.wdMultiKillThreshold() / 100.0),
loop_interval_([&]() -> std::chrono::milliseconds {
// The loop interval is simply the minimum of all specified intervals,
// but we must account for the 0=disabled case. This lambda takes care
Expand Down Expand Up @@ -60,8 +63,14 @@ void GuardDogImpl::step() {
const auto now = time_source_.monotonicTime();

{
bool seen_one_multi_timeout(false);
size_t multi_kill_count = 0;
Thread::LockGuard guard(wd_lock_);

// Compute the multikill threshold
const size_t required_for_multi_kill =
std::max(static_cast<size_t>(2),
static_cast<size_t>(ceil(multi_kill_fraction_ * watched_dogs_.size())));

for (auto& watched_dog : watched_dogs_) {
const auto ltt = watched_dog->dog_->lastTouchTime();
const auto delta = now - ltt;
Expand Down Expand Up @@ -90,13 +99,10 @@ void GuardDogImpl::step() {
watched_dog->dog_->threadId().debugString()));
}
if (multikillEnabled() && delta > multi_kill_timeout_) {
if (seen_one_multi_timeout) {

PANIC(fmt::format(
"GuardDog: multiple threads ({},...) stuck for more than watchdog_multikill_timeout",
watched_dog->dog_->threadId().debugString()));
} else {
seen_one_multi_timeout = true;
if (++multi_kill_count >= required_for_multi_kill) {
PANIC(fmt::format("GuardDog: At least {} threads ({},...) stuck for more than "
"watchdog_multikill_timeout",
multi_kill_count, watched_dog->dog_->threadId().debugString()));
}
}
}
Expand Down
1 change: 1 addition & 0 deletions source/server/guarddog_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ class GuardDogImpl : public GuardDog {
const std::chrono::milliseconds megamiss_timeout_;
const std::chrono::milliseconds kill_timeout_;
const std::chrono::milliseconds multi_kill_timeout_;
const double multi_kill_fraction_;
const std::chrono::milliseconds loop_interval_;
Stats::Counter& watchdog_miss_counter_;
Stats::Counter& watchdog_megamiss_counter_;
Expand Down
7 changes: 5 additions & 2 deletions test/mocks/server/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@ namespace Configuration {

using ::testing::Return;

MockMain::MockMain(int wd_miss, int wd_megamiss, int wd_kill, int wd_multikill)
: wd_miss_(wd_miss), wd_megamiss_(wd_megamiss), wd_kill_(wd_kill), wd_multikill_(wd_multikill) {
// Constructs a mock Main whose watchdog accessors return the supplied values by default.
// Each argument is copied into the matching wd_*_ member, and that member is then
// installed as the default return value of the corresponding mocked accessor.
MockMain::MockMain(int wd_miss, int wd_megamiss, int wd_kill, int wd_multikill,
double wd_multikill_threshold)
: wd_miss_(wd_miss), wd_megamiss_(wd_megamiss), wd_kill_(wd_kill), wd_multikill_(wd_multikill),
wd_multikill_threshold_(wd_multikill_threshold) {
// ON_CALL only sets default actions; it does not add call expectations.
ON_CALL(*this, wdMissTimeout()).WillByDefault(Return(wd_miss_));
ON_CALL(*this, wdMegaMissTimeout()).WillByDefault(Return(wd_megamiss_));
ON_CALL(*this, wdKillTimeout()).WillByDefault(Return(wd_kill_));
ON_CALL(*this, wdMultiKillTimeout()).WillByDefault(Return(wd_multikill_));
ON_CALL(*this, wdMultiKillThreshold()).WillByDefault(Return(wd_multikill_threshold_));
}

MockMain::~MockMain() = default;
Expand Down
7 changes: 5 additions & 2 deletions test/mocks/server/main.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ namespace Server {
namespace Configuration {
class MockMain : public Main {
public:
MockMain() : MockMain(0, 0, 0, 0) {}
MockMain(int wd_miss, int wd_megamiss, int wd_kill, int wd_multikill);
MockMain() : MockMain(0, 0, 0, 0, 0.0) {}
MockMain(int wd_miss, int wd_megamiss, int wd_kill, int wd_multikill,
double wd_multikill_threshold);
~MockMain() override;

MOCK_METHOD(Upstream::ClusterManager*, clusterManager, ());
Expand All @@ -26,11 +27,13 @@ class MockMain : public Main {
MOCK_METHOD(std::chrono::milliseconds, wdMegaMissTimeout, (), (const));
MOCK_METHOD(std::chrono::milliseconds, wdKillTimeout, (), (const));
MOCK_METHOD(std::chrono::milliseconds, wdMultiKillTimeout, (), (const));
MOCK_METHOD(double, wdMultiKillThreshold, (), (const));

std::chrono::milliseconds wd_miss_;
std::chrono::milliseconds wd_megamiss_;
std::chrono::milliseconds wd_kill_;
std::chrono::milliseconds wd_multikill_;
double wd_multikill_threshold_;
};
} // namespace Configuration
} // namespace Server
Expand Down
Loading

0 comments on commit 7f78581

Please sign in to comment.