Skip to content

Commit

Permalink
Watchdog action that will signal a particular thread to abort. (envoy…
Browse files Browse the repository at this point in the history
…proxy#12860)

Signed-off-by: Kevin Baichoo <[email protected]>
  • Loading branch information
KBaichoo authored Sep 16, 2020
1 parent 7c458e9 commit cd5bdc4
Show file tree
Hide file tree
Showing 27 changed files with 564 additions and 30 deletions.
1 change: 1 addition & 0 deletions CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ extensions/filters/common/original_src @snowp @klarose
/*/extensions/filters/http/decompressor @rojkov @dio
# Watchdog Extensions
/*/extensions/watchdog/profile_action @kbaichoo @htuch
/*/extensions/watchdog/abort_action @kbaichoo @htuch
# Core upstream code
extensions/upstreams/http @alyssawilk @snowp @mattklein123
extensions/upstreams/http/http @alyssawilk @snowp @mattklein123
Expand Down
1 change: 1 addition & 0 deletions api/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ proto_library(
"//envoy/extensions/upstreams/http/http/v3:pkg",
"//envoy/extensions/upstreams/http/tcp/v3:pkg",
"//envoy/extensions/wasm/v3:pkg",
"//envoy/extensions/watchdog/abort_action/v3alpha:pkg",
"//envoy/extensions/watchdog/profile_action/v3alpha:pkg",
"//envoy/service/accesslog/v3:pkg",
"//envoy/service/auth/v3:pkg",
Expand Down
9 changes: 9 additions & 0 deletions api/envoy/extensions/watchdog/abort_action/v3alpha/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py.

load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package")

licenses(["notice"]) # Apache 2

api_proto_package(
deps = ["@com_github_cncf_udpa//udpa/annotations:pkg"],
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
syntax = "proto3";

package envoy.extensions.watchdog.abort_action.v3alpha;

import "google/protobuf/duration.proto";

import "udpa/annotations/status.proto";
import "udpa/annotations/versioning.proto";
import "validate/validate.proto";

option java_package = "io.envoyproxy.envoy.extensions.watchdog.abort_action.v3alpha";
option java_outer_classname = "AbortActionProto";
option java_multiple_files = true;
option (udpa.annotations.file_status).work_in_progress = true;
option (udpa.annotations.file_status).package_version_status = ACTIVE;

// [#protodoc-title: Watchdog Action that sends a SIGABRT to kill the process.]
// [#extension: envoy.watchdog.abort_action]

// A GuardDogAction that will terminate the process by sending SIGABRT to the
// stuck thread. This would allow easier access to the call stack of the stuck
// thread since we would run signal handlers on that thread. This would be
// more useful than the default watchdog kill behaviors since those PANIC
// from the watchdog's thread.

// This is currently only implemented for systems that support kill to send
// signals.
message AbortActionConfig {
// How long to wait for the thread to respond to the SIGABRT before killing the
// process from this action. This is a blocking action.
google.protobuf.Duration wait_duration = 1;
}
1 change: 1 addition & 0 deletions api/versioning/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ proto_library(
"//envoy/extensions/upstreams/http/http/v3:pkg",
"//envoy/extensions/upstreams/http/tcp/v3:pkg",
"//envoy/extensions/wasm/v3:pkg",
"//envoy/extensions/watchdog/abort_action/v3alpha:pkg",
"//envoy/extensions/watchdog/profile_action/v3alpha:pkg",
"//envoy/service/accesslog/v3:pkg",
"//envoy/service/auth/v3:pkg",
Expand Down
1 change: 1 addition & 0 deletions bazel/repositories.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ WINDOWS_SKIP_TARGETS = [
"envoy.tracers.lightstep",
"envoy.tracers.datadog",
"envoy.tracers.opencensus",
"envoy.watchdog.abort_action",
]

# Make all contents of an external repository accessible under a filegroup. Used for external HTTP
Expand Down
1 change: 1 addition & 0 deletions docs/root/api-v3/config/watchdog/watchdog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ Watchdog
:maxdepth: 2

../../extensions/watchdog/profile_action/v3alpha/*
../../extensions/watchdog/abort_action/v3alpha/*
1 change: 1 addition & 0 deletions docs/root/version_history/current.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ New Features
* watchdog: support randomizing the watchdog's kill timeout to prevent synchronized kills via a maximium jitter parameter :ref:`max_kill_timeout_jitter<envoy_v3_api_field_config.bootstrap.v3.Watchdog.max_kill_timeout_jitter>`.
* watchdog: supports an extension point where actions can be registered to fire on watchdog events such as miss, megamiss, kill and multikill. See ref:`watchdog actions<envoy_v3_api_field_config.bootstrap.v3.Watchdog.actions>`.
* watchdog: watchdog action extension that does cpu profiling. See ref:`Profile Action <envoy_v3_api_file_envoy/extensions/watchdog/profile_action/v3alpha/profile_action.proto>`.
* watchdog: watchdog action extension that sends SIGABRT to the stuck thread to terminate the process. See ref:`Abort Action <envoy_v3_api_file_envoy/extensions/watchdog/abort_action/v3alpha/abort_action.proto>`.
* xds: added :ref:`extension config discovery<envoy_v3_api_msg_config.core.v3.ExtensionConfigSource>` support for HTTP filters.
* zlib: added option to use `zlib-ng <https://github.com/zlib-ng/zlib-ng>`_ as zlib library.

Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 6 additions & 5 deletions include/envoy/server/guarddog_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ class GuardDogAction {
/**
* Callback function for when the GuardDog observes an event.
* @param event the event the GuardDog observes.
* @param thread_ltt_pairs pairs of the relevant thread to the event, and the
* last time touched (LTT) of those threads with their watchdog.
* @param thread_last_checkin_pairs pair of the relevant thread to the event, and the
* last check in time of those threads with their watchdog.
* @param now the current time.
*/
virtual void run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent event,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_ltt_pairs,
MonotonicTime now) PURE;
virtual void
run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent event,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs,
MonotonicTime now) PURE;
};

using GuardDogActionPtr = std::unique_ptr<GuardDogAction>;
Expand Down
1 change: 1 addition & 0 deletions source/extensions/extensions_build_config.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ EXTENSIONS = {
# Watchdog actions
#
"envoy.watchdog.profile_action": "//source/extensions/watchdog/profile_action:config",
"envoy.watchdog.abort_action": "//source/extensions/watchdog/abort_action:config",

}

Expand Down
40 changes: 40 additions & 0 deletions source/extensions/watchdog/abort_action/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
load(
"//bazel:envoy_build_system.bzl",
"envoy_cc_extension",
"envoy_cc_library",
"envoy_extension_package",
)

licenses(["notice"]) # Apache 2

envoy_extension_package()

envoy_cc_library(
name = "abort_action_lib",
srcs = ["abort_action.cc"],
hdrs = ["abort_action.h"],
deps = [
"//include/envoy/common:time_interface",
"//include/envoy/server:guarddog_config_interface",
"//include/envoy/thread:thread_interface",
"//source/common/common:assert_lib",
"//source/common/protobuf:utility_lib",
"@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto",
],
)

envoy_cc_extension(
name = "config",
srcs = ["config.cc"],
hdrs = ["config.h"],
security_posture = "data_plane_agnostic",
status = "alpha",
deps = [
":abort_action_lib",
"//include/envoy/registry",
"//source/common/config:utility_lib",
"//source/common/protobuf",
"//source/common/protobuf:message_validator_lib",
"@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto",
],
)
67 changes: 67 additions & 0 deletions source/extensions/watchdog/abort_action/abort_action.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include "extensions/watchdog/abort_action/abort_action.h"

#include <sys/types.h>

#include <csignal>

#include "envoy/thread/thread.h"

#include "common/common/assert.h"
#include "common/common/fmt.h"
#include "common/common/logger.h"
#include "common/protobuf/utility.h"

namespace Envoy {
namespace Extensions {
namespace Watchdog {
namespace AbortAction {
namespace {
#ifdef __linux__
pid_t toPlatformTid(int64_t tid) { return static_cast<pid_t>(tid); }
#elif defined(__APPLE__)
uint64_t toPlatformTid(int64_t tid) { return static_cast<uint64_t>(tid); }
#endif
} // namespace

AbortAction::AbortAction(
envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig& config,
Server::Configuration::GuardDogActionFactoryContext& /*context*/)
: config_(config){};

void AbortAction::run(
envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent /*event*/,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs,
MonotonicTime /*now*/) {

if (thread_last_checkin_pairs.empty()) {
ENVOY_LOG_MISC(warn, "Watchdog AbortAction called without any thread.");
return;
}

// The following lines of code won't be considered covered by code coverage
// tools since they would run in DEATH tests.
int64_t raw_tid = thread_last_checkin_pairs[0].first.getId();

// Assume POSIX-compatible system and signal to the thread.
ENVOY_LOG_MISC(error, "Watchdog AbortAction sending abort signal to thread with tid {}.",
raw_tid);

if (kill(toPlatformTid(raw_tid), SIGABRT) == 0) {
// Successfully sent signal, sleep for wait_duration.
absl::SleepFor(absl::Milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(config_, wait_duration, 0)));
} else {
// Failed to send the signal, abort?
ENVOY_LOG_MISC(error, "Failed to send signal to tid {}", raw_tid);
}

// Abort from the action since the signaled thread hasn't yet crashed the process.
// panicing in the action gives flexibility since it doesn't depend on
// external code to kill the process if the signal fails.
PANIC(fmt::format("Failed to kill thread with id {}, aborting from Watchdog AbortAction instead.",
raw_tid));
}

} // namespace AbortAction
} // namespace Watchdog
} // namespace Extensions
} // namespace Envoy
37 changes: 37 additions & 0 deletions source/extensions/watchdog/abort_action/abort_action.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#pragma once

#include <chrono>

#include "envoy/extensions/watchdog/abort_action/v3alpha/abort_action.pb.h"
#include "envoy/server/guarddog_config.h"
#include "envoy/thread/thread.h"

namespace Envoy {
namespace Extensions {
namespace Watchdog {
namespace AbortAction {

/**
* A GuardDogAction that will terminate the process by sending SIGABRT to the
* stuck thread. This is currently only implemented for systems that
* support kill to send signals.
*/
class AbortAction : public Server::Configuration::GuardDogAction {
public:
AbortAction(envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig& config,
Server::Configuration::GuardDogActionFactoryContext& context);

void run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent event,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs,
MonotonicTime now) override;

private:
const envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig config_;
};

using AbortActionPtr = std::unique_ptr<AbortAction>;

} // namespace AbortAction
} // namespace Watchdog
} // namespace Extensions
} // namespace Envoy
32 changes: 32 additions & 0 deletions source/extensions/watchdog/abort_action/config.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "extensions/watchdog/abort_action/config.h"

#include "envoy/registry/registry.h"

#include "common/config/utility.h"
#include "common/protobuf/message_validator_impl.h"

#include "extensions/watchdog/abort_action/abort_action.h"

namespace Envoy {
namespace Extensions {
namespace Watchdog {
namespace AbortAction {

Server::Configuration::GuardDogActionPtr AbortActionFactory::createGuardDogActionFromProto(
const envoy::config::bootstrap::v3::Watchdog::WatchdogAction& config,
Server::Configuration::GuardDogActionFactoryContext& context) {
auto message = createEmptyConfigProto();
Config::Utility::translateOpaqueConfig(config.config().typed_config(), ProtobufWkt::Struct(),
ProtobufMessage::getStrictValidationVisitor(), *message);
return std::make_unique<AbortAction>(dynamic_cast<AbortActionConfig&>(*message), context);
}

/**
* Static registration for the fixed heap resource monitor factory. @see RegistryFactory.
*/
REGISTER_FACTORY(AbortActionFactory, Server::Configuration::GuardDogActionFactory);

} // namespace AbortAction
} // namespace Watchdog
} // namespace Extensions
} // namespace Envoy
33 changes: 33 additions & 0 deletions source/extensions/watchdog/abort_action/config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include "envoy/extensions/watchdog/abort_action/v3alpha/abort_action.pb.h"
#include "envoy/server/guarddog_config.h"

#include "common/protobuf/protobuf.h"

namespace Envoy {
namespace Extensions {
namespace Watchdog {
namespace AbortAction {

class AbortActionFactory : public Server::Configuration::GuardDogActionFactory {
public:
AbortActionFactory() = default;

Server::Configuration::GuardDogActionPtr createGuardDogActionFromProto(
const envoy::config::bootstrap::v3::Watchdog::WatchdogAction& config,
Server::Configuration::GuardDogActionFactoryContext& context) override;

ProtobufTypes::MessagePtr createEmptyConfigProto() override {
return std::make_unique<AbortActionConfig>();
}

std::string name() const override { return "envoy.watchdog.abort_action"; }

using AbortActionConfig = envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig;
};

} // namespace AbortAction
} // namespace Watchdog
} // namespace Extensions
} // namespace Envoy
4 changes: 2 additions & 2 deletions source/extensions/watchdog/profile_action/profile_action.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,14 @@ ProfileAction::ProfileAction(

void ProfileAction::run(
envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent /*event*/,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_ltt_pairs,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs,
MonotonicTime /*now*/) {
if (running_profile_) {
return;
}

// Check if there's a tid that justifies profiling
if (thread_ltt_pairs.empty()) {
if (thread_last_checkin_pairs.empty()) {
ENVOY_LOG_MISC(warn, "Profile Action: No tids were provided.");
return;
}
Expand Down
2 changes: 1 addition & 1 deletion source/extensions/watchdog/profile_action/profile_action.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class ProfileAction : public Server::Configuration::GuardDogAction {
Server::Configuration::GuardDogActionFactoryContext& context);

void run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent event,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_ltt_pairs,
const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs,
MonotonicTime now) override;

private:
Expand Down
Loading

0 comments on commit cd5bdc4

Please sign in to comment.