Skip to content

Commit

Permalink
Detect client suspend cycle in nanny to avoid restart
Browse files Browse the repository at this point in the history
  • Loading branch information
scudette committed Jan 21, 2025
1 parent 3ce075b commit e64d6d9
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 38 deletions.
2 changes: 1 addition & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func GetDefaultConfig() *config_proto.Config {
MaxPoll: 60,

// By default restart the client if we are unable to
// contant the server within this long. (NOTE - even a
// contact the server within this long. (NOTE - even a
// failed connection will reset the counter, the nanny
// will only fire if the client has failed in some way -
// e.g. the communicator is stopped for some reason).
Expand Down
110 changes: 73 additions & 37 deletions executor/nanny.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,27 @@ import (
)

var (
Nanny = &NannyService{}
Clock utils.Clock = utils.RealClock{}
Nanny = &NannyService{}
)

type NannyService struct {
mu sync.Mutex
last_pump_to_rb_attempt time.Time
last_pump_rb_to_server_attempt time.Time
last_read_from_server time.Time
last_check_time time.Time

MaxMemoryHardLimit uint64
MaxConnectionDelay time.Duration

Logger *logging.LogContext

// Function that will be called when the nanny detects out of
// specs condition. If not specified we exit immediately.
OnExit func()
// specs condition. If not specified we exit immediately. This
// function will only be called once. If the exit condition occurs
// further OnExit2 will be called repeatadly.
OnExit func()
OnExit2 func()

on_exit_called bool
}
Expand All @@ -40,21 +43,21 @@ func (self *NannyService) UpdatePumpToRb() {
self.mu.Lock()
defer self.mu.Unlock()

self.last_pump_to_rb_attempt = Clock.Now()
self.last_pump_to_rb_attempt = utils.GetTime().Now()
}

func (self *NannyService) UpdatePumpRbToServer() {
self.mu.Lock()
defer self.mu.Unlock()

self.last_pump_rb_to_server_attempt = Clock.Now()
self.last_pump_rb_to_server_attempt = utils.GetTime().Now()
}

func (self *NannyService) UpdateReadFromServer() {
self.mu.Lock()
defer self.mu.Unlock()

self.last_read_from_server = Clock.Now()
self.last_read_from_server = utils.GetTime().Now()
}

func (self *NannyService) _CheckMemory(message string) bool {
Expand Down Expand Up @@ -90,7 +93,7 @@ func (self *NannyService) _CheckTime(t time.Time, message string) bool {
return false
}

now := Clock.Now()
now := utils.GetTime().Now()
if t.Add(self.MaxConnectionDelay).Before(now) {
self.Logger.Error(
"NannyService: <red>Last %v too long ago %v (now is %v MaxConnectionDelay is %v)</>",
Expand All @@ -106,7 +109,11 @@ func (self *NannyService) _Exit() {
// If we already called the OnExit one time, we just hard exit.
if self.OnExit == nil || self.on_exit_called {
self.Logger.Error("Hard Exit called!")
os.Exit(-1)
if self.OnExit2 != nil {
self.OnExit2()
} else {
os.Exit(-1)
}
return
}

Expand All @@ -118,6 +125,43 @@ func (self *NannyService) _Exit() {
self.on_exit_called = true
}

func (self *NannyService) checkOnce(period time.Duration) {
self.mu.Lock()
defer self.mu.Unlock()

// Keep track of the last time we checked things.
last_check_time := self.last_check_time
self.last_check_time = utils.GetTime().Now()

// We should update self.last_check_time periodically
// so it shoud never be more than 20 sec behind this
// check. Unless the machine just wakes up from sleep
// - in that case the last check time is far before
// this check time. In this case we cosider the check
// invalid and try again later.
if last_check_time.Add(period * 2).
Before(self.last_check_time) {
return
}

called := self._CheckTime(self.last_pump_to_rb_attempt, "Pump to Ring Buffer")
if self._CheckTime(self.last_pump_rb_to_server_attempt, "Pump Ring Buffer to Server") {
called = true
}
if self._CheckTime(self.last_read_from_server, "Read From Server") {
called = true
}
if self._CheckMemory("Exceeded HardMemoryLimit") {
called = true
}

// Allow the trigger to be disarmed if the on_exit was
// able to reduce memory use or unstick the process.
if !called {
self.on_exit_called = false
}
}

func (self *NannyService) Start(
ctx context.Context,
wg *sync.WaitGroup) {
Expand All @@ -131,36 +175,34 @@ func (self *NannyService) Start(
"<green>Starting</> nanny with MaxConnectionDelay %v and MaxMemoryHardLimit %v",
self.MaxConnectionDelay, self.MaxMemoryHardLimit)

period := 10 * time.Second

for {
select {
case <-ctx.Done():
return

case <-Clock.After(10 * time.Second):
self.mu.Lock()

called := self._CheckTime(self.last_pump_to_rb_attempt, "Pump to Ring Buffer")
if self._CheckTime(self.last_pump_rb_to_server_attempt, "Pump Ring Buffer to Server") {
called = true
}
if self._CheckTime(self.last_read_from_server, "Read From Server") {
called = true
}
if self._CheckMemory("Exceeded HardMemoryLimit") {
called = true
}

// Allow the trigger to be disarmed if the on_exit was
// able to reduce memory use or unstick the process.
if !called {
self.on_exit_called = false
}
self.mu.Unlock()
case <-utils.GetTime().After(period):
self.checkOnce(period)
}
}
}()
}

func NewNanny(
config_obj *config_proto.Config) *NannyService {
if config_obj.Client.NannyMaxConnectionDelay > 0 {
return &NannyService{
MaxMemoryHardLimit: config_obj.Client.MaxMemoryHardLimit,
last_check_time: utils.GetTime().Now(),
MaxConnectionDelay: time.Duration(
config_obj.Client.NannyMaxConnectionDelay) * time.Second,
Logger: logging.GetLogger(config_obj, &logging.ClientComponent),
}
}
return nil
}

func StartNannyService(
ctx context.Context,
wg *sync.WaitGroup,
Expand All @@ -169,14 +211,8 @@ func StartNannyService(
return nil
}

if config_obj.Client.NannyMaxConnectionDelay > 0 {
Nanny = &NannyService{
MaxMemoryHardLimit: config_obj.Client.MaxMemoryHardLimit,
MaxConnectionDelay: time.Duration(
config_obj.Client.NannyMaxConnectionDelay) * time.Second,
Logger: logging.GetLogger(config_obj, &logging.ClientComponent),
}

Nanny = NewNanny(config_obj)
if Nanny != nil {
Nanny.Start(ctx, wg)
}
return nil
Expand Down
102 changes: 102 additions & 0 deletions executor/nanny_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package executor

import (
"testing"
"time"

"www.velocidex.com/golang/velociraptor/config"
"www.velocidex.com/golang/velociraptor/utils"
"www.velocidex.com/golang/velociraptor/vtesting/assert"
)

type OnExitHelper struct {
exit_called time.Time
}

func (self *OnExitHelper) Exit() {
// Only record the first time we were called
if self.exit_called.IsZero() {
self.exit_called = utils.GetTime().Now()
}
}

func TestNanny(t *testing.T) {
period := 10 * time.Second

// If we did not communicate with the server in 60 sec, hard exit.
config_obj := config.GetDefaultConfig()
config_obj.Client.NannyMaxConnectionDelay = 60

closer := utils.MockTime(utils.NewMockClock(time.Unix(1000, 0)))
defer closer()

helper := OnExitHelper{}
Nanny := NewNanny(config_obj)
Nanny.OnExit = helper.Exit
Nanny.OnExit2 = helper.Exit

// Set all checks to now.
Nanny.UpdatePumpToRb()
Nanny.UpdatePumpRbToServer()
Nanny.UpdateReadFromServer()

// A check at time 1000 - should be fine as this is the same time
// the pumps were touched.
Nanny.checkOnce(period)

// Move the time 70 sec on, 10 sec at the time. This emulates the
// nanny periodic checking as happens at runtime.
for i := 1000; i <= 1080; i += 10 {
utils.MockTime(utils.NewMockClock(time.Unix(int64(i), 0)))
Nanny.checkOnce(period)
}

// Only the first check after the 60 second timeout will trigger
// an exit. Earlier checks will not trigger exit.
assert.Equal(t, int64(1070), helper.exit_called.Unix())
}

// Check that nanny is able to detect a large time step (like endpoint
// sleep/suspend cycle)
func TestNannySleep(t *testing.T) {
period := 10 * time.Second

// If we did not communicate with the server in 60 sec, hard exit.
config_obj := config.GetDefaultConfig()
config_obj.Client.NannyMaxConnectionDelay = 60

closer := utils.MockTime(utils.NewMockClock(time.Unix(1000, 0)))
defer closer()

helper := OnExitHelper{}
Nanny := NewNanny(config_obj)
Nanny.OnExit = helper.Exit
Nanny.OnExit2 = helper.Exit

// Set all checks to now.
Nanny.UpdatePumpToRb()
Nanny.UpdatePumpRbToServer()
Nanny.UpdateReadFromServer()

// A check at time 1000 - should be fine as this is the same time
// the pumps were touched.
Nanny.checkOnce(period)

// Now emulate a suspend cycle - the next check occurs a long time
// after the last check
utils.MockTime(utils.NewMockClock(time.Unix(2000, 0)))

Nanny.checkOnce(period)

// Did not trigger an exit.
assert.True(t, helper.exit_called.IsZero())

// Step the next check by 10 sec
utils.MockTime(utils.NewMockClock(time.Unix(2010, 0)))

// This will now trigger an exit
Nanny.checkOnce(period)

// First check after the 60 second timeout will trigger an exit.
assert.Equal(t, int64(2010), helper.exit_called.Unix())
}
12 changes: 12 additions & 0 deletions services/launcher/compiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,18 @@ func (self *Launcher) CompileSingleArtifact(
escaped_name, maybeEscape(name+"_")),
})

case "upload_file":
result.Query = append(result.Query, &actions_proto.VQLRequest{
VQL: fmt.Sprintf(`LET %v <= if(condition=%v, then={
SELECT Content FROM http_client(url=%v, tempfile_extension='.tmp')
})`,
maybeEscape(name+"_"), escaped_name, escaped_name),
})
result.Query = append(result.Query, &actions_proto.VQLRequest{
VQL: fmt.Sprintf("LET %v <= %v.Content[0]",
escaped_name, maybeEscape(name+"_")),
})

case "server_metadata":
client_info_manager, err := services.GetClientInfoManager(config_obj)
if err == nil {
Expand Down

0 comments on commit e64d6d9

Please sign in to comment.