From e64d6d931edf6488955631a491904deb09e5816d Mon Sep 17 00:00:00 2001 From: Mike Cohen Date: Tue, 21 Jan 2025 15:00:10 +1000 Subject: [PATCH] Detect client suspend cycle in nanny to avoid restart --- config/config.go | 2 +- executor/nanny.go | 110 ++++++++++++++++++++++------------ executor/nanny_test.go | 102 +++++++++++++++++++++++++++++++ services/launcher/compiler.go | 12 ++++ 4 files changed, 188 insertions(+), 38 deletions(-) create mode 100644 executor/nanny_test.go diff --git a/config/config.go b/config/config.go index 41155a2baf3..2afb6fda258 100644 --- a/config/config.go +++ b/config/config.go @@ -65,7 +65,7 @@ func GetDefaultConfig() *config_proto.Config { MaxPoll: 60, // By default restart the client if we are unable to - // contant the server within this long. (NOTE - even a + // contact the server within this long. (NOTE - even a // failed connection will reset the counter, the nanny // will only fire if the client has failed in some way - // e.g. the communicator is stopped for some reason). diff --git a/executor/nanny.go b/executor/nanny.go index adf2ee08b6e..9d87a5b8572 100644 --- a/executor/nanny.go +++ b/executor/nanny.go @@ -14,8 +14,7 @@ import ( ) var ( - Nanny = &NannyService{} - Clock utils.Clock = utils.RealClock{} + Nanny = &NannyService{} ) type NannyService struct { @@ -23,6 +22,7 @@ type NannyService struct { last_pump_to_rb_attempt time.Time last_pump_rb_to_server_attempt time.Time last_read_from_server time.Time + last_check_time time.Time MaxMemoryHardLimit uint64 MaxConnectionDelay time.Duration @@ -30,8 +30,11 @@ type NannyService struct { Logger *logging.LogContext // Function that will be called when the nanny detects out of - // specs condition. If not specified we exit immediately. - OnExit func() + // specs condition. If not specified we exit immediately. This + // function will only be called once. If the exit condition occurs + // further OnExit2 will be called repeatadly. + OnExit func() + OnExit2 func() on_exit_called bool } @@ -40,21 +43,21 @@ func (self *NannyService) UpdatePumpToRb() { self.mu.Lock() defer self.mu.Unlock() - self.last_pump_to_rb_attempt = Clock.Now() + self.last_pump_to_rb_attempt = utils.GetTime().Now() } func (self *NannyService) UpdatePumpRbToServer() { self.mu.Lock() defer self.mu.Unlock() - self.last_pump_rb_to_server_attempt = Clock.Now() + self.last_pump_rb_to_server_attempt = utils.GetTime().Now() } func (self *NannyService) UpdateReadFromServer() { self.mu.Lock() defer self.mu.Unlock() - self.last_read_from_server = Clock.Now() + self.last_read_from_server = utils.GetTime().Now() } func (self *NannyService) _CheckMemory(message string) bool { @@ -90,7 +93,7 @@ func (self *NannyService) _CheckTime(t time.Time, message string) bool { return false } - now := Clock.Now() + now := utils.GetTime().Now() if t.Add(self.MaxConnectionDelay).Before(now) { self.Logger.Error( "NannyService: Last %v too long ago %v (now is %v MaxConnectionDelay is %v)", @@ -106,7 +109,11 @@ func (self *NannyService) _Exit() { // If we already called the OnExit one time, we just hard exit. if self.OnExit == nil || self.on_exit_called { self.Logger.Error("Hard Exit called!") - os.Exit(-1) + if self.OnExit2 != nil { + self.OnExit2() + } else { + os.Exit(-1) + } return } @@ -118,6 +125,43 @@ func (self *NannyService) _Exit() { self.on_exit_called = true } +func (self *NannyService) checkOnce(period time.Duration) { + self.mu.Lock() + defer self.mu.Unlock() + + // Keep track of the last time we checked things. + last_check_time := self.last_check_time + self.last_check_time = utils.GetTime().Now() + + // We should update self.last_check_time periodically + // so it shoud never be more than 20 sec behind this + // check. Unless the machine just wakes up from sleep + // - in that case the last check time is far before + // this check time. In this case we cosider the check + // invalid and try again later. + if last_check_time.Add(period * 2). + Before(self.last_check_time) { + return + } + + called := self._CheckTime(self.last_pump_to_rb_attempt, "Pump to Ring Buffer") + if self._CheckTime(self.last_pump_rb_to_server_attempt, "Pump Ring Buffer to Server") { + called = true + } + if self._CheckTime(self.last_read_from_server, "Read From Server") { + called = true + } + if self._CheckMemory("Exceeded HardMemoryLimit") { + called = true + } + + // Allow the trigger to be disarmed if the on_exit was + // able to reduce memory use or unstick the process. + if !called { + self.on_exit_called = false + } +} + func (self *NannyService) Start( ctx context.Context, wg *sync.WaitGroup) { @@ -131,36 +175,34 @@ func (self *NannyService) Start( "Starting nanny with MaxConnectionDelay %v and MaxMemoryHardLimit %v", self.MaxConnectionDelay, self.MaxMemoryHardLimit) + period := 10 * time.Second + for { select { case <-ctx.Done(): return - case <-Clock.After(10 * time.Second): - self.mu.Lock() - - called := self._CheckTime(self.last_pump_to_rb_attempt, "Pump to Ring Buffer") - if self._CheckTime(self.last_pump_rb_to_server_attempt, "Pump Ring Buffer to Server") { - called = true - } - if self._CheckTime(self.last_read_from_server, "Read From Server") { - called = true - } - if self._CheckMemory("Exceeded HardMemoryLimit") { - called = true - } - - // Allow the trigger to be disarmed if the on_exit was - // able to reduce memory use or unstick the process. - if !called { - self.on_exit_called = false - } - self.mu.Unlock() + case <-utils.GetTime().After(period): + self.checkOnce(period) } } }() } +func NewNanny( + config_obj *config_proto.Config) *NannyService { + if config_obj.Client.NannyMaxConnectionDelay > 0 { + return &NannyService{ + MaxMemoryHardLimit: config_obj.Client.MaxMemoryHardLimit, + last_check_time: utils.GetTime().Now(), + MaxConnectionDelay: time.Duration( + config_obj.Client.NannyMaxConnectionDelay) * time.Second, + Logger: logging.GetLogger(config_obj, &logging.ClientComponent), + } + } + return nil +} + func StartNannyService( ctx context.Context, wg *sync.WaitGroup, @@ -169,14 +211,8 @@ func StartNannyService( return nil } - if config_obj.Client.NannyMaxConnectionDelay > 0 { - Nanny = &NannyService{ - MaxMemoryHardLimit: config_obj.Client.MaxMemoryHardLimit, - MaxConnectionDelay: time.Duration( - config_obj.Client.NannyMaxConnectionDelay) * time.Second, - Logger: logging.GetLogger(config_obj, &logging.ClientComponent), - } - + Nanny = NewNanny(config_obj) + if Nanny != nil { Nanny.Start(ctx, wg) } return nil diff --git a/executor/nanny_test.go b/executor/nanny_test.go new file mode 100644 index 00000000000..b7519ebf8e8 --- /dev/null +++ b/executor/nanny_test.go @@ -0,0 +1,102 @@ +package executor + +import ( + "testing" + "time" + + "www.velocidex.com/golang/velociraptor/config" + "www.velocidex.com/golang/velociraptor/utils" + "www.velocidex.com/golang/velociraptor/vtesting/assert" +) + +type OnExitHelper struct { + exit_called time.Time +} + +func (self *OnExitHelper) Exit() { + // Only record the first time we were called + if self.exit_called.IsZero() { + self.exit_called = utils.GetTime().Now() + } +} + +func TestNanny(t *testing.T) { + period := 10 * time.Second + + // If we did not communicate with the server in 60 sec, hard exit. + config_obj := config.GetDefaultConfig() + config_obj.Client.NannyMaxConnectionDelay = 60 + + closer := utils.MockTime(utils.NewMockClock(time.Unix(1000, 0))) + defer closer() + + helper := OnExitHelper{} + Nanny := NewNanny(config_obj) + Nanny.OnExit = helper.Exit + Nanny.OnExit2 = helper.Exit + + // Set all checks to now. + Nanny.UpdatePumpToRb() + Nanny.UpdatePumpRbToServer() + Nanny.UpdateReadFromServer() + + // A check at time 1000 - should be fine as this is the same time + // the pumps were touched. + Nanny.checkOnce(period) + + // Move the time 70 sec on, 10 sec at the time. This emulates the + // nanny periodic checking as happens at runtime. + for i := 1000; i <= 1080; i += 10 { + utils.MockTime(utils.NewMockClock(time.Unix(int64(i), 0))) + Nanny.checkOnce(period) + } + + // Only the first check after the 60 second timeout will trigger + // an exit. Earlier checks will not trigger exit. + assert.Equal(t, int64(1070), helper.exit_called.Unix()) +} + +// Check that nanny is able to detect a large time step (like endpoint +// sleep/suspend cycle) +func TestNannySleep(t *testing.T) { + period := 10 * time.Second + + // If we did not communicate with the server in 60 sec, hard exit. + config_obj := config.GetDefaultConfig() + config_obj.Client.NannyMaxConnectionDelay = 60 + + closer := utils.MockTime(utils.NewMockClock(time.Unix(1000, 0))) + defer closer() + + helper := OnExitHelper{} + Nanny := NewNanny(config_obj) + Nanny.OnExit = helper.Exit + Nanny.OnExit2 = helper.Exit + + // Set all checks to now. + Nanny.UpdatePumpToRb() + Nanny.UpdatePumpRbToServer() + Nanny.UpdateReadFromServer() + + // A check at time 1000 - should be fine as this is the same time + // the pumps were touched. + Nanny.checkOnce(period) + + // Now emulate a suspend cycle - the next check occurs a long time + // after the last check + utils.MockTime(utils.NewMockClock(time.Unix(2000, 0))) + + Nanny.checkOnce(period) + + // Did not trigger an exit. + assert.True(t, helper.exit_called.IsZero()) + + // Step the next check by 10 sec + utils.MockTime(utils.NewMockClock(time.Unix(2010, 0))) + + // This will now trigger an exit + Nanny.checkOnce(period) + + // First check after the 60 second timeout will trigger an exit. + assert.Equal(t, int64(2010), helper.exit_called.Unix()) +} diff --git a/services/launcher/compiler.go b/services/launcher/compiler.go index 44866dddfa4..c2a166f838c 100644 --- a/services/launcher/compiler.go +++ b/services/launcher/compiler.go @@ -81,6 +81,18 @@ func (self *Launcher) CompileSingleArtifact( escaped_name, maybeEscape(name+"_")), }) + case "upload_file": + result.Query = append(result.Query, &actions_proto.VQLRequest{ + VQL: fmt.Sprintf(`LET %v <= if(condition=%v, then={ + SELECT Content FROM http_client(url=%v, tempfile_extension='.tmp') +})`, + maybeEscape(name+"_"), escaped_name, escaped_name), + }) + result.Query = append(result.Query, &actions_proto.VQLRequest{ + VQL: fmt.Sprintf("LET %v <= %v.Content[0]", + escaped_name, maybeEscape(name+"_")), + }) + case "server_metadata": client_info_manager, err := services.GetClientInfoManager(config_obj) if err == nil {