From 429696f0d6657bd59591133718fd6f8b0d15f405 Mon Sep 17 00:00:00 2001
From: Fred Lotter
Date: Fri, 28 Jul 2023 13:45:41 +0200
Subject: [PATCH 1/4] servstate: backoff svcs should be stopped on exit

The default behaviour today for a service that exits is that Pebble will
auto-restart the service. This process has a built-in back-off strategy,
which means that on exit a timer is set to restart the service after the
back-off period expires. The way to prevent a service in back-off from
restarting is to explicitly stop the service (this case is covered in a
unit test).

When the Pebble daemon starts a graceful exit, it has to first stop all
running services by requesting this explicitly, and waiting for the
change request to complete. However, services scheduled for restart
(back-off) are not currently stopped through this request. This is an
issue because the restart mechanism relies on the reaper package, which
is stopped straight after running services are stopped. This means any
service in back-off can find itself in one of the following situations:

1. The most common case is that the service restart thread is killed by
the garbage collector on process exit, so this appears clean.

2. The second scenario is that the service restarts between the services
stop request and the reaper kill request, in which case the service will
continue to run after the Pebble process is dead (of course this is
statistically very unlikely to happen, but possible).

3. The last scenario is that the service restarts after the reaper is
killed, but before the Pebble process exits. This will result in the
"reaper not started" panic we often see in unit tests (this is also a
statistically unlikely scenario, but possible).

This PR modifies the servstate function used for terminating all services
during shutdown (only) to also explicitly stop services in back-off.
---
 internals/overlord/servstate/manager.go | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/internals/overlord/servstate/manager.go b/internals/overlord/servstate/manager.go
index ad99bef2..f8edb2e0 100644
--- a/internals/overlord/servstate/manager.go
+++ b/internals/overlord/servstate/manager.go
@@ -548,7 +548,14 @@ func (m *ServiceManager) SetServiceArgs(serviceArgs map[string][]string) error {
 	return m.appendLayer(newLayer)
 }
 
-// servicesToStop returns a slice of service names to stop, in dependency order.
+// servicesToStop is used during service manager shutdown to cleanly terminate
+// all running services. Running services include both services in the
+// stateRunning and stateBackoff, since a service in backoff state can start
+// running once the timeout expires, which creates a race on service manager
+// exit. If it starts just before, it would continue to run after the service
+// manager is terminated. If it starts just after (before the main process
+// exits), it would generate a runtime error as the reaper would already be dead.
+// This function returns a slice of service names to stop, in dependency order.
 func servicesToStop(m *ServiceManager) ([]string, error) {
 	releasePlan, err := m.acquirePlan()
 	if err != nil {
@@ -568,15 +575,15 @@ func servicesToStop(m *ServiceManager) ([]string, error) {
 		return nil, err
 	}
 
-	// Filter down to only those that are running.
+	// Filter down to only those that are running or in backoff
 	m.servicesLock.Lock()
 	defer m.servicesLock.Unlock()
-	var running []string
+	var notStopped []string
 	for _, name := range stop {
 		s := m.services[name]
-		if s != nil && s.state == stateRunning {
-			running = append(running, name)
+		if s != nil && (s.state == stateRunning || s.state == stateBackoff) {
+			notStopped = append(notStopped, name)
 		}
 	}
-	return running, nil
+	return notStopped, nil
 }

From ae03ff6646a9b3ba4c11bea2ca5a6201553dc0a2 Mon Sep 17 00:00:00 2001
From: Fred Lotter
Date: Fri, 28 Jul 2023 14:04:08 +0200
Subject: [PATCH 2/4] servstate: unit test teardown must stop svcs

Currently, unit test services overlap each other, as there is no explicit
service stop on test tear-down. This is not directly a problem, although
it makes interpreting the unit test output impossible, as the servstate
messages you see during a test also include output from service events of
previous tests.

There is, however, one real issue: services that restart after a unit
test exits (due to an expiring back-off) require the original test reaper
instance to start and wait on the service. The reaper is correctly
destroyed during test teardown, so as a result we constantly get error
messages like:

panic: reaper not started

This PR includes the following work:

- Reorganise the test setup and tear-down logic to allow consistent
  management of servstate manager instances (some tests use a default
  setup template, while others create a test-specific instance).
- Enforce clean service shutdown (running and back-off) during test
  teardown.
- Arrange the unit tests to be first in the file, with helper functions
  at the end, to allow for easy reading.
---
 internals/overlord/servstate/manager_test.go | 647 ++++++++++---------
 1 file changed, 354 insertions(+), 293 deletions(-)

diff --git a/internals/overlord/servstate/manager_test.go b/internals/overlord/servstate/manager_test.go
index 629cf029..de9a1bb1 100644
--- a/internals/overlord/servstate/manager_test.go
+++ b/internals/overlord/servstate/manager_test.go
@@ -77,6 +77,7 @@ type S struct {
 	log          string
 	logBuffer    bytes.Buffer
 	logBufferMut sync.Mutex
+	logOutput    writerFunc
 
 	st *state.State
 
@@ -148,20 +149,10 @@ func (s *S) SetUpTest(c *C) {
 	s.BaseTest.SetUpTest(c)
 	s.st = state.New(nil)
 
-	s.dir = c.MkDir()
-	os.Mkdir(filepath.Join(s.dir, "layers"), 0755)
-
-	s.log = filepath.Join(s.dir, "log.txt")
-	data := fmt.Sprintf(planLayer1, s.log, s.log)
-	err := ioutil.WriteFile(filepath.Join(s.dir, "layers", "001-base.yaml"), []byte(data), 0644)
-	c.Assert(err, IsNil)
-	err = ioutil.WriteFile(filepath.Join(s.dir, "layers", "002-two.yaml"), []byte(planLayer2), 0644)
-	c.Assert(err, IsNil)
-
 	s.logBufferMut.Lock()
 	s.logBuffer.Reset()
 	s.logBufferMut.Unlock()
-	logOutput := writerFunc(func(p []byte) (int, error) {
+	s.logOutput = writerFunc(func(p []byte) (int, error) {
 		s.logBufferMut.Lock()
 		defer s.logBufferMut.Unlock()
 		return s.logBuffer.Write(p)
@@ -169,10 +160,6 @@ func (s *S) SetUpTest(c *C) {
 
 	s.runner = state.NewTaskRunner(s.st)
 	s.stopDaemon = make(chan struct{})
-	manager, err := servstate.NewManager(s.st, s.runner, s.dir, logOutput, testRestarter{s.stopDaemon}, fakeLogManager{})
-	c.Assert(err, IsNil)
-	s.AddCleanup(manager.Stop)
-	s.manager = manager
 
 	restore := servstate.FakeOkayWait(shortOkayDelay)
 	s.AddCleanup(restore)
@@ -181,91 +168,23 @@ func (s *S) SetUpTest(c *C) {
 }
 
 func (s *S) TearDownTest(c *C) {
+	// Make sure all services are stopped
+	s.stopRunningServices(c)
+	// Kill the reaper
+	
s.manager.Stop() + // General test cleanup s.BaseTest.TearDownTest(c) } -type testRestarter struct { - ch chan struct{} -} - -func (r testRestarter) HandleRestart(t restart.RestartType) { - close(r.ch) -} - -func (s *S) assertLog(c *C, expected string) { - s.logBufferMut.Lock() - defer s.logBufferMut.Unlock() - data, err := ioutil.ReadFile(s.log) - if os.IsNotExist(err) { - c.Fatal("Services have not run") - } - c.Assert(err, IsNil) - c.Assert(string(data), Matches, "(?s)"+expected) - c.Assert(s.logBuffer.String(), Matches, "(?s)"+expected) -} - -func (s *S) logBufferString() string { - s.logBufferMut.Lock() - defer s.logBufferMut.Unlock() - str := s.logBuffer.String() - s.logBuffer.Reset() - return str -} - func (s *S) TestDefaultServiceNames(c *C) { + s.setupDefaultServiceManager(c) services, err := s.manager.DefaultServiceNames() c.Assert(err, IsNil) c.Assert(services, DeepEquals, []string{"test1", "test2"}) } -func (s *S) ensure(c *C, n int) { - for i := 0; i < n; i++ { - s.runner.Ensure() - s.runner.Wait() - } -} - -func (s *S) startServices(c *C, services []string, nEnsure int) *state.Change { - s.st.Lock() - ts, err := servstate.Start(s.st, services) - c.Check(err, IsNil) - chg := s.st.NewChange("test", "Start test") - chg.AddAll(ts) - s.st.Unlock() - - // Num to ensure may be more than one due to the cross-task dependencies. - s.ensure(c, nEnsure) - - return chg -} - -func (s *S) stopServices(c *C, services []string, nEnsure int) *state.Change { - s.st.Lock() - ts, err := servstate.Stop(s.st, services) - c.Check(err, IsNil) - chg := s.st.NewChange("test", "Stop test") - chg.AddAll(ts) - s.st.Unlock() - - // Num to ensure may be more than one due to the cross-task dependencies. - s.ensure(c, nEnsure) - - return chg -} - -func (s *S) startTestServices(c *C) { - chg := s.startServices(c, []string{"test1", "test2"}, 2) - s.st.Lock() - c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err())) - s.st.Unlock() - - s.assertLog(c, ".*test1\n.*test2\n") - - cmds := s.manager.RunningCmds() - c.Check(cmds, HasLen, 2) -} - func (s *S) TestStartStopServices(c *C) { + s.setupDefaultServiceManager(c) s.startTestServices(c) if c.Failed() { @@ -276,6 +195,7 @@ func (s *S) TestStartStopServices(c *C) { } func (s *S) TestStartStopServicesIdempotency(c *C) { + s.setupDefaultServiceManager(c) s.startTestServices(c) if c.Failed() { return @@ -294,42 +214,8 @@ func (s *S) TestStartStopServicesIdempotency(c *C) { s.stopTestServicesAlreadyDead(c) } -func (s *S) stopTestServices(c *C) { - cmds := s.manager.RunningCmds() - c.Check(cmds, HasLen, 2) - - chg := s.stopServices(c, []string{"test1", "test2"}, 2) - - // Ensure processes are gone indeed. 
- c.Assert(cmds, HasLen, 2) - for name, cmd := range cmds { - err := cmd.Process.Signal(syscall.Signal(0)) - if err == nil { - c.Fatalf("Process for %q did not stop properly", name) - } else { - c.Check(err, ErrorMatches, ".*process already finished.*") - } - } - - s.st.Lock() - c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err())) - s.st.Unlock() -} - -func (s *S) stopTestServicesAlreadyDead(c *C) { - cmds := s.manager.RunningCmds() - c.Check(cmds, HasLen, 0) - - chg := s.stopServices(c, []string{"test1", "test2"}, 2) - - c.Assert(cmds, HasLen, 0) - - s.st.Lock() - c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err())) - s.st.Unlock() -} - func (s *S) TestStopTimeout(c *C) { + s.setupDefaultServiceManager(c) layer := parseLayer(c, 0, "layer99", ` services: test9: @@ -353,6 +239,7 @@ services: } func (s *S) TestKillDelayIsUsed(c *C) { + s.setupDefaultServiceManager(c) layer4 := parseLayer(c, 0, "layer4", planLayer4) err := s.manager.AppendLayer(layer4) c.Assert(err, IsNil) @@ -379,6 +266,7 @@ func (s *S) TestKillDelayIsUsed(c *C) { } func (s *S) TestReplanServices(c *C) { + s.setupDefaultServiceManager(c) s.startTestServices(c) if c.Failed() { @@ -398,6 +286,7 @@ func (s *S) TestReplanServices(c *C) { } func (s *S) TestReplanUpdatesConfig(c *C) { + s.setupDefaultServiceManager(c) s.startTestServices(c) defer s.stopTestServices(c) @@ -430,6 +319,7 @@ services: } func (s *S) TestStopStartUpdatesConfig(c *C) { + s.setupDefaultServiceManager(c) s.startTestServices(c) defer s.stopTestServices(c) @@ -454,6 +344,7 @@ services: } func (s *S) TestServiceLogs(c *C) { + s.setupDefaultServiceManager(c) outputs := map[string]string{ "test1": `2.* \[test1\] test1\n`, "test2": `2.* \[test2\] test2\n`, @@ -466,34 +357,8 @@ func (s *S) TestServiceLogs(c *C) { s.testServiceLogs(c, outputs) } -func (s *S) testServiceLogs(c *C, outputs map[string]string) { - s.startTestServices(c) - - if c.Failed() { - return - } - - iterators, err := s.manager.ServiceLogs([]string{"test1", "test2"}, -1) - c.Assert(err, IsNil) - c.Assert(iterators, HasLen, 2) - - for serviceName, it := range iterators { - buf := &bytes.Buffer{} - for it.Next(nil) { - _, err = io.Copy(buf, it) - c.Assert(err, IsNil) - } - - c.Assert(buf.String(), Matches, outputs[serviceName]) - - err = it.Close() - c.Assert(err, IsNil) - } - - s.stopTestServices(c) -} - func (s *S) TestStartBadCommand(c *C) { + s.setupDefaultServiceManager(c) chg := s.startServices(c, []string{"test3"}, 1) s.st.Lock() @@ -506,14 +371,11 @@ func (s *S) TestStartBadCommand(c *C) { } func (s *S) TestCurrentUserGroup(c *C) { - // Don't re-use s.manager, because we're adding a layer and service, and - // that would conflict with other tests like TestServiceLogs. 
dir := c.MkDir() err := os.Mkdir(filepath.Join(dir, "layers"), 0755) c.Assert(err, IsNil) - manager, err := servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) + s.manager, err = servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) c.Assert(err, IsNil) - defer manager.Stop() current, err := user.Current() c.Assert(err, IsNil) @@ -529,7 +391,7 @@ services: user: %s group: %s `, outputPath, shortOkayDelay.Seconds()+0.01, current.Username, group.Name)) - err = manager.AppendLayer(layer) + err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) chg := s.startServices(c, []string{"usrtest"}, 1) @@ -543,6 +405,7 @@ services: } func (s *S) TestUserGroupFails(c *C) { + s.setupDefaultServiceManager(c) // Test with non-current user and group will fail due to permission issues // (unless running as root) if os.Getuid() == 0 { @@ -581,6 +444,7 @@ func (s *S) TestUserGroupFails(c *C) { // See .github/workflows/tests.yml for how to run this test as root. func (s *S) TestUserGroup(c *C) { + s.setupDefaultServiceManager(c) if os.Getuid() != 0 { c.Skip("requires running as root") } @@ -620,14 +484,8 @@ services: fmt.Sprintf(`(?s).* \[usrgrp\] %[1]s\n.* \[usrgrp\] user=%[1]s home=/home/%[1]s\n`, username)) } -func (s *S) serviceByName(c *C, name string) *servstate.ServiceInfo { - services, err := s.manager.Services([]string{name}) - c.Assert(err, IsNil) - c.Assert(services, HasLen, 1) - return services[0] -} - func (s *S) TestStartFastExitCommand(c *C) { + s.setupDefaultServiceManager(c) chg := s.startServices(c, []string{"test4"}, 1) s.st.Lock() @@ -642,15 +500,8 @@ func (s *S) TestStartFastExitCommand(c *C) { c.Assert(svc.Current, Equals, servstate.StatusInactive) } -func planYAML(c *C, manager *servstate.ServiceManager) string { - plan, err := manager.Plan() - c.Assert(err, IsNil) - yml, err := yaml.Marshal(plan) - c.Assert(err, IsNil) - return string(yml) -} - func (s *S) TestPlan(c *C) { + s.setupDefaultServiceManager(c) expected := fmt.Sprintf(` services: test1: @@ -679,25 +530,12 @@ services: c.Assert(planYAML(c, s.manager), Equals, expected) } -func parseLayer(c *C, order int, label, layerYAML string) *plan.Layer { - layer, err := plan.ParseLayer(order, label, []byte(layerYAML)) - c.Assert(err, IsNil) - return layer -} - -func (s *S) planLayersHasLen(c *C, manager *servstate.ServiceManager, expectedLen int) { - plan, err := manager.Plan() - c.Assert(err, IsNil) - c.Assert(plan.Layers, HasLen, expectedLen) -} - func (s *S) TestAppendLayer(c *C) { dir := c.MkDir() - os.Mkdir(filepath.Join(dir, "layers"), 0755) - runner := state.NewTaskRunner(s.st) - manager, err := servstate.NewManager(s.st, runner, dir, nil, nil, fakeLogManager{}) + err := os.Mkdir(filepath.Join(dir, "layers"), 0755) + c.Assert(err, IsNil) + s.manager, err = servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) c.Assert(err, IsNil) - defer manager.Stop() // Append a layer when there are no layers. layer := parseLayer(c, 0, "label1", ` @@ -706,16 +544,16 @@ services: override: replace command: /bin/sh `) - err = manager.AppendLayer(layer) + err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 1) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace command: /bin/sh `[1:]) - s.planLayersHasLen(c, manager, 1) + s.planLayersHasLen(c, s.manager, 1) // Try to append a layer when that label already exists. 
layer = parseLayer(c, 0, "label1", ` @@ -724,15 +562,15 @@ services: override: foobar command: /bin/bar `) - err = manager.AppendLayer(layer) + err = s.manager.AppendLayer(layer) c.Assert(err.(*servstate.LabelExists).Label, Equals, "label1") - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace command: /bin/sh `[1:]) - s.planLayersHasLen(c, manager, 1) + s.planLayersHasLen(c, s.manager, 1) // Append another layer on top. layer = parseLayer(c, 0, "label2", ` @@ -741,16 +579,16 @@ services: override: replace command: /bin/bash `) - err = manager.AppendLayer(layer) + err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 2) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace command: /bin/bash `[1:]) - s.planLayersHasLen(c, manager, 2) + s.planLayersHasLen(c, s.manager, 2) // Append a layer with a different service. layer = parseLayer(c, 0, "label3", ` @@ -759,10 +597,10 @@ services: override: replace command: /bin/foo `) - err = manager.AppendLayer(layer) + err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 3) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace @@ -771,16 +609,15 @@ services: override: replace command: /bin/foo `[1:]) - s.planLayersHasLen(c, manager, 3) + s.planLayersHasLen(c, s.manager, 3) } func (s *S) TestCombineLayer(c *C) { dir := c.MkDir() - os.Mkdir(filepath.Join(dir, "layers"), 0755) - runner := state.NewTaskRunner(s.st) - manager, err := servstate.NewManager(s.st, runner, dir, nil, nil, fakeLogManager{}) + err := os.Mkdir(filepath.Join(dir, "layers"), 0755) + c.Assert(err, IsNil) + s.manager, err = servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) c.Assert(err, IsNil) - defer manager.Stop() // "Combine" layer with no layers should just append. layer := parseLayer(c, 0, "label1", ` @@ -789,16 +626,16 @@ services: override: replace command: /bin/sh `) - err = manager.CombineLayer(layer) + err = s.manager.CombineLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 1) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace command: /bin/sh `[1:]) - s.planLayersHasLen(c, manager, 1) + s.planLayersHasLen(c, s.manager, 1) // Combine layer with different label should just append. layer = parseLayer(c, 0, "label2", ` @@ -807,10 +644,10 @@ services: override: replace command: /bin/foo `) - err = manager.CombineLayer(layer) + err = s.manager.CombineLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 2) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace @@ -819,7 +656,7 @@ services: override: replace command: /bin/foo `[1:]) - s.planLayersHasLen(c, manager, 2) + s.planLayersHasLen(c, s.manager, 2) // Combine layer with first layer. 
layer = parseLayer(c, 0, "label1", ` @@ -828,10 +665,10 @@ services: override: replace command: /bin/bash `) - err = manager.CombineLayer(layer) + err = s.manager.CombineLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 1) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace @@ -840,7 +677,7 @@ services: override: replace command: /bin/foo `[1:]) - s.planLayersHasLen(c, manager, 2) + s.planLayersHasLen(c, s.manager, 2) // Combine layer with second layer. layer = parseLayer(c, 0, "label2", ` @@ -849,10 +686,10 @@ services: override: replace command: /bin/bar `) - err = manager.CombineLayer(layer) + err = s.manager.CombineLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 2) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace @@ -861,7 +698,7 @@ services: override: replace command: /bin/bar `[1:]) - s.planLayersHasLen(c, manager, 2) + s.planLayersHasLen(c, s.manager, 2) // One last append for good measure. layer = parseLayer(c, 0, "label3", ` @@ -873,10 +710,10 @@ services: override: replace command: /bin/b `) - err = manager.CombineLayer(layer) + err = s.manager.CombineLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 3) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace @@ -885,16 +722,15 @@ services: override: replace command: /bin/b `[1:]) - s.planLayersHasLen(c, manager, 3) + s.planLayersHasLen(c, s.manager, 3) } func (s *S) TestSetServiceArgs(c *C) { dir := c.MkDir() - os.Mkdir(filepath.Join(dir, "layers"), 0755) - runner := state.NewTaskRunner(s.st) - manager, err := servstate.NewManager(s.st, runner, dir, nil, nil, fakeLogManager{}) + err := os.Mkdir(filepath.Join(dir, "layers"), 0755) + c.Assert(err, IsNil) + s.manager, err = servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) c.Assert(err, IsNil) - defer manager.Stop() // Append a layer with a few services having default args. layer := parseLayer(c, 0, "base-layer", ` @@ -909,19 +745,19 @@ services: override: replace command: foo `) - err = manager.AppendLayer(layer) + err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) c.Assert(layer.Order, Equals, 1) - s.planLayersHasLen(c, manager, 1) + s.planLayersHasLen(c, s.manager, 1) // Set arguments to services. 
serviceArgs := map[string][]string{ "svc1": {"-abc", "--xyz"}, "svc2": {"--bar"}, } - err = manager.SetServiceArgs(serviceArgs) + err = s.manager.SetServiceArgs(serviceArgs) c.Assert(err, IsNil) - c.Assert(planYAML(c, manager), Equals, ` + c.Assert(planYAML(c, s.manager), Equals, ` services: svc1: override: replace @@ -933,10 +769,11 @@ services: override: replace command: foo `[1:]) - s.planLayersHasLen(c, manager, 2) + s.planLayersHasLen(c, s.manager, 2) } func (s *S) TestServices(c *C) { + s.setupDefaultServiceManager(c) started := time.Now() services, err := s.manager.Services(nil) c.Assert(err, IsNil) @@ -971,7 +808,12 @@ func (s *S) TestServices(c *C) { }) } -var planLayerEnv = ` +func (s *S) TestEnvironment(c *C) { + s.setupDefaultServiceManager(c) + // Setup new state and add "envtest" layer + dir := c.MkDir() + logPath := filepath.Join(dir, "log.txt") + layerYAML := fmt.Sprintf(` services: envtest: override: replace @@ -979,13 +821,7 @@ services: environment: PEBBLE_ENV_TEST_1: foo PEBBLE_ENV_TEST_2: bar bazz -` - -func (s *S) TestEnvironment(c *C) { - // Setup new state and add "envtest" layer - dir := c.MkDir() - logPath := filepath.Join(dir, "log.txt") - layerYAML := fmt.Sprintf(planLayerEnv, logPath) +`, logPath) layer := parseLayer(c, 0, "envlayer", layerYAML) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -1017,13 +853,8 @@ PEBBLE_ENV_TEST_PARENT=from-parent `[1:]) } -type writerFunc func([]byte) (int, error) - -func (f writerFunc) Write(p []byte) (int, error) { - return f(p) -} - func (s *S) TestActionRestart(c *C) { + s.setupDefaultServiceManager(c) // Add custom backoff delay so it auto-restarts quickly. layer := parseLayer(c, 0, "layer", ` services: @@ -1099,6 +930,7 @@ services: } func (s *S) TestStopDuringBackoff(c *C) { + s.setupDefaultServiceManager(c) layer := parseLayer(c, 0, "layer", ` services: test2: @@ -1134,6 +966,7 @@ services: } func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) { + s.setupDefaultServiceManager(c) // Create check manager and tell it about plan updates checkMgr := checkstate.NewManager() defer checkMgr.PlanChanged(&plan.Plan{}) @@ -1224,6 +1057,7 @@ checks: } func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) { + s.setupDefaultServiceManager(c) // Create check manager and tell it about plan updates checkMgr := checkstate.NewManager() defer checkMgr.PlanChanged(&plan.Plan{}) @@ -1302,6 +1136,7 @@ checks: } func (s *S) TestOnCheckFailureIgnore(c *C) { + s.setupDefaultServiceManager(c) // Create check manager and tell it about plan updates checkMgr := checkstate.NewManager() defer checkMgr.PlanChanged(&plan.Plan{}) @@ -1374,6 +1209,7 @@ checks: } func (s *S) TestOnCheckFailureShutdown(c *C) { + s.setupDefaultServiceManager(c) // Create check manager and tell it about plan updates checkMgr := checkstate.NewManager() defer checkMgr.PlanChanged(&plan.Plan{}) @@ -1439,18 +1275,8 @@ checks: } } -func (s *S) waitUntilService(c *C, service string, f func(svc *servstate.ServiceInfo) bool) { - for i := 0; i < 310; i++ { - svc := s.serviceByName(c, service) - if f(svc) { - return - } - time.Sleep(10 * time.Millisecond) - } - c.Fatalf("timed out waiting for service") -} - func (s *S) TestActionShutdown(c *C) { + s.setupDefaultServiceManager(c) layer := parseLayer(c, 0, "layer", ` services: test2: @@ -1481,6 +1307,7 @@ services: } func (s *S) TestActionIgnore(c *C) { + s.setupDefaultServiceManager(c) layer := parseLayer(c, 0, "layer", ` services: test2: @@ -1504,6 +1331,7 @@ services: } func (s *S) TestGetAction(c *C) { + 
s.setupDefaultServiceManager(c) tests := []struct { onSuccess plan.ServiceAction onFailure plan.ServiceAction @@ -1558,6 +1386,7 @@ func (s *S) TestGetAction(c *C) { } func (s *S) TestGetJitter(c *C) { + s.setupDefaultServiceManager(c) // It's tricky to test a function that generates randomness, but ensure all // the values are in range, and that the number of values distributed across // each of 3 buckets is reasonable. @@ -1584,6 +1413,7 @@ func (s *S) TestGetJitter(c *C) { } func (s *S) TestCalculateNextBackoff(c *C) { + s.setupDefaultServiceManager(c) tests := []struct { delay time.Duration factor float64 @@ -1618,6 +1448,7 @@ func (s *S) TestCalculateNextBackoff(c *C) { } func (s *S) TestReapZombies(c *C) { + s.setupDefaultServiceManager(c) // Ensure we've been set as a child subreaper isSubreaper, err := getChildSubreaper() c.Assert(err, IsNil) @@ -1689,33 +1520,8 @@ services: } } -func getChildSubreaper() (bool, error) { - var i uintptr - err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0) - if err != nil { - return false, err - } - return i != 0, nil -} - -func createZombie() error { - testExecutable, err := os.Executable() - if err != nil { - return err - } - procAttr := syscall.ProcAttr{ - Env: []string{"PEBBLE_TEST_ZOMBIE_CHILD=1"}, - } - childPid, err := syscall.ForkExec(testExecutable, []string{"zombie-child"}, &procAttr) - if err != nil { - return err - } - fmt.Printf("childPid %d\n", childPid) - time.Sleep(shortOkayDelay + 25*time.Millisecond) - return nil -} - func (s *S) TestStopRunning(c *C) { + s.setupDefaultServiceManager(c) s.startServices(c, []string{"test2"}, 1) s.waitUntilService(c, "test2", func(svc *servstate.ServiceInfo) bool { return svc.Current == servstate.StatusActive @@ -1753,26 +1559,18 @@ func (s *S) TestStopRunning(c *C) { } func (s *S) TestStopRunningNoServices(c *C) { + s.setupDefaultServiceManager(c) taskSet, err := servstate.StopRunning(s.st, s.manager) c.Assert(err, IsNil) c.Assert(taskSet, IsNil) } -type fakeLogManager struct{} - -func (f fakeLogManager) ServiceStarted(serviceName string, logs *servicelog.RingBuffer) { - // no-op -} - func (s *S) TestNoWorkingDir(c *C) { - // Service command should run in current directory (package directory) - // if "working-dir" config option not set. dir := c.MkDir() err := os.Mkdir(filepath.Join(dir, "layers"), 0755) c.Assert(err, IsNil) - manager, err := servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) + s.manager, err = servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) c.Assert(err, IsNil) - defer manager.Stop() outputPath := filepath.Join(dir, "output") layer := parseLayer(c, 0, "layer", fmt.Sprintf(` @@ -1781,9 +1579,11 @@ services: override: replace command: /bin/sh -c "pwd >%s; sleep %g" `, outputPath, shortOkayDelay.Seconds()+0.01)) - err = manager.AppendLayer(layer) + err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) + // Service command should run in current directory (package directory) + // if "working-dir" config option not set. 
chg := s.startServices(c, []string{"nowrkdir"}, 1) s.st.Lock() c.Assert(chg.Err(), IsNil) @@ -1798,9 +1598,8 @@ func (s *S) TestWorkingDir(c *C) { dir := c.MkDir() err := os.Mkdir(filepath.Join(dir, "layers"), 0755) c.Assert(err, IsNil) - manager, err := servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) + s.manager, err = servstate.NewManager(s.st, s.runner, dir, nil, nil, fakeLogManager{}) c.Assert(err, IsNil) - defer manager.Stop() outputPath := filepath.Join(dir, "output") layer := parseLayer(c, 0, "layer", fmt.Sprintf(` @@ -1810,7 +1609,7 @@ services: command: /bin/sh -c "pwd >%s; sleep %g" working-dir: %s `, outputPath, shortOkayDelay.Seconds()+0.01, dir)) - err = manager.AppendLayer(layer) + err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) chg := s.startServices(c, []string{"wrkdir"}, 1) @@ -1822,3 +1621,265 @@ services: c.Assert(err, IsNil) c.Check(string(output), Equals, dir+"\n") } + +// setupDefaultServiceManager provides a basic setup that can be used by many +// of the unit tests without having to create a custom setup. +func (s *S) setupDefaultServiceManager(c *C) { + s.dir = c.MkDir() + s.log = filepath.Join(s.dir, "log.txt") + os.Mkdir(filepath.Join(s.dir, "layers"), 0755) + data := fmt.Sprintf(planLayer1, s.log, s.log) + err := ioutil.WriteFile(filepath.Join(s.dir, "layers", "001-base.yaml"), []byte(data), 0644) + c.Assert(err, IsNil) + err = ioutil.WriteFile(filepath.Join(s.dir, "layers", "002-two.yaml"), []byte(planLayer2), 0644) + c.Assert(err, IsNil) + + s.manager, err = servstate.NewManager(s.st, s.runner, s.dir, s.logOutput, testRestarter{s.stopDaemon}, fakeLogManager{}) + c.Assert(err, IsNil) +} + +// Make sure services are all stopped before the next test starts. +func (s *S) stopRunningServices(c *C) { + taskSet, err := servstate.StopRunning(s.st, s.manager) + c.Assert(err, IsNil) + + if taskSet == nil { + return + } + + // One change to stop them all. + s.st.Lock() + chg := s.st.NewChange("stop", "Stop all running services") + chg.AddAll(taskSet) + s.st.EnsureBefore(0) // start operation right away + s.st.Unlock() + + // Wait for a limited amount of time for them to stop. 
+ timeout := time.After(10 * time.Second) + for { + s.runner.Ensure() + s.runner.Wait() + + // Exit loop if change is complete + select { + case <-chg.Ready(): + return + case <-timeout: + c.Fatal("timeout waiting for services to stop") + default: + } + } +} + +type testRestarter struct { + ch chan struct{} +} + +func (r testRestarter) HandleRestart(t restart.RestartType) { + close(r.ch) +} + +func (s *S) assertLog(c *C, expected string) { + s.logBufferMut.Lock() + defer s.logBufferMut.Unlock() + data, err := ioutil.ReadFile(s.log) + if os.IsNotExist(err) { + c.Fatal("Services have not run") + } + c.Assert(err, IsNil) + c.Assert(string(data), Matches, "(?s)"+expected) + c.Assert(s.logBuffer.String(), Matches, "(?s)"+expected) +} + +func (s *S) logBufferString() string { + s.logBufferMut.Lock() + defer s.logBufferMut.Unlock() + str := s.logBuffer.String() + s.logBuffer.Reset() + return str +} + +func (s *S) testServiceLogs(c *C, outputs map[string]string) { + s.startTestServices(c) + + if c.Failed() { + return + } + + iterators, err := s.manager.ServiceLogs([]string{"test1", "test2"}, -1) + c.Assert(err, IsNil) + c.Assert(iterators, HasLen, 2) + + for serviceName, it := range iterators { + buf := &bytes.Buffer{} + for it.Next(nil) { + _, err = io.Copy(buf, it) + c.Assert(err, IsNil) + } + + c.Assert(buf.String(), Matches, outputs[serviceName]) + + err = it.Close() + c.Assert(err, IsNil) + } + + s.stopTestServices(c) +} + +func (s *S) ensure(c *C, n int) { + for i := 0; i < n; i++ { + s.runner.Ensure() + s.runner.Wait() + } +} + +func (s *S) startServices(c *C, services []string, nEnsure int) *state.Change { + s.st.Lock() + ts, err := servstate.Start(s.st, services) + c.Check(err, IsNil) + chg := s.st.NewChange("test", "Start test") + chg.AddAll(ts) + s.st.Unlock() + + // Num to ensure may be more than one due to the cross-task dependencies. + s.ensure(c, nEnsure) + + return chg +} + +func (s *S) stopServices(c *C, services []string, nEnsure int) *state.Change { + s.st.Lock() + ts, err := servstate.Stop(s.st, services) + c.Check(err, IsNil) + chg := s.st.NewChange("test", "Stop test") + chg.AddAll(ts) + s.st.Unlock() + + // Num to ensure may be more than one due to the cross-task dependencies. + s.ensure(c, nEnsure) + + return chg +} + +func (s *S) serviceByName(c *C, name string) *servstate.ServiceInfo { + services, err := s.manager.Services([]string{name}) + c.Assert(err, IsNil) + c.Assert(services, HasLen, 1) + return services[0] +} + +func (s *S) startTestServices(c *C) { + chg := s.startServices(c, []string{"test1", "test2"}, 2) + s.st.Lock() + c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err())) + s.st.Unlock() + + s.assertLog(c, ".*test1\n.*test2\n") + + cmds := s.manager.RunningCmds() + c.Check(cmds, HasLen, 2) +} + +func (s *S) stopTestServices(c *C) { + cmds := s.manager.RunningCmds() + c.Check(cmds, HasLen, 2) + + chg := s.stopServices(c, []string{"test1", "test2"}, 2) + + // Ensure processes are gone indeed. 
+	c.Assert(cmds, HasLen, 2)
+	for name, cmd := range cmds {
+		err := cmd.Process.Signal(syscall.Signal(0))
+		if err == nil {
+			c.Fatalf("Process for %q did not stop properly", name)
+		} else {
+			c.Check(err, ErrorMatches, ".*process already finished.*")
+		}
+	}
+
+	s.st.Lock()
+	c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err()))
+	s.st.Unlock()
+}
+
+func (s *S) stopTestServicesAlreadyDead(c *C) {
+	cmds := s.manager.RunningCmds()
+	c.Check(cmds, HasLen, 0)
+
+	chg := s.stopServices(c, []string{"test1", "test2"}, 2)
+
+	c.Assert(cmds, HasLen, 0)
+
+	s.st.Lock()
+	c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err()))
+	s.st.Unlock()
+}
+
+func planYAML(c *C, manager *servstate.ServiceManager) string {
+	plan, err := manager.Plan()
+	c.Assert(err, IsNil)
+	yml, err := yaml.Marshal(plan)
+	c.Assert(err, IsNil)
+	return string(yml)
+}
+
+func parseLayer(c *C, order int, label, layerYAML string) *plan.Layer {
+	layer, err := plan.ParseLayer(order, label, []byte(layerYAML))
+	c.Assert(err, IsNil)
+	return layer
+}
+
+func (s *S) planLayersHasLen(c *C, manager *servstate.ServiceManager, expectedLen int) {
+	plan, err := manager.Plan()
+	c.Assert(err, IsNil)
+	c.Assert(plan.Layers, HasLen, expectedLen)
+}
+
+type writerFunc func([]byte) (int, error)
+
+func (f writerFunc) Write(p []byte) (int, error) {
+	return f(p)
+}
+
+func (s *S) waitUntilService(c *C, service string, f func(svc *servstate.ServiceInfo) bool) {
+	for i := 0; i < 310; i++ {
+		svc := s.serviceByName(c, service)
+		if f(svc) {
+			return
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+	c.Fatalf("timed out waiting for service")
+}
+
+func getChildSubreaper() (bool, error) {
+	var i uintptr
+	err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0)
+	if err != nil {
+		return false, err
+	}
+	return i != 0, nil
+}
+
+func createZombie() error {
+	testExecutable, err := os.Executable()
+	if err != nil {
+		return err
+	}
+	procAttr := syscall.ProcAttr{
+		Env: []string{"PEBBLE_TEST_ZOMBIE_CHILD=1"},
+	}
+	childPid, err := syscall.ForkExec(testExecutable, []string{"zombie-child"}, &procAttr)
+	if err != nil {
+		return err
+	}
+	fmt.Printf("childPid %d\n", childPid)
+	time.Sleep(shortOkayDelay + 25*time.Millisecond)
+	return nil
+}
+
+type fakeLogManager struct{}
+
+func (f fakeLogManager) ServiceStarted(serviceName string, logs *servicelog.RingBuffer) {
+	// no-op
+}

From 5566aae3fb5327437284ccacbe039cc2a9f0270a Mon Sep 17 00:00:00 2001
From: Fred Lotter
Date: Fri, 28 Jul 2023 16:45:32 +0200
Subject: [PATCH 3/4] servstate: remove test races on svcs file output

Unit tests for servstate are extremely timing-sensitive. If you run the
unit tests under heavy CPU load, chances are very good that you will
start to see various failures. The easiest way to demonstrate this is to
use the stress utility to simulate heavy CPU load (stress --io 12 --cpu 12).

Unit test output snippet:

START: manager_test.go:828: S.TestEnvironment
... obtained string = ""
... expected string = "" +
...     "PEBBLE_ENV_TEST_1=foo\n" +
...     "PEBBLE_ENV_TEST_2=bar bazz\n" +
...     "PEBBLE_ENV_TEST_PARENT=from-parent\n"
FAIL: manager_test.go:828: S.TestEnvironment

This specific race condition is caused because unit tests often make
assumptions about how long it takes from the moment the service is
started until the moment the service command completes.

This PR will only address this very specific kind of race, and hopefully
in the process provide a solution that will be used in future tests.
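To make the pattern concrete, here is a minimal, self-contained sketch of
the synchronisation idea, independent of the test suite. The package and
function names are illustrative only; the actual helper added in the diff
below is waitForDoneFile, which reports failure via gocheck rather than
returning an error, and the service side gains a trailing
"sync; touch <done-file>" in its shell command:

    package example

    import (
        "fmt"
        "os"
        "time"
    )

    // The service command flushes its output and then creates a "done"
    // marker, e.g.:
    //   /bin/sh -c "echo output >file; sync; touch <done-file>; sleep 10"
    //
    // waitForFile polls until the marker exists (then consumes it so the
    // mechanism can be reused), or fails once the timeout expires.
    func waitForFile(path string, timeout time.Duration) error {
        deadline := time.Now().Add(timeout)
        for time.Now().Before(deadline) {
            if _, err := os.Stat(path); err == nil {
                os.Remove(path) // consume the marker for reuse
                return nil
            }
            time.Sleep(time.Millisecond)
        }
        return fmt.Errorf("timed out waiting for %s", path)
    }

Polling for a marker file is crude but robust under load: the test makes
no assumption about how quickly the shell command runs, only that it
eventually signals completion.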
Please note that using the stress utility on tests from this package will still trigger lots of remaining races caused by very timing sensitive tests. --- internals/overlord/servstate/manager_test.go | 174 ++++++++++--------- 1 file changed, 93 insertions(+), 81 deletions(-) diff --git a/internals/overlord/servstate/manager_test.go b/internals/overlord/servstate/manager_test.go index de9a1bb1..3e7c392b 100644 --- a/internals/overlord/servstate/manager_test.go +++ b/internals/overlord/servstate/manager_test.go @@ -84,6 +84,8 @@ type S struct { manager *servstate.ServiceManager runner *state.TaskRunner stopDaemon chan struct{} + + donePath string } var _ = Suite(&S{}) @@ -165,6 +167,10 @@ func (s *S) SetUpTest(c *C) { s.AddCleanup(restore) restore = servstate.FakeKillFailDelay(shortKillDelay, shortFailDelay) s.AddCleanup(restore) + + // Sync feature to ensure service action is complete. See + // waitForDoneFile() + s.donePath = filepath.Join(c.MkDir(), "done") } func (s *S) TearDownTest(c *C) { @@ -387,10 +393,10 @@ func (s *S) TestCurrentUserGroup(c *C) { services: usrtest: override: replace - command: /bin/sh -c "id -n -u >%s; sleep %g" + command: /bin/sh -c "id -n -u >%s; sync; touch %s; sleep %g" user: %s group: %s -`, outputPath, shortOkayDelay.Seconds()+0.01, current.Username, group.Name)) +`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01, current.Username, group.Name)) err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -399,6 +405,8 @@ services: c.Assert(chg.Err(), IsNil) s.st.Unlock() + s.waitForDoneFile(c) + output, err := ioutil.ReadFile(outputPath) c.Assert(err, IsNil) c.Check(string(output), Equals, current.Username+"\n") @@ -817,11 +825,11 @@ func (s *S) TestEnvironment(c *C) { services: envtest: override: replace - command: /bin/sh -c "env | grep PEBBLE_ENV_TEST | sort > %s; sleep 10" + command: /bin/sh -c "env | grep PEBBLE_ENV_TEST | sort > %s; sync; touch %s; sleep 10" environment: PEBBLE_ENV_TEST_1: foo PEBBLE_ENV_TEST_2: bar bazz -`, logPath) +`, logPath, s.donePath) layer := parseLayer(c, 0, "envlayer", layerYAML) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -840,6 +848,8 @@ services: c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err())) s.st.Unlock() + s.waitForDoneFile(c) + // Ensure it read environment variables correctly data, err := ioutil.ReadFile(logPath) if os.IsNotExist(err) { @@ -981,7 +991,7 @@ func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) { services: test2: override: replace - command: /bin/sh -c 'echo x >>%s; sleep 1' + command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1' backoff-delay: 50ms on-check-failure: chk1: restart @@ -993,22 +1003,18 @@ checks: threshold: 1 exec: command: will-fail -`, tempFile)) +`, tempFile, s.donePath)) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) // Start service and wait till it starts up (output file is written to) s.startServices(c, []string{"test2"}, 1) - for i := 0; ; i++ { - if i >= 100 { - c.Fatalf("failed waiting for command to start") - } - b, _ := ioutil.ReadFile(tempFile) - if string(b) == "x\n" { - break - } - time.Sleep(5 * time.Millisecond) - } + + s.waitForDoneFile(c) + + b, err := ioutil.ReadFile(tempFile) + c.Assert(err, IsNil) + c.Assert(string(b), Equals, "x\n") // Now wait till check happens (it will-fail) for i := 0; ; i++ { @@ -1029,21 +1035,16 @@ checks: s.waitUntilService(c, "test2", func(svc *servstate.ServiceInfo) bool { return svc.Current == servstate.StatusBackoff }) - for i := 0; ; i++ { - if i >= 100 { - 
c.Fatalf("failed waiting for command to start") - } - b, err := ioutil.ReadFile(tempFile) - c.Assert(err, IsNil) - if string(b) == "x\nx\n" { - break - } - time.Sleep(5 * time.Millisecond) - } + + s.waitForDoneFile(c) + + b, err = ioutil.ReadFile(tempFile) + c.Assert(err, IsNil) + c.Assert(string(b), Equals, "x\nx\n") // Shouldn't be restarted again time.Sleep(100 * time.Millisecond) - b, err := ioutil.ReadFile(tempFile) + b, err = ioutil.ReadFile(tempFile) c.Assert(err, IsNil) c.Assert(string(b), Equals, "x\nx\n") checks, err := checkMgr.Checks() @@ -1072,7 +1073,7 @@ func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) { services: test2: override: replace - command: /bin/sh -c 'echo x >>%s; sleep 0.075' + command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 0.075' backoff-delay: 50ms backoff-factor: 100 # ensure it only backoff-restarts once on-check-failure: @@ -1085,22 +1086,18 @@ checks: threshold: 1 exec: command: will-fail -`, tempFile)) +`, tempFile, s.donePath)) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) // Start service and wait till it starts up (output file is written to) s.startServices(c, []string{"test2"}, 1) - for i := 0; ; i++ { - if i >= 100 { - c.Fatalf("failed waiting for command to start") - } - b, _ := ioutil.ReadFile(tempFile) - if string(b) == "x\n" { - break - } - time.Sleep(5 * time.Millisecond) - } + + s.waitForDoneFile(c) + + b, err := ioutil.ReadFile(tempFile) + c.Assert(err, IsNil) + c.Assert(string(b), Equals, "x\n") // Ensure it exits and goes into backoff state s.waitUntilService(c, "test2", func(svc *servstate.ServiceInfo) bool { @@ -1108,24 +1105,20 @@ checks: }) // Check failure should wait for current backoff (after which it will be restarted) - for i := 0; ; i++ { - if i >= 100 { - c.Fatalf("failed waiting for command to start") - } - b, err := ioutil.ReadFile(tempFile) - c.Assert(err, IsNil) - if string(b) == "x\nx\n" { - break - } - time.Sleep(5 * time.Millisecond) - } + + s.waitForDoneFile(c) + + b, err = ioutil.ReadFile(tempFile) + c.Assert(err, IsNil) + c.Assert(string(b), Equals, "x\nx\n") + svc := s.serviceByName(c, "test2") c.Assert(svc.Current, Equals, servstate.StatusActive) c.Assert(s.manager.BackoffNum("test2"), Equals, 1) // Shouldn't be restarted again time.Sleep(125 * time.Millisecond) - b, err := ioutil.ReadFile(tempFile) + b, err = ioutil.ReadFile(tempFile) c.Assert(err, IsNil) c.Assert(string(b), Equals, "x\nx\n") checks, err := checkMgr.Checks() @@ -1151,7 +1144,7 @@ func (s *S) TestOnCheckFailureIgnore(c *C) { services: test2: override: replace - command: /bin/sh -c 'echo x >>%s; sleep 1' + command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1' on-check-failure: chk1: ignore @@ -1162,22 +1155,18 @@ checks: threshold: 1 exec: command: will-fail -`, tempFile)) +`, tempFile, s.donePath)) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) // Start service and wait till it starts up (output file is written to) s.startServices(c, []string{"test2"}, 1) - for i := 0; ; i++ { - if i >= 100 { - c.Fatalf("failed waiting for command to start") - } - b, _ := ioutil.ReadFile(tempFile) - if string(b) == "x\n" { - break - } - time.Sleep(5 * time.Millisecond) - } + + s.waitForDoneFile(c) + + b, err := ioutil.ReadFile(tempFile) + c.Assert(err, IsNil) + c.Assert(string(b), Equals, "x\n") // Now wait till check happens (it will-fail) for i := 0; ; i++ { @@ -1196,7 +1185,7 @@ checks: // Service shouldn't have been restarted time.Sleep(100 * time.Millisecond) - b, err := ioutil.ReadFile(tempFile) + b, err = 
ioutil.ReadFile(tempFile) c.Assert(err, IsNil) c.Assert(string(b), Equals, "x\n") checks, err := checkMgr.Checks() @@ -1224,7 +1213,7 @@ func (s *S) TestOnCheckFailureShutdown(c *C) { services: test2: override: replace - command: /bin/sh -c 'echo x >>%s; sleep 1' + command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1' on-check-failure: chk1: shutdown @@ -1235,22 +1224,18 @@ checks: threshold: 1 exec: command: will-fail -`, tempFile)) +`, tempFile, s.donePath)) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) // Start service and wait till it starts up (output file is written to) s.startServices(c, []string{"test2"}, 1) - for i := 0; ; i++ { - if i >= 100 { - c.Fatalf("failed waiting for command to start") - } - b, _ := ioutil.ReadFile(tempFile) - if string(b) == "x\n" { - break - } - time.Sleep(5 * time.Millisecond) - } + + s.waitForDoneFile(c) + + b, err := ioutil.ReadFile(tempFile) + c.Assert(err, IsNil) + c.Assert(string(b), Equals, "x\n") // Now wait till check happens (it will-fail) for i := 0; ; i++ { @@ -1577,8 +1562,8 @@ func (s *S) TestNoWorkingDir(c *C) { services: nowrkdir: override: replace - command: /bin/sh -c "pwd >%s; sleep %g" -`, outputPath, shortOkayDelay.Seconds()+0.01)) + command: /bin/sh -c "pwd >%s; sync; touch %s; sleep %g" +`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01)) err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -1589,6 +1574,8 @@ services: c.Assert(chg.Err(), IsNil) s.st.Unlock() + s.waitForDoneFile(c) + output, err := ioutil.ReadFile(outputPath) c.Assert(err, IsNil) c.Check(string(output), Matches, ".*/overlord/servstate\n") @@ -1606,9 +1593,9 @@ func (s *S) TestWorkingDir(c *C) { services: wrkdir: override: replace - command: /bin/sh -c "pwd >%s; sleep %g" + command: /bin/sh -c "pwd >%s; sync; touch %s; sleep %g" working-dir: %s -`, outputPath, shortOkayDelay.Seconds()+0.01, dir)) +`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01, dir)) err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -1617,6 +1604,8 @@ services: c.Assert(chg.Err(), IsNil) s.st.Unlock() + s.waitForDoneFile(c) + output, err := ioutil.ReadFile(outputPath) c.Assert(err, IsNil) c.Check(string(output), Equals, dir+"\n") @@ -1638,6 +1627,29 @@ func (s *S) setupDefaultServiceManager(c *C) { c.Assert(err, IsNil) } +// Return on timeout or when the file appears. This is used to determine +// when the expected service output is actually available, not when the +// service starts to run. +func (s *S) waitForDoneFile(c *C) { + timeout := time.After(10 * time.Second) + ticker := time.NewTicker(time.Millisecond) + for { + select { + case <-timeout: + c.Fatal("timeout waiting for service output") + + case <-ticker.C: + stat, err := os.Stat(s.donePath) + if err == nil && stat.Mode().IsRegular() { + // Delete it so we can reuse this feature + // in the same test. + os.Remove(s.donePath) + return + } + } + } +} + // Make sure services are all stopped before the next test starts. func (s *S) stopRunningServices(c *C) { taskSet, err := servstate.StopRunning(s.st, s.manager) From a93509e2327f2e7441ff1cff34b19f1f7356d9c6 Mon Sep 17 00:00:00 2001 From: Fred Lotter Date: Tue, 1 Aug 2023 13:07:31 +0200 Subject: [PATCH 4/4] servstate: unify log assert and done check The startTestServices() test helper uses a special entry in the service command under test to write the service standard output also to a log file that can be inspected. 
This mechanism suffers from a race condition (as highlighted in
https://github.com/canonical/pebble/issues/264) because when the content
of the log file is loaded, the service may not yet have completed writing
to the log.

Since standard output is also verified separately through a different
mechanism, the following changes are made:

- Enhance the global "done check" (previously called "done file") to
  check completion per service. This effectively adds the capability
  previously provided by the log assert mechanism.
- Use the existing "done check" mechanism to wait until the service
  command line is complete up to the point of the check.
- Only now verify the stdout buffer content as checked previously.
- Remove the log mechanism altogether.
---
 internals/overlord/servstate/manager_test.go | 184 ++++++++++---------
 1 file changed, 94 insertions(+), 90 deletions(-)

diff --git a/internals/overlord/servstate/manager_test.go b/internals/overlord/servstate/manager_test.go
index 3e7c392b..48a360c0 100644
--- a/internals/overlord/servstate/manager_test.go
+++ b/internals/overlord/servstate/manager_test.go
@@ -73,8 +73,9 @@ func Test(t *testing.T) { TestingT(t) }
 type S struct {
 	testutil.BaseTest
 
-	dir string
-	log string
+	// Unique tmp directory for each test
+	dir string
+
 	logBuffer    bytes.Buffer
 	logBufferMut sync.Mutex
 	logOutput    writerFunc
@@ -94,7 +95,7 @@ var planLayer1 = `
 services:
     test1:
         override: replace
-        command: /bin/sh -c "echo test1 | tee -a %s; sleep 10"
+        command: /bin/sh -c "echo test1; %s; sleep 10"
         startup: enabled
         requires:
             - test2
@@ -103,7 +104,7 @@ services:
 
     test2:
         override: replace
-        command: /bin/sh -c "echo test2 | tee -a %s; sleep 10"
+        command: /bin/sh -c "echo test2; %s; sleep 10"
 `
 
 var planLayer2 = `
@@ -127,7 +128,7 @@ var planLayer3 = `
 services:
     test2:
         override: merge
-        command: /bin/sh -c "echo test2b | tee -a %s; sleep 10"
+        command: /bin/sh -c "echo test2b; sleep 10"
 `
 
 var planLayer4 = `
@@ -149,6 +150,8 @@ func (s *S) SetUpSuite(c *C) {
 
 func (s *S) SetUpTest(c *C) {
 	s.BaseTest.SetUpTest(c)
+
+	s.dir = c.MkDir()
 	s.st = state.New(nil)
 
 	s.logBufferMut.Lock()
@@ -167,10 +170,6 @@ func (s *S) SetUpTest(c *C) {
 	s.AddCleanup(restore)
 	restore = servstate.FakeKillFailDelay(shortKillDelay, shortFailDelay)
 	s.AddCleanup(restore)
-
-	// Sync feature to ensure service action is complete. See
-	// waitForDoneFile()
-	s.donePath = filepath.Join(c.MkDir(), "done")
 }
 
 func (s *S) TearDownTest(c *C) {
@@ -191,7 +190,7 @@ func (s *S) TestDefaultServiceNames(c *C) {
 
 func (s *S) TestStartStopServices(c *C) {
 	s.setupDefaultServiceManager(c)
-	s.startTestServices(c)
+	s.startTestServices(c, true)
 
 	if c.Failed() {
 		return
@@ -202,12 +201,14 @@ func (s *S) TestStartStopServicesIdempotency(c *C) {
 	s.setupDefaultServiceManager(c)
-	s.startTestServices(c)
+	s.startTestServices(c, true)
 	if c.Failed() {
 		return
 	}
 
-	s.startTestServices(c)
+	// Do not check the logs again, the service
+	// is not actually restarted here.
+ s.startTestServices(c, false) if c.Failed() { return } @@ -273,7 +274,7 @@ func (s *S) TestKillDelayIsUsed(c *C) { func (s *S) TestReplanServices(c *C) { s.setupDefaultServiceManager(c) - s.startTestServices(c) + s.startTestServices(c, true) if c.Failed() { return @@ -293,7 +294,7 @@ func (s *S) TestReplanServices(c *C) { func (s *S) TestReplanUpdatesConfig(c *C) { s.setupDefaultServiceManager(c) - s.startTestServices(c) + s.startTestServices(c, true) defer s.stopTestServices(c) // Ensure the ServiceManager's config reflects the plan config. @@ -326,7 +327,7 @@ services: func (s *S) TestStopStartUpdatesConfig(c *C) { s.setupDefaultServiceManager(c) - s.startTestServices(c) + s.startTestServices(c, true) defer s.stopTestServices(c) // Add a layer and override a couple of values. @@ -342,7 +343,7 @@ services: // Call Stop and Start and ensure the ServiceManager's config has updated. s.stopTestServices(c) - s.startTestServices(c) + s.startTestServices(c, true) config := s.manager.Config("test2") c.Assert(config, NotNil) c.Check(config.OnSuccess, Equals, plan.ActionIgnore) @@ -393,10 +394,10 @@ func (s *S) TestCurrentUserGroup(c *C) { services: usrtest: override: replace - command: /bin/sh -c "id -n -u >%s; sync; touch %s; sleep %g" + command: /bin/sh -c "id -n -u >%s; sync; %s; sleep %g" user: %s group: %s -`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01, current.Username, group.Name)) +`, outputPath, s.insertDoneCheck(c, "usrtest"), shortOkayDelay.Seconds()+0.01, current.Username, group.Name)) err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -405,7 +406,7 @@ services: c.Assert(chg.Err(), IsNil) s.st.Unlock() - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "usrtest") output, err := ioutil.ReadFile(outputPath) c.Assert(err, IsNil) @@ -515,14 +516,14 @@ services: test1: startup: enabled override: replace - command: /bin/sh -c "echo test1 | tee -a %s; sleep 10" + command: /bin/sh -c "echo test1; %s; sleep 10" before: - test2 requires: - test2 test2: override: replace - command: /bin/sh -c "echo test2 | tee -a %s; sleep 10" + command: /bin/sh -c "echo test2; %s; sleep 10" test3: override: replace command: some-bad-command @@ -534,7 +535,7 @@ services: command: /bin/sh -c "sleep 10" user: nobody group: nogroup -`[1:], s.log, s.log) +`[1:], s.insertDoneCheck(c, "test1"), s.insertDoneCheck(c, "test2")) c.Assert(planYAML(c, s.manager), Equals, expected) } @@ -825,11 +826,11 @@ func (s *S) TestEnvironment(c *C) { services: envtest: override: replace - command: /bin/sh -c "env | grep PEBBLE_ENV_TEST | sort > %s; sync; touch %s; sleep 10" + command: /bin/sh -c "env | grep PEBBLE_ENV_TEST | sort > %s; %s; sleep 10" environment: PEBBLE_ENV_TEST_1: foo PEBBLE_ENV_TEST_2: bar bazz -`, logPath, s.donePath) +`, logPath, s.insertDoneCheck(c, "envtest")) layer := parseLayer(c, 0, "envlayer", layerYAML) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -848,7 +849,7 @@ services: c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err())) s.st.Unlock() - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "envtest") // Ensure it read environment variables correctly data, err := ioutil.ReadFile(logPath) @@ -991,7 +992,7 @@ func (s *S) TestOnCheckFailureRestartWhileRunning(c *C) { services: test2: override: replace - command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1' + command: /bin/sh -c 'echo x >>%s; %s; sleep 1' backoff-delay: 50ms on-check-failure: chk1: restart @@ -1003,14 +1004,14 @@ checks: threshold: 1 exec: command: will-fail -`, tempFile, s.donePath)) 
+`, tempFile, s.insertDoneCheck(c, "test2"))) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) // Start service and wait till it starts up (output file is written to) s.startServices(c, []string{"test2"}, 1) - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "test2") b, err := ioutil.ReadFile(tempFile) c.Assert(err, IsNil) @@ -1036,7 +1037,7 @@ checks: return svc.Current == servstate.StatusBackoff }) - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "test2") b, err = ioutil.ReadFile(tempFile) c.Assert(err, IsNil) @@ -1073,7 +1074,7 @@ func (s *S) TestOnCheckFailureRestartDuringBackoff(c *C) { services: test2: override: replace - command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 0.075' + command: /bin/sh -c 'echo x >>%s; %s; sleep 0.075' backoff-delay: 50ms backoff-factor: 100 # ensure it only backoff-restarts once on-check-failure: @@ -1086,14 +1087,14 @@ checks: threshold: 1 exec: command: will-fail -`, tempFile, s.donePath)) +`, tempFile, s.insertDoneCheck(c, "test2"))) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) // Start service and wait till it starts up (output file is written to) s.startServices(c, []string{"test2"}, 1) - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "test2") b, err := ioutil.ReadFile(tempFile) c.Assert(err, IsNil) @@ -1106,7 +1107,7 @@ checks: // Check failure should wait for current backoff (after which it will be restarted) - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "test2") b, err = ioutil.ReadFile(tempFile) c.Assert(err, IsNil) @@ -1144,7 +1145,7 @@ func (s *S) TestOnCheckFailureIgnore(c *C) { services: test2: override: replace - command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1' + command: /bin/sh -c 'echo x >>%s; %s; sleep 1' on-check-failure: chk1: ignore @@ -1155,14 +1156,14 @@ checks: threshold: 1 exec: command: will-fail -`, tempFile, s.donePath)) +`, tempFile, s.insertDoneCheck(c, "test2"))) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) // Start service and wait till it starts up (output file is written to) s.startServices(c, []string{"test2"}, 1) - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "test2") b, err := ioutil.ReadFile(tempFile) c.Assert(err, IsNil) @@ -1213,7 +1214,7 @@ func (s *S) TestOnCheckFailureShutdown(c *C) { services: test2: override: replace - command: /bin/sh -c 'echo x >>%s; sync; touch %s; sleep 1' + command: /bin/sh -c 'echo x >>%s; %s; sleep 1' on-check-failure: chk1: shutdown @@ -1224,14 +1225,14 @@ checks: threshold: 1 exec: command: will-fail -`, tempFile, s.donePath)) +`, tempFile, s.insertDoneCheck(c, "test2"))) err := s.manager.AppendLayer(layer) c.Assert(err, IsNil) // Start service and wait till it starts up (output file is written to) s.startServices(c, []string{"test2"}, 1) - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "test2") b, err := ioutil.ReadFile(tempFile) c.Assert(err, IsNil) @@ -1562,8 +1563,8 @@ func (s *S) TestNoWorkingDir(c *C) { services: nowrkdir: override: replace - command: /bin/sh -c "pwd >%s; sync; touch %s; sleep %g" -`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01)) + command: /bin/sh -c "pwd >%s; %s; sleep %g" +`, outputPath, s.insertDoneCheck(c, "nowrkdir"), shortOkayDelay.Seconds()+0.01)) err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -1574,7 +1575,7 @@ services: c.Assert(chg.Err(), IsNil) s.st.Unlock() - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "nowrkdir") output, err := ioutil.ReadFile(outputPath) c.Assert(err, IsNil) @@ -1593,9 +1594,9 @@ func (s *S) TestWorkingDir(c *C) { services: wrkdir: override: replace - command: /bin/sh -c 
"pwd >%s; sync; touch %s; sleep %g" + command: /bin/sh -c "pwd >%s; %s; sleep %g" working-dir: %s -`, outputPath, s.donePath, shortOkayDelay.Seconds()+0.01, dir)) +`, outputPath, s.insertDoneCheck(c, "wrkdir"), shortOkayDelay.Seconds()+0.01, dir)) err = s.manager.AppendLayer(layer) c.Assert(err, IsNil) @@ -1604,7 +1605,7 @@ services: c.Assert(chg.Err(), IsNil) s.st.Unlock() - s.waitForDoneFile(c) + s.waitForDoneCheck(c, "wrkdir") output, err := ioutil.ReadFile(outputPath) c.Assert(err, IsNil) @@ -1614,42 +1615,19 @@ services: // setupDefaultServiceManager provides a basic setup that can be used by many // of the unit tests without having to create a custom setup. func (s *S) setupDefaultServiceManager(c *C) { - s.dir = c.MkDir() - s.log = filepath.Join(s.dir, "log.txt") - os.Mkdir(filepath.Join(s.dir, "layers"), 0755) - data := fmt.Sprintf(planLayer1, s.log, s.log) - err := ioutil.WriteFile(filepath.Join(s.dir, "layers", "001-base.yaml"), []byte(data), 0644) + layers := filepath.Join(s.dir, "layers") + err := os.Mkdir(layers, 0755) + c.Assert(err, IsNil) + data := fmt.Sprintf(planLayer1, s.insertDoneCheck(c, "test1"), s.insertDoneCheck(c, "test2")) + err = ioutil.WriteFile(filepath.Join(layers, "001-base.yaml"), []byte(data), 0644) c.Assert(err, IsNil) - err = ioutil.WriteFile(filepath.Join(s.dir, "layers", "002-two.yaml"), []byte(planLayer2), 0644) + err = ioutil.WriteFile(filepath.Join(layers, "002-two.yaml"), []byte(planLayer2), 0644) c.Assert(err, IsNil) s.manager, err = servstate.NewManager(s.st, s.runner, s.dir, s.logOutput, testRestarter{s.stopDaemon}, fakeLogManager{}) c.Assert(err, IsNil) } -// Return on timeout or when the file appears. This is used to determine -// when the expected service output is actually available, not when the -// service starts to run. -func (s *S) waitForDoneFile(c *C) { - timeout := time.After(10 * time.Second) - ticker := time.NewTicker(time.Millisecond) - for { - select { - case <-timeout: - c.Fatal("timeout waiting for service output") - - case <-ticker.C: - stat, err := os.Stat(s.donePath) - if err == nil && stat.Mode().IsRegular() { - // Delete it so we can reuse this feature - // in the same test. - os.Remove(s.donePath) - return - } - } - } -} - // Make sure services are all stopped before the next test starts. 
func (s *S) stopRunningServices(c *C) { taskSet, err := servstate.StopRunning(s.st, s.manager) @@ -1691,18 +1669,6 @@ func (r testRestarter) HandleRestart(t restart.RestartType) { close(r.ch) } -func (s *S) assertLog(c *C, expected string) { - s.logBufferMut.Lock() - defer s.logBufferMut.Unlock() - data, err := ioutil.ReadFile(s.log) - if os.IsNotExist(err) { - c.Fatal("Services have not run") - } - c.Assert(err, IsNil) - c.Assert(string(data), Matches, "(?s)"+expected) - c.Assert(s.logBuffer.String(), Matches, "(?s)"+expected) -} - func (s *S) logBufferString() string { s.logBufferMut.Lock() defer s.logBufferMut.Unlock() @@ -1712,7 +1678,7 @@ func (s *S) logBufferString() string { } func (s *S) testServiceLogs(c *C, outputs map[string]string) { - s.startTestServices(c) + s.startTestServices(c, true) if c.Failed() { return @@ -1780,16 +1746,25 @@ func (s *S) serviceByName(c *C, name string) *servstate.ServiceInfo { return services[0] } -func (s *S) startTestServices(c *C) { +func (s *S) startTestServices(c *C, logCheck bool) { chg := s.startServices(c, []string{"test1", "test2"}, 2) s.st.Lock() c.Check(chg.Status(), Equals, state.DoneStatus, Commentf("Error: %v", chg.Err())) s.st.Unlock() - s.assertLog(c, ".*test1\n.*test2\n") - cmds := s.manager.RunningCmds() c.Check(cmds, HasLen, 2) + + // When this helper is used for testing idempotence + // the services are not actually started unless they + // they are not running. In this case we have to disable + // the log checks, as no new entries are expected. + if logCheck { + s.waitForDoneCheck(c, "test1") + s.waitForDoneCheck(c, "test2") + + c.Assert(s.logBufferString(), Matches, "(?s).*test1\n.*test2\n") + } } func (s *S) stopTestServices(c *C) { @@ -1895,3 +1870,32 @@ type fakeLogManager struct{} func (f fakeLogManager) ServiceStarted(serviceName string, logs *servicelog.RingBuffer) { // no-op } + +func (s *S) insertDoneCheck(c *C, service string) string { + return fmt.Sprintf("sync; touch %s", filepath.Join(s.dir, service)) +} + +// Return on timeout or when the file appears. This is used to determine +// when the expected service output is actually available, not when the +// service starts to run. +func (s *S) waitForDoneCheck(c *C, service string) { + timeout := time.After(10 * time.Second) + ticker := time.NewTicker(time.Millisecond) + donePath := filepath.Join(s.dir, service) + for { + select { + case <-timeout: + c.Fatal("timeout waiting for service output") + + case <-ticker.C: + stat, err := os.Stat(donePath) + if err == nil && stat.Mode().IsRegular() { + // Delete it so we can reuse this feature + // in the same test once a service gets + // restarted. + os.Remove(donePath) + return + } + } + } +}
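For reference, here is a hedged sketch of how a future test could combine
the helpers introduced above (setupDefaultServiceManager, parseLayer,
insertDoneCheck and waitForDoneCheck). It assumes the context of
manager_test.go for the suite type S and its imports; the test name,
service name, layer content and output value are illustrative only, not
part of the patch series:

    // TestDoneCheckUsage is a hypothetical example: it starts an
    // illustrative "exmpl" service, blocks on the per-service done check
    // until the command has produced its output, then asserts on that
    // output without any timing assumptions.
    func (s *S) TestDoneCheckUsage(c *C) {
        s.setupDefaultServiceManager(c)

        outputPath := filepath.Join(s.dir, "output")
        layer := parseLayer(c, 0, "example", fmt.Sprintf(`
    services:
        exmpl:
            override: replace
            command: /bin/sh -c "echo ok >%s; %s; sleep 10"
    `, outputPath, s.insertDoneCheck(c, "exmpl")))
        err := s.manager.AppendLayer(layer)
        c.Assert(err, IsNil)

        s.startServices(c, []string{"exmpl"}, 1)

        // Returns only once the service has touched its per-service done
        // file, so the output file below is guaranteed to be complete.
        s.waitForDoneCheck(c, "exmpl")

        output, err := ioutil.ReadFile(outputPath)
        c.Assert(err, IsNil)
        c.Check(string(output), Equals, "ok\n")
    }

Because TearDownTest now stops all running and backed-off services, a
test written this way needs no explicit stop of "exmpl" on exit.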