Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[core] Ensure auto-transitioning environments always end #525

Merged
merged 1 commit into from
Mar 8, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 32 additions & 48 deletions core/environment/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -922,22 +922,22 @@ func (envs *Manager) CreateAutoEnvironment(workflowPath string, userVars map[str
)
}

if err != nil {
goErrorKillDestroy := func(op string) {
envState := env.CurrentState()
env.sendEnvironmentEvent(&event.EnvironmentEvent{EnvironmentID: env.Id().String(), Error: err})
log.WithField("state", envState).
WithField("environment", env.Id().String()).
WithError(err).
Warn("environment deployment and configuration failed, cleanup in progress")
Warnf("auto-transitioning environment failed %s, cleanup in progress", op)

err = env.TryTransition(NewGoErrorTransition(
err := env.TryTransition(NewGoErrorTransition(
envs.taskman),
)
if err != nil {
log.WithField("partition", env.Id().String()).
WithField("state", envState).
Debug("could not transition to ERROR after failed deployment/configuration, cleanup in progress")

Debug("could not transition failed auto-transitioning environment to ERROR, cleanup in progress")
env.setState("ERROR")
}

envTasks := env.Workflow().GetTasks()
Expand All @@ -957,8 +957,13 @@ func (envs *Manager) CreateAutoEnvironment(workflowPath string, userVars map[str
"level": infologger.IL_Support,
"partition": env.Id().String(),
}).
Info("environment deployment failed, tasks were cleaned up")
Infof("auto-environment failed at %s, tasks were cleaned up", op)
log.WithField("partition", env.Id().String()).Info("environment teardown complete")
}

if err != nil {
goErrorKillDestroy("transition CONFIGURE")

return
}

Expand All @@ -969,66 +974,40 @@ func (envs *Manager) CreateAutoEnvironment(workflowPath string, userVars map[str
trans := NewStartActivityTransition(envs.taskman)
if trans == nil {
env.sendEnvironmentEvent(&event.EnvironmentEvent{EnvironmentID: env.Id().String(), Error: err})
goErrorKillDestroy("transition START_ACTIVITY")

return
}

err = env.TryTransition(trans)
if err != nil {
envState := env.CurrentState()

env.sendEnvironmentEvent(&event.EnvironmentEvent{EnvironmentID: env.Id().String(), Error: err})

log.WithField("state", envState).
WithField("environment", env.Id().String()).
WithError(err).
Warn("environment start activity failed, cleanup in progress")

err = env.TryTransition(NewGoErrorTransition(
envs.taskman),
)
if err != nil {
log.WithField("partition", env.Id().String()).
WithField("state", envState).
Debug("could not transition to ERROR after failed start activity, cleanup in progress")

env.setState("ERROR")
}

envTasks := env.Workflow().GetTasks()
// TeardownEnvironment manages the envs.mu internally
err = envs.TeardownEnvironment(env.Id(), true /*force*/)
if err != nil {
env.sendEnvironmentEvent(&event.EnvironmentEvent{EnvironmentID: env.Id().String(), Error: err})
}

killedTasks, _, rlsErr := envs.taskman.KillTasks(envTasks.GetTaskIds())
if rlsErr != nil {
log.WithError(rlsErr).Warn("task teardown error")
}
log.WithFields(logrus.Fields{
"killedCount": len(killedTasks),
"lastEnvState": envState,
"level": infologger.IL_Support,
"partition": env.Id().String(),
}).Info("environment start activity failed, tasks were cleaned up")

log.WithField("partition", env.Id().String()).Info("environment teardown complete")
goErrorKillDestroy("transition START_ACTIVITY")
return
}

for {
// we know we performed START_ACTIVITY, so we poll at 1Hz for the run to finish
time.Sleep(1 * time.Second)

if env == nil {
// must've died during the loop
return
}

envState := env.CurrentState()
switch envState {
case "CONFIGURED":
// RUN finished so we can reset and delete the environment
err := env.TryTransition(NewResetTransition(envs.taskman))
err = env.TryTransition(NewResetTransition(envs.taskman))
if err != nil {
env.sendEnvironmentEvent(&event.EnvironmentEvent{EnvironmentID: env.Id().String(), Error: err})
goErrorKillDestroy("transition RESET")
return
}
err = envs.TeardownEnvironment(env.id, false)
if err != nil {
env.sendEnvironmentEvent(&event.EnvironmentEvent{EnvironmentID: env.Id().String(), Error: err})
goErrorKillDestroy("teardown")
return
}
tasksForEnv := env.Workflow().GetTasks().GetTaskIds()
Expand All @@ -1039,11 +1018,16 @@ func (envs *Manager) CreateAutoEnvironment(workflowPath string, userVars map[str
}
return
case "ERROR":
fallthrough
case "STANDBY":
fallthrough
case "DEPLOYED":
goErrorKillDestroy("transition STOP_ACTIVITY")
return
case "MIXED":
return
continue
case "":
return
continue
}
}
}
Loading