Skip to content

Commit

Permalink
YUNIKORN-1706 We should clean up failed apps in shim side
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuqi-lucas committed Nov 20, 2023
1 parent 8a0d449 commit cfa589d
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 0 deletions.
4 changes: 4 additions & 0 deletions pkg/cache/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,10 @@ func (app *Application) getNonTerminatedTaskAlias() []string {
return nonTerminatedTaskAlias
}

// IsAllTasksTerminated reports whether the application has no
// non-terminated tasks left, i.e. getNonTerminatedTaskAlias returns
// an empty list.
func (app *Application) IsAllTasksTerminated() bool {
	remaining := app.getNonTerminatedTaskAlias()
	return len(remaining) == 0
}

// SetState is only for testing.
// Production code is not supposed to change the application state like this.
func (app *Application) SetState(state string) {
Expand Down
20 changes: 20 additions & 0 deletions pkg/shim/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,26 @@ func (ss *KubernetesShim) registerShimLayer() error {
func (ss *KubernetesShim) schedule() {
apps := ss.context.GetAllApplications()
for _, app := range apps {
// Clean up apps in the terminal Failed state on the shim side.
// 1. When we reject an app, we set the app state to Rejected and immediately set it to Failed, but we don't clean up the app.
// 2. When we fail an app, we set the app state to Failed, but we don't clean up the app.
// 3. Completed apps are already handled by the UpdateApplication function:
//      case cache.ApplicationStates().Completed:
//          callback.context.RemoveApplicationInternal(updated.ApplicationID)
// 4. The Killed state is not used so far, so we don't need to handle it.
if app.GetApplicationState() == cache.ApplicationStates().Failed {
if app.IsAllTasksTerminated() {
log.Log(log.ShimScheduler).Info("Clean up failed application",
zap.String("appID", app.GetApplicationID()))
ss.context.RemoveApplicationInternal(app.GetApplicationID())

} else {
log.Log(log.ShimScheduler).Info("Failed application is not cleaned up due to not all tasks terminated, wait for next scheduling iteration",
zap.String("appID", app.GetApplicationID()))
}
continue
}

if app.Schedule() {
ss.setOutstandingAppsFound(true)
}
Expand Down

0 comments on commit cfa589d

Please sign in to comment.