From b839e1693a7419181ae1c590a9ca88904ee1f79d Mon Sep 17 00:00:00 2001 From: Wilfred Spiegelenburg Date: Thu, 11 Jul 2024 16:11:54 +1000 Subject: [PATCH 1/2] [YUNIKORN-2323] Gang scheduling user experience new events Additional events for gang scheduling covering: * placeholder timeout (resuming state) * placeholder creation * placeholder create failure(s) --- pkg/cache/application.go | 27 +++++++++++++++++++++++++-- pkg/cache/application_state.go | 4 ++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pkg/cache/application.go b/pkg/cache/application.go index ff4db058c..6fac0e37e 100644 --- a/pkg/cache/application.go +++ b/pkg/cache/application.go @@ -495,12 +495,30 @@ func (app *Application) postAppAccepted() { dispatcher.Dispatch(ev) } +// onResuming triggered when entering the resuming state which is triggered by the time out of the gang placeholders +// if SOFT gang scheduling is configured. +func (app *Application) onResuming() { + if app.originatingTask != nil { + events.GetRecorder().Eventf(app.originatingTask.GetTaskPod().DeepCopy(), nil, v1.EventTypeWarning, "GangScheduling", + "GangSchedulingFailed", "Application %s resuming as non-gang application (SOFT)", app.applicationID) + } +} + +// onReserving triggered when entering the reserving state. +// During normal operation this creates all the placeholders. During recovery this call could cause the application +// in the shim and core to progress to the next state. func (app *Application) onReserving() { - // happens after recovery - if placeholders already exist, we need to send + // if any placeholder already exist during recovery we might need to send // an event to trigger Application state change in the core if len(app.getPlaceHolderTasks()) > 0 { ev := NewUpdateApplicationReservationEvent(app.applicationID) dispatcher.Dispatch(ev) + } else { + // not recovery or no placeholders created yet add an event to the pod + if app.originatingTask != nil { + events.GetRecorder().Eventf(app.originatingTask.GetTaskPod().DeepCopy(), nil, v1.EventTypeNormal, "GangScheduling", + "CreatingPlaceholders", "Application %s creating placeholders", app.applicationID) + } } go func() { @@ -511,6 +529,11 @@ func (app *Application) onReserving() { getPlaceholderManager().cleanUp(app) ev := NewRunApplicationEvent(app.applicationID) dispatcher.Dispatch(ev) + // failed at least one placeholder creation progress as a normal application + if app.originatingTask != nil { + events.GetRecorder().Eventf(app.originatingTask.GetTaskPod().DeepCopy(), nil, v1.EventTypeWarning, "GangScheduling", + "PlaceholderCreateFailed", "Application %s fall back to normal scheduling", app.applicationID) + } } }() } @@ -520,7 +543,7 @@ func (app *Application) onReserving() { func (app *Application) onReservationStateChange() { if app.originatingTask != nil { events.GetRecorder().Eventf(app.originatingTask.GetTaskPod().DeepCopy(), nil, v1.EventTypeNormal, "GangScheduling", - "Placeholder Allocated", "Application %s placeholder has been allocated.", app.applicationID) + "PlaceholderAllocated", "Application %s placeholder has been allocated.", app.applicationID) } desireCounts := make(map[string]int32, len(app.taskGroups)) for _, tg := range app.taskGroups { diff --git a/pkg/cache/application_state.go b/pkg/cache/application_state.go index ef251e2ff..aee1c3e19 100644 --- a/pkg/cache/application_state.go +++ b/pkg/cache/application_state.go @@ -506,6 +506,10 @@ func newAppState() *fsm.FSM { //nolint:funlen app := event.Args[0].(*Application) //nolint:errcheck app.onReserving() }, + states.Resuming: func(_ context.Context, event *fsm.Event) { + app := event.Args[0].(*Application) //nolint:errcheck + app.onResuming() + }, SubmitApplication.String(): func(_ context.Context, event *fsm.Event) { app := event.Args[0].(*Application) //nolint:errcheck event.Err = app.handleSubmitApplicationEvent() From 6217aeb0902890008ff5d60b3a3de0fe17ef8783 Mon Sep 17 00:00:00 2001 From: Wilfred Spiegelenburg Date: Thu, 11 Jul 2024 16:25:53 +1000 Subject: [PATCH 2/2] lint fix --- pkg/cache/application.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/cache/application.go b/pkg/cache/application.go index 6fac0e37e..9833a6f2a 100644 --- a/pkg/cache/application.go +++ b/pkg/cache/application.go @@ -513,12 +513,10 @@ func (app *Application) onReserving() { if len(app.getPlaceHolderTasks()) > 0 { ev := NewUpdateApplicationReservationEvent(app.applicationID) dispatcher.Dispatch(ev) - } else { + } else if app.originatingTask != nil { // not recovery or no placeholders created yet add an event to the pod - if app.originatingTask != nil { - events.GetRecorder().Eventf(app.originatingTask.GetTaskPod().DeepCopy(), nil, v1.EventTypeNormal, "GangScheduling", - "CreatingPlaceholders", "Application %s creating placeholders", app.applicationID) - } + events.GetRecorder().Eventf(app.originatingTask.GetTaskPod().DeepCopy(), nil, v1.EventTypeNormal, "GangScheduling", + "CreatingPlaceholders", "Application %s creating placeholders", app.applicationID) } go func() {