diff --git a/pkg/controller.v1/common/job.go b/pkg/controller.v1/common/job.go index 825afdf8b1..d8a494dd0c 100644 --- a/pkg/controller.v1/common/job.go +++ b/pkg/controller.v1/common/job.go @@ -421,7 +421,7 @@ func (jc *JobController) CleanupJob(runPolicy *apiv1.RunPolicy, jobStatus apiv1. currentTime := time.Now() metaObject, _ := job.(metav1.Object) ttl := runPolicy.TTLSecondsAfterFinished - if ttl == nil { + if ttl == nil || trainutil.IsJobSuspended(runPolicy) { return nil } duration := time.Second * time.Duration(*ttl) diff --git a/pkg/controller.v1/tensorflow/job_test.go b/pkg/controller.v1/tensorflow/job_test.go index c7e5a43c45..df146ef15a 100644 --- a/pkg/controller.v1/tensorflow/job_test.go +++ b/pkg/controller.v1/tensorflow/job_test.go @@ -663,6 +663,30 @@ var _ = Describe("Test for controller.v1/common", func() { wantTFJobIsRemoved: false, wantErr: false, }), + Entry("No error with completionTime is nil if suspended", &cleanUpCases{ + tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, nil), + runPolicy: &kubeflowv1.RunPolicy{ + TTLSecondsAfterFinished: nil, + Suspend: ptr.To(true), + }, + jobStatus: kubeflowv1.JobStatus{ + CompletionTime: nil, + }, + wantTFJobIsRemoved: false, + wantErr: false, + }), + Entry("No error with TTL is set and completionTime is nil, if suspended", &cleanUpCases{ + tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), + runPolicy: &kubeflowv1.RunPolicy{ + TTLSecondsAfterFinished: ptr.To[int32](10), + Suspend: ptr.To(true), + }, + jobStatus: kubeflowv1.JobStatus{ + CompletionTime: nil, + }, + wantTFJobIsRemoved: false, + wantErr: false, + }), Entry("Error is occurred since completionTime is nil", &cleanUpCases{ tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, ptr.To[int32](10)), runPolicy: &kubeflowv1.RunPolicy{