-
Notifications
You must be signed in to change notification settings - Fork 539
[Feature] [kubectl-plugin] Expose setting shutdownAfterJobFinishes
and ttlSecondsAfterFinished
in ray job submit
#3627
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -40,40 +40,42 @@ const ( | |||||
) | ||||||
|
||||||
type SubmitJobOptions struct { | ||||||
cmdFactory cmdutil.Factory | ||||||
dashboardClient utils.RayDashboardClientInterface | ||||||
ioStreams *genericiooptions.IOStreams | ||||||
RayJob *rayv1.RayJob | ||||||
logColor string | ||||||
image string | ||||||
fileName string | ||||||
workingDir string | ||||||
runtimeEnv string | ||||||
headers string | ||||||
verify string | ||||||
cluster string | ||||||
runtimeEnvJson string | ||||||
entryPointResource string | ||||||
metadataJson string | ||||||
logStyle string | ||||||
submissionID string | ||||||
rayjobName string | ||||||
rayVersion string | ||||||
entryPoint string | ||||||
headCPU string | ||||||
headMemory string | ||||||
headGPU string | ||||||
workerCPU string | ||||||
workerMemory string | ||||||
workerGPU string | ||||||
namespace string | ||||||
entryPointMemory int | ||||||
entryPointGPU float32 | ||||||
workerReplicas int32 | ||||||
entryPointCPU float32 | ||||||
noWait bool | ||||||
dryRun bool | ||||||
verbose bool | ||||||
cmdFactory cmdutil.Factory | ||||||
dashboardClient utils.RayDashboardClientInterface | ||||||
ioStreams *genericiooptions.IOStreams | ||||||
RayJob *rayv1.RayJob | ||||||
logColor string | ||||||
image string | ||||||
fileName string | ||||||
workingDir string | ||||||
runtimeEnv string | ||||||
headers string | ||||||
verify string | ||||||
cluster string | ||||||
runtimeEnvJson string | ||||||
entryPointResource string | ||||||
metadataJson string | ||||||
logStyle string | ||||||
submissionID string | ||||||
rayjobName string | ||||||
rayVersion string | ||||||
entryPoint string | ||||||
headCPU string | ||||||
headMemory string | ||||||
headGPU string | ||||||
workerCPU string | ||||||
workerMemory string | ||||||
workerGPU string | ||||||
namespace string | ||||||
entryPointMemory int | ||||||
entryPointGPU float32 | ||||||
workerReplicas int32 | ||||||
entryPointCPU float32 | ||||||
noWait bool | ||||||
dryRun bool | ||||||
verbose bool | ||||||
shutdownAfterJobFinishes bool | ||||||
ttlSecondsAfterFinished int32 | ||||||
Comment on lines
+77
to
+78
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add |
||||||
} | ||||||
|
||||||
type JobInfo struct { | ||||||
|
@@ -171,6 +173,8 @@ func NewJobSubmitCommand(cmdFactory cmdutil.Factory, streams genericclioptions.I | |||||
cmd.Flags().StringVar(&options.workerGPU, "worker-gpu", "0", "number of GPUs in each worker group replica") | ||||||
cmd.Flags().BoolVar(&options.dryRun, "dry-run", false, "print the generated YAML instead of creating the cluster. Only works when filename is not provided") | ||||||
cmd.Flags().BoolVarP(&options.verbose, "verbose", "v", false, "Passing the '--verbose' flag to the 'ray job submit' command") | ||||||
cmd.Flags().BoolVar(&options.shutdownAfterJobFinishes, "shutdown-after-job-finishes", false, "Shutdown the cluster after the job finishes") | ||||||
cmd.Flags().Int32Var(&options.ttlSecondsAfterFinished, "ttl-seconds-after-finished", 0, "TTL seconds after finished. Only works when filename is not provided") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It still works when filename is provided right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently no, however I will make sure it works in the next commit. |
||||||
|
||||||
return cmd | ||||||
} | ||||||
|
@@ -254,8 +258,17 @@ func (options *SubmitJobOptions) Validate() error { | |||||
} | ||||||
options.runtimeEnvJson = string(runtimeJson) | ||||||
} | ||||||
|
||||||
if !options.RayJob.Spec.ShutdownAfterJobFinishes && options.RayJob.Spec.TTLSecondsAfterFinished != 0 { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unrelated to this PR, but when I was testing this change, I ran There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1. I think we should take the filename as a base, and flags should override the values in the file. I created an issue to track this #3639 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The behavior of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is worth overriding the values in the file. I will change this. |
||||||
return fmt.Errorf("ttl-seconds-after-finished is only supported when shutdown-after-job-finishes is set to true") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be better to use the YAML field names
Suggested change
|
||||||
} | ||||||
|
||||||
Comment on lines
+262
to
+265
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Implement validation logic when
|
||||||
} else if strings.TrimSpace(options.rayjobName) == "" { | ||||||
return fmt.Errorf("Must set either yaml file (--filename) or set Ray job name (--name)") | ||||||
} else if options.fileName == "" { | ||||||
if !options.shutdownAfterJobFinishes && options.ttlSecondsAfterFinished != 0 { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also check ttlSecondsAfterFinished ≥ 0 in both cases? Right now this command will create a RayJob with ttlSecondsAfterFinished: -10
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will fix this, thx |
||||||
return fmt.Errorf("ttl-seconds-after-finished is only supported when shutdown-after-job-finishes is set to true") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
Comment on lines
+268
to
+271
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Implement validation logic when
|
||||||
} | ||||||
|
||||||
if options.workingDir == "" { | ||||||
|
@@ -289,9 +302,11 @@ func (options *SubmitJobOptions) Run(ctx context.Context, factory cmdutil.Factor | |||||
if options.fileName == "" { | ||||||
// Genarate the Ray job. | ||||||
rayJobObject := generation.RayJobYamlObject{ | ||||||
RayJobName: options.rayjobName, | ||||||
Namespace: options.namespace, | ||||||
SubmissionMode: "InteractiveMode", | ||||||
RayJobName: options.rayjobName, | ||||||
Namespace: options.namespace, | ||||||
ShutdownAfterJobFinishes: options.shutdownAfterJobFinishes, | ||||||
TTLSecondsAfterFinished: options.ttlSecondsAfterFinished, | ||||||
SubmissionMode: "InteractiveMode", | ||||||
// Prior to kuberay 1.2.2, the entry point is required. To maintain | ||||||
// backwards compatibility with 1.2.x, we submit the entry point | ||||||
// here, even though it will be ignored. | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,7 +30,7 @@ func TestRayJobSubmitComplete(t *testing.T) { | |
assert.Equal(t, "fake/path/to/env/yaml", fakeSubmitJobOptions.runtimeEnv) | ||
} | ||
|
||
func TestRayJobSubmitValidate(t *testing.T) { | ||
func TestRayJobSubmitWithYamlValidate(t *testing.T) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Split |
||
testStreams, _, _, _ := genericclioptions.NewTestIOStreams() | ||
cmdFactory := cmdutil.NewFactory(genericclioptions.NewConfigFlags(true)) | ||
|
||
|
@@ -71,6 +71,29 @@ spec: | |
submissionMode: 'InteractiveMode' | ||
backoffLimit: 0`, | ||
}, | ||
{ | ||
name: "shutdownAfterJobFinishes is false and ttlSecondsAfterFinished is not zero", | ||
yamlContent: `apiVersion: ray.io/v1 | ||
kind: RayJob | ||
metadata: | ||
name: rayjob-sample | ||
spec: | ||
shutdownAfterJobFinishes: false | ||
ttlSecondsAfterFinished: 10 | ||
submissionMode: 'InteractiveMode'`, | ||
expectError: "ttl-seconds-after-finished is only supported when shutdown-after-job-finishes is set to true", | ||
}, | ||
{ | ||
name: "shutdownAfterJobFinishes is false and ttlSecondsAfterFinished is not zero", | ||
yamlContent: `apiVersion: ray.io/v1 | ||
kind: RayJob | ||
metadata: | ||
name: rayjob-sample | ||
spec: | ||
shutdownAfterJobFinishes: true | ||
ttlSecondsAfterFinished: 10 | ||
submissionMode: 'InteractiveMode'`, | ||
}, | ||
} | ||
|
||
for _, tc := range tests { | ||
|
@@ -99,6 +122,52 @@ spec: | |
} | ||
} | ||
|
||
func TestRayJobSubmitWithoutYamlValidate(t *testing.T) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Split |
||
testStreams, _, _, _ := genericclioptions.NewTestIOStreams() | ||
cmdFactory := cmdutil.NewFactory(genericclioptions.NewConfigFlags(true)) | ||
|
||
test := []struct { | ||
name string | ||
rayjobName string | ||
expectError string | ||
ttlSecondsAfterFinished int32 | ||
shutdownAfterJobFinishes bool | ||
}{ | ||
{ | ||
name: "shutdownAfterJobFinishes is false and ttlSecondsAfterFinished is not zero", | ||
rayjobName: "rayjob-sample", | ||
shutdownAfterJobFinishes: false, | ||
ttlSecondsAfterFinished: 10, | ||
expectError: "ttl-seconds-after-finished is only supported when shutdown-after-job-finishes is set to true", | ||
}, | ||
{ | ||
name: "shutdownAfterJobFinishes is true and ttlSecondsAfterFinished is not zero", | ||
rayjobName: "rayjob-sample", | ||
shutdownAfterJobFinishes: true, | ||
ttlSecondsAfterFinished: 10, | ||
}, | ||
} | ||
|
||
for _, tc := range test { | ||
t.Run(tc.name, func(t *testing.T) { | ||
opts := &SubmitJobOptions{ | ||
cmdFactory: cmdFactory, | ||
ioStreams: &testStreams, | ||
rayjobName: tc.rayjobName, | ||
workingDir: "Fake/File/Path", | ||
shutdownAfterJobFinishes: tc.shutdownAfterJobFinishes, | ||
ttlSecondsAfterFinished: tc.ttlSecondsAfterFinished, | ||
} | ||
err := opts.Validate() | ||
if tc.expectError != "" { | ||
require.EqualError(t, err, tc.expectError) | ||
} else { | ||
require.NoError(t, err) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestDecodeRayJobYaml(t *testing.T) { | ||
rayjobtmpfile, err := os.CreateTemp("./", "rayjob-temp-*.yaml") | ||
require.NoError(t, err) | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -144,6 +144,10 @@ func ValidateRayJobSpec(rayJob *rayv1.RayJob) error { | |||||
return fmt.Errorf("a RayJob with shutdownAfterJobFinishes set to false is not allowed to be suspended") | ||||||
} | ||||||
|
||||||
if !rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.TTLSecondsAfterFinished != 0 { | ||||||
return fmt.Errorf("a RayJob with shutdownAfterJobFinishes set to false cannot have TTLSecondsAfterFinished is non-zero") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
|
||||||
Comment on lines
+147
to
+150
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Implement validation logic in ray operator
|
||||||
isClusterSelectorMode := len(rayJob.Spec.ClusterSelector) != 0 | ||||||
if rayJob.Spec.Suspend && isClusterSelectorMode { | ||||||
return fmt.Errorf("the ClusterSelector mode doesn't support the suspend operation") | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why change apiserver tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because I have modified the ray operator that
ShutdownAfterJobFinishes
should be true; otherwise,ttlSecondsAfterFinishedcannot
cannot be set.reference
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems unrelated to this PR? Could you do apiserver-related changes in a separate PR?