From 071614a7033e767aef71fcb1393f7ee76aca8fa6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 7 May 2024 13:49:03 -0700 Subject: [PATCH] [Core] Allow a very long user program (#3512) * Allow a very long user program * Fix commnet * Update sky/backends/cloud_vm_ray_backend.py Co-authored-by: Tian Xia * Update sky/backends/cloud_vm_ray_backend.py Co-authored-by: Tian Xia --------- Co-authored-by: Tian Xia --- sky/backends/cloud_vm_ray_backend.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index f916d931b5f..4d0fdb8d68b 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3144,6 +3144,36 @@ def _exec_code_on_head( code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd) job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code]) + if len(job_submit_cmd) > 120 * 1024: + # The maximum size of a single command-line argument is 128 KB, + # i.e. the command executed with /bin/sh should be less than 128KB. + # https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h + # If a user has very long run or setup commands, the generated + # command may exceed the limit, as we encode the script in base64 + # and directly include it in the job submission command. If the + # command is too long, we instead write it to a file, rsync and + # execute it. + # We use 120KB as a threshold to be safe for other arguments that + # might be added during ssh. 
+ ssh_credentials = backend_utils.ssh_credential_from_yaml( + handle.cluster_yaml, handle.docker_user, handle.ssh_user) + head_ssh_port = handle.head_ssh_port + runner = command_runner.SSHCommandRunner(handle.head_ip, + port=head_ssh_port, + **ssh_credentials) + with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp: + fp.write(codegen) + fp.flush() + script_path = os.path.join(SKY_REMOTE_APP_DIR, + f'sky_job_{job_id}') + # We choose to sync code + exec, because the alternative of 'ray + # submit' may not work as it may use system python (python2) to + # execute the script. Happens for AWS. + runner.rsync(source=fp.name, + target=script_path, + up=True, + stream_logs=False) + job_submit_cmd = f'{mkdir_code} && {code}' if managed_job_dag is not None: # Add the managed job to job queue database.