From b8e7cd4f50530eeede013d599f9f8161f96f1098 Mon Sep 17 00:00:00 2001 From: jackyk02 Date: Wed, 10 Apr 2024 12:51:45 -0700 Subject: [PATCH 1/4] initial commit --- sky/provision/gcp/instance.py | 1 + sky/provision/gcp/instance_utils.py | 59 ++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/sky/provision/gcp/instance.py b/sky/provision/gcp/instance.py index e7f69f8c6eb..cc59177a8d0 100644 --- a/sky/provision/gcp/instance.py +++ b/sky/provision/gcp/instance.py @@ -267,6 +267,7 @@ def get_order_key(node): for instance_id in resumed_instance_ids: resource.start_instance(instance_id, project_id, availability_zone) + resource.resize_disk(project_id, availability_zone, config.node_config, instance_id) resource.set_labels(project_id, availability_zone, instance_id, labels) to_start_count -= len(resumed_instance_ids) diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index dde0918274d..c6b32bf1434 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -1376,7 +1376,64 @@ def resize_disk(cls, project_id: str, availability_zone: str, The boot disk of TPU VMs is not resizable, and users need to add a persistent disk to expand disk capacity. Related issue: #2387 """ - return + from sky.skylet import log_lib + import os + from googleapiclient import discovery + + resource = discovery.build("compute", "v1") + + # TODO: Update the CLI to prompt the for disk mode and size + disk_size_gb = 200 + disk_mode = "read-write" + tpu_name = instance_name.split("/")[-1] + + # Create a new disk + disk_body = { + "name": f"{tpu_name}-extra-disk", + "sizeGb": str(disk_size_gb), + "type": f"zones/{availability_zone}/diskTypes/pd-standard", # or pd-ssd + } + + try: + create_operation = ( + resource.disks() + .insert( + project=project_id, zone=availability_zone, body=disk_body + ) + .execute() + ) + time.sleep(3) + + except HttpError as e: + # Catch HttpError for issues during disk creation or attachment + logger.warning(f"googleapiclient.errors.HttpError: {e.reason}") + + # Attach the disk to TPUVMs with gcloud alpha + attach_to_tpus = ( + f"gcloud alpha compute tpus tpu-vm attach-disk {tpu_name} " + f"--zone {availability_zone} --disk {disk_body['name']} " + f"--mode {disk_mode}" + ) + + rcode, stdout, stderr = log_lib.run_with_log( + attach_to_tpus, + os.devnull, + shell=True, + stream_logs=False, + require_outputs=True, + ) + + if rcode != 0: + failure_massage = ( + "Failed to attach disk to TPU VMs.\n" + "**** STDOUT ****\n" + "{stdout}\n" + "**** STDERR ****\n" + "{stderr}" + ) + logger.warning(failure_massage) + + return None @classmethod def get_instance_info(cls, project_id: str, availability_zone: str, From a69aa08bed75ec9b710bcaadd6b838a75b778b32 Mon Sep 17 00:00:00 2001 From: jackyk02 Date: Fri, 26 Apr 2024 21:13:34 -0700 Subject: [PATCH 2/4] add autodelete & disk size --- sky/backends/backend_utils.py | 9 +++++ sky/backends/cloud_vm_ray_backend.py | 1 + sky/provision/gcp/instance_utils.py | 58 +++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 7a6a2ab33cc..dee0121ed0c 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -932,6 +932,9 @@ def write_cluster_config( config_dict['ray'] = tmp_yaml_path return config_dict _add_auth_to_cluster_config(cloud, tmp_yaml_path) + # _DEFAULT_DISK_SIZE_GB = 256 + if to_provision.disk_size != 256: + _add_disk_size_to_cluster_config(to_provision.disk_size, tmp_yaml_path) # Add kubernetes config fields from ~/.sky/config if isinstance(cloud, clouds.Kubernetes): @@ -997,6 +1000,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str): assert False, cloud common_utils.dump_yaml(cluster_config_file, config) +def _add_disk_size_to_cluster_config(disk_size: int, cluster_config_file: str): + """Add disk size to the cluster config.""" + config = common_utils.read_yaml(cluster_config_file) + config['initDiskSize'] = str(disk_size) + common_utils.dump_yaml(cluster_config_file, config) + def get_run_timestamp() -> str: return 'sky-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f') diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 44ade8c9c5e..50fa4b79ab7 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -4223,6 +4223,7 @@ def _check_existing_cluster( to_provision = handle_before_refresh.launched_resources self.check_resources_fit_cluster(handle_before_refresh, task) + logger.info( f'{colorama.Fore.CYAN}Creating a new cluster: {cluster_name!r} ' f'[{task.num_nodes}x {to_provision}].' diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index c6b32bf1434..513477bdc28 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -1383,7 +1383,15 @@ def resize_disk(cls, project_id: str, availability_zone: str, resource = discovery.build("compute", "v1") # TODO: Update the CLI to prompt the for disk mode and size - disk_size_gb = 200 + # Extract the specified disk size from the configuration + disk_size_gb = 100 + if 'diskSize' in node_config['metadata']: + disk_size_gb = int(node_config['metadata']['diskSize']) + # By default, each Cloud TPU VM has a 100GB single boot persistent disk that contains the operating system. + if disk_size_gb <= 100: + return None + disk_size_gb -= 100 + logger.info(f"Request persistent disk for size: {disk_size_gb}") disk_mode = "read-write" tpu_name = instance_name.split("/")[-1] @@ -1433,6 +1441,54 @@ def resize_disk(cls, project_id: str, availability_zone: str, ) logger.warning(failure_massage) + # # Set auto-delete state of the Persistent Disk + # auto_delete_disk = ( + # f"gcloud compute instances set-disk-auto-delete {instance_name} " + # f"--zone={availability_zone} " + # f"--auto-delete " + # f"--disk={disk_body['name']}" + # ) + # + # rcode, stdout, stderr = log_lib.run_with_log( + # auto_delete_disk, + # os.devnull, + # shell=True, + # stream_logs=False, + # require_outputs=True, + # ) + # + # if rcode != 0: + # failure_massage = ( + # "Failed to set auto-delete state of the Persistent Disk. Please delete the disk manually.\n" + # "**** STDOUT ****\n" + # "{stdout}\n" + # "**** STDERR ****\n" + # "{stderr}" + # ) + # logger.warning(failure_massage) + + # Format the persistent disk, create directory and mount the persistent disk + format_mount_disk = ( + f"gcloud compute tpus tpu-vm ssh {tpu_name} --zone={availability_zone} " + f"--command='sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdb ; sudo mkdir -p /mnt/disks/persist ; sudo mount -o discard,defaults /dev/sdb /mnt/disks/persist'" + ) + rcode, stdout, stderr = log_lib.run_with_log( + format_mount_disk, + os.devnull, + shell=True, + stream_logs=False, + require_outputs=True, + ) + if rcode != 0: + failure_massage = ( + "Failed to format and mount persistent disk\n" + "**** STDOUT ****\n" + "{stdout}\n" + "**** STDERR ****\n" + "{stderr}" + ) + logger.warning(failure_massage) + return None @classmethod From 05f9f86c4267540bdcd6c6ee2c2464ce7252c237 Mon Sep 17 00:00:00 2001 From: jackyk02 Date: Fri, 26 Apr 2024 21:44:26 -0700 Subject: [PATCH 3/4] rereformatted --- sky/provision/gcp/instance_utils.py | 158 ++++++++++------------------ 1 file changed, 57 insertions(+), 101 deletions(-) diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index 513477bdc28..d087aafcefd 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -1368,128 +1368,84 @@ def start_instance(cls, node_id: str, project_id: str, zone: str) -> None: cls.wait_for_operation(operation, project_id, zone) @classmethod - def resize_disk(cls, project_id: str, availability_zone: str, - node_config: dict, instance_name: str) -> None: - """Resize the disk a machine image with a different size is used. - - TODO: Implement the feature to attach persistent disks for TPU VMs. - The boot disk of TPU VMs is not resizable, and users need to add a - persistent disk to expand disk capacity. Related issue: #2387 - """ - from sky.skylet import log_lib - import os - from googleapiclient import discovery - - resource = discovery.build("compute", "v1") - - # TODO: Update the CLI to prompt the for disk mode and size - # Extract the specified disk size from the configuration - disk_size_gb = 100 - if 'diskSize' in node_config['metadata']: - disk_size_gb = int(node_config['metadata']['diskSize']) - # By default, each Cloud TPU VM has a 100GB single boot persistent disk that contains the operating system. - if disk_size_gb <= 100: - return None - disk_size_gb -= 100 - logger.info(f"Request persistent disk for size: {disk_size_gb}") - disk_mode = "read-write" + def resize_disk(cls, project_id: str, availability_zone: str, node_config: dict, + instance_name: str) -> None: + """Resizes disk for TPU VMs by adding a persistent disk when needed.""" + import time + from googleapiclient.errors import HttpError + resource = cls.load_resource() + + # Determine the required disk size from configuration + default_disk_size = 100 # Default boot disk size for TPUVMs + requested_size = int(node_config['metadata'].get( + 'diskSize', default_disk_size)) + + # Calculate additional disk size needed + additional_size = requested_size - default_disk_size + if additional_size <= 0: + return # No additional disk needed + + # Log the disk size request + logger.info( + f"Requesting additional persistent disk of size: {additional_size}GB") + + # Set disk specifications tpu_name = instance_name.split("/")[-1] + disk_name = f"{tpu_name}-extra-disk" + disk_type = f"zones/{availability_zone}/diskTypes/pd-standard" - # Create a new disk + # Prepare the disk creation body disk_body = { - "name": f"{tpu_name}-extra-disk", - "sizeGb": str(disk_size_gb), - "type": f"zones/{availability_zone}/diskTypes/pd-standard", # or pd-ssd + "name": disk_name, + "sizeGb": str(additional_size), + "type": disk_type, } + # Create the disk try: - create_operation = ( - resource.disks() - .insert( - project=project_id, zone=availability_zone, body=disk_body - ) - .execute() - ) - time.sleep(3) - + resource.disks().insert(project=project_id, zone=availability_zone, + body=disk_body).execute() + time.sleep(3) # Short pause after disk creation except HttpError as e: - # Catch HttpError for issues during disk creation or attachment - logger.warning(f"googleapiclient.errors.HttpError: {e.reason}") + logger.warning(f"Disk creation failed: {e.reason}") + return - # Attach the disk to TPUVMs with gcloud alpha - attach_to_tpus = ( + # Attach the newly created disk + attach_command = ( f"gcloud alpha compute tpus tpu-vm attach-disk {tpu_name} " - f"--zone {availability_zone} --disk {disk_body['name']} " - f"--mode {disk_mode}" - ) - - rcode, stdout, stderr = log_lib.run_with_log( - attach_to_tpus, - os.devnull, - shell=True, - stream_logs=False, - require_outputs=True, + f"--zone {availability_zone} --disk {disk_name} --mode read-write" ) + if cls.execute_command_with_log(attach_command) != 0: + logger.warning("Failed to attach disk to TPU VMs.") - if rcode != 0: - failure_massage = ( - "Failed to attach disk to TPU VMs.\n" - "**** STDOUT ****\n" - "{stdout}\n" - "**** STDERR ****\n" - "{stderr}" - ) - logger.warning(failure_massage) - - # # Set auto-delete state of the Persistent Disk - # auto_delete_disk = ( - # f"gcloud compute instances set-disk-auto-delete {instance_name} " - # f"--zone={availability_zone} " - # f"--auto-delete " - # f"--disk={disk_body['name']}" - # ) - # - # rcode, stdout, stderr = log_lib.run_with_log( - # auto_delete_disk, - # os.devnull, - # shell=True, - # stream_logs=False, - # require_outputs=True, - # ) - # - # if rcode != 0: - # failure_massage = ( - # "Failed to set auto-delete state of the Persistent Disk. Please delete the disk manually.\n" - # "**** STDOUT ****\n" - # "{stdout}\n" - # "**** STDERR ****\n" - # "{stderr}" - # ) - # logger.warning(failure_massage) - - # Format the persistent disk, create directory and mount the persistent disk - format_mount_disk = ( + # Format and mount the disk + mount_command = ( f"gcloud compute tpus tpu-vm ssh {tpu_name} --zone={availability_zone} " - f"--command='sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdb ; sudo mkdir -p /mnt/disks/persist ; sudo mount -o discard,defaults /dev/sdb /mnt/disks/persist'" + f"--command='sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0," + f"discard /dev/sdb ; sudo mkdir -p /mnt/disks/persist ; sudo mount -o " + f"discard,defaults /dev/sdb /mnt/disks/persist'" ) + if cls.execute_command_with_log(mount_command) != 0: + logger.warning("Failed to format and mount persistent disk.") + + @classmethod + def execute_command_with_log(cls, command: str) -> int: + """Executes a shell command and logs the output, returning the return code.""" + from sky.skylet import log_lib + import os + rcode, stdout, stderr = log_lib.run_with_log( - format_mount_disk, + command, os.devnull, shell=True, stream_logs=False, require_outputs=True, ) if rcode != 0: - failure_massage = ( - "Failed to format and mount persistent disk\n" - "**** STDOUT ****\n" - "{stdout}\n" - "**** STDERR ****\n" - "{stderr}" - ) - logger.warning(failure_massage) + logger.warning(f"Command failed.\n**** STDOUT ****\n{stdout}\n**** STDERR ****" + f"\n{stderr}") + return rcode - return None @classmethod def get_instance_info(cls, project_id: str, availability_zone: str, From b07b2e239dc23654d0d8209c8bd8c2ad338fa8aa Mon Sep 17 00:00:00 2001 From: jackyk02 Date: Sun, 28 Apr 2024 20:13:28 -0700 Subject: [PATCH 4/4] fix issue with loading resource --- sky/provision/gcp/instance_utils.py | 5 ++++- sky/provision/provisioner.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sky/provision/gcp/instance_utils.py b/sky/provision/gcp/instance_utils.py index d087aafcefd..f816accebbc 100644 --- a/sky/provision/gcp/instance_utils.py +++ b/sky/provision/gcp/instance_utils.py @@ -1373,7 +1373,10 @@ def resize_disk(cls, project_id: str, availability_zone: str, node_config: dict, """Resizes disk for TPU VMs by adding a persistent disk when needed.""" import time from googleapiclient.errors import HttpError - resource = cls.load_resource() + from googleapiclient import discovery + + resource = discovery.build("compute", "v1") + # Determine the required disk size from configuration default_disk_size = 100 # Default boot disk size for TPUVMs diff --git a/sky/provision/provisioner.py b/sky/provision/provisioner.py index 8c86ca7f0ba..28ca8b3ae5c 100644 --- a/sky/provision/provisioner.py +++ b/sky/provision/provisioner.py @@ -164,6 +164,9 @@ def bulk_provision( tags={}, resume_stopped_nodes=True) + if 'initDiskSize' in original_config: + bootstrap_config.node_config['metadata']['diskSize'] = original_config['initDiskSize'] + with provision_logging.setup_provision_logging(log_dir): try: logger.debug(f'SkyPilot version: {sky.__version__}; '