Skip to content

Commit

Permalink
[GCP] Fix --disk-size for Custom Machine Images (#2718)
Browse files Browse the repository at this point in the history
* initial commit for gcp resizing disk

* disk resizing for new instances

* Change from compute_v1 to googleapiclient, Disk Resizing for custom images only, Address Formatting Issues

* Address Formatting Issues

* Address Formatting Issues

* update gcp-ray.yml comments

* address reformatting

* Create documentation for custom image cloud permissions

* [GCP] Minimal permissions for custom image

* Check existing disk size & Move implementation to GCPCompute

* Fix Format

* Removed Unnecessary Imports & Return type

* remove unnecessary check and resizing for restart

* add disk resizing for TPUVMs

* Add TODO for TPUVM resize

* Update reference to github issue for TPUVM

Co-authored-by: Tian Xia <[email protected]>

* Return None for TPU Resize

Co-authored-by: Tian Xia <[email protected]>

* shorten comments for TPU function

* Remove redundant check

* Updated try_validate_image_id to raise error when users specify the same disk size

* update try_validate_image_id

* update resources.py

* Fix formatting for resources.py

* Update resources.py

* Update resources.py

* Allowing users to create instances with the same size as the image

---------

Co-authored-by: Tian Xia <[email protected]>
  • Loading branch information
jackyk02 and cblmemo authored Nov 15, 2023
1 parent e81cbaa commit 7d9bcec
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 7 deletions.
17 changes: 13 additions & 4 deletions docs/source/cloud-setup/cloud-permissions/gcp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,17 +134,26 @@ User
compute.firewalls.list
compute.firewalls.update
8. Click **Create** to create the role.
9. Go back to the "IAM" tab and click on **GRANT ACCESS**.
10. Fill in the email address of the user in the “Add principals” section, and select ``minimal-skypilot-role`` in the “Assign roles” section. Click **Save**.
8. **Optional**: If the user needs to use custom machine images with ``sky launch --image-id``, you can additionally add the following permissions:

.. code-block:: text
compute.disks.get
compute.disks.resize
compute.images.get
compute.images.useReadOnly
9. Click **Create** to create the role.
10. Go back to the "IAM" tab and click on **GRANT ACCESS**.
11. Fill in the email address of the user in the “Add principals” section, and select ``minimal-skypilot-role`` in the “Assign roles” section. Click **Save**.


.. image:: ../../images/screenshots/gcp/create-iam.png
:width: 80%
:align: center
:alt: GCP Grant Access

11. The user should receive an invitation to the project and should be able to setup SkyPilot by following the instructions in :ref:`Installation <installation-gcp>`.
12. The user should receive an invitation to the project and should be able to set up SkyPilot by following the instructions in :ref:`Installation <installation-gcp>`.

.. note::

Expand Down
68 changes: 66 additions & 2 deletions sky/skylet/providers/gcp/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,16 @@ def create_instance(
"""
return

@abc.abstractmethod
def resize_disk(
    self,
    base_config: dict,
    instance_name: str,
    wait_for_operation: bool = True,
) -> dict:
    """Resize the disk of an instance according to ``base_config``.

    This is the abstract hook; the body here does nothing and concrete
    resource classes supply the real behaviour.

    Args:
        base_config: Node configuration describing the desired disk.
        instance_name: Name of the instance whose disk is resized.
        wait_for_operation: Whether the implementation should wait for the
            resize operation before returning.

    Returns:
        The response of the resize operation (per the annotated contract;
        this placeholder body itself returns ``None``).
    """
    return None

def create_instances(
self,
base_config: dict,
Expand Down Expand Up @@ -518,7 +528,6 @@ def _convert_resources_to_urls(
def create_instance(
self, base_config: dict, labels: dict, wait_for_operation: bool = True
) -> Tuple[dict, str]:

config = self._convert_resources_to_urls(base_config)
# removing TPU-specific default key set in config.py
config.pop("networkConfig", None)
Expand Down Expand Up @@ -621,6 +630,53 @@ def delete_instance(self, node_id: str, wait_for_operation: bool = True) -> dict

return result

def resize_disk(
    self, base_config: dict, instance_name: str, wait_for_operation: bool = True
) -> dict:
    """Resize the first disk attached to ``instance_name``.

    The target size is read from
    ``base_config["disks"][0]["initializeParams"]["diskSizeGb"]`` and sent
    to the disks ``resize`` endpoint as a string.

    Args:
        base_config: Node configuration holding the requested disk size.
        instance_name: Name of the instance whose first disk is resized.
        wait_for_operation: When true, pass the resize operation to
            ``self.wait_for_operation`` and return its result; otherwise
            return the operation response as-is.

    Returns:
        The (optionally awaited) resize operation, or an empty dict when
        the resize request raises ``HttpError``.
    """
    # Requested size, taken from the first disk entry of the node config.
    first_disk_params = base_config["disks"][0]["initializeParams"]
    new_size_gb = first_disk_params["diskSizeGb"]

    # Look the instance up; the disk name is the last path component of
    # the first attached disk's "source" URL.
    get_request = self.resource.instances().get(
        project=self.project_id,
        zone=self.availability_zone,
        instance=instance_name,
    )
    instance = get_request.execute()
    disk_name = instance["disks"][0]["source"].rsplit("/", 1)[-1]

    try:
        resize_request = self.resource.disks().resize(
            project=self.project_id,
            zone=self.availability_zone,
            disk=disk_name,
            body={"sizeGb": str(new_size_gb)},
        )
        operation = resize_request.execute()
    except HttpError as e:
        # Best-effort: an invalid new size (e.g. the same size as the
        # image) is rejected by the API. Log it and carry on so that such
        # instances can still be created.
        logger.warning(f"googleapiclient.errors.HttpError: {e.reason}")
        return {}

    if not wait_for_operation:
        return operation
    return self.wait_for_operation(operation)


class GCPTPU(GCPResource):
"""Abstraction around GCP TPU resource"""
Expand Down Expand Up @@ -698,7 +754,6 @@ def _list_instances(
label_filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name

def filter_instance(instance: GCPTPUNode) -> bool:

labels = instance.get_labels()
if label_filters:
for key, value in label_filters.items():
Expand Down Expand Up @@ -839,3 +894,12 @@ def delete_instance(self, node_id: str, wait_for_operation: bool = True) -> dict
result = operation

return result

def resize_disk(
    self,
    base_config: dict,
    instance_name: str,
    wait_for_operation: bool = True,
) -> dict:
    """No-op placeholder: disk resizing is not implemented for TPU VMs.

    TODO: Implement the feature to attach persistent disks for TPU VMs.
    The boot disk of TPU VMs is not resizable, so users need to add a
    persistent disk to expand disk capacity. Related issue: #2387

    All arguments are accepted for interface compatibility and ignored;
    the method returns ``None``.
    """
    return None
3 changes: 3 additions & 0 deletions sky/skylet/providers/gcp/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,9 @@ def get_order_key(node):
count -= len(reuse_node_ids)
if count:
results = resource.create_instances(base_config, labels, count)
if "sourceMachineImage" in base_config:
for _, instance_id in results:
resource.resize_disk(base_config, instance_id)
result_dict.update(
{instance_id: result for result, instance_id in results}
)
Expand Down
2 changes: 1 addition & 1 deletion sky/templates/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ docker:
{%- endif %}

provider:
# We use a custom node provider for GCP to support instance stop and reuse.
# We use a custom node provider for GCP to create, stop and reuse instances.
type: external # type: gcp
module: sky.skylet.providers.gcp.GCPNodeProvider
region: {{region}}
Expand Down

0 comments on commit 7d9bcec

Please sign in to comment.