Skip to content

Commit

Permalink
[Paperspace] Add paperspace cloud (#3197)
Browse files Browse the repository at this point in the history
* paperspace first commit

* switch to the new paperspace API

* check, launch, start, stop, down, auto-stop work

* autodown works

* fix typos

* update doc

* update smoke tests

* fix doc formatting

* add more machine images

* update smoke tests

* fix get for list network

* patch autodown

* disable docker for paperspace

* enable docker

* patch template

* fix imports and nits

* update smoke, move auth to bootstrap, add 'stopping' and 'serviceready' states, wait for 'stopping' instances to be 'off' before starting

* use `self._REPR`

Co-authored-by: Zhanghao Wu <[email protected]>

* add mem and disk tier defaults

Co-authored-by: Zhanghao Wu <[email protected]>

* add mem requirement

Co-authored-by: Zhanghao Wu <[email protected]>

* wait for all instances to stop

Co-authored-by: Zhanghao Wu <[email protected]>

* switch to debug from info

Co-authored-by: Zhanghao Wu <[email protected]>

* fix logging for restarting instances

Co-authored-by: Zhanghao Wu <[email protected]>

* remove none check

* ports are exposed by default, enable open ports

* add user hash to startup script to prevent collision between team members using same API key

* templated ssh keys into

Co-authored-by: Zhanghao Wu <[email protected]>

* remove paperspace specific auth, use default configure_ssh_info

* handle worker only for stop

Co-authored-by: Zhanghao Wu <[email protected]>

* disable ssh password auth to fix terminal stdin bug

* format

* update public key template

Co-authored-by: Zhanghao Wu <[email protected]>

* add open ports

* remove non terminated

Co-authored-by: Zhanghao Wu <[email protected]>

* fail fast if capacity is not met

Co-authored-by: Zhanghao Wu <[email protected]>

* lint

* rename to MAX_POLLS_FOR_UP_OR_STOP

* capacity error fail early

Co-authored-by: Zhanghao Wu <[email protected]>

* Remove stopped instance optimization

* Update sky/clouds/paperspace.py

---------

Co-authored-by: Zhanghao Wu <[email protected]>
  • Loading branch information
asaiacai and Michaelvll authored Mar 25, 2024
1 parent eb442b0 commit 99d0aff
Show file tree
Hide file tree
Showing 17 changed files with 1,204 additions and 18 deletions.
10 changes: 10 additions & 0 deletions docs/source/getting-started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ This will produce a summary like:
OCI: enabled
Lambda: enabled
RunPod: enabled
Paperspace: enabled
Fluidstack: enabled
Cudo: enabled
IBM: enabled
Expand Down Expand Up @@ -267,6 +268,15 @@ Lambda Cloud
mkdir -p ~/.lambda_cloud
echo "api_key = <your_api_key_here>" > ~/.lambda_cloud/lambda_keys
Paperspace
~~~~~~~~~~~~~~~~~~

`Paperspace <https://www.paperspace.com/>`_ is a cloud provider that provides access to GPU accelerated VMs. To configure Paperspace access, follow `these instructions to generate an API key <https://docs.digitalocean.com/reference/paperspace/api-keys/>`_. Add the API key with:

.. code-block:: shell
mkdir -p ~/.paperspace
echo "{'api_key' : <your_api_key_here>}" > ~/.paperspace/config.json
RunPod
~~~~~~~~~~
Expand Down
2 changes: 2 additions & 0 deletions sky/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def get_git_commit():
SCP = clouds.SCP
Kubernetes = clouds.Kubernetes
OCI = clouds.OCI
Paperspace = clouds.Paperspace
RunPod = clouds.RunPod
Vsphere = clouds.Vsphere
Fluidstack = clouds.Fluidstack
Expand All @@ -96,6 +97,7 @@ def get_git_commit():
'Kubernetes',
'Lambda',
'OCI',
'Paperspace',
'RunPod',
'SCP',
'Vsphere',
Expand Down
5 changes: 2 additions & 3 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -976,9 +976,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
"""
config = common_utils.read_yaml(cluster_config_file)
# Check the availability of the cloud type.
if isinstance(
cloud,
(clouds.AWS, clouds.OCI, clouds.SCP, clouds.Vsphere, clouds.Cudo)):
if isinstance(cloud, (clouds.AWS, clouds.OCI, clouds.SCP, clouds.Vsphere,
clouds.Cudo, clouds.Paperspace)):
config = auth.configure_ssh_info(config)
elif isinstance(cloud, clouds.GCP):
config = auth.setup_gcp_authentication(config)
Expand Down
1 change: 1 addition & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def _get_cluster_config_template(cloud):
clouds.IBM: 'ibm-ray.yml.j2',
clouds.SCP: 'scp-ray.yml.j2',
clouds.OCI: 'oci-ray.yml.j2',
clouds.Paperspace: 'paperspace-ray.yml.j2',
clouds.RunPod: 'runpod-ray.yml.j2',
clouds.Kubernetes: 'kubernetes-ray.yml.j2',
clouds.Vsphere: 'vsphere-ray.yml.j2',
Expand Down
3 changes: 3 additions & 0 deletions sky/clouds/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Clouds in Sky."""

from sky.clouds.cloud import Cloud
from sky.clouds.cloud import cloud_in_list
from sky.clouds.cloud import CloudImplementationFeatures
Expand All @@ -19,6 +20,7 @@
from sky.clouds.kubernetes import Kubernetes
from sky.clouds.lambda_cloud import Lambda
from sky.clouds.oci import OCI
from sky.clouds.paperspace import Paperspace
from sky.clouds.runpod import RunPod
from sky.clouds.scp import SCP
from sky.clouds.vsphere import Vsphere
Expand All @@ -31,6 +33,7 @@
'Cudo',
'GCP',
'Lambda',
'Paperspace',
'SCP',
'RunPod',
'OCI',
Expand Down
293 changes: 293 additions & 0 deletions sky/clouds/paperspace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
""" Paperspace Cloud. """

import json
import typing
from typing import Dict, Iterator, List, Optional, Tuple

import requests

from sky import clouds
from sky.clouds import service_catalog
from sky.provision.paperspace import utils
from sky.utils import resources_utils

if typing.TYPE_CHECKING:
from sky import resources as resources_lib

# Names of the Paperspace credential files. Each entry is resolved under
# `~/.paperspace/` when the credential file mounts are built.
_CREDENTIAL_FILES = ['config.json']


@clouds.CLOUD_REGISTRY.register
class Paperspace(clouds.Cloud):
    """Cloud implementation for Paperspace GPU VMs.

    Every catalog query in this class is delegated to `service_catalog`
    with the cloud name `'paperspace'`. Spot instances, zones, custom
    images, custom disk tiers and disk cloning are not supported.
    """

    _REPR = 'Paperspace'

    # Maps each unsupported feature to the reason reported to the user.
    _CLOUD_UNSUPPORTED_FEATURES = {
        clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
            f'Migrating disk is not supported in {_REPR}.',
        clouds.CloudImplementationFeatures.SPOT_INSTANCE:
            f'Spot instances are not supported in {_REPR}.',
        clouds.CloudImplementationFeatures.IMAGE_ID:
            f'Specifying image ID is not supported for {_REPR}.',
        clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
            f'Custom disk tiers is not supported in {_REPR}.',
    }
    _MAX_CLUSTER_NAME_LEN_LIMIT = 120
    _regions: List[clouds.Region] = []

    # Provisioning and status checks both go through the SkyPilot
    # provisioner API.
    PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
    STATUS_VERSION = clouds.StatusVersion.SKYPILOT

    @classmethod
    def _unsupported_features_for_resources(
        cls, resources: 'resources_lib.Resources'
    ) -> Dict[clouds.CloudImplementationFeatures, str]:
        """Returns the features this cloud does not support.

        The answer does not depend on `resources`: the class-level
        `_CLOUD_UNSUPPORTED_FEATURES` mapping is returned as-is.

        Returns:
            A dict of {feature: reason} for the unsupported features.
        """
        del resources  # unused
        return cls._CLOUD_UNSUPPORTED_FEATURES

    @classmethod
    def _max_cluster_name_length(cls) -> Optional[int]:
        """Returns the maximum cluster name length for this cloud."""
        return cls._MAX_CLUSTER_NAME_LEN_LIMIT

    @classmethod
    def regions_with_offering(
        cls,
        instance_type: str,
        accelerators: Optional[Dict[str, int]],
        use_spot: bool,
        region: Optional[str],
        zone: Optional[str],
    ) -> List[clouds.Region]:
        """Returns the regions in which `instance_type` is offered.

        Spot requests yield an empty list. When `region` is given, only
        the catalog regions with that exact name are kept.
        """
        assert zone is None, 'Paperspace does not support zones.'
        del accelerators, zone  # unused
        if use_spot:
            return []

        offered = service_catalog.get_region_zones_for_instance_type(
            instance_type, use_spot, 'paperspace')
        if region is None:
            return offered
        return [r for r in offered if r.name == region]

    @classmethod
    def get_vcpus_mem_from_instance_type(
        cls,
        instance_type: str,
    ) -> Tuple[Optional[float], Optional[float]]:
        """Returns the catalog's (vCPUs, memory) pair for `instance_type`."""
        return service_catalog.get_vcpus_mem_from_instance_type(
            instance_type, clouds='paperspace')

    @classmethod
    def zones_provision_loop(
        cls,
        *,
        region: str,
        num_nodes: int,
        instance_type: str,
        accelerators: Optional[Dict[str, int]] = None,
        use_spot: bool = False,
    ) -> Iterator[None]:
        """Yields once per matching region; the yielded value is None.

        Each matching region is asserted to carry no zones, so the value
        yielded (`zones`) is always None.
        """
        del num_nodes  # unused
        matching_regions = cls.regions_with_offering(instance_type,
                                                     accelerators,
                                                     use_spot,
                                                     region=region,
                                                     zone=None)
        for candidate in matching_regions:
            assert candidate.zones is None, candidate
            yield candidate.zones

    def instance_type_to_hourly_cost(
        self,
        instance_type: str,
        use_spot: bool,
        region: Optional[str] = None,
        zone: Optional[str] = None,
    ) -> float:
        """Returns the catalog's hourly cost for `instance_type`."""
        return service_catalog.get_hourly_cost(
            instance_type,
            use_spot=use_spot,
            region=region,
            zone=zone,
            clouds='paperspace',
        )

    def accelerators_to_hourly_cost(
        self,
        accelerators: Dict[str, int],
        use_spot: bool,
        region: Optional[str] = None,
        zone: Optional[str] = None,
    ) -> float:
        """Returns the hourly accelerator cost, which is always 0.0.

        All arguments are ignored.
        """
        del accelerators, use_spot, region, zone  # unused
        return 0.0

    def get_egress_cost(self, num_gigabytes: float) -> float:
        """Returns the egress cost, which is always 0.0."""
        return 0.0

    def __repr__(self):
        return self._REPR

    def is_same_cloud(self, other: clouds.Cloud) -> bool:
        """Returns True iff `other` is also a Paperspace cloud object."""
        return isinstance(other, Paperspace)

    @classmethod
    def get_default_instance_type(
        cls,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        disk_tier: Optional[resources_utils.DiskTier] = None,
    ) -> Optional[str]:
        """Returns the catalog's default instance type for Paperspace.

        The `cpus`, `memory` and `disk_tier` filters are forwarded to the
        catalog lookup unchanged.
        """
        return service_catalog.get_default_instance_type(cpus=cpus,
                                                         memory=memory,
                                                         disk_tier=disk_tier,
                                                         clouds='paperspace')

    @classmethod
    def get_accelerators_from_instance_type(
            cls, instance_type: str) -> Optional[Dict[str, int]]:
        """Returns the catalog's accelerators for `instance_type`."""
        return service_catalog.get_accelerators_from_instance_type(
            instance_type, clouds='paperspace')

    @classmethod
    def get_zone_shell_cmd(cls) -> Optional[str]:
        """Returns None: no zone-detection shell command is provided."""
        return None

    def make_deploy_resources_variables(
            self,
            resources: 'resources_lib.Resources',
            cluster_name_on_cloud: str,
            region: 'clouds.Region',
            zones: Optional[List['clouds.Zone']],
            dryrun: bool = False) -> Dict[str, Optional[str]]:
        """Builds the variables for the cluster deployment template.

        Returns:
            A dict with the instance type, the region name, and the
            accelerators of the instance type serialized as compact JSON
            under 'custom_resources' (None when there are none).
        """
        del zones, dryrun

        acc_dict = self.get_accelerators_from_instance_type(
            resources.instance_type)
        custom_resources = (None if acc_dict is None else json.dumps(
            acc_dict, separators=(',', ':')))

        return {
            'instance_type': resources.instance_type,
            'custom_resources': custom_resources,
            'region': region.name,
        }

    def _get_feasible_launchable_resources(
            self, resources: 'resources_lib.Resources'):
        """Returns (launchable resources, fuzzy candidate names).

        Spot requests are never feasible. A request that already names an
        instance type is returned with its accelerators cleared.
        Otherwise the instance type is picked from the catalog: the
        default type when no accelerator is requested, or the types
        matching the single requested accelerator.
        """
        if resources.use_spot:
            return ([], [])
        if resources.instance_type is not None:
            assert resources.is_launchable(), resources
            return ([resources.copy(accelerators=None)], [])

        def _make(instance_list):
            # One copy of the request per candidate instance type, with
            # the accelerator and CPU filters cleared.
            return [
                resources.copy(
                    cloud=Paperspace(),
                    instance_type=instance_type,
                    accelerators=None,
                    cpus=None,
                ) for instance_type in instance_list
            ]

        # Currently, handle a filter on accelerators only.
        accelerators = resources.accelerators
        if accelerators is None:
            # No accelerator requested: use the default instance type.
            default_instance_type = Paperspace.get_default_instance_type(
                cpus=resources.cpus,
                memory=resources.memory,
                disk_tier=resources.disk_tier)
            if default_instance_type is None:
                return ([], [])
            return (_make([default_instance_type]), [])

        assert len(accelerators) == 1, resources
        acc, acc_count = list(accelerators.items())[0]
        instance_list, fuzzy_candidate_list = (
            service_catalog.get_instance_type_for_accelerator(
                acc,
                acc_count,
                use_spot=resources.use_spot,
                cpus=resources.cpus,
                memory=resources.memory,
                region=resources.region,
                zone=resources.zone,
                clouds='paperspace',
            ))
        launchable = [] if instance_list is None else _make(instance_list)
        return (launchable, fuzzy_candidate_list)

    @classmethod
    def check_credentials(cls) -> Tuple[bool, Optional[str]]:
        """Verifies that the user has working Paperspace credentials.

        Returns:
            (True, None) when listing instances succeeds, otherwise
            (False, reason) describing the failure.
        """
        try:
            # Listing instances is the probe: it only succeeds when the
            # client can be built and the API call goes through.
            utils.PaperspaceCloudClient().list_instances()
        except (AssertionError, KeyError, utils.PaperspaceCloudError) as e:
            # NOTE(review): the key name shown in this message should
            # match what the installation docs tell users to write into
            # config.json -- confirm against the client implementation.
            # pylint: disable=line-too-long
            return False, (
                'Failed to access Paperspace Cloud with credentials.\n '
                'To configure credentials, follow the instructions at: '
                'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#paperspace\n '
                'Generate API key and create a json at `~/.paperspace/config.json` with \n '
                ' {"apiKey": "[YOUR API KEY]"}\n '
                f'Reason: {str(e)}')
        except requests.exceptions.ConnectionError:
            return False, ('Failed to verify Paperspace Cloud credentials. '
                           'Check your network connection '
                           'and try again.')
        except Exception as e:  # pylint: disable=broad-except
            return False, str(e)
        else:
            return True, None

    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Maps each credential file under ~/.paperspace/ to itself."""
        credential_paths = (
            f'~/.paperspace/{filename}' for filename in _CREDENTIAL_FILES)
        return {path: path for path in credential_paths}

    @classmethod
    def get_current_user_identity(cls) -> Optional[List[str]]:
        """Returns None: user identity lookup is not implemented."""
        # NOTE: used for very advanced SkyPilot functionality
        # Can implement later if desired
        return None

    def instance_type_exists(self, instance_type: str) -> bool:
        """Returns whether the catalog lists `instance_type`."""
        return service_catalog.instance_type_exists(instance_type, 'paperspace')

    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
        """Validates `region` and `zone` via the Paperspace catalog."""
        return service_catalog.validate_region_zone(region,
                                                    zone,
                                                    clouds='paperspace')
3 changes: 2 additions & 1 deletion sky/clouds/service_catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

CloudFilter = Optional[Union[List[str], str]]
ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
'kubernetes', 'runpod', 'vsphere', 'cudo', 'fluidstack')
'kubernetes', 'runpod', 'vsphere', 'cudo', 'fluidstack',
'paperspace')


def _map_clouds_catalog(clouds: CloudFilter, method_name: str, *args, **kwargs):
Expand Down
Loading

0 comments on commit 99d0aff

Please sign in to comment.