From 3340171001ea923a4c133c721f2266d171ef46c9 Mon Sep 17 00:00:00 2001 From: mjibril Date: Mon, 29 Jul 2024 17:26:47 +0100 Subject: [PATCH 1/2] [FluidStack][API] Update FluidStack to new API * Update FluidStack to new API * Simplify deployment by removing CUDA installation and using pre-built images Co-authored-by: Mubarak Jibril --- docs/source/getting-started/installation.rst | 5 +- sky/clouds/fluidstack.py | 47 +--- .../data_fetchers/fetch_fluidstack.py | 218 ++++++++++++++---- sky/provision/fluidstack/fluidstack_utils.py | 175 +++++--------- sky/provision/fluidstack/instance.py | 48 +--- sky/templates/fluidstack-ray.yml.j2 | 1 - tests/test_smoke.py | 23 ++ 7 files changed, 273 insertions(+), 244 deletions(-) diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index be7ae1ff327..9f251a5aafe 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -301,13 +301,12 @@ RunPod Fluidstack ~~~~~~~~~~~~~~~~~~ -`Fluidstack `__ is a cloud provider offering low-cost GPUs. To configure Fluidstack access, go to the `Home `__ page on your Fluidstack console to generate an API key and then add the :code:`API key` to :code:`~/.fluidstack/api_key` and the :code:`API token` to :code:`~/.fluidstack/api_token`: - +`Fluidstack `__ is a cloud provider offering low-cost GPUs. To configure Fluidstack access, go to the `Home `__ page on your Fluidstack console to generate an API key and then add the :code:`API key` to :code:`~/.fluidstack/api_key` : .. 
code-block:: shell mkdir -p ~/.fluidstack echo "your_api_key_here" > ~/.fluidstack/api_key - echo "your_api_token_here" > ~/.fluidstack/api_token + Cudo Compute diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index c4f15a0e510..40166e06d09 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -15,8 +15,7 @@ _CREDENTIAL_FILES = [ # credential files for FluidStack, - fluidstack_utils.FLUIDSTACK_API_KEY_PATH, - fluidstack_utils.FLUIDSTACK_API_TOKEN_PATH, + fluidstack_utils.FLUIDSTACK_API_KEY_PATH ] if typing.TYPE_CHECKING: # Renaming to avoid shadowing variables. @@ -189,20 +188,12 @@ def make_deploy_resources_variables( custom_resources = json.dumps(acc_dict, separators=(',', ':')) else: custom_resources = None - cuda_installation_commands = """ - sudo wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -O /usr/local/cuda-keyring_1.1-1_all.deb; - sudo dpkg -i /usr/local/cuda-keyring_1.1-1_all.deb; - sudo apt-get update; - sudo apt-get -y install cuda-toolkit-12-3; - sudo apt-get install -y cuda-drivers; - sudo apt-get install -y python3-pip; - nvidia-smi || sudo reboot;""" + return { 'instance_type': resources.instance_type, 'custom_resources': custom_resources, 'region': region.name, - 'fluidstack_username': self.default_username(region.name), - 'cuda_installation_commands': cuda_installation_commands, + 'fluidstack_username': 'ubuntu', } def _get_feasible_launchable_resources( @@ -265,17 +256,14 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: try: assert os.path.exists( os.path.expanduser(fluidstack_utils.FLUIDSTACK_API_KEY_PATH)) - assert os.path.exists( - os.path.expanduser(fluidstack_utils.FLUIDSTACK_API_TOKEN_PATH)) except AssertionError: - return False, ( - 'Failed to access FluidStack Cloud' - ' with credentials. 
' - 'To configure credentials, go to:\n ' - ' https://console.fluidstack.io \n ' - 'to obtain an API key and API Token, ' - 'then add save the contents ' - 'to ~/.fluidstack/api_key and ~/.fluidstack/api_token \n') + return False, ('Failed to access FluidStack Cloud' + ' with credentials. ' + 'To configure credentials, go to:\n ' + ' https://dashboard.fluidstack.io \n ' + 'to obtain an API key, ' + 'then add save the contents ' + 'to ~/.fluidstack/api_key \n') except requests.exceptions.ConnectionError: return False, ('Failed to verify FluidStack Cloud credentials. ' 'Check your network connection ' @@ -298,21 +286,6 @@ def validate_region_zone(self, region: Optional[str], zone: Optional[str]): zone, clouds='fluidstack') - @classmethod - def default_username(cls, region: str) -> str: - return { - 'norway_2_eu': 'ubuntu', - 'calgary_1_canada': 'ubuntu', - 'norway_3_eu': 'ubuntu', - 'norway_4_eu': 'ubuntu', - 'india_2': 'root', - 'nevada_1_usa': 'fsuser', - 'generic_1_canada': 'ubuntu', - 'iceland_1_eu': 'ubuntu', - 'new_york_1_usa': 'fsuser', - 'illinois_1_usa': 'fsuser' - }.get(region, 'ubuntu') - @classmethod def query_status( cls, diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py b/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py index 5d50399ab89..cf943541e08 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py @@ -11,10 +11,140 @@ import requests -ENDPOINT = 'https://api.fluidstack.io/v1/plans' +ENDPOINT = 'https://platform.fluidstack.io/list_available_configurations' DEFAULT_FLUIDSTACK_API_KEY_PATH = os.path.expanduser('~/.fluidstack/api_key') -DEFAULT_FLUIDSTACK_API_TOKEN_PATH = os.path.expanduser( - '~/.fluidstack/api_token') + +plan_vcpus_memory = [{ + 'gpu_type': 'RTX_A6000_48GB', + 'gpu_count': 2, + 'min_cpu_count': 12, + 'min_memory': 110.0 +}, { + 'gpu_type': 'RTX_A6000_48GB', + 'gpu_count': 4, + 'min_cpu_count': 24, + 
'min_memory': 220.0 +}, { + 'gpu_type': 'A100_NVLINK_80GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 960.0 +}, { + 'gpu_type': 'H100_PCIE_80GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 1440.0 +}, { + 'gpu_type': 'RTX_A4000_16GB', + 'gpu_count': 2, + 'min_cpu_count': 12, + 'min_memory': 48.0 +}, { + 'gpu_type': 'H100_PCIE_80GB', + 'gpu_count': 2, + 'min_cpu_count': 60, + 'min_memory': 360.0 +}, { + 'gpu_type': 'RTX_A6000_48GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 464.0 +}, { + 'gpu_type': 'H100_NVLINK_80GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 1440.0 +}, { + 'gpu_type': 'H100_PCIE_80GB', + 'gpu_count': 1, + 'min_cpu_count': 28, + 'min_memory': 180.0 +}, { + 'gpu_type': 'RTX_A5000_24GB', + 'gpu_count': 1, + 'min_cpu_count': 8, + 'min_memory': 30.0 +}, { + 'gpu_type': 'RTX_A5000_24GB', + 'gpu_count': 2, + 'min_cpu_count': 16, + 'min_memory': 60.0 +}, { + 'gpu_type': 'L40_48GB', + 'gpu_count': 2, + 'min_cpu_count': 64, + 'min_memory': 120.0 +}, { + 'gpu_type': 'RTX_A4000_16GB', + 'gpu_count': 8, + 'min_cpu_count': 48, + 'min_memory': 192.0 +}, { + 'gpu_type': 'RTX_A4000_16GB', + 'gpu_count': 1, + 'min_cpu_count': 6, + 'min_memory': 24.0 +}, { + 'gpu_type': 'RTX_A4000_16GB', + 'gpu_count': 4, + 'min_cpu_count': 24, + 'min_memory': 96.0 +}, { + 'gpu_type': 'A100_PCIE_80GB', + 'gpu_count': 4, + 'min_cpu_count': 124, + 'min_memory': 480.0 +}, { + 'gpu_type': 'H100_PCIE_80GB', + 'gpu_count': 4, + 'min_cpu_count': 124, + 'min_memory': 720.0 +}, { + 'gpu_type': 'L40_48GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 480.0 +}, { + 'gpu_type': 'RTX_A5000_24GB', + 'gpu_count': 8, + 'min_cpu_count': 64, + 'min_memory': 240.0 +}, { + 'gpu_type': 'L40_48GB', + 'gpu_count': 1, + 'min_cpu_count': 32, + 'min_memory': 60.0 +}, { + 'gpu_type': 'RTX_A6000_48GB', + 'gpu_count': 1, + 'min_cpu_count': 6, + 'min_memory': 55.0 +}, { + 'gpu_type': 'L40_48GB', + 'gpu_count': 4, + 'min_cpu_count': 126, + 
'min_memory': 240.0 +}, { + 'gpu_type': 'A100_PCIE_80GB', + 'gpu_count': 1, + 'min_cpu_count': 28, + 'min_memory': 120.0 +}, { + 'gpu_type': 'A100_PCIE_80GB', + 'gpu_count': 8, + 'min_cpu_count': 252, + 'min_memory': 1440.0 +}, { + 'gpu_type': 'A100_PCIE_80GB', + 'gpu_count': 2, + 'min_cpu_count': 60, + 'min_memory': 240.0 +}, { + 'gpu_type': 'RTX_A5000_24GB', + 'gpu_count': 4, + 'min_cpu_count': 32, + 'min_memory': 120.0 +}] GPU_MAP = { 'H100_PCIE_80GB': 'H100', @@ -47,19 +177,15 @@ def get_regions(plans: List) -> dict: regions = {} for plan in plans: for region in plan.get('regions', []): - regions[region['id']] = region['id'] + regions[region] = region return regions def create_catalog(output_dir: str) -> None: - response = requests.get(ENDPOINT) + with open(DEFAULT_FLUIDSTACK_API_KEY_PATH, 'r', encoding='UTF-8') as f: + api_key = f.read().strip() + response = requests.get(ENDPOINT, headers={'api-key': api_key}) plans = response.json() - #plans = [plan for plan in plans if len(plan['regions']) > 0] - plans = [ - plan for plan in plans if plan['minimum_commitment'] == 'hourly' and - plan['type'] in ['preconfigured'] and - plan['gpu_type'] not in ['NO GPU', 'RTX_3080_10GB', 'RTX_3090_24GB'] - ] with open(os.path.join(output_dir, 'vms.csv'), mode='w', encoding='utf-8') as f: @@ -81,39 +207,45 @@ def create_catalog(output_dir: str) -> None: except KeyError: #print(f'Could not map {plan["gpu_type"]}') continue - gpu_memory = int( - str(plan['configuration']['gpu_memory']).replace('GB', - '')) * 1024 - gpu_cnt = int(plan['configuration']['gpu_count']) - vcpus = float(plan['configuration']['core_count']) - mem = float(plan['configuration']['ram']) - price = float(plan['price']['hourly']) * gpu_cnt - gpuinfo = { - 'Gpus': [{ - 'Name': gpu, - 'Manufacturer': 'NVIDIA', - 'Count': gpu_cnt, - 'MemoryInfo': { - 'SizeInMiB': int(gpu_memory) - }, - }], - 'TotalGpuMemoryInMiB': int(gpu_memory * gpu_cnt), - } - gpuinfo = json.dumps(gpuinfo).replace('"', "'") # pylint: 
disable=invalid-string-quote - for r in plan.get('regions', []): - if r['id'] == 'india_2': + for gpu_cnt in plan['gpu_counts']: + gpu_memory = float(plan['gpu_type'].split('_')[-1].replace( + 'GB', '')) * 1024 + try: + vcpus_mem = [ + x for x in plan_vcpus_memory + if x['gpu_type'] == plan['gpu_type'] and + x['gpu_count'] == gpu_cnt + ][0] + vcpus = vcpus_mem['min_cpu_count'] + mem = vcpus_mem['min_memory'] + except IndexError: continue - writer.writerow([ - plan['plan_id'], - gpu, - gpu_cnt, - vcpus, - mem, - price, - r['id'], - gpuinfo, - '', - ]) + price = float(plan['price_per_gpu_hr']) * gpu_cnt + gpuinfo = { + 'Gpus': [{ + 'Name': gpu, + 'Manufacturer': 'NVIDIA', + 'Count': gpu_cnt, + 'MemoryInfo': { + 'SizeInMiB': int(gpu_memory) + }, + }], + 'TotalGpuMemoryInMiB': int(gpu_memory * gpu_cnt), + } + gpuinfo = json.dumps(gpuinfo).replace('"', "'") # pylint: disable=invalid-string-quote + instance_type = f'{plan["gpu_type"]}::{gpu_cnt}' + for region in plan.get('regions', []): + writer.writerow([ + instance_type, + gpu, + gpu_cnt, + vcpus, + mem, + price, + region, + gpuinfo, + '', + ]) if __name__ == '__main__': diff --git a/sky/provision/fluidstack/fluidstack_utils.py b/sky/provision/fluidstack/fluidstack_utils.py index ebc616c0bfc..a9efb865a3c 100644 --- a/sky/provision/fluidstack/fluidstack_utils.py +++ b/sky/provision/fluidstack/fluidstack_utils.py @@ -3,7 +3,8 @@ import functools import json import os -from typing import Any, Dict, List, Optional +import time +from typing import Any, Dict, List import uuid import requests @@ -13,9 +14,8 @@ def get_key_suffix(): return str(uuid.uuid4()).replace('-', '')[:8] -ENDPOINT = 'https://api.fluidstack.io/v1/' +ENDPOINT = 'https://platform.fluidstack.io/' FLUIDSTACK_API_KEY_PATH = '~/.fluidstack/api_key' -FLUIDSTACK_API_TOKEN_PATH = '~/.fluidstack/api_token' def read_contents(path: str) -> str: @@ -46,109 +46,76 @@ def raise_fluidstack_error(response: requests.Response) -> None: raise 
FluidstackAPIError(f'{message}', status_code) -@functools.lru_cache() -def with_nvidia_drivers(region: str): - if region in ['norway_4_eu', 'generic_1_canada']: - return False - client = FluidstackClient() - plans = client.get_plans() - for plan in plans: - if region in [r['id'] for r in plan['regions']]: - if 'Ubuntu 20.04 LTS (Nvidia)' in plan['os_options']: - return True - return False - - class FluidstackClient: """FluidStack API Client""" def __init__(self): self.api_key = read_contents( - os.path.expanduser(FLUIDSTACK_API_KEY_PATH)) - self.api_token = read_contents( - os.path.expanduser(FLUIDSTACK_API_TOKEN_PATH)) + os.path.expanduser(FLUIDSTACK_API_KEY_PATH)).strip() def get_plans(self): - response = requests.get(ENDPOINT + 'plans') + response = requests.get(ENDPOINT + 'list_available_configurations', + headers={'api-key': self.api_key}) raise_fluidstack_error(response) plans = response.json() - plans = [ - plan for plan in plans - if plan['minimum_commitment'] == 'hourly' and plan['type'] in - ['preconfigured', 'custom'] and plan['gpu_type'] != 'NO GPU' - ] return plans - def list_instances( - self, - tag_filters: Optional[Dict[str, - str]] = None) -> List[Dict[str, Any]]: + def list_instances(self) -> List[Dict[str, Any]]: response = requests.get( - ENDPOINT + 'servers', - auth=(self.api_key, self.api_token), + ENDPOINT + 'instances', + headers={'api-key': self.api_key}, ) raise_fluidstack_error(response) instances = response.json() - filtered_instances = [] - - for instance in instances: - if isinstance(instance['tags'], str): - instance['tags'] = json.loads(instance['tags']) - if not instance['tags']: - instance['tags'] = {} - if tag_filters: - for key in tag_filters: - if instance['tags'].get(key, None) != tag_filters[key]: - break - else: - filtered_instances.append(instance) - else: - filtered_instances.append(instance) - - return filtered_instances + return instances def create_instance( self, instance_type: str = '', - hostname: str = '', + name: 
str = '', region: str = '', ssh_pub_key: str = '', count: int = 1, ) -> List[str]: """Launch new instances.""" - config: Dict[str, Any] = {} plans = self.get_plans() regions = self.list_regions() + gpu_type, gpu_count = instance_type.split('::') + gpu_count = int(gpu_count) + plans = [ - plan for plan in plans if plan['plan_id'] == instance_type and - region in [r['id'] for r in plan['regions']] + plan for plan in plans if plan['gpu_type'] == gpu_type and + gpu_count in plan['gpu_counts'] and region in plan['regions'] ] if not plans: raise FluidstackAPIError( f'Plan {instance_type} out of stock in region {region}') ssh_key = self.get_or_add_ssh_key(ssh_pub_key) - os_id = 'Ubuntu 20.04 LTS' - body = dict(plan=None if config else instance_type, - region=regions[region], - os=os_id, - hostname=hostname, - ssh_keys=[ssh_key['id']], - multiplicity=count, - config=config) - - response = requests.post(ENDPOINT + 'server', - auth=(self.api_key, self.api_token), - json=body) - raise_fluidstack_error(response) - instance_ids = response.json().get('multiple') - assert all(id is not None for id in instance_ids), instance_ids + default_operating_system = 'ubuntu_22_04_lts_nvidia' + instance_ids = [] + for _ in range(count): + body = dict(gpu_type=gpu_type, + gpu_count=gpu_count, + region=regions[region], + operating_system_label=default_operating_system, + name=name, + ssh_key=ssh_key['name']) + + response = requests.post(ENDPOINT + 'instances', + headers={'api-key': self.api_key}, + json=body) + raise_fluidstack_error(response) + instance_id = response.json().get('id') + instance_ids.append(instance_id) + time.sleep(1) + return instance_ids def list_ssh_keys(self): - response = requests.get(ENDPOINT + 'ssh', - auth=(self.api_key, self.api_token)) + response = requests.get(ENDPOINT + 'ssh_keys', + headers={'api-key': self.api_key}) raise_fluidstack_error(response) return response.json() @@ -156,86 +123,50 @@ def get_or_add_ssh_key(self, ssh_pub_key: str = '') -> Dict[str, str]: 
"""Add ssh key if not already added.""" ssh_keys = self.list_ssh_keys() for key in ssh_keys: - if key['public_key'].strip() == ssh_pub_key.strip(): - return { - 'id': key['id'], - 'name': key['name'], - 'ssh_key': ssh_pub_key - } + if key['public_key'].strip().split()[:2] == ssh_pub_key.strip( + ).split()[:2]: + return {'name': key['name'], 'ssh_key': ssh_pub_key} ssh_key_name = 'skypilot-' + get_key_suffix() response = requests.post( - ENDPOINT + 'ssh', - auth=(self.api_key, self.api_token), + ENDPOINT + 'ssh_keys', + headers={'api-key': self.api_key}, json=dict(name=ssh_key_name, public_key=ssh_pub_key), ) raise_fluidstack_error(response) - key_id = response.json()['id'] - return {'id': key_id, 'name': ssh_key_name, 'ssh_key': ssh_pub_key} + return {'name': ssh_key_name, 'ssh_key': ssh_pub_key} @functools.lru_cache() def list_regions(self): - response = requests.get(ENDPOINT + 'plans') - raise_fluidstack_error(response) - plans = response.json() - plans = [ - plan for plan in plans - if plan['minimum_commitment'] == 'hourly' and plan['type'] in - ['preconfigured', 'custom'] and plan['gpu_type'] != 'NO GPU' - ] + plans = self.get_plans() def get_regions(plans: List) -> dict: """Return a list of regions where the plan is available.""" regions = {} for plan in plans: for region in plan.get('regions', []): - regions[region['id']] = region['id'] + regions[region] = region return regions regions = get_regions(plans) return regions def delete(self, instance_id: str): - response = requests.delete(ENDPOINT + 'server/' + instance_id, - auth=(self.api_key, self.api_token)) + response = requests.delete(ENDPOINT + 'instances/' + instance_id, + headers={'api-key': self.api_key}) raise_fluidstack_error(response) return response.json() def stop(self, instance_id: str): - response = requests.put(ENDPOINT + 'server/' + instance_id + '/stop', - auth=(self.api_key, self.api_token)) - raise_fluidstack_error(response) - return response.json() - - def restart(self, instance_id: str): - 
response = requests.post(ENDPOINT + 'server/' + instance_id + '/reboot', - auth=(self.api_key, self.api_token)) - raise_fluidstack_error(response) - return response.json() - - def info(self, instance_id: str): - response = requests.get(ENDPOINT + f'server/{instance_id}', - auth=(self.api_key, self.api_token)) - raise_fluidstack_error(response) - return response.json() - - def status(self, instance_id: str): - response = self.info(instance_id) - return response['status'] - - def add_tags(self, instance_id: str, tags: Dict[str, str]) -> str: - response = requests.patch( - ENDPOINT + f'server/{instance_id}/tag', - auth=(self.api_key, self.api_token), - json=dict(tags=json.dumps(tags)), - ) + response = requests.put(ENDPOINT + 'instances/' + instance_id + '/stop', + headers={'api-key': self.api_key}) raise_fluidstack_error(response) return response.json() - def rename(self, instance_id: str, hostname: str) -> str: - response = requests.patch( - ENDPOINT + f'server/{instance_id}/rename', - auth=(self.api_key, self.api_token), - json=dict(name=hostname), + def rename(self, instance_id: str, name: str) -> str: + response = requests.put( + ENDPOINT + f'instances/{instance_id}/rename', + headers={'api-key': self.api_key}, + json=dict(new_instance_name=name), ) raise_fluidstack_error(response) return response.json() diff --git a/sky/provision/fluidstack/instance.py b/sky/provision/fluidstack/instance.py index e870ff15e0c..538aafc8887 100644 --- a/sky/provision/fluidstack/instance.py +++ b/sky/provision/fluidstack/instance.py @@ -27,7 +27,7 @@ def get_internal_ip(node_info: Dict[str, Any]) -> None: node_info['internal_ip'] = node_info['ip_address'] runner = command_runner.SSHCommandRunner( (node_info['ip_address'], 22), - ssh_user=node_info['capabilities']['default_user_name'], + ssh_user='ubuntu', ssh_private_key=auth.PRIVATE_SSH_KEY_PATH) result = runner.run(_GET_INTERNAL_IP_CMD, require_outputs=True, @@ -61,7 +61,7 @@ def _filter_instances( if (include_instances is not 
None and instance['id'] not in include_instances): continue - if instance.get('hostname') in possible_names: + if instance.get('name') in possible_names: filtered_instances[instance['id']] = instance return filtered_instances @@ -69,7 +69,7 @@ def _filter_instances( def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]: head_instance_id = None for inst_id, inst in instances.items(): - if inst['hostname'].endswith('-head'): + if inst['name'].endswith('-head'): head_instance_id = inst_id break return head_instance_id @@ -80,16 +80,7 @@ def run_instances(region: str, cluster_name_on_cloud: str, """Runs instances for the given cluster.""" pending_status = [ - 'create', - 'requesting', - 'provisioning', - 'customizing', - 'starting', - 'stopping', - 'start', - 'stop', - 'reboot', - 'rebooting', + 'pending', ] while True: instances = _filter_instances(cluster_name_on_cloud, pending_status) @@ -127,7 +118,7 @@ def rename(instance_id: str, new_name: str) -> None: f'{instance_name}') rename(instance_id, instance_name) if (instance_id != head_instance_id and - instance['hostname'].endswith('-head')): + instance['name'].endswith('-head')): # Multiple head instances exist. # This is a rare case when the instance name was manually modified # on the cloud or some unexpected behavior happened. 
@@ -167,7 +158,7 @@ def rename(instance_id: str, new_name: str) -> None: node_type = 'head' if head_instance_id is None else 'worker' try: instance_ids = utils.FluidstackClient().create_instance( - hostname=f'{cluster_name_on_cloud}-{node_type}', + name=f'{cluster_name_on_cloud}-{node_type}', instance_type=config.node_config['InstanceType'], ssh_pub_key=config.node_config['AuthorizedKey'], region=region) @@ -184,9 +175,6 @@ def rename(instance_id: str, new_name: str) -> None: instances = _filter_instances(cluster_name_on_cloud, pending_status + ['running']) if len(instances) < config.count: - # Some of pending instances have been convert to a state that will - # not convert to `running` status. This can be due to resource - # availability issue. all_instances = _filter_instances( cluster_name_on_cloud, status_filters=None, @@ -253,15 +241,11 @@ def terminate_instances( instances = _filter_instances(cluster_name_on_cloud, None) for inst_id, inst in instances.items(): logger.debug(f'Terminating instance {inst_id}: {inst}') - if worker_only and inst['hostname'].endswith('-head'): + if worker_only and inst['name'].endswith('-head'): continue try: utils.FluidstackClient().delete(inst_id) except Exception as e: # pylint: disable=broad-except - if (isinstance(e, utils.FluidstackAPIError) and - 'Machine is already terminated' in str(e)): - logger.debug(f'Instance {inst_id} is already terminated.') - continue with ux_utils.print_exception_no_traceback(): raise RuntimeError( f'Failed to terminate instance {inst_id}: ' @@ -291,7 +275,7 @@ def get_cluster_info( tags={}, ) ] - if instance_info['hostname'].endswith('-head'): + if instance_info['name'].endswith('-head'): head_instance_id = instance_id return common.ClusterInfo(instances=instances, @@ -311,22 +295,10 @@ def query_instances( instances = _filter_instances(cluster_name_on_cloud, None) instances = _filter_instances(cluster_name_on_cloud, None) status_map = { - 'provisioning': status_lib.ClusterStatus.INIT, - 
'requesting': status_lib.ClusterStatus.INIT, - 'create': status_lib.ClusterStatus.INIT, - 'customizing': status_lib.ClusterStatus.INIT, - 'stopping': status_lib.ClusterStatus.STOPPED, - 'stop': status_lib.ClusterStatus.STOPPED, - 'start': status_lib.ClusterStatus.INIT, - 'reboot': status_lib.ClusterStatus.STOPPED, - 'rebooting': status_lib.ClusterStatus.STOPPED, + 'pending': status_lib.ClusterStatus.INIT, 'stopped': status_lib.ClusterStatus.STOPPED, - 'starting': status_lib.ClusterStatus.INIT, 'running': status_lib.ClusterStatus.UP, - 'failed to create': status_lib.ClusterStatus.INIT, - 'timeout error': status_lib.ClusterStatus.INIT, - 'out of stock': status_lib.ClusterStatus.INIT, - 'terminating': None, + 'unhealthy': status_lib.ClusterStatus.INIT, 'terminated': None, } statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {} diff --git a/sky/templates/fluidstack-ray.yml.j2 b/sky/templates/fluidstack-ray.yml.j2 index 309a5393828..3eb277ec6d9 100644 --- a/sky/templates/fluidstack-ray.yml.j2 +++ b/sky/templates/fluidstack-ray.yml.j2 @@ -65,7 +65,6 @@ setup_commands: sudo pkill -9 apt-get; sudo pkill -9 dpkg; sudo dpkg --configure -a; - {{ cuda_installation_commands }} mkdir -p ~/.ssh; touch ~/.ssh/config; {{ conda_installation_commands }} {{ ray_skypilot_installation_commands }} diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 325a836cf4c..c4347f53a21 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -962,6 +962,7 @@ def test_env_check(generic_cloud: str): # ---------- file_mounts ---------- +@pytest.mark.no_fluidstack # Requires other clouds to be enabled @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. 
def test_file_mounts(generic_cloud: str): name = _get_cluster_name() @@ -1168,6 +1169,7 @@ def test_kubernetes_storage_mounts(): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support docker for now @pytest.mark.parametrize( 'image_id', [ @@ -1533,6 +1535,7 @@ def test_job_queue_multinode(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs def test_large_job_queue(generic_cloud: str): name = _get_cluster_name() @@ -1576,6 +1579,7 @@ def test_large_job_queue(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # No FluidStack VM has 8 CPUs @pytest.mark.no_lambda_cloud # No Lambda Cloud VM has 8 CPUs def test_fast_large_job_queue(generic_cloud: str): # This is to test the jobs can be scheduled quickly when there are many jobs in the queue. @@ -1683,6 +1687,7 @@ def test_multi_echo(generic_cloud: str): # ---------- Task: 1 node training. ---------- +@pytest.mark.no_fluidstack # Fluidstack does not have T4 gpus for now @pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus @pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA @pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead. 
@@ -2292,6 +2297,7 @@ def test_cancel_azure(): run_one_test(test) +@pytest.mark.no_fluidstack # Fluidstack does not support V100 gpus for now @pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus @pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA @pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi @@ -2398,6 +2404,7 @@ def test_stop_gcp_spot(): # ---------- Testing managed job ---------- +@pytest.mark.no_fluidstack @pytest.mark.managed_jobs def test_managed_jobs(generic_cloud: str): """Test the managed jobs yaml.""" @@ -3021,6 +3028,7 @@ def test_managed_jobs_tpu(): # ---------- Testing env for managed jobs ---------- +@pytest.mark.no_fluidstack @pytest.mark.managed_jobs def test_managed_jobs_inline_env(generic_cloud: str): """Test managed jobs env""" @@ -3124,6 +3132,7 @@ def test_kubernetes_custom_image(image_id): run_one_test(test) +@pytest.mark.no_fluidstack def test_azure_start_stop_two_nodes(): name = _get_cluster_name() test = Test( @@ -3457,6 +3466,7 @@ def test_skyserve_kubernetes_http(): run_one_test(test) +@pytest.mark.no_fluidstack # Fluidstack does not support T4 gpus for now @pytest.mark.serve def test_skyserve_llm(generic_cloud: str): """Test skyserve with real LLM usecase""" @@ -3514,6 +3524,7 @@ def test_skyserve_spot_recovery(): run_one_test(test) +@pytest.mark.no_fluidstack # Fluidstack does not support spot instances @pytest.mark.serve @pytest.mark.no_kubernetes def test_skyserve_base_ondemand_fallback(generic_cloud: str): @@ -3578,6 +3589,7 @@ def test_skyserve_dynamic_ondemand_fallback(): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not cloud storage @pytest.mark.serve def test_skyserve_user_bug_restart(generic_cloud: str): """Tests that we restart the service after user bug.""" @@ -3611,6 +3623,7 @@ def test_skyserve_user_bug_restart(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack @pytest.mark.serve @pytest.mark.no_kubernetes # Replicas on 
k8s may be running on the same node and have the same public IP def test_skyserve_load_balancer(generic_cloud: str): @@ -3677,6 +3690,7 @@ def test_skyserve_auto_restart(): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support cloud storage @pytest.mark.serve def test_skyserve_cancel(generic_cloud: str): """Test skyserve with cancel""" @@ -3702,6 +3716,7 @@ def test_skyserve_cancel(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support cloud storage @pytest.mark.serve def test_skyserve_streaming(generic_cloud: str): """Test skyserve with streaming""" @@ -3721,6 +3736,7 @@ def test_skyserve_streaming(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support cloud storage @pytest.mark.serve def test_skyserve_readiness_timeout_fail(generic_cloud: str): """Test skyserve with large readiness probe latency, expected to fail""" @@ -3744,6 +3760,7 @@ def test_skyserve_readiness_timeout_fail(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support cloud storage @pytest.mark.serve def test_skyserve_large_readiness_timeout(generic_cloud: str): """Test skyserve with customized large readiness timeout""" @@ -3762,6 +3779,7 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support cloud storage @pytest.mark.serve def test_skyserve_update(generic_cloud: str): """Test skyserve with update""" @@ -3790,6 +3808,7 @@ def test_skyserve_update(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support cloud storage @pytest.mark.serve def test_skyserve_rolling_update(generic_cloud: str): """Test skyserve with rolling update""" @@ -3867,6 +3886,7 @@ def test_skyserve_fast_update(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support cloud storage @pytest.mark.serve def 
test_skyserve_update_autoscale(generic_cloud: str): """Test skyserve update with autoscale""" @@ -3903,6 +3923,7 @@ def test_skyserve_update_autoscale(generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack @pytest.mark.serve @pytest.mark.no_kubernetes # Spot instances are not supported in Kubernetes @pytest.mark.parametrize('mode', ['rolling', 'blue_green']) @@ -3966,6 +3987,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): run_one_test(test) +@pytest.mark.no_fluidstack @pytest.mark.serve def test_skyserve_failures(generic_cloud: str): """Test replica failure statuses""" @@ -5023,6 +5045,7 @@ def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, assert '4' in cnt_output.decode('utf-8'), \ 'Some items listed in .gitignore and .git/info/exclude are not excluded.' + @pytest.mark.no_fluidstack @pytest.mark.parametrize('ext_bucket_fixture, store_type', [('tmp_awscli_bucket', storage_lib.StoreType.S3), ('tmp_gsutil_bucket', storage_lib.StoreType.GCS), From 8b40139f84c8ea70341712a720c44a3b4d276cf7 Mon Sep 17 00:00:00 2001 From: mjibril Date: Tue, 30 Jul 2024 14:32:37 +0100 Subject: [PATCH 2/2] fix format --- sky/clouds/fluidstack.py | 12 ++++++++++++ tests/test_smoke.py | 28 ++++++++++++---------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index 4bc4fca9d8a..ef397d4c55e 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -261,6 +261,18 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: try: assert os.path.exists( os.path.expanduser(fluidstack_utils.FLUIDSTACK_API_KEY_PATH)) + + with open(os.path.expanduser( + fluidstack_utils.FLUIDSTACK_API_KEY_PATH), + encoding='UTF-8') as f: + api_key = f.read().strip() + if not api_key.startswith('api_key'): + return False, ('Invalid FluidStack API key format. 
' + 'To configure credentials, go to:\n ' + ' https://dashboard.fluidstack.io \n ' + 'to obtain an API key, ' + 'then add save the contents ' + 'to ~/.fluidstack/api_key \n') except AssertionError: return False, ('Failed to access FluidStack Cloud' ' with credentials. ' diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 952832718fb..978a53a7efe 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -839,6 +839,7 @@ def test_image_no_conda(): run_one_test(test) +@pytest.mark.no_fluidstack # FluidStack does not support stopping instances in SkyPilot implementation @pytest.mark.no_kubernetes # Kubernetes does not support stopping instances def test_custom_default_conda_env(generic_cloud: str): name = _get_cluster_name() @@ -963,7 +964,6 @@ def test_env_check(generic_cloud: str): # ---------- file_mounts ---------- -@pytest.mark.no_fluidstack # Requires other clouds to be enabled @pytest.mark.no_scp # SCP does not support num_nodes > 1 yet. Run test_scp_file_mounts instead. 
def test_file_mounts(generic_cloud: str): name = _get_cluster_name() @@ -1170,7 +1170,6 @@ def test_kubernetes_storage_mounts(): run_one_test(test) -@pytest.mark.no_fluidstack # FluidStack does not support docker for now @pytest.mark.parametrize( 'image_id', [ @@ -2420,7 +2419,6 @@ def test_stop_gcp_spot(): # ---------- Testing managed job ---------- -@pytest.mark.no_fluidstack @pytest.mark.managed_jobs def test_managed_jobs(generic_cloud: str): """Test the managed jobs yaml.""" @@ -3044,7 +3042,6 @@ def test_managed_jobs_tpu(): # ---------- Testing env for managed jobs ---------- -@pytest.mark.no_fluidstack @pytest.mark.managed_jobs def test_managed_jobs_inline_env(generic_cloud: str): """Test managed jobs env""" @@ -3148,7 +3145,6 @@ def test_kubernetes_custom_image(image_id): run_one_test(test) - @pytest.mark.azure def test_azure_start_stop_two_nodes(): name = _get_cluster_name() @@ -3606,7 +3602,8 @@ def test_skyserve_dynamic_ondemand_fallback(): run_one_test(test) -@pytest.mark.no_fluidstack # FluidStack does not cloud storage +# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs +@pytest.mark.no_fluidstack @pytest.mark.serve def test_skyserve_user_bug_restart(generic_cloud: str): """Tests that we restart the service after user bug.""" @@ -3640,7 +3637,6 @@ def test_skyserve_user_bug_restart(generic_cloud: str): run_one_test(test) -@pytest.mark.no_fluidstack @pytest.mark.serve @pytest.mark.no_kubernetes # Replicas on k8s may be running on the same node and have the same public IP def test_skyserve_load_balancer(generic_cloud: str): @@ -3707,7 +3703,6 @@ def test_skyserve_auto_restart(): run_one_test(test) -@pytest.mark.no_fluidstack # FluidStack does not support cloud storage @pytest.mark.serve def test_skyserve_cancel(generic_cloud: str): """Test skyserve with cancel""" @@ -3733,7 +3728,6 @@ def test_skyserve_cancel(generic_cloud: str): run_one_test(test) -@pytest.mark.no_fluidstack # FluidStack does not 
support cloud storage
 @pytest.mark.serve
 def test_skyserve_streaming(generic_cloud: str):
     """Test skyserve with streaming"""
@@ -3753,7 +3747,6 @@ def test_skyserve_streaming(generic_cloud: str):
     run_one_test(test)
 
 
-@pytest.mark.no_fluidstack  # FluidStack does not support cloud storage
 @pytest.mark.serve
 def test_skyserve_readiness_timeout_fail(generic_cloud: str):
     """Test skyserve with large readiness probe latency, expected to fail"""
@@ -3777,7 +3770,6 @@ def test_skyserve_readiness_timeout_fail(generic_cloud: str):
     run_one_test(test)
 
 
-@pytest.mark.no_fluidstack  # FluidStack does not support cloud storage
 @pytest.mark.serve
 def test_skyserve_large_readiness_timeout(generic_cloud: str):
     """Test skyserve with customized large readiness timeout"""
@@ -3796,7 +3788,8 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str):
     run_one_test(test)
 
 
-@pytest.mark.no_fluidstack  # FluidStack does not support cloud storage
+# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs
+@pytest.mark.no_fluidstack
 @pytest.mark.serve
 def test_skyserve_update(generic_cloud: str):
     """Test skyserve with update"""
@@ -3825,7 +3818,10 @@ def test_skyserve_update(generic_cloud: str):
     run_one_test(test)
 
 
-@pytest.mark.no_fluidstack  # FluidStack does not support cloud storage
+# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs
+@pytest.mark.no_fluidstack
+
+
 @pytest.mark.serve
 def test_skyserve_rolling_update(generic_cloud: str):
     """Test skyserve with rolling update"""
@@ -3862,6 +3858,7 @@ def test_skyserve_rolling_update(generic_cloud: str):
     run_one_test(test)
 
 
+@pytest.mark.no_fluidstack
 @pytest.mark.serve
 def test_skyserve_fast_update(generic_cloud: str):
     """Test skyserve with fast update (Increment version of old replicas)"""
@@ -3903,7 +3900,6 @@ def test_skyserve_fast_update(generic_cloud: str):
     run_one_test(test)
 
 
-@pytest.mark.no_fluidstack  # FluidStack does not support cloud 
storage
 @pytest.mark.serve
 def test_skyserve_update_autoscale(generic_cloud: str):
     """Test skyserve update with autoscale"""
@@ -3940,7 +3936,7 @@ def test_skyserve_update_autoscale(generic_cloud: str):
     run_one_test(test)
 
 
-@pytest.mark.no_fluidstack
+@pytest.mark.no_fluidstack  # Spot instances are not supported by Fluidstack
 @pytest.mark.serve
 @pytest.mark.no_kubernetes  # Spot instances are not supported in Kubernetes
 @pytest.mark.parametrize('mode', ['rolling', 'blue_green'])
@@ -4004,6 +4000,7 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str):
     run_one_test(test)
 
 
+# TODO: fluidstack does not support `--cpus 2`, but the check for services in this test is based on CPUs
 @pytest.mark.no_fluidstack
 @pytest.mark.serve
 def test_skyserve_failures(generic_cloud: str):
@@ -5070,7 +5067,6 @@ def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure,
         assert '4' in cnt_output.decode('utf-8'), \
         'Some items listed in .gitignore and .git/info/exclude are not excluded.'
-
     @pytest.mark.no_fluidstack
     @pytest.mark.parametrize('ext_bucket_fixture, store_type',
                              [('tmp_awscli_bucket', storage_lib.StoreType.S3),
                               ('tmp_gsutil_bucket', storage_lib.StoreType.GCS),