Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Cudo] private networks and API/fetch fix #3841

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/getting-started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ Cudo Compute
✔ context: default
config file saved ~/.config/cudo/cudo.yml

pip install "cudo-compute>=0.1.10"
pip install "cudo-compute>=0.2.0"

If you want to use SkyPilot with a different Cudo Compute account or project, run :code:`cudoctl init` again.

Expand Down
43 changes: 26 additions & 17 deletions sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def cudo_api():


def get_gpu_info(count, model):
if not model:
return ''
mem = utils.cudo_gpu_mem[model]
# pylint: disable=line-too-long
# {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}"
Expand All @@ -45,39 +47,46 @@ def get_instance_type(machine_type, vcpu, mem, gpu):
mem) + 'gb'


def machine_types():
    """Fetch the machine types advertised by the Cudo API.

    Returns:
        The value stored under 'machine_types' in the dict form of the
        `list_vm_machine_types2` response.

    Raises:
        cudo_compute.rest.ApiException: Propagated unchanged when the API
            call fails.
    """
    api = cudo_api()
    # The newer endpoint takes no sizing filters; callers price each spec
    # from the per-unit rates in the returned entries instead.
    return api.list_vm_machine_types2().to_dict()['machine_types']


def update_prices():
rows = []
for spec in utils.machine_specs:
mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
for hc in mts['host_configs']:
if not utils.gpu_exists(hc['gpu_model']):
mts = machine_types()
for mt in mts:
if not utils.gpu_exists(mt['gpu_model_id']):
continue
accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
accelerator_name = utils.cudo_gpu_to_skypilot_gpu(
mt['gpu_model_id'])
gpu_count = spec['gpu']
if not accelerator_name:
gpu_count = 0

price = ((float(mt['vcpu_price_hr']['value']) * spec['vcpu']) +
(float(mt['memory_gib_price_hr']['value']) * spec['mem']) +
(float(mt['gpu_price_hr']['value']) * gpu_count))
row = {
'instance_type': get_instance_type(hc['machine_type'],
'instance_type': get_instance_type(mt['machine_type'],
spec['vcpu'], spec['mem'],
spec['gpu']),
gpu_count),
'accelerator_name': accelerator_name,
'accelerator_count': str(spec['gpu']) + '.0',
'accelerator_count': str(gpu_count) + '.0',
'vcpus': str(spec['vcpu']),
'memory_gib': str(spec['mem']),
'price': hc['total_price_hr']['value'],
'region': hc['data_center_id'],
'gpu_info': get_gpu_info(spec['gpu'], accelerator_name),
'price': str(price),
'region': mt['data_center_id'],
'gpu_info': get_gpu_info(gpu_count, accelerator_name),
}
rows.append(row)
if mt['total_gpu_free'] > 0 or gpu_count == 0:
rows.append(row)

path = VMS_CSV
with open(path, 'w', encoding='utf-8') as file:
file.write(
Expand Down
3 changes: 2 additions & 1 deletion sky/provision/cudo/config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""Cudo Compute configuration bootstrapping."""

from sky.provision import common
from sky.provision.cudo import cudo_wrapper


def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Bootstraps instances for the given cluster.

    Sets up the cluster's network through `cudo_wrapper.setup_network`
    before any instance is launched.

    Args:
        region: Passed to `setup_network` as the network's region.
        cluster_name: Passed to `setup_network` as the network id.
        config: Provisioning config; returned unchanged.

    Returns:
        The `config` argument, unmodified.
    """
    # `region` and `cluster_name` are used here, so they must not be
    # `del`-ed as "unused" beforehand.
    cudo_wrapper.setup_network(region, cluster_name)
    return config
2 changes: 1 addition & 1 deletion sky/provision/cudo/cudo_machine_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def get_spec_from_instance(instance_type, data_center_id):
spec = row
break
return {
'gpu_model': spec[1],
'gpu_model_id': spec[1],
'vcpu_count': spec[3],
'mem_gb': spec[4],
'gpu_count': spec[2],
Expand Down
17 changes: 10 additions & 7 deletions sky/provision/cudo/cudo_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""Cudo catalog helper."""

# Maps a GPU model identifier (key) to the short accelerator name (value),
# written in the same style as the keys of `cudo_gpu_mem`.
# NOTE(review): the table lists both spaced display names ('NVIDIA V100',
# 'RTX A4000', ...) and lowercase hyphenated ids ('nvidia-v100', ...);
# presumably the former are legacy keys -- confirm whether they are still
# needed. The '' entry presumably lets machine types without a GPU model
# resolve to an empty accelerator name -- verify against the lookup helper.
cudo_gpu_model = {
'NVIDIA V100': 'V100',
'NVIDIA A40': 'A40',
'RTX 3080': 'RTX3080',
'RTX A4000': 'RTXA4000',
'RTX A4500': 'RTXA4500',
'RTX A5000': 'RTXA5000',
'RTX A6000': 'RTXA6000',
'': '',
# 'nvidia-a40': 'A40',
'nvidia-a40-compute': 'A40',
'nvidia-h100': 'H100',
'nvidia-rtx-a4000': 'RTXA4000',
'nvidia-rtx-a4500': 'RTXA4500',
'nvidia-rtx-a5000': 'RTXA5000',
'nvidia-rtx-a6000': 'RTXA6000',
'nvidia-v100': 'V100',
}

cudo_gpu_mem = {
Expand All @@ -18,6 +20,7 @@
'RTXA5000': 24,
'RTXA6000': 48,
'V100': 16,
'H100': 80,
}

machine_specs = [
Expand Down
111 changes: 94 additions & 17 deletions sky/provision/cudo/cudo_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
memory_gib: int, vcpu_count: int, gpu_count: int,
tags: Dict[str, str], disk_size: int):
tags: Dict[str, str], disk_size: int, net_name):
JungleCatSW marked this conversation as resolved.
Show resolved Hide resolved
"""Launches an instance with the given parameters."""

request = cudo.cudo.CreateVMBody(
Expand All @@ -24,6 +24,10 @@ def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
nics=[
cudo.cudo.CreateVMRequestNIC(network_id=net_name,
assign_public_ip=True)
],
boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size),
metadata=tags)
Expand Down Expand Up @@ -106,14 +110,11 @@ def list_instances():
for vm in vms.to_dict()['vms']:
ex_ip = vm['external_ip_address']
in_ip = vm['internal_ip_address']
if not in_ip:
in_ip = ex_ip
instance = {
# active_state, init_state, lcm_state, short_state
'status': vm['short_state'],
'tags': vm['metadata'],
'name': vm['id'],
'ip': ex_ip,
'external_ip': ex_ip,
'internal_ip': in_ip
}
Expand All @@ -123,22 +124,98 @@ def list_instances():
raise e


def vm_available(to_start_count, gpu_count, gpu_model_id, data_center_id, mem,
                 cpus):
    """Check that a data center can host the requested VMs.

    Args:
        to_start_count: Number of VMs about to be started.
        gpu_count: GPUs requested per VM (0 skips the GPU capacity check).
        gpu_model_id: SkyPilot accelerator name; converted with
            `utils.skypilot_gpu_to_cudo_gpu` before matching.
        data_center_id: Data center the VMs should be placed in.
        mem: Memory requested per VM, compared against the
            `*_memory_gib_free` fields.
        cpus: vCPUs requested per VM.

    Returns:
        True when every check passes.

    Raises:
        Exception: If no machine type matches the data center and GPU
            model, or if the GPU, memory or vCPU request exceeds the free
            capacity reported for the matching machine types.
        cudo.cudo.rest.ApiException: Propagated unchanged from the API.
    """
    cudo_gpu_id = utils.skypilot_gpu_to_cudo_gpu(gpu_model_id)
    api = cudo.cudo.cudo_api.virtual_machines()
    types_dict = api.list_vm_machine_types2().to_dict()

    exists = False
    gpu_count_okay = False
    mem_size_okay = False
    cpu_count_okay = False

    for mt in types_dict['machine_types']:
        # Only capacity of the requested GPU model in the requested data
        # center is relevant.
        if (mt['data_center_id'] != data_center_id or
                mt['gpu_model_id'] != cudo_gpu_id):
            continue
        exists = True

        # `>=` rather than `>`: a request that exactly equals the free
        # capacity still fits.
        if gpu_count == 0 or (mt['max_gpu_free'] >= gpu_count and
                              mt['total_gpu_free'] >=
                              gpu_count * to_start_count):
            gpu_count_okay = True

        if (mt['max_memory_gib_free'] >= mem and
                mt['total_memory_gib_free'] >= mem * to_start_count):
            mem_size_okay = True

        if (mt['max_vcpu_free'] >= cpus and
                mt['total_vcpu_free'] >= cpus * to_start_count):
            cpu_count_okay = True

    if not exists:
        raise Exception('GPU model could not be found in data center')
    if not gpu_count_okay:
        raise Exception('Number of GPUs requested is too high')
    if not mem_size_okay:
        raise Exception('Memory size requested is too high')
    if not cpu_count_okay:
        raise Exception('Number of CPUs requested is too high')

    return True


def setup_network(region, network_id):
    """Create a network and block until it reports the running state.

    Args:
        region: Data center id the network is created in.
        network_id: Id given to the new network.

    Raises:
        cudo.cudo.rest.ApiException: Propagated from any failing API call,
            or raised with 'Network could not be created' when the network
            is still not running after the retry budget; in that case
            deletion of the network is requested first.
    """
    api = cudo.cudo.cudo_api.networks()
    project_id = cudo.cudo.cudo_api.project_id_throwable()

    network = cudo.cudo.CreateNetworkBody(id=network_id,
                                          cidr_prefix='10.0.0.0/10',
                                          data_center_id=region)
    api.create_network(project_id, create_network_body=network)

    # Poll once per `retry_interval` seconds until the short state is
    # 'runn', giving up after more than `max_retries` unsuccessful polls.
    max_retries = 240
    retry_interval = 1
    retry_count = 0
    while True:
        net = api.get_network(project_id, network_id)
        if net.to_dict()['network']['short_state'] == 'runn':
            return

        time.sleep(retry_interval)
        retry_count += 1
        if retry_count > max_retries:
            # Clean up through the networks API client; the `get_network`
            # response object (`net`) is not the client and cannot delete.
            api.delete_network(project_id, id=network_id)
            raise cudo.cudo.rest.ApiException('Network could not be created')


def delete_network(network_id):
    """Best-effort deletion of a network, retried on API errors.

    Sleeps 60 seconds up front, then tries the delete call up to
    `max_retries + 1` times, sleeping `retry_interval` seconds after each
    failed attempt. API errors are swallowed; if every attempt fails the
    function returns without raising.

    Args:
        network_id: Id of the network to delete.
    """
    # NOTE(review): presumably this gives terminating VMs time to release
    # the network before the first attempt -- confirm.
    time.sleep(60)

    max_retries = 24
    retry_interval = 5
    project_id = cudo.cudo.cudo_api.project_id_throwable()

    for _ in range(max_retries + 1):
        try:
            networks_api = cudo.cudo.cudo_api.networks()
            networks_api.delete_network(project_id, id=network_id)
        except cudo.cudo.rest.ApiException:
            # Deliberately ignored; wait and try again.
            time.sleep(retry_interval)
        else:
            return
30 changes: 20 additions & 10 deletions sky/provision/cudo/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import time
from typing import Any, Dict, List, Optional
import uuid

from sky import sky_logging
from sky import status_lib
Expand All @@ -17,16 +18,13 @@
def _filter_instances(cluster_name_on_cloud: str,
                      status_filters: Optional[List[str]]) -> Dict[str, Any]:
    """Return the instances that belong to the given cluster.

    Args:
        cluster_name_on_cloud: Cluster name used as the instance name
            prefix.
        status_filters: If not None, only instances whose 'status' is in
            this list are kept.

    Returns:
        Mapping of instance id to instance dict, as produced by
        `cudo_wrapper.list_instances`, restricted to this cluster's head
        and worker nodes.
    """
    head_name = f'{cluster_name_on_cloud}-head'
    # Workers are named '<cluster>-worker' or '<cluster>-worker-<suffix>'.
    worker_prefix = f'{cluster_name_on_cloud}-worker'

    filtered_nodes = {}
    for instance_id, instance in cudo_wrapper.list_instances().items():
        if (status_filters is not None and
                instance['status'] not in status_filters):
            continue
        # A missing name must not crash the filter, and a bare cluster-name
        # prefix would also match other clusters sharing that prefix.
        name = instance.get('name') or ''
        if name == head_name or name.startswith(worker_prefix):
            filtered_nodes[instance_id] = instance
    return filtered_nodes

Expand Down Expand Up @@ -81,27 +79,32 @@ def run_instances(region: str, cluster_name_on_cloud: str,
gpu_count = int(float(spec['gpu_count']))
vcpu_count = int(spec['vcpu_count'])
memory_gib = int(spec['mem_gb'])
gpu_model = spec['gpu_model']
gpu_model_id = spec['gpu_model_id']
try:
cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model, region,
memory_gib, vcpu_count)
cudo_wrapper.vm_available(to_start_count, gpu_count, gpu_model_id,
region, memory_gib, vcpu_count)
except Exception as e:
logger.warning(f'run_instances: {e}')
raise
for _ in range(to_start_count):

node_type = 'head' if head_instance_id is None else 'worker'

if node_type == 'head':
node_name = f'{cluster_name_on_cloud}-head'
else:
node_name = f'{cluster_name_on_cloud}-worker-{uuid.uuid4().hex[:4]}'
try:
instance_id = cudo_wrapper.launch(
name=f'{cluster_name_on_cloud}-{node_type}',
name=node_name,
ssh_key=public_key,
data_center_id=region,
machine_type=spec['machine_type'],
memory_gib=memory_gib,
vcpu_count=vcpu_count,
gpu_count=gpu_count,
tags={},
disk_size=config.node_config['DiskSize'])
disk_size=config.node_config['DiskSize'],
net_name=cluster_name_on_cloud)
except Exception as e: # pylint: disable=broad-except
logger.warning(f'run_instances error: {e}')
raise
Expand Down Expand Up @@ -163,6 +166,13 @@ def terminate_instances(
f'{inst}')
cudo_wrapper.remove(inst_id)

if not worker_only:
try:
cudo_wrapper.delete_network(cluster_name_on_cloud)
except IndexError:
logger.warning(f'Network {cluster_name_on_cloud}'
'already deleted')


def get_cluster_info(
region: str,
Expand Down
2 changes: 1 addition & 1 deletion sky/setup_files/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def parse_readme(readme: str) -> str:
'remote': remote,
'runpod': ['runpod>=1.5.1'],
'fluidstack': [], # No dependencies needed for fluidstack
'cudo': ['cudo-compute>=0.1.10'],
'cudo': ['cudo-compute>=0.2.0'],
'paperspace': [], # No dependencies needed for paperspace
'vsphere': [
'pyvmomi==8.0.1.0.2',
Expand Down
2 changes: 1 addition & 1 deletion sky/utils/controller_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def _get_cloud_dependencies_installation_commands(
commands.append(
f'echo -en "\\r{prefix_str}Cudo{empty_str}" && '
'pip list | grep cudo-compute > /dev/null 2>&1 || '
'pip install "cudo-compute>=0.1.10" > /dev/null 2>&1 && '
'pip install "cudo-compute>=0.2.0" > /dev/null 2>&1 && '
'wget https://download.cudo.org/compute/cudoctl-0.3.2-amd64.deb -O ~/cudoctl.deb > /dev/null 2>&1 && ' # pylint: disable=line-too-long
'sudo dpkg -i ~/cudoctl.deb > /dev/null 2>&1')
if controller == Controllers.JOBS_CONTROLLER:
Expand Down
Loading