Skip to content

Commit

Permalink
Merge branch 'master' of github.com:skypilot-org/skypilot into azure-…
Browse files Browse the repository at this point in the history
…termination
  • Loading branch information
Michaelvll committed Jul 1, 2024
2 parents b5f79e3 + b03c617 commit f164c7f
Show file tree
Hide file tree
Showing 7 changed files with 228 additions and 145 deletions.
11 changes: 10 additions & 1 deletion examples/managed_job_with_storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ workdir: ./examples

file_mounts:
~/bucket_workdir:
# Change this to the your own globally unique bucket name.
# Change this to your own globally unique bucket name.
name: sky-workdir-zhwu
source: ./examples
persistent: false
mode: COPY

/output_path:
# Change this to your own globally unique bucket name.
name: sky-output-bucket
mode: MOUNT

/imagenet-image:
source: s3://sky-imagenet-data

Expand Down Expand Up @@ -55,3 +61,6 @@ run: |
cat ~/tmpfile
cat ~/a/b/c/tmpfile
# Write to a file in the mounted bucket
echo "hello world!" > /output_path/output.txt
125 changes: 8 additions & 117 deletions sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,98 +9,9 @@

import cudo_compute

VMS_CSV = 'cudo/vms.csv'
import sky.provision.cudo.cudo_utils as utils

cudo_gpu_model = {
'NVIDIA V100': 'V100',
'NVIDIA A40': 'A40',
'RTX 3080': 'RTX3080',
'RTX A4000': 'RTXA4000',
'RTX A4500': 'RTXA4500',
'RTX A5000': 'RTXA5000',
'RTX A6000': 'RTXA6000',
}

cudo_gpu_mem = {
'RTX3080': 12,
'A40': 48,
'RTXA4000': 16,
'RTXA4500': 20,
'RTXA5000': 24,
'RTXA6000': 48,
'V100': 16,
}

machine_specs = [
# Low
{
'vcpu': 2,
'mem': 4,
'gpu': 1,
},
{
'vcpu': 4,
'mem': 8,
'gpu': 1,
},
{
'vcpu': 8,
'mem': 16,
'gpu': 2,
},
{
'vcpu': 16,
'mem': 32,
'gpu': 2,
},
{
'vcpu': 32,
'mem': 64,
'gpu': 4,
},
{
'vcpu': 64,
'mem': 128,
'gpu': 8,
},
# Mid
{
'vcpu': 96,
'mem': 192,
'gpu': 8
},
{
'vcpu': 48,
'mem': 96,
'gpu': 4
},
{
'vcpu': 24,
'mem': 48,
'gpu': 2
},
{
'vcpu': 12,
'mem': 24,
'gpu': 1
},
# Hi
{
'vcpu': 96,
'mem': 192,
'gpu': 4
},
{
'vcpu': 48,
'mem': 96,
'gpu': 2
},
{
'vcpu': 24,
'mem': 48,
'gpu': 1
},
]
VMS_CSV = 'cudo/vms.csv'


def cudo_api():
Expand All @@ -110,28 +21,8 @@ def cudo_api():
return cudo_compute.VirtualMachinesApi(client)


def cudo_gpu_to_skypilot_gpu(model):
if model in cudo_gpu_model:
return cudo_gpu_model[model]
else:
return model


def skypilot_gpu_to_cudo_gpu(model):
for key, value in cudo_gpu_model.items():
if value == model:
return key
return model


def gpu_exists(model):
if model in cudo_gpu_model:
return True
return False


def get_gpu_info(count, model):
mem = cudo_gpu_mem[model]
mem = utils.cudo_gpu_mem[model]
# pylint: disable=line-too-long
# {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}"
info = {
Expand Down Expand Up @@ -168,16 +59,16 @@ def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):

def update_prices():
rows = []
for spec in machine_specs:
for spec in utils.machine_specs:
mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
for hc in mts['host_configs']:
if not gpu_exists(hc['gpu_model']):
if not utils.gpu_exists(hc['gpu_model']):
continue
accelerator_name = cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
row = {
'instance_type': get_instance_type(hc['machine_type'],
spec['gpu'], spec['vcpu'],
spec['mem']),
spec['vcpu'], spec['mem'],
spec['gpu']),
'accelerator_name': accelerator_name,
'accelerator_count': str(spec['gpu']) + '.0',
'vcpus': str(spec['vcpu']),
Expand Down
112 changes: 112 additions & 0 deletions sky/provision/cudo/cudo_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Cudo catalog helper.

Static lookup tables shared by the Cudo provisioner and the service-catalog
data fetcher: GPU model name mappings and the list of VM shapes offered.
"""

# Maps Cudo's GPU model display names to SkyPilot accelerator names.
cudo_gpu_model = {
    'NVIDIA V100': 'V100',
    'NVIDIA A40': 'A40',
    'RTX 3080': 'RTX3080',
    'RTX A4000': 'RTXA4000',
    'RTX A4500': 'RTXA4500',
    'RTX A5000': 'RTXA5000',
    'RTX A6000': 'RTXA6000',
}

# GPU memory per accelerator, keyed by SkyPilot accelerator name.
# NOTE(review): values appear to be GiB — confirm against Cudo specs.
cudo_gpu_mem = {
    'RTX3080': 12,
    'A40': 48,
    'RTXA4000': 16,
    'RTXA4500': 20,
    'RTXA5000': 24,
    'RTXA6000': 48,
    'V100': 16,
}

# Candidate VM shapes to enumerate when building the catalog: vCPU count,
# memory ('mem'), and GPU count per shape, grouped by tier (Low/Mid/Hi).
machine_specs = [
    # Low
    {
        'vcpu': 2,
        'mem': 4,
        'gpu': 1,
    },
    {
        'vcpu': 4,
        'mem': 8,
        'gpu': 1,
    },
    {
        'vcpu': 8,
        'mem': 16,
        'gpu': 2,
    },
    {
        'vcpu': 16,
        'mem': 32,
        'gpu': 2,
    },
    {
        'vcpu': 32,
        'mem': 64,
        'gpu': 4,
    },
    {
        'vcpu': 64,
        'mem': 128,
        'gpu': 8,
    },
    # Mid
    {
        'vcpu': 96,
        'mem': 192,
        'gpu': 8
    },
    {
        'vcpu': 48,
        'mem': 96,
        'gpu': 4
    },
    {
        'vcpu': 24,
        'mem': 48,
        'gpu': 2
    },
    {
        'vcpu': 12,
        'mem': 24,
        'gpu': 1
    },
    # Hi
    {
        'vcpu': 96,
        'mem': 192,
        'gpu': 4
    },
    {
        'vcpu': 48,
        'mem': 96,
        'gpu': 2
    },
    {
        'vcpu': 24,
        'mem': 48,
        'gpu': 1
    },
]


def cudo_gpu_to_skypilot_gpu(model):
    """Translate a Cudo GPU model name to the SkyPilot accelerator name.

    Returns *model* unchanged when there is no mapping for it.
    """
    # dict.get with a default replaces the manual if/else membership check.
    return cudo_gpu_model.get(model, model)


def skypilot_gpu_to_cudo_gpu(model):
    """Translate a SkyPilot accelerator name back to Cudo's GPU model name.

    Returns *model* unchanged when no reverse mapping exists.
    """
    reverse_matches = (cudo_name
                       for cudo_name, sky_name in cudo_gpu_model.items()
                       if sky_name == model)
    # First matching Cudo name, or the input itself as the fallback.
    return next(reverse_matches, model)


def gpu_exists(model):
    """Return True if *model* is a GPU model name known to Cudo."""
    # The membership test already yields a bool; no if/else needed.
    return model in cudo_gpu_model
53 changes: 37 additions & 16 deletions sky/provision/cudo/cudo_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,29 @@

from sky import sky_logging
from sky.adaptors import cudo
import sky.provision.cudo.cudo_utils as utils

logger = sky_logging.init_logger(__name__)


def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str,
memory_gib: int, vcpu_count: int, gpu_count: int,
tags: Dict[str, str], disk_size: int):
"""Launches an instance with the given parameters."""
disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size)

request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE',
custom_ssh_keys=[ssh_key],
vm_id=name,
machine_type=machine_type,
data_center_id=data_center_id,
boot_disk_image_id='ubuntu-nvidia-docker',
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
gpu_model=gpu_model,
boot_disk=disk,
metadata=tags)

request = cudo.cudo.CreateVMBody(
ssh_key_source='SSH_KEY_SOURCE_NONE',
custom_ssh_keys=[ssh_key],
vm_id=name,
machine_type=machine_type,
data_center_id=data_center_id,
boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size),
metadata=tags)

try:
api = cudo.cudo.cudo_api.virtual_machines()
Expand Down Expand Up @@ -121,3 +121,24 @@ def list_instances():
return instances
except cudo.cudo.rest.ApiException as e:
raise e


def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
                 cpus):
    """Return how many matching VMs Cudo can currently provide.

    Queries Cudo for machine types matching the requested shape and sums the
    per-host-config availability.

    Args:
        to_start_count: Number of VMs the caller wants to start.
        gpu_count: GPUs per VM.
        gpu_model: SkyPilot accelerator name (translated to Cudo's name).
        data_center_id: Cudo data center to query.
        mem: Memory per VM.
        cpus: vCPUs per VM.

    Returns:
        Total count of available VMs across host configs.

    Raises:
        Exception: If fewer than ``to_start_count`` VMs are available.
        cudo.cudo.rest.ApiException: Propagated unchanged from the API call.
    """
    gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
    api = cudo.cudo.cudo_api.virtual_machines()
    # NOTE: an ApiException from this call propagates to the caller; the
    # previous `except ApiException as e: raise e` was a no-op re-raise
    # that only obscured the traceback, so it has been removed.
    types = api.list_vm_machine_types(mem,
                                      cpus,
                                      gpu=gpu_count,
                                      gpu_model=gpu_model,
                                      data_center_id=data_center_id)
    types_dict = types.to_dict()
    host_configs = types_dict['host_configs']
    total_count = sum(item['count_vm_available'] for item in host_configs)
    if total_count < to_start_count:
        raise Exception(
            'Too many VMs requested, try another gpu type or region')
    return total_count
Loading

0 comments on commit f164c7f

Please sign in to comment.