Skip to content

Commit

Permalink
Merge branch 'master' of github.com:skypilot-org/skypilot into azure-…
Browse files Browse the repository at this point in the history
…termination
  • Loading branch information
Michaelvll committed Jul 1, 2024
2 parents b5f79e3 + b03c617 commit f164c7f
Show file tree
Hide file tree
Showing 7 changed files with 228 additions and 145 deletions.
11 changes: 10 additions & 1 deletion examples/managed_job_with_storage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ workdir: ./examples

file_mounts:
~/bucket_workdir:
# Change this to the your own globally unique bucket name.
# Change this to your own globally unique bucket name.
name: sky-workdir-zhwu
source: ./examples
persistent: false
mode: COPY

/output_path:
# Change this to your own globally unique bucket name.
name: sky-output-bucket
mode: MOUNT

/imagenet-image:
source: s3://sky-imagenet-data

Expand Down Expand Up @@ -55,3 +61,6 @@ run: |
cat ~/tmpfile
cat ~/a/b/c/tmpfile
# Write to a file in the mounted bucket
echo "hello world!" > /output_path/output.txt
125 changes: 8 additions & 117 deletions sky/clouds/service_catalog/data_fetchers/fetch_cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,98 +9,9 @@

import cudo_compute

VMS_CSV = 'cudo/vms.csv'
import sky.provision.cudo.cudo_utils as utils

cudo_gpu_model = {
'NVIDIA V100': 'V100',
'NVIDIA A40': 'A40',
'RTX 3080': 'RTX3080',
'RTX A4000': 'RTXA4000',
'RTX A4500': 'RTXA4500',
'RTX A5000': 'RTXA5000',
'RTX A6000': 'RTXA6000',
}

cudo_gpu_mem = {
'RTX3080': 12,
'A40': 48,
'RTXA4000': 16,
'RTXA4500': 20,
'RTXA5000': 24,
'RTXA6000': 48,
'V100': 16,
}

machine_specs = [
# Low
{
'vcpu': 2,
'mem': 4,
'gpu': 1,
},
{
'vcpu': 4,
'mem': 8,
'gpu': 1,
},
{
'vcpu': 8,
'mem': 16,
'gpu': 2,
},
{
'vcpu': 16,
'mem': 32,
'gpu': 2,
},
{
'vcpu': 32,
'mem': 64,
'gpu': 4,
},
{
'vcpu': 64,
'mem': 128,
'gpu': 8,
},
# Mid
{
'vcpu': 96,
'mem': 192,
'gpu': 8
},
{
'vcpu': 48,
'mem': 96,
'gpu': 4
},
{
'vcpu': 24,
'mem': 48,
'gpu': 2
},
{
'vcpu': 12,
'mem': 24,
'gpu': 1
},
# Hi
{
'vcpu': 96,
'mem': 192,
'gpu': 4
},
{
'vcpu': 48,
'mem': 96,
'gpu': 2
},
{
'vcpu': 24,
'mem': 48,
'gpu': 1
},
]
VMS_CSV = 'cudo/vms.csv'


def cudo_api():
Expand All @@ -110,28 +21,8 @@ def cudo_api():
return cudo_compute.VirtualMachinesApi(client)


def cudo_gpu_to_skypilot_gpu(model):
if model in cudo_gpu_model:
return cudo_gpu_model[model]
else:
return model


def skypilot_gpu_to_cudo_gpu(model):
for key, value in cudo_gpu_model.items():
if value == model:
return key
return model


def gpu_exists(model):
if model in cudo_gpu_model:
return True
return False


def get_gpu_info(count, model):
mem = cudo_gpu_mem[model]
mem = utils.cudo_gpu_mem[model]
# pylint: disable=line-too-long
# {'Name': 'A4000', 'Manufacturer': 'NVIDIA', 'Count': 1.0, 'MemoryInfo': {'SizeInMiB': 16384}}], 'TotalGpuMemoryInMiB': 16384}"
info = {
Expand Down Expand Up @@ -168,16 +59,16 @@ def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):

def update_prices():
rows = []
for spec in machine_specs:
for spec in utils.machine_specs:
mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
for hc in mts['host_configs']:
if not gpu_exists(hc['gpu_model']):
if not utils.gpu_exists(hc['gpu_model']):
continue
accelerator_name = cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
row = {
'instance_type': get_instance_type(hc['machine_type'],
spec['gpu'], spec['vcpu'],
spec['mem']),
spec['vcpu'], spec['mem'],
spec['gpu']),
'accelerator_name': accelerator_name,
'accelerator_count': str(spec['gpu']) + '.0',
'vcpus': str(spec['vcpu']),
Expand Down
112 changes: 112 additions & 0 deletions sky/provision/cudo/cudo_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Cudo catalog helper.

Static lookup tables shared by the Cudo provisioner and the service-catalog
data fetcher: GPU model name mappings and the list of VM shapes offered.
"""

# Maps Cudo's GPU model display names to SkyPilot accelerator names.
cudo_gpu_model = {
    'NVIDIA V100': 'V100',
    'NVIDIA A40': 'A40',
    'RTX 3080': 'RTX3080',
    'RTX A4000': 'RTXA4000',
    'RTX A4500': 'RTXA4500',
    'RTX A5000': 'RTXA5000',
    'RTX A6000': 'RTXA6000',
}

# GPU memory per accelerator, keyed by SkyPilot accelerator name.
# NOTE(review): values appear to be GiB — confirm against Cudo specs.
cudo_gpu_mem = {
    'RTX3080': 12,
    'A40': 48,
    'RTXA4000': 16,
    'RTXA4500': 20,
    'RTXA5000': 24,
    'RTXA6000': 48,
    'V100': 16,
}

# Candidate VM shapes to enumerate when building the catalog: vCPU count,
# memory ('mem'), and GPU count per shape, grouped by tier (Low/Mid/Hi).
machine_specs = [
    # Low
    {
        'vcpu': 2,
        'mem': 4,
        'gpu': 1,
    },
    {
        'vcpu': 4,
        'mem': 8,
        'gpu': 1,
    },
    {
        'vcpu': 8,
        'mem': 16,
        'gpu': 2,
    },
    {
        'vcpu': 16,
        'mem': 32,
        'gpu': 2,
    },
    {
        'vcpu': 32,
        'mem': 64,
        'gpu': 4,
    },
    {
        'vcpu': 64,
        'mem': 128,
        'gpu': 8,
    },
    # Mid
    {
        'vcpu': 96,
        'mem': 192,
        'gpu': 8
    },
    {
        'vcpu': 48,
        'mem': 96,
        'gpu': 4
    },
    {
        'vcpu': 24,
        'mem': 48,
        'gpu': 2
    },
    {
        'vcpu': 12,
        'mem': 24,
        'gpu': 1
    },
    # Hi
    {
        'vcpu': 96,
        'mem': 192,
        'gpu': 4
    },
    {
        'vcpu': 48,
        'mem': 96,
        'gpu': 2
    },
    {
        'vcpu': 24,
        'mem': 48,
        'gpu': 1
    },
]


def cudo_gpu_to_skypilot_gpu(model):
    """Translate a Cudo GPU model name to the SkyPilot accelerator name.

    Returns *model* unchanged when there is no mapping for it.
    """
    # dict.get with a default replaces the manual if/else membership check.
    return cudo_gpu_model.get(model, model)


def skypilot_gpu_to_cudo_gpu(model):
    """Translate a SkyPilot accelerator name back to Cudo's GPU model name.

    Returns *model* unchanged when no reverse mapping exists.
    """
    reverse_matches = (cudo_name
                       for cudo_name, sky_name in cudo_gpu_model.items()
                       if sky_name == model)
    # First matching Cudo name, or the input itself as the fallback.
    return next(reverse_matches, model)


def gpu_exists(model):
    """Return True if *model* is a GPU model name known to Cudo."""
    # The membership test already yields a bool; no if/else needed.
    return model in cudo_gpu_model
53 changes: 37 additions & 16 deletions sky/provision/cudo/cudo_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,29 @@

from sky import sky_logging
from sky.adaptors import cudo
import sky.provision.cudo.cudo_utils as utils

logger = sky_logging.init_logger(__name__)


def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
memory_gib: int, vcpu_count: int, gpu_count: int, gpu_model: str,
memory_gib: int, vcpu_count: int, gpu_count: int,
tags: Dict[str, str], disk_size: int):
"""Launches an instance with the given parameters."""
disk = cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size)

request = cudo.cudo.CreateVMBody(ssh_key_source='SSH_KEY_SOURCE_NONE',
custom_ssh_keys=[ssh_key],
vm_id=name,
machine_type=machine_type,
data_center_id=data_center_id,
boot_disk_image_id='ubuntu-nvidia-docker',
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
gpu_model=gpu_model,
boot_disk=disk,
metadata=tags)

request = cudo.cudo.CreateVMBody(
ssh_key_source='SSH_KEY_SOURCE_NONE',
custom_ssh_keys=[ssh_key],
vm_id=name,
machine_type=machine_type,
data_center_id=data_center_id,
boot_disk_image_id='ubuntu-2204-nvidia-535-docker-v20240214',
memory_gib=memory_gib,
vcpus=vcpu_count,
gpus=gpu_count,
boot_disk=cudo.cudo.Disk(storage_class='STORAGE_CLASS_NETWORK',
size_gib=disk_size),
metadata=tags)

try:
api = cudo.cudo.cudo_api.virtual_machines()
Expand Down Expand Up @@ -121,3 +121,24 @@ def list_instances():
return instances
except cudo.cudo.rest.ApiException as e:
raise e


def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
                 cpus):
    """Return how many matching VMs Cudo can currently provide.

    Queries Cudo for machine types matching the requested shape and sums the
    per-host-config availability.

    Args:
        to_start_count: Number of VMs the caller wants to start.
        gpu_count: GPUs per VM.
        gpu_model: SkyPilot accelerator name (translated to Cudo's name).
        data_center_id: Cudo data center to query.
        mem: Memory per VM.
        cpus: vCPUs per VM.

    Returns:
        Total count of available VMs across host configs.

    Raises:
        Exception: If fewer than ``to_start_count`` VMs are available.
        cudo.cudo.rest.ApiException: Propagated unchanged from the API call.
    """
    gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
    api = cudo.cudo.cudo_api.virtual_machines()
    # NOTE: an ApiException from this call propagates to the caller; the
    # previous `except ApiException as e: raise e` was a no-op re-raise
    # that only obscured the traceback, so it has been removed.
    types = api.list_vm_machine_types(mem,
                                      cpus,
                                      gpu=gpu_count,
                                      gpu_model=gpu_model,
                                      data_center_id=data_center_id)
    types_dict = types.to_dict()
    host_configs = types_dict['host_configs']
    total_count = sum(item['count_vm_available'] for item in host_configs)
    if total_count < to_start_count:
        raise Exception(
            'Too many VMs requested, try another gpu type or region')
    return total_count
Loading

0 comments on commit f164c7f

Please sign in to comment.