From 92bf37cfe35adb27e67dd9eb977d73834f87784e Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 12 Mar 2024 22:58:01 +0545 Subject: [PATCH 1/9] Change cudo to online provider --- src/gpuhunt/_internal/catalog.py | 4 +- src/gpuhunt/_internal/default.py | 1 + src/gpuhunt/providers/cudo.py | 678 ++++++++++++------------------- src/integrity_tests/test_cudo.py | 48 --- src/tests/providers/test_cudo.py | 101 +++++ 5 files changed, 355 insertions(+), 477 deletions(-) delete mode 100644 src/integrity_tests/test_cudo.py create mode 100644 src/tests/providers/test_cudo.py diff --git a/src/gpuhunt/_internal/catalog.py b/src/gpuhunt/_internal/catalog.py index 45d012f..c5b4451 100644 --- a/src/gpuhunt/_internal/catalog.py +++ b/src/gpuhunt/_internal/catalog.py @@ -17,8 +17,8 @@ logger = logging.getLogger(__name__) version_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/version" catalog_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/{version}/catalog.zip" -OFFLINE_PROVIDERS = ["aws", "azure", "cudo", "datacrunch", "gcp", "lambdalabs", "nebius"] -ONLINE_PROVIDERS = ["tensordock", "vastai"] +OFFLINE_PROVIDERS = ["aws", "azure", "datacrunch", "gcp", "lambdalabs", "nebius"] +ONLINE_PROVIDERS = ["cudo", "tensordock", "vastai"] RELOAD_INTERVAL = 4 * 60 * 60 # 4 hours diff --git a/src/gpuhunt/_internal/default.py b/src/gpuhunt/_internal/default.py index 07e9b59..d437789 100644 --- a/src/gpuhunt/_internal/default.py +++ b/src/gpuhunt/_internal/default.py @@ -21,6 +21,7 @@ def default_catalog() -> Catalog: for module, provider in [ ("gpuhunt.providers.tensordock", "TensorDockProvider"), ("gpuhunt.providers.vastai", "VastAIProvider"), + ("gpuhunt.providers.cudo", "CudoProvider") ]: try: module = importlib.import_module(module) diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py index 9a7da10..cf3f22e 100644 --- a/src/gpuhunt/providers/cudo.py +++ b/src/gpuhunt/providers/cudo.py @@ -1,89 +1,91 @@ import logging from collections import namedtuple -from concurrent.futures import ThreadPoolExecutor, as_completed from itertools import chain -from typing import List, Optional +from math import ceil +from typing import List, Optional, Union, TypeVar import requests from gpuhunt import QueryFilter, RawCatalogItem -from gpuhunt._internal.constraints import KNOWN_GPUS +from gpuhunt._internal.constraints import KNOWN_GPUS, is_between, get_compute_capability from gpuhunt.providers import AbstractProvider CpuMemoryGpu = namedtuple("CpuMemoryGpu", ["cpu", "memory", "gpu"]) logger = logging.getLogger(__name__) API_URL = "https://rest.compute.cudo.org/v1" +MIN_CPU = 2 +MIN_MEMORY = 8 +RAM_PER_VRAM = 2 +RAM_DIV = 2 +CPU_DIV = 2 +RAM_PER_CORE = 4 +MIN_DISK_SIZE = 100 class CudoProvider(AbstractProvider): NAME = "cudo" def get( - self, query_filter: Optional[QueryFilter] = None, balance_resources: bool = True + self, query_filter: Optional[QueryFilter] = None, balance_resources: bool = True ) -> List[RawCatalogItem]: - offers = self.fetch_all_vm_types() + offers = self.fetch_offers(query_filter, balance_resources) return sorted(offers, key=lambda i: i.price) - def fetch_all_vm_types(self): - with ThreadPoolExecutor(max_workers=10) as executor: - futures = [ - executor.submit(self.fetch_vm_type, cmg.cpu, cmg.memory, cmg.gpu) - for cmg in GPU_MACHINES - ] - results = [] - for future in as_completed(futures): - try: - result = future.result() - results.append(result) - except Exception as e: - logger.info( - f"Unable to find VM type with vCPU: {e.vcpu}, Memory: {e.memory_gib} GiB, GPU: {e.gpu}." - ) - return list(chain.from_iterable(results)) - - def get_raw_catalog_list(self, vm_machine_type_list, vcpu, memory, gpu: int): - raw_list = [] - for vm in vm_machine_type_list: - memory = None - name = gpu_name(vm["gpuModel"]) - if name is not None: - memory = get_memory(name) - if gpu and name is None: - logger.warning("Skip. Unknown GPU name: %s", vm["gpuModel"]) - continue - raw = RawCatalogItem( - instance_name=vm["machineType"], - location=vm["dataCenterId"], - spot=False, - price=round(float(vm["totalPriceHr"]["value"]), 5), - cpu=vcpu, - memory=memory, - gpu_count=gpu, - gpu_name=name, - gpu_memory=memory, - disk_size=None, - ) - raw_list.append(raw) - return raw_list - - def fetch_vm_type(self, vcpu, memory_gib, gpu): - try: - result = self._list_vm_machine_types(vcpu, memory_gib, gpu) - return self.get_raw_catalog_list(result, vcpu, memory_gib, gpu) - except requests.HTTPError as e: - raise VMTypeFetchError(f"Failed to fetch VM type: {e}", vcpu, memory_gib, gpu) - - def _list_vm_machine_types(self, vcpu, memory_gib, gpu): + def fetch_offers(self, query_filter: Optional[QueryFilter], balance_resources) -> List[RawCatalogItem]: + machine_types = self.list_vm_machine_types() + if query_filter is not None: + return self.optimize_offers(machine_types, query_filter, balance_resources) + else: + offers = [] + for machine_type in machine_types: + optimized_specs = optimize_offers_with_gpu(QueryFilter(), machine_type, balance_resources=False) + raw_catalogs = [get_raw_catalog(machine_type, spec) for spec in optimized_specs] + offers.append(raw_catalogs) + return list(chain.from_iterable(offers)) + + @staticmethod + def list_vm_machine_types() -> list[dict]: resp = requests.request( method="GET", - url=f"{API_URL}/vms/machine-types?vcpu={vcpu}&memory_gib={memory_gib}&gpu={gpu}", + url=f"{API_URL}/vms/machine-types-2", ) if resp.ok: data = resp.json() - return data["hostConfigs"] + return data["machineTypes"] resp.raise_for_status() + @staticmethod + def optimize_offers(machine_types: list[dict], q: QueryFilter, balance_resource) -> List[RawCatalogItem]: + offers = [] + if any(condition is not None for condition in + [q.min_gpu_count, q.max_gpu_count, q.min_total_gpu_memory, q.max_total_gpu_memory, + q.min_gpu_memory, q.max_gpu_memory, q.gpu_name]): + # filter offers with gpus + gpu_machine_types = [vm for vm in machine_types if vm['maxGpuFree'] != 0] + for machine_type in gpu_machine_types: + machine_type["gpu_name"] = gpu_name(machine_type["gpuModel"]) + machine_type["gpu_memory"] = get_memory(machine_type["gpu_name"]) + if not is_between(machine_type["gpu_memory"], q.min_gpu_memory, + q.max_total_gpu_memory): + continue + if q.gpu_name is not None and machine_type["gpu_name"].lower() not in q.gpu_name: + continue + cc = get_compute_capability(machine_type["gpu_name"]) + if not cc or not is_between(cc, q.min_compute_capability, q.max_compute_capability): + continue + optimized_specs = optimize_offers_with_gpu(q, machine_type, balance_resource) + raw_catalogs = [get_raw_catalog(machine_type, spec) for spec in optimized_specs] + offers.append(raw_catalogs) + else: + cpu_only_machine_types = [vm for vm in machine_types if vm['maxGpuFree'] == 0] + for machine_type in cpu_only_machine_types: + optimized_specs = optimize_offers_no_gpu(q, machine_type, balance_resource) + raw_catalogs = [get_raw_catalog(machine_type, spec) for spec in optimized_specs] + offers.append(raw_catalogs) + + return list(chain.from_iterable(offers)) + class VMTypeFetchError(Exception): def __init__(self, message, vcpu, memory_gib, gpu): @@ -96,6 +98,173 @@ def __str__(self): return f"{super().__str__()} - [vCPU: {self.vcpu}, Memory: {self.memory_gib} GiB, GPU: {self.gpu}]" +def get_raw_catalog(machine_type, spec): + raw = RawCatalogItem( + instance_name=machine_type["machineType"], + location=machine_type["dataCenterId"], + spot=False, + price=(round(float(machine_type["vcpuPriceHr"]["value"]), 5) * spec["cpu"]) + + (round(float(machine_type["memoryGibPriceHr"]["value"]), 5) * spec["memory"]) + + (round(float(machine_type["gpuPriceHr"]["value"]), 5) * spec.get("gpu", 0)) + + (round(float(machine_type["minStorageGibPriceHr"]["value"]), 5) * spec["disk_size"]) + + (round(float(machine_type["ipv4PriceHr"]["value"]), 5)), + cpu=spec["cpu"], + memory=spec["memory"], + gpu_count=spec.get("gpu", 0), + gpu_name=machine_type.get("gpu_name", ""), + gpu_memory=machine_type.get("gpu_memory", 0), + disk_size=spec["disk_size"], + ) + return raw + + +def optimize_offers_with_gpu(q: QueryFilter, machine_type, balance_resources) -> List[dict]: + # Generate ranges for CPU, GPU, and memory based on the specified minimums, maximums, and available resources + cpu_range = get_cpu_range(q.min_cpu, q.max_cpu, machine_type["maxVcpuFree"]) + gpu_range = get_gpu_range(q.min_gpu_count, q.max_gpu_count, machine_type["maxGpuFree"]) + memory_range = get_memory_range(q.min_memory, q.max_memory, machine_type["maxMemoryGibFree"]) + min_vcpu_per_memory_gib = machine_type.get("minVcpuPerMemoryGib", 0) + max_vcpu_per_memory_gib = machine_type.get("maxVcpuPerMemoryGib", float('inf')) + min_vcpu_per_gpu = machine_type.get("minVcpuPerGpu", 0) + max_vcpu_per_gpu = machine_type.get("maxVcpuPerGpu", float('inf')) + unbalanced_specs = [] + for cpu in cpu_range: + for gpu in gpu_range: + for memory in memory_range: + # Check CPU/memory constraints + if not is_between(cpu, memory * min_vcpu_per_memory_gib, memory * max_vcpu_per_memory_gib): + continue + + # Check CPU/GPU constraints + if gpu > 0: + if not is_between(cpu, gpu * min_vcpu_per_gpu, gpu * max_vcpu_per_gpu): + continue + + # If all constraints are met, append this combination + unbalanced_specs.append({"cpu": cpu, "memory": memory, "gpu": gpu}) + + # If resource balancing is required, filter combinations to meet the balanced memory requirement + if balance_resources: + memory_balanced = [spec for spec in unbalanced_specs + if spec["memory"] == + get_balanced_memory(spec["gpu"], machine_type["gpu_memory"], q.max_memory)] + balanced_specs = memory_balanced + # Add disk + balanced_specs = [{"cpu": spec["cpu"], + "memory": spec["memory"], + "gpu": spec["gpu"], + "disk_size": get_balanced_disk_size(machine_type["maxStorageGibFree"], + spec["memory"], + spec["gpu"] * machine_type["gpu_memory"], + q.max_disk_size, q.min_disk_size)} + for spec in balanced_specs] + # Return balanced combinations if any; otherwise, return all combinations + return balanced_specs + + disk_size = q.min_disk_size if q.min_disk_size is not None else MIN_DISK_SIZE + # Add disk + unbalanced_specs = [{"cpu": spec["cpu"], + "memory": spec["memory"], + "gpu": spec["gpu"], + "disk_size": disk_size} + for spec in unbalanced_specs] + return unbalanced_specs + + +def optimize_offers_no_gpu(q: QueryFilter, machine_type, balance_resource) -> List[dict]: + # Generate ranges for CPU, memory based on the specified minimums, maximums, and available resources + cpu_range = get_cpu_range(q.min_cpu, q.max_cpu, machine_type["maxVcpuFree"]) + memory_range = get_memory_range(q.min_memory, q.max_memory, machine_type["maxMemoryGibFree"]) + + # Cudo Specific Constraints + min_vcpu_per_memory_gib = machine_type.get("minVcpuPerMemoryGib", 0) + max_vcpu_per_memory_gib = machine_type.get("maxVcpuPerMemoryGib", float('inf')) + + unbalanced_specs = [] + for cpu in cpu_range: + for memory in memory_range: + # Check CPU/memory constraints + if not is_between(cpu, memory * min_vcpu_per_memory_gib, memory * max_vcpu_per_memory_gib): + continue + # If all constraints are met, append this combination + unbalanced_specs.append({"cpu": cpu, "memory": memory}) + + # If resource balancing is required, filter combinations to meet the balanced memory requirement + if balance_resource: + cpu_balanced = [spec for spec in unbalanced_specs + if spec["cpu"] == + get_balanced_cpu(spec["memory"], q.max_memory)] + + balanced_specs = cpu_balanced + # Add disk + disk_size = q.min_disk_size if q.min_disk_size is not None else MIN_DISK_SIZE + balanced_specs = [{"cpu": spec["cpu"], + "memory": spec["memory"], + "disk_size": disk_size} + for spec in balanced_specs] + # Return balanced combinations if any; otherwise, return all combinations + return balanced_specs + + disk_size = q.min_disk_size if q.min_disk_size is not None else MIN_DISK_SIZE + # Add disk + unbalanced_specs = [{"cpu": spec["cpu"], + "memory": spec["memory"], + "gpu": 0, + "disk_size": min_none(machine_type["maxStorageGibFree"], disk_size)} + for spec in unbalanced_specs] + return unbalanced_specs + + +def get_cpu_range(min_cpu, max_cpu, max_cpu_free): + cpu_range = range( + min_cpu if min_cpu is not None else MIN_CPU, + min(max_cpu if max_cpu is not None else max_cpu_free, + max_cpu_free) + 1 + ) + return cpu_range + + +def get_gpu_range(min_gpu_count, max_gpu_count, max_gpu_free): + gpu_range = range( + min_gpu_count if min_gpu_count is not None else 1, + min(max_gpu_count if max_gpu_count is not None else max_gpu_free, + max_gpu_free) + 1 + ) + return gpu_range + + +def get_memory_range(min_memory, max_memory, max_memory_gib_free): + memory_range = range( + int(min_memory) if min_memory is not None else MIN_MEMORY, + min(int(max_memory) if max_memory is not None else max_memory_gib_free, + max_memory_gib_free) + 1 + ) + return memory_range + + +def get_balanced_memory(gpu_count, gpu_memory, max_memory): + return min_none( + round_up( + RAM_PER_VRAM * gpu_memory * gpu_count, RAM_DIV), + round_down(max_memory, RAM_DIV)) + + +def get_balanced_cpu(memory, max_cpu): + return min_none( + round_up(ceil(memory / RAM_PER_CORE), CPU_DIV), + round_down(max_cpu, CPU_DIV), # can be None + ) + + +def get_balanced_disk_size(available_disk, memory, total_gpu_memory, max_disk_size, min_disk_size): + return max_none( + min_none( + available_disk, + max(memory, total_gpu_memory), + max_disk_size, + ), min_disk_size) + + def gpu_name(name: str) -> Optional[str]: if not name: return None @@ -112,6 +281,29 @@ def get_memory(gpu_name: str) -> Optional[int]: raise Exception("There is no '%s' in KNOWN_GPUS", gpu_name) +def round_up(value: Optional[Union[int, float]], step: int) -> Optional[int]: + if value is None: + return None + return round_down(value + step - 1, step) + + +def round_down(value: Optional[Union[int, float]], step: int) -> Optional[int]: + if value is None: + return None + return value // step * step + + +T = TypeVar("T", bound=Union[int, float]) + + +def min_none(*args: Optional[T]) -> T: + return min(v for v in args if v is not None) + + +def max_none(*args: Optional[T]) -> T: + return max(v for v in args if v is not None) + + GPU_MAP = { "RTX A4000": "A4000", "RTX A4500": "A4500", @@ -119,373 +311,5 @@ def get_memory(gpu_name: str) -> Optional[int]: "RTX A6000": "A6000", "NVIDIA A40": "A40", "NVIDIA V100": "V100", -} - -GPU_MACHINES = [ - CpuMemoryGpu(1, 1, 1), - CpuMemoryGpu(1, 2, 1), - CpuMemoryGpu(1, 3, 1), - CpuMemoryGpu(1, 4, 1), - CpuMemoryGpu(2, 2, 1), - CpuMemoryGpu(2, 2, 2), - CpuMemoryGpu(2, 3, 1), - CpuMemoryGpu(2, 3, 2), - CpuMemoryGpu(2, 4, 1), - CpuMemoryGpu(2, 4, 2), - CpuMemoryGpu(2, 6, 1), - CpuMemoryGpu(2, 6, 2), - CpuMemoryGpu(2, 8, 1), - CpuMemoryGpu(2, 8, 2), - CpuMemoryGpu(3, 3, 1), - CpuMemoryGpu(3, 3, 2), - CpuMemoryGpu(3, 3, 3), - CpuMemoryGpu(3, 4, 1), - CpuMemoryGpu(3, 4, 2), - CpuMemoryGpu(3, 4, 3), - CpuMemoryGpu(3, 6, 1), - CpuMemoryGpu(3, 6, 2), - CpuMemoryGpu(3, 6, 3), - CpuMemoryGpu(3, 8, 1), - CpuMemoryGpu(3, 8, 2), - CpuMemoryGpu(3, 8, 3), - CpuMemoryGpu(3, 12, 1), - CpuMemoryGpu(3, 12, 2), - CpuMemoryGpu(3, 12, 3), - CpuMemoryGpu(4, 4, 1), - CpuMemoryGpu(4, 4, 2), - CpuMemoryGpu(4, 4, 3), - CpuMemoryGpu(4, 4, 4), - CpuMemoryGpu(4, 6, 1), - CpuMemoryGpu(4, 6, 2), - CpuMemoryGpu(4, 6, 3), - CpuMemoryGpu(4, 6, 4), - CpuMemoryGpu(4, 8, 1), - CpuMemoryGpu(4, 8, 2), - CpuMemoryGpu(4, 8, 3), - CpuMemoryGpu(4, 8, 4), - CpuMemoryGpu(4, 12, 1), - CpuMemoryGpu(4, 12, 2), - CpuMemoryGpu(4, 12, 3), - CpuMemoryGpu(4, 12, 4), - CpuMemoryGpu(4, 16, 1), - CpuMemoryGpu(4, 16, 2), - CpuMemoryGpu(4, 16, 3), - CpuMemoryGpu(4, 16, 4), - CpuMemoryGpu(6, 6, 1), - CpuMemoryGpu(6, 6, 2), - CpuMemoryGpu(6, 6, 3), - CpuMemoryGpu(6, 6, 4), - CpuMemoryGpu(6, 6, 5), - CpuMemoryGpu(6, 6, 6), - CpuMemoryGpu(6, 8, 1), - CpuMemoryGpu(6, 8, 2), - CpuMemoryGpu(6, 8, 3), - CpuMemoryGpu(6, 8, 4), - CpuMemoryGpu(6, 8, 5), - CpuMemoryGpu(6, 8, 6), - CpuMemoryGpu(6, 12, 1), - CpuMemoryGpu(6, 12, 2), - CpuMemoryGpu(6, 12, 3), - CpuMemoryGpu(6, 12, 4), - CpuMemoryGpu(6, 12, 5), - CpuMemoryGpu(6, 12, 6), - CpuMemoryGpu(6, 16, 1), - CpuMemoryGpu(6, 16, 2), - CpuMemoryGpu(6, 16, 3), - CpuMemoryGpu(6, 16, 4), - CpuMemoryGpu(6, 16, 5), - CpuMemoryGpu(6, 16, 6), - CpuMemoryGpu(6, 24, 1), - CpuMemoryGpu(6, 24, 2), - CpuMemoryGpu(6, 24, 3), - CpuMemoryGpu(6, 24, 4), - CpuMemoryGpu(6, 24, 5), - CpuMemoryGpu(6, 24, 6), - CpuMemoryGpu(8, 8, 1), - CpuMemoryGpu(8, 8, 2), - CpuMemoryGpu(8, 8, 3), - CpuMemoryGpu(8, 8, 4), - CpuMemoryGpu(8, 8, 5), - CpuMemoryGpu(8, 8, 6), - CpuMemoryGpu(8, 8, 7), - CpuMemoryGpu(8, 8, 8), - CpuMemoryGpu(8, 12, 1), - CpuMemoryGpu(8, 12, 2), - CpuMemoryGpu(8, 12, 3), - CpuMemoryGpu(8, 12, 4), - CpuMemoryGpu(8, 12, 5), - CpuMemoryGpu(8, 12, 6), - CpuMemoryGpu(8, 12, 7), - CpuMemoryGpu(8, 12, 8), - CpuMemoryGpu(8, 16, 1), - CpuMemoryGpu(8, 16, 2), - CpuMemoryGpu(8, 16, 3), - CpuMemoryGpu(8, 16, 4), - CpuMemoryGpu(8, 16, 5), - CpuMemoryGpu(8, 16, 6), - CpuMemoryGpu(8, 16, 7), - CpuMemoryGpu(8, 16, 8), - CpuMemoryGpu(8, 24, 1), - CpuMemoryGpu(8, 24, 2), - CpuMemoryGpu(8, 24, 3), - CpuMemoryGpu(8, 24, 4), - CpuMemoryGpu(8, 24, 5), - CpuMemoryGpu(8, 24, 6), - CpuMemoryGpu(8, 24, 7), - CpuMemoryGpu(8, 24, 8), - CpuMemoryGpu(8, 32, 1), - CpuMemoryGpu(8, 32, 2), - CpuMemoryGpu(8, 32, 3), - CpuMemoryGpu(8, 32, 4), - CpuMemoryGpu(8, 32, 5), - CpuMemoryGpu(8, 32, 6), - CpuMemoryGpu(8, 32, 7), - CpuMemoryGpu(8, 32, 8), - CpuMemoryGpu(12, 12, 1), - CpuMemoryGpu(12, 12, 2), - CpuMemoryGpu(12, 12, 3), - CpuMemoryGpu(12, 12, 4), - CpuMemoryGpu(12, 12, 5), - CpuMemoryGpu(12, 12, 6), - CpuMemoryGpu(12, 12, 7), - CpuMemoryGpu(12, 12, 8), - CpuMemoryGpu(12, 16, 1), - CpuMemoryGpu(12, 16, 2), - CpuMemoryGpu(12, 16, 3), - CpuMemoryGpu(12, 16, 4), - CpuMemoryGpu(12, 16, 5), - CpuMemoryGpu(12, 16, 6), - CpuMemoryGpu(12, 16, 7), - CpuMemoryGpu(12, 16, 8), - CpuMemoryGpu(12, 24, 1), - CpuMemoryGpu(12, 24, 2), - CpuMemoryGpu(12, 24, 3), - CpuMemoryGpu(12, 24, 4), - CpuMemoryGpu(12, 24, 5), - CpuMemoryGpu(12, 24, 6), - CpuMemoryGpu(12, 24, 7), - CpuMemoryGpu(12, 24, 8), - CpuMemoryGpu(12, 32, 1), - CpuMemoryGpu(12, 32, 2), - CpuMemoryGpu(12, 32, 3), - CpuMemoryGpu(12, 32, 4), - CpuMemoryGpu(12, 32, 5), - CpuMemoryGpu(12, 32, 6), - CpuMemoryGpu(12, 32, 7), - CpuMemoryGpu(12, 32, 8), - CpuMemoryGpu(12, 48, 1), - CpuMemoryGpu(12, 48, 2), - CpuMemoryGpu(12, 48, 3), - CpuMemoryGpu(12, 48, 4), - CpuMemoryGpu(12, 48, 5), - CpuMemoryGpu(12, 48, 6), - CpuMemoryGpu(12, 48, 7), - CpuMemoryGpu(12, 48, 8), - CpuMemoryGpu(16, 16, 1), - CpuMemoryGpu(16, 16, 2), - CpuMemoryGpu(16, 16, 3), - CpuMemoryGpu(16, 16, 4), - CpuMemoryGpu(16, 16, 5), - CpuMemoryGpu(16, 16, 6), - CpuMemoryGpu(16, 16, 7), - CpuMemoryGpu(16, 16, 8), - CpuMemoryGpu(16, 24, 1), - CpuMemoryGpu(16, 24, 2), - CpuMemoryGpu(16, 24, 3), - CpuMemoryGpu(16, 24, 4), - CpuMemoryGpu(16, 24, 5), - CpuMemoryGpu(16, 24, 6), - CpuMemoryGpu(16, 24, 7), - CpuMemoryGpu(16, 24, 8), - CpuMemoryGpu(16, 32, 1), - CpuMemoryGpu(16, 32, 2), - CpuMemoryGpu(16, 32, 3), - CpuMemoryGpu(16, 32, 4), - CpuMemoryGpu(16, 32, 5), - CpuMemoryGpu(16, 32, 6), - CpuMemoryGpu(16, 32, 7), - CpuMemoryGpu(16, 32, 8), - CpuMemoryGpu(16, 48, 1), - CpuMemoryGpu(16, 48, 2), - CpuMemoryGpu(16, 48, 3), - CpuMemoryGpu(16, 48, 4), - CpuMemoryGpu(16, 48, 5), - CpuMemoryGpu(16, 48, 6), - CpuMemoryGpu(16, 48, 7), - CpuMemoryGpu(16, 48, 8), - CpuMemoryGpu(16, 64, 1), - CpuMemoryGpu(16, 64, 2), - CpuMemoryGpu(16, 64, 3), - CpuMemoryGpu(16, 64, 4), - CpuMemoryGpu(16, 64, 5), - CpuMemoryGpu(16, 64, 6), - CpuMemoryGpu(16, 64, 7), - CpuMemoryGpu(16, 64, 8), - CpuMemoryGpu(24, 24, 1), - CpuMemoryGpu(24, 24, 2), - CpuMemoryGpu(24, 24, 3), - CpuMemoryGpu(24, 24, 4), - CpuMemoryGpu(24, 24, 5), - CpuMemoryGpu(24, 24, 6), - CpuMemoryGpu(24, 24, 7), - CpuMemoryGpu(24, 24, 8), - CpuMemoryGpu(24, 32, 1), - CpuMemoryGpu(24, 32, 2), - CpuMemoryGpu(24, 32, 3), - CpuMemoryGpu(24, 32, 4), - CpuMemoryGpu(24, 32, 5), - CpuMemoryGpu(24, 32, 6), - CpuMemoryGpu(24, 32, 7), - CpuMemoryGpu(24, 32, 8), - CpuMemoryGpu(24, 48, 1), - CpuMemoryGpu(24, 48, 2), - CpuMemoryGpu(24, 48, 3), - CpuMemoryGpu(24, 48, 4), - CpuMemoryGpu(24, 48, 5), - CpuMemoryGpu(24, 48, 6), - CpuMemoryGpu(24, 48, 7), - CpuMemoryGpu(24, 48, 8), - CpuMemoryGpu(24, 64, 1), - CpuMemoryGpu(24, 64, 2), - CpuMemoryGpu(24, 64, 3), - CpuMemoryGpu(24, 64, 4), - CpuMemoryGpu(24, 64, 5), - CpuMemoryGpu(24, 64, 6), - CpuMemoryGpu(24, 64, 7), - CpuMemoryGpu(24, 64, 8), - CpuMemoryGpu(24, 96, 1), - CpuMemoryGpu(24, 96, 2), - CpuMemoryGpu(24, 96, 3), - CpuMemoryGpu(24, 96, 4), - CpuMemoryGpu(24, 96, 5), - CpuMemoryGpu(24, 96, 6), - CpuMemoryGpu(24, 96, 7), - CpuMemoryGpu(24, 96, 8), - CpuMemoryGpu(32, 32, 1), - CpuMemoryGpu(32, 32, 2), - CpuMemoryGpu(32, 32, 3), - CpuMemoryGpu(32, 32, 4), - CpuMemoryGpu(32, 32, 5), - CpuMemoryGpu(32, 32, 6), - CpuMemoryGpu(32, 32, 7), - CpuMemoryGpu(32, 32, 8), - CpuMemoryGpu(32, 48, 1), - CpuMemoryGpu(32, 48, 2), - CpuMemoryGpu(32, 48, 3), - CpuMemoryGpu(32, 48, 4), - CpuMemoryGpu(32, 48, 5), - CpuMemoryGpu(32, 48, 6), - CpuMemoryGpu(32, 48, 7), - CpuMemoryGpu(32, 48, 8), - CpuMemoryGpu(32, 64, 1), - CpuMemoryGpu(32, 64, 2), - CpuMemoryGpu(32, 64, 3), - CpuMemoryGpu(32, 64, 4), - CpuMemoryGpu(32, 64, 5), - CpuMemoryGpu(32, 64, 6), - CpuMemoryGpu(32, 64, 7), - CpuMemoryGpu(32, 64, 8), - CpuMemoryGpu(32, 96, 1), - CpuMemoryGpu(32, 96, 2), - CpuMemoryGpu(32, 96, 3), - CpuMemoryGpu(32, 96, 4), - CpuMemoryGpu(32, 96, 5), - CpuMemoryGpu(32, 96, 6), - CpuMemoryGpu(32, 96, 7), - CpuMemoryGpu(32, 96, 8), - CpuMemoryGpu(32, 128, 2), - CpuMemoryGpu(32, 128, 3), - CpuMemoryGpu(32, 128, 4), - CpuMemoryGpu(32, 128, 5), - CpuMemoryGpu(32, 128, 6), - CpuMemoryGpu(32, 128, 7), - CpuMemoryGpu(32, 128, 8), - CpuMemoryGpu(48, 48, 2), - CpuMemoryGpu(48, 48, 3), - CpuMemoryGpu(48, 48, 4), - CpuMemoryGpu(48, 48, 5), - CpuMemoryGpu(48, 48, 6), - CpuMemoryGpu(48, 48, 7), - CpuMemoryGpu(48, 48, 8), - CpuMemoryGpu(48, 64, 2), - CpuMemoryGpu(48, 64, 3), - CpuMemoryGpu(48, 64, 4), - CpuMemoryGpu(48, 64, 5), - CpuMemoryGpu(48, 64, 6), - CpuMemoryGpu(48, 64, 7), - CpuMemoryGpu(48, 64, 8), - CpuMemoryGpu(48, 96, 2), - CpuMemoryGpu(48, 96, 3), - CpuMemoryGpu(48, 96, 4), - CpuMemoryGpu(48, 96, 5), - CpuMemoryGpu(48, 96, 6), - CpuMemoryGpu(48, 96, 7), - CpuMemoryGpu(48, 96, 8), - CpuMemoryGpu(48, 128, 2), - CpuMemoryGpu(48, 128, 3), - CpuMemoryGpu(48, 128, 4), - CpuMemoryGpu(48, 128, 5), - CpuMemoryGpu(48, 128, 6), - CpuMemoryGpu(48, 128, 7), - CpuMemoryGpu(48, 128, 8), - CpuMemoryGpu(48, 192, 2), - CpuMemoryGpu(48, 192, 3), - CpuMemoryGpu(48, 192, 4), - CpuMemoryGpu(48, 192, 5), - CpuMemoryGpu(48, 192, 6), - CpuMemoryGpu(48, 192, 7), - CpuMemoryGpu(48, 192, 8), - CpuMemoryGpu(64, 64, 2), - CpuMemoryGpu(64, 64, 3), - CpuMemoryGpu(64, 64, 4), - CpuMemoryGpu(64, 64, 5), - CpuMemoryGpu(64, 64, 6), - CpuMemoryGpu(64, 64, 7), - CpuMemoryGpu(64, 64, 8), - CpuMemoryGpu(64, 96, 2), - CpuMemoryGpu(64, 96, 3), - CpuMemoryGpu(64, 96, 4), - CpuMemoryGpu(64, 96, 5), - CpuMemoryGpu(64, 96, 6), - CpuMemoryGpu(64, 96, 7), - CpuMemoryGpu(64, 96, 8), - CpuMemoryGpu(64, 128, 2), - CpuMemoryGpu(64, 128, 3), - CpuMemoryGpu(64, 128, 4), - CpuMemoryGpu(64, 128, 5), - CpuMemoryGpu(64, 128, 6), - CpuMemoryGpu(64, 128, 7), - CpuMemoryGpu(64, 128, 8), - CpuMemoryGpu(64, 192, 2), - CpuMemoryGpu(64, 192, 4), - CpuMemoryGpu(64, 192, 5), - CpuMemoryGpu(64, 192, 6), - CpuMemoryGpu(64, 192, 7), - CpuMemoryGpu(64, 192, 8), - CpuMemoryGpu(64, 256, 4), - CpuMemoryGpu(64, 256, 5), - CpuMemoryGpu(64, 256, 6), - CpuMemoryGpu(64, 256, 7), - CpuMemoryGpu(64, 256, 8), - CpuMemoryGpu(96, 96, 4), - CpuMemoryGpu(96, 96, 6), - CpuMemoryGpu(96, 96, 7), - CpuMemoryGpu(96, 96, 8), - CpuMemoryGpu(96, 128, 4), - CpuMemoryGpu(96, 128, 6), - CpuMemoryGpu(96, 128, 7), - CpuMemoryGpu(96, 128, 8), - CpuMemoryGpu(96, 192, 6), - CpuMemoryGpu(96, 192, 7), - CpuMemoryGpu(96, 192, 8), - CpuMemoryGpu(96, 256, 6), - CpuMemoryGpu(96, 256, 7), - CpuMemoryGpu(96, 256, 8), - CpuMemoryGpu(96, 384, 6), - CpuMemoryGpu(96, 384, 7), - CpuMemoryGpu(96, 384, 8), - CpuMemoryGpu(128, 128, 8), - CpuMemoryGpu(128, 192, 8), - CpuMemoryGpu(128, 256, 8), - CpuMemoryGpu(128, 384, 8), -] + "RTX 3080": "RTX3080" +} \ No newline at end of file diff --git a/src/integrity_tests/test_cudo.py b/src/integrity_tests/test_cudo.py deleted file mode 100644 index c23e92c..0000000 --- a/src/integrity_tests/test_cudo.py +++ /dev/null @@ -1,48 +0,0 @@ -import csv -from collections import Counter -from pathlib import Path -from typing import List - -import pytest - -from gpuhunt.providers.cudo import GPU_MAP - - -@pytest.fixture -def data_rows(catalog_dir: Path) -> List[dict]: - print(catalog_dir) - file = catalog_dir / "cudo.csv" - reader = csv.DictReader(file.open()) - return list(reader) - - -def select_row(rows, name: str) -> List[str]: - return [r[name] for r in rows] - - -@pytest.mark.xfail -def test_locations(data_rows): - expected = { - "no-luster-1", - "se-smedjebacken-1", - "se-stockholm-1", - "us-newyork-1", - "us-santaclara-1", - } - locations = select_row(data_rows, "location") - assert set(locations) == expected - - count = Counter(locations) - for loc in expected: - assert count[loc] > 1 - - -def test_price(data_rows): - prices = select_row(data_rows, "price") - assert min(float(p) for p in prices) > 0 - - -def test_gpu_present(data_rows): - refs = GPU_MAP.values() - gpus = select_row(data_rows, "gpu_name") - assert all(i in refs for i in gpus) diff --git a/src/tests/providers/test_cudo.py b/src/tests/providers/test_cudo.py new file mode 100644 index 0000000..ae7ee86 --- /dev/null +++ b/src/tests/providers/test_cudo.py @@ -0,0 +1,101 @@ +from itertools import chain +from typing import List + +import pytest +from gpuhunt import QueryFilter + +from src.gpuhunt.providers.cudo import CudoProvider, get_memory, gpu_name, get_balanced_memory, \ + get_balanced_disk_size, optimize_offers_with_gpu + + +@pytest.fixture +def machine_types() -> List[dict]: + return [{ + "dataCenterId": "br-saopaulo-1", + "machineType": "cascade-lake", + "cpuModel": "Cascadelake-Server-noTSX", + "gpuModel": "RTX 3080", + "gpuModelId": "nvidia-rtx-3080", + "minVcpuPerMemoryGib": 0.25, + "maxVcpuPerMemoryGib": 1, + "minVcpuPerGpu": 1, + "maxVcpuPerGpu": 13, + "vcpuPriceHr": { + "value": "0.002500" + }, + "memoryGibPriceHr": { + "value": "0.003800" + }, + "gpuPriceHr": { + "value": "0.05" + }, + "minStorageGibPriceHr": { + "value": "0.00013" + }, + "ipv4PriceHr": { + "value": "0.005500" + }, + "maxVcpuFree": 76, + "totalVcpuFree": 377, + "maxMemoryGibFree": 227, + "totalMemoryGibFree": 1132, + "maxGpuFree": 5, + "totalGpuFree": 24, + "maxStorageGibFree": 42420, + "totalStorageGibFree": 42420 + }] + + +def test_get_offers_with_query_filter(): + cudo = CudoProvider() + offers = cudo.get(QueryFilter(min_gpu_count=1, max_gpu_count=1), balance_resources=True) + print(f'{len(offers)} offers found') + assert len(offers) >= 1, f'No offers found' + + +def test_get_offers_no_query_filter(): + cudo = CudoProvider() + offers = cudo.get(balance_resources=True) + print(f'{len(offers)} offers found') + assert len(offers) >= 1, f'No offers found' + + +def test_optimize_offers(machine_types): + machine_type = machine_types[0] + machine_type["gpu_memory"] = get_memory(gpu_name(machine_type["gpuModel"])) + q = QueryFilter(min_cpu=2, min_gpu_count=1, max_gpu_count=1, min_memory=8) + balance_resource = True + available_disk = machine_type["maxStorageGibFree"] + gpu_memory = get_memory(gpu_name(machine_type["gpuModel"])) + max_memory = q.max_memory + max_disk_size = q.max_disk_size + min_disk_size = q.min_disk_size + vm_configs = optimize_offers_with_gpu(q, machine_type, balance_resources=balance_resource) + + assert len(vm_configs) >= 1 + + for config in vm_configs: + min_cpus_for_memory = machine_type["minVcpuPerMemoryGib"] * config["memory"] + max_cpus_for_memory = machine_type["maxVcpuPerMemoryGib"] * config["memory"] + min_cpus_for_gpu = machine_type["minVcpuPerGpu"] * config["gpu"] + assert config["cpu"] >= min_cpus_for_memory, \ + f"VM config does not meet the minimum CPU:Memory requirement. Required minimum CPUs: " \ + f"{min_cpus_for_memory}, Found: {config['cpu']}" + assert config["cpu"] <= max_cpus_for_memory, \ + f"VM config exceeds the maximum CPU:Memory allowance. Allowed maximum CPUs: " \ + f"{max_cpus_for_memory}, Found: {config['cpu']}" + assert config["cpu"] >= min_cpus_for_gpu, \ + f"VM config does not meet the minimum CPU:GPU requirement. " \ + f"Required minimum CPUs: {min_cpus_for_gpu}, Found: {config['cpu']}" + # Perform the balance resource checks if balance_resource is True + if balance_resource: + expected_memory = get_balanced_memory(config['gpu'], gpu_memory, max_memory) + expected_disk_size = get_balanced_disk_size(available_disk, config['memory'], config["gpu"] * gpu_memory, + max_disk_size, min_disk_size) + + assert config['memory'] == expected_memory, \ + f"Memory allocation does not match the expected balanced memory. " \ + f"Expected: {expected_memory}, Found: {config['memory']} in config {config}" + assert config['disk_size'] == expected_disk_size, \ + f"Disk size allocation does not match the expected balanced disk size. " \ + f"Expected: {expected_disk_size}, Found: {config['disk_size']}" From 189e2160b69c456c8c0a9fd7f913b657803b8f50 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 12 Mar 2024 23:09:43 +0545 Subject: [PATCH 2/9] Sort imports for cudo --- src/gpuhunt/providers/cudo.py | 6 +++--- src/tests/providers/test_cudo.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py index cf3f22e..7059cb1 100644 --- a/src/gpuhunt/providers/cudo.py +++ b/src/gpuhunt/providers/cudo.py @@ -2,12 +2,12 @@ from collections import namedtuple from itertools import chain from math import ceil -from typing import List, Optional, Union, TypeVar +from typing import List, Optional, TypeVar, Union import requests - from gpuhunt import QueryFilter, RawCatalogItem -from gpuhunt._internal.constraints import KNOWN_GPUS, is_between, get_compute_capability +from gpuhunt._internal.constraints import (KNOWN_GPUS, get_compute_capability, + is_between) from gpuhunt.providers import AbstractProvider CpuMemoryGpu = namedtuple("CpuMemoryGpu", ["cpu", "memory", "gpu"]) diff --git a/src/tests/providers/test_cudo.py b/src/tests/providers/test_cudo.py index ae7ee86..ef0218a 100644 --- a/src/tests/providers/test_cudo.py +++ b/src/tests/providers/test_cudo.py @@ -3,9 +3,9 @@ import pytest from gpuhunt import QueryFilter - -from src.gpuhunt.providers.cudo import CudoProvider, get_memory, gpu_name, get_balanced_memory, \ - get_balanced_disk_size, optimize_offers_with_gpu +from src.gpuhunt.providers.cudo import (CudoProvider, get_balanced_disk_size, + get_balanced_memory, get_memory, + gpu_name, optimize_offers_with_gpu) @pytest.fixture From 9a64a2ca063ebfea021fbd2f1b87dab809f03aa2 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 13 Mar 2024 12:18:38 +0545 Subject: [PATCH 3/9] Isort cudo as per pyconfig --- src/gpuhunt/providers/cudo.py | 4 ++-- src/tests/providers/test_cudo.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py index 7059cb1..5112d18 100644 --- a/src/gpuhunt/providers/cudo.py +++ b/src/gpuhunt/providers/cudo.py @@ -5,9 +5,9 @@ from typing import List, Optional, TypeVar, Union import requests + from gpuhunt import QueryFilter, RawCatalogItem -from gpuhunt._internal.constraints import (KNOWN_GPUS, get_compute_capability, - is_between) +from gpuhunt._internal.constraints import KNOWN_GPUS, get_compute_capability, is_between from gpuhunt.providers import AbstractProvider CpuMemoryGpu = namedtuple("CpuMemoryGpu", ["cpu", "memory", "gpu"]) diff --git a/src/tests/providers/test_cudo.py b/src/tests/providers/test_cudo.py index ef0218a..9d0a8ef 100644 --- a/src/tests/providers/test_cudo.py +++ b/src/tests/providers/test_cudo.py @@ -2,10 +2,17 @@ from typing import List import pytest + +from src.gpuhunt.providers.cudo import ( + CudoProvider, + get_balanced_disk_size, + get_balanced_memory, + get_memory, + gpu_name, + optimize_offers_with_gpu, +) + from gpuhunt import QueryFilter -from src.gpuhunt.providers.cudo import (CudoProvider, get_balanced_disk_size, - get_balanced_memory, get_memory, - gpu_name, optimize_offers_with_gpu) @pytest.fixture From abf59baa5ab817fd16956d3937e0efffa5c4a9fb Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 13 Mar 2024 12:28:20 +0545 Subject: [PATCH 4/9] Run pre-commit for cudo --- src/gpuhunt/_internal/default.py | 2 +- src/gpuhunt/providers/cudo.py | 162 +++++++++++++++++++------------ src/tests/providers/test_cudo.py | 105 ++++++++++---------- 3 files changed, 154 insertions(+), 115 deletions(-) diff --git a/src/gpuhunt/_internal/default.py b/src/gpuhunt/_internal/default.py index d437789..3ea430b 100644 --- a/src/gpuhunt/_internal/default.py +++ b/src/gpuhunt/_internal/default.py @@ -21,7 +21,7 @@ def default_catalog() -> Catalog: for module, provider in [ ("gpuhunt.providers.tensordock", "TensorDockProvider"), ("gpuhunt.providers.vastai", "VastAIProvider"), - ("gpuhunt.providers.cudo", "CudoProvider") + ("gpuhunt.providers.cudo", "CudoProvider"), ]: try: module = importlib.import_module(module) diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py index 5112d18..b891aa5 100644 --- a/src/gpuhunt/providers/cudo.py +++ b/src/gpuhunt/providers/cudo.py @@ -27,19 +27,23 @@ class CudoProvider(AbstractProvider): NAME = "cudo" def get( - self, query_filter: Optional[QueryFilter] = None, balance_resources: bool = True + self, query_filter: Optional[QueryFilter] = None, balance_resources: bool = True ) -> List[RawCatalogItem]: offers = self.fetch_offers(query_filter, balance_resources) return sorted(offers, key=lambda i: i.price) - def fetch_offers(self, query_filter: Optional[QueryFilter], balance_resources) -> List[RawCatalogItem]: + def fetch_offers( + self, query_filter: Optional[QueryFilter], balance_resources + ) -> List[RawCatalogItem]: machine_types = self.list_vm_machine_types() if query_filter is not None: return self.optimize_offers(machine_types, query_filter, balance_resources) else: offers = [] for machine_type in machine_types: - optimized_specs = optimize_offers_with_gpu(QueryFilter(), machine_type, balance_resources=False) + optimized_specs = optimize_offers_with_gpu( + QueryFilter(), machine_type, balance_resources=False + ) raw_catalogs = [get_raw_catalog(machine_type, spec) for spec in optimized_specs] offers.append(raw_catalogs) return list(chain.from_iterable(offers)) @@ -56,29 +60,43 @@ def list_vm_machine_types() -> list[dict]: resp.raise_for_status() @staticmethod - def optimize_offers(machine_types: list[dict], q: QueryFilter, balance_resource) -> List[RawCatalogItem]: + def optimize_offers( + machine_types: list[dict], q: QueryFilter, balance_resource + ) -> List[RawCatalogItem]: offers = [] - if any(condition is not None for condition in - [q.min_gpu_count, q.max_gpu_count, q.min_total_gpu_memory, q.max_total_gpu_memory, - q.min_gpu_memory, q.max_gpu_memory, q.gpu_name]): + if any( + condition is not None + for condition in [ + q.min_gpu_count, + q.max_gpu_count, + q.min_total_gpu_memory, + q.max_total_gpu_memory, + q.min_gpu_memory, + q.max_gpu_memory, + q.gpu_name, + ] + ): # filter offers with gpus - gpu_machine_types = [vm for vm in machine_types if vm['maxGpuFree'] != 0] + gpu_machine_types = [vm for vm in machine_types if vm["maxGpuFree"] != 0] for machine_type in gpu_machine_types: machine_type["gpu_name"] = gpu_name(machine_type["gpuModel"]) machine_type["gpu_memory"] = get_memory(machine_type["gpu_name"]) - if not is_between(machine_type["gpu_memory"], q.min_gpu_memory, - q.max_total_gpu_memory): + if not is_between( + machine_type["gpu_memory"], q.min_gpu_memory, q.max_total_gpu_memory + ): continue if q.gpu_name is not None and machine_type["gpu_name"].lower() not in q.gpu_name: continue cc = get_compute_capability(machine_type["gpu_name"]) - if not cc or not is_between(cc, q.min_compute_capability, q.max_compute_capability): + if not cc or not is_between( + cc, q.min_compute_capability, q.max_compute_capability + ): continue optimized_specs = optimize_offers_with_gpu(q, machine_type, balance_resource) raw_catalogs = [get_raw_catalog(machine_type, spec) for spec in optimized_specs] offers.append(raw_catalogs) else: - cpu_only_machine_types = [vm for vm in machine_types if vm['maxGpuFree'] == 0] + cpu_only_machine_types = [vm for vm in machine_types if vm["maxGpuFree"] == 0] for machine_type in cpu_only_machine_types: optimized_specs = optimize_offers_no_gpu(q, machine_type, balance_resource) raw_catalogs = [get_raw_catalog(machine_type, spec) for spec in optimized_specs] @@ -103,11 +121,11 @@ def get_raw_catalog(machine_type, spec): instance_name=machine_type["machineType"], location=machine_type["dataCenterId"], spot=False, - price=(round(float(machine_type["vcpuPriceHr"]["value"]), 5) * spec["cpu"]) + - (round(float(machine_type["memoryGibPriceHr"]["value"]), 5) * spec["memory"]) + - (round(float(machine_type["gpuPriceHr"]["value"]), 5) * spec.get("gpu", 0)) + - (round(float(machine_type["minStorageGibPriceHr"]["value"]), 5) * spec["disk_size"]) + - (round(float(machine_type["ipv4PriceHr"]["value"]), 5)), + price=(round(float(machine_type["vcpuPriceHr"]["value"]), 5) * spec["cpu"]) + + (round(float(machine_type["memoryGibPriceHr"]["value"]), 5) * spec["memory"]) + + (round(float(machine_type["gpuPriceHr"]["value"]), 5) * spec.get("gpu", 0)) + + (round(float(machine_type["minStorageGibPriceHr"]["value"]), 5) * spec["disk_size"]) + + (round(float(machine_type["ipv4PriceHr"]["value"]), 5)), cpu=spec["cpu"], memory=spec["memory"], gpu_count=spec.get("gpu", 0), @@ -124,15 +142,17 @@ def optimize_offers_with_gpu(q: QueryFilter, machine_type, balance_resources) -> gpu_range = get_gpu_range(q.min_gpu_count, q.max_gpu_count, machine_type["maxGpuFree"]) memory_range = get_memory_range(q.min_memory, q.max_memory, machine_type["maxMemoryGibFree"]) min_vcpu_per_memory_gib = machine_type.get("minVcpuPerMemoryGib", 0) - max_vcpu_per_memory_gib = machine_type.get("maxVcpuPerMemoryGib", float('inf')) + max_vcpu_per_memory_gib = machine_type.get("maxVcpuPerMemoryGib", float("inf")) min_vcpu_per_gpu = machine_type.get("minVcpuPerGpu", 0) - max_vcpu_per_gpu = machine_type.get("maxVcpuPerGpu", float('inf')) + max_vcpu_per_gpu = machine_type.get("maxVcpuPerGpu", float("inf")) unbalanced_specs = [] for cpu in cpu_range: for gpu in gpu_range: for memory in memory_range: # Check CPU/memory constraints - if not is_between(cpu, memory * min_vcpu_per_memory_gib, memory * max_vcpu_per_memory_gib): + if not is_between( + cpu, memory * min_vcpu_per_memory_gib, memory * max_vcpu_per_memory_gib + ): continue # Check CPU/GPU constraints @@ -145,29 +165,38 @@ def optimize_offers_with_gpu(q: QueryFilter, machine_type, balance_resources) -> # If resource balancing is required, filter combinations to meet the balanced memory requirement if balance_resources: - memory_balanced = [spec for spec in unbalanced_specs - if spec["memory"] == - get_balanced_memory(spec["gpu"], machine_type["gpu_memory"], q.max_memory)] + memory_balanced = [ + spec + for spec in unbalanced_specs + if spec["memory"] + == get_balanced_memory(spec["gpu"], machine_type["gpu_memory"], q.max_memory) + ] balanced_specs = memory_balanced # Add disk - balanced_specs = [{"cpu": spec["cpu"], - "memory": spec["memory"], - "gpu": spec["gpu"], - "disk_size": get_balanced_disk_size(machine_type["maxStorageGibFree"], - spec["memory"], - spec["gpu"] * machine_type["gpu_memory"], - q.max_disk_size, q.min_disk_size)} - for spec in balanced_specs] + balanced_specs = [ + { + "cpu": spec["cpu"], + "memory": spec["memory"], + "gpu": spec["gpu"], + "disk_size": get_balanced_disk_size( + machine_type["maxStorageGibFree"], + spec["memory"], + spec["gpu"] * machine_type["gpu_memory"], + q.max_disk_size, + q.min_disk_size, + ), + } + for spec in balanced_specs + ] # Return balanced combinations if any; otherwise, return all combinations return balanced_specs disk_size = q.min_disk_size if q.min_disk_size is not None else MIN_DISK_SIZE # Add disk - unbalanced_specs = [{"cpu": spec["cpu"], - "memory": spec["memory"], - "gpu": spec["gpu"], - "disk_size": disk_size} - for spec in unbalanced_specs] + unbalanced_specs = [ + {"cpu": spec["cpu"], "memory": spec["memory"], "gpu": spec["gpu"], "disk_size": disk_size} + for spec in unbalanced_specs + ] return unbalanced_specs @@ -178,48 +207,55 @@ def optimize_offers_no_gpu(q: QueryFilter, machine_type, balance_resource) -> Li # Cudo Specific Constraints min_vcpu_per_memory_gib = machine_type.get("minVcpuPerMemoryGib", 0) - max_vcpu_per_memory_gib = machine_type.get("maxVcpuPerMemoryGib", float('inf')) + max_vcpu_per_memory_gib = machine_type.get("maxVcpuPerMemoryGib", float("inf")) unbalanced_specs = [] for cpu in cpu_range: for memory in memory_range: # Check CPU/memory constraints - if not is_between(cpu, memory * min_vcpu_per_memory_gib, memory * max_vcpu_per_memory_gib): + if not is_between( + cpu, memory * min_vcpu_per_memory_gib, memory * max_vcpu_per_memory_gib + ): continue # If all constraints are met, append this combination unbalanced_specs.append({"cpu": cpu, "memory": memory}) # If resource balancing is required, filter combinations to meet the balanced memory requirement if balance_resource: - cpu_balanced = [spec for spec in unbalanced_specs - if spec["cpu"] == - get_balanced_cpu(spec["memory"], q.max_memory)] + cpu_balanced = [ + spec + for spec in unbalanced_specs + if spec["cpu"] == get_balanced_cpu(spec["memory"], q.max_memory) + ] balanced_specs = cpu_balanced # Add disk disk_size = q.min_disk_size if q.min_disk_size is not None else MIN_DISK_SIZE - balanced_specs = [{"cpu": spec["cpu"], - "memory": spec["memory"], - "disk_size": disk_size} - for spec in balanced_specs] + balanced_specs = [ + {"cpu": spec["cpu"], "memory": spec["memory"], "disk_size": disk_size} + for spec in balanced_specs + ] # Return balanced combinations if any; otherwise, return all combinations return balanced_specs disk_size = q.min_disk_size if q.min_disk_size is not None else MIN_DISK_SIZE # Add disk - unbalanced_specs = [{"cpu": spec["cpu"], - "memory": spec["memory"], - "gpu": 0, - "disk_size": min_none(machine_type["maxStorageGibFree"], disk_size)} - for spec in unbalanced_specs] + unbalanced_specs = [ + { + "cpu": spec["cpu"], + "memory": spec["memory"], + "gpu": 0, + "disk_size": min_none(machine_type["maxStorageGibFree"], disk_size), + } + for spec in unbalanced_specs + ] return unbalanced_specs def get_cpu_range(min_cpu, max_cpu, max_cpu_free): cpu_range = range( min_cpu if min_cpu is not None else MIN_CPU, - min(max_cpu if max_cpu is not None else max_cpu_free, - max_cpu_free) + 1 + min(max_cpu if max_cpu is not None else max_cpu_free, max_cpu_free) + 1, ) return cpu_range @@ -227,8 +263,7 @@ def get_cpu_range(min_cpu, max_cpu, max_cpu_free): def get_gpu_range(min_gpu_count, max_gpu_count, max_gpu_free): gpu_range = range( min_gpu_count if min_gpu_count is not None else 1, - min(max_gpu_count if max_gpu_count is not None else max_gpu_free, - max_gpu_free) + 1 + min(max_gpu_count if max_gpu_count is not None else max_gpu_free, max_gpu_free) + 1, ) return gpu_range @@ -236,17 +271,18 @@ def get_gpu_range(min_gpu_count, max_gpu_count, max_gpu_free): def get_memory_range(min_memory, max_memory, max_memory_gib_free): memory_range = range( int(min_memory) if min_memory is not None else MIN_MEMORY, - min(int(max_memory) if max_memory is not None else max_memory_gib_free, - max_memory_gib_free) + 1 + min( + int(max_memory) if max_memory is not None else max_memory_gib_free, max_memory_gib_free + ) + + 1, ) return memory_range def get_balanced_memory(gpu_count, gpu_memory, max_memory): return min_none( - round_up( - RAM_PER_VRAM * gpu_memory * gpu_count, RAM_DIV), - round_down(max_memory, RAM_DIV)) + round_up(RAM_PER_VRAM * gpu_memory * gpu_count, RAM_DIV), round_down(max_memory, RAM_DIV) + ) def get_balanced_cpu(memory, max_cpu): @@ -262,7 +298,9 @@ def get_balanced_disk_size(available_disk, memory, total_gpu_memory, max_disk_si available_disk, max(memory, total_gpu_memory), max_disk_size, - ), min_disk_size) + ), + min_disk_size, + ) def gpu_name(name: str) -> Optional[str]: @@ -311,5 +349,5 @@ def max_none(*args: Optional[T]) -> T: "RTX A6000": "A6000", "NVIDIA A40": "A40", "NVIDIA V100": "V100", - "RTX 3080": "RTX3080" -} \ No newline at end of file + "RTX 3080": "RTX3080", +} diff --git a/src/tests/providers/test_cudo.py b/src/tests/providers/test_cudo.py index 9d0a8ef..852079e 100644 --- a/src/tests/providers/test_cudo.py +++ b/src/tests/providers/test_cudo.py @@ -1,4 +1,3 @@ -from itertools import chain from typing import List import pytest @@ -17,54 +16,46 @@ @pytest.fixture def machine_types() -> List[dict]: - return [{ - "dataCenterId": "br-saopaulo-1", - "machineType": "cascade-lake", - "cpuModel": "Cascadelake-Server-noTSX", - "gpuModel": "RTX 3080", - "gpuModelId": "nvidia-rtx-3080", - "minVcpuPerMemoryGib": 0.25, - "maxVcpuPerMemoryGib": 1, - "minVcpuPerGpu": 1, - "maxVcpuPerGpu": 13, - "vcpuPriceHr": { - "value": "0.002500" - }, - "memoryGibPriceHr": { - "value": "0.003800" - }, - "gpuPriceHr": { - "value": "0.05" - }, - "minStorageGibPriceHr": { - "value": "0.00013" - }, - "ipv4PriceHr": { - "value": "0.005500" - }, - "maxVcpuFree": 76, - "totalVcpuFree": 377, - "maxMemoryGibFree": 227, - "totalMemoryGibFree": 1132, - "maxGpuFree": 5, - "totalGpuFree": 24, - "maxStorageGibFree": 42420, - "totalStorageGibFree": 42420 - }] + return [ + { + "dataCenterId": "br-saopaulo-1", + "machineType": "cascade-lake", + "cpuModel": "Cascadelake-Server-noTSX", + "gpuModel": "RTX 3080", + "gpuModelId": "nvidia-rtx-3080", + "minVcpuPerMemoryGib": 0.25, + "maxVcpuPerMemoryGib": 1, + "minVcpuPerGpu": 1, + "maxVcpuPerGpu": 13, + "vcpuPriceHr": {"value": "0.002500"}, + "memoryGibPriceHr": {"value": "0.003800"}, + "gpuPriceHr": {"value": "0.05"}, + "minStorageGibPriceHr": {"value": "0.00013"}, + "ipv4PriceHr": {"value": "0.005500"}, + "maxVcpuFree": 76, + "totalVcpuFree": 377, + "maxMemoryGibFree": 227, + "totalMemoryGibFree": 1132, + "maxGpuFree": 5, + "totalGpuFree": 24, + "maxStorageGibFree": 42420, + "totalStorageGibFree": 42420, + } + ] def test_get_offers_with_query_filter(): cudo = CudoProvider() offers = cudo.get(QueryFilter(min_gpu_count=1, max_gpu_count=1), balance_resources=True) - print(f'{len(offers)} offers found') - assert len(offers) >= 1, f'No offers found' + print(f"{len(offers)} offers found") + assert len(offers) >= 1, "No offers found" def test_get_offers_no_query_filter(): cudo = CudoProvider() offers = cudo.get(balance_resources=True) - print(f'{len(offers)} offers found') - assert len(offers) >= 1, f'No offers found' + print(f"{len(offers)} offers found") + assert len(offers) >= 1, "No offers found" def test_optimize_offers(machine_types): @@ -85,24 +76,34 @@ def test_optimize_offers(machine_types): min_cpus_for_memory = machine_type["minVcpuPerMemoryGib"] * config["memory"] max_cpus_for_memory = machine_type["maxVcpuPerMemoryGib"] * config["memory"] min_cpus_for_gpu = machine_type["minVcpuPerGpu"] * config["gpu"] - assert config["cpu"] >= min_cpus_for_memory, \ - f"VM config does not meet the minimum CPU:Memory requirement. Required minimum CPUs: " \ + assert config["cpu"] >= min_cpus_for_memory, ( + f"VM config does not meet the minimum CPU:Memory requirement. Required minimum CPUs: " f"{min_cpus_for_memory}, Found: {config['cpu']}" - assert config["cpu"] <= max_cpus_for_memory, \ - f"VM config exceeds the maximum CPU:Memory allowance. Allowed maximum CPUs: " \ + ) + assert config["cpu"] <= max_cpus_for_memory, ( + f"VM config exceeds the maximum CPU:Memory allowance. Allowed maximum CPUs: " f"{max_cpus_for_memory}, Found: {config['cpu']}" - assert config["cpu"] >= min_cpus_for_gpu, \ - f"VM config does not meet the minimum CPU:GPU requirement. " \ + ) + assert config["cpu"] >= min_cpus_for_gpu, ( + f"VM config does not meet the minimum CPU:GPU requirement. " f"Required minimum CPUs: {min_cpus_for_gpu}, Found: {config['cpu']}" + ) # Perform the balance resource checks if balance_resource is True if balance_resource: - expected_memory = get_balanced_memory(config['gpu'], gpu_memory, max_memory) - expected_disk_size = get_balanced_disk_size(available_disk, config['memory'], config["gpu"] * gpu_memory, - max_disk_size, min_disk_size) + expected_memory = get_balanced_memory(config["gpu"], gpu_memory, max_memory) + expected_disk_size = get_balanced_disk_size( + available_disk, + config["memory"], + config["gpu"] * gpu_memory, + max_disk_size, + min_disk_size, + ) - assert config['memory'] == expected_memory, \ - f"Memory allocation does not match the expected balanced memory. " \ + assert config["memory"] == expected_memory, ( + f"Memory allocation does not match the expected balanced memory. " f"Expected: {expected_memory}, Found: {config['memory']} in config {config}" - assert config['disk_size'] == expected_disk_size, \ - f"Disk size allocation does not match the expected balanced disk size. " \ + ) + assert config["disk_size"] == expected_disk_size, ( + f"Disk size allocation does not match the expected balanced disk size. " f"Expected: {expected_disk_size}, Found: {config['disk_size']}" + ) From 21b7155d721b9548ff5368fe0898434417babda0 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 13 Mar 2024 12:32:31 +0545 Subject: [PATCH 5/9] Fix no module for Cudo --- src/tests/providers/test_cudo.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tests/providers/test_cudo.py b/src/tests/providers/test_cudo.py index 852079e..20ca077 100644 --- a/src/tests/providers/test_cudo.py +++ b/src/tests/providers/test_cudo.py @@ -2,7 +2,8 @@ import pytest -from src.gpuhunt.providers.cudo import ( +from gpuhunt import QueryFilter +from gpuhunt.providers.cudo import ( CudoProvider, get_balanced_disk_size, get_balanced_memory, @@ -11,8 +12,6 @@ optimize_offers_with_gpu, ) -from gpuhunt import QueryFilter - @pytest.fixture def machine_types() -> List[dict]: From 361885e921f000d8e1315a7eca1a7ed24aeed1da Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 13 Mar 2024 13:07:14 +0545 Subject: [PATCH 6/9] Fix Type Error in Cudo --- src/gpuhunt/providers/cudo.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py index b891aa5..402461e 100644 --- a/src/gpuhunt/providers/cudo.py +++ b/src/gpuhunt/providers/cudo.py @@ -49,7 +49,7 @@ def fetch_offers( return list(chain.from_iterable(offers)) @staticmethod - def list_vm_machine_types() -> list[dict]: + def list_vm_machine_types(): resp = requests.request( method="GET", url=f"{API_URL}/vms/machine-types-2", @@ -60,9 +60,7 @@ def list_vm_machine_types() -> list[dict]: resp.raise_for_status() @staticmethod - def optimize_offers( - machine_types: list[dict], q: QueryFilter, balance_resource - ) -> List[RawCatalogItem]: + def optimize_offers(machine_types, q: QueryFilter, balance_resource) -> List[RawCatalogItem]: offers = [] if any( condition is not None @@ -136,7 +134,7 @@ def get_raw_catalog(machine_type, spec): return raw -def optimize_offers_with_gpu(q: QueryFilter, machine_type, balance_resources) -> List[dict]: +def optimize_offers_with_gpu(q: QueryFilter, machine_type, balance_resources): # Generate ranges for CPU, GPU, and memory based on the specified minimums, maximums, and available resources cpu_range = get_cpu_range(q.min_cpu, q.max_cpu, machine_type["maxVcpuFree"]) gpu_range = get_gpu_range(q.min_gpu_count, q.max_gpu_count, machine_type["maxGpuFree"]) @@ -200,7 +198,7 @@ def optimize_offers_with_gpu(q: QueryFilter, machine_type, balance_resources) -> return unbalanced_specs -def optimize_offers_no_gpu(q: QueryFilter, machine_type, balance_resource) -> List[dict]: +def optimize_offers_no_gpu(q: QueryFilter, machine_type, balance_resource): # Generate ranges for CPU, memory based on the specified minimums, maximums, and available resources cpu_range = get_cpu_range(q.min_cpu, q.max_cpu, machine_type["maxVcpuFree"]) memory_range = get_memory_range(q.min_memory, q.max_memory, machine_type["maxMemoryGibFree"]) From da44332169488e7258a3ab7567845e535a250209 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 13 Mar 2024 18:37:28 +0545 Subject: [PATCH 7/9] Remove unnecessary exceptions for cudo --- src/gpuhunt/providers/cudo.py | 26 ++++++++++---------------- src/tests/providers/test_cudo.py | 23 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py index 402461e..9b38c14 100644 --- a/src/gpuhunt/providers/cudo.py +++ b/src/gpuhunt/providers/cudo.py @@ -77,8 +77,14 @@ def optimize_offers(machine_types, q: QueryFilter, balance_resource) -> List[Raw # filter offers with gpus gpu_machine_types = [vm for vm in machine_types if vm["maxGpuFree"] != 0] for machine_type in gpu_machine_types: - machine_type["gpu_name"] = gpu_name(machine_type["gpuModel"]) - machine_type["gpu_memory"] = get_memory(machine_type["gpu_name"]) + gpu_model_name = gpu_name(machine_type["gpuModel"]) + if gpu_model_name is None: + continue + gpu_memory_size = get_memory(gpu_model_name) + if gpu_memory_size is None: + continue + machine_type["gpu_name"] = gpu_model_name + machine_type["gpu_memory"] = gpu_memory_size if not is_between( machine_type["gpu_memory"], q.min_gpu_memory, q.max_total_gpu_memory ): @@ -103,17 +109,6 @@ def optimize_offers(machine_types, q: QueryFilter, balance_resource) -> List[Raw return list(chain.from_iterable(offers)) -class VMTypeFetchError(Exception): - def __init__(self, message, vcpu, memory_gib, gpu): - super().__init__(message) - self.vcpu = vcpu - self.memory_gib = memory_gib - self.gpu = gpu - - def __str__(self): - return f"{super().__str__()} - [vCPU: {self.vcpu}, Memory: {self.memory_gib} GiB, GPU: {self.gpu}]" - - def get_raw_catalog(machine_type, spec): raw = RawCatalogItem( instance_name=machine_type["machineType"], @@ -305,16 +300,15 @@ def gpu_name(name: str) -> Optional[str]: if not name: return None result = GPU_MAP.get(name) - if result is None: - raise Exception("There is no '%s' in GPU_MAP", name) return result def get_memory(gpu_name: str) -> Optional[int]: + if not gpu_name: + return None for gpu in KNOWN_GPUS: if gpu.name.lower() == gpu_name.lower(): return gpu.memory - raise Exception("There is no '%s' in KNOWN_GPUS", gpu_name) def round_up(value: Optional[Union[int, float]], step: int) -> Optional[int]: diff --git a/src/tests/providers/test_cudo.py b/src/tests/providers/test_cudo.py index 20ca077..fa853cd 100644 --- a/src/tests/providers/test_cudo.py +++ b/src/tests/providers/test_cudo.py @@ -50,6 +50,29 @@ def test_get_offers_with_query_filter(): assert len(offers) >= 1, "No offers found" +def test_get_offers_for_gpu_name(): + cudo = CudoProvider() + offers = cudo.get(QueryFilter(min_gpu_count=1, gpu_name=["A4000"]), balance_resources=True) + print(f"{len(offers)} offers found") + assert len(offers) >= 1, "No offers found" + + +def test_get_offers_for_gpu_memory(): + cudo = CudoProvider() + offers = cudo.get(QueryFilter(min_gpu_count=1, min_gpu_memory=16), balance_resources=True) + print(f"{len(offers)} offers found") + assert len(offers) >= 1, "No offers found" + + +def test_get_offers_for_compute_capability(): + cudo = CudoProvider() + offers = cudo.get( + QueryFilter(min_gpu_count=1, min_compute_capability=(8, 6)), balance_resources=True + ) + print(f"{len(offers)} offers found") + assert len(offers) >= 1, "No offers found" + + def test_get_offers_no_query_filter(): cudo = CudoProvider() offers = cudo.get(balance_resources=True) From 1b2073f836ef6d7dd58612271e19ff15331ff6c8 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 13 Mar 2024 21:23:01 +0545 Subject: [PATCH 8/9] Improve Cudo Tests --- src/gpuhunt/providers/cudo.py | 2 +- src/tests/providers/test_cudo.py | 153 ++++++++++++++++++++----------- 2 files changed, 102 insertions(+), 53 deletions(-) diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py index 9b38c14..196ccbe 100644 --- a/src/gpuhunt/providers/cudo.py +++ b/src/gpuhunt/providers/cudo.py @@ -100,7 +100,7 @@ def optimize_offers(machine_types, q: QueryFilter, balance_resource) -> List[Raw raw_catalogs = [get_raw_catalog(machine_type, spec) for spec in optimized_specs] offers.append(raw_catalogs) else: - cpu_only_machine_types = [vm for vm in machine_types if vm["maxGpuFree"] == 0] + cpu_only_machine_types = [vm for vm in machine_types if vm["maxVcpuFree"] != 0] for machine_type in cpu_only_machine_types: optimized_specs = optimize_offers_no_gpu(q, machine_type, balance_resource) raw_catalogs = [get_raw_catalog(machine_type, spec) for spec in optimized_specs] diff --git a/src/tests/providers/test_cudo.py b/src/tests/providers/test_cudo.py index fa853cd..8a9c5d6 100644 --- a/src/tests/providers/test_cudo.py +++ b/src/tests/providers/test_cudo.py @@ -2,14 +2,14 @@ import pytest -from gpuhunt import QueryFilter +import gpuhunt._internal.catalog as internal_catalog +from gpuhunt import Catalog from gpuhunt.providers.cudo import ( CudoProvider, get_balanced_disk_size, get_balanced_memory, get_memory, gpu_name, - optimize_offers_with_gpu, ) @@ -39,93 +39,142 @@ def machine_types() -> List[dict]: "totalGpuFree": 24, "maxStorageGibFree": 42420, "totalStorageGibFree": 42420, - } + }, + { + "dataCenterId": "no-luster-1", + "machineType": "epyc-rome-rtx-a5000", + "cpuModel": "EPYC-Rome", + "gpuModel": "RTX A5000", + "gpuModelId": "nvidia-rtx-a5000", + "minVcpuPerMemoryGib": 0.259109, + "maxVcpuPerMemoryGib": 1.036437, + "minVcpuPerGpu": 1, + "maxVcpuPerGpu": 16, + "vcpuPriceHr": {"value": "0.002100"}, + "memoryGibPriceHr": {"value": "0.003400"}, + "gpuPriceHr": {"value": "0.520000"}, + "minStorageGibPriceHr": {"value": "0.000107"}, + "ipv4PriceHr": {"value": "0.003500"}, + "renewableEnergy": False, + "maxVcpuFree": 116, + "totalVcpuFree": 208, + "maxMemoryGibFree": 219, + "totalMemoryGibFree": 390, + "maxGpuFree": 4, + "totalGpuFree": 7, + "maxStorageGibFree": 1170, + "totalStorageGibFree": 1170, + }, ] -def test_get_offers_with_query_filter(): +def test_get_offers_with_query_filter(mocker, machine_types): + catalog = Catalog(balance_resources=False, auto_reload=False) cudo = CudoProvider() - offers = cudo.get(QueryFilter(min_gpu_count=1, max_gpu_count=1), balance_resources=True) - print(f"{len(offers)} offers found") - assert len(offers) >= 1, "No offers found" + cudo.list_vm_machine_types = mocker.Mock(return_value=machine_types) + internal_catalog.ONLINE_PROVIDERS = ["cudo"] + internal_catalog.OFFLINE_PROVIDERS = [] + catalog.add_provider(cudo) + query_result = catalog.query(provider=["cudo"], min_gpu_count=1, max_gpu_count=1) + assert len(query_result) >= 1, "No offers found" -def test_get_offers_for_gpu_name(): +def test_get_offers_for_gpu_name(mocker, machine_types): + catalog = Catalog(balance_resources=True, auto_reload=False) cudo = CudoProvider() - offers = cudo.get(QueryFilter(min_gpu_count=1, gpu_name=["A4000"]), balance_resources=True) - print(f"{len(offers)} offers found") - assert len(offers) >= 1, "No offers found" + cudo.list_vm_machine_types = mocker.Mock(return_value=machine_types) + internal_catalog.ONLINE_PROVIDERS = ["cudo"] + internal_catalog.OFFLINE_PROVIDERS = [] + catalog.add_provider(cudo) + query_result = catalog.query(provider=["cudo"], min_gpu_count=1, gpu_name=["A5000"]) + assert len(query_result) >= 1, "No offers found" -def test_get_offers_for_gpu_memory(): +def test_get_offers_for_gpu_memory(mocker, machine_types): + catalog = Catalog(balance_resources=True, auto_reload=False) cudo = CudoProvider() - offers = cudo.get(QueryFilter(min_gpu_count=1, min_gpu_memory=16), balance_resources=True) - print(f"{len(offers)} offers found") - assert len(offers) >= 1, "No offers found" + cudo.list_vm_machine_types = mocker.Mock(return_value=machine_types) + internal_catalog.ONLINE_PROVIDERS = ["cudo"] + internal_catalog.OFFLINE_PROVIDERS = [] + catalog.add_provider(cudo) + query_result = catalog.query(provider=["cudo"], min_gpu_count=1, min_gpu_memory=16) + assert len(query_result) >= 1, "No offers found" -def test_get_offers_for_compute_capability(): +def test_get_offers_for_compute_capability(mocker, machine_types): + catalog = Catalog(balance_resources=True, auto_reload=False) cudo = CudoProvider() - offers = cudo.get( - QueryFilter(min_gpu_count=1, min_compute_capability=(8, 6)), balance_resources=True - ) - print(f"{len(offers)} offers found") - assert len(offers) >= 1, "No offers found" + cudo.list_vm_machine_types = mocker.Mock(return_value=machine_types) + internal_catalog.ONLINE_PROVIDERS = ["cudo"] + internal_catalog.OFFLINE_PROVIDERS = [] + catalog.add_provider(cudo) + query_result = catalog.query(provider=["cudo"], min_gpu_count=1, min_compute_capability=(8, 6)) + assert len(query_result) >= 1, "No offers found" -def test_get_offers_no_query_filter(): +def test_get_offers_no_query_filter(mocker, machine_types): + catalog = Catalog(balance_resources=True, auto_reload=False) cudo = CudoProvider() - offers = cudo.get(balance_resources=True) - print(f"{len(offers)} offers found") - assert len(offers) >= 1, "No offers found" + cudo.list_vm_machine_types = mocker.Mock(return_value=machine_types) + internal_catalog.ONLINE_PROVIDERS = ["cudo"] + internal_catalog.OFFLINE_PROVIDERS = [] + catalog.add_provider(cudo) + query_result = catalog.query(provider=["cudo"]) + assert len(query_result) >= 1, "No offers found" -def test_optimize_offers(machine_types): +def test_optimize_offers_2(mocker, machine_types): + catalog = Catalog(balance_resources=True, auto_reload=False) + cudo = CudoProvider() + cudo.list_vm_machine_types = mocker.Mock(return_value=machine_types[0:1]) + internal_catalog.ONLINE_PROVIDERS = ["cudo"] + internal_catalog.OFFLINE_PROVIDERS = [] + catalog.add_provider(cudo) + query_result = catalog.query( + provider=["cudo"], min_cpu=2, min_gpu_count=1, max_gpu_count=1, min_memory=8 + ) machine_type = machine_types[0] - machine_type["gpu_memory"] = get_memory(gpu_name(machine_type["gpuModel"])) - q = QueryFilter(min_cpu=2, min_gpu_count=1, max_gpu_count=1, min_memory=8) balance_resource = True available_disk = machine_type["maxStorageGibFree"] gpu_memory = get_memory(gpu_name(machine_type["gpuModel"])) - max_memory = q.max_memory - max_disk_size = q.max_disk_size - min_disk_size = q.min_disk_size - vm_configs = optimize_offers_with_gpu(q, machine_type, balance_resources=balance_resource) - - assert len(vm_configs) >= 1 - - for config in vm_configs: - min_cpus_for_memory = machine_type["minVcpuPerMemoryGib"] * config["memory"] - max_cpus_for_memory = machine_type["maxVcpuPerMemoryGib"] * config["memory"] - min_cpus_for_gpu = machine_type["minVcpuPerGpu"] * config["gpu"] - assert config["cpu"] >= min_cpus_for_memory, ( + max_memory = None + max_disk_size = None + min_disk_size = None + + assert len(query_result) >= 1 + + for config in query_result: + min_cpus_for_memory = machine_type["minVcpuPerMemoryGib"] * config.cpu + max_cpus_for_memory = machine_type["maxVcpuPerMemoryGib"] * config.memory + min_cpus_for_gpu = machine_type["minVcpuPerGpu"] * config.gpu_count + assert config.cpu >= min_cpus_for_memory, ( f"VM config does not meet the minimum CPU:Memory requirement. Required minimum CPUs: " - f"{min_cpus_for_memory}, Found: {config['cpu']}" + f"{min_cpus_for_memory}, Found: {config.cpu}" ) - assert config["cpu"] <= max_cpus_for_memory, ( + assert config.cpu <= max_cpus_for_memory, ( f"VM config exceeds the maximum CPU:Memory allowance. Allowed maximum CPUs: " - f"{max_cpus_for_memory}, Found: {config['cpu']}" + f"{max_cpus_for_memory}, Found: {config.cpu}" ) - assert config["cpu"] >= min_cpus_for_gpu, ( + assert config.cpu >= min_cpus_for_gpu, ( f"VM config does not meet the minimum CPU:GPU requirement. " - f"Required minimum CPUs: {min_cpus_for_gpu}, Found: {config['cpu']}" + f"Required minimum CPUs: {min_cpus_for_gpu}, Found: {config.cpu}" ) # Perform the balance resource checks if balance_resource is True if balance_resource: - expected_memory = get_balanced_memory(config["gpu"], gpu_memory, max_memory) + expected_memory = get_balanced_memory(config.gpu_count, gpu_memory, max_memory) expected_disk_size = get_balanced_disk_size( available_disk, - config["memory"], - config["gpu"] * gpu_memory, + config.memory, + config.gpu_count * gpu_memory, max_disk_size, min_disk_size, ) - assert config["memory"] == expected_memory, ( + assert config.memory == expected_memory, ( f"Memory allocation does not match the expected balanced memory. " - f"Expected: {expected_memory}, Found: {config['memory']} in config {config}" + f"Expected: {expected_memory}, Found: {config.memory}" ) - assert config["disk_size"] == expected_disk_size, ( + assert config.disk_size == expected_disk_size, ( f"Disk size allocation does not match the expected balanced disk size. " - f"Expected: {expected_disk_size}, Found: {config['disk_size']}" + f"Expected: {expected_disk_size}, Found: {config.disk_size}" ) From 33737efb39053ea89bd21bbac0a176997ef78177 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 13 Mar 2024 22:12:18 +0545 Subject: [PATCH 9/9] Cast round_down return type to int --- src/gpuhunt/providers/cudo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py index 196ccbe..1d099fa 100644 --- a/src/gpuhunt/providers/cudo.py +++ b/src/gpuhunt/providers/cudo.py @@ -320,7 +320,7 @@ def round_up(value: Optional[Union[int, float]], step: int) -> Optional[int]: def round_down(value: Optional[Union[int, float]], step: int) -> Optional[int]: if value is None: return None - return value // step * step + return int(value // step * step) T = TypeVar("T", bound=Union[int, float])