diff --git a/src/gpuhunt/_internal/catalog.py b/src/gpuhunt/_internal/catalog.py index 917f643..a8821a2 100644 --- a/src/gpuhunt/_internal/catalog.py +++ b/src/gpuhunt/_internal/catalog.py @@ -17,7 +17,7 @@ version_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/version" catalog_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/{version}/catalog.zip" OFFLINE_PROVIDERS = ["aws", "azure", "gcp", "lambdalabs"] -ONLINE_PROVIDERS = ["tensordock"] +ONLINE_PROVIDERS = ["tensordock", "vastai"] RELOAD_INTERVAL = 4 * 60 * 60 # 4 hours @@ -112,6 +112,7 @@ def query( ) if self.fill_missing: query_filter = constraints.fill_missing(query_filter) + logger.debug("Effective query filter: %s", query_filter) # validate providers for p in query_filter.provider or []: if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS: diff --git a/src/gpuhunt/_internal/constraints.py b/src/gpuhunt/_internal/constraints.py index 26e2143..a40cfd5 100644 --- a/src/gpuhunt/_internal/constraints.py +++ b/src/gpuhunt/_internal/constraints.py @@ -4,7 +4,7 @@ from gpuhunt._internal.models import CatalogItem, GPUInfo, QueryFilter -def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter: +def fill_missing(q: QueryFilter, *, memory_per_core: int = 6) -> QueryFilter: q = copy.deepcopy(q) # if there is some information about gpu @@ -26,21 +26,15 @@ def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter: min_gpu_memory = [] if q.min_gpu_memory is not None: min_gpu_memory.append(q.min_gpu_memory) - if q.min_compute_capability is not None: - min_gpu_memory.extend( - [ - i.memory - for i in KNOWN_GPUS - if i.compute_capability >= q.min_compute_capability - ] - ) - if q.gpu_name is not None: - min_gpu_memory.extend( - [i.memory for i in KNOWN_GPUS if i.name.lower() in q.gpu_name] - ) - min_total_gpu_memory = ( - min(min_gpu_memory, default=min(i.memory for i in KNOWN_GPUS)) * min_gpu_count + gpus = KNOWN_GPUS + if q.min_compute_capability is not None: # filter gpus by compute capability + gpus = [i for i in gpus if i.compute_capability >= q.min_compute_capability] + if q.gpu_name is not None: # filter gpus by name + gpus = [i for i in gpus if i.name.lower() in q.gpu_name] + min_gpu_memory.append( + min((i.memory for i in gpus), default=min(i.memory for i in KNOWN_GPUS)) ) + min_total_gpu_memory = max(min_gpu_memory) * min_gpu_count if min_total_gpu_memory is not None: if q.min_memory is None: # gpu memory to memory diff --git a/src/gpuhunt/_internal/default.py b/src/gpuhunt/_internal/default.py index 455606d..07e9b59 100644 --- a/src/gpuhunt/_internal/default.py +++ b/src/gpuhunt/_internal/default.py @@ -18,7 +18,10 @@ def default_catalog() -> Catalog: """ catalog = Catalog() catalog.load() - for module, provider in [("gpuhunt.providers.tensordock", "TensorDockProvider")]: + for module, provider in [ + ("gpuhunt.providers.tensordock", "TensorDockProvider"), + ("gpuhunt.providers.vastai", "VastAIProvider"), + ]: try: module = importlib.import_module(module) provider = getattr(module, provider)() diff --git a/src/gpuhunt/_internal/models.py b/src/gpuhunt/_internal/models.py index 6e5a46b..aa97720 100644 --- a/src/gpuhunt/_internal/models.py +++ b/src/gpuhunt/_internal/models.py @@ -1,4 +1,4 @@ -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, fields from typing import Dict, List, Optional, Tuple, Union from gpuhunt._internal.utils import empty_as_none @@ -117,6 +117,22 @@ def __post_init__(self): if self.gpu_name is not None: self.gpu_name = [i.lower() for i in self.gpu_name] + def __repr__(self) -> str: + """ + >>> QueryFilter() + QueryFilter() + >>> QueryFilter(min_cpu=4) + QueryFilter(min_cpu=4) + >>> QueryFilter(max_price=1.2, min_cpu=4) + QueryFilter(min_cpu=4, max_price=1.2) + """ + kv = ", ".join( + f"{f.name}={value}" + for f in fields(self) + if (value := getattr(self, f.name)) is not None + ) + return f"QueryFilter({kv})" + @dataclass class GPUInfo: diff --git a/src/gpuhunt/providers/vastai.py b/src/gpuhunt/providers/vastai.py index 6ef875f..82eae94 100644 --- a/src/gpuhunt/providers/vastai.py +++ b/src/gpuhunt/providers/vastai.py @@ -1,31 +1,46 @@ import copy -from typing import List, Optional +import logging +from collections import defaultdict +from typing import List, Optional, Tuple import requests from gpuhunt._internal.models import QueryFilter, RawCatalogItem from gpuhunt.providers import AbstractProvider -bundles_url = "https://console.vast.ai/api/v0/bundles" +logger = logging.getLogger(__name__) +bundles_url = "https://console.vast.ai/api/v0/bundles/" +kilo = 1000 class VastAIProvider(AbstractProvider): NAME = "vastai" def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem]: - data = requests.get(bundles_url).json() + filters = self.make_filters(query_filter or QueryFilter()) + filters["rentable"]["eq"] = True + filters["direct_port_count"]["gte"] = 1 # publicly accessible + filters["reliability2"]["gte"] = 0.9 + resp = requests.post(bundles_url, json=filters) + resp.raise_for_status() + data = resp.json() + instance_offers = [] for offer in data["offers"]: gpu_name = get_gpu_name(offer["gpu_name"]) ondemand_offer = RawCatalogItem( - instance_name=f"{offer['host_id']}", + instance_name=str(offer["id"]), location=get_location(offer["geolocation"]), - price=round(offer["dph_total"], 5), - cpu=offer["cpu_cores"], - memory=round(offer["cpu_ram"] / 1024), + price=round(offer["dph_total"], 5), # TODO(egor-s) add disk price + cpu=int(offer["cpu_cores_effective"]), + memory=float( + int( + offer["cpu_ram"] * offer["cpu_cores_effective"] / offer["cpu_cores"] / kilo + ) + ), gpu_count=offer["num_gpus"], gpu_name=gpu_name, - gpu_memory=round(offer["gpu_ram"] / 1024), + gpu_memory=float(int(offer["gpu_ram"] / kilo)), spot=False, ) instance_offers.append(ondemand_offer) @@ -36,6 +51,40 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem instance_offers.append(spot_offer) return instance_offers + @staticmethod + def make_filters(q: QueryFilter) -> dict: + filters = defaultdict(dict) + if q.min_cpu is not None: + filters["cpu_cores"]["gte"] = q.min_cpu + if q.max_cpu is not None: + filters["cpu_cores"]["lte"] = q.max_cpu + if q.min_memory is not None: + filters["cpu_ram"]["gte"] = q.min_memory * kilo + if q.max_memory is not None: + filters["cpu_ram"]["lte"] = q.max_memory * kilo + if q.min_gpu_count is not None: + filters["num_gpus"]["gte"] = q.min_gpu_count + if q.max_gpu_count is not None: + filters["num_gpus"]["lte"] = q.max_gpu_count + if q.min_gpu_memory is not None: + filters["gpu_ram"]["gte"] = q.min_gpu_memory * kilo + if q.max_gpu_memory is not None: + filters["gpu_ram"]["lte"] = q.max_gpu_memory * kilo + if q.min_disk_size is not None: + filters["disk_space"]["gte"] = q.min_disk_size + if q.max_disk_size is not None: + filters["disk_space"]["lte"] = q.max_disk_size + if q.min_price is not None: + filters["dph_total"]["gte"] = q.min_price + if q.max_price is not None: + filters["dph_total"]["lte"] = q.max_price + # TODO(egor-s): add compute capability info for all GPUs + if q.min_compute_capability is not None: + filters["compute_capability"]["gte"] = compute_cap(q.min_compute_capability) + if q.max_compute_capability is not None: + filters["compute_capability"]["lte"] = compute_cap(q.max_compute_capability) + return filters + def get_gpu_name(gpu_name: str) -> str: gpu_name = gpu_name.replace("RTX A", "A").replace("Tesla ", "").replace("Q ", "") @@ -53,3 +102,14 @@ def get_location(location: Optional[str]) -> str: except ValueError: pass return location.lower().replace(" ", "") + + +def compute_cap(cc: Tuple[int, int]) -> str: + """ + >>> compute_cap((7, 0)) + '700' + >>> compute_cap((7, 5)) + '750' + """ + major, minor = cc + return f"{major}{str(minor).ljust(2, '0')}" diff --git a/src/tests/_internal/test_constraints.py b/src/tests/_internal/test_constraints.py index c8cc858..8dddcca 100644 --- a/src/tests/_internal/test_constraints.py +++ b/src/tests/_internal/test_constraints.py @@ -149,3 +149,14 @@ def test_from_compute_capability(self): min_disk_size=110, min_cpu=40, ) + + def test_from_gpu_name_and_gpu_memory(self): + assert fill_missing( + QueryFilter(gpu_name=["A100"], min_gpu_memory=80), memory_per_core=4 + ) == QueryFilter( + gpu_name=["A100"], + min_gpu_memory=80, + min_memory=160, + min_disk_size=110, + min_cpu=40, + )