Skip to content

Commit

Permalink
Improve VastAI provider (#6)
Browse files Browse the repository at this point in the history
* Consider the highest min_gpu_memory

* Filter vastai offers on request
  • Loading branch information
Egor-S authored Nov 2, 2023
1 parent 0b103ae commit 917ea8b
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 26 deletions.
3 changes: 2 additions & 1 deletion src/gpuhunt/_internal/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
version_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/version"
catalog_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/{version}/catalog.zip"
OFFLINE_PROVIDERS = ["aws", "azure", "gcp", "lambdalabs"]
ONLINE_PROVIDERS = ["tensordock"]
ONLINE_PROVIDERS = ["tensordock", "vastai"]
RELOAD_INTERVAL = 4 * 60 * 60 # 4 hours


Expand Down Expand Up @@ -112,6 +112,7 @@ def query(
)
if self.fill_missing:
query_filter = constraints.fill_missing(query_filter)
logger.debug("Effective query filter: %s", query_filter)
# validate providers
for p in query_filter.provider or []:
if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS:
Expand Down
24 changes: 9 additions & 15 deletions src/gpuhunt/_internal/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from gpuhunt._internal.models import CatalogItem, GPUInfo, QueryFilter


def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter:
def fill_missing(q: QueryFilter, *, memory_per_core: int = 6) -> QueryFilter:
q = copy.deepcopy(q)

# if there is some information about gpu
Expand All @@ -26,21 +26,15 @@ def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter:
min_gpu_memory = []
if q.min_gpu_memory is not None:
min_gpu_memory.append(q.min_gpu_memory)
if q.min_compute_capability is not None:
min_gpu_memory.extend(
[
i.memory
for i in KNOWN_GPUS
if i.compute_capability >= q.min_compute_capability
]
)
if q.gpu_name is not None:
min_gpu_memory.extend(
[i.memory for i in KNOWN_GPUS if i.name.lower() in q.gpu_name]
)
min_total_gpu_memory = (
min(min_gpu_memory, default=min(i.memory for i in KNOWN_GPUS)) * min_gpu_count
gpus = KNOWN_GPUS
if q.min_compute_capability is not None: # filter gpus by compute capability
gpus = [i for i in gpus if i.compute_capability >= q.min_compute_capability]
if q.gpu_name is not None: # filter gpus by name
gpus = [i for i in gpus if i.name.lower() in q.gpu_name]
min_gpu_memory.append(
min((i.memory for i in gpus), default=min(i.memory for i in KNOWN_GPUS))
)
min_total_gpu_memory = max(min_gpu_memory) * min_gpu_count

if min_total_gpu_memory is not None:
if q.min_memory is None: # gpu memory to memory
Expand Down
5 changes: 4 additions & 1 deletion src/gpuhunt/_internal/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ def default_catalog() -> Catalog:
"""
catalog = Catalog()
catalog.load()
for module, provider in [("gpuhunt.providers.tensordock", "TensorDockProvider")]:
for module, provider in [
("gpuhunt.providers.tensordock", "TensorDockProvider"),
("gpuhunt.providers.vastai", "VastAIProvider"),
]:
try:
module = importlib.import_module(module)
provider = getattr(module, provider)()
Expand Down
18 changes: 17 additions & 1 deletion src/gpuhunt/_internal/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dataclasses import asdict, dataclass
from dataclasses import asdict, dataclass, fields
from typing import Dict, List, Optional, Tuple, Union

from gpuhunt._internal.utils import empty_as_none
Expand Down Expand Up @@ -117,6 +117,22 @@ def __post_init__(self):
if self.gpu_name is not None:
self.gpu_name = [i.lower() for i in self.gpu_name]

def __repr__(self) -> str:
    """Compact representation listing only the fields that are set.

    Fields whose value is ``None`` are omitted so the output stays short.

    >>> QueryFilter()
    QueryFilter()
    >>> QueryFilter(min_cpu=4)
    QueryFilter(min_cpu=4)
    >>> QueryFilter(max_price=1.2, min_cpu=4)
    QueryFilter(min_cpu=4, max_price=1.2)
    """
    parts = []
    for field in fields(self):
        value = getattr(self, field.name)
        if value is not None:
            parts.append(f"{field.name}={value}")
    return "QueryFilter({})".format(", ".join(parts))


@dataclass
class GPUInfo:
Expand Down
76 changes: 68 additions & 8 deletions src/gpuhunt/providers/vastai.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,46 @@
import copy
from typing import List, Optional
import logging
from collections import defaultdict
from typing import List, Optional, Tuple

import requests

from gpuhunt._internal.models import QueryFilter, RawCatalogItem
from gpuhunt.providers import AbstractProvider

bundles_url = "https://console.vast.ai/api/v0/bundles"
logger = logging.getLogger(__name__)
bundles_url = "https://console.vast.ai/api/v0/bundles/"
kilo = 1000


class VastAIProvider(AbstractProvider):
NAME = "vastai"

def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem]:
data = requests.get(bundles_url).json()
filters = self.make_filters(query_filter or QueryFilter())
filters["rentable"]["eq"] = True
filters["direct_port_count"]["gte"] = 1 # publicly accessible
filters["reliability2"]["gte"] = 0.9
resp = requests.post(bundles_url, json=filters)
resp.raise_for_status()
data = resp.json()

instance_offers = []
for offer in data["offers"]:
gpu_name = get_gpu_name(offer["gpu_name"])
ondemand_offer = RawCatalogItem(
instance_name=f"{offer['host_id']}",
instance_name=str(offer["id"]),
location=get_location(offer["geolocation"]),
price=round(offer["dph_total"], 5),
cpu=offer["cpu_cores"],
memory=round(offer["cpu_ram"] / 1024),
price=round(offer["dph_total"], 5), # TODO(egor-s) add disk price
cpu=int(offer["cpu_cores_effective"]),
memory=float(
int(
offer["cpu_ram"] * offer["cpu_cores_effective"] / offer["cpu_cores"] / kilo
)
),
gpu_count=offer["num_gpus"],
gpu_name=gpu_name,
gpu_memory=round(offer["gpu_ram"] / 1024),
gpu_memory=float(int(offer["gpu_ram"] / kilo)),
spot=False,
)
instance_offers.append(ondemand_offer)
Expand All @@ -36,6 +51,40 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem
instance_offers.append(spot_offer)
return instance_offers

@staticmethod
def make_filters(q: QueryFilter) -> dict:
    """Translate a QueryFilter into the Vast.ai bundles search filter dict.

    Each API field maps to a dict of comparison operators
    (``{"gte": ..., "lte": ...}``); only bounds that are actually set on
    ``q`` are emitted.
    """
    filters = defaultdict(dict)
    # (API field, lower bound, upper bound, multiplier to the API's units)
    bounds = [
        ("cpu_cores", q.min_cpu, q.max_cpu, 1),
        ("cpu_ram", q.min_memory, q.max_memory, kilo),  # GB -> MB
        ("num_gpus", q.min_gpu_count, q.max_gpu_count, 1),
        ("gpu_ram", q.min_gpu_memory, q.max_gpu_memory, kilo),  # GB -> MB
        ("disk_space", q.min_disk_size, q.max_disk_size, 1),
        ("dph_total", q.min_price, q.max_price, 1),
    ]
    for name, lower, upper, scale in bounds:
        if lower is not None:
            filters[name]["gte"] = lower * scale
        if upper is not None:
            filters[name]["lte"] = upper * scale
    # TODO(egor-s): add compute capability info for all GPUs
    if q.min_compute_capability is not None:
        filters["compute_capability"]["gte"] = compute_cap(q.min_compute_capability)
    if q.max_compute_capability is not None:
        filters["compute_capability"]["lte"] = compute_cap(q.max_compute_capability)
    return filters


def get_gpu_name(gpu_name: str) -> str:
gpu_name = gpu_name.replace("RTX A", "A").replace("Tesla ", "").replace("Q ", "")
Expand All @@ -53,3 +102,14 @@ def get_location(location: Optional[str]) -> str:
except ValueError:
pass
return location.lower().replace(" ", "")


def compute_cap(cc: Tuple[int, int]) -> str:
    """Encode a (major, minor) compute capability as the API's string form.

    The minor part is left-aligned and zero-padded to two characters,
    so (7, 5) becomes "750".

    >>> compute_cap((7, 0))
    '700'
    >>> compute_cap((7, 5))
    '750'
    """
    major, minor = cc
    return f"{major}{minor:0<2}"
11 changes: 11 additions & 0 deletions src/tests/_internal/test_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,14 @@ def test_from_compute_capability(self):
min_disk_size=110,
min_cpu=40,
)

def test_from_gpu_name_and_gpu_memory(self):
    # An explicit min_gpu_memory=80 must win over the smallest memory
    # known for the requested GPU name when deriving RAM/CPU defaults.
    actual = fill_missing(
        QueryFilter(gpu_name=["A100"], min_gpu_memory=80), memory_per_core=4
    )
    expected = QueryFilter(
        gpu_name=["A100"],
        min_gpu_memory=80,
        min_memory=160,
        min_disk_size=110,
        min_cpu=40,
    )
    assert actual == expected

0 comments on commit 917ea8b

Please sign in to comment.