Improve VastAI provider #6

Merged 2 commits on Nov 2, 2023
3 changes: 2 additions & 1 deletion src/gpuhunt/_internal/catalog.py
@@ -17,7 +17,7 @@
version_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/version"
catalog_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/{version}/catalog.zip"
OFFLINE_PROVIDERS = ["aws", "azure", "gcp", "lambdalabs"]
ONLINE_PROVIDERS = ["tensordock"]
ONLINE_PROVIDERS = ["tensordock", "vastai"]
RELOAD_INTERVAL = 4 * 60 * 60 # 4 hours


@@ -112,6 +112,7 @@ def query(
)
if self.fill_missing:
query_filter = constraints.fill_missing(query_filter)
logger.debug("Effective query filter: %s", query_filter)
# validate providers
for p in query_filter.provider or []:
if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS:
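With "vastai" added to ONLINE_PROVIDERS, the validation loop above now accepts it. Below is a minimal sketch of that pattern; the exception raised in the lines elided from this hunk is an assumption, not taken from the diff.

OFFLINE_PROVIDERS = ["aws", "azure", "gcp", "lambdalabs"]
ONLINE_PROVIDERS = ["tensordock", "vastai"]

def validate_providers(providers):
    # mirrors the loop in Catalog.query(); raising ValueError here is assumed
    for p in providers or []:
        if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS:
            raise ValueError(f"Unknown provider: {p}")

validate_providers(["vastai"])  # accepted now that vastai is an online provider
# validate_providers(["foo"])   # would raise ValueError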
24 changes: 9 additions & 15 deletions src/gpuhunt/_internal/constraints.py
@@ -4,7 +4,7 @@
from gpuhunt._internal.models import CatalogItem, GPUInfo, QueryFilter


def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter:
def fill_missing(q: QueryFilter, *, memory_per_core: int = 6) -> QueryFilter:
q = copy.deepcopy(q)

# if there is some information about gpu
@@ -26,21 +26,15 @@ def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter:
min_gpu_memory = []
if q.min_gpu_memory is not None:
min_gpu_memory.append(q.min_gpu_memory)
if q.min_compute_capability is not None:
min_gpu_memory.extend(
[
i.memory
for i in KNOWN_GPUS
if i.compute_capability >= q.min_compute_capability
]
)
if q.gpu_name is not None:
min_gpu_memory.extend(
[i.memory for i in KNOWN_GPUS if i.name.lower() in q.gpu_name]
)
min_total_gpu_memory = (
min(min_gpu_memory, default=min(i.memory for i in KNOWN_GPUS)) * min_gpu_count
gpus = KNOWN_GPUS
if q.min_compute_capability is not None: # filter gpus by compute capability
gpus = [i for i in gpus if i.compute_capability >= q.min_compute_capability]
if q.gpu_name is not None: # filter gpus by name
gpus = [i for i in gpus if i.name.lower() in q.gpu_name]
min_gpu_memory.append(
min((i.memory for i in gpus), default=min(i.memory for i in KNOWN_GPUS))
)
min_total_gpu_memory = max(min_gpu_memory) * min_gpu_count

if min_total_gpu_memory is not None:
if q.min_memory is None: # gpu memory to memory
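The rewritten block above first narrows KNOWN_GPUS by compute capability and then by name, takes the smallest memory among the remaining GPUs as a lower bound, and falls back to the smallest known GPU when nothing matches. A self-contained sketch of that logic, using made-up GPU entries in place of the real KNOWN_GPUS list:

from typing import List, NamedTuple, Optional, Tuple

class GPU(NamedTuple):
    name: str
    memory: int  # GiB
    compute_capability: Tuple[int, int]

KNOWN_GPUS = [GPU("t4", 16, (7, 5)), GPU("a10", 24, (8, 6)), GPU("a100", 80, (8, 0))]

def min_gpu_memory_bound(
    gpu_name: Optional[List[str]], min_cc: Optional[Tuple[int, int]]
) -> int:
    gpus = KNOWN_GPUS
    if min_cc is not None:  # filter gpus by compute capability
        gpus = [i for i in gpus if i.compute_capability >= min_cc]
    if gpu_name is not None:  # filter gpus by name
        gpus = [i for i in gpus if i.name in gpu_name]
    # fall back to the smallest known GPU if the filters match nothing
    return min((i.memory for i in gpus), default=min(i.memory for i in KNOWN_GPUS))

assert min_gpu_memory_bound(["a100"], None) == 80
assert min_gpu_memory_bound(None, (8, 0)) == 24  # a10 is the smallest SM 8.x GPU here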
5 changes: 4 additions & 1 deletion src/gpuhunt/_internal/default.py
@@ -18,7 +18,10 @@ def default_catalog() -> Catalog:
"""
catalog = Catalog()
catalog.load()
for module, provider in [("gpuhunt.providers.tensordock", "TensorDockProvider")]:
for module, provider in [
("gpuhunt.providers.tensordock", "TensorDockProvider"),
("gpuhunt.providers.vastai", "VastAIProvider"),
]:
try:
module = importlib.import_module(module)
provider = getattr(module, provider)()
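default_catalog() now tries to instantiate VastAIProvider alongside TensorDockProvider. A hedged sketch of the dynamic-import pattern above; the helper name and the except clause are assumptions, since the hunk cuts off right after try:.

import importlib
import logging

logger = logging.getLogger(__name__)

def load_online_providers() -> list:
    providers = []
    for module_name, class_name in [
        ("gpuhunt.providers.tensordock", "TensorDockProvider"),
        ("gpuhunt.providers.vastai", "VastAIProvider"),
    ]:
        try:
            module = importlib.import_module(module_name)
            providers.append(getattr(module, class_name)())
        except ImportError:
            # assumed: skip providers whose optional dependencies are not installed
            logger.warning("Skipping %s: missing optional dependencies", class_name)
    return providers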
18 changes: 17 additions & 1 deletion src/gpuhunt/_internal/models.py
@@ -1,4 +1,4 @@
from dataclasses import asdict, dataclass
from dataclasses import asdict, dataclass, fields
from typing import Dict, List, Optional, Tuple, Union

from gpuhunt._internal.utils import empty_as_none
@@ -117,6 +117,22 @@ def __post_init__(self):
if self.gpu_name is not None:
self.gpu_name = [i.lower() for i in self.gpu_name]

def __repr__(self) -> str:
"""
>>> QueryFilter()
QueryFilter()
>>> QueryFilter(min_cpu=4)
QueryFilter(min_cpu=4)
>>> QueryFilter(max_price=1.2, min_cpu=4)
QueryFilter(min_cpu=4, max_price=1.2)
"""
kv = ", ".join(
f"{f.name}={value}"
for f in fields(self)
if (value := getattr(self, f.name)) is not None
)
return f"QueryFilter({kv})"


@dataclass
class GPUInfo:
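The compact __repr__ above pairs with the new logger.debug call in catalog.py: only the fields a caller actually set appear in the log, instead of a long run of =None pairs. A hedged illustration (not part of the diff; constructor keywords taken from the doctest above):

import logging

from gpuhunt._internal.models import QueryFilter

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("gpuhunt")

qf = QueryFilter(min_cpu=4, max_price=1.2)
logger.debug("Effective query filter: %s", qf)
# expected output (field order follows the dataclass definition):
# DEBUG:gpuhunt:Effective query filter: QueryFilter(min_cpu=4, max_price=1.2)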
76 changes: 68 additions & 8 deletions src/gpuhunt/providers/vastai.py
@@ -1,31 +1,46 @@
import copy
from typing import List, Optional
import logging
from collections import defaultdict
from typing import List, Optional, Tuple

import requests

from gpuhunt._internal.models import QueryFilter, RawCatalogItem
from gpuhunt.providers import AbstractProvider

bundles_url = "https://console.vast.ai/api/v0/bundles"
logger = logging.getLogger(__name__)
bundles_url = "https://console.vast.ai/api/v0/bundles/"
kilo = 1000


class VastAIProvider(AbstractProvider):
NAME = "vastai"

def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem]:
data = requests.get(bundles_url).json()
filters = self.make_filters(query_filter or QueryFilter())
filters["rentable"]["eq"] = True
filters["direct_port_count"]["gte"] = 1 # publicly accessible
filters["reliability2"]["gte"] = 0.9
resp = requests.post(bundles_url, json=filters)
resp.raise_for_status()
data = resp.json()

instance_offers = []
for offer in data["offers"]:
gpu_name = get_gpu_name(offer["gpu_name"])
ondemand_offer = RawCatalogItem(
instance_name=f"{offer['host_id']}",
instance_name=str(offer["id"]),
location=get_location(offer["geolocation"]),
price=round(offer["dph_total"], 5),
cpu=offer["cpu_cores"],
memory=round(offer["cpu_ram"] / 1024),
price=round(offer["dph_total"], 5), # TODO(egor-s) add disk price
cpu=int(offer["cpu_cores_effective"]),
memory=float(
int(
offer["cpu_ram"] * offer["cpu_cores_effective"] / offer["cpu_cores"] / kilo
)
),
gpu_count=offer["num_gpus"],
gpu_name=gpu_name,
gpu_memory=round(offer["gpu_ram"] / 1024),
gpu_memory=float(int(offer["gpu_ram"] / kilo)),
spot=False,
)
instance_offers.append(ondemand_offer)
@@ -36,6 +51,40 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem]:
instance_offers.append(spot_offer)
return instance_offers

@staticmethod
def make_filters(q: QueryFilter) -> dict:
filters = defaultdict(dict)
if q.min_cpu is not None:
filters["cpu_cores"]["gte"] = q.min_cpu
if q.max_cpu is not None:
filters["cpu_cores"]["lte"] = q.max_cpu
if q.min_memory is not None:
filters["cpu_ram"]["gte"] = q.min_memory * kilo
if q.max_memory is not None:
filters["cpu_ram"]["lte"] = q.max_memory * kilo
if q.min_gpu_count is not None:
filters["num_gpus"]["gte"] = q.min_gpu_count
if q.max_gpu_count is not None:
filters["num_gpus"]["lte"] = q.max_gpu_count
if q.min_gpu_memory is not None:
filters["gpu_ram"]["gte"] = q.min_gpu_memory * kilo
if q.max_gpu_memory is not None:
filters["gpu_ram"]["lte"] = q.max_gpu_memory * kilo
if q.min_disk_size is not None:
filters["disk_space"]["gte"] = q.min_disk_size
if q.max_disk_size is not None:
filters["disk_space"]["lte"] = q.max_disk_size
if q.min_price is not None:
filters["dph_total"]["gte"] = q.min_price
if q.max_price is not None:
filters["dph_total"]["lte"] = q.max_price
# TODO(egor-s): add compute capability info for all GPUs
if q.min_compute_capability is not None:
filters["compute_capability"]["gte"] = compute_cap(q.min_compute_capability)
if q.max_compute_capability is not None:
filters["compute_capability"]["lte"] = compute_cap(q.max_compute_capability)
return filters


def get_gpu_name(gpu_name: str) -> str:
gpu_name = gpu_name.replace("RTX A", "A").replace("Tesla ", "").replace("Q ", "")
@@ -53,3 +102,14 @@ def get_location(location: Optional[str]) -> str:
except ValueError:
pass
return location.lower().replace(" ", "")


def compute_cap(cc: Tuple[int, int]) -> str:
"""
>>> compute_cap((7, 0))
'700'
>>> compute_cap((7, 5))
'750'
"""
major, minor = cc
return f"{major}{str(minor).ljust(2, '0')}"
11 changes: 11 additions & 0 deletions src/tests/_internal/test_constraints.py
@@ -149,3 +149,14 @@ def test_from_compute_capability(self):
min_disk_size=110,
min_cpu=40,
)

def test_from_gpu_name_and_gpu_memory(self):
assert fill_missing(
QueryFilter(gpu_name=["A100"], min_gpu_memory=80), memory_per_core=4
) == QueryFilter(
gpu_name=["A100"],
min_gpu_memory=80,
min_memory=160,
min_disk_size=110,
min_cpu=40,
)