Skip to content

Commit

Permalink
TensorDock fixes (#5)
Browse files Browse the repository at this point in the history
* Case-insensitive GPU name matching

* Auto-convert unknown GPU models for TensorDock

* Run doctest

* Cap TensorDock resources at 75% if not all GPUs are taken

* Use country as a location

* Install all dependencies for testing
  • Loading branch information
Egor-S authored Oct 31, 2023
1 parent 5852515 commit da41d23
Show file tree
Hide file tree
Showing 9 changed files with 134 additions and 76 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install .
pip install '.[all]'
pip install -r requirements_dev.txt
- name: Run doctest
run: |
pytest --doctest-modules src/gpuhunt
- name: Run pytest
run: |
pytest src/tests
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install .
pip install '.[all]'
pip install -r requirements_dev.txt
- name: Run doctest
run: |
pytest --doctest-modules src/gpuhunt
- name: Run pytest
run: |
pytest src/tests
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ gcp = [
"google-cloud-billing",
"google-cloud-compute"
]
all = ["gpuhunt[aws,azure,gcp]"]

[tool.setuptools.dynamic]
version = {attr = "gpuhunt.version.__version__"}
10 changes: 4 additions & 6 deletions src/gpuhunt/_internal/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def query(
max_memory: maximum amount of RAM in GB
min_gpu_count: minimum number of GPUs
max_gpu_count: maximum number of GPUs
gpu_name: case-sensitive name of the GPU to filter by. If not specified, all GPUs will be used
gpu_name: name of the GPU to filter by. If not specified, all GPUs will be used
min_gpu_memory: minimum amount of GPU VRAM in GB for each GPU
max_gpu_memory: maximum amount of GPU VRAM in GB for each GPU
min_total_gpu_memory: minimum amount of GPU VRAM in GB for all GPUs combined
Expand Down Expand Up @@ -113,11 +113,9 @@ def query(
if self.fill_missing:
query_filter = constraints.fill_missing(query_filter)
# validate providers
if query_filter.provider is not None:
query_filter.provider = [p.lower() for p in query_filter.provider]
for p in query_filter.provider:
if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS:
raise ValueError(f"Unknown provider: {p}")
for p in query_filter.provider or []:
if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS:
raise ValueError(f"Unknown provider: {p}")

# fetch providers
items = []
Expand Down
19 changes: 14 additions & 5 deletions src/gpuhunt/_internal/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter:
]
)
if q.gpu_name is not None:
min_gpu_memory.extend([i.memory for i in KNOWN_GPUS if i.name in q.gpu_name])
min_gpu_memory.extend(
[i.memory for i in KNOWN_GPUS if i.name.lower() in q.gpu_name]
)
min_total_gpu_memory = (
min(min_gpu_memory, default=min(i.memory for i in KNOWN_GPUS)) * min_gpu_count
)
Expand Down Expand Up @@ -102,7 +104,7 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool:
Returns:
whether the catalog item matches the filters
"""
if q.provider is not None and i.provider not in q.provider:
if q.provider is not None and i.provider.lower() not in q.provider:
return False
if not is_between(i.cpu, q.min_cpu, q.max_cpu):
return False
Expand All @@ -111,11 +113,11 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool:
if not is_between(i.gpu_count, q.min_gpu_count, q.max_gpu_count):
return False
if q.gpu_name is not None:
if i.gpu_name not in q.gpu_name:
if i.gpu_name.lower() not in q.gpu_name:
return False
if q.min_compute_capability is not None or q.max_compute_capability is not None:
cc = [info.compute_capability for info in KNOWN_GPUS if info.name == i.gpu_name]
if not cc or not is_between(min(cc), q.min_compute_capability, q.max_compute_capability):
cc = get_compute_capability(i.gpu_name)
if not cc or not is_between(cc, q.min_compute_capability, q.max_compute_capability):
return False
if not is_between(i.gpu_memory if i.gpu_count > 0 else 0, q.min_gpu_memory, q.max_gpu_memory):
return False
Expand All @@ -135,6 +137,13 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool:
return True


def get_compute_capability(gpu_name: str) -> Optional[Tuple[int, int]]:
    """Return the (major, minor) compute capability for a known GPU.

    The lookup is case-insensitive. Returns ``None`` when the name is not
    present in ``KNOWN_GPUS``.
    """
    needle = gpu_name.lower()  # hoist: lowercase the query once, not per GPU
    return next(
        (gpu.compute_capability for gpu in KNOWN_GPUS if gpu.name.lower() == needle),
        None,
    )


KNOWN_GPUS = [
GPUInfo(name="A10", memory=24, compute_capability=(8, 6)),
GPUInfo(name="A100", memory=40, compute_capability=(8, 0)),
Expand Down
8 changes: 7 additions & 1 deletion src/gpuhunt/_internal/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class QueryFilter:
max_memory: maximum amount of RAM in GB
min_gpu_count: minimum number of GPUs
max_gpu_count: maximum number of GPUs
gpu_name: case-sensitive name of the GPU to filter by. If not specified, all GPUs will be used
gpu_name: name of the GPU to filter by. If not specified, all GPUs will be used
min_gpu_memory: minimum amount of GPU VRAM in GB for each GPU
max_gpu_memory: maximum amount of GPU VRAM in GB for each GPU
min_total_gpu_memory: minimum amount of GPU VRAM in GB for all GPUs combined
Expand Down Expand Up @@ -111,6 +111,12 @@ class QueryFilter:
max_compute_capability: Optional[Tuple[int, int]] = None
spot: Optional[bool] = None

def __post_init__(self):
    """Normalize the case-insensitive list fields to lowercase.

    Both ``provider`` and ``gpu_name`` are matched case-insensitively
    elsewhere, so store them lowercased once at construction time.
    """
    for field_name in ("provider", "gpu_name"):
        values = getattr(self, field_name)
        if values is not None:
            setattr(self, field_name, [value.lower() for value in values])


@dataclass
class GPUInfo:
Expand Down
141 changes: 79 additions & 62 deletions src/gpuhunt/providers/tensordock.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import requests

from gpuhunt._internal.constraints import is_between, optimize
from gpuhunt._internal.constraints import get_compute_capability, is_between, optimize
from gpuhunt._internal.models import QueryFilter, RawCatalogItem
from gpuhunt.providers import AbstractProvider

Expand Down Expand Up @@ -39,11 +39,7 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem
hostnodes = requests.get(marketplace_hostnodes_url).json()["hostnodes"]
offers = []
for hostnode, details in hostnodes.items():
location = (
"-".join([details["location"][key] for key in ["country", "region", "city"]])
.lower()
.replace(" ", "")
)
location = details["location"]["country"].lower().replace(" ", "")
if query_filter is not None:
offers += self.optimize_offers(query_filter, details["specs"], hostnode, location)
else:
Expand All @@ -66,7 +62,7 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem
cpu=details["specs"]["cpu"]["amount"],
memory=float(round_down(details["specs"]["ram"]["amount"], 2)),
gpu_count=gpu["amount"],
gpu_name=marketplace_gpus.get(gpu_name, gpu_name),
gpu_name=convert_gpu_name(gpu_name),
gpu_memory=float(gpu["vram"]),
spot=False,
)
Expand All @@ -77,64 +73,67 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem
def optimize_offers(
q: QueryFilter, specs: dict, instance_name: str, location: str
) -> List[RawCatalogItem]:
cpu = optimize(specs["cpu"]["amount"], q.min_cpu or 1, q.max_cpu)
memory = optimize( # has to be even
round_down(specs["ram"]["amount"], 2),
round_up(q.min_memory or 1, 2),
round_down(q.max_memory, 2) if q.max_memory is not None else None,
)
disk_size = optimize( # 30 GB at least for Ubuntu
specs["storage"]["amount"],
q.min_disk_size or 30,
q.max_disk_size,
)
if cpu is None or memory is None or disk_size is None:
return []
base_price = sum(
n * specs[key]["price"]
for key, n in [("cpu", cpu), ("ram", memory), ("storage", disk_size)]
)
offers = []
for gpu_name, gpu in specs["gpu"].items():
gpu_name = marketplace_gpus.get(gpu_name, gpu_name)
if q.gpu_name is not None and gpu_name not in q.gpu_name:
for gpu_model, gpu_info in specs["gpu"].items():
# filter by single gpu characteristics
if not is_between(gpu_info["vram"], q.min_gpu_memory, q.max_gpu_memory):
continue
if not is_between(gpu["vram"], q.min_gpu_memory, q.max_gpu_memory):
gpu_name = convert_gpu_name(gpu_model)
if q.gpu_name is not None and gpu_name.lower() not in q.gpu_name:
continue
if (
gpu_count := optimize(gpu["amount"], q.min_gpu_count or 1, q.max_gpu_count)
) is None:
continue
# filter by total gpu memory
if q.min_total_gpu_memory is None:
min_total_gpu_memory = gpu_count * gpu["vram"]
else:
min_total_gpu_memory = max(q.min_total_gpu_memory, gpu_count * gpu["vram"])
gpu_total_memory = optimize(
gpu["amount"] * gpu["vram"],
round_up(min_total_gpu_memory, gpu["vram"]),
round_down(q.max_total_gpu_memory, gpu["vram"])
if q.max_total_gpu_memory is not None
else None,
)
if gpu_total_memory is None:
continue
gpu_count = gpu_total_memory // gpu["vram"]
if not is_between(gpu_count, q.min_gpu_count, q.max_gpu_count):
continue
# make an offer
offer = RawCatalogItem(
instance_name=instance_name,
location=location,
price=round(gpu_count * gpu["price"] + base_price, 5),
cpu=cpu,
memory=float(memory),
gpu_count=gpu_count,
gpu_name=gpu_name,
gpu_memory=float(gpu["vram"]),
spot=False,
)
offers.append(offer)
if q.min_compute_capability is not None or q.max_compute_capability is not None:
cc = get_compute_capability(gpu_name)
if not cc or not is_between(
cc, q.min_compute_capability, q.max_compute_capability
):
continue

for gpu_count in range(1, gpu_info["amount"] + 1): # try all possible gpu counts
if not is_between(gpu_count, q.min_gpu_count, q.max_gpu_count):
continue
if not is_between(
gpu_count * gpu_info["vram"], q.min_total_gpu_memory, q.max_total_gpu_memory
):
continue
# we can't take 100% of CPU/RAM/storage if we don't take all GPUs
multiplier = 0.75 if gpu_count < gpu_info["amount"] else 1
cpu = optimize(
int(multiplier * specs["cpu"]["amount"]),
q.min_cpu or 1,
q.max_cpu,
)
memory = optimize( # has to be even
round_down(int(multiplier * specs["ram"]["amount"]), 2),
round_up(q.min_memory or 1, 2),
round_down(q.max_memory, 2) if q.max_memory is not None else None,
)
disk_size = optimize( # 30 GB at least for Ubuntu
int(multiplier * specs["storage"]["amount"]),
q.min_disk_size or 30,
q.max_disk_size,
)
if cpu is None or memory is None or disk_size is None:
continue
price = round(
cpu * specs["cpu"]["price"]
+ memory * specs["ram"]["price"]
+ disk_size * specs["storage"]["price"]
+ gpu_count * gpu_info["price"],
5,
)
offer = RawCatalogItem(
instance_name=instance_name,
location=location,
price=price,
cpu=cpu,
memory=float(memory),
gpu_name=gpu_name,
gpu_count=gpu_count,
gpu_memory=float(gpu_info["vram"]),
spot=False,
)
offers.append(offer)
break # stop increasing gpu count
return offers


Expand All @@ -144,3 +143,21 @@ def round_up(value: Union[int, float], step: int) -> int:

def round_down(value: Union[int, float], step: int) -> int:
    """Round ``value`` down to the nearest multiple of ``step``.

    Args:
        value: the number to round down (int or float)
        step: the positive multiple to round down to

    Returns:
        the largest multiple of ``step`` that is <= ``value``, as an ``int``.

    The original returned a float for float inputs, contradicting the
    ``-> int`` annotation; ``value // step * step`` is always a whole
    multiple, so converting to ``int`` is lossless and honors the contract.
    """
    return int(value // step * step)


def convert_gpu_name(model: str) -> str:
    """
    Convert a TensorDock marketplace GPU model id into a canonical GPU name.

    Known models come straight from the ``marketplace_gpus`` mapping; unknown
    ones are derived by stripping the interface/VRAM suffix and the
    ``geforce`` prefix, then upper-casing (keeping the "Ti" suffix mixed-case).

    >>> convert_gpu_name("geforcegtx1070-pcie-8gb")
    'GTX1070'
    >>> convert_gpu_name("geforcertx1111ti-pcie-13gb")
    'RTX1111Ti'
    >>> convert_gpu_name("a100-pcie-40gb")
    'A100'
    """
    try:
        return marketplace_gpus[model]
    except KeyError:
        pass  # not a known model id — fall back to heuristic conversion
    base = model.partition("-")[0]  # drop "-pcie-8gb"-style suffix
    if base.startswith("geforce"):
        base = base[len("geforce"):]
    return base.upper().replace("TI", "Ti")
15 changes: 15 additions & 0 deletions src/tests/_internal/test_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,21 @@ def test_compute_capability(self, item: CatalogItem):
assert not matches(item, QueryFilter(min_compute_capability=(8, 1)))
assert not matches(item, QueryFilter(max_compute_capability=(7, 9)))

def test_ti_gpu(self):
    """A "Ti"-suffixed GPU name must match regardless of query casing."""
    fields = dict(
        instance_name="large",
        location="us-east-1",
        price=1.2,
        cpu=16,
        memory=64.0,
        gpu_count=1,
        gpu_name="RTX3060Ti",  # canonical mixed-case spelling
        gpu_memory=8.0,
        spot=False,
        provider="aws",
    )
    assert matches(CatalogItem(**fields), QueryFilter(gpu_name=["RTX3060TI"]))


class TestFillMissing:
def test_empty(self):
Expand Down
6 changes: 6 additions & 0 deletions src/tests/providers/test_tensordock.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ def test_controversial_gpu(self, specs: dict):
)
assert offers == []

def test_all_cpu_all_gpu(self, specs: dict):
    """Requesting all CPUs should force taking every GPU on the node."""
    query = QueryFilter(min_cpu=256, min_gpu_count=1)
    expected = make_offers(specs, cpu=256, memory=2, disk_size=30, gpu_count=8)
    assert TensorDockProvider.optimize_offers(query, specs, "", "") == expected


def make_offers(
specs: dict, cpu: int, memory: int, disk_size: int, gpu_count: int
Expand Down

0 comments on commit da41d23

Please sign in to comment.