From da41d23aee79129f5725093ac246700696713ce6 Mon Sep 17 00:00:00 2001 From: Egor Sklyarov Date: Tue, 31 Oct 2023 14:32:09 +0400 Subject: [PATCH] TensorDock fixes (#5) * Case-insensitive GPU name matching * Auto-convert unknown GPU models for TensorDock * Run doctest * Cap TensorDock resources at 75% if not all GPUs are taken * Use country as a location * Install all dependencies for testing --- .github/workflows/release.yml | 5 +- .github/workflows/test.yml | 5 +- pyproject.toml | 1 + src/gpuhunt/_internal/catalog.py | 10 +- src/gpuhunt/_internal/constraints.py | 19 +++- src/gpuhunt/_internal/models.py | 8 +- src/gpuhunt/providers/tensordock.py | 141 +++++++++++++----------- src/tests/_internal/test_constraints.py | 15 +++ src/tests/providers/test_tensordock.py | 6 + 9 files changed, 134 insertions(+), 76 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 24e811d..925ccde 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -42,8 +42,11 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install . + pip install '.[all]' pip install -r requirements_dev.txt + - name: Run doctest + run: | + pytest --doctest-modules src/gpuhunt - name: Run pytest run: | pytest src/tests diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 54ed777..6d251c0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -46,8 +46,11 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install . + pip install '.[all]' pip install -r requirements_dev.txt + - name: Run doctest + run: | + pytest --doctest-modules src/gpuhunt - name: Run pytest run: | pytest src/tests diff --git a/pyproject.toml b/pyproject.toml index 097ce7e..ff5406a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ gcp = [ "google-cloud-billing", "google-cloud-compute" ] +all = ["gpuhunt[aws,azure,gcp]"] [tool.setuptools.dynamic] version = {attr = "gpuhunt.version.__version__"} diff --git a/src/gpuhunt/_internal/catalog.py b/src/gpuhunt/_internal/catalog.py index 4fd446e..917f643 100644 --- a/src/gpuhunt/_internal/catalog.py +++ b/src/gpuhunt/_internal/catalog.py @@ -68,7 +68,7 @@ def query( max_memory: maximum amount of RAM in GB min_gpu_count: minimum number of GPUs max_gpu_count: maximum number of GPUs - gpu_name: case-sensitive name of the GPU to filter by. If not specified, all GPUs will be used + gpu_name: name of the GPU to filter by. If not specified, all GPUs will be used min_gpu_memory: minimum amount of GPU VRAM in GB for each GPU max_gpu_memory: maximum amount of GPU VRAM in GB for each GPU min_total_gpu_memory: minimum amount of GPU VRAM in GB for all GPUs combined @@ -113,11 +113,9 @@ def query( if self.fill_missing: query_filter = constraints.fill_missing(query_filter) # validate providers - if query_filter.provider is not None: - query_filter.provider = [p.lower() for p in query_filter.provider] - for p in query_filter.provider: - if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS: - raise ValueError(f"Unknown provider: {p}") + for p in query_filter.provider or []: + if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS: + raise ValueError(f"Unknown provider: {p}") # fetch providers items = [] diff --git a/src/gpuhunt/_internal/constraints.py b/src/gpuhunt/_internal/constraints.py index 6af2fb0..26e2143 100644 --- a/src/gpuhunt/_internal/constraints.py +++ b/src/gpuhunt/_internal/constraints.py @@ -35,7 +35,9 @@ def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter: ] ) if q.gpu_name is not None: - min_gpu_memory.extend([i.memory for i in KNOWN_GPUS if i.name in q.gpu_name]) + min_gpu_memory.extend( + [i.memory for i in KNOWN_GPUS if i.name.lower() in q.gpu_name] + ) min_total_gpu_memory = ( min(min_gpu_memory, default=min(i.memory for i in KNOWN_GPUS)) * min_gpu_count ) @@ -102,7 +104,7 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool: Returns: whether the catalog item matches the filters """ - if q.provider is not None and i.provider not in q.provider: + if q.provider is not None and i.provider.lower() not in q.provider: return False if not is_between(i.cpu, q.min_cpu, q.max_cpu): return False @@ -111,11 +113,11 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool: if not is_between(i.gpu_count, q.min_gpu_count, q.max_gpu_count): return False if q.gpu_name is not None: - if i.gpu_name not in q.gpu_name: + if i.gpu_name.lower() not in q.gpu_name: return False if q.min_compute_capability is not None or q.max_compute_capability is not None: - cc = [info.compute_capability for info in KNOWN_GPUS if info.name == i.gpu_name] - if not cc or not is_between(min(cc), q.min_compute_capability, q.max_compute_capability): + cc = get_compute_capability(i.gpu_name) + if not cc or not is_between(cc, q.min_compute_capability, q.max_compute_capability): return False if not is_between(i.gpu_memory if i.gpu_count > 0 else 0, q.min_gpu_memory, q.max_gpu_memory): return False @@ -135,6 +137,13 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool: return True +def get_compute_capability(gpu_name: str) -> Optional[Tuple[int, int]]: + for gpu in KNOWN_GPUS: + if gpu.name.lower() == gpu_name.lower(): + return gpu.compute_capability + return None + + KNOWN_GPUS = [ GPUInfo(name="A10", memory=24, compute_capability=(8, 6)), GPUInfo(name="A100", memory=40, compute_capability=(8, 0)), diff --git a/src/gpuhunt/_internal/models.py b/src/gpuhunt/_internal/models.py index b580f5b..6e5a46b 100644 --- a/src/gpuhunt/_internal/models.py +++ b/src/gpuhunt/_internal/models.py @@ -77,7 +77,7 @@ class QueryFilter: max_memory: maximum amount of RAM in GB min_gpu_count: minimum number of GPUs max_gpu_count: maximum number of GPUs - gpu_name: case-sensitive name of the GPU to filter by. If not specified, all GPUs will be used + gpu_name: name of the GPU to filter by. If not specified, all GPUs will be used min_gpu_memory: minimum amount of GPU VRAM in GB for each GPU max_gpu_memory: maximum amount of GPU VRAM in GB for each GPU min_total_gpu_memory: minimum amount of GPU VRAM in GB for all GPUs combined @@ -111,6 +111,12 @@ class QueryFilter: max_compute_capability: Optional[Tuple[int, int]] = None spot: Optional[bool] = None + def __post_init__(self): + if self.provider is not None: + self.provider = [i.lower() for i in self.provider] + if self.gpu_name is not None: + self.gpu_name = [i.lower() for i in self.gpu_name] + @dataclass class GPUInfo: diff --git a/src/gpuhunt/providers/tensordock.py b/src/gpuhunt/providers/tensordock.py index 6a7d8de..bb34014 100644 --- a/src/gpuhunt/providers/tensordock.py +++ b/src/gpuhunt/providers/tensordock.py @@ -3,7 +3,7 @@ import requests -from gpuhunt._internal.constraints import is_between, optimize +from gpuhunt._internal.constraints import get_compute_capability, is_between, optimize from gpuhunt._internal.models import QueryFilter, RawCatalogItem from gpuhunt.providers import AbstractProvider @@ -39,11 +39,7 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem hostnodes = requests.get(marketplace_hostnodes_url).json()["hostnodes"] offers = [] for hostnode, details in hostnodes.items(): - location = ( - "-".join([details["location"][key] for key in ["country", "region", "city"]]) - .lower() - .replace(" ", "") - ) + location = details["location"]["country"].lower().replace(" ", "") if query_filter is not None: offers += self.optimize_offers(query_filter, details["specs"], hostnode, location) else: @@ -66,7 +62,7 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem cpu=details["specs"]["cpu"]["amount"], memory=float(round_down(details["specs"]["ram"]["amount"], 2)), gpu_count=gpu["amount"], - gpu_name=marketplace_gpus.get(gpu_name, gpu_name), + gpu_name=convert_gpu_name(gpu_name), gpu_memory=float(gpu["vram"]), spot=False, ) @@ -77,64 +73,67 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem def optimize_offers( q: QueryFilter, specs: dict, instance_name: str, location: str ) -> List[RawCatalogItem]: - cpu = optimize(specs["cpu"]["amount"], q.min_cpu or 1, q.max_cpu) - memory = optimize( # has to be even - round_down(specs["ram"]["amount"], 2), - round_up(q.min_memory or 1, 2), - round_down(q.max_memory, 2) if q.max_memory is not None else None, - ) - disk_size = optimize( # 30 GB at least for Ubuntu - specs["storage"]["amount"], - q.min_disk_size or 30, - q.max_disk_size, - ) - if cpu is None or memory is None or disk_size is None: - return [] - base_price = sum( - n * specs[key]["price"] - for key, n in [("cpu", cpu), ("ram", memory), ("storage", disk_size)] - ) offers = [] - for gpu_name, gpu in specs["gpu"].items(): - gpu_name = marketplace_gpus.get(gpu_name, gpu_name) - if q.gpu_name is not None and gpu_name not in q.gpu_name: + for gpu_model, gpu_info in specs["gpu"].items(): + # filter by single gpu characteristics + if not is_between(gpu_info["vram"], q.min_gpu_memory, q.max_gpu_memory): continue - if not is_between(gpu["vram"], q.min_gpu_memory, q.max_gpu_memory): + gpu_name = convert_gpu_name(gpu_model) + if q.gpu_name is not None and gpu_name.lower() not in q.gpu_name: continue - if ( - gpu_count := optimize(gpu["amount"], q.min_gpu_count or 1, q.max_gpu_count) - ) is None: - continue - # filter by total gpu memory - if q.min_total_gpu_memory is None: - min_total_gpu_memory = gpu_count * gpu["vram"] - else: - min_total_gpu_memory = max(q.min_total_gpu_memory, gpu_count * gpu["vram"]) - gpu_total_memory = optimize( - gpu["amount"] * gpu["vram"], - round_up(min_total_gpu_memory, gpu["vram"]), - round_down(q.max_total_gpu_memory, gpu["vram"]) - if q.max_total_gpu_memory is not None - else None, - ) - if gpu_total_memory is None: - continue - gpu_count = gpu_total_memory // gpu["vram"] - if not is_between(gpu_count, q.min_gpu_count, q.max_gpu_count): - continue - # make an offer - offer = RawCatalogItem( - instance_name=instance_name, - location=location, - price=round(gpu_count * gpu["price"] + base_price, 5), - cpu=cpu, - memory=float(memory), - gpu_count=gpu_count, - gpu_name=gpu_name, - gpu_memory=float(gpu["vram"]), - spot=False, - ) - offers.append(offer) + if q.min_compute_capability is not None or q.max_compute_capability is not None: + cc = get_compute_capability(gpu_name) + if not cc or not is_between( + cc, q.min_compute_capability, q.max_compute_capability + ): + continue + + for gpu_count in range(1, gpu_info["amount"] + 1): # try all possible gpu counts + if not is_between(gpu_count, q.min_gpu_count, q.max_gpu_count): + continue + if not is_between( + gpu_count * gpu_info["vram"], q.min_total_gpu_memory, q.max_total_gpu_memory + ): + continue + # we can't take 100% of CPU/RAM/storage if we don't take all GPUs + multiplier = 0.75 if gpu_count < gpu_info["amount"] else 1 + cpu = optimize( + int(multiplier * specs["cpu"]["amount"]), + q.min_cpu or 1, + q.max_cpu, + ) + memory = optimize( # has to be even + round_down(int(multiplier * specs["ram"]["amount"]), 2), + round_up(q.min_memory or 1, 2), + round_down(q.max_memory, 2) if q.max_memory is not None else None, + ) + disk_size = optimize( # 30 GB at least for Ubuntu + int(multiplier * specs["storage"]["amount"]), + q.min_disk_size or 30, + q.max_disk_size, + ) + if cpu is None or memory is None or disk_size is None: + continue + price = round( + cpu * specs["cpu"]["price"] + + memory * specs["ram"]["price"] + + disk_size * specs["storage"]["price"] + + gpu_count * gpu_info["price"], + 5, + ) + offer = RawCatalogItem( + instance_name=instance_name, + location=location, + price=price, + cpu=cpu, + memory=float(memory), + gpu_name=gpu_name, + gpu_count=gpu_count, + gpu_memory=float(gpu_info["vram"]), + spot=False, + ) + offers.append(offer) + break # stop increasing gpu count return offers @@ -144,3 +143,21 @@ def round_up(value: Union[int, float], step: int) -> int: def round_down(value: Union[int, float], step: int) -> int: return value // step * step + + +def convert_gpu_name(model: str) -> str: + """ + >>> convert_gpu_name("geforcegtx1070-pcie-8gb") + 'GTX1070' + >>> convert_gpu_name("geforcertx1111ti-pcie-13gb") + 'RTX1111Ti' + >>> convert_gpu_name("a100-pcie-40gb") + 'A100' + """ + if model in marketplace_gpus: + return marketplace_gpus[model] + model = model.split("-")[0] + prefix = "geforce" + if model.startswith(prefix): + model = model[len(prefix) :] + return model.upper().replace("TI", "Ti") diff --git a/src/tests/_internal/test_constraints.py b/src/tests/_internal/test_constraints.py index 62d07b6..c8cc858 100644 --- a/src/tests/_internal/test_constraints.py +++ b/src/tests/_internal/test_constraints.py @@ -74,6 +74,21 @@ def test_compute_capability(self, item: CatalogItem): assert not matches(item, QueryFilter(min_compute_capability=(8, 1))) assert not matches(item, QueryFilter(max_compute_capability=(7, 9))) + def test_ti_gpu(self): + item = CatalogItem( + instance_name="large", + location="us-east-1", + price=1.2, + cpu=16, + memory=64.0, + gpu_count=1, + gpu_name="RTX3060Ti", # case-sensitive + gpu_memory=8.0, + spot=False, + provider="aws", + ) + assert matches(item, QueryFilter(gpu_name=["RTX3060TI"])) + class TestFillMissing: def test_empty(self): diff --git a/src/tests/providers/test_tensordock.py b/src/tests/providers/test_tensordock.py index 9883956..d92fac6 100644 --- a/src/tests/providers/test_tensordock.py +++ b/src/tests/providers/test_tensordock.py @@ -73,6 +73,12 @@ def test_controversial_gpu(self, specs: dict): ) assert offers == [] + def test_all_cpu_all_gpu(self, specs: dict): + offers = TensorDockProvider.optimize_offers( + QueryFilter(min_cpu=256, min_gpu_count=1), specs, "", "" + ) + assert offers == make_offers(specs, cpu=256, memory=2, disk_size=30, gpu_count=8) + def make_offers( specs: dict, cpu: int, memory: int, disk_size: int, gpu_count: int