Skip to content

Commit

Permalink
TensorDock fixes (#5)
Browse files Browse the repository at this point in the history
* Case-insensitive GPU name matching

* Auto-convert unknown GPU models for TensorDock

* Run doctest

* Cap TensorDock resources at 75% if not all GPUs are taken

* Use country as a location

* Install all dependencies for testing
  • Loading branch information
Egor-S authored Oct 31, 2023
1 parent 5852515 commit da41d23
Show file tree
Hide file tree
Showing 9 changed files with 134 additions and 76 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install .
pip install '.[all]'
pip install -r requirements_dev.txt
- name: Run doctest
run: |
pytest --doctest-modules src/gpuhunt
- name: Run pytest
run: |
pytest src/tests
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install .
pip install '.[all]'
pip install -r requirements_dev.txt
- name: Run doctest
run: |
pytest --doctest-modules src/gpuhunt
- name: Run pytest
run: |
pytest src/tests
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ gcp = [
"google-cloud-billing",
"google-cloud-compute"
]
all = ["gpuhunt[aws,azure,gcp]"]

[tool.setuptools.dynamic]
version = {attr = "gpuhunt.version.__version__"}
10 changes: 4 additions & 6 deletions src/gpuhunt/_internal/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def query(
max_memory: maximum amount of RAM in GB
min_gpu_count: minimum number of GPUs
max_gpu_count: maximum number of GPUs
gpu_name: case-sensitive name of the GPU to filter by. If not specified, all GPUs will be used
gpu_name: name of the GPU to filter by. If not specified, all GPUs will be used
min_gpu_memory: minimum amount of GPU VRAM in GB for each GPU
max_gpu_memory: maximum amount of GPU VRAM in GB for each GPU
min_total_gpu_memory: minimum amount of GPU VRAM in GB for all GPUs combined
Expand Down Expand Up @@ -113,11 +113,9 @@ def query(
if self.fill_missing:
query_filter = constraints.fill_missing(query_filter)
# validate providers
if query_filter.provider is not None:
query_filter.provider = [p.lower() for p in query_filter.provider]
for p in query_filter.provider:
if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS:
raise ValueError(f"Unknown provider: {p}")
for p in query_filter.provider or []:
if p not in OFFLINE_PROVIDERS + ONLINE_PROVIDERS:
raise ValueError(f"Unknown provider: {p}")

# fetch providers
items = []
Expand Down
19 changes: 14 additions & 5 deletions src/gpuhunt/_internal/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def fill_missing(q: QueryFilter, *, memory_per_core: int = 8) -> QueryFilter:
]
)
if q.gpu_name is not None:
min_gpu_memory.extend([i.memory for i in KNOWN_GPUS if i.name in q.gpu_name])
min_gpu_memory.extend(
[i.memory for i in KNOWN_GPUS if i.name.lower() in q.gpu_name]
)
min_total_gpu_memory = (
min(min_gpu_memory, default=min(i.memory for i in KNOWN_GPUS)) * min_gpu_count
)
Expand Down Expand Up @@ -102,7 +104,7 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool:
Returns:
whether the catalog item matches the filters
"""
if q.provider is not None and i.provider not in q.provider:
if q.provider is not None and i.provider.lower() not in q.provider:
return False
if not is_between(i.cpu, q.min_cpu, q.max_cpu):
return False
Expand All @@ -111,11 +113,11 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool:
if not is_between(i.gpu_count, q.min_gpu_count, q.max_gpu_count):
return False
if q.gpu_name is not None:
if i.gpu_name not in q.gpu_name:
if i.gpu_name.lower() not in q.gpu_name:
return False
if q.min_compute_capability is not None or q.max_compute_capability is not None:
cc = [info.compute_capability for info in KNOWN_GPUS if info.name == i.gpu_name]
if not cc or not is_between(min(cc), q.min_compute_capability, q.max_compute_capability):
cc = get_compute_capability(i.gpu_name)
if not cc or not is_between(cc, q.min_compute_capability, q.max_compute_capability):
return False
if not is_between(i.gpu_memory if i.gpu_count > 0 else 0, q.min_gpu_memory, q.max_gpu_memory):
return False
Expand All @@ -135,6 +137,13 @@ def matches(i: CatalogItem, q: QueryFilter) -> bool:
return True


def get_compute_capability(gpu_name: str) -> Optional[Tuple[int, int]]:
    """Return the (major, minor) compute capability for a known GPU.

    The lookup is case-insensitive. Returns ``None`` when the name is not
    present in ``KNOWN_GPUS``.
    """
    needle = gpu_name.lower()  # hoist: lowercase the query once, not per GPU
    return next(
        (gpu.compute_capability for gpu in KNOWN_GPUS if gpu.name.lower() == needle),
        None,
    )


KNOWN_GPUS = [
GPUInfo(name="A10", memory=24, compute_capability=(8, 6)),
GPUInfo(name="A100", memory=40, compute_capability=(8, 0)),
Expand Down
8 changes: 7 additions & 1 deletion src/gpuhunt/_internal/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class QueryFilter:
max_memory: maximum amount of RAM in GB
min_gpu_count: minimum number of GPUs
max_gpu_count: maximum number of GPUs
gpu_name: case-sensitive name of the GPU to filter by. If not specified, all GPUs will be used
gpu_name: name of the GPU to filter by. If not specified, all GPUs will be used
min_gpu_memory: minimum amount of GPU VRAM in GB for each GPU
max_gpu_memory: maximum amount of GPU VRAM in GB for each GPU
min_total_gpu_memory: minimum amount of GPU VRAM in GB for all GPUs combined
Expand Down Expand Up @@ -111,6 +111,12 @@ class QueryFilter:
max_compute_capability: Optional[Tuple[int, int]] = None
spot: Optional[bool] = None

def __post_init__(self):
    """Normalize the case-insensitive list fields to lowercase.

    Both ``provider`` and ``gpu_name`` are matched case-insensitively
    elsewhere, so store them lowercased once at construction time.
    """
    for field_name in ("provider", "gpu_name"):
        values = getattr(self, field_name)
        if values is not None:
            setattr(self, field_name, [value.lower() for value in values])


@dataclass
class GPUInfo:
Expand Down
141 changes: 79 additions & 62 deletions src/gpuhunt/providers/tensordock.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import requests

from gpuhunt._internal.constraints import is_between, optimize
from gpuhunt._internal.constraints import get_compute_capability, is_between, optimize
from gpuhunt._internal.models import QueryFilter, RawCatalogItem
from gpuhunt.providers import AbstractProvider

Expand Down Expand Up @@ -39,11 +39,7 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem
hostnodes = requests.get(marketplace_hostnodes_url).json()["hostnodes"]
offers = []
for hostnode, details in hostnodes.items():
location = (
"-".join([details["location"][key] for key in ["country", "region", "city"]])
.lower()
.replace(" ", "")
)
location = details["location"]["country"].lower().replace(" ", "")
if query_filter is not None:
offers += self.optimize_offers(query_filter, details["specs"], hostnode, location)
else:
Expand All @@ -66,7 +62,7 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem
cpu=details["specs"]["cpu"]["amount"],
memory=float(round_down(details["specs"]["ram"]["amount"], 2)),
gpu_count=gpu["amount"],
gpu_name=marketplace_gpus.get(gpu_name, gpu_name),
gpu_name=convert_gpu_name(gpu_name),
gpu_memory=float(gpu["vram"]),
spot=False,
)
Expand All @@ -77,64 +73,67 @@ def get(self, query_filter: Optional[QueryFilter] = None) -> List[RawCatalogItem
def optimize_offers(
q: QueryFilter, specs: dict, instance_name: str, location: str
) -> List[RawCatalogItem]:
cpu = optimize(specs["cpu"]["amount"], q.min_cpu or 1, q.max_cpu)
memory = optimize( # has to be even
round_down(specs["ram"]["amount"], 2),
round_up(q.min_memory or 1, 2),
round_down(q.max_memory, 2) if q.max_memory is not None else None,
)
disk_size = optimize( # 30 GB at least for Ubuntu
specs["storage"]["amount"],
q.min_disk_size or 30,
q.max_disk_size,
)
if cpu is None or memory is None or disk_size is None:
return []
base_price = sum(
n * specs[key]["price"]
for key, n in [("cpu", cpu), ("ram", memory), ("storage", disk_size)]
)
offers = []
for gpu_name, gpu in specs["gpu"].items():
gpu_name = marketplace_gpus.get(gpu_name, gpu_name)
if q.gpu_name is not None and gpu_name not in q.gpu_name:
for gpu_model, gpu_info in specs["gpu"].items():
# filter by single gpu characteristics
if not is_between(gpu_info["vram"], q.min_gpu_memory, q.max_gpu_memory):
continue
if not is_between(gpu["vram"], q.min_gpu_memory, q.max_gpu_memory):
gpu_name = convert_gpu_name(gpu_model)
if q.gpu_name is not None and gpu_name.lower() not in q.gpu_name:
continue
if (
gpu_count := optimize(gpu["amount"], q.min_gpu_count or 1, q.max_gpu_count)
) is None:
continue
# filter by total gpu memory
if q.min_total_gpu_memory is None:
min_total_gpu_memory = gpu_count * gpu["vram"]
else:
min_total_gpu_memory = max(q.min_total_gpu_memory, gpu_count * gpu["vram"])
gpu_total_memory = optimize(
gpu["amount"] * gpu["vram"],
round_up(min_total_gpu_memory, gpu["vram"]),
round_down(q.max_total_gpu_memory, gpu["vram"])
if q.max_total_gpu_memory is not None
else None,
)
if gpu_total_memory is None:
continue
gpu_count = gpu_total_memory // gpu["vram"]
if not is_between(gpu_count, q.min_gpu_count, q.max_gpu_count):
continue
# make an offer
offer = RawCatalogItem(
instance_name=instance_name,
location=location,
price=round(gpu_count * gpu["price"] + base_price, 5),
cpu=cpu,
memory=float(memory),
gpu_count=gpu_count,
gpu_name=gpu_name,
gpu_memory=float(gpu["vram"]),
spot=False,
)
offers.append(offer)
if q.min_compute_capability is not None or q.max_compute_capability is not None:
cc = get_compute_capability(gpu_name)
if not cc or not is_between(
cc, q.min_compute_capability, q.max_compute_capability
):
continue

for gpu_count in range(1, gpu_info["amount"] + 1): # try all possible gpu counts
if not is_between(gpu_count, q.min_gpu_count, q.max_gpu_count):
continue
if not is_between(
gpu_count * gpu_info["vram"], q.min_total_gpu_memory, q.max_total_gpu_memory
):
continue
# we can't take 100% of CPU/RAM/storage if we don't take all GPUs
multiplier = 0.75 if gpu_count < gpu_info["amount"] else 1
cpu = optimize(
int(multiplier * specs["cpu"]["amount"]),
q.min_cpu or 1,
q.max_cpu,
)
memory = optimize( # has to be even
round_down(int(multiplier * specs["ram"]["amount"]), 2),
round_up(q.min_memory or 1, 2),
round_down(q.max_memory, 2) if q.max_memory is not None else None,
)
disk_size = optimize( # 30 GB at least for Ubuntu
int(multiplier * specs["storage"]["amount"]),
q.min_disk_size or 30,
q.max_disk_size,
)
if cpu is None or memory is None or disk_size is None:
continue
price = round(
cpu * specs["cpu"]["price"]
+ memory * specs["ram"]["price"]
+ disk_size * specs["storage"]["price"]
+ gpu_count * gpu_info["price"],
5,
)
offer = RawCatalogItem(
instance_name=instance_name,
location=location,
price=price,
cpu=cpu,
memory=float(memory),
gpu_name=gpu_name,
gpu_count=gpu_count,
gpu_memory=float(gpu_info["vram"]),
spot=False,
)
offers.append(offer)
break # stop increasing gpu count
return offers


Expand All @@ -144,3 +143,21 @@ def round_up(value: Union[int, float], step: int) -> int:

def round_down(value: Union[int, float], step: int) -> int:
    """Round ``value`` down to the nearest multiple of ``step``.

    Args:
        value: the number to round down (int or float)
        step: the positive multiple to round down to

    Returns:
        the largest multiple of ``step`` that is <= ``value``, as an ``int``.

    The original returned a float for float inputs, contradicting the
    ``-> int`` annotation; ``value // step * step`` is always a whole
    multiple, so converting to ``int`` is lossless and honors the contract.
    """
    return int(value // step * step)


def convert_gpu_name(model: str) -> str:
    """
    Convert a TensorDock marketplace GPU model id into a canonical GPU name.

    Known models come straight from the ``marketplace_gpus`` mapping; unknown
    ones are derived by stripping the interface/VRAM suffix and the
    ``geforce`` prefix, then upper-casing (keeping the "Ti" suffix mixed-case).

    >>> convert_gpu_name("geforcegtx1070-pcie-8gb")
    'GTX1070'
    >>> convert_gpu_name("geforcertx1111ti-pcie-13gb")
    'RTX1111Ti'
    >>> convert_gpu_name("a100-pcie-40gb")
    'A100'
    """
    try:
        return marketplace_gpus[model]
    except KeyError:
        pass  # not a known model id — fall back to heuristic conversion
    base = model.partition("-")[0]  # drop "-pcie-8gb"-style suffix
    if base.startswith("geforce"):
        base = base[len("geforce"):]
    return base.upper().replace("TI", "Ti")
15 changes: 15 additions & 0 deletions src/tests/_internal/test_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,21 @@ def test_compute_capability(self, item: CatalogItem):
assert not matches(item, QueryFilter(min_compute_capability=(8, 1)))
assert not matches(item, QueryFilter(max_compute_capability=(7, 9)))

def test_ti_gpu(self):
    """A "Ti"-suffixed GPU name must match regardless of query casing."""
    fields = dict(
        instance_name="large",
        location="us-east-1",
        price=1.2,
        cpu=16,
        memory=64.0,
        gpu_count=1,
        gpu_name="RTX3060Ti",  # canonical mixed-case spelling
        gpu_memory=8.0,
        spot=False,
        provider="aws",
    )
    assert matches(CatalogItem(**fields), QueryFilter(gpu_name=["RTX3060TI"]))


class TestFillMissing:
def test_empty(self):
Expand Down
6 changes: 6 additions & 0 deletions src/tests/providers/test_tensordock.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ def test_controversial_gpu(self, specs: dict):
)
assert offers == []

def test_all_cpu_all_gpu(self, specs: dict):
    """Requesting all CPUs should force taking every GPU on the node."""
    query = QueryFilter(min_cpu=256, min_gpu_count=1)
    expected = make_offers(specs, cpu=256, memory=2, disk_size=30, gpu_count=8)
    assert TensorDockProvider.optimize_offers(query, specs, "", "") == expected


def make_offers(
specs: dict, cpu: int, memory: int, disk_size: int, gpu_count: int
Expand Down

0 comments on commit da41d23

Please sign in to comment.