Fix incorrect state in the RawCatalogItem

dstackai · Feb 29, 2024 · 2d8149a · 2d8149a
1 parent 1e4be3d
commit 2d8149a
Show file tree

Hide file tree

Showing 7 changed files with 67 additions and 20 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -12,6 +12,6 @@ repos:
         name: isort (python)
         args: ['--settings-file', 'pyconfig.toml']
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.5
+    rev: v0.2.2
     hooks:
       - id: ruff
diff --git a/src/gpuhunt/providers/azure.py b/src/gpuhunt/providers/azure.py
@@ -176,6 +176,9 @@ def fill_details(self, offers: List[RawCatalogItem]) -> List[RawCatalogItem]:
             if "GPUs" in capabilities:
                 gpu_count = int(capabilities["GPUs"])
                 gpu_name, gpu_memory = get_gpu_name_memory(resource.name)
+                if gpu_name is None and gpu_count:
+                    logger.warning("Can't parse VM name: %s", resource.name)
+                    continue
             instances[resource.name] = RawCatalogItem(
                 instance_name=resource.name,
                 cpu=capabilities["vCPUs"],

diff --git a/src/gpuhunt/providers/cudo.py b/src/gpuhunt/providers/cudo.py
@@ -42,9 +42,16 @@ def fetch_all_vm_types(self):
                     )
         return list(chain.from_iterable(results))
 
-    def get_raw_catalog_list(self, vm_machine_type_list, vcpu, memory, gpu):
+    def get_raw_catalog_list(self, vm_machine_type_list, vcpu, memory, gpu: int):
         raw_list = []
         for vm in vm_machine_type_list:
+            memory = None
+            name = gpu_name(vm["gpuModel"])
+            if name is not None:
+                memory = get_memory(name)
+            if gpu and name is None:
+                logger.warning("Skip. Unknown GPU name: %s", vm["gpuModel"])
+                continue
             raw = RawCatalogItem(
                 instance_name=vm["machineType"],
                 location=vm["dataCenterId"],
@@ -53,8 +60,8 @@ def get_raw_catalog_list(self, vm_machine_type_list, vcpu, memory, gpu):
                 cpu=vcpu,
                 memory=memory,
                 gpu_count=gpu,
-                gpu_name=gpu_name(vm["gpuModel"]),
-                gpu_memory=get_memory(gpu_name(vm["gpuModel"])),
+                gpu_name=name,
+                gpu_memory=memory,
                 disk_size=None,
             )
             raw_list.append(raw)

diff --git a/src/gpuhunt/providers/datacrunch.py b/src/gpuhunt/providers/datacrunch.py
@@ -9,6 +9,13 @@
 from gpuhunt import QueryFilter, RawCatalogItem
 from gpuhunt.providers import AbstractProvider
 
+logger = logging.getLogger(__name__)
+
+AMD_RX7900XTX = "RX7900XTX"
+ALL_AMD_GPUS = [
+    AMD_RX7900XTX,
+]
+
 
 class DataCrunchProvider(AbstractProvider):
     NAME = "datacrunch"
@@ -37,30 +44,45 @@ def _get_availabilities(self, spot: bool) -> List[dict]:
     def _get_locations(self) -> List[dict]:
         return self.datacrunch_client.locations.get()
 
+    @classmethod
+    def filter(cls, offers: List[RawCatalogItem]) -> List[RawCatalogItem]:
+        return [o for o in offers if o.gpu_name not in ALL_AMD_GPUS]  # skip AMD GPU
+
 
 def generate_instances(
     spots: Iterable[bool], location_codes: Iterable[str], instance_types: Iterable[InstanceType]
 ) -> List[RawCatalogItem]:
     instances = []
     for spot, location, instance in itertools.product(spots, location_codes, instance_types):
         item = transform_instance(copy.copy(instance), spot, location)
+        if item is None:
+            continue
         instances.append(RawCatalogItem.from_dict(item))
     return instances
 
 
-def transform_instance(instance: InstanceType, spot: bool, location: str) -> dict:
+def transform_instance(instance: InstanceType, spot: bool, location: str) -> Optional[dict]:
     gpu_memory = 0
+    gpu_count = instance.gpu["number_of_gpus"]
+    gpu_name = None
+
     if instance.gpu["number_of_gpus"]:
         gpu_memory = instance.gpu_memory["size_in_gigabytes"] / instance.gpu["number_of_gpus"]
+        gpu_name = get_gpu_name(instance.gpu["description"])
+
+    if gpu_count and gpu_name is None:
+        logger.warning("Can't get GPU name from description: '%s'", instance.gpu["description"])
+        return None
+
     raw = dict(
         instance_name=instance.instance_type,
         location=location,
         spot=spot,
         price=instance.spot_price_per_hour if spot else instance.price_per_hour,
         cpu=instance.cpu["number_of_cores"],
         memory=instance.memory["size_in_gigabytes"],
-        gpu_count=instance.gpu["number_of_gpus"],
-        gpu_name=gpu_name(instance.gpu["description"]),
+        gpu_count=gpu_count,
+        gpu_name=gpu_name,
         gpu_memory=gpu_memory,
     )
     return raw
@@ -91,16 +113,25 @@ def transform_instance(instance: InstanceType, spot: bool, location: str) -> dic
     "2x NVidia Tesla V100 16GB": "V100",
     "4x NVidia Tesla V100 16GB": "V100",
     "8x NVidia Tesla V100 16GB": "V100",
+    "1x NVidia L40S": "L40S",
+    "2x NVidia L40S": "L40S",
+    "4x NVidia L40S": "L40S",
+    "8x NVidia L40S": "L40S",
+    "1x AMD 7900XTX": AMD_RX7900XTX,
+    "2x AMD 7900XTX": AMD_RX7900XTX,
+    "4x AMD 7900XTX": AMD_RX7900XTX,
+    "8x AMD 7900XTX": AMD_RX7900XTX,
+    "12x AMD 7900XTX": AMD_RX7900XTX,
 }
 
 
-def gpu_name(name: str) -> Optional[str]:
+def get_gpu_name(name: str) -> Optional[str]:
     if not name:
         return None
 
     result = GPU_MAP.get(name)
 
     if result is None:
-        logging.warning("There is no '%s' in GPU_MAP", name)
+        logger.warning("There is no '%s' in GPU_MAP", name)
 
     return result
diff --git a/src/gpuhunt/providers/lambdalabs.py b/src/gpuhunt/providers/lambdalabs.py
@@ -41,7 +41,12 @@ def get(
         for instance in data.values():
             instance = instance["instance_type"]
             logger.info(instance["name"])
-            gpu_count, gpu_name, gpu_memory = parse_description(instance["description"])
+            description = instance["description"]
+            result = parse_description(description)
+            if result is None:
+                logger.warning("Can't parse GPU info from description: %s", description)
+                continue
+            gpu_count, gpu_name, gpu_memory = result
             offer = RawCatalogItem(
                 instance_name=instance["name"],
                 price=instance["price_cents_per_hour"] / 100,
@@ -69,8 +74,10 @@ def add_regions(self, offers: List[RawCatalogItem]) -> List[RawCatalogItem]:
         return region_offers
 
 
-def parse_description(v: str) -> Tuple[int, str, float]:
+def parse_description(v: str) -> Optional[Tuple[int, str, float]]:
     """Returns gpus count, gpu name, and GPU memory"""
     r = re.match(r"^(\d)x (?:Tesla )?(.+) \((\d+) GB", v)
+    if r is None:
+        return None
     count, gpu_name, gpu_memory = r.groups()
     return int(count), gpu_name.replace(" ", ""), float(gpu_memory)
diff --git a/src/integrity_tests/test_datacrunch.py b/src/integrity_tests/test_datacrunch.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from gpuhunt.providers.datacrunch import GPU_MAP
+from gpuhunt.providers.datacrunch import ALL_AMD_GPUS, GPU_MAP
 
 
 @pytest.fixture
@@ -16,7 +16,7 @@ def data_rows(catalog_dir: Path) -> List[dict]:
 
 
 def select_row(rows, name: str) -> List[str]:
-    return [r[name] for r in rows]
+    return [r[name] for r in rows if r[name]]
 
 
 def test_locations(data_rows):
@@ -46,6 +46,6 @@ def test_price(data_rows):
 
 
 def test_gpu_present(data_rows):
-    refs = GPU_MAP.values()
+    refs = [name for name in GPU_MAP.values() if name not in ALL_AMD_GPUS]
     gpus = select_row(data_rows, "gpu_name")
-    assert set(gpus) != set(refs)
+    assert set(gpus) == set(refs)
diff --git a/src/tests/providers/test_datacrunch.py b/src/tests/providers/test_datacrunch.py
@@ -10,7 +10,7 @@
     DataCrunchProvider,
     InstanceType,
     generate_instances,
-    gpu_name,
+    get_gpu_name,
     transform_instance,
 )
 
@@ -179,12 +179,11 @@ def list_available_instances(raw_instance_types, locations):
 
 
 def test_gpu_name(caplog):
-    assert gpu_name("1x H100 SXM5 80GB") == "H100"
-    assert gpu_name("") is None
-    assert gpu_name(None) is None
+    assert get_gpu_name("1x H100 SXM5 80GB") == "H100"
+    assert get_gpu_name("") is None
 
     with caplog.at_level(logging.WARNING):
-        gpu_name("1x H200 SXM5 80GB")
+        get_gpu_name("1x H200 SXM5 80GB")
     assert "There is no '1x H200 SXM5 80GB' in GPU_MAP" in caplog.text