Skip to content

Commit

Permalink
Fix incorrect state in the RawCatalogItem
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergey Mezentsev committed Feb 29, 2024
1 parent 1e4be3d commit 2d8149a
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ repos:
name: isort (python)
args: ['--settings-file', 'pyconfig.toml']
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.5
rev: v0.2.2
hooks:
- id: ruff
3 changes: 3 additions & 0 deletions src/gpuhunt/providers/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ def fill_details(self, offers: List[RawCatalogItem]) -> List[RawCatalogItem]:
if "GPUs" in capabilities:
gpu_count = int(capabilities["GPUs"])
gpu_name, gpu_memory = get_gpu_name_memory(resource.name)
if gpu_name is None and gpu_count:
logger.warning("Can't parse VM name: %s", resource.name)
continue
instances[resource.name] = RawCatalogItem(
instance_name=resource.name,
cpu=capabilities["vCPUs"],
Expand Down
13 changes: 10 additions & 3 deletions src/gpuhunt/providers/cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,16 @@ def fetch_all_vm_types(self):
)
return list(chain.from_iterable(results))

def get_raw_catalog_list(self, vm_machine_type_list, vcpu, memory, gpu):
def get_raw_catalog_list(self, vm_machine_type_list, vcpu, memory, gpu: int):
raw_list = []
for vm in vm_machine_type_list:
memory = None
name = gpu_name(vm["gpuModel"])
if name is not None:
memory = get_memory(name)
if gpu and name is None:
logger.warning("Skip. Unknown GPU name: %s", vm["gpuModel"])
continue
raw = RawCatalogItem(
instance_name=vm["machineType"],
location=vm["dataCenterId"],
Expand All @@ -53,8 +60,8 @@ def get_raw_catalog_list(self, vm_machine_type_list, vcpu, memory, gpu):
cpu=vcpu,
memory=memory,
gpu_count=gpu,
gpu_name=gpu_name(vm["gpuModel"]),
gpu_memory=get_memory(gpu_name(vm["gpuModel"])),
gpu_name=name,
gpu_memory=memory,
disk_size=None,
)
raw_list.append(raw)
Expand Down
41 changes: 36 additions & 5 deletions src/gpuhunt/providers/datacrunch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
from gpuhunt import QueryFilter, RawCatalogItem
from gpuhunt.providers import AbstractProvider

logger = logging.getLogger(__name__)

AMD_RX7900XTX = "RX7900XTX"
ALL_AMD_GPUS = [
AMD_RX7900XTX,
]


class DataCrunchProvider(AbstractProvider):
NAME = "datacrunch"
Expand Down Expand Up @@ -37,30 +44,45 @@ def _get_availabilities(self, spot: bool) -> List[dict]:
def _get_locations(self) -> List[dict]:
return self.datacrunch_client.locations.get()

@classmethod
def filter(cls, offers: List[RawCatalogItem]) -> List[RawCatalogItem]:
    """Return the offers whose ``gpu_name`` is not listed in ``ALL_AMD_GPUS``.

    Offers without a GPU name (``gpu_name`` is ``None``) are kept, because
    ``None`` is not a member of ``ALL_AMD_GPUS``.
    """
    kept = []
    for offer in offers:
        if offer.gpu_name in ALL_AMD_GPUS:
            # skip AMD GPU
            continue
        kept.append(offer)
    return kept


def generate_instances(
    spots: Iterable[bool], location_codes: Iterable[str], instance_types: Iterable[InstanceType]
) -> List[RawCatalogItem]:
    """Build catalog items for every (spot, location, instance type) combination.

    Each instance type is shallow-copied before being handed to
    ``transform_instance``. Combinations for which ``transform_instance``
    returns ``None`` are left out of the result.
    """
    combinations = itertools.product(spots, location_codes, instance_types)
    # Lazy generator: each combination is transformed and converted in turn,
    # so the order of calls matches a plain loop.
    raw_items = (
        transform_instance(copy.copy(instance_type), is_spot, location_code)
        for is_spot, location_code, instance_type in combinations
    )
    return [RawCatalogItem.from_dict(raw) for raw in raw_items if raw is not None]


def transform_instance(instance: InstanceType, spot: bool, location: str) -> dict:
def transform_instance(instance: InstanceType, spot: bool, location: str) -> Optional[dict]:
gpu_memory = 0
gpu_count = instance.gpu["number_of_gpus"]
gpu_name = None

if instance.gpu["number_of_gpus"]:
gpu_memory = instance.gpu_memory["size_in_gigabytes"] / instance.gpu["number_of_gpus"]
gpu_name = get_gpu_name(instance.gpu["description"])

if gpu_count and gpu_name is None:
logger.warning("Can't get GPU name from description: '%s'", instance.gpu["description"])
return None

raw = dict(
instance_name=instance.instance_type,
location=location,
spot=spot,
price=instance.spot_price_per_hour if spot else instance.price_per_hour,
cpu=instance.cpu["number_of_cores"],
memory=instance.memory["size_in_gigabytes"],
gpu_count=instance.gpu["number_of_gpus"],
gpu_name=gpu_name(instance.gpu["description"]),
gpu_count=gpu_count,
gpu_name=gpu_name,
gpu_memory=gpu_memory,
)
return raw
Expand Down Expand Up @@ -91,16 +113,25 @@ def transform_instance(instance: InstanceType, spot: bool, location: str) -> dic
"2x NVidia Tesla V100 16GB": "V100",
"4x NVidia Tesla V100 16GB": "V100",
"8x NVidia Tesla V100 16GB": "V100",
"1x NVidia L40S": "L40S",
"2x NVidia L40S": "L40S",
"4x NVidia L40S": "L40S",
"8x NVidia L40S": "L40S",
"1x AMD 7900XTX": AMD_RX7900XTX,
"2x AMD 7900XTX": AMD_RX7900XTX,
"4x AMD 7900XTX": AMD_RX7900XTX,
"8x AMD 7900XTX": AMD_RX7900XTX,
"12x AMD 7900XTX": AMD_RX7900XTX,
}


def gpu_name(name: str) -> Optional[str]:
def get_gpu_name(name: str) -> Optional[str]:
if not name:
return None

result = GPU_MAP.get(name)

if result is None:
logging.warning("There is no '%s' in GPU_MAP", name)
logger.warning("There is no '%s' in GPU_MAP", name)

return result
11 changes: 9 additions & 2 deletions src/gpuhunt/providers/lambdalabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,12 @@ def get(
for instance in data.values():
instance = instance["instance_type"]
logger.info(instance["name"])
gpu_count, gpu_name, gpu_memory = parse_description(instance["description"])
description = instance["description"]
result = parse_description(description)
if result is None:
logger.warning("Can't parse GPU info from description: %s", description)
continue
gpu_count, gpu_name, gpu_memory = result
offer = RawCatalogItem(
instance_name=instance["name"],
price=instance["price_cents_per_hour"] / 100,
Expand Down Expand Up @@ -69,8 +74,10 @@ def add_regions(self, offers: List[RawCatalogItem]) -> List[RawCatalogItem]:
return region_offers


def parse_description(v: str) -> Tuple[int, str, float]:
def parse_description(v: str) -> Optional[Tuple[int, str, float]]:
    """Parse GPU details out of an instance description string.

    The expected shape is ``"<count>x [Tesla ]<name> (<memory> GB ..."``,
    for example ``"8x Tesla V100 (16 GB)"``.

    Args:
        v: The instance description text.

    Returns:
        A ``(gpu_count, gpu_name, gpu_memory)`` tuple, where spaces are removed
        from the GPU name and the memory is a float number of GB. Returns
        ``None`` when the description is empty or does not match the expected
        shape.
    """
    if not v:
        # Nothing to parse; report it the same way as an unparseable description.
        return None
    # ``\d+`` (not a single ``\d``) so that counts of 10 or more GPUs also parse.
    r = re.match(r"^(\d+)x (?:Tesla )?(.+) \((\d+) GB", v)
    if r is None:
        return None
    count, gpu_name, gpu_memory = r.groups()
    return int(count), gpu_name.replace(" ", ""), float(gpu_memory)
8 changes: 4 additions & 4 deletions src/integrity_tests/test_datacrunch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pytest

from gpuhunt.providers.datacrunch import GPU_MAP
from gpuhunt.providers.datacrunch import ALL_AMD_GPUS, GPU_MAP


@pytest.fixture
Expand All @@ -16,7 +16,7 @@ def data_rows(catalog_dir: Path) -> List[dict]:


def select_row(rows, name: str) -> List[str]:
return [r[name] for r in rows]
return [r[name] for r in rows if r[name]]


def test_locations(data_rows):
Expand Down Expand Up @@ -46,6 +46,6 @@ def test_price(data_rows):


def test_gpu_present(data_rows):
refs = GPU_MAP.values()
refs = [name for name in GPU_MAP.values() if name not in ALL_AMD_GPUS]
gpus = select_row(data_rows, "gpu_name")
assert set(gpus) != set(refs)
assert set(gpus) == set(refs)
9 changes: 4 additions & 5 deletions src/tests/providers/test_datacrunch.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
DataCrunchProvider,
InstanceType,
generate_instances,
gpu_name,
get_gpu_name,
transform_instance,
)

Expand Down Expand Up @@ -179,12 +179,11 @@ def list_available_instances(raw_instance_types, locations):


def test_gpu_name(caplog):
assert gpu_name("1x H100 SXM5 80GB") == "H100"
assert gpu_name("") is None
assert gpu_name(None) is None
assert get_gpu_name("1x H100 SXM5 80GB") == "H100"
assert get_gpu_name("") is None

with caplog.at_level(logging.WARNING):
gpu_name("1x H200 SXM5 80GB")
get_gpu_name("1x H200 SXM5 80GB")
assert "There is no '1x H200 SXM5 80GB' in GPU_MAP" in caplog.text


Expand Down

0 comments on commit 2d8149a

Please sign in to comment.