diff --git a/.github/workflows/catalogs.yml b/.github/workflows/catalogs.yml index 4785098..eb7bf6a 100644 --- a/.github/workflows/catalogs.yml +++ b/.github/workflows/catalogs.yml @@ -170,6 +170,33 @@ jobs: path: lambdalabs.csv retention-days: 1 + catalog-oci: + name: Collect OCI catalog + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.11 + - name: Install dependencies + run: | + pip install pip -U + pip install -e '.[oci]' + - name: Collect catalog + working-directory: src + run: python -m gpuhunt oci --output ../oci.csv + env: + OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} + OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} + OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} + OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} + OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} + - uses: actions/upload-artifact@v4 + with: + name: catalogs-oci + path: oci.csv + retention-days: 1 + catalog-runpod: name: Collect Runpod catalog runs-on: ubuntu-latest @@ -200,6 +227,7 @@ jobs: - catalog-datacrunch - catalog-gcp - catalog-lambdalabs + - catalog-oci - catalog-runpod runs-on: ubuntu-latest steps: diff --git a/README.md b/README.md index 588fa17..b23ca7a 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ print(*items, sep="\n") * DataCrunch * GCP * LambdaLabs +* OCI * RunPod * TensorDock * Vast AI diff --git a/pyproject.toml b/pyproject.toml index d48ab41..4350358 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,10 +43,14 @@ nebius = [ "cryptography", "beautifulsoup4" ] +oci = [ + "oci", + "pydantic>=1.10.10,<2.0.0", +] datacrunch = [ "datacrunch" ] -all = ["gpuhunt[aws,azure,datacrunch,gcp,nebius]"] +all = ["gpuhunt[aws,azure,datacrunch,gcp,nebius,oci]"] dev = [ "pre-commit", "isort~=5.13", diff --git a/src/gpuhunt/__main__.py b/src/gpuhunt/__main__.py index db2ebba..15ff87f 100644 --- a/src/gpuhunt/__main__.py +++ b/src/gpuhunt/__main__.py @@ -17,6 +17,7 @@ def main(): 
def to_camel_case(snake_case: str) -> str:
    """Convert a ``snake_case`` identifier to ``camelCase``.

    Empty segments produced by leading, trailing or repeated underscores are
    dropped. The first remaining segment is kept exactly as written; every
    following segment gets its first character upper-cased while the rest of
    the segment is left untouched (so existing inner capitals survive).

    Args:
        snake_case: The identifier to convert.

    Returns:
        The camelCase form, or an empty string if no non-empty segment exists.
    """
    segments = [segment for segment in snake_case.split("_") if segment]
    if not segments:
        return ""
    head, *tail = segments
    # str.capitalize() is deliberately not used: it would lower-case the
    # remainder of each segment.
    return head + "".join(segment[0].upper() + segment[1:] for segment in tail)
class OCIProvider(AbstractProvider):
    """Catalog provider that builds OCI offers from cost estimator data.

    Shapes and prices are read through ``CostEstimator``; the list of regions
    comes from the OCI identity API client created in ``__init__``.
    """

    NAME = "oci"

    def __init__(self, credentials: OCICredentials):
        """Create the identity API client and the cost estimator client.

        Args:
            credentials: Config mapping passed to
                ``oci.identity.IdentityClient``. It is used only when every
                value in it is truthy; if any value is missing or empty, the
                config returned by ``oci.config.from_file()`` is used instead.
        """
        self.api_client = oci.identity.IdentityClient(
            credentials if all(credentials.values()) else oci.config.from_file()
        )
        self.cost_estimator = CostEstimator()

    def get(
        self, query_filter: Optional[QueryFilter] = None, balance_resources: bool = True
    ) -> List[RawCatalogItem]:
        """Build catalog items for every supported shape in every listed region.

        Args:
            query_filter: Not used by this implementation.
            balance_resources: Not used by this implementation.

        Returns:
            One ``RawCatalogItem`` per (shape, region) pair, sorted by
            ascending price. Every item has ``spot=False`` and
            ``disk_size=None``.
        """
        shapes = self.cost_estimator.get_shapes()
        products = self.cost_estimator.get_products()
        regions: List[Region] = self.api_client.list_regions().data

        result = []

        for shape in shapes.items:
            # Only visible, active, non-flexible VM shapes are considered;
            # anything else is skipped without a warning.
            if (
                shape.hidden
                or shape.status != "ACTIVE"
                or shape.shape_type.value != "vm"
                or shape.sub_type.value == "flexible"
            ):
                continue

            # extra validation, failing here would mean we are not handling some
            # case that was not present in the data at the time of writing
            #
            # The order of the conditions matters: each walrus binding
            # (ocpu_product, product_details, price_l10n, product_price) is
            # only evaluated once all earlier checks have passed, and all four
            # names are used after this statement.
            if (
                len(shape.products) != 1
                or (ocpu_product := shape.products[0]).type.value != "ocpu"
                or (product_details := products.find(ocpu_product.part_number)) is None
                or product_details.billing_model != "UCM"
                or product_details.price_type != "HOUR"
                or (price_l10n := product_details.find_price_l10n("USD")) is None
                or len(price_l10n.prices) != 1
                or (product_price := price_l10n.prices[0]).model != "PAY_AS_YOU_GO"
            ):
                logger.warning(
                    "Skipping shape %s due to unexpected cost estimator data",
                    shape.name,
                )
                continue

            # GPU shapes with a known GPU count multiply the unit price by the
            # number of GPUs; all other shapes multiply it by the OCPU quantity.
            if shape.sub_type.value == "gpu" and shape.gpu_qty is not None:
                shape_price = product_price.value * shape.gpu_qty
            else:
                shape_price = product_price.value * ocpu_product.qty

            # NOTE(review): presumably one OCPU corresponds to two vCPUs on
            # non-ARM shapes and to one vCPU on ARM shapes — confirm against
            # the OCI documentation.
            vcpu = ocpu_product.qty if shape.is_arm_cpu() else ocpu_product.qty * 2

            gpu = dict(
                gpu_count=shape.gpu_qty or 0,
                gpu_name=get_gpu_name(shape.name),
                gpu_memory=shape.get_gpu_unit_memory_gb(),
            )
            # Either all three GPU parameters are truthy (GPU shape) or none of
            # them is (CPU-only shape); a mix means the GPU could not be fully
            # described, so the shape is skipped.
            if any(gpu.values()) and not all(gpu.values()):
                logger.warning(
                    "Skipping shape %s due to incomplete GPU parameters: %s", shape.name, gpu
                )
                continue

            # The location is filled in per region below; the same item
            # (including its price) is emitted for every region.
            catalog_item = RawCatalogItem(
                instance_name=shape.name,
                location=None,
                price=shape_price,
                cpu=vcpu,
                memory=shape.bundle_memory_qty,
                **gpu,
                spot=False,
                disk_size=None,
            )
            result.extend(self._duplicate_item_in_regions(catalog_item, regions))

        return sorted(result, key=lambda i: i.price)

    @staticmethod
    def _duplicate_item_in_regions(
        item: RawCatalogItem, regions: Iterable[Region]
    ) -> List[RawCatalogItem]:
        """Return one copy of ``item`` per region with ``location`` set.

        Args:
            item: Template catalog item; each copy is rebuilt from
                ``item.dict()``.
            regions: Regions whose ``name`` becomes the copy's ``location``.

        Returns:
            The regional copies, in the iteration order of ``regions``.
        """
        result = []
        for region in regions:
            regional_item = RawCatalogItem(**item.dict())
            regional_item.location = region.name
            result.append(regional_item)
        return result
class CostEstimatorProduct(BaseModel):
    """One product entry of the cost estimator's product list.

    Field names are mapped to camelCase keys through ``to_camel_case``, except
    ``price_type``, which is read from the explicit ``pricetype`` alias.
    """

    part_number: str
    billing_model: str
    price_type: Annotated[str, Field(alias="pricetype")]
    currency_code_localizations: List[CostEstimatorPriceLocalization]

    class Config:
        alias_generator = to_camel_case

    def find_price_l10n(self, currency_code: str) -> Optional[CostEstimatorPriceLocalization]:
        """Return the first price localization for ``currency_code``.

        Args:
            currency_code: Currency code to look for, compared for exact
                equality with each localization's ``currency_code``.

        Returns:
            The first matching localization, or ``None`` if there is none.
        """
        for localization in self.currency_code_localizations:
            if localization.currency_code == currency_code:
                return localization
        return None
def get_gpu_name(shape_name: str) -> Optional[str]:
    """Derive the GPU model name from an OCI shape name.

    The shape name is upper-cased and split on ``.`` and ``-``. Shapes whose
    parts contain ``GPU4``, ``GPU3`` or ``GPU2`` map to fixed models (checked
    in that order). Otherwise the part that follows the first ``GPU`` part is
    matched case-insensitively against the names in ``KNOWN_GPUS``.

    Args:
        shape_name: Shape name such as ``"VM.GPU.A10.2"`` or ``"BM.GPU4.8"``.

    Returns:
        The GPU name, or ``None`` if the shape name does not identify a
        known GPU.
    """
    tokens = re.split(r"[\.-]", shape_name.upper())

    # Generation markers carry no model name of their own; insertion order
    # of the mapping preserves the GPU4 -> GPU3 -> GPU2 precedence.
    generation_models = {"GPU4": "A100", "GPU3": "V100", "GPU2": "P100"}
    for marker, model in generation_models.items():
        if marker in tokens:
            return model

    if "GPU" not in tokens:
        return None

    following = tokens[tokens.index("GPU") + 1 :]
    if not following:
        # "GPU" is the last part, so there is no model part to inspect.
        return None

    candidate = following[0]
    return next(
        (known.name for known in KNOWN_GPUS if known.name.upper() == candidate),
        None,
    )
@pytest.mark.parametrize(
    "before, after",
    (
        ("spam_ham_eggs", "spamHamEggs"),
        ("spam__ham__eggs", "spamHamEggs"),
        ("__spam_ham_eggs__", "spamHamEggs"),
        ("spamHam_eggs", "spamHamEggs"),
        ("spamHamEggs", "spamHamEggs"),
        ("SpamHam_eggs", "SpamHamEggs"),
        ("spam", "spam"),
        ("", ""),
    ),
)
def test_to_camel_case(before, after):
    """Check ``to_camel_case`` against a table of inputs and expected outputs.

    The cases cover plain snake_case, repeated and surrounding underscores,
    input that already contains capitals, a single word and the empty string.
    """
    converted = to_camel_case(before)
    assert converted == after