dstackai · TheBits · May 10, 2024 · May 7, 2024 · May 10, 2024 · May 10, 2024
diff --git a/.github/workflows/catalogs.yml b/.github/workflows/catalogs.yml
@@ -170,6 +170,33 @@ jobs:
           path: lambdalabs.csv
           retention-days: 1
 
+  # Collects the OCI catalog with `python -m gpuhunt oci` and uploads the
+  # resulting oci.csv as the `catalogs-oci` artifact.
+  catalog-oci:
+    name: Collect OCI catalog
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.11
+      - name: Install dependencies
+        run: |
+          pip install pip -U
+          pip install -e '.[oci]'
+      - name: Collect catalog
+        working-directory: src
+        # runs from src/, so the output path points back to the repository root
+        run: python -m gpuhunt oci --output ../oci.csv
+        env:
+          # OCI credentials; the gpuhunt CLI reads these variables for the oci provider
+          OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
+          OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
+          OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
+          OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
+          OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
+      - uses: actions/upload-artifact@v4
+        with:
+          name: catalogs-oci
+          path: oci.csv
+          retention-days: 1
+
   catalog-runpod:
     name: Collect Runpod catalog
     runs-on: ubuntu-latest
@@ -200,6 +227,7 @@ jobs:
       - catalog-datacrunch
       - catalog-gcp
       - catalog-lambdalabs
+      - catalog-oci
       - catalog-runpod
     runs-on: ubuntu-latest
     steps:

diff --git a/README.md b/README.md
@@ -60,6 +60,7 @@ print(*items, sep="\n")
 * DataCrunch
 * GCP
 * LambdaLabs
+* OCI
 * RunPod
 * TensorDock
 * Vast AI

diff --git a/pyproject.toml b/pyproject.toml
@@ -43,10 +43,14 @@ nebius = [
     "cryptography",
     "beautifulsoup4"
 ]
+oci = [
+    "oci",
+    "pydantic>=1.10.10,<2.0.0",
+]
 datacrunch = [
     "datacrunch"
 ]
-all = ["gpuhunt[aws,azure,datacrunch,gcp,nebius]"]
+all = ["gpuhunt[aws,azure,datacrunch,gcp,nebius,oci]"]
 dev = [
     "pre-commit",
     "isort~=5.13",

diff --git a/src/gpuhunt/__main__.py b/src/gpuhunt/__main__.py
@@ -17,6 +17,7 @@ def main():
             "datacrunch",
             "gcp",
             "lambdalabs",
+            "oci",
             "runpod",
             "tensordock",
             "vastai",
@@ -57,6 +58,18 @@ def main():
         from gpuhunt.providers.lambdalabs import LambdaLabsProvider
 
         provider = LambdaLabsProvider(os.getenv("LAMBDALABS_TOKEN"))
+    elif args.provider == "oci":
+        from gpuhunt.providers.oci import OCICredentials, OCIProvider
+
+        provider = OCIProvider(
+            OCICredentials(
+                user=os.getenv("OCI_CLI_USER"),
+                key_content=os.getenv("OCI_CLI_KEY_CONTENT"),
+                fingerprint=os.getenv("OCI_CLI_FINGERPRINT"),
+                tenancy=os.getenv("OCI_CLI_TENANCY"),
+                region=os.getenv("OCI_CLI_REGION"),
+            )
+        )
     elif args.provider == "runpod":
         from gpuhunt.providers.runpod import RunpodProvider
 

diff --git a/src/gpuhunt/_internal/catalog.py b/src/gpuhunt/_internal/catalog.py
@@ -17,7 +17,7 @@
 logger = logging.getLogger(__name__)
 version_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/version"
 catalog_url = "https://dstack-gpu-pricing.s3.eu-west-1.amazonaws.com/v1/{version}/catalog.zip"
-OFFLINE_PROVIDERS = ["aws", "azure", "datacrunch", "gcp", "lambdalabs", "runpod"]
+OFFLINE_PROVIDERS = ["aws", "azure", "datacrunch", "gcp", "lambdalabs", "oci", "runpod"]
 ONLINE_PROVIDERS = ["cudo", "tensordock", "vastai"]
 RELOAD_INTERVAL = 4 * 60 * 60  # 4 hours
 

diff --git a/src/gpuhunt/_internal/utils.py b/src/gpuhunt/_internal/utils.py
@@ -16,3 +16,10 @@ def parse_compute_capability(
         major, minor = value.split(".")
         return int(major), int(minor)
     return value
+
+
+def to_camel_case(snake_case: str) -> str:
+    """Convert a ``snake_case`` identifier to ``camelCase``.
+
+    Empty segments produced by leading, trailing or repeated underscores are
+    dropped. The first remaining word is kept as is; every following word has
+    its first character upper-cased and the rest left untouched.
+    """
+    segments = [segment for segment in snake_case.split("_") if segment]
+    camel_parts = []
+    for position, segment in enumerate(segments):
+        if position == 0:
+            camel_parts.append(segment)
+        else:
+            # segments are never empty here, so indexing the first char is safe
+            camel_parts.append(segment[0].upper() + segment[1:])
+    return "".join(camel_parts)
diff --git a/src/gpuhunt/providers/oci.py b/src/gpuhunt/providers/oci.py
@@ -0,0 +1,237 @@
+import logging
+import re
+from typing import Iterable, List, Optional, Type
+
+import oci
+from oci.identity.models import Region
+from pydantic import BaseModel, Field
+from requests import Session
+from typing_extensions import Annotated, TypedDict
+
+from gpuhunt._internal.constraints import KNOWN_GPUS
+from gpuhunt._internal.models import QueryFilter, RawCatalogItem
+from gpuhunt._internal.utils import to_camel_case
+from gpuhunt.providers import AbstractProvider
+
+logger = logging.getLogger(__name__)
+# URL of a cost estimator data file; `resource` is the file name,
+# e.g. "shapes.json" or "products.json".
+COST_ESTIMATOR_URL_TEMPLATE = "https://www.oracle.com/a/ocom/docs/cloudestimator2/data/{resource}"
+# Passed as the `timeout` argument of every cost estimator HTTP request
+# (seconds, per the `requests` API).
+COST_ESTIMATOR_REQUEST_TIMEOUT = 10
+
+
+class OCICredentials(TypedDict):
+    """OCI API credentials handed to ``OCIProvider``.
+
+    Every value may be ``None``. ``OCIProvider`` uses this dict as the client
+    config only when all of its values are truthy.
+    """
+
+    user: Optional[str]
+    key_content: Optional[str]
+    fingerprint: Optional[str]
+    tenancy: Optional[str]
+    region: Optional[str]
+
+
+class OCIProvider(AbstractProvider):
+    """Catalog provider for OCI VM shapes, built from the cost estimator data."""
+
+    NAME = "oci"
+
+    def __init__(self, credentials: OCICredentials):
+        """Create the OCI identity client and the cost estimator client.
+
+        ``credentials`` is used as the client config only when all of its
+        values are truthy; otherwise the config is loaded with
+        ``oci.config.from_file()``.
+        """
+        self.api_client = oci.identity.IdentityClient(
+            credentials if all(credentials.values()) else oci.config.from_file()
+        )
+        self.cost_estimator = CostEstimator()
+
+    def get(
+        self, query_filter: Optional[QueryFilter] = None, balance_resources: bool = True
+    ) -> List[RawCatalogItem]:
+        """Return one catalog item per eligible shape and region, sorted by price.
+
+        Shapes and product prices come from the cost estimator; regions come
+        from the identity client's ``list_regions()``. ``query_filter`` and
+        ``balance_resources`` are not used by this implementation.
+        """
+        shapes = self.cost_estimator.get_shapes()
+        products = self.cost_estimator.get_products()
+        regions: List[Region] = self.api_client.list_regions().data
+
+        result = []
+
+        for shape in shapes.items:
+            # only visible, active, non-flexible VM shapes are considered
+            if (
+                shape.hidden
+                or shape.status != "ACTIVE"
+                or shape.shape_type.value != "vm"
+                or shape.sub_type.value == "flexible"
+            ):
+                continue
+
+            # extra validation, failing here would mean we are not handling some
+            # case that was not present in the data at the time of writing
+            # NOTE: the conditions rely on short-circuit evaluation; each walrus
+            # binding is only valid if all preceding checks passed
+            if (
+                len(shape.products) != 1
+                or (ocpu_product := shape.products[0]).type.value != "ocpu"
+                or (product_details := products.find(ocpu_product.part_number)) is None
+                or product_details.billing_model != "UCM"
+                or product_details.price_type != "HOUR"
+                or (price_l10n := product_details.find_price_l10n("USD")) is None
+                or len(price_l10n.prices) != 1
+                or (product_price := price_l10n.prices[0]).model != "PAY_AS_YOU_GO"
+            ):
+                logger.warning(
+                    "Skipping shape %s due to unexpected cost estimator data",
+                    shape.name,
+                )
+                continue
+
+            # GPU shapes are priced per GPU, all other shapes per product unit
+            if shape.sub_type.value == "gpu" and shape.gpu_qty is not None:
+                shape_price = product_price.value * shape.gpu_qty
+            else:
+                shape_price = product_price.value * ocpu_product.qty
+
+            # ARM shapes: vCPU count equals the product quantity; other shapes get
+            # twice that (presumably 1 OCPU = 2 vCPUs on non-ARM — TODO confirm)
+            vcpu = ocpu_product.qty if shape.is_arm_cpu() else ocpu_product.qty * 2
+
+            gpu = dict(
+                gpu_count=shape.gpu_qty or 0,
+                gpu_name=get_gpu_name(shape.name),
+                gpu_memory=shape.get_gpu_unit_memory_gb(),
+            )
+            # either all GPU parameters must be set or none of them
+            if any(gpu.values()) and not all(gpu.values()):
+                logger.warning(
+                    "Skipping shape %s due to incomplete GPU parameters: %s", shape.name, gpu
+                )
+                continue
+
+            # location is filled in per region by _duplicate_item_in_regions
+            catalog_item = RawCatalogItem(
+                instance_name=shape.name,
+                location=None,
+                price=shape_price,
+                cpu=vcpu,
+                memory=shape.bundle_memory_qty,
+                **gpu,
+                spot=False,
+                disk_size=None,
+            )
+            result.extend(self._duplicate_item_in_regions(catalog_item, regions))
+
+        return sorted(result, key=lambda i: i.price)
+
+    @staticmethod
+    def _duplicate_item_in_regions(
+        item: RawCatalogItem, regions: Iterable[Region]
+    ) -> List[RawCatalogItem]:
+        """Return a copy of ``item`` for each region, with ``location`` set to the region name."""
+        result = []
+        for region in regions:
+            # rebuild from the dict form so the original item is not mutated
+            regional_item = RawCatalogItem(**item.dict())
+            regional_item.location = region.name
+            result.append(regional_item)
+        return result
+
+
+class CostEstimatorTypeField(BaseModel):
+    """Object with a single string ``value``, used for the type-like fields of shapes and products."""
+
+    value: str
+
+
+class CostEstimatorShapeProduct(BaseModel):
+    """A product referenced by a shape: its type, part number and quantity.
+
+    Field aliases are generated with ``to_camel_case`` (e.g. ``partNumber``).
+    """
+
+    type: CostEstimatorTypeField
+    part_number: str
+    qty: int
+
+    class Config:
+        alias_generator = to_camel_case
+
+
+class CostEstimatorShape(BaseModel):
+    """A compute shape entry of the cost estimator shapes data.
+
+    Field aliases are generated with ``to_camel_case`` (e.g. ``bundleMemoryQty``).
+    """
+
+    name: str
+    hidden: bool
+    status: str
+    bundle_memory_qty: int
+    gpu_qty: Optional[int]
+    gpu_memory_qty: Optional[int]
+    processor_type: CostEstimatorTypeField
+    shape_type: CostEstimatorTypeField
+    sub_type: CostEstimatorTypeField
+    products: List[CostEstimatorShapeProduct]
+
+    class Config:
+        alias_generator = to_camel_case
+
+    def is_arm_cpu(self):
+        """Tell whether the shape has an ARM CPU, correcting known bad data.
+
+        GPU shapes whose name contains ``GPU4`` or ``GPU.A10`` are never
+        reported as ARM, regardless of their processor type.
+        """
+        if self.processor_type.value != "arm":
+            return False
+        if self.sub_type.value != "gpu":
+            return True
+        # the data says A10 and A100 GPU instances are ARM, but they are not
+        return not ("GPU4" in self.name or "GPU.A10" in self.name)
+
+    def get_gpu_unit_memory_gb(self) -> Optional[float]:
+        """Return the GPU memory per GPU, or ``None`` if either quantity is missing or zero."""
+        total_memory, gpu_count = self.gpu_memory_qty, self.gpu_qty
+        if not total_memory or not gpu_count:
+            return None
+        return total_memory / gpu_count
+
+
+class CostEstimatorShapeList(BaseModel):
+    """Top-level object of the shapes data: a list of shapes under ``items``."""
+
+    items: List[CostEstimatorShape]
+
+
+class CostEstimatorPrice(BaseModel):
+    """A single price: the pricing model name (e.g. ``PAY_AS_YOU_GO``) and its numeric value."""
+
+    model: str
+    value: float
+
+
+class CostEstimatorPriceLocalization(BaseModel):
+    """Prices of a product in one currency.
+
+    Field aliases are generated with ``to_camel_case`` (e.g. ``currencyCode``).
+    """
+
+    currency_code: str
+    prices: List[CostEstimatorPrice]
+
+    class Config:
+        alias_generator = to_camel_case
+
+
+class CostEstimatorProduct(BaseModel):
+    """A priced product of the cost estimator products data.
+
+    Field aliases are generated with ``to_camel_case``, except ``price_type``,
+    which is explicitly aliased to ``pricetype``.
+    """
+
+    part_number: str
+    billing_model: str
+    price_type: Annotated[str, Field(alias="pricetype")]
+    currency_code_localizations: List[CostEstimatorPriceLocalization]
+
+    class Config:
+        alias_generator = to_camel_case
+
+    def find_price_l10n(self, currency_code: str) -> Optional[CostEstimatorPriceLocalization]:
+        """Return the first price localization for ``currency_code``, or ``None`` if absent."""
+        for localization in self.currency_code_localizations:
+            if localization.currency_code == currency_code:
+                return localization
+        return None
+
+
+class CostEstimatorProductList(BaseModel):
+    """Top-level object of the products data: a list of products under ``items``."""
+
+    items: List[CostEstimatorProduct]
+
+    def find(self, part_number: str) -> Optional[CostEstimatorProduct]:
+        """Return the first product with the given part number, or ``None`` if absent."""
+        for candidate in self.items:
+            if candidate.part_number == part_number:
+                return candidate
+        return None
+
+
+class CostEstimator:
+    """HTTP client for the cost estimator data files, sharing one ``Session``."""
+
+    def __init__(self):
+        self.session = Session()
+
+    def get_shapes(self) -> CostEstimatorShapeList:
+        """Download and parse ``shapes.json``."""
+        return self._get(resource="shapes.json", ResponseModel=CostEstimatorShapeList)
+
+    def get_products(self) -> CostEstimatorProductList:
+        """Download and parse ``products.json``."""
+        return self._get(resource="products.json", ResponseModel=CostEstimatorProductList)
+
+    def _get(self, resource: str, ResponseModel: Type[BaseModel]):
+        """Fetch ``resource`` and parse the response body into ``ResponseModel``.
+
+        ``raise_for_status()`` is called on the response before parsing.
+        """
+        response = self.session.get(
+            COST_ESTIMATOR_URL_TEMPLATE.format(resource=resource),
+            timeout=COST_ESTIMATOR_REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()
+        return ResponseModel.parse_raw(response.content)
+
+
+def get_gpu_name(shape_name: str) -> Optional[str]:
+    """Derive the GPU name from an OCI shape name, or return ``None``.
+
+    The upper-cased shape name is split on ``.`` and ``-``. The parts ``GPU4``,
+    ``GPU3`` and ``GPU2`` map to fixed GPU names. Otherwise the part following
+    ``GPU`` is matched case-insensitively against the names in ``KNOWN_GPUS``.
+    """
+    tokens = re.split(r"[\.-]", shape_name.upper())
+
+    # numbered GPU families map to fixed GPU models; checked in this order
+    for family, model in (("GPU4", "A100"), ("GPU3", "V100"), ("GPU2", "P100")):
+        if family in tokens:
+            return model
+
+    if "GPU" not in tokens:
+        return None
+    following = tokens[tokens.index("GPU") + 1 :]
+    if not following:
+        # "GPU" is the last part, so there is no model name to look up
+        return None
+    candidate = following[0]
+    return next((gpu.name for gpu in KNOWN_GPUS if gpu.name.upper() == candidate), None)
diff --git a/src/integrity_tests/test_oci.py b/src/integrity_tests/test_oci.py
@@ -0,0 +1,35 @@
+import csv
+from operator import itemgetter
+from pathlib import Path
+from typing import List
+
+import pytest
+
+
+@pytest.fixture
+def data_rows(catalog_dir: Path) -> List[dict]:
+    """Load every row of ``oci.csv`` from ``catalog_dir`` as a dict keyed by column name."""
+    # the csv module requires newline="" so that newlines embedded in quoted
+    # fields are interpreted correctly on every platform
+    with open(catalog_dir / "oci.csv", newline="") as f:
+        return list(csv.DictReader(f))
+
+
+@pytest.mark.parametrize("gpu", ["P100", "V100", "A10", ""])
+def test_gpu_present(gpu: str, data_rows: List[dict]):
+    """Each expected GPU name (and the empty name) appears in at least one row."""
+    assert any(row["gpu_name"] == gpu for row in data_rows)
+
+
+def test_on_demand_present(data_rows: List[dict]):
+    """At least one row has ``spot`` equal to ``"False"``."""
+    assert any(row["spot"] == "False" for row in data_rows)
+
+
+def test_vm_present(data_rows: List[dict]):
+    """At least one instance name starts with ``VM``."""
+    instance_names = (row["instance_name"] for row in data_rows)
+    assert any(name.startswith("VM") for name in instance_names)
+
+
+def test_quantity_decreases_as_query_complexity_increases(data_rows: List[dict]):
+    """Row counts shrink strictly: all rows > 0-or-1 GPU rows > 0 GPU rows > 1 GPU rows."""
+    gpu_counts = [int(row["gpu_count"]) for row in data_rows]
+    zero_gpu = gpu_counts.count(0)
+    one_gpu = gpu_counts.count(1)
+    zero_or_one_gpu = zero_gpu + one_gpu
+
+    assert len(data_rows) > zero_or_one_gpu
+    assert zero_or_one_gpu > zero_gpu
+    assert zero_gpu > one_gpu