Skip to content

Commit 1c56561

Browse files
authored
Support Runpod Instant Clusters (#3214)
* Fix runpod type annotations
* Add _generate_create_cluster_mutation
* feat: add create_cluster method to RunpodApiClient
* feat: add delete_cluster method to RunpodApiClient
* Use keyword arguments
* Implement run_jobs and terminate_compute_group
* Prototype compute.run_jobs calling
* Prototype compute group provisioning for multinode tasks
* Add JobModel.waiting_master_job
* Add ComputeGroupModel
* Implement process_compute_groups to terminate compute groups
* Remove todo
* Fix comments
* Set internal_ip
* Support Runpod Clusters offers
* Respect supported pod_counts
* Support registry_auth
* Fix tests
* Add feature flag DSTACK_FF_RUNPOD_CLUSTER_OFFERS_ENABLED
* Remove hardcoded template_id
* Use BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT
1 parent 34d363a commit 1c56561

File tree

30 files changed

+1282
-216
lines changed

30 files changed

+1282
-216
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ include = [
8484
"src/dstack/_internal/server",
8585
"src/dstack/_internal/core/services",
8686
"src/dstack/_internal/core/backends/kubernetes",
87+
"src/dstack/_internal/core/backends/runpod",
8788
"src/dstack/_internal/cli/services/configurators",
8889
"src/dstack/_internal/cli/commands",
8990
]

src/dstack/_internal/core/backends/base/compute.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@
1717
from gpuhunt import CPUArchitecture
1818

1919
from dstack._internal import settings
20+
from dstack._internal.core.backends.base.models import JobConfiguration
2021
from dstack._internal.core.backends.base.offers import OfferModifier, filter_offers_by_requirements
2122
from dstack._internal.core.consts import (
2223
DSTACK_RUNNER_HTTP_PORT,
2324
DSTACK_RUNNER_SSH_PORT,
2425
DSTACK_SHIM_HTTP_PORT,
2526
)
2627
from dstack._internal.core.models.backends.base import BackendType
28+
from dstack._internal.core.models.compute_groups import ComputeGroup, ComputeGroupProvisioningData
2729
from dstack._internal.core.models.configurations import LEGACY_REPO_DIR
2830
from dstack._internal.core.models.gateways import (
2931
GatewayComputeConfiguration,
@@ -324,6 +326,23 @@ def _restrict_instance_offer_az_to_volumes_az(
324326
]
325327

326328

329+
class ComputeWithGroupProvisioningSupport(ABC):
330+
@abstractmethod
331+
def run_jobs(
332+
self,
333+
run: Run,
334+
job_configurations: List[JobConfiguration],
335+
instance_offer: InstanceOfferWithAvailability,
336+
project_ssh_public_key: str,
337+
project_ssh_private_key: str,
338+
) -> ComputeGroupProvisioningData:
339+
pass
340+
341+
@abstractmethod
342+
def terminate_compute_group(self, compute_group: ComputeGroup):
343+
pass
344+
345+
327346
class ComputeWithPrivilegedSupport:
328347
"""
329348
Must be subclassed to support runs with `privileged: true`.

src/dstack/_internal/core/backends/base/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
11
from pathlib import Path
2+
from typing import List
3+
4+
from dstack._internal.core.models.common import CoreModel
5+
from dstack._internal.core.models.runs import Job
6+
from dstack._internal.core.models.volumes import Volume
7+
8+
9+
class JobConfiguration(CoreModel):
10+
job: Job
11+
volumes: List[Volume]
212

313

414
def fill_data(values: dict, filename_field: str = "filename", data_field: str = "data") -> dict:

src/dstack/_internal/core/backends/base/offers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"gcp-a4",
2626
"gcp-g4",
2727
"gcp-dws-calendar-mode",
28+
"runpod-cluster",
2829
]
2930

3031

src/dstack/_internal/core/backends/features.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from dstack._internal.core.backends.base.compute import (
22
ComputeWithCreateInstanceSupport,
33
ComputeWithGatewaySupport,
4+
ComputeWithGroupProvisioningSupport,
45
ComputeWithMultinodeSupport,
56
ComputeWithPlacementGroupSupport,
67
ComputeWithPrivateGatewaySupport,
@@ -39,6 +40,10 @@ def _get_backends_with_compute_feature(
3940
configurator_classes=_configurator_classes,
4041
compute_feature_class=ComputeWithCreateInstanceSupport,
4142
)
43+
BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT = _get_backends_with_compute_feature(
44+
configurator_classes=_configurator_classes,
45+
compute_feature_class=ComputeWithGroupProvisioningSupport,
46+
)
4247
BACKENDS_WITH_PRIVILEGED_SUPPORT = _get_backends_with_compute_feature(
4348
configurator_classes=_configurator_classes,
4449
compute_feature_class=ComputeWithPrivilegedSupport,

0 commit comments

Comments
 (0)