Skip to content

Commit

Permalink
Allow to customize AMIs used by AWS backend (#1920)
Browse files Browse the repository at this point in the history
Closes: #1913
  • Loading branch information
un-def authored Oct 31, 2024
1 parent b8b0e04 commit 6959dab
Show file tree
Hide file tree
Showing 7 changed files with 248 additions and 13 deletions.
52 changes: 52 additions & 0 deletions docs/docs/reference/server/config.yml.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,42 @@ There are two ways to configure AWS: using an access key or using the default cr
Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets.
Additionally, private subnets must have outbound internet connectivity provided by NAT Gateway, Transit Gateway, or other mechanism.

??? info "OS images"
By default, `dstack` uses its own [AMI :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html)
optimized for `dstack`.
To use your own or other third-party images, set the `os_images` property:

```yaml
projects:
- name: main
backends:
- type: aws
creds:
type: default

os_images:
cpu:
name: my-ami-for-cpu-instances
owner: self
user: dstack
nvidia:
name: 'Some ThirdParty CUDA image'
owner: 123456789012
user: ubuntu
```

Here, both the `cpu` and `nvidia` properties are optional, but if a property is not set, you won't be able to use the corresponding instance types.

The `name` is an AMI name.
The `owner` is either an AWS account ID (a 12-digit number) or a special value `self` indicating the current account.
The `user` specifies an OS user for instance provisioning.

!!! info "Image requirements"
* SSH server listening on port 22
* `user` with passwordless sudo access
* Docker is installed
* (For NVIDIA instances) NVIDIA/CUDA drivers and NVIDIA Container Toolkit are installed

#### Azure

There are two ways to configure Azure: using a client secret or using the default credentials.
Expand Down Expand Up @@ -920,6 +956,22 @@ See the [reference table](#default-permissions) for all configurable permissions
type:
required: true

## `projects[n].backends[type=aws].os_images` { #_aws-os-images data-toc-label="backends[type=aws].os_images" }

#SCHEMA# dstack._internal.core.models.backends.aws.AWSOSImageConfig
overrides:
show_root_heading: false
type:
required: true

## `projects[n].backends[type=aws].os_images.*` { #_aws-os-image data-toc-label="backends[type=aws].os_images.*" }

#SCHEMA# dstack._internal.core.models.backends.aws.AWSOSImage
overrides:
show_root_heading: false
type:
required: true

## `projects[n].backends[type=azure]` { #_azure data-toc-label="backends[type=azure]" }

#SCHEMA# dstack._internal.server.services.config.AzureConfig
Expand Down
12 changes: 7 additions & 5 deletions src/dstack/_internal/core/backends/aws/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,15 @@ def create_instance(
tried_availability_zones.add(az)
try:
logger.debug("Trying provisioning %s in %s", instance_offer.instance.name, az)
image_id, username = aws_resources.get_image_id_and_username(
ec2_client=ec2_client,
cuda=len(instance_offer.instance.resources.gpus) > 0,
image_config=self.config.os_images,
)
response = ec2_resource.create_instances(
**aws_resources.create_instances_struct(
disk_size=disk_size,
image_id=aws_resources.get_image_id(
ec2_client=ec2_client,
cuda=len(instance_offer.instance.resources.gpus) > 0,
),
image_id=image_id,
instance_type=instance_offer.instance.name,
iam_instance_profile_arn=None,
user_data=get_user_data(authorized_keys=instance_config.get_public_keys()),
Expand Down Expand Up @@ -211,7 +213,7 @@ def create_instance(
region=instance_offer.region,
availability_zone=az,
price=instance_offer.price,
username="ubuntu",
username=username,
ssh_port=22,
dockerized=True, # because `dstack-shim docker` is used
ssh_proxy=None,
Expand Down
35 changes: 29 additions & 6 deletions src/dstack/_internal/core/backends/aws/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,45 @@

import dstack.version as version
from dstack._internal.core.errors import BackendError, ComputeError, ComputeResourceNotFoundError
from dstack._internal.core.models.backends.aws import AWSOSImageConfig
from dstack._internal.utils.logging import get_logger

logger = get_logger(__name__)

DSTACK_ACCOUNT_ID = "142421590066"

def get_image_id(ec2_client: botocore.client.BaseClient, cuda: bool) -> str:
image_name = (
f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}"
)

response = ec2_client.describe_images(Filters=[{"Name": "name", "Values": [image_name]}])
def get_image_id_and_username(
    ec2_client: botocore.client.BaseClient,
    cuda: bool,
    image_config: Optional[AWSOSImageConfig] = None,
) -> tuple[str, str]:
    """Resolve the AMI ID and the OS user to use for a new instance.

    Args:
        ec2_client: EC2 client; only `describe_images` is called on it.
        cuda: Selects the `nvidia` image when true, the `cpu` image otherwise.
        image_config: Optional user-supplied image settings. When omitted,
            the `dstack-[cuda-]<version.base_image>` image owned by
            `DSTACK_ACCOUNT_ID` is looked up and the user is "ubuntu".

    Returns:
        A `(image_id, username)` pair for the most recently created image
        that is in the "available" state.

    Raises:
        ComputeResourceNotFoundError: If `image_config` is given but has no
            entry for the requested category, or if no available image
            matches the name and owner.
    """
    if image_config is not None:
        image = image_config.nvidia if cuda else image_config.cpu
        if image is None:
            logger.warning("%s image not configured", "nvidia" if cuda else "cpu")
            raise ComputeResourceNotFoundError()
        image_name = image.name
        image_owner = image.owner
        username = image.user
    else:
        image_name = (
            f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}"
        )
        image_owner = DSTACK_ACCOUNT_ID
        username = "ubuntu"
    # Restricting by owner avoids picking up a same-named image from another account.
    response = ec2_client.describe_images(
        Filters=[{"Name": "name", "Values": [image_name]}], Owners=[image_owner]
    )
    # Only the newest available image is needed, so take the maximum by
    # CreationDate instead of sorting the whole list.
    newest = max(
        (i for i in response["Images"] if i["State"] == "available"),
        key=lambda i: i["CreationDate"],
        default=None,
    )
    if newest is None:
        logger.warning("image '%s' not found", image_name)
        raise ComputeResourceNotFoundError()
    return newest["ImageId"], username


def create_security_group(
Expand Down
18 changes: 18 additions & 0 deletions src/dstack/_internal/core/models/backends/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,22 @@
from dstack._internal.core.models.common import CoreModel


# Describes one AMI: the name to look it up by, the account that owns it,
# and the OS user to provision instances with.
class AWSOSImage(CoreModel):
    # Required: the AMI name.
    name: Annotated[str, Field(description="AMI name")]
    # Must be exactly 12 digits or the literal `self` (enforced by the regex);
    # falls back to "self" when omitted.
    owner: Annotated[
        str,
        Field(regex=r"^(\d{12}|self)$", description="AMI owner, account ID or `self`"),
    ] = "self"
    # Required: there is no default OS user for a custom image.
    user: Annotated[str, Field(description="OS user for provisioning")]


# Maps instance categories (CPU, NVIDIA GPU) to the AMI to use for each.
# Both entries are optional and default to None.
class AWSOSImageConfig(CoreModel):
    # Image for instances without GPUs; None means no CPU image is configured.
    cpu: Annotated[Optional[AWSOSImage], Field(description="AMI used for CPU instances")] = None
    # Image for NVIDIA GPU instances; None means no NVIDIA image is configured.
    nvidia: Annotated[
        Optional[AWSOSImage], Field(description="AMI used for NVIDIA GPU instances")
    ] = None


class AWSConfigInfo(CoreModel):
type: Literal["aws"] = "aws"
regions: Optional[List[str]] = None
Expand All @@ -15,6 +31,7 @@ class AWSConfigInfo(CoreModel):
default_vpcs: Optional[bool] = None
public_ips: Optional[bool] = None
tags: Optional[Dict[str, str]] = None
os_images: Optional[AWSOSImageConfig] = None


class AWSAccessKeyCreds(CoreModel):
Expand Down Expand Up @@ -52,6 +69,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel):
default_vpcs: Optional[bool]
public_ips: Optional[bool]
tags: Optional[Dict[str, str]]
os_images: Optional["AWSOSImageConfig"]


class AWSConfigValues(CoreModel):
Expand Down
8 changes: 7 additions & 1 deletion src/dstack/_internal/server/services/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
ServerClientError,
)
from dstack._internal.core.models.backends import AnyConfigInfoWithCreds, BackendInfoYAML
from dstack._internal.core.models.backends.aws import AnyAWSCreds
from dstack._internal.core.models.backends.aws import AnyAWSCreds, AWSOSImageConfig
from dstack._internal.core.models.backends.azure import AnyAzureCreds
from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.core.models.backends.cudo import AnyCudoCreds
Expand Down Expand Up @@ -107,6 +107,12 @@ class AWSConfig(CoreModel):
Optional[Dict[str, str]],
Field(description="The tags that will be assigned to resources created by `dstack`"),
] = None
os_images: Annotated[
Optional[AWSOSImageConfig],
Field(
description="The mapping of instance categories (CPU, NVIDIA GPU) to AMI configurations"
),
] = None
creds: AnyAWSCreds = Field(..., description="The credentials", discriminator="type")


Expand Down
135 changes: 134 additions & 1 deletion src/tests/_internal/core/backends/aws/test_resources.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import logging
from unittest.mock import Mock

import pytest

from dstack._internal.core.backends.aws.resources import (
_is_valid_tag_key,
_is_valid_tag_value,
get_image_id_and_username,
validate_tags,
)
from dstack._internal.core.errors import BackendError
from dstack._internal.core.errors import BackendError, ComputeResourceNotFoundError
from dstack._internal.core.models.backends.aws import AWSOSImage, AWSOSImageConfig


class TestIsValidTagKey:
Expand Down Expand Up @@ -71,3 +76,131 @@ def test_validate_invalid_tags(self):
tags = {"aws:ReservedKey": "SomeValue", "ValidKey": "Invalid#Value"}
with pytest.raises(BackendError, match="Invalid resource tags"):
validate_tags(tags)


class TestGetImageIdAndUsername:
    """Tests for `get_image_id_and_username` against a mocked EC2 client."""

    @pytest.fixture
    def ec2_client_mock(self) -> Mock:
        """EC2 client stub whose `describe_images` yields one available image."""
        single_available_image = {
            "ImageId": "ami-00000000000000000",
            "State": "available",
            "CreationDate": "2000-01-01T00:00:00.000Z",
        }
        client = Mock(spec_set=["describe_images"])
        client.describe_images.return_value = {"Images": [single_available_image]}
        return client

    def test_returns_the_latest_available(self, ec2_client_mock: Mock):
        """The newest image in the `available` state wins; other states are ignored."""
        latest_but_failed = {
            "ImageId": "ami-00000000000000001",
            "State": "failed",
            "CreationDate": "2024-01-01T00:00:00.000Z",
        }
        available_but_older = {
            "ImageId": "ami-00000000000000002",
            "State": "available",
            "CreationDate": "2022-01-01T00:00:00.000Z",
        }
        latest_available = {
            "ImageId": "ami-00000000000000003",
            "State": "available",
            "CreationDate": "2023-01-01T00:00:00.000Z",
        }
        ec2_client_mock.describe_images.return_value = {
            "Images": [latest_but_failed, available_but_older, latest_available]
        }

        image_id, username = get_image_id_and_username(ec2_client_mock, cuda=False)

        assert username == "ubuntu"
        assert image_id == "ami-00000000000000003"

    def test_raises_resource_not_found_if_none_available(
        self,
        monkeypatch: pytest.MonkeyPatch,
        caplog: pytest.LogCaptureFixture,
        ec2_client_mock: Mock,
    ):
        """With no image in the `available` state, the lookup fails and logs a warning."""
        caplog.set_level(logging.WARNING)
        monkeypatch.setattr("dstack.version.base_image", "0.0")
        failed_image = {
            "ImageId": "ami-00000000000000000",
            "State": "failed",
            "CreationDate": "2000-01-01T00:00:00.000Z",
        }
        ec2_client_mock.describe_images.return_value = {"Images": [failed_image]}

        with pytest.raises(ComputeResourceNotFoundError):
            get_image_id_and_username(ec2_client_mock, cuda=False)

        assert "image 'dstack-0.0' not found" in caplog.text

    @pytest.mark.parametrize(
        ("cuda", "expected"),
        (
            (False, "dstack-0.0"),
            (True, "dstack-cuda-0.0"),
        ),
    )
    def test_uses_dstack_image_name_and_account_id_if_image_config_not_provided(
        self, monkeypatch: pytest.MonkeyPatch, ec2_client_mock: Mock, cuda: bool, expected: str
    ):
        """Without an image config, the dstack image name, owner ID and `ubuntu` user are used."""
        monkeypatch.setattr("dstack.version.base_image", "0.0")

        _, username = get_image_id_and_username(ec2_client_mock, cuda)

        expected_filters = [{"Name": "name", "Values": [expected]}]
        ec2_client_mock.describe_images.assert_called_once_with(
            Filters=expected_filters, Owners=["142421590066"]
        )
        assert username == "ubuntu"

    @pytest.mark.parametrize(
        ("cuda", "expected_name", "expected_owner", "expected_username"),
        (
            (False, "cpu-ami", "123456789012", "debian"),
            (True, "nvidia-ami", "self", "dstack"),
        ),
    )
    def test_uses_image_config_if_provided(
        self,
        ec2_client_mock: Mock,
        cuda: bool,
        expected_name: str,
        expected_owner: str,
        expected_username: str,
    ):
        """A provided image config supplies the name, owner and user for the lookup."""
        cpu_image = AWSOSImage(name="cpu-ami", owner="123456789012", user="debian")
        # No owner given here, so the expected owner for this case is `self`.
        nvidia_image = AWSOSImage(name="nvidia-ami", user="dstack")
        image_config = AWSOSImageConfig(cpu=cpu_image, nvidia=nvidia_image)

        _, username = get_image_id_and_username(ec2_client_mock, cuda, image_config)

        ec2_client_mock.describe_images.assert_called_once_with(
            Filters=[{"Name": "name", "Values": [expected_name]}],
            Owners=[expected_owner],
        )
        assert username == expected_username

    def test_raises_resource_not_found_if_image_config_property_not_set(
        self, caplog: pytest.LogCaptureFixture, ec2_client_mock: Mock
    ):
        """Requesting a category missing from the image config fails with a warning."""
        caplog.set_level(logging.WARNING)
        nvidia_only_config = AWSOSImageConfig(
            nvidia=AWSOSImage(name="nvidia-ami", user="dstack"),
        )

        with pytest.raises(ComputeResourceNotFoundError):
            get_image_id_and_username(
                ec2_client_mock, cuda=False, image_config=nvidia_only_config
            )

        assert "cpu image not configured" in caplog.text
1 change: 1 addition & 0 deletions src/tests/_internal/server/routers/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -1230,6 +1230,7 @@ async def test_returns_config_info(self, test_db, session: AsyncSession, client:
"default_vpcs": None,
"public_ips": None,
"tags": None,
"os_images": None,
"creds": json.loads(backend.auth.plaintext),
}

Expand Down

0 comments on commit 6959dab

Please sign in to comment.