From 6959dab3fc2f97ab203e4454449a3759e4f0618b Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 31 Oct 2024 09:28:46 +0000 Subject: [PATCH] Allow to customize AMIs used by AWS backend (#1920) Closes: https://github.com/dstackai/dstack/issues/1913 --- docs/docs/reference/server/config.yml.md | 52 +++++++ .../_internal/core/backends/aws/compute.py | 12 +- .../_internal/core/backends/aws/resources.py | 35 ++++- .../_internal/core/models/backends/aws.py | 18 +++ .../_internal/server/services/config.py | 8 +- .../core/backends/aws/test_resources.py | 135 +++++++++++++++++- .../_internal/server/routers/test_backends.py | 1 + 7 files changed, 248 insertions(+), 13 deletions(-) diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md index c96b0b84c..1b538a9d2 100644 --- a/docs/docs/reference/server/config.yml.md +++ b/docs/docs/reference/server/config.yml.md @@ -179,6 +179,42 @@ There are two ways to configure AWS: using an access key or using the default cr Using private subnets assumes that both the `dstack` server and users can access the configured VPC's private subnets. Additionally, private subnets must have outbound internet connectivity provided by NAT Gateway, Transit Gateway, or other mechanism. +??? info "OS images" + By default, `dstack` uses its own [AMI :material-arrow-top-right-thin:{ .external }](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) + optimized for `dstack`. + To use your own or other third-party images, set the `os_images` property: + + ```yaml + projects: + - name: main + backends: + - type: aws + creds: + type: default + + os_images: + cpu: + name: my-ami-for-cpu-instances + owner: self + user: dstack + nvidia: + name: 'Some ThirdParty CUDA image' + owner: 123456789012 + user: ubuntu + ``` + + Here, both `cpu` and `nvidia` properties are optional, but if the property is not set, you won't be able to use the corresponding instance types. + + The `name` is an AMI name. 
+ The `owner` is either an AWS account ID (a 12-digit number) or a special value `self` indicating the current account. + The `user` specifies an OS user for instance provisioning. + + !!! info "Image requirements" + * SSH server listening on port 22 + * `user` with passwordless sudo access + * Docker is installed + * (For NVIDIA instances) NVIDIA/CUDA drivers and NVIDIA Container Toolkit are installed + #### Azure There are two ways to configure Azure: using a client secret or using the default credentials. @@ -920,6 +956,22 @@ See the [reference table](#default-permissions) for all configurable permissions type: required: true +## `projects[n].backends[type=aws].os_images` { #_aws-os-images data-toc-label="backends[type=aws].os_images" } + +#SCHEMA# dstack._internal.core.models.backends.aws.AWSOSImageConfig + overrides: + show_root_heading: false + type: + required: true + +## `projects[n].backends[type=aws].os_images.*` { #_aws-os-image data-toc-label="backends[type=aws].os_images.*" } + +#SCHEMA# dstack._internal.core.models.backends.aws.AWSOSImage + overrides: + show_root_heading: false + type: + required: true + ## `projects[n].backends[type=azure]` { #_azure data-toc-label="backends[type=azure]" } #SCHEMA# dstack._internal.server.services.config.AzureConfig diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index 689dfdbde..b52f25f0a 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -170,13 +170,15 @@ def create_instance( tried_availability_zones.add(az) try: logger.debug("Trying provisioning %s in %s", instance_offer.instance.name, az) + image_id, username = aws_resources.get_image_id_and_username( + ec2_client=ec2_client, + cuda=len(instance_offer.instance.resources.gpus) > 0, + image_config=self.config.os_images, + ) response = ec2_resource.create_instances( **aws_resources.create_instances_struct( disk_size=disk_size, - 
image_id=aws_resources.get_image_id( - ec2_client=ec2_client, - cuda=len(instance_offer.instance.resources.gpus) > 0, - ), + image_id=image_id, instance_type=instance_offer.instance.name, iam_instance_profile_arn=None, user_data=get_user_data(authorized_keys=instance_config.get_public_keys()), @@ -211,7 +213,7 @@ def create_instance( region=instance_offer.region, availability_zone=az, price=instance_offer.price, - username="ubuntu", + username=username, ssh_port=22, dockerized=True, # because `dstack-shim docker` is used ssh_proxy=None, diff --git a/src/dstack/_internal/core/backends/aws/resources.py b/src/dstack/_internal/core/backends/aws/resources.py index 729697cf1..fc95cdab0 100644 --- a/src/dstack/_internal/core/backends/aws/resources.py +++ b/src/dstack/_internal/core/backends/aws/resources.py @@ -6,22 +6,45 @@ import dstack.version as version from dstack._internal.core.errors import BackendError, ComputeError, ComputeResourceNotFoundError +from dstack._internal.core.models.backends.aws import AWSOSImageConfig +from dstack._internal.utils.logging import get_logger +logger = get_logger(__name__) + +DSTACK_ACCOUNT_ID = "142421590066" -def get_image_id(ec2_client: botocore.client.BaseClient, cuda: bool) -> str: - image_name = ( - f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}" - ) - response = ec2_client.describe_images(Filters=[{"Name": "name", "Values": [image_name]}]) +def get_image_id_and_username( + ec2_client: botocore.client.BaseClient, + cuda: bool, + image_config: Optional[AWSOSImageConfig] = None, +) -> tuple[str, str]: + if image_config is not None: + image = image_config.nvidia if cuda else image_config.cpu + if image is None: + logger.warning("%s image not configured", "nvidia" if cuda else "cpu") + raise ComputeResourceNotFoundError() + image_name = image.name + image_owner = image.owner + username = image.user + else: + image_name = ( + f"dstack-{version.base_image}" if not cuda else 
f"dstack-cuda-{version.base_image}" + ) + image_owner = DSTACK_ACCOUNT_ID + username = "ubuntu" + response = ec2_client.describe_images( + Filters=[{"Name": "name", "Values": [image_name]}], Owners=[image_owner] + ) images = sorted( (i for i in response["Images"] if i["State"] == "available"), key=lambda i: i["CreationDate"], reverse=True, ) if not images: + logger.warning("image '%s' not found", image_name) raise ComputeResourceNotFoundError() - return images[0]["ImageId"] + return images[0]["ImageId"], username def create_security_group( diff --git a/src/dstack/_internal/core/models/backends/aws.py b/src/dstack/_internal/core/models/backends/aws.py index b91220a59..60be08326 100644 --- a/src/dstack/_internal/core/models/backends/aws.py +++ b/src/dstack/_internal/core/models/backends/aws.py @@ -7,6 +7,22 @@ from dstack._internal.core.models.common import CoreModel +class AWSOSImage(CoreModel): + name: Annotated[str, Field(description="AMI name")] + owner: Annotated[ + str, + Field(regex=r"^(\d{12}|self)$", description="AMI owner, account ID or `self`"), + ] = "self" + user: Annotated[str, Field(description="OS user for provisioning")] + + +class AWSOSImageConfig(CoreModel): + cpu: Annotated[Optional[AWSOSImage], Field(description="AMI used for CPU instances")] = None + nvidia: Annotated[ + Optional[AWSOSImage], Field(description="AMI used for NVIDIA GPU instances") + ] = None + + class AWSConfigInfo(CoreModel): type: Literal["aws"] = "aws" regions: Optional[List[str]] = None @@ -15,6 +31,7 @@ class AWSConfigInfo(CoreModel): default_vpcs: Optional[bool] = None public_ips: Optional[bool] = None tags: Optional[Dict[str, str]] = None + os_images: Optional[AWSOSImageConfig] = None class AWSAccessKeyCreds(CoreModel): @@ -52,6 +69,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel): default_vpcs: Optional[bool] public_ips: Optional[bool] tags: Optional[Dict[str, str]] + os_images: Optional["AWSOSImageConfig"] class AWSConfigValues(CoreModel): diff --git 
a/src/dstack/_internal/server/services/config.py b/src/dstack/_internal/server/services/config.py index c50ca6d26..6bb0016a0 100644 --- a/src/dstack/_internal/server/services/config.py +++ b/src/dstack/_internal/server/services/config.py @@ -12,7 +12,7 @@ ServerClientError, ) from dstack._internal.core.models.backends import AnyConfigInfoWithCreds, BackendInfoYAML -from dstack._internal.core.models.backends.aws import AnyAWSCreds +from dstack._internal.core.models.backends.aws import AnyAWSCreds, AWSOSImageConfig from dstack._internal.core.models.backends.azure import AnyAzureCreds from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.backends.cudo import AnyCudoCreds @@ -107,6 +107,12 @@ class AWSConfig(CoreModel): Optional[Dict[str, str]], Field(description="The tags that will be assigned to resources created by `dstack`"), ] = None + os_images: Annotated[ + Optional[AWSOSImageConfig], + Field( + description="The mapping of instance categories (CPU, NVIDIA GPU) to AMI configurations" + ), + ] = None creds: AnyAWSCreds = Field(..., description="The credentials", discriminator="type") diff --git a/src/tests/_internal/core/backends/aws/test_resources.py b/src/tests/_internal/core/backends/aws/test_resources.py index 7b23bbf5b..3d6fe8b15 100644 --- a/src/tests/_internal/core/backends/aws/test_resources.py +++ b/src/tests/_internal/core/backends/aws/test_resources.py @@ -1,11 +1,16 @@ +import logging +from unittest.mock import Mock + import pytest from dstack._internal.core.backends.aws.resources import ( _is_valid_tag_key, _is_valid_tag_value, + get_image_id_and_username, validate_tags, ) -from dstack._internal.core.errors import BackendError +from dstack._internal.core.errors import BackendError, ComputeResourceNotFoundError +from dstack._internal.core.models.backends.aws import AWSOSImage, AWSOSImageConfig class TestIsValidTagKey: @@ -71,3 +76,131 @@ def test_validate_invalid_tags(self): tags = {"aws:ReservedKey": 
"SomeValue", "ValidKey": "Invalid#Value"} with pytest.raises(BackendError, match="Invalid resource tags"): validate_tags(tags) + + +class TestGetImageIdAndUsername: + @pytest.fixture + def ec2_client_mock(self) -> Mock: + mock = Mock(spec_set=["describe_images"]) + mock.describe_images.return_value = { + "Images": [ + { + "ImageId": "ami-00000000000000000", + "State": "available", + "CreationDate": "2000-01-01T00:00:00.000Z", + }, + ], + } + return mock + + def test_returns_the_latest_available(self, ec2_client_mock: Mock): + ec2_client_mock.describe_images.return_value = { + "Images": [ + # the latest, but not available + { + "ImageId": "ami-00000000000000001", + "State": "failed", + "CreationDate": "2024-01-01T00:00:00.000Z", + }, + # available, but not the latest + { + "ImageId": "ami-00000000000000002", + "State": "available", + "CreationDate": "2022-01-01T00:00:00.000Z", + }, + # the latest available + { + "ImageId": "ami-00000000000000003", + "State": "available", + "CreationDate": "2023-01-01T00:00:00.000Z", + }, + ] + } + image_id, username = get_image_id_and_username(ec2_client_mock, cuda=False) + assert image_id == "ami-00000000000000003" + assert username == "ubuntu" + + def test_raises_resource_not_found_if_none_available( + self, + monkeypatch: pytest.MonkeyPatch, + caplog: pytest.LogCaptureFixture, + ec2_client_mock: Mock, + ): + monkeypatch.setattr("dstack.version.base_image", "0.0") + caplog.set_level(logging.WARNING) + ec2_client_mock.describe_images.return_value = { + "Images": [ + { + "ImageId": "ami-00000000000000000", + "State": "failed", + "CreationDate": "2000-01-01T00:00:00.000Z", + }, + ] + } + with pytest.raises(ComputeResourceNotFoundError): + get_image_id_and_username(ec2_client_mock, cuda=False) + assert "image 'dstack-0.0' not found" in caplog.text + + @pytest.mark.parametrize( + ["cuda", "expected"], + [ + [False, "dstack-0.0"], + [True, "dstack-cuda-0.0"], + ], + ) + def 
test_uses_dstack_image_name_and_account_id_if_image_config_not_provided( + self, monkeypatch: pytest.MonkeyPatch, ec2_client_mock: Mock, cuda: bool, expected: str + ): + monkeypatch.setattr("dstack.version.base_image", "0.0") + _, username = get_image_id_and_username(ec2_client_mock, cuda) + assert username == "ubuntu" + ec2_client_mock.describe_images.assert_called_once_with( + Filters=[{"Name": "name", "Values": [expected]}], Owners=["142421590066"] + ) + + @pytest.mark.parametrize( + ["cuda", "expected_name", "expected_owner", "expected_username"], + [ + [False, "cpu-ami", "123456789012", "debian"], + [True, "nvidia-ami", "self", "dstack"], + ], + ) + def test_uses_image_config_if_provided( + self, + ec2_client_mock: Mock, + cuda: bool, + expected_name: str, + expected_owner: str, + expected_username: str, + ): + image_config = AWSOSImageConfig( + cpu=AWSOSImage( + name="cpu-ami", + owner="123456789012", + user="debian", + ), + nvidia=AWSOSImage( + name="nvidia-ami", + user="dstack", + ), + ) + _, username = get_image_id_and_username(ec2_client_mock, cuda, image_config) + assert username == expected_username + ec2_client_mock.describe_images.assert_called_once_with( + Filters=[{"Name": "name", "Values": [expected_name]}], + Owners=[expected_owner], + ) + + def test_raises_resource_not_found_if_image_config_property_not_set( + self, caplog: pytest.LogCaptureFixture, ec2_client_mock: Mock + ): + caplog.set_level(logging.WARNING) + image_config = AWSOSImageConfig( + nvidia=AWSOSImage( + name="nvidia-ami", + user="dstack", + ), + ) + with pytest.raises(ComputeResourceNotFoundError): + get_image_id_and_username(ec2_client_mock, cuda=False, image_config=image_config) + assert "cpu image not configured" in caplog.text diff --git a/src/tests/_internal/server/routers/test_backends.py b/src/tests/_internal/server/routers/test_backends.py index 31871af9d..550e6d072 100644 --- a/src/tests/_internal/server/routers/test_backends.py +++ 
b/src/tests/_internal/server/routers/test_backends.py @@ -1230,6 +1230,7 @@ async def test_returns_config_info(self, test_db, session: AsyncSession, client: "default_vpcs": None, "public_ips": None, "tags": None, + "os_images": None, "creds": json.loads(backend.auth.plaintext), }