diff --git a/changes/2852.feature.md b/changes/2852.feature.md new file mode 100644 index 0000000000..b793bd86b8 --- /dev/null +++ b/changes/2852.feature.md @@ -0,0 +1 @@ +Add support for setting a timeout when pulling Docker images and upgrade aiodocker to version 0.23.0. diff --git a/python.lock b/python.lock index 2044bf3e27..28fd4b095d 100644 --- a/python.lock +++ b/python.lock @@ -15,7 +15,7 @@ // "SQLAlchemy[postgresql_asyncpg]~=1.4.54", // "aiodataloader-ng~=0.2.1", // "aiodns>=3.2", -// "aiodocker==0.22.1", +// "aiodocker==0.23.0", // "aiofiles~=24.1.0", // "aiohttp_cors~=0.7", // "aiohttp_jinja2~=1.6", @@ -197,40 +197,41 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "4e42d6e6cbf8f2afb431b77208be0f7e81d07eb38c8176f18d58761115b2acf5", - "url": "https://files.pythonhosted.org/packages/55/f6/48a88b2aaf6a3ecade21467770f17bca5716b7cb5446e8d41bde6d300a4a/aiodocker-0.22.1-py3-none-any.whl" + "hash": "8c7ff2fc9e557898ae77bc9c1af8916f269285f230aedf1abbb81436054baed4", + "url": "https://files.pythonhosted.org/packages/f9/dc/7a34f2a50fef8a3e7e02618b8fec516fa29a91d2fe264ab49514f9affc82/aiodocker-0.23.0-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "fb969fdf2ac574d800ddf132940e0337fe459e191b8024ef66ec8328effa63ce", - "url": "https://files.pythonhosted.org/packages/55/5e/9e61e7604ac586c9f0b0fa7f8b6a0c25e8b15c9478ca55969cc43f258167/aiodocker-0.22.1.tar.gz" + "hash": "45ede291063c7d1c24e78a766013c25e85b354a3bdcca68fe2bca64348e4dee2", + "url": "https://files.pythonhosted.org/packages/78/cb/c0fa4944604a182db4c062fea84fbdd77d2e508932dd0052ec784040ac13/aiodocker-0.23.0.tar.gz" } ], "project_name": "aiodocker", "requires_dists": [ - "aiohttp==3.9.5; extra == \"ci\"", + "aiohttp==3.10.5; extra == \"ci\"", "aiohttp>=3.8", - "alabaster==0.7.16; extra == \"doc\"", + "alabaster==1.0.0; extra == \"doc\"", "async-timeout==4.0.3; extra == \"ci\"", + "async-timeout==4.0.3; extra == \"dev\"", "codecov==2.1.13; extra == \"dev\"", "multidict==6.0.5; extra == \"ci\"", - 
"mypy==1.10.1; extra == \"dev\"", + "mypy==1.11.2; extra == \"dev\"", "packaging==24.1; extra == \"dev\"", "pre-commit>=3.5.0; extra == \"dev\"", - "pytest-asyncio==0.23.7; extra == \"dev\"", + "pytest-asyncio==0.24.0; extra == \"dev\"", "pytest-cov==5.0.0; extra == \"dev\"", "pytest-sugar==1.0.0; extra == \"dev\"", - "pytest==8.2.2; extra == \"dev\"", + "pytest==8.3.2; extra == \"dev\"", "ruff-lsp==0.0.54; extra == \"dev\"", - "ruff==0.5.0; extra == \"dev\"", - "sphinx-autodoc-typehints==2.2.2; extra == \"doc\"", - "sphinx==7.3.7; extra == \"doc\"", + "ruff==0.6.3; extra == \"dev\"", + "sphinx-autodoc-typehints==2.4.4; extra == \"doc\"", + "sphinx==8.0.2; extra == \"doc\"", "sphinxcontrib-asyncio==0.3.0; extra == \"doc\"", - "towncrier==23.11.0; extra == \"dev\"", - "yarl==1.9.4; extra == \"ci\"" + "towncrier==24.8.0; extra == \"dev\"", + "yarl==1.11.1; extra == \"ci\"" ], "requires_python": ">=3.8.0", - "version": "0.22.1" + "version": "0.23.0" }, { "artifacts": [ @@ -1034,36 +1035,36 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "ecba4362f82e23ef775c72b3e6fdef3ef68443629b79e88886d5088302ffc050", - "url": "https://files.pythonhosted.org/packages/2d/5f/94b0310a492dd97b70c927f67c189e339b5b09504bf251144eed913f766f/boto3-1.35.23-py3-none-any.whl" + "hash": "97fcc1a14cbc759e4ba9535ced703a99fcf652c9c4b8dfcd06f292c80551684b", + "url": "https://files.pythonhosted.org/packages/5a/d2/3e0071e8ca4ceec9c9199b5cccec570930f77d0a20aba6c0d352eeffd6c8/boto3-1.35.24-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "3fbf1d5b749c92ed43aa190650979dff9f83790a42522e1e9eefa54c8e44bc4b", - "url": "https://files.pythonhosted.org/packages/09/31/aa8f565871e00264874bf220ab9913a168fe8acf8b19f7c1a344d1623104/boto3-1.35.23.tar.gz" + "hash": "be7807f30f26d6c0057e45cfd09dad5968e664488bf4f9138d0bb7a0f6d8ed40", + "url": "https://files.pythonhosted.org/packages/c2/e4/b3438c3493a5b534f86308809029dc72c854b6007c331c03893345799a35/boto3-1.35.24.tar.gz" } ], "project_name": "boto3", 
"requires_dists": [ - "botocore<1.36.0,>=1.35.23", + "botocore<1.36.0,>=1.35.24", "botocore[crt]<2.0a0,>=1.21.0; extra == \"crt\"", "jmespath<2.0.0,>=0.7.1", "s3transfer<0.11.0,>=0.10.0" ], "requires_python": ">=3.8", - "version": "1.35.23" + "version": "1.35.24" }, { "artifacts": [ { "algorithm": "sha256", - "hash": "cab9ec4e0367b9f33f0bc02c5a29f587b0119ecffd6d125bacee085dcbc8817d", - "url": "https://files.pythonhosted.org/packages/f8/81/90e1b82697d849e4a5e7e6dcf21ef7ba9fa902b98324849bd2956e6efac3/botocore-1.35.23-py3-none-any.whl" + "hash": "eb9ccc068255cc3d24c36693fda6aec7786db05ae6c2b13bcba66dce6a13e2e3", + "url": "https://files.pythonhosted.org/packages/f5/84/e8a1220f2fcf06c68970c8ddfe0687cc4eb967c0ad219de5dfed65dd3958/botocore-1.35.24-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "25b17a9ccba6ad32bb5bf7ba4f52656aa03c1cb29f6b4e438050ee4ad1967a3b", - "url": "https://files.pythonhosted.org/packages/9a/7a/1c9a1b478c4cdafae166572d5dc2aff93cd34c04fdfbfb0772cf1fccfcfa/botocore-1.35.23.tar.gz" + "hash": "1e59b0f14f4890c4f70bd6a58a634b9464bed1c4c6171f87c8795d974ade614b", + "url": "https://files.pythonhosted.org/packages/44/68/8c6e4e8d7ec73f4daa0a1411dd0b3efcb06ed77c8d02ae95c90b85afdcbc/botocore-1.35.24.tar.gz" } ], "project_name": "botocore", @@ -1075,7 +1076,7 @@ "urllib3<1.27,>=1.25.4; python_version < \"3.10\"" ], "requires_python": ">=3.8", - "version": "1.35.23" + "version": "1.35.24" }, { "artifacts": [ @@ -1144,13 +1145,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "ec8ce8fdc725de9d07547cd616f968670687c6fa7a2e263b088370c46d834d97", - "url": "https://files.pythonhosted.org/packages/ca/a1/90fa8e601c28937a8426eaae853e0009807e6287c7bf03fe7af4296ec510/cattrs-24.1.1-py3-none-any.whl" + "hash": "67c7495b760168d931a10233f979b28dc04daf853b30752246f4f8471c6d68d0", + "url": "https://files.pythonhosted.org/packages/c8/d5/867e75361fc45f6de75fe277dd085627a9db5ebb511a87f27dc1396b5351/cattrs-24.1.2-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": 
"16e94a13f9aaf6438bd5be5df521e072b1b00481b4cf807bcb1acbd49f814c08", - "url": "https://files.pythonhosted.org/packages/3c/ba/08912e7e6e796fa7d5da1aaf3f53235ee6b2a73ec51d93bdf69b77b1c0d1/cattrs-24.1.1.tar.gz" + "hash": "8028cfe1ff5382df59dd36474a86e02d817b06eaf8af84555441bac915d2ef85", + "url": "https://files.pythonhosted.org/packages/64/65/af6d57da2cb32c076319b7489ae0958f746949d407109e3ccf4d115f147c/cattrs-24.1.2.tar.gz" } ], "project_name": "cattrs", @@ -1168,7 +1169,7 @@ "ujson>=5.7.0; extra == \"ujson\"" ], "requires_python": ">=3.8", - "version": "24.1.1" + "version": "24.1.2" }, { "artifacts": [ @@ -1846,48 +1847,48 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "a53dfe8f82b715319e9953330fa5c8708b610d48b5c59f1316337302af5c0811", - "url": "https://files.pythonhosted.org/packages/a2/90/912a1227a841d5df57d6dbe5730e049d5fd38c902c3940e45222360ca336/greenlet-3.1.0-cp312-cp312-musllinux_1_1_x86_64.whl" + "hash": "23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942", + "url": "https://files.pythonhosted.org/packages/38/f9/c0a0eb61bdf808d23266ecf1d63309f0e1471f284300ce6dac0ae1231881/greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl" }, { "algorithm": "sha256", - "hash": "24fc216ec7c8be9becba8b64a98a78f9cd057fd2dc75ae952ca94ed8a893bf27", - "url": "https://files.pythonhosted.org/packages/58/a8/a54a8816187e55f42fa135419efe3a88a2749f75ed4169abc6bf300ce0a9/greenlet-3.1.0-cp312-cp312-macosx_11_0_universal2.whl" + "hash": "b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0", + "url": "https://files.pythonhosted.org/packages/19/c5/36384a06f748044d06bdd8776e231fadf92fc896bd12cb1c9f5a1bda9578/greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl" }, { "algorithm": "sha256", - "hash": "b395121e9bbe8d02a750886f108d540abe66075e61e22f7353d9acb0b81be0f0", - "url": "https://files.pythonhosted.org/packages/65/1b/3d91623c3eff61c11799e7f3d6c01f6bfa9bd2d1f0181116fd0b9b108a40/greenlet-3.1.0.tar.gz" + "hash": 
"2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441", + "url": "https://files.pythonhosted.org/packages/27/8f/2a93cd9b1e7107d5c7b3b7816eeadcac2ebcaf6d6513df9abaf0334777f6/greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl" }, { "algorithm": "sha256", - "hash": "c9d86401550b09a55410f32ceb5fe7efcd998bd2dad9e82521713cb148a4a15f", - "url": "https://files.pythonhosted.org/packages/75/4a/c612e5688dbbce6873763642195d9902e04de43914fe415661fe3c435e1e/greenlet-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl" + "hash": "4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467", + "url": "https://files.pythonhosted.org/packages/2f/ff/df5fede753cc10f6a5be0931204ea30c35fa2f2ea7a35b25bdaf4fe40e46/greenlet-3.1.1.tar.gz" }, { "algorithm": "sha256", - "hash": "26811df4dc81271033a7836bc20d12cd30938e6bd2e9437f56fa03da81b0f8fc", - "url": "https://files.pythonhosted.org/packages/77/d5/489ee9a7a9bace162d99c52f347edc14ffa570fdf5684e95d9dc146ba1be/greenlet-3.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl" + "hash": "c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa", + "url": "https://files.pythonhosted.org/packages/51/41/467b12a8c7c1303d20abcca145db2be4e6cd50a951fa30af48b6ec607581/greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl" }, { "algorithm": "sha256", - "hash": "3d07c28b85b350564bdff9f51c1c5007dfb2f389385d1bc23288de51134ca303", - "url": "https://files.pythonhosted.org/packages/89/dc/d2eaaefca5e295ec9cc09c958f7c3086582a6e1d93de31b780e420cbf6dc/greenlet-3.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl" + "hash": "99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36", + "url": "https://files.pythonhosted.org/packages/57/5c/7c6f50cb12be092e1dccb2599be5a942c3416dbcfb76efcf54b3f8be4d8d/greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl" }, { "algorithm": "sha256", - "hash": 
"26d9c1c4f1748ccac0bae1dbb465fb1a795a75aba8af8ca871503019f4285e2a", - "url": "https://files.pythonhosted.org/packages/aa/67/12f51aa488d8778e1b8e9fcaeb25678524eda29a7a133a9263d6449fe011/greenlet-3.1.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl" + "hash": "f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79", + "url": "https://files.pythonhosted.org/packages/66/d4/c8c04958870f482459ab5956c2942c4ec35cac7fe245527f1039837c17a9/greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl" }, { "algorithm": "sha256", - "hash": "243a223c96a4246f8a30ea470c440fe9db1f5e444941ee3c3cd79df119b8eebf", - "url": "https://files.pythonhosted.org/packages/e8/65/577971a48f06ebd2f759466b4c1c59cd4dc901ec43f1a775207430ad80b9/greenlet-3.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl" + "hash": "4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d", + "url": "https://files.pythonhosted.org/packages/7d/ec/bad1ac26764d26aa1353216fcbfa4670050f66d445448aafa227f8b16e80/greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl" }, { "algorithm": "sha256", - "hash": "cd468ec62257bb4544989402b19d795d2305eccb06cde5da0eb739b63dc04665", - "url": "https://files.pythonhosted.org/packages/fb/e8/9374e77fc204973d6d901c8bb2d7cb223e81513754874cbee6cc5c5fc0ba/greenlet-3.1.0-cp312-cp312-musllinux_1_1_aarch64.whl" + "hash": "1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9", + "url": "https://files.pythonhosted.org/packages/f1/66/033e58a50fd9ec9df00a8671c74f1f3a320564c6415a4ed82a1c651654ba/greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl" } ], "project_name": "greenlet", @@ -1898,7 +1899,7 @@ "psutil; extra == \"test\"" ], "requires_python": ">=3.7", - "version": "3.1.0" + "version": "3.1.1" }, { "artifacts": [ @@ -4909,7 +4910,7 @@ "SQLAlchemy[postgresql_asyncpg]~=1.4.54", "aiodataloader-ng~=0.2.1", "aiodns>=3.2", - "aiodocker==0.22.1", + "aiodocker==0.23.0", "aiofiles~=24.1.0", "aiohttp_cors~=0.7", 
"aiohttp_jinja2~=1.6", diff --git a/requirements.txt b/requirements.txt index 874c22bd55..9ccdc9944e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ aiodataloader-ng~=0.2.1 -aiodocker==0.22.1 +aiodocker==0.23.0 aiofiles~=24.1.0 aiohttp~=3.10.5 aiohttp_cors~=0.7 diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index ec68c7446d..6c0799439e 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1600,7 +1600,13 @@ async def push_image(self, image_ref: ImageRef, registry_conf: ImageRegistry) -> """ @abstractmethod - async def pull_image(self, image_ref: ImageRef, registry_conf: ImageRegistry) -> None: + async def pull_image( + self, + image_ref: ImageRef, + registry_conf: ImageRegistry, + *, + timeout: float | None, + ) -> None: """ Pull the given image from the given registry. """ @@ -1835,11 +1841,26 @@ async def create_kernel( kernel_config["image"]["digest"], AutoPullBehavior(kernel_config.get("auto_pull", "digest")), ) + image_pull_timeout = cast( + float | None, self.local_config["agent"]["api"]["pull-timeout"] + ) if do_pull: await self.produce_event( KernelPullingEvent(kernel_id, session_id, ctx.image_ref.canonical), ) - await self.pull_image(ctx.image_ref, kernel_config["image"]["registry"]) + try: + await self.pull_image( + ctx.image_ref, + kernel_config["image"]["registry"], + timeout=image_pull_timeout, + ) + except asyncio.TimeoutError: + log.exception( + f"Image pull timeout after {image_pull_timeout} seconds. Destroying kernel (k:{kernel_id}, img:{ctx.image_ref.canonical})" + ) + raise AgentError( + f"Image pull timeout after {image_pull_timeout} seconds. 
(img:{ctx.image_ref.canonical})" + ) if not restarting: await self.produce_event( diff --git a/src/ai/backend/agent/config.py b/src/ai/backend/agent/config.py index b056fbd729..6480602d7b 100644 --- a/src/ai/backend/agent/config.py +++ b/src/ai/backend/agent/config.py @@ -147,11 +147,21 @@ "chunk-size": "64K", # used when storing logs to Redis as a side-channel to the event bus } +DEFAULT_PULL_TIMEOUT = 2 * 60 * 60 # 2 hours + +default_api_config = { + "pull-timeout": DEFAULT_PULL_TIMEOUT, +} + agent_etcd_config_iv = t.Dict({ t.Key("container-logs", default=default_container_logs_config): t.Dict({ t.Key("max-length", default=default_container_logs_config["max-length"]): tx.BinarySize(), t.Key("chunk-size", default=default_container_logs_config["chunk-size"]): tx.BinarySize(), }).allow_extra("*"), + t.Key("api", default=default_api_config): t.Dict({ + t.Key("pull-timeout", default=default_api_config["pull-timeout"]): tx.ToNone + | t.ToFloat[0:], # Set the image pull timeout in seconds + }).allow_extra("*"), }).allow_extra("*") container_etcd_config_iv = t.Dict({ diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index 2477bb56b1..eb47115c2e 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -1383,7 +1383,13 @@ async def push_image(self, image_ref: ImageRef, registry_conf: ImageRegistry) -> async with closing_async(Docker()) as docker: await docker.images.push(image_ref.canonical, auth=auth_config) - async def pull_image(self, image_ref: ImageRef, registry_conf: ImageRegistry) -> None: + async def pull_image( + self, + image_ref: ImageRef, + registry_conf: ImageRegistry, + *, + timeout: float | None, + ) -> None: auth_config = None reg_user = registry_conf.get("username") reg_passwd = registry_conf.get("password") @@ -1396,7 +1402,7 @@ async def pull_image(self, image_ref: ImageRef, registry_conf: ImageRegistry) -> } log.info("pulling image {} from registry", image_ref.canonical) async 
with closing_async(Docker()) as docker: - await docker.images.pull(image_ref.canonical, auth=auth_config) + await docker.images.pull(image_ref.canonical, auth=auth_config, timeout=timeout) async def check_image( self, image_ref: ImageRef, image_id: str, auto_pull: AutoPullBehavior diff --git a/src/ai/backend/agent/dummy/agent.py b/src/ai/backend/agent/dummy/agent.py index 39e308bca0..886a893f1e 100644 --- a/src/ai/backend/agent/dummy/agent.py +++ b/src/ai/backend/agent/dummy/agent.py @@ -272,7 +272,13 @@ async def scan_images(self) -> Mapping[str, str]: await asyncio.sleep(delay) return {} - async def pull_image(self, image_ref: ImageRef, registry_conf: ImageRegistry) -> None: + async def pull_image( + self, + image_ref: ImageRef, + registry_conf: ImageRegistry, + *, + timeout: float | None, + ) -> None: delay = self.dummy_agent_cfg["delay"]["pull-image"] await asyncio.sleep(delay) diff --git a/src/ai/backend/agent/kubernetes/agent.py b/src/ai/backend/agent/kubernetes/agent.py index f3d88c1c2d..9dd36efb7b 100644 --- a/src/ai/backend/agent/kubernetes/agent.py +++ b/src/ai/backend/agent/kubernetes/agent.py @@ -1001,7 +1001,13 @@ async def handle_agent_socket(self): # TODO: Add support for remote agent socket mechanism pass - async def pull_image(self, image_ref: ImageRef, registry_conf: ImageRegistry) -> None: + async def pull_image( + self, + image_ref: ImageRef, + registry_conf: ImageRegistry, + *, + timeout: float | None, + ) -> None: # TODO: Add support for appropriate image pulling mechanism on K8s pass diff --git a/src/ai/backend/common/validators.py b/src/ai/backend/common/validators.py index 46e26704be..53b7fde826 100644 --- a/src/ai/backend/common/validators.py +++ b/src/ai/backend/common/validators.py @@ -673,6 +673,19 @@ def check_and_return(self, value: Any) -> set: self._failure("value must be Iterable") +class ToNone(t.Trafaret): + allowed_values = ("none", "null", "nil") + + def check_and_return(self, value: Any) -> None: + if value is None: + return 
None + _value = str(value).strip().lower() + if _value in self.allowed_values: + return None + else: + self._failure(f"value must be one of {self.allowed_values}") + + class Delay(t.Trafaret): """ Convert a float or a tuple of 2 floats into a random generated float value