From c99e7383ea714f876b9fcf8dd272597e5dcb3202 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 25 Oct 2024 01:05:54 +0900 Subject: [PATCH 01/20] fix: Handle value error when convert shmem to binary size --- src/ai/backend/manager/defs.py | 2 ++ src/ai/backend/manager/registry.py | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/ai/backend/manager/defs.py b/src/ai/backend/manager/defs.py index 7c93275b18..0dc2421579 100644 --- a/src/ai/backend/manager/defs.py +++ b/src/ai/backend/manager/defs.py @@ -88,3 +88,5 @@ class LockID(enum.IntEnum): DEFAULT_KEYPAIR_RESOURCE_POLICY_NAME: Final = "default" DEFAULT_KEYPAIR_RATE_LIMIT: Final = 10000 + +DEFAULT_SHARED_MEMORY_SIZE: Final[str] = "64m" diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index e1bb4679f0..e93904ea0e 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -132,7 +132,7 @@ TooManySessionsMatched, ) from .config import LocalConfig, SharedConfig -from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, INTRINSIC_SLOTS +from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, DEFAULT_SHARED_MEMORY_SIZE, INTRINSIC_SLOTS from .exceptions import MultiAgentError, convert_to_status_data from .models import ( AGENT_RESOURCE_OCCUPYING_KERNEL_STATUSES, @@ -1128,10 +1128,20 @@ async def enqueue_session( # We need to subtract the amount of shared memory from the memory limit of # a container, since tmpfs including /dev/shm uses host-side kernel memory # and cgroup's memory limit does not apply. - shmem = resource_opts.get("shmem", None) - if shmem is None: - shmem = labels.get("ai.backend.resource.preferred.shmem", "64m") - shmem = BinarySize.from_str(shmem) + raw_shmem: Optional[str] = resource_opts.get("shmem") + if raw_shmem is None: + raw_shmem = labels.get("ai.backend.resource.preferred.shmem") + if not raw_shmem: + # raw_shmem is None or empty string ("") + raw_shmem = DEFAULT_SHARED_MEMORY_SIZE + try: + shmem = BinarySize.from_str(raw_shmem) + except ValueError: + log.warning( + f"Failed to convert raw `shmem({raw_shmem})` " + f"to a decimal value. Fallback to default({DEFAULT_SHARED_MEMORY_SIZE})." + ) + shmem = BinarySize.from_str(DEFAULT_SHARED_MEMORY_SIZE) resource_opts["shmem"] = shmem image_min_slots = copy.deepcopy(image_min_slots) image_min_slots["mem"] += shmem From 5b821cf038ad5f1b96ffd48347d74ca4ed8bf368 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 8 Dec 2023 16:54:53 +0900 Subject: [PATCH 02/20] feature: Compare image memory limit to sum of shmem and main mem --- src/ai/backend/manager/registry.py | 33 ++++++++++++------------------ 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index e93904ea0e..d30d8aa542 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -2,7 +2,6 @@ import asyncio import base64 -import copy import itertools import logging import re @@ -1125,26 +1124,20 @@ async def enqueue_session( ) # Shared memory. - # We need to subtract the amount of shared memory from the memory limit of - # a container, since tmpfs including /dev/shm uses host-side kernel memory - # and cgroup's memory limit does not apply. - raw_shmem: Optional[str] = resource_opts.get("shmem") - if raw_shmem is None: - raw_shmem = labels.get("ai.backend.resource.preferred.shmem") - if not raw_shmem: - # raw_shmem is None or empty string ("") - raw_shmem = DEFAULT_SHARED_MEMORY_SIZE - try: - shmem = BinarySize.from_str(raw_shmem) - except ValueError: - log.warning( - f"Failed to convert raw `shmem({raw_shmem})` " - f"to a decimal value. Fallback to default({DEFAULT_SHARED_MEMORY_SIZE})." - ) - shmem = BinarySize.from_str(DEFAULT_SHARED_MEMORY_SIZE) + # The minimum-required/maximum-limited resource size(=MM) for images excludes the shared memory size(=S). + # However, MOST client's session creation requests set S by default and + # our scheduler allocates the sum of the requested main memory size(=M) and S. + # That's why we should compare MM to the sum of M and S, + # which is the same as comparing MM minus S to M. + raw_shmem = ( + resource_opts.get("shmem") + or labels.get("ai.backend.resource.preferred.shmem") + or DEFAULT_SHARED_MEMORY_SIZE + ) + shmem = BinarySize.from_str(raw_shmem) resource_opts["shmem"] = shmem - image_min_slots = copy.deepcopy(image_min_slots) - image_min_slots["mem"] += shmem + image_min_slots["mem"] -= shmem + image_max_slots["mem"] -= shmem # Sanitize user input: does it have resource config? if (resources := creation_config.get("resources")) is not None: From 0bef259085deeba6c3d7edc9544372ec853142a1 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 8 Dec 2023 17:00:15 +0900 Subject: [PATCH 03/20] add news fragment --- changes/1770.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/1770.feature.md diff --git a/changes/1770.feature.md b/changes/1770.feature.md new file mode 100644 index 0000000000..81b5fed1da --- /dev/null +++ b/changes/1770.feature.md @@ -0,0 +1 @@ +When validating parameters of session creation requests, compare the image's memory limit to the sum of the shared memory size and the main memory size. From ec523d94dec72636f49a2adf06e1764bae1704d2 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Mon, 11 Dec 2023 12:13:10 +0900 Subject: [PATCH 04/20] add shmem to requested_slot and log --- src/ai/backend/manager/registry.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index d30d8aa542..f31b50d7f5 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1124,20 +1124,12 @@ async def enqueue_session( ) # Shared memory. - # The minimum-required/maximum-limited resource size(=MM) for images excludes the shared memory size(=S). - # However, MOST client's session creation requests set S by default and - # our scheduler allocates the sum of the requested main memory size(=M) and S. - # That's why we should compare MM to the sum of M and S, - # which is the same as comparing MM minus S to M. - raw_shmem = ( + shmem = BinarySize.from_str( resource_opts.get("shmem") or labels.get("ai.backend.resource.preferred.shmem") or DEFAULT_SHARED_MEMORY_SIZE ) - shmem = BinarySize.from_str(raw_shmem) resource_opts["shmem"] = shmem - image_min_slots["mem"] -= shmem - image_max_slots["mem"] -= shmem # Sanitize user input: does it have resource config? if (resources := creation_config.get("resources")) is not None: @@ -1190,6 +1182,18 @@ async def enqueue_session( if tpu is not None: raise InvalidAPIParameters("Client upgrade required to use TPUs (v19.03+).") + # Shared memory. + # The minimum-required/maximum-limited resource size(=MM) for images excludes the shared memory size(=S). + # However, MOST client's session creation requests set S by default and + # our scheduler allocates the sum of the requested main memory size(=M) and S. + # That's why we should compare MM to the sum of M and S, + # which is the same as comparing MM minus S to M. + total_mem = requested_slots["mem"] + shmem + log.debug( + f"Total memory size ({str(total_mem)}) is allocated to requested main memory size. (original memory: {str(requested_slots['mem'])}, shared memory: {str(shmem)})", + ) + requested_slots["mem"] = total_mem + # Check the image resource slots. log_fmt = "s:{} k:{} r:{}-{}" log_args = (session_id, kernel_id, kernel["cluster_role"], kernel["cluster_idx"]) From 5367ab26330ca1cfdfbd42cf907aaf8365ad81e4 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Sat, 23 Mar 2024 08:41:03 +0900 Subject: [PATCH 05/20] fix: Clarify the comment and move the shmem/mem size comparison before overriding requested_slots["mem"] --- src/ai/backend/manager/registry.py | 31 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index f31b50d7f5..1d8483b4e0 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1182,12 +1182,23 @@ async def enqueue_session( if tpu is not None: raise InvalidAPIParameters("Client upgrade required to use TPUs (v19.03+).") - # Shared memory. - # The minimum-required/maximum-limited resource size(=MM) for images excludes the shared memory size(=S). - # However, MOST client's session creation requests set S by default and - # our scheduler allocates the sum of the requested main memory size(=M) and S. - # That's why we should compare MM to the sum of M and S, - # which is the same as comparing MM minus S to M. + # Check if the user has allocated an "imbalanced" shared memory amount. + if shmem >= requested_slots["mem"]: + raise InvalidAPIParameters( + "Shared memory should be less than the main memory. (s:{}, m:{})".format( + str(shmem), str(BinarySize(requested_slots["mem"])) + ), + ) + + # Include the shared memory as a part of the requested memory. + # When checking the min/max range limits of memory size (=MM) specified as image + # metadata, we should take the share memory (=S) into account as well as the requested + # memory size (=M). + # Most client's session creation requests set S as the implicit default and the passed requested + # memory size is M - S, so this may break up the min/max check in the minimum edge. + # (e.g., MM.min = 256MiB, M = 256MiB, S = 64MiB -> intuitively this should work but if + # we compare MM.min with the passed value M - S (192MiB) only, it will prevent the session + # creation.) total_mem = requested_slots["mem"] + shmem log.debug( f"Total memory size ({str(total_mem)}) is allocated to requested main memory size. (original memory: {str(requested_slots['mem'])}, shared memory: {str(shmem)})", @@ -1226,14 +1237,6 @@ async def enqueue_session( ) ) - # Check if: shmem < memory - if shmem >= requested_slots["mem"]: - raise InvalidAPIParameters( - "Shared memory should be less than the main memory. (s:{}, m:{})".format( - str(shmem), str(BinarySize(requested_slots["mem"])) - ), - ) - # Add requested resource slot data to session session_requested_slots += requested_slots From 654e2736a4f71971af050f509393428236cc4ccb Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Tue, 26 Mar 2024 18:04:48 +0900 Subject: [PATCH 06/20] feat: Always enable the `cuda_open` plugin in the TUI (scie) installer (#1966) --- changes/1966.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/1966.feature.md diff --git a/changes/1966.feature.md b/changes/1966.feature.md new file mode 100644 index 0000000000..2f1ead9027 --- /dev/null +++ b/changes/1966.feature.md @@ -0,0 +1 @@ +Always enable `ai.backend.accelerator.cuda_open` in the scie-based installer From 6d17e5bc5c9f255f3540a131560d1519669bca09 Mon Sep 17 00:00:00 2001 From: Jihyun Kang Date: Tue, 26 Mar 2024 18:13:02 +0900 Subject: [PATCH 07/20] docs: resize font-size of ads (#1965) --- changes/1965.doc.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/1965.doc.md diff --git a/changes/1965.doc.md b/changes/1965.doc.md new file mode 100644 index 0000000000..7a478086d3 --- /dev/null +++ b/changes/1965.doc.md @@ -0,0 +1 @@ +Resize font-size of footer text in ethical ads in documentation hosted by read-the-docs \ No newline at end of file From f735f56184443fa57de568a57ce4374ba4003007 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Wed, 27 Mar 2024 09:53:44 +0900 Subject: [PATCH 08/20] fix: alembic migration failing when primary key does not exist (#1963) Co-authored-by: Kyujin Cho --- changes/1963.fix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/1963.fix.md diff --git a/changes/1963.fix.md b/changes/1963.fix.md new file mode 100644 index 0000000000..044b278dad --- /dev/null +++ b/changes/1963.fix.md @@ -0,0 +1 @@ +Fix `caf54fcc17ab` migration to drop a primary key only if it exists and in `589c764a18f1` migration, add missing table arguments. \ No newline at end of file From 27d4653f02c7bd19a14caeea79d8ddc4565a35ad Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 28 Mar 2024 14:11:57 +0900 Subject: [PATCH 09/20] do not compare shmem + mem with image mem slot --- src/ai/backend/manager/registry.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 1d8483b4e0..4e0c6b7f3c 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1190,21 +1190,6 @@ async def enqueue_session( ), ) - # Include the shared memory as a part of the requested memory. - # When checking the min/max range limits of memory size (=MM) specified as image - # metadata, we should take the share memory (=S) into account as well as the requested - # memory size (=M). - # Most client's session creation requests set S as the implicit default and the passed requested - # memory size is M - S, so this may break up the min/max check in the minimum edge. - # (e.g., MM.min = 256MiB, M = 256MiB, S = 64MiB -> intuitively this should work but if - # we compare MM.min with the passed value M - S (192MiB) only, it will prevent the session - # creation.) - total_mem = requested_slots["mem"] + shmem - log.debug( - f"Total memory size ({str(total_mem)}) is allocated to requested main memory size. (original memory: {str(requested_slots['mem'])}, shared memory: {str(shmem)})", - ) - requested_slots["mem"] = total_mem - # Check the image resource slots. log_fmt = "s:{} k:{} r:{}-{}" log_args = (session_id, kernel_id, kernel["cluster_role"], kernel["cluster_idx"]) From bfb7aecf374498ea9971ec9189a732d80e163840 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 28 Mar 2024 14:21:59 +0900 Subject: [PATCH 10/20] add comment and update news fragment --- changes/1770.feature.md | 1 - changes/1770.fix.md | 1 + src/ai/backend/manager/registry.py | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) delete mode 100644 changes/1770.feature.md create mode 100644 changes/1770.fix.md diff --git a/changes/1770.feature.md b/changes/1770.feature.md deleted file mode 100644 index 81b5fed1da..0000000000 --- a/changes/1770.feature.md +++ /dev/null @@ -1 +0,0 @@ -When validating parameters of session creation requests, compare the image's memory limit to the sum of the shared memory size and the main memory size. diff --git a/changes/1770.fix.md b/changes/1770.fix.md new file mode 100644 index 0000000000..b823fd9f56 --- /dev/null +++ b/changes/1770.fix.md @@ -0,0 +1 @@ +Do not include the shared memory when comparing the requested memory slot to the minimum resource slot required by an image. \ No newline at end of file diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 4e0c6b7f3c..e6f4441d12 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1124,6 +1124,7 @@ async def enqueue_session( ) # Shared memory. + # Do not including the shared memory when comparing the memory slot with image requiring resource slot. shmem = BinarySize.from_str( resource_opts.get("shmem") or labels.get("ai.backend.resource.preferred.shmem") From 2f5c3779735ea8e457da3be77b1b4b0eb4476459 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Mon, 8 Apr 2024 17:42:57 +0900 Subject: [PATCH 11/20] do null check rather than 'or' coalescing and configurable mem/shmem ratio --- src/ai/backend/manager/defs.py | 5 ++++- src/ai/backend/manager/registry.py | 29 ++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/ai/backend/manager/defs.py b/src/ai/backend/manager/defs.py index 0dc2421579..85d22cce22 100644 --- a/src/ai/backend/manager/defs.py +++ b/src/ai/backend/manager/defs.py @@ -4,6 +4,7 @@ import enum import re +from decimal import Decimal from typing import Final from ai.backend.common.arch import CURRENT_ARCH @@ -89,4 +90,6 @@ class LockID(enum.IntEnum): DEFAULT_KEYPAIR_RESOURCE_POLICY_NAME: Final = "default" DEFAULT_KEYPAIR_RATE_LIMIT: Final = 10000 -DEFAULT_SHARED_MEMORY_SIZE: Final[str] = "64m" +DEFAULT_SHARED_MEMORY_SIZE: Final = "64m" +DEFAULT_MIN_MEM_SHARED_MEM_RATIO: Final = Decimal(1.0) +MIN_MEM_SHARED_MEM_RATIO_KEY: Final = "manager/mem-shmem-ratio" diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index e6f4441d12..9e80ca73a4 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -131,7 +131,14 @@ TooManySessionsMatched, ) from .config import LocalConfig, SharedConfig -from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, DEFAULT_SHARED_MEMORY_SIZE, INTRINSIC_SLOTS +from .defs import ( + DEFAULT_IMAGE_ARCH, + DEFAULT_MIN_MEM_SHARED_MEM_RATIO, + DEFAULT_ROLE, + DEFAULT_SHARED_MEMORY_SIZE, + INTRINSIC_SLOTS, + MIN_MEM_SHARED_MEM_RATIO_KEY, +) from .exceptions import MultiAgentError, convert_to_status_data from .models import ( AGENT_RESOURCE_OCCUPYING_KERNEL_STATUSES, @@ -1125,11 +1132,12 @@ async def enqueue_session( # Shared memory. # Do not including the shared memory when comparing the memory slot with image requiring resource slot. - shmem = BinarySize.from_str( - resource_opts.get("shmem") - or labels.get("ai.backend.resource.preferred.shmem") - or DEFAULT_SHARED_MEMORY_SIZE - ) + raw_shmem: str | None = resource_opts.get("shmem") + if raw_shmem is None: + raw_shmem = labels.get("ai.backend.resource.preferred.shmem") + if raw_shmem is None: + raw_shmem = DEFAULT_SHARED_MEMORY_SIZE + shmem = BinarySize.from_str(raw_shmem) resource_opts["shmem"] = shmem # Sanitize user input: does it have resource config? @@ -1184,7 +1192,14 @@ async def enqueue_session( raise InvalidAPIParameters("Client upgrade required to use TPUs (v19.03+).") # Check if the user has allocated an "imbalanced" shared memory amount. - if shmem >= requested_slots["mem"]: + raw_min_mem_shmem_ratio = await self.shared_config.etcd.get( + MIN_MEM_SHARED_MEM_RATIO_KEY + ) + if raw_min_mem_shmem_ratio is None: + min_mem_shmem_ratio = DEFAULT_MIN_MEM_SHARED_MEM_RATIO + else: + min_mem_shmem_ratio = Decimal(raw_min_mem_shmem_ratio) + if Decimal(requested_slots["mem"]) / Decimal(shmem) <= min_mem_shmem_ratio: raise InvalidAPIParameters( "Shared memory should be less than the main memory. (s:{}, m:{})".format( str(shmem), str(BinarySize(requested_slots["mem"])) From 6dd7f0242dd766492b16aa5bba63767ec55ecaa1 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 20 Jun 2024 17:21:25 +0900 Subject: [PATCH 12/20] rename and relocate constants and variables --- src/ai/backend/common/defs.py | 6 ++++++ src/ai/backend/manager/defs.py | 5 ----- src/ai/backend/manager/registry.py | 15 +++++---------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/ai/backend/common/defs.py b/src/ai/backend/common/defs.py index 9dadda4c90..74115f7a0a 100644 --- a/src/ai/backend/common/defs.py +++ b/src/ai/backend/common/defs.py @@ -1,3 +1,4 @@ +from decimal import Decimal from typing import Final # Redis database IDs depending on purposes @@ -10,3 +11,8 @@ DEFAULT_FILE_IO_TIMEOUT: Final = 10 + + +DEFAULT_SHARED_MEMORY_SIZE: Final = "64m" +DEFAULT_ALLOWED_MAX_SHMEM_RATIO: Final = Decimal(1.0) +SHMEM_RATIO_KEY: Final = "resources/mem-shmem-ratio" diff --git a/src/ai/backend/manager/defs.py b/src/ai/backend/manager/defs.py index 85d22cce22..7c93275b18 100644 --- a/src/ai/backend/manager/defs.py +++ b/src/ai/backend/manager/defs.py @@ -4,7 +4,6 @@ import enum import re -from decimal import Decimal from typing import Final from ai.backend.common.arch import CURRENT_ARCH @@ -89,7 +88,3 @@ class LockID(enum.IntEnum): DEFAULT_KEYPAIR_RESOURCE_POLICY_NAME: Final = "default" DEFAULT_KEYPAIR_RATE_LIMIT: Final = 10000 - -DEFAULT_SHARED_MEMORY_SIZE: Final = "64m" -DEFAULT_MIN_MEM_SHARED_MEM_RATIO: Final = Decimal(1.0) -MIN_MEM_SHARED_MEM_RATIO_KEY: Final = "manager/mem-shmem-ratio" diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 9e80ca73a4..bd429e0384 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -133,11 +133,8 @@ from .config import LocalConfig, SharedConfig from .defs import ( DEFAULT_IMAGE_ARCH, - DEFAULT_MIN_MEM_SHARED_MEM_RATIO, DEFAULT_ROLE, - DEFAULT_SHARED_MEMORY_SIZE, INTRINSIC_SLOTS, - MIN_MEM_SHARED_MEM_RATIO_KEY, ) from .exceptions import MultiAgentError, convert_to_status_data from .models import ( @@ -1192,14 +1189,12 @@ async def enqueue_session( raise InvalidAPIParameters("Client upgrade required to use TPUs (v19.03+).") # Check if the user has allocated an "imbalanced" shared memory amount. - raw_min_mem_shmem_ratio = await self.shared_config.etcd.get( - MIN_MEM_SHARED_MEM_RATIO_KEY - ) - if raw_min_mem_shmem_ratio is None: - min_mem_shmem_ratio = DEFAULT_MIN_MEM_SHARED_MEM_RATIO + raw_allowed_mem_shmem_ratio = await self.shared_config.etcd.get(SHMEM_RATIO_KEY) + if raw_allowed_mem_shmem_ratio is None: + allowed_mem_shmem_ratio = DEFAULT_ALLOWED_MAX_SHMEM_RATIO else: - min_mem_shmem_ratio = Decimal(raw_min_mem_shmem_ratio) - if Decimal(requested_slots["mem"]) / Decimal(shmem) <= min_mem_shmem_ratio: + allowed_mem_shmem_ratio = Decimal(raw_allowed_mem_shmem_ratio) + if Decimal(requested_slots["mem"]) / Decimal(shmem) <= allowed_mem_shmem_ratio: raise InvalidAPIParameters( "Shared memory should be less than the main memory. (s:{}, m:{})".format( str(shmem), str(BinarySize(requested_slots["mem"])) From 891e0f4ab69df64ae4c2094124a8ef9c1950aae6 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 20 Jun 2024 17:52:28 +0900 Subject: [PATCH 13/20] update error message since ratio is changable --- src/ai/backend/common/defs.py | 2 +- src/ai/backend/manager/registry.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/ai/backend/common/defs.py b/src/ai/backend/common/defs.py index 74115f7a0a..8a87630eb5 100644 --- a/src/ai/backend/common/defs.py +++ b/src/ai/backend/common/defs.py @@ -15,4 +15,4 @@ DEFAULT_SHARED_MEMORY_SIZE: Final = "64m" DEFAULT_ALLOWED_MAX_SHMEM_RATIO: Final = Decimal(1.0) -SHMEM_RATIO_KEY: Final = "resources/mem-shmem-ratio" +SHMEM_RATIO_KEY: Final = "resources/shmem-mem-ratio" diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index bd429e0384..4827554e9b 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1189,16 +1189,15 @@ async def enqueue_session( raise InvalidAPIParameters("Client upgrade required to use TPUs (v19.03+).") # Check if the user has allocated an "imbalanced" shared memory amount. - raw_allowed_mem_shmem_ratio = await self.shared_config.etcd.get(SHMEM_RATIO_KEY) - if raw_allowed_mem_shmem_ratio is None: - allowed_mem_shmem_ratio = DEFAULT_ALLOWED_MAX_SHMEM_RATIO + raw_allowed_max_shmem_ratio = await self.shared_config.etcd.get(SHMEM_RATIO_KEY) + if raw_allowed_max_shmem_ratio is None: + allowed_max_shmem_ratio = DEFAULT_ALLOWED_MAX_SHMEM_RATIO else: - allowed_mem_shmem_ratio = Decimal(raw_allowed_mem_shmem_ratio) - if Decimal(requested_slots["mem"]) / Decimal(shmem) <= allowed_mem_shmem_ratio: + allowed_max_shmem_ratio = Decimal(raw_allowed_max_shmem_ratio) + if Decimal(shmem) >= Decimal(requested_slots["mem"]) * allowed_max_shmem_ratio: raise InvalidAPIParameters( - "Shared memory should be less than the main memory. (s:{}, m:{})".format( - str(shmem), str(BinarySize(requested_slots["mem"])) - ), + f"Too large shared memory. Maximum ratio of 'shared memory / memory' is {str(allowed_max_shmem_ratio)}. " + f"(s:{str(shmem)}, m:{str(BinarySize(requested_slots['mem']))}" ) # Check the image resource slots. From c4001f13ca9c01d0d50082b04c75aa845b30af6b Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 20 Jun 2024 18:13:27 +0900 Subject: [PATCH 14/20] update news fragment --- changes/1770.fix.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changes/1770.fix.md b/changes/1770.fix.md index b823fd9f56..e0281c6ac4 100644 --- a/changes/1770.fix.md +++ b/changes/1770.fix.md @@ -1 +1,2 @@ -Do not include the shared memory when comparing the requested memory slot to the minimum resource slot required by an image. \ No newline at end of file +* Do not include the shared memory when comparing the requested memory slot to the minimum resource slot required by an image. +* Configurable ratio between shared memory and memory. From bc26637e60f7b66ca7619199b5bbaba571497414 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 24 Oct 2024 11:10:56 +0900 Subject: [PATCH 15/20] check empty shmem value and robust type conversion handling --- src/ai/backend/manager/registry.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 4827554e9b..aeb9a38e1a 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -17,7 +17,7 @@ Sequence, ) from datetime import datetime -from decimal import Decimal +from decimal import Decimal, InvalidOperation from io import BytesIO from typing import ( TYPE_CHECKING, @@ -1129,10 +1129,11 @@ async def enqueue_session( # Shared memory. # Do not including the shared memory when comparing the memory slot with image requiring resource slot. - raw_shmem: str | None = resource_opts.get("shmem") + raw_shmem: Optional[str] = resource_opts.get("shmem") if raw_shmem is None: raw_shmem = labels.get("ai.backend.resource.preferred.shmem") - if raw_shmem is None: + if not raw_shmem: + # raw_shmem is None or empty string ("") raw_shmem = DEFAULT_SHARED_MEMORY_SIZE shmem = BinarySize.from_str(raw_shmem) resource_opts["shmem"] = shmem @@ -1189,11 +1190,19 @@ async def enqueue_session( raise InvalidAPIParameters("Client upgrade required to use TPUs (v19.03+).") # Check if the user has allocated an "imbalanced" shared memory amount. - raw_allowed_max_shmem_ratio = await self.shared_config.etcd.get(SHMEM_RATIO_KEY) - if raw_allowed_max_shmem_ratio is None: + raw_allowed_max_shmem_ratio = self.shared_config.data.get(SHMEM_RATIO_KEY) + try: + allowed_max_shmem_ratio = ( + Decimal(raw_allowed_max_shmem_ratio) + if raw_allowed_max_shmem_ratio is not None + else DEFAULT_ALLOWED_MAX_SHMEM_RATIO + ) + except (TypeError, InvalidOperation): + log.warning( + f"Failed to convert `raw_allowed_max_shmem_ratio({raw_allowed_max_shmem_ratio})` " + "to a decimal value. Fallback to default." + ) allowed_max_shmem_ratio = DEFAULT_ALLOWED_MAX_SHMEM_RATIO - else: - allowed_max_shmem_ratio = Decimal(raw_allowed_max_shmem_ratio) if Decimal(shmem) >= Decimal(requested_slots["mem"]) * allowed_max_shmem_ratio: raise InvalidAPIParameters( f"Too large shared memory. Maximum ratio of 'shared memory / memory' is {str(allowed_max_shmem_ratio)}. " From 1761ae90b3e5902476d1dae3a1e79e003efc3746 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 24 Oct 2024 11:53:20 +0900 Subject: [PATCH 16/20] compare image requiring mem to requested mem plus shmem --- changes/1770.fix.md | 3 +-- src/ai/backend/common/defs.py | 6 +++--- src/ai/backend/manager/registry.py | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/changes/1770.fix.md b/changes/1770.fix.md index e0281c6ac4..be05096f01 100644 --- a/changes/1770.fix.md +++ b/changes/1770.fix.md @@ -1,2 +1 @@ -* Do not include the shared memory when comparing the requested memory slot to the minimum resource slot required by an image. -* Configurable ratio between shared memory and memory. +Include the shared memory when comparing the requested memory slot to the minimum resource slot required by an image and Add configuration of a ratio between shared memory and memory. diff --git a/src/ai/backend/common/defs.py b/src/ai/backend/common/defs.py index 8a87630eb5..1f5556e46b 100644 --- a/src/ai/backend/common/defs.py +++ b/src/ai/backend/common/defs.py @@ -13,6 +13,6 @@ DEFAULT_FILE_IO_TIMEOUT: Final = 10 -DEFAULT_SHARED_MEMORY_SIZE: Final = "64m" -DEFAULT_ALLOWED_MAX_SHMEM_RATIO: Final = Decimal(1.0) -SHMEM_RATIO_KEY: Final = "resources/shmem-mem-ratio" +DEFAULT_SHARED_MEMORY_SIZE: Final[str] = "64m" +DEFAULT_ALLOWED_MAX_SHMEM_RATIO: Final[Decimal] = Decimal(1.0) +SHMEM_RATIO_KEY: Final[str] = "resources/shmem-mem-ratio" diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index aeb9a38e1a..bec293d353 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1128,14 +1128,20 @@ async def enqueue_session( ) # Shared memory. - # Do not including the shared memory when comparing the memory slot with image requiring resource slot. raw_shmem: Optional[str] = resource_opts.get("shmem") if raw_shmem is None: raw_shmem = labels.get("ai.backend.resource.preferred.shmem") if not raw_shmem: # raw_shmem is None or empty string ("") raw_shmem = DEFAULT_SHARED_MEMORY_SIZE - shmem = BinarySize.from_str(raw_shmem) + try: + shmem = BinarySize.from_str(raw_shmem) + except (ValueError, IndexError): + log.warning( + f"Failed to convert raw `shmem({raw_shmem})` " + f"to a decimal value. Fallback to default({DEFAULT_SHARED_MEMORY_SIZE})." + ) + shmem = BinarySize.from_str(DEFAULT_SHARED_MEMORY_SIZE) resource_opts["shmem"] = shmem # Sanitize user input: does it have resource config? @@ -1209,6 +1215,10 @@ async def enqueue_session( f"(s:{str(shmem)}, m:{str(BinarySize(requested_slots['mem']))}" ) + # Compare ai.backend.resource.min.mem to (Memory + Shared-memory) + # because for most use cases, client side hides detailed shared-memory configuration. + requested_slots["mem"] += shmem + # Check the image resource slots. log_fmt = "s:{} k:{} r:{}-{}" log_args = (session_id, kernel_id, kernel["cluster_role"], kernel["cluster_idx"]) From 7311611328e2a1e107e0bf2e0ac95f65658bce93 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 24 Oct 2024 12:12:57 +0900 Subject: [PATCH 17/20] update cli help txt --- src/ai/backend/client/cli/service.py | 14 ++++++++++++-- src/ai/backend/client/cli/session/args.py | 7 ++++++- src/ai/backend/client/cli/session/execute.py | 7 ++++++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/ai/backend/client/cli/service.py b/src/ai/backend/client/cli/service.py index 3441464a89..944bcaf572 100644 --- a/src/ai/backend/client/cli/service.py +++ b/src/ai/backend/client/cli/service.py @@ -192,7 +192,12 @@ def info(ctx: CLIContext, service_name_or_id: str): metavar="KEY=VAL", type=str, multiple=True, - help="Resource options for creating compute session (e.g: shmem=64m)", + help=( + "Resource options for creating compute session (e.g: shmem=64m). " + "The session APIs compare the total resources (the sum of this value and `resources`) " + "to the minimum/maximum resources requirements specified by an image. " + "If the total does not meet these resource limits, the APIs raise a InvalidAPIParameters error." + ), ) @click.option( "--cluster-size", @@ -393,7 +398,12 @@ def create( metavar="KEY=VAL", type=str, multiple=True, - help="Resource options for creating compute session (e.g: shmem=64m)", + help=( + "Resource options for creating compute session (e.g: shmem=64m). " + "The session APIs compare the total resources (the sum of this value and `resources`) " + "to the minimum/maximum resources requirements specified by an image. " + "If the total does not meet these resource limits, the APIs raise a InvalidAPIParameters error." + ), ) @click.option( "--cluster-size", diff --git a/src/ai/backend/client/cli/session/args.py b/src/ai/backend/client/cli/session/args.py index 250449f0cb..75c0ff2b9b 100644 --- a/src/ai/backend/client/cli/session/args.py +++ b/src/ai/backend/client/cli/session/args.py @@ -131,7 +131,12 @@ metavar="KEY=VAL", type=str, multiple=True, - help="Resource options for creating compute session (e.g: shmem=64m)", + help=( + "Resource options for creating compute session (e.g: shmem=64m). " + "The session APIs compare the total resources (the sum of this value and `resources`) " + "to the minimum/maximum resources requirements specified by an image. " + "If the total does not meet these resource limits, the APIs raise a InvalidAPIParameters error." + ), ), # resource grouping click.option( diff --git a/src/ai/backend/client/cli/session/execute.py b/src/ai/backend/client/cli/session/execute.py index a708e2f28b..94abd28fe9 100644 --- a/src/ai/backend/client/cli/session/execute.py +++ b/src/ai/backend/client/cli/session/execute.py @@ -365,7 +365,12 @@ def prepare_mount_arg( metavar="KEY=VAL", type=str, multiple=True, - help="Resource options for creating compute session. (e.g: shmem=64m)", + help=( + "Resource options for creating compute session (e.g: shmem=64m). " + "The session APIs compare the total resources (the sum of this value and `resources`) " + "to the minimum/maximum resources requirements specified by an image. " + "If the total does not meet these resource limits, the APIs raise a InvalidAPIParameters error." + ), ) @click.option( "--arch", From 6a8ef1986630f75fda1b86f7c71fbcc56d815980 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 24 Oct 2024 12:27:51 +0900 Subject: [PATCH 18/20] remove too verbose help text --- src/ai/backend/client/cli/service.py | 6 ++---- src/ai/backend/client/cli/session/args.py | 3 +-- src/ai/backend/client/cli/session/execute.py | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/ai/backend/client/cli/service.py b/src/ai/backend/client/cli/service.py index 944bcaf572..a56795c577 100644 --- a/src/ai/backend/client/cli/service.py +++ b/src/ai/backend/client/cli/service.py @@ -195,8 +195,7 @@ def info(ctx: CLIContext, service_name_or_id: str): help=( "Resource options for creating compute session (e.g: shmem=64m). " "The session APIs compare the total resources (the sum of this value and `resources`) " - "to the minimum/maximum resources requirements specified by an image. " - "If the total does not meet these resource limits, the APIs raise a InvalidAPIParameters error." + "to the minimum/maximum resources requirements specified by an image." ), ) @click.option( @@ -401,8 +400,7 @@ def create( help=( "Resource options for creating compute session (e.g: shmem=64m). " "The session APIs compare the total resources (the sum of this value and `resources`) " - "to the minimum/maximum resources requirements specified by an image. " - "If the total does not meet these resource limits, the APIs raise a InvalidAPIParameters error." + "to the minimum/maximum resources requirements specified by an image." ), ) @click.option( diff --git a/src/ai/backend/client/cli/session/args.py b/src/ai/backend/client/cli/session/args.py index 75c0ff2b9b..091f537f96 100644 --- a/src/ai/backend/client/cli/session/args.py +++ b/src/ai/backend/client/cli/session/args.py @@ -134,8 +134,7 @@ help=( "Resource options for creating compute session (e.g: shmem=64m). " "The session APIs compare the total resources (the sum of this value and `resources`) " - "to the minimum/maximum resources requirements specified by an image. " - "If the total does not meet these resource limits, the APIs raise a InvalidAPIParameters error." + "to the minimum/maximum resources requirements specified by an image." ), ), # resource grouping diff --git a/src/ai/backend/client/cli/session/execute.py b/src/ai/backend/client/cli/session/execute.py index 94abd28fe9..3a3e308ef6 100644 --- a/src/ai/backend/client/cli/session/execute.py +++ b/src/ai/backend/client/cli/session/execute.py @@ -368,8 +368,7 @@ def prepare_mount_arg( help=( "Resource options for creating compute session (e.g: shmem=64m). " "The session APIs compare the total resources (the sum of this value and `resources`) " - "to the minimum/maximum resources requirements specified by an image. " - "If the total does not meet these resource limits, the APIs raise a InvalidAPIParameters error." + "to the minimum/maximum resources requirements specified by an image." ), ) @click.option( From 4a060f101fe76149e021825626956ca56e08729a Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Thu, 24 Oct 2024 17:39:35 +0900 Subject: [PATCH 19/20] minor change --- src/ai/backend/manager/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index bec293d353..db91d55ab4 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1136,7 +1136,7 @@ async def enqueue_session( raw_shmem = DEFAULT_SHARED_MEMORY_SIZE try: shmem = BinarySize.from_str(raw_shmem) - except (ValueError, IndexError): + except ValueError: log.warning( f"Failed to convert raw `shmem({raw_shmem})` " f"to a decimal value. Fallback to default({DEFAULT_SHARED_MEMORY_SIZE})." From 245de358032ba59d4e357c4467820c47ec847c3a Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 25 Oct 2024 00:59:01 +0900 Subject: [PATCH 20/20] fix wrong resolved conflicts --- src/ai/backend/manager/registry.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index db91d55ab4..de3cad8729 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -54,6 +54,11 @@ from ai.backend.common import msgpack, redis_helper from ai.backend.common.asyncio import cancel_tasks +from ai.backend.common.defs import ( + DEFAULT_ALLOWED_MAX_SHMEM_RATIO, + DEFAULT_SHARED_MEMORY_SIZE, + SHMEM_RATIO_KEY, +) from ai.backend.common.docker import ImageRef from ai.backend.common.events import ( AgentHeartbeatEvent, @@ -1212,7 +1217,7 @@ async def enqueue_session( if Decimal(shmem) >= Decimal(requested_slots["mem"]) * allowed_max_shmem_ratio: raise InvalidAPIParameters( f"Too large shared memory. Maximum ratio of 'shared memory / memory' is {str(allowed_max_shmem_ratio)}. " - f"(s:{str(shmem)}, m:{str(BinarySize(requested_slots['mem']))}" + f"(s:{str(shmem)}, m:{str(BinarySize(requested_slots["mem"]))}" ) # Compare ai.backend.resource.min.mem to (Memory + Shared-memory)