lablup · fregataa · Oct 24, 2024 · Dec 8, 2023 · Dec 8, 2023 · Dec 11, 2023
diff --git a/changes/1770.fix.md b/changes/1770.fix.md
@@ -0,0 +1 @@
+Include the shared memory when comparing the requested memory slot to the minimum resource slot required by an image, and add a configuration for the maximum allowed ratio of shared memory to memory.
diff --git a/changes/1963.fix.md b/changes/1963.fix.md
@@ -0,0 +1 @@
+Fix the `caf54fcc17ab` migration to drop a primary key only if it exists, and add the missing table arguments in the `589c764a18f1` migration.
diff --git a/changes/1965.doc.md b/changes/1965.doc.md
@@ -0,0 +1 @@
+Resize the font size of the footer text in ethical ads in the documentation hosted by Read the Docs.
diff --git a/changes/1966.feature.md b/changes/1966.feature.md
@@ -0,0 +1 @@
+Always enable `ai.backend.accelerator.cuda_open` in the scie-based installer
diff --git a/src/ai/backend/client/cli/service.py b/src/ai/backend/client/cli/service.py
@@ -192,7 +192,11 @@ def info(ctx: CLIContext, service_name_or_id: str):
     metavar="KEY=VAL",
     type=str,
     multiple=True,
-    help="Resource options for creating compute session (e.g: shmem=64m)",
+    help=(
+        "Resource options for creating compute session (e.g: shmem=64m). "
+        "The session APIs compare the total resources (the sum of this value and `resources`) "
+        "to the minimum/maximum resources requirements specified by an image."
+    ),
 )
 @click.option(
     "--cluster-size",
@@ -393,7 +397,11 @@ def create(
     metavar="KEY=VAL",
     type=str,
     multiple=True,
-    help="Resource options for creating compute session (e.g: shmem=64m)",
+    help=(
+        "Resource options for creating compute session (e.g: shmem=64m). "
+        "The session APIs compare the total resources (the sum of this value and `resources`) "
+        "to the minimum/maximum resources requirements specified by an image."
+    ),
 )
 @click.option(
     "--cluster-size",

diff --git a/src/ai/backend/client/cli/session/args.py b/src/ai/backend/client/cli/session/args.py
@@ -131,7 +131,11 @@
         metavar="KEY=VAL",
         type=str,
         multiple=True,
-        help="Resource options for creating compute session (e.g: shmem=64m)",
+        help=(
+            "Resource options for creating compute session (e.g: shmem=64m). "
+            "The session APIs compare the total resources (the sum of this value and `resources`) "
+            "to the minimum/maximum resources requirements specified by an image."
+        ),
     ),
     # resource grouping
     click.option(

diff --git a/src/ai/backend/client/cli/session/execute.py b/src/ai/backend/client/cli/session/execute.py
@@ -365,7 +365,11 @@ def prepare_mount_arg(
     metavar="KEY=VAL",
     type=str,
     multiple=True,
-    help="Resource options for creating compute session. (e.g: shmem=64m)",
+    help=(
+        "Resource options for creating compute session (e.g: shmem=64m). "
+        "The session APIs compare the total resources (the sum of this value and `resources`) "
+        "to the minimum/maximum resources requirements specified by an image."
+    ),
 )
 @click.option(
     "--arch",

diff --git a/src/ai/backend/common/defs.py b/src/ai/backend/common/defs.py
@@ -1,3 +1,4 @@
+from decimal import Decimal
 from typing import Final
 
 # Redis database IDs depending on purposes
@@ -10,3 +11,8 @@
 
 
 DEFAULT_FILE_IO_TIMEOUT: Final = 10
+
+
+DEFAULT_SHARED_MEMORY_SIZE: Final[str] = "64m"
+DEFAULT_ALLOWED_MAX_SHMEM_RATIO: Final[Decimal] = Decimal(1.0)
+SHMEM_RATIO_KEY: Final[str] = "resources/shmem-mem-ratio"
diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py
@@ -2,7 +2,6 @@
 
 import asyncio
 import base64
-import copy
 import itertools
 import logging
 import re
@@ -18,7 +17,7 @@
     Sequence,
 )
 from datetime import datetime
-from decimal import Decimal
+from decimal import Decimal, InvalidOperation
 from io import BytesIO
 from typing import (
     TYPE_CHECKING,
@@ -55,6 +54,11 @@
 
 from ai.backend.common import msgpack, redis_helper
 from ai.backend.common.asyncio import cancel_tasks
+from ai.backend.common.defs import (
+    DEFAULT_ALLOWED_MAX_SHMEM_RATIO,
+    DEFAULT_SHARED_MEMORY_SIZE,
+    SHMEM_RATIO_KEY,
+)
 from ai.backend.common.docker import ImageRef
 from ai.backend.common.events import (
     AgentHeartbeatEvent,
@@ -132,7 +136,11 @@
     TooManySessionsMatched,
 )
 from .config import LocalConfig, SharedConfig
-from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, INTRINSIC_SLOTS
+from .defs import (
+    DEFAULT_IMAGE_ARCH,
+    DEFAULT_ROLE,
+    INTRINSIC_SLOTS,
+)
 from .exceptions import MultiAgentError, convert_to_status_data
 from .models import (
     AGENT_RESOURCE_OCCUPYING_KERNEL_STATUSES,
@@ -1125,16 +1133,21 @@ async def enqueue_session(
                         )
 
             # Shared memory.
-            # We need to subtract the amount of shared memory from the memory limit of
-            # a container, since tmpfs including /dev/shm uses host-side kernel memory
-            # and cgroup's memory limit does not apply.
-            shmem = resource_opts.get("shmem", None)
-            if shmem is None:
-                shmem = labels.get("ai.backend.resource.preferred.shmem", "64m")
-            shmem = BinarySize.from_str(shmem)
+            raw_shmem: Optional[str] = resource_opts.get("shmem")
+            if raw_shmem is None:
+                raw_shmem = labels.get("ai.backend.resource.preferred.shmem")
+            if not raw_shmem:
+                # raw_shmem is None or empty string ("")
+                raw_shmem = DEFAULT_SHARED_MEMORY_SIZE
+            try:
+                shmem = BinarySize.from_str(raw_shmem)
+            except ValueError:
+                log.warning(
+                    f"Failed to convert raw `shmem({raw_shmem})` "
+                    f"to a decimal value. Fallback to default({DEFAULT_SHARED_MEMORY_SIZE})."
+                )
+                shmem = BinarySize.from_str(DEFAULT_SHARED_MEMORY_SIZE)
             resource_opts["shmem"] = shmem
-            image_min_slots = copy.deepcopy(image_min_slots)
-            image_min_slots["mem"] += shmem
 
             # Sanitize user input: does it have resource config?
             if (resources := creation_config.get("resources")) is not None:
@@ -1187,6 +1200,30 @@ async def enqueue_session(
                 if tpu is not None:
                     raise InvalidAPIParameters("Client upgrade required to use TPUs (v19.03+).")
 
+            # Check if the user has allocated an "imbalanced" shared memory amount.
+            raw_allowed_max_shmem_ratio = self.shared_config.data.get(SHMEM_RATIO_KEY)
+            try:
+                allowed_max_shmem_ratio = (
+                    Decimal(raw_allowed_max_shmem_ratio)
+                    if raw_allowed_max_shmem_ratio is not None
+                    else DEFAULT_ALLOWED_MAX_SHMEM_RATIO
+                )
+            except (TypeError, InvalidOperation):
+                log.warning(
+                    f"Failed to convert `raw_allowed_max_shmem_ratio({raw_allowed_max_shmem_ratio})` "
+                    "to a decimal value. Fallback to default."
+                )
+                allowed_max_shmem_ratio = DEFAULT_ALLOWED_MAX_SHMEM_RATIO
+            if Decimal(shmem) >= Decimal(requested_slots["mem"]) * allowed_max_shmem_ratio:
+                raise InvalidAPIParameters(  # shmem must stay strictly below mem * allowed ratio
+                    f"Too large shared memory. Maximum ratio of 'shared memory / memory' is {str(allowed_max_shmem_ratio)}. "
+                    f"(s:{str(shmem)}, m:{str(BinarySize(requested_slots['mem']))})"
+                )
+
+            # Compare ai.backend.resource.min.mem to (Memory + Shared-memory)
+            # because for most use cases, client side hides detailed shared-memory configuration.
+            requested_slots["mem"] += shmem
+
             # Check the image resource slots.
             log_fmt = "s:{} k:{} r:{}-{}"
             log_args = (session_id, kernel_id, kernel["cluster_role"], kernel["cluster_idx"])
@@ -1219,14 +1256,6 @@ async def enqueue_session(
                     )
                 )
 
-            # Check if: shmem < memory
-            if shmem >= requested_slots["mem"]:
-                raise InvalidAPIParameters(
-                    "Shared memory should be less than the main memory. (s:{}, m:{})".format(
-                        str(shmem), str(BinarySize(requested_slots["mem"]))
-                    ),
-                )
-
             # Add requested resource slot data to session
             session_requested_slots += requested_slots