Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: Compare image memory limit to sum of shmem and main mem #1770

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
c99e738
fix: Handle value error when convert shmem to binary size
fregataa Oct 24, 2024
5b821cf
feature: Compare image memory limit to sum of shmem and main mem
fregataa Dec 8, 2023
0bef259
add news fragment
fregataa Dec 8, 2023
ec523d9
add shmem to requested_slot and log
fregataa Dec 11, 2023
5367ab2
fix: Clarify the comment and move the shmem/mem size comparison befor…
achimnol Mar 22, 2024
654e273
feat: Always enable the `cuda_open` plugin in the TUI (scie) installe…
achimnol Mar 26, 2024
6d17e5b
docs: resize font-size of ads (#1965)
lizable Mar 26, 2024
f735f56
fix: alembic migration failing when primary key does not exist (#1963)
fregataa Mar 27, 2024
27d4653
do not compare shmem + mem with image mem slot
fregataa Mar 28, 2024
bfb7aec
add comment and update news fragment
fregataa Mar 28, 2024
2f5c377
do null check rather than 'or' coalescing and configurable mem/shmem …
fregataa Apr 8, 2024
6dd7f02
rename and relocate constants and variables
fregataa Jun 20, 2024
891e0f4
update error message since ratio is changeable
fregataa Jun 20, 2024
c4001f1
update news fragment
fregataa Jun 20, 2024
bc26637
check empty shmem value and robust type conversion handling
fregataa Oct 24, 2024
1761ae9
compare image requiring mem to requested mem plus shmem
fregataa Oct 24, 2024
7311611
update cli help txt
fregataa Oct 24, 2024
6a8ef19
remove too verbose help text
fregataa Oct 24, 2024
4a060f1
minor change
fregataa Oct 24, 2024
245de35
fix wrong resolved conflicts
fregataa Oct 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/1770.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Include the shared memory when comparing the requested memory slot to the minimum resource slot required by an image, and add a configuration option for the maximum allowed ratio of shared memory to memory.
1 change: 1 addition & 0 deletions changes/1963.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `caf54fcc17ab` migration to drop a primary key only if it exists and in `589c764a18f1` migration, add missing table arguments.
1 change: 1 addition & 0 deletions changes/1965.doc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Resize font-size of footer text in ethical ads in documentation hosted by read-the-docs
1 change: 1 addition & 0 deletions changes/1966.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Always enable `ai.backend.accelerator.cuda_open` in the scie-based installer
12 changes: 10 additions & 2 deletions src/ai/backend/client/cli/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,11 @@ def info(ctx: CLIContext, service_name_or_id: str):
metavar="KEY=VAL",
type=str,
multiple=True,
help="Resource options for creating compute session (e.g: shmem=64m)",
help=(
"Resource options for creating compute session (e.g: shmem=64m). "
"The session APIs compare the total resources (the sum of this value and `resources`) "
"to the minimum/maximum resources requirements specified by an image."
),
)
@click.option(
"--cluster-size",
Expand Down Expand Up @@ -393,7 +397,11 @@ def create(
metavar="KEY=VAL",
type=str,
multiple=True,
help="Resource options for creating compute session (e.g: shmem=64m)",
help=(
"Resource options for creating compute session (e.g: shmem=64m). "
"The session APIs compare the total resources (the sum of this value and `resources`) "
"to the minimum/maximum resources requirements specified by an image."
),
)
@click.option(
"--cluster-size",
Expand Down
6 changes: 5 additions & 1 deletion src/ai/backend/client/cli/session/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,11 @@
metavar="KEY=VAL",
type=str,
multiple=True,
help="Resource options for creating compute session (e.g: shmem=64m)",
help=(
"Resource options for creating compute session (e.g: shmem=64m). "
"The session APIs compare the total resources (the sum of this value and `resources`) "
"to the minimum/maximum resources requirements specified by an image."
),
),
# resource grouping
click.option(
Expand Down
6 changes: 5 additions & 1 deletion src/ai/backend/client/cli/session/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,11 @@ def prepare_mount_arg(
metavar="KEY=VAL",
type=str,
multiple=True,
help="Resource options for creating compute session. (e.g: shmem=64m)",
help=(
"Resource options for creating compute session (e.g: shmem=64m). "
"The session APIs compare the total resources (the sum of this value and `resources`) "
"to the minimum/maximum resources requirements specified by an image."
),
)
@click.option(
"--arch",
Expand Down
6 changes: 6 additions & 0 deletions src/ai/backend/common/defs.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from decimal import Decimal
from typing import Final

# Redis database IDs depending on purposes
Expand All @@ -10,3 +11,8 @@


DEFAULT_FILE_IO_TIMEOUT: Final = 10


# Shared memory size (a size string parsed with `BinarySize.from_str`) that the
# manager falls back to when neither the `shmem` resource option nor the image's
# preferred-shmem label yields a usable value.
DEFAULT_SHARED_MEMORY_SIZE: Final[str] = "64m"
# Ratio of shared memory to main memory used by the manager when the shared
# config has no valid override; a request whose shmem reaches `mem * ratio`
# is rejected.
DEFAULT_ALLOWED_MAX_SHMEM_RATIO: Final[Decimal] = Decimal(1.0)
# Shared-config key under which the shmem/mem ratio override is looked up.
SHMEM_RATIO_KEY: Final[str] = "resources/shmem-mem-ratio"
69 changes: 49 additions & 20 deletions src/ai/backend/manager/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import asyncio
import base64
import copy
import itertools
import logging
import re
Expand All @@ -18,7 +17,7 @@
Sequence,
)
from datetime import datetime
from decimal import Decimal
from decimal import Decimal, InvalidOperation
from io import BytesIO
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -55,6 +54,11 @@

from ai.backend.common import msgpack, redis_helper
from ai.backend.common.asyncio import cancel_tasks
from ai.backend.common.defs import (
DEFAULT_ALLOWED_MAX_SHMEM_RATIO,
DEFAULT_SHARED_MEMORY_SIZE,
SHMEM_RATIO_KEY,
)
from ai.backend.common.docker import ImageRef
from ai.backend.common.events import (
AgentHeartbeatEvent,
Expand Down Expand Up @@ -132,7 +136,11 @@
TooManySessionsMatched,
)
from .config import LocalConfig, SharedConfig
from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, INTRINSIC_SLOTS
from .defs import (
DEFAULT_IMAGE_ARCH,
DEFAULT_ROLE,
INTRINSIC_SLOTS,
)
from .exceptions import MultiAgentError, convert_to_status_data
from .models import (
AGENT_RESOURCE_OCCUPYING_KERNEL_STATUSES,
Expand Down Expand Up @@ -1125,16 +1133,21 @@ async def enqueue_session(
)

# Shared memory.
# We need to subtract the amount of shared memory from the memory limit of
# a container, since tmpfs including /dev/shm uses host-side kernel memory
# and cgroup's memory limit does not apply.
shmem = resource_opts.get("shmem", None)
if shmem is None:
shmem = labels.get("ai.backend.resource.preferred.shmem", "64m")
shmem = BinarySize.from_str(shmem)
raw_shmem: Optional[str] = resource_opts.get("shmem")
if raw_shmem is None:
raw_shmem = labels.get("ai.backend.resource.preferred.shmem")
if not raw_shmem:
# raw_shmem is None or empty string ("")
raw_shmem = DEFAULT_SHARED_MEMORY_SIZE
try:
shmem = BinarySize.from_str(raw_shmem)
except ValueError:
log.warning(
f"Failed to convert raw `shmem({raw_shmem})` "
f"to a decimal value. Fallback to default({DEFAULT_SHARED_MEMORY_SIZE})."
)
shmem = BinarySize.from_str(DEFAULT_SHARED_MEMORY_SIZE)
resource_opts["shmem"] = shmem
image_min_slots = copy.deepcopy(image_min_slots)
image_min_slots["mem"] += shmem

# Sanitize user input: does it have resource config?
if (resources := creation_config.get("resources")) is not None:
Expand Down Expand Up @@ -1187,6 +1200,30 @@ async def enqueue_session(
if tpu is not None:
raise InvalidAPIParameters("Client upgrade required to use TPUs (v19.03+).")

# Check if the user has allocated an "imbalanced" shared memory amount.
raw_allowed_max_shmem_ratio = self.shared_config.data.get(SHMEM_RATIO_KEY)
try:
allowed_max_shmem_ratio = (
Decimal(raw_allowed_max_shmem_ratio)
if raw_allowed_max_shmem_ratio is not None
else DEFAULT_ALLOWED_MAX_SHMEM_RATIO
)
except (TypeError, InvalidOperation):
log.warning(
f"Failed to convert `raw_allowed_max_shmem_ratio({raw_allowed_max_shmem_ratio})` "
"to a decimal value. Fallback to default."
)
allowed_max_shmem_ratio = DEFAULT_ALLOWED_MAX_SHMEM_RATIO
if Decimal(shmem) >= Decimal(requested_slots["mem"]) * allowed_max_shmem_ratio:
raise InvalidAPIParameters(
f"Too large shared memory. Maximum ratio of 'shared memory / memory' is {str(allowed_max_shmem_ratio)}. "
f"(s:{str(shmem)}, m:{str(BinarySize(requested_slots["mem"]))}"
)
Comment on lines +1218 to +1221
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message string is missing a closing parenthesis. The line should be:

f"(s:{str(shmem)}, m:{str(BinarySize(requested_slots['mem']))})"

Spotted by Graphite Reviewer

Is this helpful? React 👍 or 👎 to let us know.


# Compare ai.backend.resource.min.mem to (Memory + Shared-memory)
# because for most use cases, client side hides detailed shared-memory configuration.
requested_slots["mem"] += shmem

# Check the image resource slots.
log_fmt = "s:{} k:{} r:{}-{}"
log_args = (session_id, kernel_id, kernel["cluster_role"], kernel["cluster_idx"])
Expand Down Expand Up @@ -1219,14 +1256,6 @@ async def enqueue_session(
)
)

# Check if: shmem < memory
if shmem >= requested_slots["mem"]:
raise InvalidAPIParameters(
"Shared memory should be less than the main memory. (s:{}, m:{})".format(
str(shmem), str(BinarySize(requested_slots["mem"]))
),
)

# Add requested resource slot data to session
session_requested_slots += requested_slots

Expand Down