Skip to content

Commit

Permalink
fix: Handle value error when converting shmem to binary size (#2972)
Browse files Browse the repository at this point in the history
Backported-from: main (24.12)
Backported-to: 24.09
Backport-of: 2972
  • Loading branch information
fregataa committed Oct 25, 2024
1 parent c33c168 commit 718ce1a
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 5 deletions.
1 change: 1 addition & 0 deletions changes/2972.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Handle the error raised when converting a `shmem` string value into `BinarySize`
2 changes: 2 additions & 0 deletions src/ai/backend/manager/defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,5 @@ class LockID(enum.IntEnum):

DEFAULT_KEYPAIR_RESOURCE_POLICY_NAME: Final = "default"
DEFAULT_KEYPAIR_RATE_LIMIT: Final = 10000

DEFAULT_SHARED_MEMORY_SIZE: Final[str] = "64m"
20 changes: 15 additions & 5 deletions src/ai/backend/manager/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
TooManySessionsMatched,
)
from .config import LocalConfig, SharedConfig
from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, INTRINSIC_SLOTS
from .defs import DEFAULT_IMAGE_ARCH, DEFAULT_ROLE, DEFAULT_SHARED_MEMORY_SIZE, INTRINSIC_SLOTS
from .exceptions import MultiAgentError, convert_to_status_data
from .models import (
AGENT_RESOURCE_OCCUPYING_KERNEL_STATUSES,
Expand Down Expand Up @@ -1128,10 +1128,20 @@ async def enqueue_session(
# We need to subtract the amount of shared memory from the memory limit of
# a container, since tmpfs including /dev/shm uses host-side kernel memory
# and cgroup's memory limit does not apply.
shmem = resource_opts.get("shmem", None)
if shmem is None:
shmem = labels.get("ai.backend.resource.preferred.shmem", "64m")
shmem = BinarySize.from_str(shmem)
raw_shmem: Optional[str] = resource_opts.get("shmem")
if raw_shmem is None:
raw_shmem = labels.get("ai.backend.resource.preferred.shmem")
if not raw_shmem:
# raw_shmem is None or empty string ("")
raw_shmem = DEFAULT_SHARED_MEMORY_SIZE
try:
shmem = BinarySize.from_str(raw_shmem)
except ValueError:
log.warning(
f"Failed to convert raw `shmem({raw_shmem})` "
f"to a decimal value. Fallback to default({DEFAULT_SHARED_MEMORY_SIZE})."
)
shmem = BinarySize.from_str(DEFAULT_SHARED_MEMORY_SIZE)
resource_opts["shmem"] = shmem
image_min_slots = copy.deepcopy(image_min_slots)
image_min_slots["mem"] += shmem
Expand Down

0 comments on commit 718ce1a

Please sign in to comment.