Skip to content

Commit

Permalink
feat: Replace PREPARING status with CREATING for compute session and kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
fregataa committed Oct 2, 2024
1 parent 1bd2f37 commit 23e8b85
Show file tree
Hide file tree
Showing 17 changed files with 54 additions and 52 deletions.
2 changes: 1 addition & 1 deletion configs/manager/sample.etcd.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"session": {
"hang-tolerance": {
"threshold": {
"PREPARING": "1h",
"CREATING": "1h",
"TERMINATING": "30m"
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"threshold": {
"PREPARING": "1h",
"CREATING": "1h",
"TERMINATING": "30m"
}
}
4 changes: 2 additions & 2 deletions docs/client/cli/sessions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ For other options, please consult the output of ``--help``.
- Included Session Status

* - (no option)
- ``PENDING``, ``PREPARING``, ``RUNNING``, ``RESTARTING``,
- ``PENDING``, ``CREATING``, ``RUNNING``, ``RESTARTING``,
``TERMINATING``, ``RESIZING``, ``SUSPENDED``, and ``ERROR``.

* - ``--running``
- ``PREPARING``, ``PULLING``, and ``RUNNING``.
- ``CREATING``, ``PULLING``, and ``RUNNING``.

* - ``--dead``
- ``CANCELLED`` and ``TERMINATED``.
Expand Down
2 changes: 1 addition & 1 deletion docs/dev/adding-kernels.rst
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ This per-image bootstrap script is executed as *root* by the agent-injected ``en

.. warning::

``/opt/container/bootstrap.sh`` **must return immediately** to prevent the session from staying in the ``PREPARING`` status.
``/opt/container/bootstrap.sh`` **must return immediately** to prevent the session from staying in the ``CREATING`` status.
This means that it should run service applications in background by *daemonization*.

To run a process as the user privilege, you should use ``su-exec`` which is also injected by the agent like:
Expand Down
4 changes: 2 additions & 2 deletions docs/locales/ko/LC_MESSAGES/client/cli/sessions.po
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ msgstr ""

#: ../../client/cli/sessions.rst:42 4f067c02b259464f84aeade4874f054f
msgid ""
"``PENDING``, ``PREPARING``, ``RUNNING``, ``RESTARTING``, ``TERMINATING``,"
"``PENDING``, ``CREATING``, ``RUNNING``, ``RESTARTING``, ``TERMINATING``,"
" ``RESIZING``, ``SUSPENDED``, and ``ERROR``."
msgstr ""

Expand All @@ -78,7 +78,7 @@ msgid "``--running``"
msgstr ""

#: ../../client/cli/sessions.rst:46 e9096ba3896544348e2c88ac765dc5c6
msgid "``PREPARING``, ``PULLING``, and ``RUNNING``."
msgid "``CREATING``, ``PULLING``, and ``RUNNING``."
msgstr ""

#: ../../client/cli/sessions.rst:48 7675c8771d0d4d20984e45e995ae38de
Expand Down
4 changes: 2 additions & 2 deletions docs/locales/ko/LC_MESSAGES/client/func/session.po
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,15 @@ msgstr ""
#: ai.backend.client.func.session.ComputeSession.paginated_list:4 of
msgid ""
"Fetches sessions in a specific status (PENDING, SCHEDULED, PULLING, "
"PREPARING, RUNNING, RESTARTING, RUNNING_DEGRADED, TERMINATING, "
"CREATING, RUNNING, RESTARTING, RUNNING_DEGRADED, TERMINATING, "
"TERMINATED, ERROR, CANCELLED)"
msgstr ""

#: 0435d36edb8d480b8abc9c1bf2011b75
#: ai.backend.client.func.session.ComputeSession.paginated_list:4 of
msgid ""
"Fetches sessions in a specific status (PENDING, SCHEDULED, PULLING, "
"PREPARING,"
"CREATING,"
msgstr ""

#: 0d33952fe7344b56bf4cecb9537cd08a
Expand Down
2 changes: 1 addition & 1 deletion docs/locales/ko/LC_MESSAGES/dev/adding-kernels.po
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ msgstr ""
#: ../../dev/adding-kernels.rst:304 fc4cb9f5e7884254aa1fbc0d08fe0c8d
msgid ""
"``/opt/container/bootstrap.sh`` **must return immediately** to prevent "
"the session from staying in the ``PREPARING`` status. This means that it "
"the session from staying in the ``CREATING`` status. This means that it "
"should run service applications in background by *daemonization*."
msgstr ""

Expand Down
8 changes: 4 additions & 4 deletions src/ai/backend/client/cli/admin/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _list_cmd(name: str = "list", docs: Optional[str] = None):
"PENDING",
"SCHEDULED",
"PULLING",
"PREPARING",
"CREATING",
"RUNNING",
"RESTARTING",
"RUNNING_DEGRADED",
Expand Down Expand Up @@ -154,7 +154,7 @@ def list(
"PENDING",
"SCHEDULED",
"PULLING",
"PREPARING",
"CREATING",
"RUNNING",
"RUNNING_DEGRADED",
"TERMINATING",
Expand All @@ -163,7 +163,7 @@ def list(
no_match_name = "active"
if running:
status = ",".join([
"PREPARING",
"CREATING",
"RUNNING",
"RUNNING_DEGRADED",
])
Expand All @@ -179,7 +179,7 @@ def list(
"PENDING",
"SCHEDULED",
"PULLING",
"PREPARING",
"CREATING",
"RUNNING",
"RESTARTING",
"RUNNING_DEGRADED",
Expand Down
4 changes: 2 additions & 2 deletions src/ai/backend/client/cli/session/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def destroy(session_names, forced, owner, stats, recursive):
if forced:
print_warn(
"If you have destroyed a session whose status is one of "
"[`PULLING`, `SCHEDULED`, `PREPARING`, `TERMINATING`, `ERROR`], "
"[`PULLING`, `SCHEDULED`, `CREATING`, `TERMINATING`, `ERROR`], "
"Manual cleanup of actual containers may be required."
)
if stats:
Expand Down Expand Up @@ -1193,7 +1193,7 @@ def _fetch_session_names() -> tuple[str]:
status = ",".join([
"PENDING",
"SCHEDULED",
"PREPARING",
"CREATING",
"RUNNING",
"RUNNING_DEGRADED",
"RESTARTING",
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/client/func/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ async def paginated_list(
Fetches the list of sessions.
:param status: Fetches sessions in a specific status
(PENDING, SCHEDULED, PULLING, PREPARING,
(PENDING, SCHEDULED, PULLING, CREATING,
RUNNING, RESTARTING, RUNNING_DEGRADED,
TERMINATING, TERMINATED, ERROR, CANCELLED)
:param fields: Additional per-session query fields to fetch.
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/manager/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@
t.Key(
"threshold", default=_config_defaults["session"]["hang-tolerance"]["threshold"]
): t.Dict({
t.Key(SessionStatus.PREPARING.name, optional=True): tx.TimeDuration(),
t.Key(SessionStatus.CREATING.name, optional=True): tx.TimeDuration(),
t.Key(SessionStatus.TERMINATING.name, optional=True): tx.TimeDuration(),
}).ignore_extra("*"),
},
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/manager/idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ async def check_idleness(
session_id = kernel["session_id"]
if (max_session_lifetime := policy["max_session_lifetime"]) > 0:
# TODO: once per-status time tracking is implemented, let's change created_at
# to the timestamp when the session entered PREPARING status.
# to the timestamp when the session entered CREATING status.
idle_timeout = timedelta(seconds=max_session_lifetime)
now: datetime = await get_db_now(dbconn)
kernel_created_at: datetime = kernel["created_at"]
Expand Down
7 changes: 4 additions & 3 deletions src/ai/backend/manager/models/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,8 @@ class KernelStatus(enum.StrEnum):
PENDING = "PENDING"
# ---
SCHEDULED = "SCHEDULED"
PREPARING = "PREPARING"
PREPARING = "CREATING" # For backward compatibility
CREATING = "CREATING"
# ---
BUILDING = "BUILDING"
PULLING = "PULLING"
Expand Down Expand Up @@ -226,7 +227,7 @@ def default_hostname(context) -> str:
KernelStatus.CANCELLED,
KernelStatus.ERROR,
},
KernelStatus.PREPARING: {
KernelStatus.CREATING: {
KernelStatus.PULLING, # TODO: Delete this after applying check-and-pull API
KernelStatus.RUNNING,
KernelStatus.TERMINATING,
Expand Down Expand Up @@ -697,7 +698,7 @@ async def set_kernel_status(
kernels.c.status_history,
(),
{
status.name: now.isoformat(), # ["PULLING", "PREPARING"]
status.name: now.isoformat(), # ["PULLING", "CREATING"]
},
),
}
Expand Down
19 changes: 10 additions & 9 deletions src/ai/backend/manager/models/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,8 @@ class SessionStatus(enum.StrEnum):
# ---
PULLING = "PULLING"
PREPARED = "PREPARED"
PREPARING = "PREPARING"
CREATING = "CREATING"
PREPARING = "CREATING" # For backward compatibility
# ---
RUNNING = "RUNNING"
RESTARTING = "RESTARTING"
Expand Down Expand Up @@ -233,8 +234,8 @@ class SessionStatus(enum.StrEnum):
KERNEL_SESSION_STATUS_MAPPING: Mapping[KernelStatus, SessionStatus] = {
KernelStatus.PENDING: SessionStatus.PENDING,
KernelStatus.SCHEDULED: SessionStatus.SCHEDULED,
KernelStatus.PREPARING: SessionStatus.PREPARING,
KernelStatus.BUILDING: SessionStatus.PREPARING,
KernelStatus.CREATING: SessionStatus.CREATING,
KernelStatus.BUILDING: SessionStatus.CREATING,
KernelStatus.PULLING: SessionStatus.PULLING,
KernelStatus.PREPARED: SessionStatus.PREPARED,
KernelStatus.RUNNING: SessionStatus.RUNNING,
Expand All @@ -250,7 +251,7 @@ class SessionStatus(enum.StrEnum):
SESSION_KERNEL_STATUS_MAPPING: Mapping[SessionStatus, KernelStatus] = {
SessionStatus.PENDING: KernelStatus.PENDING,
SessionStatus.SCHEDULED: KernelStatus.SCHEDULED,
SessionStatus.PREPARING: KernelStatus.PREPARING,
SessionStatus.CREATING: KernelStatus.CREATING,
SessionStatus.PULLING: KernelStatus.PULLING,
SessionStatus.PREPARED: KernelStatus.PREPARED,
SessionStatus.RUNNING: KernelStatus.RUNNING,
Expand Down Expand Up @@ -286,7 +287,7 @@ class SessionStatus(enum.StrEnum):
SessionStatus.ERROR,
SessionStatus.CANCELLED,
},
SessionStatus.PREPARING: {
SessionStatus.CREATING: {
SessionStatus.PULLING, # TODO: Delete this after applying check-and-pull API
SessionStatus.RUNNING,
SessionStatus.RUNNING_DEGRADED,
Expand Down Expand Up @@ -356,8 +357,8 @@ def determine_session_status(sibling_kernels: Sequence[KernelRow]) -> SessionSta
continue
case KernelStatus.PULLING:
candidate = SessionStatus.PULLING
case KernelStatus.PREPARING:
candidate = SessionStatus.PREPARING
case KernelStatus.CREATING:
candidate = SessionStatus.CREATING
case KernelStatus.RUNNING | KernelStatus.RESTARTING | KernelStatus.RESIZING:
continue
case KernelStatus.TERMINATING:
Expand Down Expand Up @@ -393,7 +394,7 @@ def determine_session_status(sibling_kernels: Sequence[KernelRow]) -> SessionSta
KernelStatus.PENDING
| KernelStatus.PREPARED
| KernelStatus.SCHEDULED
| KernelStatus.PREPARING
| KernelStatus.CREATING
| KernelStatus.BUILDING
| KernelStatus.PULLING
| KernelStatus.RESIZING
Expand Down Expand Up @@ -1264,7 +1265,7 @@ def _calculate_session_occupied_slots(session_row: SessionRow):
session_row.occupying_slots = session_occupying_slots

match session_row.status:
case SessionStatus.PREPARING:
case SessionStatus.CREATING:
_calculate_session_occupied_slots(session_row)
case SessionStatus.RUNNING if transited:
_calculate_session_occupied_slots(session_row)
Expand Down
12 changes: 6 additions & 6 deletions src/ai/backend/manager/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1744,7 +1744,7 @@ async def _update_failure() -> None:
{
KernelStatus.ERROR.name: (
now.isoformat()
), # ["PULLING", "PREPARING"]
), # ["PULLING", "CREATING"]
},
),
status_data=err_info,
Expand Down Expand Up @@ -2130,9 +2130,9 @@ async def destroy_session(
) -> Mapping[str, Any]:
"""
Destroy session kernels. Do not destroy
PREPARING/TERMINATING/ERROR and PULLING sessions.
CREATING/TERMINATING/ERROR and PULLING sessions.
:param forced: If True, destroy PREPARING/TERMINATING/ERROR session.
:param forced: If True, destroy CREATING/TERMINATING/ERROR session.
However, PULLING session still cannot be destroyed.
:param reason: Reason to destroy a session if client wants to specify it manually.
"""
Expand Down Expand Up @@ -2269,7 +2269,7 @@ async def _decrease_concurrency_used(access_key: AccessKey, is_private: bool) ->
raise GenericForbidden("Cannot destroy sessions in pulling status")
case (
SessionStatus.SCHEDULED
| SessionStatus.PREPARING
| SessionStatus.CREATING
| SessionStatus.TERMINATING
| SessionStatus.ERROR
):
Expand Down Expand Up @@ -2364,7 +2364,7 @@ async def _decrease_concurrency_used(access_key: AccessKey, is_private: bool) ->
raise GenericForbidden("Cannot destroy kernels in pulling status")
case (
KernelStatus.SCHEDULED
| KernelStatus.PREPARING
| KernelStatus.CREATING
| KernelStatus.TERMINATING
| KernelStatus.ERROR
):
Expand Down Expand Up @@ -3152,7 +3152,7 @@ async def mark_kernel_preparing(
async def _set_status(db_session: AsyncSession) -> None:
kernel_row = await KernelRow.get_kernel_to_update_status(db_session, kernel_id)
kernel_row.transit_status(
KernelStatus.PREPARING, reason, status_data={}, status_changed_at=now
KernelStatus.CREATING, reason, status_data={}, status_changed_at=now
)

await execute_with_txn_retry(_set_status, self.db.begin_session, db_conn)
Expand Down
12 changes: 6 additions & 6 deletions src/ai/backend/manager/scheduler/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -1102,7 +1102,7 @@ async def _update_generic_failure() -> None:
kernel_agent_bindings.append(KernelAgentBinding(kernel, agent_alloc_ctx, set()))

assert len(kernel_agent_bindings) == len(sess_ctx.kernels)
# Proceed to PREPARING only when all kernels are successfully scheduled.
# Proceed to CREATING only when all kernels are successfully scheduled.

async def _finalize_scheduled() -> None:
agent_ids: list[AgentId] = []
Expand Down Expand Up @@ -1169,7 +1169,7 @@ async def prepare(
Scan the scheduled sessions and perform the agent RPC calls to begin preparation of them.
Each RPC calls are done in separate asyncio tasks.
Session status transition: SCHEDULED -> PREPARING
Session status transition: SCHEDULED -> CREATING
"""
manager_id = self.local_config["manager"]["id"]
redis_key = f"manager.{manager_id}.prepare"
Expand Down Expand Up @@ -1204,15 +1204,15 @@ async def _mark_session_preparing() -> Sequence[SessionRow]:
update_query = (
sa.update(KernelRow)
.values(
status=KernelStatus.PREPARING,
status=KernelStatus.CREATING,
status_changed=now,
status_info="",
status_data={},
status_history=sql_json_merge(
KernelRow.status_history,
(),
{
KernelStatus.PREPARING.name: now.isoformat(),
KernelStatus.CREATING.name: now.isoformat(),
},
),
)
Expand All @@ -1224,15 +1224,15 @@ async def _mark_session_preparing() -> Sequence[SessionRow]:
update_sess_query = (
sa.update(SessionRow)
.values(
status=SessionStatus.PREPARING,
status=SessionStatus.CREATING,
# status_changed=now,
status_info="",
status_data={},
status_history=sql_json_merge(
SessionRow.status_history,
(),
{
SessionStatus.PREPARING.name: now.isoformat(),
SessionStatus.CREATING.name: now.isoformat(),
},
),
)
Expand Down
Loading

0 comments on commit 23e8b85

Please sign in to comment.