From e1c03aadb5c84a89896af05163b1e165eea93d10 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 27 Oct 2023 05:42:57 +0000 Subject: [PATCH 01/45] Change status_history from map to list --- .../backend/client/cli/session/lifecycle.py | 5 +- src/ai/backend/common/utils.py | 7 ++ src/ai/backend/manager/api/resource.py | 2 +- src/ai/backend/manager/models/kernel.py | 31 +++--- src/ai/backend/manager/models/session.py | 30 +++--- src/ai/backend/manager/models/utils.py | 10 ++ src/ai/backend/manager/models/vfolder.py | 13 ++- src/ai/backend/manager/registry.py | 77 ++++++------- .../backend/manager/scheduler/dispatcher.py | 102 +++++++++--------- 9 files changed, 138 insertions(+), 139 deletions(-) diff --git a/src/ai/backend/client/cli/session/lifecycle.py b/src/ai/backend/client/cli/session/lifecycle.py index d0c3185df1..c51191ee79 100644 --- a/src/ai/backend/client/cli/session/lifecycle.py +++ b/src/ai/backend/client/cli/session/lifecycle.py @@ -27,6 +27,7 @@ from ai.backend.cli.types import ExitCode, Undefined, undefined from ai.backend.common.arch import DEFAULT_IMAGE_ARCH from ai.backend.common.types import ClusterMode +from ai.backend.common.utils import get_first_status_history from ...compat import asyncio_run from ...exceptions import BackendAPIError @@ -792,14 +793,14 @@ def status_history(session_id): try: status_history = kernel.get_status_history().get("result") print_info(f"status_history: {status_history}") - if (preparing := status_history.get("preparing")) is None: + if (preparing := get_first_status_history(status_history, "PREPARING")) is None: result = { "result": { "seconds": 0, "microseconds": 0, }, } - elif (terminated := status_history.get("terminated")) is None: + elif (terminated := get_first_status_history(status_history, "TERMINATED")) is None: alloc_time_until_now: timedelta = datetime.now(tzutc()) - isoparse(preparing) result = { "result": { diff --git a/src/ai/backend/common/utils.py b/src/ai/backend/common/utils.py index 
34b74cc684..627757861f 100644 --- a/src/ai/backend/common/utils.py +++ b/src/ai/backend/common/utils.py @@ -404,3 +404,10 @@ async def umount( fstab = Fstab(fp) await fstab.remove_by_mountpoint(str(mountpoint)) return True + + +def get_first_status_history(arr: list[list[str]], status: str) -> list[str] | None: + for item in arr: + if item[0] == status: + return item + return None diff --git a/src/ai/backend/manager/api/resource.py b/src/ai/backend/manager/api/resource.py index 156b8d9b66..2d7b1ba57d 100644 --- a/src/ai/backend/manager/api/resource.py +++ b/src/ai/backend/manager/api/resource.py @@ -465,7 +465,7 @@ async def _pipe_builder(r: Redis) -> RedisPipeline: "status": row["status"].name, "status_info": row["status_info"], "status_changed": str(row["status_changed"]), - "status_history": row["status_history"] or {}, + "status_history": row["status_history"] or [], "cluster_mode": row["cluster_mode"], } if group_id not in objs_per_group: diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index d78ef7c92f..05d14120e6 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -46,6 +46,7 @@ SessionTypes, VFolderMount, ) +from ai.backend.common.utils import get_first_status_history from ..api.exceptions import ( BackendError, @@ -79,7 +80,12 @@ from .minilang.ordering import ColumnMapType, QueryOrderParser from .minilang.queryfilter import FieldSpecType, QueryFilterParser, enum_field_getter from .user import users -from .utils import ExtendedAsyncSAEngine, JSONCoalesceExpr, execute_with_retry, sql_json_merge +from .utils import ( + ExtendedAsyncSAEngine, + JSONCoalesceExpr, + execute_with_retry, + sql_list_append, +) if TYPE_CHECKING: from .gql import GraphQueryContext @@ -723,12 +729,8 @@ async def set_kernel_status( data = { "status": status, "status_changed": now, - "status_history": sql_json_merge( - kernels.c.status_history, - (), - { - status.name: now.isoformat(), # 
["PULLING", "PREPARING"] - }, + "status_history": sql_list_append( + KernelRow.status_history, [status.name, now.isoformat()] ), } if status_data is not None: @@ -774,12 +776,8 @@ async def _update() -> bool: if update_data is None: update_values = { "status": new_status, - "status_history": sql_json_merge( - KernelRow.status_history, - (), - { - new_status.name: now.isoformat(), - }, + "status_history": sql_list_append( + KernelRow.status_history, [new_status.name, now.isoformat()] ), } else: @@ -921,7 +919,10 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: hide_agents = False else: hide_agents = ctx.local_config["manager"]["hide-agents"] - status_history = row.status_history or {} + + status_history = row["status_history"] or [] + scheduled_at = get_first_status_history(status_history, KernelStatus.SCHEDULED.name) + return { # identity "id": row.id, @@ -947,7 +948,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: "created_at": row.created_at, "terminated_at": row.terminated_at, "starts_at": row.starts_at, - "scheduled_at": status_history.get(KernelStatus.SCHEDULED.name), + "scheduled_at": scheduled_at[1] if scheduled_at else None, "occupied_slots": row.occupied_slots.to_json(), # resources "agent": row.agent if not hide_agents else None, diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 42c7967a2c..8a135bce84 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -38,6 +38,7 @@ SessionTypes, VFolderMount, ) +from ai.backend.common.utils import get_first_status_history from ..api.exceptions import ( AgentError, @@ -79,7 +80,7 @@ JSONCoalesceExpr, agg_to_array, execute_with_retry, - sql_json_merge, + sql_list_append, ) if TYPE_CHECKING: @@ -729,7 +730,10 @@ def status_changed(self) -> Optional[datetime]: if self.status_history is None: return None try: - return 
datetime.fromisoformat(self.status_history[self.status.name]) + first = get_first_status_history(self.status_history, self.status.name) + assert first is not None + + return datetime.fromisoformat(first[1]) except KeyError: return None @@ -805,12 +809,8 @@ async def _check_and_update() -> SessionStatus | None: update_values = { "status": determined_status, - "status_history": sql_json_merge( - SessionRow.status_history, - (), - { - determined_status.name: now.isoformat(), - }, + "status_history": sql_list_append( + SessionRow.status_history, [determined_status.name, now.isoformat()] ), } if determined_status in (SessionStatus.CANCELLED, SessionStatus.TERMINATED): @@ -911,12 +911,8 @@ async def set_session_status( now = status_changed_at data = { "status": status, - "status_history": sql_json_merge( - SessionRow.status_history, - (), - { - status.name: datetime.now(tzutc()).isoformat(), - }, + "status_history": sql_list_append( + SessionRow.status_history, [status.name, datetime.now(tzutc()).isoformat()] ), } if status_data is not None: @@ -1324,8 +1320,10 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: full_name = getattr(row, "full_name") group_name = getattr(row, "group_name") row = row.SessionRow - status_history = row.status_history or {} - raw_scheduled_at = status_history.get(SessionStatus.SCHEDULED.name) + status_history = row.status_history or [] + first = get_first_status_history(status_history, SessionStatus.SCHEDULED.name) + raw_scheduled_at = first[1] if first is not None else None + return { # identity "id": row.id, diff --git a/src/ai/backend/manager/models/utils.py b/src/ai/backend/manager/models/utils.py index 4ad6d2a195..73eb8e73ce 100644 --- a/src/ai/backend/manager/models/utils.py +++ b/src/ai/backend/manager/models/utils.py @@ -452,6 +452,16 @@ def sql_json_merge( return expr +def sql_list_append(col, arr): + """ + Generate an SQLAlchemy column update expression that appends an item to + the existing JSONB array. 
+ """ + new_item_str = str(arr).replace("'", '"') + expr = col.op("||")(sa.text(f"'[{new_item_str}]'::jsonb")) + return expr + + def sql_json_increment( col, key: Tuple[str, ...], diff --git a/src/ai/backend/manager/models/vfolder.py b/src/ai/backend/manager/models/vfolder.py index 1b5f87b7a7..33925b1c4f 100644 --- a/src/ai/backend/manager/models/vfolder.py +++ b/src/ai/backend/manager/models/vfolder.py @@ -104,7 +104,7 @@ from .rbac.exceptions import InvalidScope, NotEnoughPermission from .session import DEAD_SESSION_STATUSES, SessionRow from .user import UserRole, UserRow -from .utils import ExtendedAsyncSAEngine, execute_with_retry, sql_json_merge +from .utils import ExtendedAsyncSAEngine, execute_with_retry, sql_list_append if TYPE_CHECKING: from ..api.context import BackgroundTaskManager @@ -1466,13 +1466,12 @@ async def _update() -> None: sa.update(vfolders) .values( status=update_status, - status_changed=now, - status_history=sql_json_merge( + status_history=sql_list_append( vfolders.c.status_history, - (), - { - update_status.name: now.isoformat(), - }, + [ + update_status.name, + datetime.now(tzutc()).isoformat(), + ], ), ) .where(cond) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 283b9f3877..29b617f1b0 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -180,6 +180,7 @@ reenter_txn, reenter_txn_session, sql_json_merge, + sql_list_append, ) from .types import UserScope @@ -1006,9 +1007,9 @@ async def enqueue_session( session_data = { "id": session_id, "status": SessionStatus.PENDING, - "status_history": { - SessionStatus.PENDING.name: datetime.now(tzutc()).isoformat(), - }, + "status_history": [ + [SessionStatus.PENDING.name, datetime.now(tzutc()).isoformat()], + ], "creation_id": session_creation_id, "name": session_name, "session_type": session_type, @@ -1029,9 +1030,9 @@ async def enqueue_session( kernel_shared_data = { "status": KernelStatus.PENDING, - 
"status_history": { - KernelStatus.PENDING.name: datetime.now(tzutc()).isoformat(), - }, + "status_history": [ + [KernelStatus.PENDING.name, datetime.now(tzutc()).isoformat()], + ], "session_creation_id": session_creation_id, "session_id": session_id, "session_name": session_name, @@ -1583,6 +1584,7 @@ async def finalize_running( created_info["resource_spec"]["allocations"] ) new_status = KernelStatus.RUNNING + update_data = { "occupied_slots": actual_allocs, "scaling_group": created_info["scaling_group"], @@ -1595,14 +1597,11 @@ async def finalize_running( "stdin_port": created_info["stdin_port"], "stdout_port": created_info["stdout_port"], "service_ports": service_ports, - "status_history": sql_json_merge( - kernels.c.status_history, - (), - { - new_status.name: datetime.now(tzutc()).isoformat(), - }, + "status_history": sql_list_append( + KernelRow.status_history, [new_status.name, datetime.now(tzutc()).isoformat()] ), } + self._kernel_actual_allocated_resources[kernel_id] = actual_allocs async def _update_session_occupying_slots(db_session: AsyncSession) -> None: @@ -1805,16 +1804,14 @@ async def _update_failure() -> None: status_info=f"other-error ({ex!r})", status_changed=now, terminated_at=now, - status_history=sql_json_merge( + status_history=sql_list_append( KernelRow.status_history, - (), - { - KernelStatus.ERROR.name: ( - now.isoformat() - ), # ["PULLING", "PREPARING"] - }, + [ + KernelStatus.ERROR.name, + now.isoformat(), # ["PULLING", "PREPARING"] + ], ), - status_data=err_info, + status_data=convert_to_status_data(ex, self.debug), ) ) await db_sess.execute(query) @@ -2416,12 +2413,9 @@ async def _update() -> None: "status_info": reason, "status_changed": now, "terminated_at": now, - "status_history": sql_json_merge( + "status_history": sql_list_append( KernelRow.status_history, - (), - { - KernelStatus.TERMINATED.name: now.isoformat(), - }, + [KernelStatus.TERMINATED.name, now.isoformat()], ), } if kern_stat: @@ -2463,12 +2457,9 @@ async def _update() 
-> None: "kernel": {"exit_code": None}, "session": {"status": "terminating"}, }, - "status_history": sql_json_merge( + "status_history": sql_list_append( KernelRow.status_history, - (), - { - KernelStatus.TERMINATING.name: now.isoformat(), - }, + [KernelStatus.TERMINATING.name, now.isoformat()], ), } await db_sess.execute( @@ -2633,12 +2624,12 @@ async def _restarting_session() -> None: sa.update(SessionRow) .values( status=SessionStatus.RESTARTING, - status_history=sql_json_merge( + status_history=sql_list_append( SessionRow.status_history, - (), - { - SessionStatus.RESTARTING.name: datetime.now(tzutc()).isoformat(), - }, + [ + SessionStatus.RESTARTING.name, + datetime.now(tzutc()).isoformat(), + ], ), ) .where(SessionRow.id == session.id) @@ -2672,12 +2663,8 @@ async def _restart_kernel(kernel: KernelRow) -> None: "stdin_port": kernel_info["stdin_port"], "stdout_port": kernel_info["stdout_port"], "service_ports": kernel_info.get("service_ports", []), - "status_history": sql_json_merge( - KernelRow.status_history, - (), - { - KernelStatus.RUNNING.name: now.isoformat(), - }, + "status_history": sql_list_append( + KernelRow.status_history, [KernelStatus.RUNNING.name, now.isoformat()] ), } await KernelRow.update_kernel( @@ -3237,12 +3224,8 @@ async def _update_kernel() -> tuple[AccessKey, AgentId] | None: ("kernel",), {"exit_code": exit_code}, ), - "status_history": sql_json_merge( - KernelRow.status_history, - (), - { - KernelStatus.TERMINATED.name: now.isoformat(), - }, + "status_history": sql_list_append( + KernelRow.status_history, [KernelStatus.TERMINATED.name, now.isoformat()] ), "terminated_at": now, } diff --git a/src/ai/backend/manager/scheduler/dispatcher.py b/src/ai/backend/manager/scheduler/dispatcher.py index e65ae6c694..8b8129ba4a 100644 --- a/src/ai/backend/manager/scheduler/dispatcher.py +++ b/src/ai/backend/manager/scheduler/dispatcher.py @@ -92,7 +92,7 @@ recalc_concurrency_used, ) from ..models.utils import ExtendedAsyncSAEngine as SAEngine -from 
..models.utils import execute_with_retry, sql_json_increment, sql_json_merge +from ..models.utils import execute_with_retry, sql_json_increment, sql_json_merge, sql_list_append from .predicates import ( check_concurrency, check_dependencies, @@ -371,12 +371,12 @@ async def _apply_cancellation( status=KernelStatus.CANCELLED, status_info=reason, terminated_at=now, - status_history=sql_json_merge( + status_history=sql_list_append( KernelRow.status_history, - (), - { - KernelStatus.CANCELLED.name: now.isoformat(), - }, + [ + KernelStatus.CANCELLED.name, + now.isoformat(), + ], ), ) .where(KernelRow.session_id.in_(session_ids)) @@ -388,12 +388,12 @@ async def _apply_cancellation( status=SessionStatus.CANCELLED, status_info=reason, terminated_at=now, - status_history=sql_json_merge( + status_history=sql_list_append( SessionRow.status_history, - (), - { - SessionStatus.CANCELLED.name: now.isoformat(), - }, + [ + SessionStatus.CANCELLED.name, + now.isoformat(), + ], ), ) .where(SessionRow.id.in_(session_ids)) @@ -962,12 +962,12 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, status_changed=now, - status_history=sql_json_merge( + status_history=sql_list_append( KernelRow.status_history, - (), - { - KernelStatus.SCHEDULED.name: now.isoformat(), - }, + [ + KernelStatus.SCHEDULED.name, + now.isoformat(), + ], ), ) .where(KernelRow.id == kernel.id) @@ -984,12 +984,12 @@ async def _finalize_scheduled() -> None: status=SessionStatus.SCHEDULED, status_info="scheduled", status_data={}, - status_history=sql_json_merge( + status_history=sql_list_append( SessionRow.status_history, - (), - { - SessionStatus.SCHEDULED.name: now.isoformat(), - }, + [ + SessionStatus.SCHEDULED.name, + now.isoformat(), + ], ), ) .where(SessionRow.id == sess_ctx.id) @@ -1199,12 +1199,12 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, status_changed=now, - status_history=sql_json_merge( + status_history=sql_list_append( 
KernelRow.status_history, - (), - { - KernelStatus.SCHEDULED.name: now.isoformat(), - }, + [ + KernelStatus.SCHEDULED.name, + now.isoformat(), + ], ), ) .where(KernelRow.id == binding.kernel.id) @@ -1222,12 +1222,12 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, # status_changed=now, - status_history=sql_json_merge( + status_history=sql_list_append( SessionRow.status_history, - (), - { - SessionStatus.SCHEDULED.name: now.isoformat(), - }, + [ + SessionStatus.SCHEDULED.name, + now.isoformat(), + ], ), ) .where(SessionRow.id == sess_ctx.id) @@ -1288,12 +1288,12 @@ async def _mark_session_preparing() -> Sequence[SessionRow]: status_changed=now, status_info="", status_data={}, - status_history=sql_json_merge( + status_history=sql_list_append( KernelRow.status_history, - (), - { - KernelStatus.PREPARING.name: now.isoformat(), - }, + [ + KernelStatus.PREPARING.name, + now.isoformat(), + ], ), ) .where( @@ -1308,12 +1308,12 @@ async def _mark_session_preparing() -> Sequence[SessionRow]: # status_changed=now, status_info="", status_data={}, - status_history=sql_json_merge( + status_history=sql_list_append( SessionRow.status_history, - (), - { - SessionStatus.PREPARING.name: now.isoformat(), - }, + [ + SessionStatus.PREPARING.name, + now.isoformat(), + ], ), ) .where(SessionRow.status == SessionStatus.SCHEDULED) @@ -1612,12 +1612,12 @@ async def _mark_session_cancelled() -> None: status_info="failed-to-start", status_data=status_data, terminated_at=now, - status_history=sql_json_merge( + status_history=sql_list_append( KernelRow.status_history, - (), - { - KernelStatus.CANCELLED.name: now.isoformat(), - }, + [ + KernelStatus.CANCELLED.name, + now.isoformat(), + ], ), ) .where(KernelRow.session_id == session.id) @@ -1631,12 +1631,12 @@ async def _mark_session_cancelled() -> None: status_info="failed-to-start", status_data=status_data, terminated_at=now, - status_history=sql_json_merge( + status_history=sql_list_append( 
SessionRow.status_history, - (), - { - SessionStatus.CANCELLED.name: now.isoformat(), - }, + [ + SessionStatus.CANCELLED.name, + now.isoformat(), + ], ), ) .where(SessionRow.id == session.id) From 59f4b603fab790ab019cca9000e26776c7225dd7 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Sun, 29 Oct 2023 23:25:43 +0000 Subject: [PATCH 02/45] Fix test --- tests/manager/models/test_utils.py | 229 +++++++++++++++-------------- 1 file changed, 117 insertions(+), 112 deletions(-) diff --git a/tests/manager/models/test_utils.py b/tests/manager/models/test_utils.py index 233fd795c6..da76fcb133 100644 --- a/tests/manager/models/test_utils.py +++ b/tests/manager/models/test_utils.py @@ -8,7 +8,11 @@ from dateutil.tz import tzutc from ai.backend.manager.models import KernelRow, SessionRow, kernels -from ai.backend.manager.models.utils import agg_to_array, agg_to_str, sql_json_merge +from ai.backend.manager.models.utils import ( + agg_to_array, + agg_to_str, + sql_list_append, +) async def _select_kernel_row( @@ -33,26 +37,24 @@ async def test_sql_json_merge__default(session_info): async def test_sql_json_merge__deeper_object(session_info): session_id, conn = session_info timestamp = datetime.now(tzutc()).isoformat() - expected = { - "kernel": { - "session": { - "PENDING": timestamp, - "PREPARING": timestamp, - }, - }, - } + expected = [ + ["PENDING", timestamp], + ["PREPARING", timestamp], + ] + query = ( kernels.update() - .values({ - "status_history": sql_json_merge( - kernels.c.status_history, - ("kernel", "session"), - { - "PENDING": timestamp, - "PREPARING": timestamp, - }, - ), - }) + .values( + { + "status_history": sql_list_append( + kernels.c.status_history, + [ + ["PENDING", timestamp], + ["PREPARING", timestamp], + ], + ), + } + ) .where(kernels.c.session_id == session_id) ) await conn.execute(query) @@ -65,43 +67,42 @@ async def test_sql_json_merge__deeper_object(session_info): async def test_sql_json_merge__append_values(session_info): session_id, conn = 
session_info timestamp = datetime.now(tzutc()).isoformat() - expected = { - "kernel": { - "session": { - "PENDING": timestamp, - "PREPARING": timestamp, - "TERMINATED": timestamp, - "TERMINATING": timestamp, - }, - }, - } + expected = [ + ["PENDING", timestamp], + ["PREPARING", timestamp], + ["TERMINATED", timestamp], + ["TERMINATING", timestamp], + ] + query = ( kernels.update() - .values({ - "status_history": sql_json_merge( - kernels.c.status_history, - ("kernel", "session"), - { - "PENDING": timestamp, - "PREPARING": timestamp, - }, - ), - }) + .values( + { + "status_history": sql_list_append( + kernels.c.status_history, + [ + ["PENDING", timestamp], + ["PREPARING", timestamp], + ], + ), + } + ) .where(kernels.c.session_id == session_id) ) await conn.execute(query) query = ( kernels.update() - .values({ - "status_history": sql_json_merge( - kernels.c.status_history, - ("kernel", "session"), - { - "TERMINATING": timestamp, - "TERMINATED": timestamp, - }, - ), - }) + .values( + { + "status_history": sql_list_append( + kernels.c.status_history, + [ + ["TERMINATING", timestamp], + ["TERMINATED", timestamp], + ], + ), + } + ) .where(kernels.c.session_id == session_id) ) await conn.execute(query) @@ -114,45 +115,47 @@ async def test_sql_json_merge__append_values(session_info): async def test_sql_json_merge__kernel_status_history(session_info): session_id, conn = session_info timestamp = datetime.now(tzutc()).isoformat() - expected = { - "PENDING": timestamp, - "PREPARING": timestamp, - "TERMINATING": timestamp, - "TERMINATED": timestamp, - } + expected = [ + ["PENDING", timestamp], + ["PREPARING", timestamp], + ["TERMINATING", timestamp], + ["TERMINATED", timestamp], + ] query = ( kernels.update() - .values({ - # "status_history": sqlalchemy.func.coalesce(sqlalchemy.text("'{}'::jsonb")).concat( - # sqlalchemy.func.cast( - # {"PENDING": timestamp, "PREPARING": timestamp}, - # sqlalchemy.dialects.postgresql.JSONB, - # ), - # ), - "status_history": sql_json_merge( - 
kernels.c.status_history, - (), - { - "PENDING": timestamp, - "PREPARING": timestamp, - }, - ), - }) + .values( + { + # "status_history": sqlalchemy.func.coalesce(sqlalchemy.text("'{}'::jsonb")).concat( + # sqlalchemy.func.cast( + # {"PENDING": timestamp, "PREPARING": timestamp}, + # sqlalchemy.dialects.postgresql.JSONB, + # ), + # ), + "status_history": sql_list_append( + kernels.c.status_history, + [ + ["PENDING", timestamp], + ["PREPARING", timestamp], + ], + ), + } + ) .where(kernels.c.session_id == session_id) ) await conn.execute(query) query = ( kernels.update() - .values({ - "status_history": sql_json_merge( - kernels.c.status_history, - (), - { - "TERMINATING": timestamp, - "TERMINATED": timestamp, - }, - ), - }) + .values( + { + "status_history": sql_list_append( + kernels.c.status_history, + [ + ["TERMINATING", timestamp], + ["TERMINATED", timestamp], + ], + ), + } + ) .where(kernels.c.session_id == session_id) ) await conn.execute(query) @@ -165,38 +168,39 @@ async def test_sql_json_merge__kernel_status_history(session_info): async def test_sql_json_merge__mixed_formats(session_info): session_id, conn = session_info timestamp = datetime.now(tzutc()).isoformat() - expected = { - "PENDING": timestamp, - "kernel": { - "PREPARING": timestamp, - }, - } + expected = [ + ["PENDING", timestamp], + ["PREPARING", timestamp], + ] + query = ( kernels.update() - .values({ - "status_history": sql_json_merge( - kernels.c.status_history, - (), - { - "PENDING": timestamp, - }, - ), - }) + .values( + { + "status_history": sql_list_append( + kernels.c.status_history, + [ + ["PENDING", timestamp], + ], + ), + } + ) .where(kernels.c.session_id == session_id) ) await conn.execute(query) kernel = await _select_kernel_row(conn, session_id) query = ( kernels.update() - .values({ - "status_history": sql_json_merge( - kernels.c.status_history, - ("kernel",), - { - "PREPARING": timestamp, - }, - ), - }) + .values( + { + "status_history": sql_list_append( + 
kernels.c.status_history, + [ + ["PREPARING", timestamp], + ], + ), + } + ) .where(kernels.c.session_id == session_id) ) await conn.execute(query) @@ -224,13 +228,14 @@ async def test_sql_json_merge__json_serializable_types(session_info): } query = ( kernels.update() - .values({ - "status_history": sql_json_merge( - kernels.c.status_history, - (), - expected, - ), - }) + .values( + { + "status_history": sql_list_append( + kernels.c.status_history, + expected, + ), + } + ) .where(kernels.c.session_id == session_id) ) await conn.execute(query) From f576ca59fb72c2406f0a6cad790a96113a3684fe Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 00:08:31 +0000 Subject: [PATCH 03/45] Try to fix CI --- src/ai/backend/manager/models/utils.py | 12 +-- tests/manager/models/test_utils.py | 104 +++++++++++-------------- 2 files changed, 52 insertions(+), 64 deletions(-) diff --git a/src/ai/backend/manager/models/utils.py b/src/ai/backend/manager/models/utils.py index 73eb8e73ce..324e292f20 100644 --- a/src/ai/backend/manager/models/utils.py +++ b/src/ai/backend/manager/models/utils.py @@ -452,13 +452,15 @@ def sql_json_merge( return expr -def sql_list_append(col, arr): +def sql_list_append(col, *arrs): """ - Generate an SQLAlchemy column update expression that appends an item to - the existing JSONB array. + Generate an SQLAlchemy column update expression that appends items to + the existing JSONB array from multiple arrays. 
""" - new_item_str = str(arr).replace("'", '"') - expr = col.op("||")(sa.text(f"'[{new_item_str}]'::jsonb")) + expr = col + for arr in arrs: + new_item_str = str(arr).replace("'", '"') + expr = expr.op("||")(sa.text(f"'[{new_item_str}]'::jsonb")) return expr diff --git a/tests/manager/models/test_utils.py b/tests/manager/models/test_utils.py index da76fcb133..37d63d7263 100644 --- a/tests/manager/models/test_utils.py +++ b/tests/manager/models/test_utils.py @@ -48,10 +48,8 @@ async def test_sql_json_merge__deeper_object(session_info): { "status_history": sql_list_append( kernels.c.status_history, - [ - ["PENDING", timestamp], - ["PREPARING", timestamp], - ], + ["PENDING", timestamp], + ["PREPARING", timestamp], ), } ) @@ -80,10 +78,8 @@ async def test_sql_json_merge__append_values(session_info): { "status_history": sql_list_append( kernels.c.status_history, - [ - ["PENDING", timestamp], - ["PREPARING", timestamp], - ], + ["PENDING", timestamp], + ["PREPARING", timestamp], ), } ) @@ -96,10 +92,8 @@ async def test_sql_json_merge__append_values(session_info): { "status_history": sql_list_append( kernels.c.status_history, - [ - ["TERMINATING", timestamp], - ["TERMINATED", timestamp], - ], + ["TERMINATING", timestamp], + ["TERMINATED", timestamp], ), } ) @@ -133,10 +127,8 @@ async def test_sql_json_merge__kernel_status_history(session_info): # ), "status_history": sql_list_append( kernels.c.status_history, - [ - ["PENDING", timestamp], - ["PREPARING", timestamp], - ], + ["PENDING", timestamp], + ["PREPARING", timestamp], ), } ) @@ -149,10 +141,8 @@ async def test_sql_json_merge__kernel_status_history(session_info): { "status_history": sql_list_append( kernels.c.status_history, - [ - ["TERMINATING", timestamp], - ["TERMINATED", timestamp], - ], + ["TERMINATING", timestamp], + ["TERMINATED", timestamp], ), } ) @@ -179,9 +169,7 @@ async def test_sql_json_merge__mixed_formats(session_info): { "status_history": sql_list_append( kernels.c.status_history, - [ - ["PENDING", 
timestamp], - ], + ["PENDING", timestamp], ), } ) @@ -195,9 +183,7 @@ async def test_sql_json_merge__mixed_formats(session_info): { "status_history": sql_list_append( kernels.c.status_history, - [ - ["PREPARING", timestamp], - ], + ["PREPARING", timestamp], ), } ) @@ -209,39 +195,39 @@ async def test_sql_json_merge__mixed_formats(session_info): assert kernel.status_history == expected -@pytest.mark.asyncio -async def test_sql_json_merge__json_serializable_types(session_info): - session_id, conn = session_info - expected = { - "boolean": True, - "integer": 10101010, - "float": 1010.1010, - "string": "10101010", - # "bytes": b"10101010", - "list": [ - 10101010, - "10101010", - ], - "dict": { - "10101010": 10101010, - }, - } - query = ( - kernels.update() - .values( - { - "status_history": sql_list_append( - kernels.c.status_history, - expected, - ), - } - ) - .where(kernels.c.session_id == session_id) - ) - await conn.execute(query) - kernel = await _select_kernel_row(conn, session_id) - assert kernel is not None - assert kernel.status_history == expected +# @pytest.mark.asyncio +# async def test_sql_json_merge__json_serializable_types(session_info): +# session_id, conn = session_info +# expected = { +# "boolean": True, +# "integer": 10101010, +# "float": 1010.1010, +# "string": "10101010", +# # "bytes": b"10101010", +# "list": [ +# 10101010, +# "10101010", +# ], +# "dict": { +# "10101010": 10101010, +# }, +# } +# query = ( +# kernels.update() +# .values( +# { +# "status_history": sql_list_append( +# kernels.c.status_history, +# expected, +# ), +# } +# ) +# .where(kernels.c.session_id == session_id) +# ) +# await conn.execute(query) +# kernel = await _select_kernel_row(conn, session_id) +# assert kernel is not None +# assert kernel.status_history == expected @pytest.mark.asyncio From 42d12edc65c73dbfb8713e48cc66b87a4f47677a Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 02:52:05 +0000 Subject: [PATCH 04/45] Fix test --- 
src/ai/backend/manager/models/kernel.py | 6 +- tests/manager/models/test_utils.py | 161 +----------------------- 2 files changed, 7 insertions(+), 160 deletions(-) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 05d14120e6..619ad973ca 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -544,9 +544,9 @@ class KernelRow(Base): # // used to prevent duplication of SessionTerminatedEvent # } # } - status_history = sa.Column("status_history", pgsql.JSONB(), nullable=True, default=sa.null()) - callback_url = sa.Column("callback_url", URLColumn, nullable=True, default=sa.null()) - startup_command = sa.Column("startup_command", sa.Text, nullable=True) + status_history = (sa.Column("status_history", pgsql.JSONB(), nullable=False, default=[]),) + callback_url = (sa.Column("callback_url", URLColumn, nullable=True, default=sa.null()),) + startup_command = (sa.Column("startup_command", sa.Text, nullable=True),) result = sa.Column( "result", EnumType(SessionResult), diff --git a/tests/manager/models/test_utils.py b/tests/manager/models/test_utils.py index 37d63d7263..5fdbe79b24 100644 --- a/tests/manager/models/test_utils.py +++ b/tests/manager/models/test_utils.py @@ -1,6 +1,6 @@ import uuid from datetime import datetime -from typing import Any, Dict, Optional, Union +from typing import Union import pytest import sqlalchemy @@ -27,86 +27,14 @@ async def _select_kernel_row( @pytest.mark.asyncio async def test_sql_json_merge__default(session_info): session_id, conn = session_info - expected: Optional[Dict[str, Any]] = None + expected: list[list[str, str]] = [] kernel = await _select_kernel_row(conn, session_id) assert kernel is not None assert kernel.status_history == expected @pytest.mark.asyncio -async def test_sql_json_merge__deeper_object(session_info): - session_id, conn = session_info - timestamp = datetime.now(tzutc()).isoformat() - expected = [ - ["PENDING", timestamp], - 
["PREPARING", timestamp], - ] - - query = ( - kernels.update() - .values( - { - "status_history": sql_list_append( - kernels.c.status_history, - ["PENDING", timestamp], - ["PREPARING", timestamp], - ), - } - ) - .where(kernels.c.session_id == session_id) - ) - await conn.execute(query) - kernel = await _select_kernel_row(conn, session_id) - assert kernel is not None - assert kernel.status_history == expected - - -@pytest.mark.asyncio -async def test_sql_json_merge__append_values(session_info): - session_id, conn = session_info - timestamp = datetime.now(tzutc()).isoformat() - expected = [ - ["PENDING", timestamp], - ["PREPARING", timestamp], - ["TERMINATED", timestamp], - ["TERMINATING", timestamp], - ] - - query = ( - kernels.update() - .values( - { - "status_history": sql_list_append( - kernels.c.status_history, - ["PENDING", timestamp], - ["PREPARING", timestamp], - ), - } - ) - .where(kernels.c.session_id == session_id) - ) - await conn.execute(query) - query = ( - kernels.update() - .values( - { - "status_history": sql_list_append( - kernels.c.status_history, - ["TERMINATING", timestamp], - ["TERMINATED", timestamp], - ), - } - ) - .where(kernels.c.session_id == session_id) - ) - await conn.execute(query) - kernel = await _select_kernel_row(conn, session_id) - assert kernel is not None - assert kernel.status_history == expected - - -@pytest.mark.asyncio -async def test_sql_json_merge__kernel_status_history(session_info): +async def test_sql_list_append(session_info): session_id, conn = session_info timestamp = datetime.now(tzutc()).isoformat() expected = [ @@ -115,16 +43,11 @@ async def test_sql_json_merge__kernel_status_history(session_info): ["TERMINATING", timestamp], ["TERMINATED", timestamp], ] + query = ( kernels.update() .values( { - # "status_history": sqlalchemy.func.coalesce(sqlalchemy.text("'{}'::jsonb")).concat( - # sqlalchemy.func.cast( - # {"PENDING": timestamp, "PREPARING": timestamp}, - # sqlalchemy.dialects.postgresql.JSONB, - # ), - # ), 
"status_history": sql_list_append( kernels.c.status_history, ["PENDING", timestamp], @@ -154,82 +77,6 @@ async def test_sql_json_merge__kernel_status_history(session_info): assert kernel.status_history == expected -@pytest.mark.asyncio -async def test_sql_json_merge__mixed_formats(session_info): - session_id, conn = session_info - timestamp = datetime.now(tzutc()).isoformat() - expected = [ - ["PENDING", timestamp], - ["PREPARING", timestamp], - ] - - query = ( - kernels.update() - .values( - { - "status_history": sql_list_append( - kernels.c.status_history, - ["PENDING", timestamp], - ), - } - ) - .where(kernels.c.session_id == session_id) - ) - await conn.execute(query) - kernel = await _select_kernel_row(conn, session_id) - query = ( - kernels.update() - .values( - { - "status_history": sql_list_append( - kernels.c.status_history, - ["PREPARING", timestamp], - ), - } - ) - .where(kernels.c.session_id == session_id) - ) - await conn.execute(query) - kernel = await _select_kernel_row(conn, session_id) - assert kernel is not None - assert kernel.status_history == expected - - -# @pytest.mark.asyncio -# async def test_sql_json_merge__json_serializable_types(session_info): -# session_id, conn = session_info -# expected = { -# "boolean": True, -# "integer": 10101010, -# "float": 1010.1010, -# "string": "10101010", -# # "bytes": b"10101010", -# "list": [ -# 10101010, -# "10101010", -# ], -# "dict": { -# "10101010": 10101010, -# }, -# } -# query = ( -# kernels.update() -# .values( -# { -# "status_history": sql_list_append( -# kernels.c.status_history, -# expected, -# ), -# } -# ) -# .where(kernels.c.session_id == session_id) -# ) -# await conn.execute(query) -# kernel = await _select_kernel_row(conn, session_id) -# assert kernel is not None -# assert kernel.status_history == expected - - @pytest.mark.asyncio async def test_agg_to_str(session_info): session_id, conn = session_info From 49338b6232c664c32a02fba3777bc5dbb2b4af2e Mon Sep 17 00:00:00 2001 From: Gyubong Lee 
Date: Mon, 30 Oct 2023 02:57:42 +0000 Subject: [PATCH 05/45] Rename `sql_list_append` -> `sql_append_lists_to_list` --- src/ai/backend/manager/models/kernel.py | 6 ++--- src/ai/backend/manager/models/session.py | 6 ++--- src/ai/backend/manager/models/utils.py | 6 ++--- src/ai/backend/manager/models/vfolder.py | 4 +-- src/ai/backend/manager/registry.py | 17 ++++++------ .../backend/manager/scheduler/dispatcher.py | 27 +++++++++++-------- tests/manager/models/test_utils.py | 8 +++--- 7 files changed, 39 insertions(+), 35 deletions(-) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 619ad973ca..ead447de3b 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -84,7 +84,7 @@ ExtendedAsyncSAEngine, JSONCoalesceExpr, execute_with_retry, - sql_list_append, + sql_append_lists_to_list, ) if TYPE_CHECKING: @@ -729,7 +729,7 @@ async def set_kernel_status( data = { "status": status, "status_changed": now, - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( KernelRow.status_history, [status.name, now.isoformat()] ), } @@ -776,7 +776,7 @@ async def _update() -> bool: if update_data is None: update_values = { "status": new_status, - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( KernelRow.status_history, [new_status.name, now.isoformat()] ), } diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 8a135bce84..f3e2ba9789 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -80,7 +80,7 @@ JSONCoalesceExpr, agg_to_array, execute_with_retry, - sql_list_append, + sql_append_lists_to_list, ) if TYPE_CHECKING: @@ -809,7 +809,7 @@ async def _check_and_update() -> SessionStatus | None: update_values = { "status": determined_status, - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( 
SessionRow.status_history, [determined_status.name, now.isoformat()] ), } @@ -911,7 +911,7 @@ async def set_session_status( now = status_changed_at data = { "status": status, - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( SessionRow.status_history, [status.name, datetime.now(tzutc()).isoformat()] ), } diff --git a/src/ai/backend/manager/models/utils.py b/src/ai/backend/manager/models/utils.py index 324e292f20..672b4f7972 100644 --- a/src/ai/backend/manager/models/utils.py +++ b/src/ai/backend/manager/models/utils.py @@ -452,10 +452,10 @@ def sql_json_merge( return expr -def sql_list_append(col, *arrs): +def sql_append_lists_to_list(col, *arrs): """ - Generate an SQLAlchemy column update expression that appends items to - the existing JSONB array from multiple arrays. + Generate an SQLAlchemy column update expression that append arrays to + the existing JSONB array. """ expr = col for arr in arrs: diff --git a/src/ai/backend/manager/models/vfolder.py b/src/ai/backend/manager/models/vfolder.py index 33925b1c4f..51b9540b9f 100644 --- a/src/ai/backend/manager/models/vfolder.py +++ b/src/ai/backend/manager/models/vfolder.py @@ -104,7 +104,7 @@ from .rbac.exceptions import InvalidScope, NotEnoughPermission from .session import DEAD_SESSION_STATUSES, SessionRow from .user import UserRole, UserRow -from .utils import ExtendedAsyncSAEngine, execute_with_retry, sql_list_append +from .utils import ExtendedAsyncSAEngine, execute_with_retry, sql_append_lists_to_list if TYPE_CHECKING: from ..api.context import BackgroundTaskManager @@ -1466,7 +1466,7 @@ async def _update() -> None: sa.update(vfolders) .values( status=update_status, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( vfolders.c.status_history, [ update_status.name, diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 29b617f1b0..83f88e59eb 100644 --- a/src/ai/backend/manager/registry.py +++ 
b/src/ai/backend/manager/registry.py @@ -179,8 +179,8 @@ is_db_retry_error, reenter_txn, reenter_txn_session, + sql_append_lists_to_list, sql_json_merge, - sql_list_append, ) from .types import UserScope @@ -1597,7 +1597,7 @@ async def finalize_running( "stdin_port": created_info["stdin_port"], "stdout_port": created_info["stdout_port"], "service_ports": service_ports, - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( KernelRow.status_history, [new_status.name, datetime.now(tzutc()).isoformat()] ), } @@ -1786,7 +1786,6 @@ async def _update_kernel() -> None: log.warning("_create_kernels_in_one_agent(s:{}) cancelled", scheduled_session.id) except Exception as e: ex = e - err_info = convert_to_status_data(ex, self.debug) # The agent has already cancelled or issued the destruction lifecycle event # for this batch of kernels. @@ -1804,7 +1803,7 @@ async def _update_failure() -> None: status_info=f"other-error ({ex!r})", status_changed=now, terminated_at=now, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( KernelRow.status_history, [ KernelStatus.ERROR.name, @@ -2413,7 +2412,7 @@ async def _update() -> None: "status_info": reason, "status_changed": now, "terminated_at": now, - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( KernelRow.status_history, [KernelStatus.TERMINATED.name, now.isoformat()], ), @@ -2457,7 +2456,7 @@ async def _update() -> None: "kernel": {"exit_code": None}, "session": {"status": "terminating"}, }, - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( KernelRow.status_history, [KernelStatus.TERMINATING.name, now.isoformat()], ), @@ -2624,7 +2623,7 @@ async def _restarting_session() -> None: sa.update(SessionRow) .values( status=SessionStatus.RESTARTING, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( SessionRow.status_history, [ SessionStatus.RESTARTING.name, @@ -2663,7 +2662,7 @@ async def 
_restart_kernel(kernel: KernelRow) -> None: "stdin_port": kernel_info["stdin_port"], "stdout_port": kernel_info["stdout_port"], "service_ports": kernel_info.get("service_ports", []), - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( KernelRow.status_history, [KernelStatus.RUNNING.name, now.isoformat()] ), } @@ -3224,7 +3223,7 @@ async def _update_kernel() -> tuple[AccessKey, AgentId] | None: ("kernel",), {"exit_code": exit_code}, ), - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( KernelRow.status_history, [KernelStatus.TERMINATED.name, now.isoformat()] ), "terminated_at": now, diff --git a/src/ai/backend/manager/scheduler/dispatcher.py b/src/ai/backend/manager/scheduler/dispatcher.py index 8b8129ba4a..c553352fcd 100644 --- a/src/ai/backend/manager/scheduler/dispatcher.py +++ b/src/ai/backend/manager/scheduler/dispatcher.py @@ -92,7 +92,12 @@ recalc_concurrency_used, ) from ..models.utils import ExtendedAsyncSAEngine as SAEngine -from ..models.utils import execute_with_retry, sql_json_increment, sql_json_merge, sql_list_append +from ..models.utils import ( + execute_with_retry, + sql_append_lists_to_list, + sql_json_increment, + sql_json_merge, +) from .predicates import ( check_concurrency, check_dependencies, @@ -371,7 +376,7 @@ async def _apply_cancellation( status=KernelStatus.CANCELLED, status_info=reason, terminated_at=now, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( KernelRow.status_history, [ KernelStatus.CANCELLED.name, @@ -388,7 +393,7 @@ async def _apply_cancellation( status=SessionStatus.CANCELLED, status_info=reason, terminated_at=now, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( SessionRow.status_history, [ SessionStatus.CANCELLED.name, @@ -962,7 +967,7 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, status_changed=now, - status_history=sql_list_append( + 
status_history=sql_append_lists_to_list( KernelRow.status_history, [ KernelStatus.SCHEDULED.name, @@ -984,7 +989,7 @@ async def _finalize_scheduled() -> None: status=SessionStatus.SCHEDULED, status_info="scheduled", status_data={}, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( SessionRow.status_history, [ SessionStatus.SCHEDULED.name, @@ -1199,7 +1204,7 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, status_changed=now, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( KernelRow.status_history, [ KernelStatus.SCHEDULED.name, @@ -1222,7 +1227,7 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, # status_changed=now, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( SessionRow.status_history, [ SessionStatus.SCHEDULED.name, @@ -1288,7 +1293,7 @@ async def _mark_session_preparing() -> Sequence[SessionRow]: status_changed=now, status_info="", status_data={}, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( KernelRow.status_history, [ KernelStatus.PREPARING.name, @@ -1308,7 +1313,7 @@ async def _mark_session_preparing() -> Sequence[SessionRow]: # status_changed=now, status_info="", status_data={}, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( SessionRow.status_history, [ SessionStatus.PREPARING.name, @@ -1612,7 +1617,7 @@ async def _mark_session_cancelled() -> None: status_info="failed-to-start", status_data=status_data, terminated_at=now, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( KernelRow.status_history, [ KernelStatus.CANCELLED.name, @@ -1631,7 +1636,7 @@ async def _mark_session_cancelled() -> None: status_info="failed-to-start", status_data=status_data, terminated_at=now, - status_history=sql_list_append( + status_history=sql_append_lists_to_list( SessionRow.status_history, [ SessionStatus.CANCELLED.name, diff --git 
a/tests/manager/models/test_utils.py b/tests/manager/models/test_utils.py index 5fdbe79b24..d461c9d063 100644 --- a/tests/manager/models/test_utils.py +++ b/tests/manager/models/test_utils.py @@ -11,7 +11,7 @@ from ai.backend.manager.models.utils import ( agg_to_array, agg_to_str, - sql_list_append, + sql_append_lists_to_list, ) @@ -34,7 +34,7 @@ async def test_sql_json_merge__default(session_info): @pytest.mark.asyncio -async def test_sql_list_append(session_info): +async def test_sql_append_lists_to_list(session_info): session_id, conn = session_info timestamp = datetime.now(tzutc()).isoformat() expected = [ @@ -48,7 +48,7 @@ async def test_sql_list_append(session_info): kernels.update() .values( { - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( kernels.c.status_history, ["PENDING", timestamp], ["PREPARING", timestamp], @@ -62,7 +62,7 @@ async def test_sql_list_append(session_info): kernels.update() .values( { - "status_history": sql_list_append( + "status_history": sql_append_lists_to_list( kernels.c.status_history, ["TERMINATING", timestamp], ["TERMINATED", timestamp], From 19bb91dfec92285f3cf6be2f1bc89e50af8e2482 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 02:58:08 +0000 Subject: [PATCH 06/45] Add fragment --- changes/1662.fix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/1662.fix.md diff --git a/changes/1662.fix.md b/changes/1662.fix.md new file mode 100644 index 0000000000..dc32fcc559 --- /dev/null +++ b/changes/1662.fix.md @@ -0,0 +1 @@ +Replace `status_history`'s type `map` with `list` \ No newline at end of file From da24de464843dcf5038c89d6518619214d651512 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 03:01:34 +0000 Subject: [PATCH 07/45] Fix wrong comment position --- src/ai/backend/manager/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 
83f88e59eb..ca545a94ce 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1806,8 +1806,8 @@ async def _update_failure() -> None: status_history=sql_append_lists_to_list( KernelRow.status_history, [ - KernelStatus.ERROR.name, - now.isoformat(), # ["PULLING", "PREPARING"] + KernelStatus.ERROR.name, # ["PULLING", "PREPARING"] + now.isoformat(), ], ), status_data=convert_to_status_data(ex, self.debug), From feec035d0a849e42c7432f74229b0b956925c506 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 03:10:16 +0000 Subject: [PATCH 08/45] Rename get_first_status_history_record function --- src/ai/backend/client/cli/session/lifecycle.py | 6 ++++-- src/ai/backend/common/utils.py | 2 +- src/ai/backend/manager/models/kernel.py | 4 ++-- src/ai/backend/manager/models/session.py | 6 +++--- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/ai/backend/client/cli/session/lifecycle.py b/src/ai/backend/client/cli/session/lifecycle.py index c51191ee79..26cd80f430 100644 --- a/src/ai/backend/client/cli/session/lifecycle.py +++ b/src/ai/backend/client/cli/session/lifecycle.py @@ -793,14 +793,16 @@ def status_history(session_id): try: status_history = kernel.get_status_history().get("result") print_info(f"status_history: {status_history}") - if (preparing := get_first_status_history(status_history, "PREPARING")) is None: + if (preparing := get_first_status_history_record(status_history, "PREPARING")) is None: result = { "result": { "seconds": 0, "microseconds": 0, }, } - elif (terminated := get_first_status_history(status_history, "TERMINATED")) is None: + elif ( + terminated := get_first_status_history_record(status_history, "TERMINATED") + ) is None: alloc_time_until_now: timedelta = datetime.now(tzutc()) - isoparse(preparing) result = { "result": { diff --git a/src/ai/backend/common/utils.py b/src/ai/backend/common/utils.py index 627757861f..f489bfbb54 100644 --- a/src/ai/backend/common/utils.py +++ 
b/src/ai/backend/common/utils.py @@ -406,7 +406,7 @@ async def umount( return True -def get_first_status_history(arr: list[list[str]], status: str) -> list[str] | None: +def get_first_status_history_record(arr: list[list[str]], status: str) -> list[str] | None: for item in arr: if item[0] == status: return item diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index ead447de3b..1c77cc3584 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -46,7 +46,7 @@ SessionTypes, VFolderMount, ) -from ai.backend.common.utils import get_first_status_history +from ai.backend.common.utils import get_first_status_history_record from ..api.exceptions import ( BackendError, @@ -921,7 +921,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: hide_agents = ctx.local_config["manager"]["hide-agents"] status_history = row["status_history"] or [] - scheduled_at = get_first_status_history(status_history, KernelStatus.SCHEDULED.name) + scheduled_at = get_first_status_history_record(status_history, KernelStatus.SCHEDULED.name) return { # identity diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index f3e2ba9789..abdc0d8bd5 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -38,7 +38,7 @@ SessionTypes, VFolderMount, ) -from ai.backend.common.utils import get_first_status_history +from ai.backend.common.utils import get_first_status_history_record from ..api.exceptions import ( AgentError, @@ -730,7 +730,7 @@ def status_changed(self) -> Optional[datetime]: if self.status_history is None: return None try: - first = get_first_status_history(self.status_history, self.status.name) + first = get_first_status_history_record(self.status_history, self.status.name) assert first is not None return datetime.fromisoformat(first[1]) @@ -1321,7 +1321,7 @@ def parse_row(cls, ctx: 
GraphQueryContext, row: Row) -> Mapping[str, Any]: group_name = getattr(row, "group_name") row = row.SessionRow status_history = row.status_history or [] - first = get_first_status_history(status_history, SessionStatus.SCHEDULED.name) + first = get_first_status_history_record(status_history, SessionStatus.SCHEDULED.name) raw_scheduled_at = first[1] if first is not None else None return { From 380908abf0c10a75ac6617882e4145d41dfe81dd Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 03:10:50 +0000 Subject: [PATCH 09/45] Fix wrong implementation of session_history of vfolder --- src/ai/backend/manager/models/vfolder.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ai/backend/manager/models/vfolder.py b/src/ai/backend/manager/models/vfolder.py index 51b9540b9f..8ca3fca982 100644 --- a/src/ai/backend/manager/models/vfolder.py +++ b/src/ai/backend/manager/models/vfolder.py @@ -104,7 +104,7 @@ from .rbac.exceptions import InvalidScope, NotEnoughPermission from .session import DEAD_SESSION_STATUSES, SessionRow from .user import UserRole, UserRow -from .utils import ExtendedAsyncSAEngine, execute_with_retry, sql_append_lists_to_list +from .utils import ExtendedAsyncSAEngine, execute_with_retry, sql_json_merge if TYPE_CHECKING: from ..api.context import BackgroundTaskManager @@ -1466,12 +1466,12 @@ async def _update() -> None: sa.update(vfolders) .values( status=update_status, - status_history=sql_append_lists_to_list( + status_history=sql_json_merge( vfolders.c.status_history, - [ - update_status.name, - datetime.now(tzutc()).isoformat(), - ], + (), + { + update_status.name: datetime.now(tzutc()).isoformat(), + }, ), ) .where(cond) From 1f521a4e16999891231772fe11ba41b9772f266c Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 03:45:54 +0000 Subject: [PATCH 10/45] Add default value to session status history --- src/ai/backend/manager/api/resource.py | 2 +- src/ai/backend/manager/models/session.py | 4 +--- 
2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ai/backend/manager/api/resource.py b/src/ai/backend/manager/api/resource.py index 2d7b1ba57d..3e5d4e4035 100644 --- a/src/ai/backend/manager/api/resource.py +++ b/src/ai/backend/manager/api/resource.py @@ -465,7 +465,7 @@ async def _pipe_builder(r: Redis) -> RedisPipeline: "status": row["status"].name, "status_info": row["status_info"], "status_changed": str(row["status_changed"]), - "status_history": row["status_history"] or [], + "status_history": row["status_history"], "cluster_mode": row["cluster_mode"], } if group_id not in objs_per_group: diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index abdc0d8bd5..01561f571b 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -680,7 +680,7 @@ class SessionRow(Base): # // used to prevent duplication of SessionTerminatedEvent # } # } - status_history = sa.Column("status_history", pgsql.JSONB(), nullable=True, default=sa.null()) + status_history = sa.Column("status_history", pgsql.JSONB(), nullable=False, default=[]) callback_url = sa.Column("callback_url", URLColumn, nullable=True, default=sa.null()) startup_command = sa.Column("startup_command", sa.Text, nullable=True) @@ -727,8 +727,6 @@ def main_kernel(self) -> KernelRow: @property def status_changed(self) -> Optional[datetime]: - if self.status_history is None: - return None try: first = get_first_status_history_record(self.status_history, self.status.name) assert first is not None From 23730057b7ca9f06a507e883abbcd5c020801c19 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 03:55:12 +0000 Subject: [PATCH 11/45] Code organization --- src/ai/backend/manager/models/kernel.py | 3 +-- src/ai/backend/manager/models/session.py | 18 ++++++------------ 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py 
index 1c77cc3584..6148002184 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -919,8 +919,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: hide_agents = False else: hide_agents = ctx.local_config["manager"]["hide-agents"] - - status_history = row["status_history"] or [] + status_history = row["status_history"] scheduled_at = get_first_status_history_record(status_history, KernelStatus.SCHEDULED.name) return { diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 01561f571b..8572bba01e 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -727,13 +727,9 @@ def main_kernel(self) -> KernelRow: @property def status_changed(self) -> Optional[datetime]: - try: - first = get_first_status_history_record(self.status_history, self.status.name) - assert first is not None - - return datetime.fromisoformat(first[1]) - except KeyError: - return None + if first_record := get_first_status_history_record(self.status_history, self.status.name): + return datetime.fromisoformat(first_record[1]) + return None @property def resource_opts(self) -> dict[str, Any]: @@ -1318,9 +1314,9 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: full_name = getattr(row, "full_name") group_name = getattr(row, "group_name") row = row.SessionRow - status_history = row.status_history or [] + status_history = row.status_history first = get_first_status_history_record(status_history, SessionStatus.SCHEDULED.name) - raw_scheduled_at = first[1] if first is not None else None + scheduled_at = datetime.fromisoformat(first[1]) if first is not None else None return { # identity @@ -1357,9 +1353,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: "created_at": row.created_at, "terminated_at": row.terminated_at, "starts_at": row.starts_at, - "scheduled_at": ( - 
datetime.fromisoformat(raw_scheduled_at) if raw_scheduled_at is not None else None - ), + "scheduled_at": scheduled_at, "startup_command": row.startup_command, "result": row.result.name, # resources From 8f7a6d6eb7992b545e2b851e149e228447b22908 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 04:30:16 +0000 Subject: [PATCH 12/45] Rename get_first_occurrence_time function --- src/ai/backend/client/cli/session/lifecycle.py | 8 +++----- src/ai/backend/common/utils.py | 6 +++--- src/ai/backend/manager/models/kernel.py | 4 ++-- src/ai/backend/manager/models/session.py | 9 ++++----- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/ai/backend/client/cli/session/lifecycle.py b/src/ai/backend/client/cli/session/lifecycle.py index 26cd80f430..b8ae5ac9e5 100644 --- a/src/ai/backend/client/cli/session/lifecycle.py +++ b/src/ai/backend/client/cli/session/lifecycle.py @@ -27,7 +27,7 @@ from ai.backend.cli.types import ExitCode, Undefined, undefined from ai.backend.common.arch import DEFAULT_IMAGE_ARCH from ai.backend.common.types import ClusterMode -from ai.backend.common.utils import get_first_status_history +from ai.backend.common.utils import get_first_occurrence_time from ...compat import asyncio_run from ...exceptions import BackendAPIError @@ -793,16 +793,14 @@ def status_history(session_id): try: status_history = kernel.get_status_history().get("result") print_info(f"status_history: {status_history}") - if (preparing := get_first_status_history_record(status_history, "PREPARING")) is None: + if (preparing := get_first_occurrence_time(status_history, "PREPARING")) is None: result = { "result": { "seconds": 0, "microseconds": 0, }, } - elif ( - terminated := get_first_status_history_record(status_history, "TERMINATED") - ) is None: + elif (terminated := get_first_occurrence_time(status_history, "TERMINATED")) is None: alloc_time_until_now: timedelta = datetime.now(tzutc()) - isoparse(preparing) result = { "result": { diff --git 
a/src/ai/backend/common/utils.py b/src/ai/backend/common/utils.py index f489bfbb54..759f5c93fd 100644 --- a/src/ai/backend/common/utils.py +++ b/src/ai/backend/common/utils.py @@ -406,8 +406,8 @@ async def umount( return True -def get_first_status_history_record(arr: list[list[str]], status: str) -> list[str] | None: - for item in arr: +def get_first_occurrence_time(status_history: list[list[str]], status: str) -> str | None: + for item in status_history: if item[0] == status: - return item + return item[1] return None diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 6148002184..173d4ddb59 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -46,7 +46,7 @@ SessionTypes, VFolderMount, ) -from ai.backend.common.utils import get_first_status_history_record +from ai.backend.common.utils import get_first_occurrence_time from ..api.exceptions import ( BackendError, @@ -920,7 +920,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: else: hide_agents = ctx.local_config["manager"]["hide-agents"] status_history = row["status_history"] - scheduled_at = get_first_status_history_record(status_history, KernelStatus.SCHEDULED.name) + scheduled_at = get_first_occurrence_time(status_history, KernelStatus.SCHEDULED.name) return { # identity diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 8572bba01e..8a290ece8b 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -38,7 +38,7 @@ SessionTypes, VFolderMount, ) -from ai.backend.common.utils import get_first_status_history_record +from ai.backend.common.utils import get_first_occurrence_time from ..api.exceptions import ( AgentError, @@ -727,8 +727,8 @@ def main_kernel(self) -> KernelRow: @property def status_changed(self) -> Optional[datetime]: - if first_record := 
get_first_status_history_record(self.status_history, self.status.name): - return datetime.fromisoformat(first_record[1]) + if first := get_first_occurrence_time(self.status_history, self.status.name): + return datetime.fromisoformat(first) return None @property @@ -1315,8 +1315,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: group_name = getattr(row, "group_name") row = row.SessionRow status_history = row.status_history - first = get_first_status_history_record(status_history, SessionStatus.SCHEDULED.name) - scheduled_at = datetime.fromisoformat(first[1]) if first is not None else None + scheduled_at = get_first_occurrence_time(status_history, SessionStatus.SCHEDULED.name) return { # identity From 9b0e4be487b8ff63ca992e60786c2baec292bd6c Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 05:07:08 +0000 Subject: [PATCH 13/45] Add comments for status_history column --- src/ai/backend/manager/models/kernel.py | 7 +++++++ src/ai/backend/manager/models/session.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 173d4ddb59..b53bbef265 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -545,6 +545,13 @@ class KernelRow(Base): # } # } status_history = (sa.Column("status_history", pgsql.JSONB(), nullable=False, default=[]),) + # status_history records all status changes + # e.g) + # [ + # ["PENDING", "2022-10-22T10:22:30"], + # ["SCHEDULED", "2022-10-22T11:40:30"], + # ["PREPARING", "2022-10-25T10:22:30"] + # ] callback_url = (sa.Column("callback_url", URLColumn, nullable=True, default=sa.null()),) startup_command = (sa.Column("startup_command", sa.Text, nullable=True),) result = sa.Column( diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 8a290ece8b..6fedef1b66 100644 --- a/src/ai/backend/manager/models/session.py +++ 
b/src/ai/backend/manager/models/session.py @@ -681,6 +681,13 @@ class SessionRow(Base): # } # } status_history = sa.Column("status_history", pgsql.JSONB(), nullable=False, default=[]) + # status_history records all status changes + # e.g) + # [ + # ["PENDING", "2022-10-22T10:22:30"], + # ["SCHEDULED", "2022-10-22T11:40:30"], + # ["PREPARING", "2022-10-25T10:22:30"] + # ] callback_url = sa.Column("callback_url", URLColumn, nullable=True, default=sa.null()) startup_command = sa.Column("startup_command", sa.Text, nullable=True) From 2cf840e6afbfdd4715a7ceb40882077a9be59574 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 05:39:42 +0000 Subject: [PATCH 14/45] Try to fix _fetch_hanging_sessions --- src/ai/backend/manager/server.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 84f7f7fdc5..6071c5dbf2 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -50,7 +50,7 @@ from ai.backend.common.plugin.hook import ALL_COMPLETED, PASSED, HookPluginContext from ai.backend.common.plugin.monitor import INCREMENT from ai.backend.common.types import AgentSelectionStrategy, LogSeverity -from ai.backend.common.utils import env_info +from ai.backend.common.utils import env_info, get_first_occurrence_time from . 
import __version__ from .agent_cache import AgentRPCCache @@ -575,9 +575,10 @@ async def _fetch_hanging_sessions( .where( ( datetime.now(tz=tzutc()) - - SessionRow.status_history[status.name].astext.cast( - sa.types.DateTime(timezone=True) - ) + - sa.func.to_timestamp( + get_first_occurrence_time(SessionRow.status_history, status.name), + "YYYY-MM-DD HH24:MI:SS.US", + ).cast(sa.types.DateTime(timezone=True)) ) > threshold ) @@ -632,6 +633,9 @@ async def _force_terminate_hanging_sessions( heuristic_interval_weight = 0.4 # NOTE: Shorter than a half(0.5) max_interval = timedelta(hours=1).total_seconds() threshold: relativedelta | timedelta + + print("session_hang_tolerance!!", session_hang_tolerance) + for status, threshold in session_hang_tolerance["threshold"].items(): try: session_status = SessionStatus[status] From 185164b56b3f207968b49216c051b99416a46ca3 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 07:36:53 +0000 Subject: [PATCH 15/45] Fix broken _fetch_hanging_sessions --- src/ai/backend/manager/server.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 6071c5dbf2..c566deb1b7 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -50,7 +50,7 @@ from ai.backend.common.plugin.hook import ALL_COMPLETED, PASSED, HookPluginContext from ai.backend.common.plugin.monitor import INCREMENT from ai.backend.common.types import AgentSelectionStrategy, LogSeverity -from ai.backend.common.utils import env_info, get_first_occurrence_time +from ai.backend.common.utils import env_info from . 
import __version__ from .agent_cache import AgentRPCCache @@ -555,7 +555,6 @@ async def hanging_session_scanner_ctx(root_ctx: RootContext) -> AsyncIterator[No import sqlalchemy as sa from dateutil.relativedelta import relativedelta - from dateutil.tz import tzutc from sqlalchemy.orm import load_only, noload from .config import session_hang_tolerance_iv @@ -572,21 +571,23 @@ async def _fetch_hanging_sessions( query = ( sa.select(SessionRow) .where(SessionRow.status == status) - .where( - ( - datetime.now(tz=tzutc()) - - sa.func.to_timestamp( - get_first_occurrence_time(SessionRow.status_history, status.name), - "YYYY-MM-DD HH24:MI:SS.US", - ).cast(sa.types.DateTime(timezone=True)) - ) - > threshold - ) + .where(sa.text(""" + EXISTS ( + SELECT 1 + FROM jsonb_array_elements(status_history) AS session_history + WHERE + session_history->>0 = :status_name AND + ( + now() - CAST(session_history->>1 AS TIMESTAMP WITH TIME ZONE) + ) > :threshold + ) + """).bindparams(status_name=status.name, threshold=threshold)) .options( noload("*"), load_only(SessionRow.id, SessionRow.name, SessionRow.status, SessionRow.access_key), ) ) + async with db.begin_readonly() as conn: result = await conn.execute(query) return result.fetchall() @@ -634,8 +635,6 @@ async def _force_terminate_hanging_sessions( max_interval = timedelta(hours=1).total_seconds() threshold: relativedelta | timedelta - print("session_hang_tolerance!!", session_hang_tolerance) - for status, threshold in session_hang_tolerance["threshold"].items(): try: session_status = SessionStatus[status] From f92a54a8a5013d02b5a795f45084873460f2c513 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 07:40:00 +0000 Subject: [PATCH 16/45] Remove useless newline --- src/ai/backend/manager/server.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index c566deb1b7..6cff02ec43 100644 --- a/src/ai/backend/manager/server.py +++ 
b/src/ai/backend/manager/server.py @@ -587,7 +587,6 @@ async def _fetch_hanging_sessions( load_only(SessionRow.id, SessionRow.name, SessionRow.status, SessionRow.access_key), ) ) - async with db.begin_readonly() as conn: result = await conn.execute(query) return result.fetchall() @@ -634,7 +633,6 @@ async def _force_terminate_hanging_sessions( heuristic_interval_weight = 0.4 # NOTE: Shorter than a half(0.5) max_interval = timedelta(hours=1).total_seconds() threshold: relativedelta | timedelta - for status, threshold in session_hang_tolerance["threshold"].items(): try: session_status = SessionStatus[status] From 9a6e0ad8093a1123d062c1e7b87037b125645954 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 30 Oct 2023 08:25:37 +0000 Subject: [PATCH 17/45] Add migration script --- ...replace_status_history_s_type_map_with_.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py diff --git a/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py b/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py new file mode 100644 index 0000000000..ca5d2c72fb --- /dev/null +++ b/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py @@ -0,0 +1,82 @@ +"""Replace status_history's type map with list + +Revision ID: 37fb8b8e98e5 +Revises: 8c74e7df26f8 +Create Date: 2023-10-30 08:02:27.845105 + +""" +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "37fb8b8e98e5" +down_revision = "8c74e7df26f8" +branch_labels = None +depends_on = None + + +def upgrade(): + op.execute(""" + WITH data AS ( + SELECT id, + (jsonb_each(status_history)).key, + (jsonb_each(status_history)).value + FROM kernels + ) + UPDATE kernels + SET status_history = ( + SELECT jsonb_agg( + jsonb_build_array(key, value) + ) + FROM data + WHERE data.id = kernels.id + ); + """) + + op.execute(""" + WITH data AS ( + SELECT id, + (jsonb_each(status_history)).key, + (jsonb_each(status_history)).value + FROM sessions + ) + UPDATE sessions + SET status_history = ( + SELECT jsonb_agg( + jsonb_build_array(key, value) + ) + FROM data + WHERE data.id = sessions.id + ); + """) + + +def downgrade(): + op.execute(""" + WITH data AS ( + SELECT id, jsonb_object_agg( + elem->>0, elem->>1 + ) AS new_status_history + FROM kernels, + jsonb_array_elements(status_history) AS elem + GROUP BY id + ) + UPDATE kernels + SET status_history = data.new_status_history + FROM data + WHERE data.id = kernels.id; + """) + + op.execute(""" + WITH data AS ( + SELECT id, jsonb_object_agg( + elem->>0, elem->>1 + ) AS new_status_history + FROM sessions, + jsonb_array_elements(status_history) AS elem + GROUP BY id + ) + UPDATE sessions + SET status_history = data.new_status_history + FROM data + WHERE data.id = sessions.id; + """) From f0bf3f553b2adde38774414a2b2452fa9972fd4f Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Thu, 2 Nov 2023 02:51:17 +0000 Subject: [PATCH 18/45] Change status history format --- .../backend/client/cli/session/lifecycle.py | 6 +- src/ai/backend/common/utils.py | 14 ++- src/ai/backend/manager/api/session.py | 30 +++++++ ...replace_status_history_s_type_map_with_.py | 25 ++++-- src/ai/backend/manager/models/kernel.py | 12 +-- src/ai/backend/manager/models/session.py | 25 ++++-- src/ai/backend/manager/models/utils.py | 10 +-- src/ai/backend/manager/registry.py | 65 ++++++++------ .../backend/manager/scheduler/dispatcher.py | 90 ++++++++----------- 
src/ai/backend/manager/server.py | 8 +- tests/manager/models/test_utils.py | 56 ------------ 11 files changed, 171 insertions(+), 170 deletions(-) diff --git a/src/ai/backend/client/cli/session/lifecycle.py b/src/ai/backend/client/cli/session/lifecycle.py index b8ae5ac9e5..00e6e429cd 100644 --- a/src/ai/backend/client/cli/session/lifecycle.py +++ b/src/ai/backend/client/cli/session/lifecycle.py @@ -24,7 +24,8 @@ from ai.backend.cli.main import main from ai.backend.cli.params import CommaSeparatedListType, OptionalType -from ai.backend.cli.types import ExitCode, Undefined, undefined +from ai.backend.cli.types import CliContextInfo, ExitCode, Undefined, undefined +from ai.backend.client.cli.extensions import pass_ctx_obj from ai.backend.common.arch import DEFAULT_IMAGE_ARCH from ai.backend.common.types import ClusterMode from ai.backend.common.utils import get_first_occurrence_time @@ -779,8 +780,9 @@ def logs(session_id, kernel: str | None): @session.command("status-history") +@pass_ctx_obj @click.argument("session_id", metavar="SESSID") -def status_history(session_id): +def status_history(ctx: CliContextInfo, session_id): """ Shows the status transition history of the compute session. diff --git a/src/ai/backend/common/utils.py b/src/ai/backend/common/utils.py index 759f5c93fd..863400b952 100644 --- a/src/ai/backend/common/utils.py +++ b/src/ai/backend/common/utils.py @@ -406,8 +406,14 @@ async def umount( return True -def get_first_occurrence_time(status_history: list[list[str]], status: str) -> str | None: - for item in status_history: - if item[0] == status: - return item[1] +def get_first_occurrence_time( + status_history_records: list[dict[str, str]], status: str +) -> str | None: + """ + Get the first occurrence time of the given status from the status history records. 
+ """ + + for status_history in status_history_records: + if status_history["status"] == status: + return status_history["timestamp"] return None diff --git a/src/ai/backend/manager/api/session.py b/src/ai/backend/manager/api/session.py index 9446381372..b68814c6b3 100644 --- a/src/ai/backend/manager/api/session.py +++ b/src/ai/backend/manager/api/session.py @@ -2186,6 +2186,35 @@ async def get_container_logs( return web.json_response(resp, status=200) +@server_status_required(READ_ALLOWED) +@auth_required +@check_api_params( + t.Dict({ + tx.AliasedKey(["session_name", "sessionName", "task_id", "taskId"]) >> "kernel_id": tx.UUID, + t.Key("owner_access_key", default=None): t.Null | t.String, + }) +) +async def get_status_history(request: web.Request, params: Any) -> web.Response: + root_ctx: RootContext = request.app["_root.context"] + session_name: str = request.match_info["session_name"] + requester_access_key, owner_access_key = await get_access_key_scopes(request, params) + log.info( + "GET_STATUS_HISTORY (ak:{}/{}, s:{})", requester_access_key, owner_access_key, session_name + ) + resp: dict[str, Mapping] = {"result": {}} + + async with root_ctx.db.begin_readonly_session() as db_sess: + compute_session = await SessionRow.get_session( + db_sess, + session_name, + owner_access_key, + kernel_loading_strategy=KernelLoadingStrategy.MAIN_KERNEL_ONLY, + ) + resp["result"] = compute_session.status_history + + return web.json_response(resp, status=200) + + @server_status_required(READ_ALLOWED) @auth_required @check_api_params( @@ -2321,6 +2350,7 @@ def create_app( app.router.add_route("GET", "/{session_name}/direct-access-info", get_direct_access_info) ) cors.add(app.router.add_route("GET", "/{session_name}/logs", get_container_logs)) + cors.add(app.router.add_route("GET", "/{session_name}/status-history", get_status_history)) cors.add(app.router.add_route("POST", "/{session_name}/rename", rename_session)) cors.add(app.router.add_route("POST", 
"/{session_name}/interrupt", interrupt)) cors.add(app.router.add_route("POST", "/{session_name}/complete", complete)) diff --git a/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py b/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py index ca5d2c72fb..16c8e63007 100644 --- a/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py +++ b/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py @@ -5,6 +5,7 @@ Create Date: 2023-10-30 08:02:27.845105 """ + from alembic import op # revision identifiers, used by Alembic. @@ -15,7 +16,8 @@ def upgrade(): - op.execute(""" + op.execute( + """ WITH data AS ( SELECT id, (jsonb_each(status_history)).key, @@ -30,9 +32,11 @@ def upgrade(): FROM data WHERE data.id = kernels.id ); - """) + """ + ) - op.execute(""" + op.execute( + """ WITH data AS ( SELECT id, (jsonb_each(status_history)).key, @@ -47,11 +51,13 @@ def upgrade(): FROM data WHERE data.id = sessions.id ); - """) + """ + ) def downgrade(): - op.execute(""" + op.execute( + """ WITH data AS ( SELECT id, jsonb_object_agg( elem->>0, elem->>1 @@ -64,9 +70,11 @@ def downgrade(): SET status_history = data.new_status_history FROM data WHERE data.id = kernels.id; - """) + """ + ) - op.execute(""" + op.execute( + """ WITH data AS ( SELECT id, jsonb_object_agg( elem->>0, elem->>1 @@ -79,4 +87,5 @@ def downgrade(): SET status_history = data.new_status_history FROM data WHERE data.id = sessions.id; - """) + """ + ) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index b53bbef265..df85b17201 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -84,7 +84,7 @@ ExtendedAsyncSAEngine, JSONCoalesceExpr, execute_with_retry, - sql_append_lists_to_list, + sql_append_dict_to_list, ) if TYPE_CHECKING: @@ 
-736,8 +736,9 @@ async def set_kernel_status( data = { "status": status, "status_changed": now, - "status_history": sql_append_lists_to_list( - KernelRow.status_history, [status.name, now.isoformat()] + "status_history": sql_append_dict_to_list( + KernelRow.status_history, + {"status": status.name, "timestamp": now.isoformat()}, ), } if status_data is not None: @@ -783,8 +784,9 @@ async def _update() -> bool: if update_data is None: update_values = { "status": new_status, - "status_history": sql_append_lists_to_list( - KernelRow.status_history, [new_status.name, now.isoformat()] + "status_history": sql_append_dict_to_list( + KernelRow.status_history, + {"status": new_status.name, "timestamp": now.isoformat()}, ), } else: diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 6fedef1b66..3ae9c77652 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -80,7 +80,7 @@ JSONCoalesceExpr, agg_to_array, execute_with_retry, - sql_append_lists_to_list, + sql_append_dict_to_list, ) if TYPE_CHECKING: @@ -810,8 +810,9 @@ async def _check_and_update() -> SessionStatus | None: update_values = { "status": determined_status, - "status_history": sql_append_lists_to_list( - SessionRow.status_history, [determined_status.name, now.isoformat()] + "status_history": sql_append_dict_to_list( + SessionRow.status_history, + {"status": determined_status.name, "timestamp": now.isoformat()}, ), } if determined_status in (SessionStatus.CANCELLED, SessionStatus.TERMINATED): @@ -912,8 +913,9 @@ async def set_session_status( now = status_changed_at data = { "status": status, - "status_history": sql_append_lists_to_list( - SessionRow.status_history, [status.name, datetime.now(tzutc()).isoformat()] + "status_history": sql_append_dict_to_list( + SessionRow.status_history, + {"status": status.name, "timestamp": datetime.now(tzutc()).isoformat()}, ), } if status_data is not None: @@ -1280,7 +1282,8 @@ 
class Meta: status_changed = GQLDateTime() status_info = graphene.String() status_data = graphene.JSONString() - status_history = graphene.JSONString() + status_history = graphene.JSONString() # legacy + status_history_log = graphene.JSONString() created_at = GQLDateTime() terminated_at = GQLDateTime() starts_at = GQLDateTime() @@ -1321,8 +1324,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: full_name = getattr(row, "full_name") group_name = getattr(row, "group_name") row = row.SessionRow - status_history = row.status_history - scheduled_at = get_first_occurrence_time(status_history, SessionStatus.SCHEDULED.name) + scheduled_at = get_first_occurrence_time(row.status_history, SessionStatus.SCHEDULED.name) return { # identity @@ -1355,7 +1357,8 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: "status_changed": row.status_changed, "status_info": row.status_info, "status_data": row.status_data, - "status_history": status_history, + # "status_history": ..., # filled by the legacy resolver + "status_history_log": row.status_history, "created_at": row.created_at, "terminated_at": row.terminated_at, "starts_at": row.starts_at, @@ -1439,6 +1442,10 @@ async def resolve_idle_checks(self, info: graphene.ResolveInfo) -> Mapping[str, graph_ctx: GraphQueryContext = info.context return await graph_ctx.idle_checker_host.get_idle_check_report(self.session_id) + # legacy + async def resolve_status_history(self, _info: graphene.ResolveInfo) -> Mapping[str, Any]: + return {item["status"]: item["timestamp"] for item in self.status_history_log} + _queryfilter_fieldspec: FieldSpecType = { "id": ("sessions_id", None), "type": ("sessions_session_type", enum_field_getter(SessionTypes)), diff --git a/src/ai/backend/manager/models/utils.py b/src/ai/backend/manager/models/utils.py index 672b4f7972..7093ac876c 100644 --- a/src/ai/backend/manager/models/utils.py +++ b/src/ai/backend/manager/models/utils.py @@ -452,15 +452,13 @@ def 
sql_json_merge( return expr -def sql_append_lists_to_list(col, *arrs): +def sql_append_dict_to_list(col, arg: dict): """ - Generate an SQLAlchemy column update expression that append arrays to + Generate an SQLAlchemy column update expression that appends a dictionary to the existing JSONB array. """ - expr = col - for arr in arrs: - new_item_str = str(arr).replace("'", '"') - expr = expr.op("||")(sa.text(f"'[{new_item_str}]'::jsonb")) + new_item_str = json.dumps(arg).replace("'", '"') + expr = col.op("||")(sa.text(f"'[{new_item_str}]'::jsonb")) return expr diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index ca545a94ce..0b13cc0290 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -179,7 +179,7 @@ is_db_retry_error, reenter_txn, reenter_txn_session, - sql_append_lists_to_list, + sql_append_dict_to_list, sql_json_merge, ) from .types import UserScope @@ -1007,9 +1007,10 @@ async def enqueue_session( session_data = { "id": session_id, "status": SessionStatus.PENDING, - "status_history": [ - [SessionStatus.PENDING.name, datetime.now(tzutc()).isoformat()], - ], + "status_history": { + "status": SessionStatus.PENDING.name, + "timestamp": datetime.now(tzutc()).isoformat(), + }, "creation_id": session_creation_id, "name": session_name, "session_type": session_type, @@ -1030,9 +1031,10 @@ async def enqueue_session( kernel_shared_data = { "status": KernelStatus.PENDING, - "status_history": [ - [KernelStatus.PENDING.name, datetime.now(tzutc()).isoformat()], - ], + "status_history": { + "status": KernelStatus.PENDING.name, + "timestamp": datetime.now(tzutc()).isoformat(), + }, "session_creation_id": session_creation_id, "session_id": session_id, "session_name": session_name, @@ -1597,8 +1599,9 @@ async def finalize_running( "stdin_port": created_info["stdin_port"], "stdout_port": created_info["stdout_port"], "service_ports": service_ports, - "status_history": sql_append_lists_to_list( - 
KernelRow.status_history, [new_status.name, datetime.now(tzutc()).isoformat()] + "status_history": sql_append_dict_to_list( + KernelRow.status_history, + {"status": new_status.name, "timestamp": datetime.now(tzutc()).isoformat()}, ), } @@ -1803,12 +1806,12 @@ async def _update_failure() -> None: status_info=f"other-error ({ex!r})", status_changed=now, terminated_at=now, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( KernelRow.status_history, - [ - KernelStatus.ERROR.name, # ["PULLING", "PREPARING"] - now.isoformat(), - ], + { + "status": KernelStatus.ERROR.name, + "timestamp": now.isoformat(), + }, ), status_data=convert_to_status_data(ex, self.debug), ) @@ -2412,9 +2415,12 @@ async def _update() -> None: "status_info": reason, "status_changed": now, "terminated_at": now, - "status_history": sql_append_lists_to_list( + "status_history": sql_append_dict_to_list( KernelRow.status_history, - [KernelStatus.TERMINATED.name, now.isoformat()], + { + "status": KernelStatus.TERMINATED.name, + "timestamp": now.isoformat(), + }, ), } if kern_stat: @@ -2456,9 +2462,12 @@ async def _update() -> None: "kernel": {"exit_code": None}, "session": {"status": "terminating"}, }, - "status_history": sql_append_lists_to_list( + "status_history": sql_append_dict_to_list( KernelRow.status_history, - [KernelStatus.TERMINATING.name, now.isoformat()], + { + "status": KernelStatus.TERMINATING.name, + "timestamp": now.isoformat(), + }, ), } await db_sess.execute( @@ -2623,12 +2632,12 @@ async def _restarting_session() -> None: sa.update(SessionRow) .values( status=SessionStatus.RESTARTING, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( SessionRow.status_history, - [ - SessionStatus.RESTARTING.name, - datetime.now(tzutc()).isoformat(), - ], + { + "status": SessionStatus.RESTARTING.name, + "timestamp": datetime.now(tzutc()).isoformat(), + }, ), ) .where(SessionRow.id == session.id) @@ -2662,8 +2671,9 @@ async def
_restart_kernel(kernel: KernelRow) -> None: "stdin_port": kernel_info["stdin_port"], "stdout_port": kernel_info["stdout_port"], "service_ports": kernel_info.get("service_ports", []), - "status_history": sql_append_lists_to_list( - KernelRow.status_history, [KernelStatus.RUNNING.name, now.isoformat()] + "status_history": sql_append_dict_to_list( + KernelRow.status_history, + {"status": KernelStatus.RUNNING.name, "timestamp": now.isoformat()}, ), } await KernelRow.update_kernel( @@ -3223,8 +3233,9 @@ async def _update_kernel() -> tuple[AccessKey, AgentId] | None: ("kernel",), {"exit_code": exit_code}, ), - "status_history": sql_append_lists_to_list( - KernelRow.status_history, [KernelStatus.TERMINATED.name, now.isoformat()] + "status_history": sql_append_dict_to_list( + KernelRow.status_history, + {"status": KernelStatus.TERMINATED.name, "timestamp": now.isoformat()}, ), "terminated_at": now, } diff --git a/src/ai/backend/manager/scheduler/dispatcher.py b/src/ai/backend/manager/scheduler/dispatcher.py index c553352fcd..6c54527a01 100644 --- a/src/ai/backend/manager/scheduler/dispatcher.py +++ b/src/ai/backend/manager/scheduler/dispatcher.py @@ -94,7 +94,7 @@ from ..models.utils import ExtendedAsyncSAEngine as SAEngine from ..models.utils import ( execute_with_retry, - sql_append_lists_to_list, + sql_append_dict_to_list, sql_json_increment, sql_json_merge, ) @@ -376,12 +376,9 @@ async def _apply_cancellation( status=KernelStatus.CANCELLED, status_info=reason, terminated_at=now, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( KernelRow.status_history, - [ - KernelStatus.CANCELLED.name, - now.isoformat(), - ], + {"status": KernelStatus.CANCELLED.name, "timestamp": now.isoformat()}, ), ) .where(KernelRow.session_id.in_(session_ids)) @@ -393,12 +390,9 @@ async def _apply_cancellation( status=SessionStatus.CANCELLED, status_info=reason, terminated_at=now, - status_history=sql_append_lists_to_list( + 
status_history=sql_append_dict_to_list( SessionRow.status_history, - [ - SessionStatus.CANCELLED.name, - now.isoformat(), - ], + {"status": SessionStatus.CANCELLED.name, "timestamp": now.isoformat()}, ), ) .where(SessionRow.id.in_(session_ids)) @@ -967,12 +961,12 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, status_changed=now, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( KernelRow.status_history, - [ - KernelStatus.SCHEDULED.name, - now.isoformat(), - ], + { + "status": KernelStatus.SCHEDULED.name, + "timestamp": now.isoformat(), + }, ), ) .where(KernelRow.id == kernel.id) @@ -989,12 +983,9 @@ async def _finalize_scheduled() -> None: status=SessionStatus.SCHEDULED, status_info="scheduled", status_data={}, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( SessionRow.status_history, - [ - SessionStatus.SCHEDULED.name, - now.isoformat(), - ], + {"status": SessionStatus.SCHEDULED.name, "timestamp": now.isoformat()}, ), ) .where(SessionRow.id == sess_ctx.id) @@ -1204,12 +1195,12 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, status_changed=now, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( KernelRow.status_history, - [ - KernelStatus.SCHEDULED.name, - now.isoformat(), - ], + { + "status": KernelStatus.SCHEDULED.name, + "timestamp": now.isoformat(), + }, ), ) .where(KernelRow.id == binding.kernel.id) @@ -1227,12 +1218,9 @@ async def _finalize_scheduled() -> None: status_info="scheduled", status_data={}, # status_changed=now, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( SessionRow.status_history, - [ - SessionStatus.SCHEDULED.name, - now.isoformat(), - ], + {"status": SessionStatus.SCHEDULED.name, "timestamp": now.isoformat()}, ), ) .where(SessionRow.id == sess_ctx.id) @@ -1293,12 +1281,12 @@ async def _mark_session_preparing() ->
Sequence[SessionRow]: status_changed=now, status_info="", status_data={}, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( KernelRow.status_history, - [ - KernelStatus.PREPARING.name, - now.isoformat(), - ], + { + "status": KernelStatus.PREPARING.name, + "timestamp": now.isoformat(), + }, ), ) .where( @@ -1313,12 +1301,12 @@ async def _mark_session_preparing() -> Sequence[SessionRow]: # status_changed=now, status_info="", status_data={}, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( SessionRow.status_history, - [ - SessionStatus.PREPARING.name, - now.isoformat(), - ], + { + "status": SessionStatus.PREPARING.name, + "timestamp": now.isoformat(), + }, ), ) .where(SessionRow.status == SessionStatus.SCHEDULED) @@ -1617,12 +1605,12 @@ async def _mark_session_cancelled() -> None: status_info="failed-to-start", status_data=status_data, terminated_at=now, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( KernelRow.status_history, - [ - KernelStatus.CANCELLED.name, - now.isoformat(), - ], + { + "status": KernelStatus.CANCELLED.name, + "timestamp": now.isoformat(), + }, ), ) .where(KernelRow.session_id == session.id) @@ -1636,12 +1624,12 @@ async def _mark_session_cancelled() -> None: status_info="failed-to-start", status_data=status_data, terminated_at=now, - status_history=sql_append_lists_to_list( + status_history=sql_append_dict_to_list( SessionRow.status_history, - [ - SessionStatus.CANCELLED.name, - now.isoformat(), - ], + { + "status": SessionStatus.CANCELLED.name, + "timestamp": now.isoformat(), + }, ), ) .where(SessionRow.id == session.id) diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 6cff02ec43..acb0abc0f1 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -571,7 +571,9 @@ async def _fetch_hanging_sessions( query = ( sa.select(SessionRow) .where(SessionRow.status == status) -
.where(sa.text(""" + .where( + sa.text( + """ EXISTS ( SELECT 1 FROM jsonb_array_elements(status_history) AS session_history @@ -581,7 +583,9 @@ async def _fetch_hanging_sessions( now() - CAST(session_history->>1 AS TIMESTAMP WITH TIME ZONE) ) > :threshold ) - """).bindparams(status_name=status.name, threshold=threshold)) + """ + ).bindparams(status_name=status.name, threshold=threshold) + ) .options( noload("*"), load_only(SessionRow.id, SessionRow.name, SessionRow.status, SessionRow.access_key), diff --git a/tests/manager/models/test_utils.py b/tests/manager/models/test_utils.py index d461c9d063..4331c1baab 100644 --- a/tests/manager/models/test_utils.py +++ b/tests/manager/models/test_utils.py @@ -1,17 +1,14 @@ import uuid -from datetime import datetime from typing import Union import pytest import sqlalchemy import sqlalchemy as sa -from dateutil.tz import tzutc from ai.backend.manager.models import KernelRow, SessionRow, kernels from ai.backend.manager.models.utils import ( agg_to_array, agg_to_str, - sql_append_lists_to_list, ) @@ -24,59 +21,6 @@ async def _select_kernel_row( return kernel -@pytest.mark.asyncio -async def test_sql_json_merge__default(session_info): - session_id, conn = session_info - expected: list[list[str, str]] = [] - kernel = await _select_kernel_row(conn, session_id) - assert kernel is not None - assert kernel.status_history == expected - - -@pytest.mark.asyncio -async def test_sql_append_lists_to_list(session_info): - session_id, conn = session_info - timestamp = datetime.now(tzutc()).isoformat() - expected = [ - ["PENDING", timestamp], - ["PREPARING", timestamp], - ["TERMINATING", timestamp], - ["TERMINATED", timestamp], - ] - - query = ( - kernels.update() - .values( - { - "status_history": sql_append_lists_to_list( - kernels.c.status_history, - ["PENDING", timestamp], - ["PREPARING", timestamp], - ), - } - ) - .where(kernels.c.session_id == session_id) - ) - await conn.execute(query) - query = ( - kernels.update() - .values( - { - 
"status_history": sql_append_lists_to_list( - kernels.c.status_history, - ["TERMINATING", timestamp], - ["TERMINATED", timestamp], - ), - } - ) - .where(kernels.c.session_id == session_id) - ) - await conn.execute(query) - kernel = await _select_kernel_row(conn, session_id) - assert kernel is not None - assert kernel.status_history == expected - - @pytest.mark.asyncio async def test_agg_to_str(session_info): session_id, conn = session_info From 4cb68deb2083130b0204cf36d1e60bb83a4a809f Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 Nov 2023 03:10:25 +0000 Subject: [PATCH 19/45] Add FieldSpec for status_history --- src/ai/backend/client/output/fields.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ai/backend/client/output/fields.py b/src/ai/backend/client/output/fields.py index 4a1905fc39..f0d9c4fa1a 100644 --- a/src/ai/backend/client/output/fields.py +++ b/src/ai/backend/client/output/fields.py @@ -185,6 +185,8 @@ FieldSpec("created_user_id"), FieldSpec("status"), FieldSpec("status_info"), + FieldSpec("status_history"), + FieldSpec("status_history_log"), FieldSpec("status_data", formatter=nested_dict_formatter), FieldSpec("status_changed", "Last Updated"), FieldSpec("created_at"), From 5e44e851539c1cbd9da5d361e7a8557c022677ce Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 Nov 2023 05:08:54 +0000 Subject: [PATCH 20/45] Fix status_history command --- .../backend/client/cli/session/lifecycle.py | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/src/ai/backend/client/cli/session/lifecycle.py b/src/ai/backend/client/cli/session/lifecycle.py index 00e6e429cd..92cf192c6e 100644 --- a/src/ai/backend/client/cli/session/lifecycle.py +++ b/src/ai/backend/client/cli/session/lifecycle.py @@ -24,8 +24,9 @@ from ai.backend.cli.main import main from ai.backend.cli.params import CommaSeparatedListType, OptionalType -from ai.backend.cli.types import CliContextInfo, ExitCode, Undefined, undefined +from 
ai.backend.cli.types import ExitCode, Undefined, undefined from ai.backend.client.cli.extensions import pass_ctx_obj +from ai.backend.client.cli.types import CLIContext from ai.backend.common.arch import DEFAULT_IMAGE_ARCH from ai.backend.common.types import ClusterMode from ai.backend.common.utils import get_first_occurrence_time @@ -782,7 +783,7 @@ def logs(session_id, kernel: str | None): @session.command("status-history") @pass_ctx_obj @click.argument("session_id", metavar="SESSID") -def status_history(ctx: CliContextInfo, session_id): +def status_history(ctx: CLIContext, session_id): """ Shows the status transition history of the compute session. @@ -794,31 +795,31 @@ def status_history(ctx: CliContextInfo, session_id): kernel = session.ComputeSession(session_id) try: status_history = kernel.get_status_history().get("result") - print_info(f"status_history: {status_history}") + + prev_time = None + + for status_record in status_history: + timestamp = datetime.fromisoformat(status_record["timestamp"]) + + if prev_time: + time_diff = timestamp - prev_time + status_record["time_elapsed"] = str(time_diff) + + prev_time = timestamp + + ctx.output.print_list( + status_history, + [FieldSpec("status"), FieldSpec("timestamp"), FieldSpec("time_elapsed")], + ) + if (preparing := get_first_occurrence_time(status_history, "PREPARING")) is None: - result = { - "result": { - "seconds": 0, - "microseconds": 0, - }, - } + elapsed = timedelta() elif (terminated := get_first_occurrence_time(status_history, "TERMINATED")) is None: - alloc_time_until_now: timedelta = datetime.now(tzutc()) - isoparse(preparing) - result = { - "result": { - "seconds": alloc_time_until_now.seconds, - "microseconds": alloc_time_until_now.microseconds, - }, - } + elapsed = datetime.now(tzutc()) - isoparse(preparing) else: - alloc_time: timedelta = isoparse(terminated) - isoparse(preparing) - result = { - "result": { - "seconds": alloc_time.seconds, - "microseconds": alloc_time.microseconds, - }, - } - 
print_done(f"Actual Resource Allocation Time: {result}") + elapsed = isoparse(terminated) - isoparse(preparing) + + print_done(f"Actual Resource Allocation Time: {elapsed.total_seconds()}") except Exception as e: print_error(e) sys.exit(ExitCode.FAILURE) From 1839ef3d4bef9cb8e78fc89dd1bc67d0c21f2c90 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 6 Nov 2023 00:22:03 +0000 Subject: [PATCH 21/45] Update obsoleted comments --- src/ai/backend/manager/models/kernel.py | 6 +++--- src/ai/backend/manager/models/session.py | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index df85b17201..4124d087f1 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -548,9 +548,9 @@ class KernelRow(Base): # status_history records all status changes # e.g) # [ - # ["PENDING", "2022-10-22T10:22:30"], - # ["SCHEDULED", "2022-10-22T11:40:30"], - # ["PREPARING", "2022-10-25T10:22:30"] + # {"status": "PENDING", "timestamp": "2022-10-22T10:22:30"}, + # {"status": "SCHEDULED", "timestamp": "2022-10-22T11:40:30"}, + # {"status": "PREPARING", "timestamp": "2022-10-25T10:22:30"} # ] callback_url = (sa.Column("callback_url", URLColumn, nullable=True, default=sa.null()),) startup_command = (sa.Column("startup_command", sa.Text, nullable=True),) diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 3ae9c77652..205df53534 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -684,9 +684,9 @@ class SessionRow(Base): # status_history records all status changes # e.g) # [ - # ["PENDING", "2022-10-22T10:22:30"], - # ["SCHEDULED", "2022-10-22T11:40:30"], - # ["PREPARING", "2022-10-25T10:22:30"] + # {"status": "PENDING", "timestamp": "2022-10-22T10:22:30"}, + # {"status": "SCHEDULED", "timestamp": "2022-10-22T11:40:30"}, + # {"status": "PREPARING",
"timestamp": "2022-10-25T10:22:30"} # ] callback_url = sa.Column("callback_url", URLColumn, nullable=True, default=sa.null()) @@ -1357,7 +1357,6 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: "status_changed": row.status_changed, "status_info": row.status_info, "status_data": row.status_data, - # "status_history": ..., # filled by the legacy resolver "status_history_log": row.status_history, "created_at": row.created_at, "terminated_at": row.terminated_at, From 1c26e4b81329d44a6fdb82c835f92184be2fc7d2 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 6 Nov 2023 00:33:48 +0000 Subject: [PATCH 22/45] Update migration script --- ...replace_status_history_s_type_map_with_.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py b/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py index 16c8e63007..30074a4a8f 100644 --- a/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py +++ b/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py @@ -20,14 +20,14 @@ def upgrade(): """ WITH data AS ( SELECT id, - (jsonb_each(status_history)).key, - (jsonb_each(status_history)).value + (jsonb_each(status_history)).key AS status, + (jsonb_each(status_history)).value AS timestamp FROM kernels ) UPDATE kernels SET status_history = ( SELECT jsonb_agg( - jsonb_build_array(key, value) + jsonb_build_object('status', status, 'timestamp', timestamp) ) FROM data WHERE data.id = kernels.id @@ -39,14 +39,14 @@ def upgrade(): """ WITH data AS ( SELECT id, - (jsonb_each(status_history)).key, - (jsonb_each(status_history)).value + (jsonb_each(status_history)).key AS status, + (jsonb_each(status_history)).value AS timestamp FROM sessions ) UPDATE sessions SET status_history = ( SELECT 
jsonb_agg( - jsonb_build_array(key, value) + jsonb_build_object('status', status, 'timestamp', timestamp) ) FROM data WHERE data.id = sessions.id @@ -59,9 +59,10 @@ def downgrade(): op.execute( """ WITH data AS ( - SELECT id, jsonb_object_agg( - elem->>0, elem->>1 - ) AS new_status_history + SELECT id, + jsonb_object_agg( + elem->>'status', elem->>'timestamp' + ) AS new_status_history FROM kernels, jsonb_array_elements(status_history) AS elem GROUP BY id @@ -76,9 +77,10 @@ def downgrade(): op.execute( """ WITH data AS ( - SELECT id, jsonb_object_agg( - elem->>0, elem->>1 - ) AS new_status_history + SELECT id, + jsonb_object_agg( + elem->>'status', elem->>'timestamp' + ) AS new_status_history FROM sessions, jsonb_array_elements(status_history) AS elem GROUP BY id From 32dc1ccca883f8465a471e780c1bc92bf9b9dbe9 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 6 Nov 2023 01:05:58 +0000 Subject: [PATCH 23/45] Update _fetch_hanging_sessions SQL --- src/ai/backend/manager/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index acb0abc0f1..93c275c840 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -578,9 +578,9 @@ async def _fetch_hanging_sessions( SELECT 1 FROM jsonb_array_elements(status_history) AS session_history WHERE - session_history->>0 = :status_name AND + session_history->>'status' = :status_name AND ( - now() - CAST(session_history->>1 AS TIMESTAMP WITH TIME ZONE) + now() - CAST(session_history->>'timestamp' AS TIMESTAMP WITH TIME ZONE) ) > :threshold ) """ From ddfce3371068b3be6a96826e49f9eb67864cac1e Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 6 Nov 2023 01:51:38 +0000 Subject: [PATCH 24/45] Allow to search stale sessions --- src/ai/backend/manager/api/session.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ai/backend/manager/api/session.py b/src/ai/backend/manager/api/session.py index 
b68814c6b3..873edc1ec2 100644 --- a/src/ai/backend/manager/api/session.py +++ b/src/ai/backend/manager/api/session.py @@ -2208,6 +2208,7 @@ async def get_status_history(request: web.Request, params: Any) -> web.Response: db_sess, session_name, owner_access_key, + allow_stale=True, kernel_loading_strategy=KernelLoadingStrategy.MAIN_KERNEL_ONLY, ) resp["result"] = compute_session.status_history From 76cc98e52f4d384c8b34f273dfd11954d7ecb606 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 26 Jan 2024 11:22:09 +0000 Subject: [PATCH 25/45] Resolve alembic conflict --- ...0aebacd_replace_status_history_to_list.py} | 10 +++++----- src/ai/backend/manager/registry.py | 20 +++++++++++-------- 2 files changed, 17 insertions(+), 13 deletions(-) rename src/ai/backend/manager/models/alembic/versions/{37fb8b8e98e5_replace_status_history_s_type_map_with_.py => 8c8e90aebacd_replace_status_history_to_list.py} (93%) diff --git a/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py similarity index 93% rename from src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py rename to src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py index 30074a4a8f..5124657534 100644 --- a/src/ai/backend/manager/models/alembic/versions/37fb8b8e98e5_replace_status_history_s_type_map_with_.py +++ b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py @@ -1,16 +1,16 @@ """Replace status_history's type map with list -Revision ID: 37fb8b8e98e5 -Revises: 8c74e7df26f8 -Create Date: 2023-10-30 08:02:27.845105 +Revision ID: 8c8e90aebacd +Revises: 8b2ec7e3d22a +Create Date: 2024-01-26 11:19:23.075014 """ from alembic import op # revision identifiers, used by Alembic. 
-revision = "37fb8b8e98e5" -down_revision = "8c74e7df26f8" +revision = "8c8e90aebacd" +down_revision = "8b2ec7e3d22a" branch_labels = None depends_on = None diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 0b13cc0290..6ffc840fd3 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -1007,10 +1007,12 @@ async def enqueue_session( session_data = { "id": session_id, "status": SessionStatus.PENDING, - "status_history": { - "status": SessionStatus.PENDING.name, - "timestamp": datetime.now(tzutc()).isoformat(), - }, + "status_history": [ + { + "status": SessionStatus.PENDING.name, + "timestamp": datetime.now(tzutc()).isoformat(), + } + ], "creation_id": session_creation_id, "name": session_name, "session_type": session_type, @@ -1031,10 +1033,12 @@ async def enqueue_session( kernel_shared_data = { "status": KernelStatus.PENDING, - "status_history": { - "status": KernelStatus.PENDING.name, - "timestamp": datetime.now(tzutc()).isoformat(), - }, + "status_history": [ + { + "status": KernelStatus.PENDING.name, + "timestamp": datetime.now(tzutc()).isoformat(), + }, + ], "session_creation_id": session_creation_id, "session_id": session_id, "session_name": session_name, From b1520c9737ac45c7c44d873a706e2cd2c518cb61 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 02:31:57 +0000 Subject: [PATCH 26/45] chore: Merge with main --- src/ai/backend/manager/models/kernel.py | 2 +- src/ai/backend/manager/models/vfolder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 4124d087f1..b5e0b3f5f8 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -928,7 +928,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: hide_agents = False else: hide_agents = ctx.local_config["manager"]["hide-agents"] - status_history 
= row["status_history"] + status_history = row.status_history scheduled_at = get_first_occurrence_time(status_history, KernelStatus.SCHEDULED.name) return { diff --git a/src/ai/backend/manager/models/vfolder.py b/src/ai/backend/manager/models/vfolder.py index 8ca3fca982..3c5933544a 100644 --- a/src/ai/backend/manager/models/vfolder.py +++ b/src/ai/backend/manager/models/vfolder.py @@ -1470,7 +1470,7 @@ async def _update() -> None: vfolders.c.status_history, (), { - update_status.name: datetime.now(tzutc()).isoformat(), + update_status.name: now.isoformat(), }, ), ) From 10f36b2763c4f9d567e3c1a34d525b753ba647fc Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 02:34:34 +0000 Subject: [PATCH 27/45] chore: update GraphQL schema dump --- src/ai/backend/manager/api/schema.graphql | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ai/backend/manager/api/schema.graphql b/src/ai/backend/manager/api/schema.graphql index 76df1d941e..1258748946 100644 --- a/src/ai/backend/manager/api/schema.graphql +++ b/src/ai/backend/manager/api/schema.graphql @@ -584,6 +584,7 @@ type ComputeSession implements Item { status_info: String status_data: JSONString status_history: JSONString + status_history_log: JSONString created_at: DateTime terminated_at: DateTime starts_at: DateTime From 4b0758ccd9f500569c27baacbcef2b2629af0a07 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 02:39:58 +0000 Subject: [PATCH 28/45] fix: revert unrelated change --- src/ai/backend/manager/models/vfolder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ai/backend/manager/models/vfolder.py b/src/ai/backend/manager/models/vfolder.py index 3c5933544a..1b5f87b7a7 100644 --- a/src/ai/backend/manager/models/vfolder.py +++ b/src/ai/backend/manager/models/vfolder.py @@ -1466,6 +1466,7 @@ async def _update() -> None: sa.update(vfolders) .values( status=update_status, + status_changed=now, status_history=sql_json_merge( vfolders.c.status_history, (), From 
240ee6ce834144456bceef9ae84c1f37b060e406 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 02:40:06 +0000 Subject: [PATCH 29/45] docs: update migration script msg --- .../versions/8c8e90aebacd_replace_status_history_to_list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py index 5124657534..8f6b23d8fb 100644 --- a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py +++ b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py @@ -1,4 +1,4 @@ -"""Replace status_history's type map with list +"""Replace sessions, kernels's status_history's type map with list Revision ID: 8c8e90aebacd Revises: 8b2ec7e3d22a From d0dc961293b4ff9dd4bec1fcb0211f071e0722c8 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 02:49:38 +0000 Subject: [PATCH 30/45] chore: Add description field --- src/ai/backend/manager/models/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 205df53534..8c7aee85e0 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -1283,7 +1283,7 @@ class Meta: status_info = graphene.String() status_data = graphene.JSONString() status_history = graphene.JSONString() # legacy - status_history_log = graphene.JSONString() + status_history_log = graphene.JSONString(description="Added in 24.09.0") created_at = GQLDateTime() terminated_at = GQLDateTime() starts_at = GQLDateTime() From 855f91fa854f6e87469ec84ba23cd814f9dcd4dc Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 02:52:05 +0000 Subject: [PATCH 31/45] chore: update GraphQL schema dump --- src/ai/backend/manager/api/schema.graphql 
| 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ai/backend/manager/api/schema.graphql b/src/ai/backend/manager/api/schema.graphql index 1258748946..7923e1a7e7 100644 --- a/src/ai/backend/manager/api/schema.graphql +++ b/src/ai/backend/manager/api/schema.graphql @@ -584,6 +584,8 @@ type ComputeSession implements Item { status_info: String status_data: JSONString status_history: JSONString + + """Added in 24.09.0""" status_history_log: JSONString created_at: DateTime terminated_at: DateTime From 2bb11f3e1ac86be8e188bfc7fe73659ae16b2fd8 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 02:56:40 +0000 Subject: [PATCH 32/45] fix: alembic migration script down_revision --- .../versions/8c8e90aebacd_replace_status_history_to_list.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py index 8f6b23d8fb..1e461c611e 100644 --- a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py +++ b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py @@ -1,7 +1,7 @@ """Replace sessions, kernels's status_history's type map with list Revision ID: 8c8e90aebacd -Revises: 8b2ec7e3d22a +Revises: dddf9be580f5 Create Date: 2024-01-26 11:19:23.075014 """ @@ -10,7 +10,7 @@ # revision identifiers, used by Alembic. 
revision = "8c8e90aebacd" -down_revision = "8b2ec7e3d22a" +down_revision = "dddf9be580f5" branch_labels = None depends_on = None From 4080505af34e8618a60508f5ab688d840a9db0b7 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 04:29:39 +0000 Subject: [PATCH 33/45] fix: Wrong handling of `get_first_occurrence_time` --- src/ai/backend/manager/models/kernel.py | 2 +- src/ai/backend/manager/models/session.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index b5e0b3f5f8..56688151cf 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -956,7 +956,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: "created_at": row.created_at, "terminated_at": row.terminated_at, "starts_at": row.starts_at, - "scheduled_at": scheduled_at[1] if scheduled_at else None, + "scheduled_at": scheduled_at or None, "occupied_slots": row.occupied_slots.to_json(), # resources "agent": row.agent if not hide_agents else None, diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 8c7aee85e0..af3fd38311 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -734,8 +734,8 @@ def main_kernel(self) -> KernelRow: @property def status_changed(self) -> Optional[datetime]: - if first := get_first_occurrence_time(self.status_history, self.status.name): - return datetime.fromisoformat(first) + if scheduled_at := get_first_occurrence_time(self.status_history, self.status.name): + return datetime.fromisoformat(scheduled_at) return None @property From cc7d8969014f87439526dd349a8eb826c185b79b Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Fri, 3 May 2024 04:41:02 +0000 Subject: [PATCH 34/45] chore: Rename `get_first_occurrence_time` -> `get_first_timestamp_for_status` --- src/ai/backend/client/cli/session/lifecycle.py 
| 8 +++++--- src/ai/backend/common/utils.py | 2 +- src/ai/backend/manager/models/kernel.py | 4 ++-- src/ai/backend/manager/models/session.py | 8 +++++--- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/ai/backend/client/cli/session/lifecycle.py b/src/ai/backend/client/cli/session/lifecycle.py index 92cf192c6e..d7d0314aa0 100644 --- a/src/ai/backend/client/cli/session/lifecycle.py +++ b/src/ai/backend/client/cli/session/lifecycle.py @@ -29,7 +29,7 @@ from ai.backend.client.cli.types import CLIContext from ai.backend.common.arch import DEFAULT_IMAGE_ARCH from ai.backend.common.types import ClusterMode -from ai.backend.common.utils import get_first_occurrence_time +from ai.backend.common.utils import get_first_timestamp_for_status from ...compat import asyncio_run from ...exceptions import BackendAPIError @@ -812,9 +812,11 @@ def status_history(ctx: CLIContext, session_id): [FieldSpec("status"), FieldSpec("timestamp"), FieldSpec("time_elapsed")], ) - if (preparing := get_first_occurrence_time(status_history, "PREPARING")) is None: + if (preparing := get_first_timestamp_for_status(status_history, "PREPARING")) is None: elapsed = timedelta() - elif (terminated := get_first_occurrence_time(status_history, "TERMINATED")) is None: + elif ( + terminated := get_first_timestamp_for_status(status_history, "TERMINATED") + ) is None: elapsed = datetime.now(tzutc()) - isoparse(preparing) else: elapsed = isoparse(terminated) - isoparse(preparing) diff --git a/src/ai/backend/common/utils.py b/src/ai/backend/common/utils.py index 863400b952..84dd95fe69 100644 --- a/src/ai/backend/common/utils.py +++ b/src/ai/backend/common/utils.py @@ -406,7 +406,7 @@ async def umount( return True -def get_first_occurrence_time( +def get_first_timestamp_for_status( status_history_records: list[dict[str, str]], status: str ) -> str | None: """ diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 56688151cf..fb10b2df62 100644 --- 
a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -46,7 +46,7 @@ SessionTypes, VFolderMount, ) -from ai.backend.common.utils import get_first_occurrence_time +from ai.backend.common.utils import get_first_timestamp_for_status from ..api.exceptions import ( BackendError, @@ -929,7 +929,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: else: hide_agents = ctx.local_config["manager"]["hide-agents"] status_history = row.status_history - scheduled_at = get_first_occurrence_time(status_history, KernelStatus.SCHEDULED.name) + scheduled_at = get_first_timestamp_for_status(status_history, KernelStatus.SCHEDULED.name) return { # identity diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index af3fd38311..4122c5bce0 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -38,7 +38,7 @@ SessionTypes, VFolderMount, ) -from ai.backend.common.utils import get_first_occurrence_time +from ai.backend.common.utils import get_first_timestamp_for_status from ..api.exceptions import ( AgentError, @@ -734,7 +734,7 @@ def main_kernel(self) -> KernelRow: @property def status_changed(self) -> Optional[datetime]: - if scheduled_at := get_first_occurrence_time(self.status_history, self.status.name): + if scheduled_at := get_first_timestamp_for_status(self.status_history, self.status.name): return datetime.fromisoformat(scheduled_at) return None @@ -1324,7 +1324,9 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: full_name = getattr(row, "full_name") group_name = getattr(row, "group_name") row = row.SessionRow - scheduled_at = get_first_occurrence_time(row.status_history, SessionStatus.SCHEDULED.name) + scheduled_at = get_first_timestamp_for_status( + row.status_history, SessionStatus.SCHEDULED.name + ) return { # identity From cf5d7d5c166fc8bc8f51d6251cadf506a2524792 Mon Sep 17 00:00:00 2001 
From: Gyubong Lee Date: Mon, 6 May 2024 22:48:36 +0000 Subject: [PATCH 35/45] docs: Update fragment --- changes/1662.fix.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/1662.fix.md b/changes/1662.fix.md index dc32fcc559..8333b460be 100644 --- a/changes/1662.fix.md +++ b/changes/1662.fix.md @@ -1 +1 @@ -Replace `status_history`'s type `map` with `list` \ No newline at end of file +Change the type of `status_history` from a mapping of status and timestamps to a list of log entries containing status and timestamps, to preserve timestamps when revisiting session/kernel statuses (e.g., after session restarts). \ No newline at end of file From f25bf617cf7079b43f2c99c402cf2c9eef8a13dd Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 6 May 2024 22:49:40 +0000 Subject: [PATCH 36/45] docs: Add `deprecation_reason` to `status_history` in gql --- src/ai/backend/manager/models/session.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 4122c5bce0..43b8e74d52 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -1282,7 +1282,9 @@ class Meta: status_changed = GQLDateTime() status_info = graphene.String() status_data = graphene.JSONString() - status_history = graphene.JSONString() # legacy + status_history = graphene.JSONString( + deprecation_reason="Deprecated since 24.09.0; use `status_history_log`" + ) status_history_log = graphene.JSONString(description="Added in 24.09.0") created_at = GQLDateTime() terminated_at = GQLDateTime() From 62237a3d02b4957edd395e7efb81dfafdcfa97bc Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 6 May 2024 22:52:25 +0000 Subject: [PATCH 37/45] chore: update GraphQL schema dump --- src/ai/backend/manager/api/schema.graphql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ai/backend/manager/api/schema.graphql 
b/src/ai/backend/manager/api/schema.graphql index 7923e1a7e7..0e875c44a8 100644 --- a/src/ai/backend/manager/api/schema.graphql +++ b/src/ai/backend/manager/api/schema.graphql @@ -583,7 +583,7 @@ type ComputeSession implements Item { status_changed: DateTime status_info: String status_data: JSONString - status_history: JSONString + status_history: JSONString @deprecated(reason: "Deprecated since 24.09.0; use `status_history_log`") """Added in 24.09.0""" status_history_log: JSONString From d89a64fac95c8226bf600556ddda94791fc5dcb0 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Tue, 7 May 2024 04:32:25 +0000 Subject: [PATCH 38/45] fix: Mishandling of `scheduled_at` in gql --- src/ai/backend/manager/models/kernel.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index fb10b2df62..4734a15a1d 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -76,7 +76,6 @@ ) from .group import groups from .image import ImageNode, ImageRow -from .minilang import JSONFieldItem from .minilang.ordering import ColumnMapType, QueryOrderParser from .minilang.queryfilter import FieldSpecType, QueryFilterParser, enum_field_getter from .user import users @@ -956,7 +955,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: "created_at": row.created_at, "terminated_at": row.terminated_at, "starts_at": row.starts_at, - "scheduled_at": scheduled_at or None, + "scheduled_at": scheduled_at, "occupied_slots": row.occupied_slots.to_json(), # resources "agent": row.agent if not hide_agents else None, @@ -1010,7 +1009,7 @@ async def resolve_abusing_report( "created_at": ("created_at", dtparse), "status_changed": ("status_changed", dtparse), "terminated_at": ("terminated_at", dtparse), - "scheduled_at": (JSONFieldItem("status_history", KernelStatus.SCHEDULED.name), dtparse), + "scheduled_at": ("scheduled_at", None), } 
_queryorder_colmap: ColumnMapType = { @@ -1027,7 +1026,7 @@ async def resolve_abusing_report( "status_changed": ("status_info", None), "created_at": ("created_at", None), "terminated_at": ("terminated_at", None), - "scheduled_at": (JSONFieldItem("status_history", KernelStatus.SCHEDULED.name), None), + "scheduled_at": ("scheduled_at", None), } @classmethod From 6d4bc09d49346cdbd023555bdda085aca0181d97 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Tue, 7 May 2024 04:33:11 +0000 Subject: [PATCH 39/45] fix: Change `get_first_timestamp_for_status` return type to datetime --- src/ai/backend/client/cli/session/lifecycle.py | 5 ++--- src/ai/backend/common/utils.py | 7 ++++--- .../backend/manager/models/resource_usage.py | 6 ++++-- src/ai/backend/manager/models/session.py | 18 +++++------------- 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/src/ai/backend/client/cli/session/lifecycle.py b/src/ai/backend/client/cli/session/lifecycle.py index d7d0314aa0..a7a053f4bd 100644 --- a/src/ai/backend/client/cli/session/lifecycle.py +++ b/src/ai/backend/client/cli/session/lifecycle.py @@ -16,7 +16,6 @@ import inquirer import treelib from async_timeout import timeout -from dateutil.parser import isoparse from dateutil.tz import tzutc from faker import Faker from humanize import naturalsize @@ -817,9 +816,9 @@ def status_history(ctx: CLIContext, session_id): elif ( terminated := get_first_timestamp_for_status(status_history, "TERMINATED") ) is None: - elapsed = datetime.now(tzutc()) - isoparse(preparing) + elapsed = datetime.now(tzutc()) - preparing else: - elapsed = isoparse(terminated) - isoparse(preparing) + elapsed = terminated - preparing print_done(f"Actual Resource Allocation Time: {elapsed.total_seconds()}") except Exception as e: diff --git a/src/ai/backend/common/utils.py b/src/ai/backend/common/utils.py index 84dd95fe69..acadeb48d7 100644 --- a/src/ai/backend/common/utils.py +++ b/src/ai/backend/common/utils.py @@ -8,7 +8,7 @@ import sys import uuid from 
collections import OrderedDict -from datetime import timedelta +from datetime import datetime, timedelta from itertools import chain from pathlib import Path from typing import ( @@ -25,6 +25,7 @@ import aiofiles from async_timeout import timeout +from dateutil.parser import parse as dtparse if TYPE_CHECKING: from decimal import Decimal @@ -408,12 +409,12 @@ async def umount( def get_first_timestamp_for_status( status_history_records: list[dict[str, str]], status: str -) -> str | None: +) -> datetime | None: """ Get the first occurrence time of the given status from the status history records. """ for status_history in status_history_records: if status_history["status"] == status: - return status_history["timestamp"] + return dtparse(status_history["timestamp"]) return None diff --git a/src/ai/backend/manager/models/resource_usage.py b/src/ai/backend/manager/models/resource_usage.py index d8997867d7..76b33016fe 100644 --- a/src/ai/backend/manager/models/resource_usage.py +++ b/src/ai/backend/manager/models/resource_usage.py @@ -15,7 +15,7 @@ from ai.backend.common import redis_helper from ai.backend.common.types import RedisConnectionInfo -from ai.backend.common.utils import nmget +from ai.backend.common.utils import get_first_timestamp_for_status, nmget from .group import GroupRow from .kernel import LIVE_STATUS, RESOURCE_USAGE_KERNEL_STATUSES, KernelRow, KernelStatus @@ -516,7 +516,9 @@ async def _pipe_builder(r: Redis) -> RedisPipeline: session_row=kern.session, created_at=kern.created_at, terminated_at=kern.terminated_at, - scheduled_at=kern.status_history.get(KernelStatus.SCHEDULED.name), + scheduled_at=str( + get_first_timestamp_for_status(kern.status_history, KernelStatus.SCHEDULED.name) + ), used_time=kern.used_time, used_days=kern.get_used_days(local_tz), last_stat=stat_map[kern.id], diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 43b8e74d52..53ba67a665 100644 --- 
a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -71,7 +71,7 @@ ) from .group import GroupRow from .kernel import ComputeContainer, KernelRow, KernelStatus -from .minilang import ArrayFieldItem, JSONFieldItem +from .minilang import ArrayFieldItem from .minilang.ordering import ColumnMapType, QueryOrderParser from .minilang.queryfilter import FieldSpecType, QueryFilterParser, enum_field_getter from .user import UserRow @@ -733,10 +733,8 @@ def main_kernel(self) -> KernelRow: return kerns[0] @property - def status_changed(self) -> Optional[datetime]: - if scheduled_at := get_first_timestamp_for_status(self.status_history, self.status.name): - return datetime.fromisoformat(scheduled_at) - return None + def status_changed(self) -> datetime | None: + return get_first_timestamp_for_status(self.status_history, self.status.name) @property def resource_opts(self) -> dict[str, Any]: @@ -1472,10 +1470,7 @@ async def resolve_status_history(self, _info: graphene.ResolveInfo) -> Mapping[s "created_at": ("sessions_created_at", dtparse), "terminated_at": ("sessions_terminated_at", dtparse), "starts_at": ("sessions_starts_at", dtparse), - "scheduled_at": ( - JSONFieldItem("sessions_status_history", SessionStatus.SCHEDULED.name), - dtparse, - ), + "scheduled_at": ("scheduled_at", None), "startup_command": ("sessions_startup_command", None), } @@ -1503,10 +1498,7 @@ async def resolve_status_history(self, _info: graphene.ResolveInfo) -> Mapping[s "created_at": ("sessions_created_at", None), "terminated_at": ("sessions_terminated_at", None), "starts_at": ("sessions_starts_at", None), - "scheduled_at": ( - JSONFieldItem("sessions_status_history", SessionStatus.SCHEDULED.name), - None, - ), + "scheduled_at": ("scheduled_at", None), } @classmethod From 3af5ec95f1f494316aa34dbaac623b95fb1d94b9 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Thu, 9 May 2024 04:39:00 +0000 Subject: [PATCH 40/45] fix: perform missing alter column in migration 
script --- .../8c8e90aebacd_replace_status_history_to_list.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py index 1e461c611e..62475cfb2f 100644 --- a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py +++ b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py @@ -34,6 +34,8 @@ def upgrade(): ); """ ) + op.execute("UPDATE kernels SET status_history = '[]'::jsonb WHERE status_history IS NULL;") + op.alter_column("kernels", "status_history", nullable=False, default=[]) op.execute( """ @@ -53,6 +55,8 @@ def upgrade(): ); """ ) + op.execute("UPDATE sessions SET status_history = '[]'::jsonb WHERE status_history IS NULL;") + op.alter_column("sessions", "status_history", nullable=False, default=[]) def downgrade(): @@ -73,6 +77,8 @@ def downgrade(): WHERE data.id = kernels.id; """ ) + op.alter_column("kernels", "status_history", nullable=True, default=None) + op.execute("UPDATE kernels SET status_history = NULL WHERE status_history = '[]'::jsonb;") op.execute( """ @@ -91,3 +97,5 @@ def downgrade(): WHERE data.id = sessions.id; """ ) + op.alter_column("sessions", "status_history", nullable=True, default=None) + op.execute("UPDATE sessions SET status_history = NULL WHERE status_history = '[]'::jsonb;") From 895fca81fbea782aea39d35bbbd8821392b6669a Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Mon, 15 Jul 2024 18:36:59 +0900 Subject: [PATCH 41/45] refactor: Move get_first_timestamp_for_status() from common.utils to manager.models.utils - This eliminates the necessity to use stringified enum arguments in the `ai.backend.manager.models` codes. - In the client SDK, add a copy of it using relaxed str-only arguments. 
Since this is a fairly simple logic, I think it is not worth to introduce additional complexity to share and reuse the code between the client SDK and the manager. (Note that originally `ai.backend.common` was not the dependency of the client SDK...) - I think it would be better to introduce a JSON-fied TypedDict of each status history records. - Also fix several merge errors. --- .../backend/client/cli/session/lifecycle.py | 2 +- src/ai/backend/client/utils.py | 14 +++++++++++++ src/ai/backend/common/utils.py | 16 +-------------- src/ai/backend/manager/models/kernel.py | 7 ++++--- .../backend/manager/models/resource_usage.py | 14 ++++++++----- src/ai/backend/manager/models/session.py | 7 +++---- src/ai/backend/manager/models/utils.py | 20 +++++++++++++++++++ src/ai/backend/manager/utils.py | 2 ++ 8 files changed, 54 insertions(+), 28 deletions(-) diff --git a/src/ai/backend/client/cli/session/lifecycle.py b/src/ai/backend/client/cli/session/lifecycle.py index a7a053f4bd..9a6666c925 100644 --- a/src/ai/backend/client/cli/session/lifecycle.py +++ b/src/ai/backend/client/cli/session/lifecycle.py @@ -28,7 +28,6 @@ from ai.backend.client.cli.types import CLIContext from ai.backend.common.arch import DEFAULT_IMAGE_ARCH from ai.backend.common.types import ClusterMode -from ai.backend.common.utils import get_first_timestamp_for_status from ...compat import asyncio_run from ...exceptions import BackendAPIError @@ -36,6 +35,7 @@ from ...output.fields import session_fields from ...output.types import FieldSpec from ...session import AsyncSession, Session +from ...utils import get_first_timestamp_for_status from .. 
import events from ..pretty import ( ProgressViewer, diff --git a/src/ai/backend/client/utils.py b/src/ai/backend/client/utils.py index b95fc5c9b8..6561eacedf 100644 --- a/src/ai/backend/client/utils.py +++ b/src/ai/backend/client/utils.py @@ -1,6 +1,10 @@ +from __future__ import annotations + import io import os +from datetime import datetime +from dateutil.parser import parse as dtparse from tqdm import tqdm @@ -48,3 +52,13 @@ def readinto1(self, *args, **kwargs): count = super().readinto1(*args, **kwargs) self.tqdm.set_postfix(file=self._filename, refresh=False) self.tqdm.update(count) + + +def get_first_timestamp_for_status( + status_history: list[dict[str, str]], + status: str, +) -> datetime | None: + for rec in status_history: + if rec["status"] == status: + return dtparse(rec["timestamp"]) + return None diff --git a/src/ai/backend/common/utils.py b/src/ai/backend/common/utils.py index acadeb48d7..34b74cc684 100644 --- a/src/ai/backend/common/utils.py +++ b/src/ai/backend/common/utils.py @@ -8,7 +8,7 @@ import sys import uuid from collections import OrderedDict -from datetime import datetime, timedelta +from datetime import timedelta from itertools import chain from pathlib import Path from typing import ( @@ -25,7 +25,6 @@ import aiofiles from async_timeout import timeout -from dateutil.parser import parse as dtparse if TYPE_CHECKING: from decimal import Decimal @@ -405,16 +404,3 @@ async def umount( fstab = Fstab(fp) await fstab.remove_by_mountpoint(str(mountpoint)) return True - - -def get_first_timestamp_for_status( - status_history_records: list[dict[str, str]], status: str -) -> datetime | None: - """ - Get the first occurrence time of the given status from the status history records. 
- """ - - for status_history in status_history_records: - if status_history["status"] == status: - return dtparse(status_history["timestamp"]) - return None diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 4734a15a1d..2e009127cb 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -46,7 +46,6 @@ SessionTypes, VFolderMount, ) -from ai.backend.common.utils import get_first_timestamp_for_status from ..api.exceptions import ( BackendError, @@ -83,7 +82,9 @@ ExtendedAsyncSAEngine, JSONCoalesceExpr, execute_with_retry, + get_first_timestamp_for_status, sql_append_dict_to_list, + sql_json_merge, ) if TYPE_CHECKING: @@ -927,8 +928,8 @@ def parse_row(cls, ctx: GraphQueryContext, row: KernelRow) -> Mapping[str, Any]: hide_agents = False else: hide_agents = ctx.local_config["manager"]["hide-agents"] - status_history = row.status_history - scheduled_at = get_first_timestamp_for_status(status_history, KernelStatus.SCHEDULED.name) + status_history = cast(list[dict[str, str]], row.status_history) + scheduled_at = get_first_timestamp_for_status(status_history, KernelStatus.SCHEDULED) return { # identity diff --git a/src/ai/backend/manager/models/resource_usage.py b/src/ai/backend/manager/models/resource_usage.py index 76b33016fe..0916cadf3c 100644 --- a/src/ai/backend/manager/models/resource_usage.py +++ b/src/ai/backend/manager/models/resource_usage.py @@ -1,8 +1,9 @@ from __future__ import annotations +import json from datetime import datetime from enum import Enum -from typing import Any, Mapping, Optional, Sequence +from typing import Any, Mapping, Optional, Sequence, cast from uuid import UUID import attrs @@ -14,14 +15,15 @@ from sqlalchemy.orm import joinedload, load_only from ai.backend.common import redis_helper +from ai.backend.common.json import ExtendedJSONEncoder from ai.backend.common.types import RedisConnectionInfo -from ai.backend.common.utils import 
get_first_timestamp_for_status, nmget +from ai.backend.common.utils import nmget from .group import GroupRow from .kernel import LIVE_STATUS, RESOURCE_USAGE_KERNEL_STATUSES, KernelRow, KernelStatus from .session import SessionRow from .user import UserRow -from .utils import ExtendedAsyncSAEngine +from .utils import ExtendedAsyncSAEngine, get_first_timestamp_for_status __all__: Sequence[str] = ( "ResourceGroupUnit", @@ -517,7 +519,9 @@ async def _pipe_builder(r: Redis) -> RedisPipeline: created_at=kern.created_at, terminated_at=kern.terminated_at, scheduled_at=str( - get_first_timestamp_for_status(kern.status_history, KernelStatus.SCHEDULED.name) + get_first_timestamp_for_status( + cast(list[dict[str, str]], kern.status_history), KernelStatus.SCHEDULED + ) ), used_time=kern.used_time, used_days=kern.get_used_days(local_tz), @@ -536,7 +540,7 @@ async def _pipe_builder(r: Redis) -> RedisPipeline: images={kern.image}, agents={kern.agent}, status=kern.status.name, - status_history=kern.status_history, + status_history=json.dumps(kern.status_history, cls=ExtendedJSONEncoder), cluster_mode=kern.cluster_mode, status_info=kern.status_info, group_unit=ResourceGroupUnit.KERNEL, diff --git a/src/ai/backend/manager/models/session.py b/src/ai/backend/manager/models/session.py index 53ba67a665..8cbdd06afb 100644 --- a/src/ai/backend/manager/models/session.py +++ b/src/ai/backend/manager/models/session.py @@ -38,7 +38,6 @@ SessionTypes, VFolderMount, ) -from ai.backend.common.utils import get_first_timestamp_for_status from ..api.exceptions import ( AgentError, @@ -80,7 +79,9 @@ JSONCoalesceExpr, agg_to_array, execute_with_retry, + get_first_timestamp_for_status, sql_append_dict_to_list, + sql_json_merge, ) if TYPE_CHECKING: @@ -1324,9 +1325,7 @@ def parse_row(cls, ctx: GraphQueryContext, row: Row) -> Mapping[str, Any]: full_name = getattr(row, "full_name") group_name = getattr(row, "group_name") row = row.SessionRow - scheduled_at = get_first_timestamp_for_status( - 
row.status_history, SessionStatus.SCHEDULED.name - ) + scheduled_at = get_first_timestamp_for_status(row.status_history, SessionStatus.SCHEDULED) return { # identity diff --git a/src/ai/backend/manager/models/utils.py b/src/ai/backend/manager/models/utils.py index 7093ac876c..7756576263 100644 --- a/src/ai/backend/manager/models/utils.py +++ b/src/ai/backend/manager/models/utils.py @@ -6,6 +6,7 @@ import logging from contextlib import AbstractAsyncContextManager as AbstractAsyncCtxMgr from contextlib import asynccontextmanager as actxmgr +from datetime import datetime from typing import ( TYPE_CHECKING, Any, @@ -23,6 +24,7 @@ from urllib.parse import quote_plus as urlquote import sqlalchemy as sa +from dateutil.parser import parse as dtparse from sqlalchemy.dialects import postgresql as psql from sqlalchemy.engine import create_engine as _create_engine from sqlalchemy.exc import DBAPIError @@ -44,6 +46,10 @@ if TYPE_CHECKING: from ..config import LocalConfig + from . import ( + KernelStatus, + SessionStatus, + ) from ..defs import LockID from ..types import Sentinel @@ -536,3 +542,17 @@ async def vacuum_db( vacuum_sql = "VACUUM FULL" if vacuum_full else "VACUUM" log.info(f"Perfoming {vacuum_sql} operation...") await conn.exec_driver_sql(vacuum_sql) + + +def get_first_timestamp_for_status( + status_history_records: list[dict[str, str]], + status: KernelStatus | SessionStatus, +) -> datetime | None: + """ + Get the first occurrence time of the given status from the status history records. 
+ """ + + for status_history in status_history_records: + if status_history["status"] == status.name: + return dtparse(status_history["timestamp"]) + return None diff --git a/src/ai/backend/manager/utils.py b/src/ai/backend/manager/utils.py index 091b9a767c..8a4c070a9b 100644 --- a/src/ai/backend/manager/utils.py +++ b/src/ai/backend/manager/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Optional from uuid import UUID From bf0f5a417a0c70bebc591ba2c838469e6c1ec5e7 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Mon, 15 Jul 2024 19:13:53 +0900 Subject: [PATCH 42/45] fix: Reconcile the alembic migration history --- .../versions/8c8e90aebacd_replace_status_history_to_list.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py index 62475cfb2f..3d211e05f1 100644 --- a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py +++ b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py @@ -1,7 +1,7 @@ """Replace sessions, kernels's status_history's type map with list Revision ID: 8c8e90aebacd -Revises: dddf9be580f5 +Revises: 59a622c31820 Create Date: 2024-01-26 11:19:23.075014 """ @@ -10,7 +10,7 @@ # revision identifiers, used by Alembic. 
revision = "8c8e90aebacd" -down_revision = "dddf9be580f5" +down_revision = "59a622c31820" branch_labels = None depends_on = None From 4f63ea79f6cfe6188361daf6edcfe3c0f8425931 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Thu, 18 Jul 2024 16:45:43 +0900 Subject: [PATCH 43/45] fix: Let migrations convert columns with the other type only --- ...c8e90aebacd_replace_status_history_to_list.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py index 3d211e05f1..64b25e9781 100644 --- a/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py +++ b/src/ai/backend/manager/models/alembic/versions/8c8e90aebacd_replace_status_history_to_list.py @@ -23,6 +23,7 @@ def upgrade(): (jsonb_each(status_history)).key AS status, (jsonb_each(status_history)).value AS timestamp FROM kernels + WHERE jsonb_typeof(status_history) = 'object' ) UPDATE kernels SET status_history = ( @@ -31,6 +32,7 @@ def upgrade(): ) FROM data WHERE data.id = kernels.id + AND jsonb_typeof(kernels.status_history) = 'object' ); """ ) @@ -44,6 +46,7 @@ def upgrade(): (jsonb_each(status_history)).key AS status, (jsonb_each(status_history)).value AS timestamp FROM sessions + WHERE jsonb_typeof(status_history) = 'object' ) UPDATE sessions SET status_history = ( @@ -52,6 +55,7 @@ def upgrade(): ) FROM data WHERE data.id = sessions.id + AND jsonb_typeof(sessions.status_history) = 'object' ); """ ) @@ -68,13 +72,15 @@ def downgrade(): elem->>'status', elem->>'timestamp' ) AS new_status_history FROM kernels, - jsonb_array_elements(status_history) AS elem + jsonb_array_elements(status_history) AS elem + WHERE jsonb_typeof(status_history) = 'array' GROUP BY id ) UPDATE kernels SET status_history = data.new_status_history FROM data - WHERE data.id = kernels.id; + 
WHERE data.id = kernels.id + AND jsonb_typeof(kernels.status_history) = 'array'; """ ) op.alter_column("kernels", "status_history", nullable=True, default=None) @@ -88,13 +94,15 @@ def downgrade(): elem->>'status', elem->>'timestamp' ) AS new_status_history FROM sessions, - jsonb_array_elements(status_history) AS elem + jsonb_array_elements(status_history) AS elem + WHERE jsonb_typeof(status_history) = 'array' GROUP BY id ) UPDATE sessions SET status_history = data.new_status_history FROM data - WHERE data.id = sessions.id; + WHERE data.id = sessions.id + AND jsonb_typeof(sessions.status_history) = 'array'; """ ) op.alter_column("sessions", "status_history", nullable=True, default=None) From 3bb3d3177d43437d88309a49758c3ea0688ff0cc Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Thu, 18 Jul 2024 08:19:22 +0000 Subject: [PATCH 44/45] fix: Change data types that were mistakenly converted to tuples --- src/ai/backend/manager/models/kernel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ai/backend/manager/models/kernel.py b/src/ai/backend/manager/models/kernel.py index 2e009127cb..a31b06458c 100644 --- a/src/ai/backend/manager/models/kernel.py +++ b/src/ai/backend/manager/models/kernel.py @@ -544,7 +544,7 @@ class KernelRow(Base): # // used to prevent duplication of SessionTerminatedEvent # } # } - status_history = (sa.Column("status_history", pgsql.JSONB(), nullable=False, default=[]),) + status_history = sa.Column("status_history", pgsql.JSONB(), nullable=False, default=[]) # status_history records all status changes # e.g) # [ @@ -552,8 +552,8 @@ class KernelRow(Base): # {"status: "SCHEDULED", "timestamp": "2022-10-22T11:40:30"}, # {"status: "PREPARING", "timestamp": "2022-10-25T10:22:30"} # ] - callback_url = (sa.Column("callback_url", URLColumn, nullable=True, default=sa.null()),) - startup_command = (sa.Column("startup_command", sa.Text, nullable=True),) + callback_url = sa.Column("callback_url", URLColumn, nullable=True, 
default=sa.null()) + startup_command = sa.Column("startup_command", sa.Text, nullable=True) result = sa.Column( "result", EnumType(SessionResult), From 5eeb502be611f1c6d9f00ed84338f4cd772f06e4 Mon Sep 17 00:00:00 2001 From: Joongi Kim Date: Tue, 23 Jul 2024 01:36:30 +0900 Subject: [PATCH 45/45] fix: Update the codes added in #2275 --- src/ai/backend/manager/registry.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ai/backend/manager/registry.py b/src/ai/backend/manager/registry.py index 6ffc840fd3..9062305038 100644 --- a/src/ai/backend/manager/registry.py +++ b/src/ai/backend/manager/registry.py @@ -2225,21 +2225,21 @@ async def _destroy(db_session: AsyncSession) -> SessionRow: kern.status = kernel_target_status kern.terminated_at = current_time kern.status_info = destroy_reason - kern.status_history = sql_json_merge( + kern.status_history = sql_append_dict_to_list( KernelRow.status_history, - (), { - kernel_target_status.name: current_time.isoformat(), + "status": kernel_target_status.name, + "timestamp": now.isoformat(), }, ) session_row.status = target_status session_row.terminated_at = current_time session_row.status_info = destroy_reason - session_row.status_history = sql_json_merge( + session_row.status_history = sql_append_dict_to_list( SessionRow.status_history, - (), { - target_status.name: current_time.isoformat(), + "status": target_status.name, + "timestamp": current_time.isoformat(), }, ) return session_row @@ -3239,7 +3239,10 @@ async def _update_kernel() -> tuple[AccessKey, AgentId] | None: ), "status_history": sql_append_dict_to_list( KernelRow.status_history, - {"status": KernelStatus.TERMINATED.name, "timestamp": now.isoformat()}, + { + "status": KernelStatus.TERMINATED.name, + "timestamp": now.isoformat(), + }, ), "terminated_at": now, }