From c2ca807745608dfdfecf97d42ace87a09f017242 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:53:36 +0000 Subject: [PATCH 01/16] Merge --- changes/1506.feature.md | 1 + configs/manager/halfstack.toml | 24 +- python.lock | 28 ++ requirements.txt | 1 + src/ai/backend/common/distributed.py | 74 +++- src/ai/backend/manager/api/context.py | 23 +- src/ai/backend/manager/api/logs.py | 34 +- src/ai/backend/manager/cli/__main__.py | 82 +++++ src/ai/backend/manager/config.py | 56 +++- src/ai/backend/manager/idle.py | 39 ++- src/ai/backend/manager/raft/BUILD | 1 + src/ai/backend/manager/raft/__init__.py | 0 src/ai/backend/manager/raft/logger.py | 24 ++ src/ai/backend/manager/raft/state_machine.py | 48 +++ src/ai/backend/manager/raft/utils.py | 101 ++++++ .../backend/manager/scheduler/dispatcher.py | 317 +++++++++--------- src/ai/backend/manager/server.py | 120 ++++++- src/ai/backend/manager/types.py | 9 + tests/common/test_distributed.py | 10 +- tests/manager/test_idle_checker.py | 20 +- tests/manager/test_scheduler.py | 3 + 21 files changed, 829 insertions(+), 186 deletions(-) create mode 100644 changes/1506.feature.md create mode 100644 src/ai/backend/manager/raft/BUILD create mode 100644 src/ai/backend/manager/raft/__init__.py create mode 100644 src/ai/backend/manager/raft/logger.py create mode 100644 src/ai/backend/manager/raft/state_machine.py create mode 100644 src/ai/backend/manager/raft/utils.py diff --git a/changes/1506.feature.md b/changes/1506.feature.md new file mode 100644 index 0000000000..e5056d8038 --- /dev/null +++ b/changes/1506.feature.md @@ -0,0 +1 @@ +Add Raft-based leader election process to manager group in HA condition in order to make their states consistent. diff --git a/configs/manager/halfstack.toml b/configs/manager/halfstack.toml index 69074f1cfc..742cfaab2d 100644 --- a/configs/manager/halfstack.toml +++ b/configs/manager/halfstack.toml @@ -15,7 +15,7 @@ pool-recycle = 50 [manager] -num-proc = 4 +num-proc = 3 service-addr = { host = "0.0.0.0", port = 8081 } #user = "nobody" #group = "nobody" @@ -34,6 +34,27 @@ hide-agents = true # The order of agent selection. 
agent-selection-resource-priority = ["cuda", "rocm", "tpu", "cpu", "mem"] +[raft] +heartbeat-tick = 3 +election-tick = 10 +log-dir = "./logs" +log-level = "debug" + +[[raft.peers]] +host = "127.0.0.1" +port = 60151 +node-id = 1 + +[[raft.peers]] +host = "127.0.0.1" +port = 60152 +node-id = 2 + +[[raft.peers]] +host = "127.0.0.1" +port = 60153 +node-id = 3 + [docker-registry] ssl-verify = false @@ -47,6 +68,7 @@ drivers = ["console"] "aiotools" = "INFO" "aiohttp" = "INFO" "ai.backend" = "INFO" +"ai.backend.manager.server.raft" = "INFO" "alembic" = "INFO" "sqlalchemy" = "WARNING" diff --git a/python.lock b/python.lock index 2db44c05b7..db5280c8ce 100644 --- a/python.lock +++ b/python.lock @@ -72,7 +72,12 @@ // "python-dateutil>=2.8", // "python-dotenv~=0.20.0", // "python-json-logger>=2.0.1", +<<<<<<< HEAD // "pyzmq~=25.1.2", +======= +// "pyzmq~=24.0.1", +// "raftify==0.1.42", +>>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) // "redis[hiredis]==4.5.5", // "rich~=13.6", // "setproctitle~=1.3.2", @@ -3374,6 +3379,24 @@ "requires_python": ">=3.6", "version": "25.1.2" }, + { + "artifacts": [ + { + "algorithm": "sha256", + "hash": "ce3208bce423aa29eb7e328f1dcae653612866ccb2d5977306e833de186fce93", + "url": "https://files.pythonhosted.org/packages/bc/73/bde132e18ff1186d34044e53ab8fec92826f4cd2b64176b27e92594d2fc9/raftify-0.1.42-cp311-cp311-macosx_11_0_arm64.whl" + }, + { + "algorithm": "sha256", + "hash": "ab6e0238073b0bed73ee52edd53ee917d6c52ae0be4b62efbf4d4ee447d1b312", + "url": "https://files.pythonhosted.org/packages/69/d6/d157854fed70ac6acd350a88cedca59a27b7ee4ddb9b6c009c31a649d471/raftify-0.1.42.tar.gz" + } + ], + "project_name": "raftify", + "requires_dists": [], + "requires_python": ">=3.10", + "version": "0.1.42" + }, { "artifacts": [ { @@ -4638,7 +4661,12 @@ "python-dateutil>=2.8", "python-dotenv~=0.20.0", "python-json-logger>=2.0.1", +<<<<<<< HEAD "pyzmq~=25.1.2", +======= + "pyzmq~=24.0.1", + "raftify==0.1.42", +>>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) "redis[hiredis]==4.5.5", "rich~=13.6", "setproctitle~=1.3.2", diff --git a/requirements.txt b/requirements.txt index bbd30f7c2f..496b5a3b8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -95,3 +95,4 @@ backend.ai-krunner-alpine==5.1.0 backend.ai-krunner-static-gnu==4.1.1 etcd-client-py==0.2.4 +raftify==0.1.42 diff --git a/src/ai/backend/common/distributed.py b/src/ai/backend/common/distributed.py index f89cc13e4b..876fd598fe 100644 --- a/src/ai/backend/common/distributed.py +++ b/src/ai/backend/common/distributed.py @@ -1,10 +1,12 @@ from __future__ import annotations +import abc import asyncio import logging from typing import TYPE_CHECKING, Callable, Final from aiomonitor.task import preserve_termination_log +from raftify import RaftNode from .logging import BraceStyleAdapter @@ -16,7 +18,77 @@ log = BraceStyleAdapter(logging.getLogger(__spec__.name)) # type: ignore[name-defined] -class GlobalTimer: +class AbstractGlobalTimer(metaclass=abc.ABCMeta): + @abc.abstractmethod + async def generate_tick(self) -> None: + raise NotImplementedError + + @abc.abstractmethod + async def join(self) -> None: + raise NotImplementedError + + @abc.abstractmethod + async def leave(self) -> None: + raise NotImplementedError + + +class RaftGlobalTimer(AbstractGlobalTimer): + """ + Executes the given async function only once in the given interval, + uniquely among multiple manager instances across multiple nodes. 
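+
+    Unlike DistributedLockGlobalTimer below, no distributed lock is taken: the
+    tick event is produced only while the local Raft node reports itself as the
+    current cluster leader.
+
+    A minimal usage sketch, mirroring how the idle checker wires this timer up
+    (raft_node and event_producer come from the manager's root context):
+
+        timer = RaftGlobalTimer(raft_node, event_producer, lambda: DoIdleCheckEvent(), 10.0)
+        await timer.join()   # spawn the periodic tick task
+        ...
+        await timer.leave()  # stop and cancel the tick task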
+ """ + + _event_producer: Final[EventProducer] + + def __init__( + self, + raft_node: RaftNode, + event_producer: EventProducer, + event_factory: Callable[[], AbstractEvent], + interval: float = 10.0, + initial_delay: float = 0.0, + ) -> None: + self._event_producer = event_producer + self._event_factory = event_factory + self._stopped = False + self.interval = interval + self.initial_delay = initial_delay + self.raft_node = raft_node + + async def generate_tick(self) -> None: + try: + await asyncio.sleep(self.initial_delay) + if self._stopped: + return + while True: + try: + if self._stopped: + return + if await self.raft_node.is_leader(): + await self._event_producer.produce_event(self._event_factory()) + if self._stopped: + return + await asyncio.sleep(self.interval) + except asyncio.TimeoutError: # timeout raised from etcd lock + log.warn("timeout raised while trying to acquire lock. retrying...") + except asyncio.CancelledError: + pass + + async def join(self) -> None: + self._tick_task = asyncio.create_task(self.generate_tick()) + + async def leave(self) -> None: + self._stopped = True + await asyncio.sleep(0) + if not self._tick_task.done(): + try: + self._tick_task.cancel() + await self._tick_task + except asyncio.CancelledError: + pass + + +class DistributedLockGlobalTimer(AbstractGlobalTimer): """ Executes the given async function only once in the given interval, uniquely among multiple manager instances across multiple nodes. diff --git a/src/ai/backend/manager/api/context.py b/src/ai/backend/manager/api/context.py index d8a989b15e..184162e334 100644 --- a/src/ai/backend/manager/api/context.py +++ b/src/ai/backend/manager/api/context.py @@ -1,8 +1,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional, cast import attrs +from raftify import Raft, RaftNode if TYPE_CHECKING: from ai.backend.common.bgtask import BackgroundTaskManager @@ -26,6 +27,25 @@ class BaseContext: pass +class RaftClusterContext: + _cluster: Optional[Raft] = None + + def use_raft(self) -> bool: + return self._cluster is not None + + @property + def cluster(self) -> Raft: + return cast(Raft, self._cluster) + + @cluster.setter + def cluster(self, rhs: Raft) -> None: + self._cluster = rhs + + @property + def raft_node(self) -> RaftNode: + return self.cluster.get_raft_node() + + @attrs.define(slots=True, auto_attribs=True, init=False) class RootContext(BaseContext): pidx: int @@ -53,3 +73,4 @@ class RootContext(BaseContext): error_monitor: ErrorPluginContext stats_monitor: StatsPluginContext background_task_manager: BackgroundTaskManager + raft_ctx: RaftClusterContext diff --git a/src/ai/backend/manager/api/logs.py b/src/ai/backend/manager/api/logs.py index cd99713250..931b9b0174 100644 --- a/src/ai/backend/manager/api/logs.py +++ b/src/ai/backend/manager/api/logs.py @@ -14,7 +14,11 @@ from dateutil.relativedelta import relativedelta from ai.backend.common import validators as tx -from ai.backend.common.distributed import GlobalTimer +from ai.backend.common.distributed import ( + AbstractGlobalTimer, + DistributedLockGlobalTimer, + RaftGlobalTimer, +) from ai.backend.common.events import AbstractEvent, EmptyEventArgs, EventHandler from ai.backend.common.logging import BraceStyleAdapter from ai.backend.common.types import AgentId, LogSeverity @@ -234,7 +238,7 @@ async def log_cleanup_task(app: web.Application, src: AgentId, event: DoLogClean @attrs.define(slots=True, auto_attribs=True, init=False) class PrivateContext: - log_cleanup_timer: GlobalTimer 
+ log_cleanup_timer: AbstractGlobalTimer log_cleanup_timer_evh: EventHandler[web.Application, DoLogCleanupEvent] @@ -246,14 +250,24 @@ async def init(app: web.Application) -> None: app, log_cleanup_task, ) - app_ctx.log_cleanup_timer = GlobalTimer( - root_ctx.distributed_lock_factory(LockID.LOCKID_LOG_CLEANUP_TIMER, 20.0), - root_ctx.event_producer, - lambda: DoLogCleanupEvent(), - 20.0, - initial_delay=17.0, - task_name="log_cleanup_task", - ) + + if root_ctx.raft_ctx.use_raft(): + app_ctx.log_cleanup_timer = RaftGlobalTimer( + root_ctx.raft_ctx.raft_node, + root_ctx.event_producer, + lambda: DoLogCleanupEvent(), + 20.0, + initial_delay=17.0, + ) + else: + app_ctx.log_cleanup_timer = DistributedLockGlobalTimer( + root_ctx.distributed_lock_factory(LockID.LOCKID_LOG_CLEANUP_TIMER, 20.0), + root_ctx.event_producer, + lambda: DoLogCleanupEvent(), + 20.0, + initial_delay=17.0, + task_name="log_cleanup_task", + ) await app_ctx.log_cleanup_timer.join() diff --git a/src/ai/backend/manager/cli/__main__.py b/src/ai/backend/manager/cli/__main__.py index 100a487276..6e3b420d85 100644 --- a/src/ai/backend/manager/cli/__main__.py +++ b/src/ai/backend/manager/cli/__main__.py @@ -1,16 +1,20 @@ from __future__ import annotations import asyncio +import json import logging import pathlib import subprocess import sys from datetime import datetime from functools import partial +from typing import Any import click from more_itertools import chunked +from raftify import Peers, RaftServiceClient, cli_main from setproctitle import setproctitle +from tabulate import tabulate from ai.backend.cli.params import BoolExprType, OptionalType from ai.backend.cli.types import ExitCode @@ -19,6 +23,7 @@ from ai.backend.common.logging import BraceStyleAdapter from ai.backend.common.types import LogSeverity from ai.backend.common.validators import TimeDuration +from ai.backend.manager.raft.utils import register_custom_deserializer from .context import CLIContext, redis_ctx @@ -326,6 +331,83 @@ async def _clear_terminated_sessions(): asyncio.run(_clear_terminated_sessions()) +async def inspect_node_status(cli_ctx: CLIContext) -> None: + raft_configs = cli_ctx.local_config["raft"] + table = [] + headers = ["ENDPOINT", "NODE ID", "IS LEADER", "RAFT TERM", "RAFT APPLIED INDEX"] + + if raft_configs is not None: + initial_peers = Peers({ + int(entry["node-id"]): f"{entry['host']}:{entry['port']}" + for entry in raft_configs["peers"] + }) + + peers: dict[str, Any] | None = None + for _, peer_addr in initial_peers.items(): + raft_client = await RaftServiceClient.build(peer_addr) + try: + resp = await raft_client.get_peers() + peers = json.loads(resp) + except Exception as e: + print(f"Failed to getting peers from {peer_addr}: {e}") + continue + + if peers is None: + print("No peers are available!") + return + + for node_id in sorted(peers.keys()): + peer = peers[node_id] + raft_client = await RaftServiceClient.build(peer["addr"]) + + try: + node_debugging_info = json.loads(await raft_client.debug_node()) + except Exception as e: + print(f"Failed to getting debugging info from {peer['addr']}: {e}") + table.append([peer["addr"], "(Invalid response)"]) + + is_leader = node_debugging_info["node_id"] == node_debugging_info["leader_id"] + table.append([ + peer["addr"], + node_debugging_info["node_id"], + is_leader, + node_debugging_info["term"], + node_debugging_info["raft_log"]["applied"], + ]) + + table = [headers, *sorted(table, key=lambda x: str(x[0]))] + print( + tabulate(table, headers="firstrow", tablefmt="grid", stralign="center", 
numalign="center") + ) + + +@main.command() +@click.pass_obj +def status(cli_ctx: CLIContext) -> None: + """ + Collect and print each manager process's status. + """ + asyncio.run(inspect_node_status(cli_ctx)) + + +async def handle_raft_cli_main(argv: list[str]): + await cli_main(argv) + + +@main.command() +@click.pass_obj +@click.argument("args", nargs=-1, type=click.UNPROCESSED) +def raft(cli_ctx: CLIContext, args) -> None: + register_custom_deserializer() + + argv = sys.argv + # Remove "backend.ai", "mgr", "raft" from the argv + argv[:3] = [] + argv.insert(0, "raftify-cli") + + asyncio.run(handle_raft_cli_main(argv)) + + @main.group(cls=LazyGroup, import_name="ai.backend.manager.cli.dbschema:cli") def schema(): """Command set for managing the database schema.""" diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index 9b3a4ed541..bbe90ffa4e 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -1,3 +1,5 @@ +from __future__ import annotations + """ Configuration Schema on etcd ---------------------------- @@ -172,8 +174,6 @@ - {instance-id}: 1 # just a membership set """ -from __future__ import annotations - import json import logging import os @@ -217,6 +217,7 @@ SlotTypes, current_resource_slots, ) +from ai.backend.manager.types import RaftLogLovel from ..manager.defs import INTRINSIC_SLOTS from .api import ManagerStatus @@ -301,6 +302,56 @@ t.Key("log-scheduler-ticks", default=False): t.ToBool, t.Key("periodic-sync-stats", default=False): t.ToBool, }).allow_extra("*"), + t.Key("raft", default=None): ( + t.Dict({ + # Cluster configurations + ## Cluster's Leader node id + t.Key("cluster-leader-id", default=1): t.Int, + ## This would be useful when adding new RaftNodes to an existing cluster without restarting the server. + t.Key("bootstrap-done", default=False): t.ToBool, + ## Set this to the max(node_ids) when joining RaftNodes to another cluster. + t.Key("restore-wal-from", default=None): t.Int | t.Null, + t.Key("restore-wal-snapshot-from", default=None): t.Int | t.Null, + # Initial peers + ## my peers + t.Key("myself"): t.List( + t.Dict({ + t.Key("node-id"): t.Int, + t.Key("host"): t.String, + t.Key("port"): t.Int, + }) + ), + ## Other peers + t.Key("peers", default=[]): t.List( + t.Dict({ + t.Key("node-id"): t.Int, + t.Key("host"): t.String, + t.Key("port"): t.Int, + }) + ) + | t.Null, + # Storage configurations + t.Key("log-dir"): t.String, + # Logging configurations + t.Key("log-level", default=RaftLogLovel.INFO): tx.Enum(RaftLogLovel), + # Raft core configurations + # TODO: Decide proper default values for these configs. 
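+            ## Each key below maps to the raftify RaftConfig field of the same name
+            ## (dashes replaced by underscores) and is passed through as-is by the
+            ## raft_ctx() context manager in server.py.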
+ t.Key("heartbeat-tick", default=None): t.Int | t.Null, + t.Key("election-tick", default=None): t.Int | t.Null, + t.Key("min-election-tick", default=None): t.Int | t.Null, + t.Key("max-election-tick", default=None): t.Int | t.Null, + t.Key("max-committed-size-per-ready", default=None): t.Int | t.Null, + t.Key("max-size-per-msg", default=None): t.Int | t.Null, + t.Key("max-inflight-msgs", default=None): t.Int | t.Null, + t.Key("check-quorum", default=None): t.ToBool | t.Null, + t.Key("batch-append", default=None): t.ToBool | t.Null, + t.Key("max-uncommitted-size", default=None): t.Int | t.Null, + t.Key("skip-bcast-commit", default=None): t.ToBool | t.Null, + t.Key("pre-vote", default=None): t.ToBool | t.Null, + t.Key("priority", default=None): t.Int | t.Null, + }).allow_extra("*") + | t.Null + ), }) .merge(config.etcd_config_iv) .allow_extra("*") @@ -344,6 +395,7 @@ "threshold": {}, }, }, + "raft": None, } container_registry_iv = t.Dict({ diff --git a/src/ai/backend/manager/idle.py b/src/ai/backend/manager/idle.py index d0323857ca..1a9bbe1ada 100644 --- a/src/ai/backend/manager/idle.py +++ b/src/ai/backend/manager/idle.py @@ -35,7 +35,11 @@ import ai.backend.common.validators as tx from ai.backend.common import msgpack, redis_helper from ai.backend.common.defs import REDIS_LIVE_DB, REDIS_STAT_DB -from ai.backend.common.distributed import GlobalTimer +from ai.backend.common.distributed import ( + AbstractGlobalTimer, + DistributedLockGlobalTimer, + RaftGlobalTimer, +) from ai.backend.common.events import ( AbstractEvent, DoIdleCheckEvent, @@ -58,13 +62,14 @@ SessionTypes, ) from ai.backend.common.utils import nmget +from ai.backend.manager.api.context import RaftClusterContext +from ai.backend.manager.types import DistributedLockFactory from .defs import DEFAULT_ROLE, LockID from .models.kernel import LIVE_STATUS, kernels from .models.keypair import keypairs from .models.resource_policy import keypair_resource_policies from .models.user import users -from .types import DistributedLockFactory if TYPE_CHECKING: from sqlalchemy.ext.asyncio import AsyncConnection as SAConnection @@ -169,6 +174,7 @@ class RemainingTimeType(enum.StrEnum): class IdleCheckerHost: + timer: AbstractGlobalTimer check_interval: ClassVar[float] = DEFAULT_CHECK_INTERVAL def __init__( @@ -177,6 +183,7 @@ def __init__( shared_config: SharedConfig, event_dispatcher: EventDispatcher, event_producer: EventProducer, + raft_ctx: RaftClusterContext, lock_factory: DistributedLockFactory, ) -> None: self._checkers: list[BaseIdleChecker] = [] @@ -199,6 +206,7 @@ def __init__( self._grace_period_checker: NewUserGracePeriodChecker = NewUserGracePeriodChecker( event_dispatcher, self._redis_live, self._redis_stat ) + self.raft_ctx = raft_ctx def add_checker(self, checker: BaseIdleChecker): if self._frozen: @@ -218,13 +226,22 @@ async def start(self) -> None: ) for checker in self._checkers: await checker.populate_config(raw_config.get(checker.name) or {}) - self.timer = GlobalTimer( - self._lock_factory(LockID.LOCKID_IDLE_CHECK_TIMER, self.check_interval), - self._event_producer, - lambda: DoIdleCheckEvent(), - self.check_interval, - task_name="idle_checker", - ) + + if self.raft_ctx.use_raft(): + self.timer = RaftGlobalTimer( + self.raft_ctx.raft_node, + self._event_producer, + lambda: DoIdleCheckEvent(), + self.check_interval, + ) + else: + self.timer = DistributedLockGlobalTimer( + self._lock_factory(LockID.LOCKID_IDLE_CHECK_TIMER, self.check_interval), + self._event_producer, + lambda: DoIdleCheckEvent(), + self.check_interval, + ) 
+ self._evh_idle_check = self._event_dispatcher.consume( DoIdleCheckEvent, None, @@ -855,7 +872,7 @@ async def check_idleness( if (window_size <= 0) or (math.isinf(window_size) and window_size > 0): return True - # Wait until the time "interval" is passed after the last udpated time. + # Wait until the time "interval" is passed after the last updated time. t = await redis_helper.execute(self._redis_live, lambda r: r.time()) util_now: float = t[0] + (t[1] / (10**6)) raw_util_last_collected = await redis_helper.execute( @@ -1057,6 +1074,7 @@ async def init_idle_checkers( shared_config: SharedConfig, event_dispatcher: EventDispatcher, event_producer: EventProducer, + raft_ctx: RaftClusterContext, lock_factory: DistributedLockFactory, ) -> IdleCheckerHost: """ @@ -1068,6 +1086,7 @@ async def init_idle_checkers( shared_config, event_dispatcher, event_producer, + raft_ctx, lock_factory, ) checker_init_args = (event_dispatcher, checker_host._redis_live, checker_host._redis_stat) diff --git a/src/ai/backend/manager/raft/BUILD b/src/ai/backend/manager/raft/BUILD new file mode 100644 index 0000000000..7357442404 --- /dev/null +++ b/src/ai/backend/manager/raft/BUILD @@ -0,0 +1 @@ +python_sources(name="src") diff --git a/src/ai/backend/manager/raft/__init__.py b/src/ai/backend/manager/raft/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/ai/backend/manager/raft/logger.py b/src/ai/backend/manager/raft/logger.py new file mode 100644 index 0000000000..0ba3ccf57f --- /dev/null +++ b/src/ai/backend/manager/raft/logger.py @@ -0,0 +1,24 @@ +from typing import Any + + +class Logger: + def __init__(self, logger: Any) -> None: + self.logger = logger + + def trace(self, message): + self.logger.debug(message) + + def debug(self, message): + self.logger.debug(message) + + def info(self, message): + self.logger.info(message) + + def warn(self, message): + self.logger.warning(message) + + def error(self, message): + self.logger.error(message) + + def fatal(self, message): + self.logger.error(message) diff --git a/src/ai/backend/manager/raft/state_machine.py b/src/ai/backend/manager/raft/state_machine.py new file mode 100644 index 0000000000..e63b1434f8 --- /dev/null +++ b/src/ai/backend/manager/raft/state_machine.py @@ -0,0 +1,48 @@ +import pickle +from typing import Optional + + +class SetCommand: + """ + Represent simple key-value command. + Use pickle to serialize the data. + """ + + def __init__(self, key: str, value: str) -> None: + self.key = key + self.value = value + + def encode(self) -> bytes: + return pickle.dumps(self.__dict__) + + @classmethod + def decode(cls, packed: bytes) -> "SetCommand": + unpacked = pickle.loads(packed) + return cls(unpacked["key"], unpacked["value"]) + + +class HashStore: + """ + A simple key-value store that stores data in memory. + Use pickle to serialize the data. 
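+    Instances are handed to raftify as the replicated state machine (FSM):
+    apply() consumes committed SetCommand entries, while snapshot() and
+    restore() serialize and reload the whole store.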
+ """ + + def __init__(self): + self._store = dict() + + def get(self, key: str) -> Optional[str]: + return self._store.get(key) + + def as_dict(self) -> dict: + return self._store + + def apply(self, msg: bytes) -> bytes: + message = SetCommand.decode(msg) + self._store[message.key] = message.value + return msg + + def snapshot(self) -> bytes: + return pickle.dumps(self._store) + + def restore(self, snapshot: bytes) -> None: + self._store = pickle.loads(snapshot) diff --git a/src/ai/backend/manager/raft/utils.py b/src/ai/backend/manager/raft/utils.py new file mode 100644 index 0000000000..ad0509185d --- /dev/null +++ b/src/ai/backend/manager/raft/utils.py @@ -0,0 +1,101 @@ +import pickle +from typing import Any + +from aiohttp import web +from aiohttp.web import RouteTableDef +from raftify import ( + Raft, + set_confchange_context_deserializer, + set_confchangev2_context_deserializer, + set_entry_context_deserializer, + set_entry_data_deserializer, + set_fsm_deserializer, + set_log_entry_deserializer, + set_message_context_deserializer, + set_snapshot_data_deserializer, +) + +from ai.backend.manager.raft.state_machine import HashStore, SetCommand + +routes = RouteTableDef() +""" +APIs of the web servers to interact with the RaftServers. +""" + + +@routes.get("/get/{id}") +async def get(request: web.Request) -> web.Response: + store: HashStore = request.app["state"]["store"] + id = request.match_info["id"] + return web.Response(text=store.get(id)) + + +@routes.get("/leader") +async def leader(request: web.Request) -> web.Response: + raft: Raft = request.app["state"]["raft"] + leader_id = str(await raft.get_raft_node().get_leader_id()) + return web.Response(text=leader_id) + + +@routes.get("/size") +async def size(request: web.Request) -> web.Response: + raft: Raft = request.app["state"]["raft"] + size = str(await raft.get_raft_node().get_cluster_size()) + return web.Response(text=size) + + +@routes.get("/put/{id}/{value}") +async def put(request: web.Request) -> web.Response: + raft: Raft = request.app["state"]["raft"] + id, value = request.match_info["id"], request.match_info["value"] + message = SetCommand(id, value) + + await raft.get_raft_node().propose(message.encode()) + return web.Response(text="OK") + + +class WebServer: + """ + Simple webserver for Raft cluster testing. + Do not use this class for anything other than testing purposes. + """ + + def __init__(self, addr: str, state: dict[str, Any]): + self.app = web.Application() + self.app.add_routes(routes) + self.app["state"] = state + self.host, self.port = addr.split(":") + self.runner = None + + async def run(self): + self.runner = web.AppRunner(self.app) + await self.runner.setup() + self.site = web.TCPSite(self.runner, self.host, self.port) + await self.site.start() + + +def pickle_deserialize(data: bytes) -> str | None: + if data == b"": + return None + + if pickle.PROTO in data: + r = pickle.loads(data[data.index(pickle.PROTO) :]) + return r + + # Not pickle data + return None + + +def register_custom_deserializer() -> None: + """ + Initialize the custom deserializers. 
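+
+    Raft log entries, snapshots, and FSM payloads are pickled (see SetCommand
+    and HashStore), so these hooks let raftify decode them back into readable
+    values, e.g. for the `backend.ai mgr raft` debug commands. Call this once
+    before creating a Raft instance or invoking the raftify CLI.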
+ """ + + set_confchange_context_deserializer(pickle_deserialize) + set_confchangev2_context_deserializer(pickle_deserialize) + set_entry_context_deserializer(pickle_deserialize) + set_entry_data_deserializer(pickle_deserialize) + set_message_context_deserializer(pickle_deserialize) + set_snapshot_data_deserializer(pickle_deserialize) + set_log_entry_deserializer(pickle_deserialize) + set_fsm_deserializer(pickle_deserialize) diff --git a/src/ai/backend/manager/scheduler/dispatcher.py b/src/ai/backend/manager/scheduler/dispatcher.py index 88ca4f2844..17544dcccc 100644 --- a/src/ai/backend/manager/scheduler/dispatcher.py +++ b/src/ai/backend/manager/scheduler/dispatcher.py @@ -34,7 +34,11 @@ from ai.backend.common import redis_helper from ai.backend.common.defs import REDIS_LIVE_DB -from ai.backend.common.distributed import GlobalTimer +from ai.backend.common.distributed import ( + AbstractGlobalTimer, + DistributedLockGlobalTimer, + RaftGlobalTimer, +) from ai.backend.common.events import ( AgentStartedEvent, CoalescingOptions, @@ -62,15 +66,16 @@ SessionId, aobject, ) +from ai.backend.manager.api.context import RaftClusterContext +from ai.backend.manager.defs import SERVICE_MAX_RETRIES, LockID +from ai.backend.manager.models.agent import AgentRow from ai.backend.manager.models.session import _build_session_fetch_query from ai.backend.manager.types import DistributedLockFactory from ai.backend.plugin.entrypoint import scan_entrypoints from ..api.exceptions import GenericBadRequest, InstanceNotAvailable, SessionNotFound -from ..defs import SERVICE_MAX_RETRIES, LockID from ..exceptions import convert_to_status_data from ..models import ( - AgentRow, AgentStatus, EndpointLifecycle, EndpointRow, @@ -152,21 +157,22 @@ def load_scheduler( class SchedulerDispatcher(aobject): - config: LocalConfig + local_config: LocalConfig shared_config: SharedConfig registry: AgentRegistry db: SAEngine event_dispatcher: EventDispatcher event_producer: EventProducer - schedule_timer: GlobalTimer - prepare_timer: GlobalTimer - scale_timer: GlobalTimer + schedule_timer: AbstractGlobalTimer + prepare_timer: AbstractGlobalTimer + scale_timer: AbstractGlobalTimer redis_live: RedisConnectionInfo def __init__( self, + raft_ctx: RaftClusterContext, local_config: LocalConfig, shared_config: SharedConfig, event_dispatcher: EventDispatcher, @@ -174,6 +180,7 @@ def __init__( lock_factory: DistributedLockFactory, registry: AgentRegistry, ) -> None: + self.raft_ctx = raft_ctx self.local_config = local_config self.shared_config = shared_config self.event_dispatcher = event_dispatcher @@ -204,32 +211,54 @@ async def __ainit__(self) -> None: evd.consume(DoScheduleEvent, None, self.schedule, coalescing_opts) evd.consume(DoPrepareEvent, None, self.prepare) evd.consume(DoScaleEvent, None, self.scale_services) - self.schedule_timer = GlobalTimer( - self.lock_factory(LockID.LOCKID_SCHEDULE_TIMER, 10.0), - self.event_producer, - lambda: DoScheduleEvent(), - interval=10.0, - task_name="schedule_timer", - ) - self.prepare_timer = GlobalTimer( - self.lock_factory(LockID.LOCKID_PREPARE_TIMER, 10.0), - self.event_producer, - lambda: DoPrepareEvent(), - interval=10.0, - initial_delay=5.0, - task_name="prepare_timer", - ) - self.scale_timer = GlobalTimer( - self.lock_factory(LockID.LOCKID_SCALE_TIMER, 10.0), - self.event_producer, - lambda: DoScaleEvent(), - interval=10.0, - initial_delay=7.0, - task_name="scale_timer", - ) + + if self.raft_ctx.use_raft(): + self.schedule_timer = RaftGlobalTimer( + self.raft_ctx.raft_node, + 
self.event_producer, + lambda: DoScheduleEvent(), + interval=10.0, + ) + self.prepare_timer = RaftGlobalTimer( + self.raft_ctx.raft_node, + self.event_producer, + lambda: DoPrepareEvent(), + interval=10.0, + initial_delay=5.0, + ) + self.scale_timer = RaftGlobalTimer( + self.raft_ctx.raft_node, + self.event_producer, + lambda: DoScaleEvent(), + interval=10.0, + initial_delay=7.0, + ) + else: + self.schedule_timer = DistributedLockGlobalTimer( + self.lock_factory(LockID.LOCKID_SCHEDULE_TIMER, 10.0), + self.event_producer, + lambda: DoScheduleEvent(), + interval=10.0, + ) + self.prepare_timer = DistributedLockGlobalTimer( + self.lock_factory(LockID.LOCKID_PREPARE_TIMER, 10.0), + self.event_producer, + lambda: DoPrepareEvent(), + interval=10.0, + initial_delay=5.0, + ) + self.scale_timer = DistributedLockGlobalTimer( + self.lock_factory(LockID.LOCKID_SCALE_TIMER, 10.0), + self.event_producer, + lambda: DoScaleEvent(), + interval=10.0, + initial_delay=7.0, + ) + await self.schedule_timer.join() await self.prepare_timer.join() await self.scale_timer.join() + log.info("Session scheduler started") async def close(self) -> None: @@ -237,7 +266,6 @@ async def close(self) -> None: tg.create_task(self.scale_timer.leave()) tg.create_task(self.prepare_timer.leave()) tg.create_task(self.schedule_timer.leave()) - await self.redis_live.close() log.info("Session scheduler stopped") async def schedule( @@ -283,52 +311,33 @@ def _pipeline(r: Redis) -> RedisPipeline: ) try: - # The schedule() method should be executed with a global lock - # as its individual steps are composed of many short-lived transactions. - async with self.lock_factory(LockID.LOCKID_SCHEDULE, 60): - async with self.db.begin_readonly_session() as db_sess: - # query = ( - # sa.select(ScalingGroupRow) - # .join(ScalingGroupRow.agents.and_(AgentRow.status == AgentStatus.ALIVE)) - # ) - query = ( - sa.select(AgentRow.scaling_group) - .where(AgentRow.status == AgentStatus.ALIVE) - .group_by(AgentRow.scaling_group) - ) - result = await db_sess.execute(query) - schedulable_scaling_groups = [row.scaling_group for row in result.fetchall()] - for sgroup_name in schedulable_scaling_groups: - try: - await self._schedule_in_sgroup( - sched_ctx, - sgroup_name, - ) - await redis_helper.execute( - self.redis_live, - lambda r: r.hset( - redis_key, - "resource_group", - sgroup_name, - ), - ) - except InstanceNotAvailable as e: - # Proceed to the next scaling group and come back later. - log.debug( - "schedule({}): instance not available ({})", - sgroup_name, - e.extra_msg, - ) - except Exception as e: - log.exception("schedule({}): scheduling error!\n{}", sgroup_name, repr(e)) - await redis_helper.execute( - self.redis_live, - lambda r: r.hset( - redis_key, - "finish_time", - datetime.now(tzutc()).isoformat(), - ), + async with self.db.begin_readonly_session() as db_sess: + # query = ( + # sa.select(ScalingGroupRow) + # .join(ScalingGroupRow.agents.and_(AgentRow.status == AgentStatus.ALIVE)) + # ) + query = ( + sa.select(AgentRow.scaling_group) + .where(AgentRow.status == AgentStatus.ALIVE) + .group_by(AgentRow.scaling_group) ) + result = await db_sess.execute(query) + schedulable_scaling_groups = [row.scaling_group for row in result.fetchall()] + for sgroup_name in schedulable_scaling_groups: + try: + await self._schedule_in_sgroup( + sched_ctx, + sgroup_name, + ) + except InstanceNotAvailable as e: + # Proceed to the next scaling group and come back later. 
+ log.debug( + "schedule({}): instance not available ({})", + sgroup_name, + e.extra_msg, + ) + except Exception as e: + log.exception("schedule({}): scheduling error!\n{}", sgroup_name, repr(e)) except DBAPIError as e: if getattr(e.orig, "pgcode", None) == "55P03": log.info( @@ -721,6 +730,7 @@ async def _schedule_single_node_session( log_fmt = _log_fmt.get("") log_args = _log_args.get(tuple()) requested_architectures = set(k.architecture for k in sess_ctx.kernels) + if len(requested_architectures) > 1: raise GenericBadRequest( "Cannot assign multiple kernels with different architectures' single node session", @@ -1253,91 +1263,90 @@ def _pipeline(r: Redis) -> RedisPipeline: known_slot_types, ) try: - async with self.lock_factory(LockID.LOCKID_PREPARE, 600): - now = datetime.now(tzutc()) + now = datetime.now(tzutc()) - async def _mark_session_preparing() -> Sequence[SessionRow]: - async with self.db.begin_session() as db_sess: - update_query = ( - sa.update(KernelRow) - .values( - status=KernelStatus.PREPARING, - status_changed=now, - status_info="", - status_data={}, - status_history=sql_json_merge( - KernelRow.status_history, - (), - { - KernelStatus.PREPARING.name: now.isoformat(), - }, - ), - ) - .where( - (KernelRow.status == KernelStatus.SCHEDULED), - ) - ) - await db_sess.execute(update_query) - update_sess_query = ( - sa.update(SessionRow) - .values( - status=SessionStatus.PREPARING, - # status_changed=now, - status_info="", - status_data={}, - status_history=sql_json_merge( - SessionRow.status_history, - (), - { - SessionStatus.PREPARING.name: now.isoformat(), - }, - ), - ) - .where(SessionRow.status == SessionStatus.SCHEDULED) - .returning(SessionRow.id) + async def _mark_session_preparing() -> Sequence[SessionRow]: + async with self.db.begin_session() as db_sess: + update_query = ( + sa.update(KernelRow) + .values( + status=KernelStatus.PREPARING, + status_changed=now, + status_info="", + status_data={}, + status_history=sql_json_merge( + KernelRow.status_history, + (), + { + KernelStatus.PREPARING.name: now.isoformat(), + }, + ), ) - rows = (await db_sess.execute(update_sess_query)).fetchall() - if len(rows) == 0: - return [] - target_session_ids = [r["id"] for r in rows] - select_query = ( - sa.select(SessionRow) - .where(SessionRow.id.in_(target_session_ids)) - .options( - noload("*"), - selectinload(SessionRow.kernels).noload("*"), - ) + .where( + (KernelRow.status == KernelStatus.SCHEDULED), ) - result = await db_sess.execute(select_query) - return result.scalars().all() - - scheduled_sessions: Sequence[SessionRow] - scheduled_sessions = await execute_with_retry(_mark_session_preparing) - log.debug("prepare(): preparing {} session(s)", len(scheduled_sessions)) - async with ( - async_timeout.timeout(delay=50.0), - aiotools.PersistentTaskGroup() as tg, - ): - for scheduled_session in scheduled_sessions: - await self.registry.event_producer.produce_event( - SessionPreparingEvent( - scheduled_session.id, - scheduled_session.creation_id, + ) + await db_sess.execute(update_query) + update_sess_query = ( + sa.update(SessionRow) + .values( + status=SessionStatus.PREPARING, + # status_changed=now, + status_info="", + status_data={}, + status_history=sql_json_merge( + SessionRow.status_history, + (), + { + SessionStatus.PREPARING.name: now.isoformat(), + }, ), ) - tg.create_task( - self.start_session( - sched_ctx, - scheduled_session, - ) + .where(SessionRow.status == SessionStatus.SCHEDULED) + .returning(SessionRow.id) + ) + rows = (await 
db_sess.execute(update_sess_query)).fetchall() + if len(rows) == 0: + return [] + target_session_ids = [r["id"] for r in rows] + select_query = ( + sa.select(SessionRow) + .where(SessionRow.id.in_(target_session_ids)) + .options( + noload("*"), + selectinload(SessionRow.kernels).noload("*"), ) - - await redis_helper.execute( - self.redis_live, - lambda r: r.hset( - redis_key, "resource_group", scheduled_session.scaling_group_name - ), + ) + result = await db_sess.execute(select_query) + return result.scalars().all() + + scheduled_sessions: Sequence[SessionRow] + scheduled_sessions = await execute_with_retry(_mark_session_preparing) + log.debug("prepare(): preparing {} session(s)", len(scheduled_sessions)) + async with ( + async_timeout.timeout(delay=50.0), + aiotools.PersistentTaskGroup() as tg, + ): + for scheduled_session in scheduled_sessions: + await self.registry.event_producer.produce_event( + SessionPreparingEvent( + scheduled_session.id, + scheduled_session.creation_id, + ), + ) + tg.create_task( + self.start_session( + sched_ctx, + scheduled_session, ) + ) + + await redis_helper.execute( + self.redis_live, + lambda r: r.hset( + redis_key, "resource_group", scheduled_session.scaling_group_name + ), + ) await redis_helper.execute( self.redis_live, lambda r: r.hset( diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 9bb71a5cce..a153abf5f5 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -31,6 +31,8 @@ import aiotools import click from aiohttp import web +from aiotools import process_index +from raftify import ClusterJoinTicket, Config, Peers, Raft, RaftConfig from setproctitle import setproctitle from ai.backend.common import redis_helper @@ -50,11 +52,14 @@ from ai.backend.common.plugin.monitor import INCREMENT from ai.backend.common.types import AgentSelectionStrategy, LogSeverity from ai.backend.common.utils import env_info +from ai.backend.manager.raft.logger import Logger as RaftLogger +from ai.backend.manager.raft.state_machine import HashStore +from ai.backend.manager.raft.utils import WebServer, register_custom_deserializer from . 
import __version__ from .agent_cache import AgentRPCCache from .api import ManagerStatus -from .api.context import RootContext +from .api.context import RaftClusterContext, RootContext from .api.exceptions import ( BackendError, GenericBadRequest, @@ -426,6 +431,7 @@ async def idle_checker_ctx(root_ctx: RootContext) -> AsyncIterator[None]: root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) await root_ctx.idle_checker_host.start() @@ -504,6 +510,7 @@ async def sched_dispatcher_ctx(root_ctx: RootContext) -> AsyncIterator[None]: from .scheduler.dispatcher import SchedulerDispatcher sched_dispatcher = await SchedulerDispatcher.new( + root_ctx.raft_ctx, root_ctx.local_config, root_ctx.shared_config, root_ctx.event_dispatcher, @@ -650,6 +657,112 @@ async def _force_terminate_hanging_sessions( await task +@actxmgr +async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: + register_custom_deserializer() + local_config = root_ctx.local_config + raft_configs = local_config.get("raft") + + if raft_configs is not None: + other_peers = [{**peer, "myself": False} for peer in raft_configs["peers"]] + my_peers = [{**peer, "myself": True} for peer in raft_configs["myself"]] + all_peers = sorted([*other_peers, *my_peers], key=lambda x: x["node-id"]) + + initial_peers = Peers({ + int(peer_config["node-id"]): f"{peer_config['host']}:{peer_config['port']}" + for peer_config in all_peers + }) + + raft_core_config = RaftConfig( + heartbeat_tick=raft_configs["heartbeat-tick"], + election_tick=raft_configs["election-tick"], + min_election_tick=raft_configs["min-election-tick"], + max_election_tick=raft_configs["max-election-tick"], + max_committed_size_per_ready=raft_configs["max-committed-size-per-ready"], + max_size_per_msg=raft_configs["max-size-per-msg"], + max_inflight_msgs=raft_configs["max-inflight-msgs"], + check_quorum=raft_configs["check-quorum"], + batch_append=raft_configs["batch-append"], + max_uncommitted_size=raft_configs["max-uncommitted-size"], + skip_bcast_commit=raft_configs["skip-bcast-commit"], + pre_vote=raft_configs["pre-vote"], + priority=raft_configs["priority"], + ) + + raft_cfg = Config( + log_dir=raft_configs["log-dir"], + save_compacted_logs=True, + compacted_log_dir=raft_configs["log-dir"], + restore_wal_from=raft_configs["restore-wal-from"], + restore_wal_snapshot_from=raft_configs["restore-wal-snapshot-from"], + raft_config=raft_core_config, + ) + + node_id_offset = next((idx for idx, item in enumerate(all_peers) if item["myself"]), None) + node_id = node_id_offset + process_index.get() + 1 + raft_addr = initial_peers.get(node_id) + leader_id = raft_configs["cluster-leader-id"] + leader_addr = initial_peers.get(leader_id) + + if leader_addr is None: + raise Exception(f"Leader node {leader_id} not found in initial peers.") + + store = HashStore() + + leader_mark = "-leader" if node_id == leader_id else "" + + raft_logger = RaftLogger( + logging.getLogger(f"{__spec__.name}.raft.node-{node_id}{leader_mark}"), # type: ignore + ) + + if node_id == leader_id: + root_ctx.raft_ctx.cluster = Raft.bootstrap_cluster( + node_id, + raft_addr, + store, # type: ignore + raft_cfg, + raft_logger, # type: ignore + initial_peers, + ) + raft_cluster = root_ctx.raft_ctx.cluster + raft_cluster.run() # type: ignore + else: + root_ctx.raft_ctx.cluster = Raft.new_follower( + node_id, + raft_addr, + store, # type: ignore + raft_cfg, + raft_logger, # type: ignore + initial_peers, + ) + raft_cluster = 
root_ctx.raft_ctx.cluster + raft_cluster.run() # type: ignore + + # Wait for the leader node's gRPC server ready + await asyncio.sleep(2) + + if raft_configs["bootstrap-done"]: + await raft_cluster.join( + ClusterJoinTicket( + node_id, + leader_id, + leader_addr, + initial_peers, + ) + ) + await raft_cluster.get_raft_node().set_bootstrap_done() + else: + await raft_cluster.member_bootstrap_ready(leader_addr, node_id, raft_logger) # type: ignore + + # Only for testing + asyncio.create_task( + WebServer(f"127.0.0.1:6025{node_id}", {"raft": raft_cluster, "store": store}).run() + ) + + # assert root_ctx.raft_ctx.cluster.raft_node is not None, "RaftNode not initialized properly!" + yield + + class background_task_ctx: def __init__(self, root_ctx: RootContext) -> None: self.root_ctx = root_ctx @@ -796,6 +909,7 @@ def build_root_app( database_ctx, distributed_lock_ctx, event_dispatcher_ctx, + raft_ctx, idle_checker_ctx, storage_manager_ctx, hook_plugin_ctx, @@ -858,6 +972,8 @@ async def server_main( root_app = build_root_app(pidx, _args[0], subapp_pkgs=global_subapp_pkgs) root_ctx: RootContext = root_app["_root.context"] + root_ctx.raft_ctx = RaftClusterContext() + # Start aiomonitor. # Port is set by config (default=50100 + pidx). loop.set_debug(root_ctx.local_config["debug"]["asyncio"]) @@ -935,8 +1051,10 @@ async def server_main_logwrapper( _args: List[Any], ) -> AsyncIterator[None]: setproctitle(f"backend.ai: manager worker-{pidx}") + log_endpoint = _args[1] logger = Logger(_args[0]["logging"], is_master=False, log_endpoint=log_endpoint) + try: with logger: async with server_main(loop, pidx, _args): diff --git a/src/ai/backend/manager/types.py b/src/ai/backend/manager/types.py index 7d413594de..5f0a4c342f 100644 --- a/src/ai/backend/manager/types.py +++ b/src/ai/backend/manager/types.py @@ -41,3 +41,12 @@ class UserScope: class DistributedLockFactory(Protocol): def __call__(self, lock_id: LockID, lifetime_hint: float) -> AbstractDistributedLock: ... 
+ + +class RaftLogLovel(str, enum.Enum): + TRACE = "trace" + DEBUG = "debug" + INFO = "info" + WARN = "warn" + ERROR = "error" + FATAL = "fatal" diff --git a/tests/common/test_distributed.py b/tests/common/test_distributed.py index ca133685c8..af2d642866 100644 --- a/tests/common/test_distributed.py +++ b/tests/common/test_distributed.py @@ -17,7 +17,7 @@ from redis.asyncio import Redis from ai.backend.common import config -from ai.backend.common.distributed import GlobalTimer +from ai.backend.common.distributed import DistributedLockGlobalTimer from ai.backend.common.etcd import AsyncEtcd, ConfigScopes from ai.backend.common.events import AbstractEvent, EventDispatcher, EventProducer from ai.backend.common.lock import AbstractDistributedLock, EtcdLock, FileLock, RedisLock @@ -97,7 +97,7 @@ async def _tick(context: Any, source: AgentId, event: NoopEvent) -> None: ) event_dispatcher.consume(NoopEvent, None, _tick) - timer = GlobalTimer( + timer = DistributedLockGlobalTimer( lock_factory(), event_producer, lambda: NoopEvent(test_case_ns), @@ -149,7 +149,7 @@ async def _tick(context: Any, source: AgentId, event: NoopEvent) -> None: ConfigScopes.NODE: "node/i-test", }, ) - timer = GlobalTimer( + timer = DistributedLockGlobalTimer( EtcdLock(etcd_ctx.lock_name, etcd, timeout=None, debug=True), event_producer, lambda: NoopEvent(timer_ctx.test_case_ns), @@ -206,7 +206,7 @@ async def _tick(context: Any, source: AgentId, event: NoopEvent) -> None: ) event_dispatcher.consume(NoopEvent, None, _tick) - timer = GlobalTimer( + timer = DistributedLockGlobalTimer( self.lock_factory(), event_producer, lambda: NoopEvent(self.test_case_ns), @@ -397,7 +397,7 @@ async def _tick(context: Any, source: AgentId, event: NoopEvent) -> None: lock_path = Path(tempfile.gettempdir()) / f"{test_case_ns}.lock" request.addfinalizer(partial(lock_path.unlink, missing_ok=True)) for _ in range(10): - timer = GlobalTimer( + timer = DistributedLockGlobalTimer( FileLock(lock_path, timeout=0, debug=True), event_producer, lambda: NoopEvent(test_case_ns), diff --git a/tests/manager/test_idle_checker.py b/tests/manager/test_idle_checker.py index 2174ebd981..7f9eb214e6 100644 --- a/tests/manager/test_idle_checker.py +++ b/tests/manager/test_idle_checker.py @@ -7,7 +7,7 @@ from ai.backend.common import msgpack, redis_helper from ai.backend.common.types import KernelId, SessionId, SessionTypes -from ai.backend.manager.api.context import RootContext +from ai.backend.manager.api.context import RaftClusterContext, RootContext from ai.backend.manager.idle import ( BaseIdleChecker, IdleCheckerHost, @@ -97,6 +97,7 @@ async def new_user_grace_period_checker( [".etcd"], ) root_ctx: RootContext = test_app["_root.context"] + root_ctx.raft_ctx = RaftClusterContext() # test config grace_period = 30 @@ -116,6 +117,7 @@ async def new_user_grace_period_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -146,6 +148,7 @@ async def network_timeout_idle_checker( [".etcd"], ) root_ctx: RootContext = test_app["_root.context"] + root_ctx.raft_ctx = RaftClusterContext() # test 1 # remaining time is positive and no grace period @@ -177,6 +180,7 @@ async def network_timeout_idle_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -230,6 +234,7 @@ async def network_timeout_idle_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + 
root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -287,6 +292,7 @@ async def network_timeout_idle_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -349,6 +355,7 @@ async def network_timeout_idle_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -397,6 +404,7 @@ async def session_lifetime_checker( [".etcd"], ) root_ctx: RootContext = test_app["_root.context"] + root_ctx.raft_ctx = RaftClusterContext() # test 1 # remaining time is positive and no grace period @@ -424,6 +432,7 @@ async def session_lifetime_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -471,6 +480,7 @@ async def session_lifetime_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -523,6 +533,7 @@ async def session_lifetime_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -577,6 +588,7 @@ async def session_lifetime_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) try: @@ -620,6 +632,7 @@ async def utilization_idle_checker__utilization( [".etcd"], ) root_ctx: RootContext = test_app["_root.context"] + root_ctx.raft_ctx = RaftClusterContext() kernel_id = KernelId(uuid4()) expected = { @@ -665,6 +678,7 @@ async def utilization_idle_checker__utilization( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) await redis_helper.execute( @@ -704,6 +718,7 @@ async def utilization_idle_checker( [".etcd"], ) root_ctx: RootContext = test_app["_root.context"] + root_ctx.raft_ctx = RaftClusterContext() # test 1 # remaining time is positive and no utilization. 
@@ -764,6 +779,7 @@ async def utilization_idle_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) await redis_helper.execute( @@ -846,6 +862,7 @@ async def utilization_idle_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) await redis_helper.execute( @@ -928,6 +945,7 @@ async def utilization_idle_checker( root_ctx.shared_config, root_ctx.event_dispatcher, root_ctx.event_producer, + root_ctx.raft_ctx, root_ctx.distributed_lock_factory, ) await redis_helper.execute( diff --git a/tests/manager/test_scheduler.py b/tests/manager/test_scheduler.py index cafa6b72f3..d625c96614 100644 --- a/tests/manager/test_scheduler.py +++ b/tests/manager/test_scheduler.py @@ -26,6 +26,7 @@ SessionId, SessionTypes, ) +from ai.backend.manager.api.context import RaftClusterContext from ai.backend.manager.defs import DEFAULT_ROLE from ai.backend.manager.models.agent import AgentRow from ai.backend.manager.models.image import ImageRow @@ -1105,6 +1106,7 @@ async def test_manually_assign_agent_available( candidate_agents = example_agents example_pending_sessions[0].kernels[0].agent = example_agents[0].id sess_ctx = example_pending_sessions[0] + raft_ctx = RaftClusterContext() dispatcher = SchedulerDispatcher( local_config=mock_local_config, @@ -1112,6 +1114,7 @@ async def test_manually_assign_agent_available( event_dispatcher=mock_event_dispatcher, event_producer=mock_event_producer, lock_factory=file_lock_factory, + raft_ctx=raft_ctx, registry=registry, ) From 88284ac51cc2df4b1e04160a5b28c572a0e49056 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Thu, 8 Feb 2024 04:23:39 +0000 Subject: [PATCH 02/16] Reflect new API of raftify --- configs/manager/halfstack.toml | 10 ++-- python.lock | 18 +++++-- requirements.txt | 2 +- src/ai/backend/manager/cli/__main__.py | 9 ++-- src/ai/backend/manager/config.py | 32 +++++------ src/ai/backend/manager/server.py | 75 ++++++++------------------ src/ai/backend/manager/types.py | 7 +++ 7 files changed, 72 insertions(+), 81 deletions(-) diff --git a/configs/manager/halfstack.toml b/configs/manager/halfstack.toml index 742cfaab2d..10156c22ae 100644 --- a/configs/manager/halfstack.toml +++ b/configs/manager/halfstack.toml @@ -38,22 +38,24 @@ agent-selection-resource-priority = ["cuda", "rocm", "tpu", "cpu", "mem"] heartbeat-tick = 3 election-tick = 10 log-dir = "./logs" -log-level = "debug" -[[raft.peers]] +[[raft.myself]] host = "127.0.0.1" port = 60151 node-id = 1 +role = "voter" -[[raft.peers]] +[[raft.myself]] host = "127.0.0.1" port = 60152 node-id = 2 +role = "voter" -[[raft.peers]] +[[raft.myself]] host = "127.0.0.1" port = 60153 node-id = 3 +role = "voter" [docker-registry] ssl-verify = false diff --git a/python.lock b/python.lock index db5280c8ce..627175ba93 100644 --- a/python.lock +++ b/python.lock @@ -76,8 +76,12 @@ // "pyzmq~=25.1.2", ======= // "pyzmq~=24.0.1", +<<<<<<< HEAD // "raftify==0.1.42", >>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) +======= +// "raftify==0.1.43", +>>>>>>> 4fcd15f2 (Reflect new API of raftify) // "redis[hiredis]==4.5.5", // "rich~=13.6", // "setproctitle~=1.3.2", @@ -3383,19 +3387,19 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "ce3208bce423aa29eb7e328f1dcae653612866ccb2d5977306e833de186fce93", - "url": 
"https://files.pythonhosted.org/packages/bc/73/bde132e18ff1186d34044e53ab8fec92826f4cd2b64176b27e92594d2fc9/raftify-0.1.42-cp311-cp311-macosx_11_0_arm64.whl" + "hash": "3e55cb4eedde0b6c9a740560b6d01de7a810a7e28fba46c23f9a23fd02f146dd", + "url": "https://files.pythonhosted.org/packages/3f/0b/54a9a0068e6b0c64b7066cea9d4ba74db0ee48a54558b43618450a421e9c/raftify-0.1.43-cp311-cp311-macosx_11_0_arm64.whl" }, { "algorithm": "sha256", - "hash": "ab6e0238073b0bed73ee52edd53ee917d6c52ae0be4b62efbf4d4ee447d1b312", - "url": "https://files.pythonhosted.org/packages/69/d6/d157854fed70ac6acd350a88cedca59a27b7ee4ddb9b6c009c31a649d471/raftify-0.1.42.tar.gz" + "hash": "f3607539656460e58be755852100b57886103ea3b6ec3ca19e2bf269b23af175", + "url": "https://files.pythonhosted.org/packages/50/fa/f87a219d8592bc7519b97923957dc854e7532a48119a7084a56e9c9d673c/raftify-0.1.43.tar.gz" } ], "project_name": "raftify", "requires_dists": [], "requires_python": ">=3.10", - "version": "0.1.42" + "version": "0.1.43" }, { "artifacts": [ @@ -4665,8 +4669,12 @@ "pyzmq~=25.1.2", ======= "pyzmq~=24.0.1", +<<<<<<< HEAD "raftify==0.1.42", >>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) +======= + "raftify==0.1.43", +>>>>>>> 4fcd15f2 (Reflect new API of raftify) "redis[hiredis]==4.5.5", "rich~=13.6", "setproctitle~=1.3.2", diff --git a/requirements.txt b/requirements.txt index 496b5a3b8c..0bd34b2e9e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -95,4 +95,4 @@ backend.ai-krunner-alpine==5.1.0 backend.ai-krunner-static-gnu==4.1.1 etcd-client-py==0.2.4 -raftify==0.1.42 +raftify==0.1.43 diff --git a/src/ai/backend/manager/cli/__main__.py b/src/ai/backend/manager/cli/__main__.py index 6e3b420d85..cedcbdffd6 100644 --- a/src/ai/backend/manager/cli/__main__.py +++ b/src/ai/backend/manager/cli/__main__.py @@ -12,7 +12,7 @@ import click from more_itertools import chunked -from raftify import Peers, RaftServiceClient, cli_main +from raftify import InitialRole, Peer, Peers, RaftServiceClient, cli_main from setproctitle import setproctitle from tabulate import tabulate @@ -338,8 +338,11 @@ async def inspect_node_status(cli_ctx: CLIContext) -> None: if raft_configs is not None: initial_peers = Peers({ - int(entry["node-id"]): f"{entry['host']}:{entry['port']}" - for entry in raft_configs["peers"] + int(peer_config["node-id"]): Peer( + addr=f"{peer_config['host']}:{peer_config['port']}", + role=InitialRole.from_str(peer_config["role"]), + ) + for peer_config in raft_configs["peers"] }) peers: dict[str, Any] | None = None diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index bbe90ffa4e..d07f319145 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -217,7 +217,7 @@ SlotTypes, current_resource_slots, ) -from ai.backend.manager.types import RaftLogLovel +from ai.backend.manager.types import RaftNodeInitialRole from ..manager.defs import INTRINSIC_SLOTS from .api import ManagerStatus @@ -305,10 +305,6 @@ t.Key("raft", default=None): ( t.Dict({ # Cluster configurations - ## Cluster's Leader node id - t.Key("cluster-leader-id", default=1): t.Int, - ## This would be useful when adding new RaftNodes to an existing cluster without restarting the server. - t.Key("bootstrap-done", default=False): t.ToBool, ## Set this to the max(node_ids) when joining RaftNodes to another cluster. 
t.Key("restore-wal-from", default=None): t.Int | t.Null, t.Key("restore-wal-snapshot-from", default=None): t.Int | t.Null, @@ -319,21 +315,27 @@ t.Key("node-id"): t.Int, t.Key("host"): t.String, t.Key("port"): t.Int, + t.Key("role", default=RaftNodeInitialRole.VOTER): tx.Enum( + RaftNodeInitialRole + ), }) ), ## Other peers - t.Key("peers", default=[]): t.List( - t.Dict({ - t.Key("node-id"): t.Int, - t.Key("host"): t.String, - t.Key("port"): t.Int, - }) - ) - | t.Null, + t.Key("peers", default=[]): ( + t.List( + t.Dict({ + t.Key("node-id"): t.Int, + t.Key("host"): t.String, + t.Key("port"): t.Int, + t.Key("role", default=RaftNodeInitialRole.VOTER): tx.Enum( + RaftNodeInitialRole + ), + }) + ) + | t.Null + ), # Storage configurations t.Key("log-dir"): t.String, - # Logging configurations - t.Key("log-level", default=RaftLogLovel.INFO): tx.Enum(RaftLogLovel), # Raft core configurations # TODO: Decide proper default values for these configs. t.Key("heartbeat-tick", default=None): t.Int | t.Null, diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index a153abf5f5..5e55cf2a5b 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -32,7 +32,9 @@ import click from aiohttp import web from aiotools import process_index -from raftify import ClusterJoinTicket, Config, Peers, Raft, RaftConfig +from raftify import Config as RaftConfig +from raftify import InitialRole, Peer, Peers, Raft +from raftify import RaftConfig as RaftCoreConfig from setproctitle import setproctitle from ai.backend.common import redis_helper @@ -669,11 +671,14 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: all_peers = sorted([*other_peers, *my_peers], key=lambda x: x["node-id"]) initial_peers = Peers({ - int(peer_config["node-id"]): f"{peer_config['host']}:{peer_config['port']}" + int(peer_config["node-id"]): Peer( + addr=f"{peer_config['host']}:{peer_config['port']}", + role=InitialRole.from_str(peer_config["role"]), + ) for peer_config in all_peers }) - raft_core_config = RaftConfig( + raft_core_config = RaftCoreConfig( heartbeat_tick=raft_configs["heartbeat-tick"], election_tick=raft_configs["election-tick"], min_election_tick=raft_configs["min-election-tick"], @@ -689,77 +694,41 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: priority=raft_configs["priority"], ) - raft_cfg = Config( + raft_cfg = RaftConfig( log_dir=raft_configs["log-dir"], save_compacted_logs=True, compacted_log_dir=raft_configs["log-dir"], restore_wal_from=raft_configs["restore-wal-from"], restore_wal_snapshot_from=raft_configs["restore-wal-snapshot-from"], + initial_peers=initial_peers, raft_config=raft_core_config, ) node_id_offset = next((idx for idx, item in enumerate(all_peers) if item["myself"]), None) node_id = node_id_offset + process_index.get() + 1 - raft_addr = initial_peers.get(node_id) - leader_id = raft_configs["cluster-leader-id"] - leader_addr = initial_peers.get(leader_id) - if leader_addr is None: - raise Exception(f"Leader node {leader_id} not found in initial peers.") + raft_addr = initial_peers.get(node_id) store = HashStore() - leader_mark = "-leader" if node_id == leader_id else "" - raft_logger = RaftLogger( - logging.getLogger(f"{__spec__.name}.raft.node-{node_id}{leader_mark}"), # type: ignore + logging.getLogger(f"{__spec__.name}.raft.node-{node_id}"), # type: ignore ) - if node_id == leader_id: - root_ctx.raft_ctx.cluster = Raft.bootstrap_cluster( - node_id, - raft_addr, - store, # type: ignore - raft_cfg, - raft_logger, # 
type: ignore - initial_peers, - ) - raft_cluster = root_ctx.raft_ctx.cluster - raft_cluster.run() # type: ignore - else: - root_ctx.raft_ctx.cluster = Raft.new_follower( - node_id, - raft_addr, - store, # type: ignore - raft_cfg, - raft_logger, # type: ignore - initial_peers, - ) - raft_cluster = root_ctx.raft_ctx.cluster - raft_cluster.run() # type: ignore - - # Wait for the leader node's gRPC server ready - await asyncio.sleep(2) - - if raft_configs["bootstrap-done"]: - await raft_cluster.join( - ClusterJoinTicket( - node_id, - leader_id, - leader_addr, - initial_peers, - ) - ) - await raft_cluster.get_raft_node().set_bootstrap_done() - else: - await raft_cluster.member_bootstrap_ready(leader_addr, node_id, raft_logger) # type: ignore + root_ctx.raft_ctx.cluster = Raft.bootstrap( + node_id, + raft_addr, + store, # type: ignore + raft_cfg, + raft_logger, # type: ignore + ) + raft_cluster = root_ctx.raft_ctx.cluster + raft_cluster.run() # type: ignore - # Only for testing + # Webserver only for raft testing asyncio.create_task( WebServer(f"127.0.0.1:6025{node_id}", {"raft": raft_cluster, "store": store}).run() ) - - # assert root_ctx.raft_ctx.cluster.raft_node is not None, "RaftNode not initialized properly!" yield diff --git a/src/ai/backend/manager/types.py b/src/ai/backend/manager/types.py index 5f0a4c342f..b699c97c5c 100644 --- a/src/ai/backend/manager/types.py +++ b/src/ai/backend/manager/types.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Protocol import attr +from raftify import InitialRole from sqlalchemy.ext.asyncio import AsyncSession as SASession if TYPE_CHECKING: @@ -43,6 +44,12 @@ class DistributedLockFactory(Protocol): def __call__(self, lock_id: LockID, lifetime_hint: float) -> AbstractDistributedLock: ... +class RaftNodeInitialRole(str, enum.Enum): + LEADER = InitialRole.LEADER + VOTER = InitialRole.VOTER + LEARNER = InitialRole.LEARNER + + class RaftLogLovel(str, enum.Enum): TRACE = "trace" DEBUG = "debug" From 77994b0ffd0d44d98b8579f2386a95867ff458e7 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Thu, 8 Feb 2024 06:32:12 +0000 Subject: [PATCH 03/16] Separate the volatile raft cluster config into a distinct configuration file. 
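
The [raft] section in manager.toml now keeps only node-local, mostly static
settings (log directory and Raft core tuning knobs), while the volatile
cluster topology (peer list and WAL-restore options) moves into a separate
raft-cluster-config.toml. That file can be edited and redistributed without
touching the main manager config, is looked up via the new
`--raft-cluster-config-path` CLI option, and is validated with
`manager_raft_cluster_config_iv`.

A minimal sketch of how the new loader is meant to be used, based on the
helpers added in this patch (the file path and the `local_config` variable
below are illustrative placeholders, not values mandated by this diff):

    from pathlib import Path

    from ai.backend.manager.config import load_raft_cluster_config

    # `local_config` stands for the already-parsed manager.toml (LocalConfig);
    # it is assumed to exist in the surrounding code and is not defined here.
    raft_cluster_cfg = load_raft_cluster_config(
        debug_enabled=False,
        raft_cluster_config_path=Path("./raft-cluster-config.toml"),
    )
    # The loader returns None when the file cannot be read, which is only
    # acceptable while the [raft] section is disabled as well.
    if local_config.get("raft") is not None and raft_cluster_cfg is None:
        raise FileNotFoundError(
            "Raft configurations enabled but Raft cluster configuration file not found!"
        )
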
--- configs/manager/halfstack.toml | 18 ---- configs/manager/raft-cluster-config.toml | 20 ++++ src/ai/backend/manager/api/context.py | 1 + src/ai/backend/manager/config.py | 123 +++++++++++++---------- src/ai/backend/manager/server.py | 45 +++++++-- src/ai/backend/manager/types.py | 7 +- 6 files changed, 129 insertions(+), 85 deletions(-) create mode 100644 configs/manager/raft-cluster-config.toml diff --git a/configs/manager/halfstack.toml b/configs/manager/halfstack.toml index 10156c22ae..c068991621 100644 --- a/configs/manager/halfstack.toml +++ b/configs/manager/halfstack.toml @@ -39,24 +39,6 @@ heartbeat-tick = 3 election-tick = 10 log-dir = "./logs" -[[raft.myself]] -host = "127.0.0.1" -port = 60151 -node-id = 1 -role = "voter" - -[[raft.myself]] -host = "127.0.0.1" -port = 60152 -node-id = 2 -role = "voter" - -[[raft.myself]] -host = "127.0.0.1" -port = 60153 -node-id = 3 -role = "voter" - [docker-registry] ssl-verify = false diff --git a/configs/manager/raft-cluster-config.toml b/configs/manager/raft-cluster-config.toml new file mode 100644 index 0000000000..63cdecf860 --- /dev/null +++ b/configs/manager/raft-cluster-config.toml @@ -0,0 +1,20 @@ +#restore-wal-from = 1 +#restore-wal-snapshot-from = 1 + +[[peers.myself]] +host = "192.168.0.37" +port = 60151 +node-id = 1 +role = "voter" + +[[peers.myself]] +host = "192.168.0.37" +port = 60152 +node-id = 2 +role = "voter" + +[[peers.myself]] +host = "192.168.0.37" +port = 60153 +node-id = 3 +role = "voter" diff --git a/src/ai/backend/manager/api/context.py b/src/ai/backend/manager/api/context.py index 184162e334..1ace65ffbc 100644 --- a/src/ai/backend/manager/api/context.py +++ b/src/ai/backend/manager/api/context.py @@ -60,6 +60,7 @@ class RootContext(BaseContext): redis_lock: RedisConnectionInfo shared_config: SharedConfig local_config: LocalConfig + raft_cluster_config: Optional[LocalConfig] cors_options: CORSOptions webapp_plugin_ctx: WebappPluginContext diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index d07f319145..44759cca0d 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -302,63 +302,57 @@ t.Key("log-scheduler-ticks", default=False): t.ToBool, t.Key("periodic-sync-stats", default=False): t.ToBool, }).allow_extra("*"), - t.Key("raft", default=None): ( - t.Dict({ - # Cluster configurations - ## Set this to the max(node_ids) when joining RaftNodes to another cluster. - t.Key("restore-wal-from", default=None): t.Int | t.Null, - t.Key("restore-wal-snapshot-from", default=None): t.Int | t.Null, - # Initial peers - ## my peers - t.Key("myself"): t.List( - t.Dict({ - t.Key("node-id"): t.Int, - t.Key("host"): t.String, - t.Key("port"): t.Int, - t.Key("role", default=RaftNodeInitialRole.VOTER): tx.Enum( - RaftNodeInitialRole - ), - }) - ), - ## Other peers - t.Key("peers", default=[]): ( - t.List( - t.Dict({ - t.Key("node-id"): t.Int, - t.Key("host"): t.String, - t.Key("port"): t.Int, - t.Key("role", default=RaftNodeInitialRole.VOTER): tx.Enum( - RaftNodeInitialRole - ), - }) - ) - | t.Null - ), - # Storage configurations - t.Key("log-dir"): t.String, - # Raft core configurations - # TODO: Decide proper default values for these configs. 
- t.Key("heartbeat-tick", default=None): t.Int | t.Null, - t.Key("election-tick", default=None): t.Int | t.Null, - t.Key("min-election-tick", default=None): t.Int | t.Null, - t.Key("max-election-tick", default=None): t.Int | t.Null, - t.Key("max-committed-size-per-ready", default=None): t.Int | t.Null, - t.Key("max-size-per-msg", default=None): t.Int | t.Null, - t.Key("max-inflight-msgs", default=None): t.Int | t.Null, - t.Key("check-quorum", default=None): t.ToBool | t.Null, - t.Key("batch-append", default=None): t.ToBool | t.Null, - t.Key("max-uncommitted-size", default=None): t.Int | t.Null, - t.Key("skip-bcast-commit", default=None): t.ToBool | t.Null, - t.Key("pre-vote", default=None): t.ToBool | t.Null, - t.Key("priority", default=None): t.Int | t.Null, - }).allow_extra("*") - | t.Null - ), + t.Key("raft", default=None): t.Null + | t.Dict({ + # Storage configurations + t.Key("log-dir"): t.String, + # Raft core configurations + # TODO: Decide proper default values for these configs. + t.Key("heartbeat-tick", default=None): t.Int | t.Null, + t.Key("election-tick", default=None): t.Int | t.Null, + t.Key("min-election-tick", default=None): t.Int | t.Null, + t.Key("max-election-tick", default=None): t.Int | t.Null, + t.Key("max-committed-size-per-ready", default=None): t.Int | t.Null, + t.Key("max-size-per-msg", default=None): t.Int | t.Null, + t.Key("max-inflight-msgs", default=None): t.Int | t.Null, + t.Key("check-quorum", default=None): t.ToBool | t.Null, + t.Key("batch-append", default=None): t.ToBool | t.Null, + t.Key("max-uncommitted-size", default=None): t.Int | t.Null, + t.Key("skip-bcast-commit", default=None): t.ToBool | t.Null, + t.Key("pre-vote", default=None): t.ToBool | t.Null, + t.Key("priority", default=None): t.Int | t.Null, + }).allow_extra("*"), }) .merge(config.etcd_config_iv) .allow_extra("*") ) +manager_raft_cluster_config_iv = t.Dict({ + t.Key("restore-wal-from", default=None): t.Int | t.Null, + t.Key("restore-wal-snapshot-from", default=None): t.Int | t.Null, + t.Key("peers"): t.Dict({ + t.Key("myself"): t.List( + t.Dict({ + t.Key("node-id"): t.Int, + t.Key("host"): t.String, + t.Key("port"): t.Int, + t.Key("role", default=RaftNodeInitialRole.VOTER): tx.Enum(RaftNodeInitialRole), + }) + ), + t.Key("other", default=[]): ( + t.List( + t.Dict({ + t.Key("node-id"): t.Int, + t.Key("host"): t.String, + t.Key("port"): t.Int, + t.Key("role", default=RaftNodeInitialRole.VOTER): tx.Enum(RaftNodeInitialRole), + }) + ) + | t.Null + ), + }), +}).allow_extra("*") + _config_defaults: Mapping[str, Any] = { "system": { "timezone": "UTC", @@ -619,6 +613,31 @@ def load( return LocalConfig(cfg) +def load_raft_cluster_config( + debug_enabled: bool = False, + raft_cluster_config_path: Optional[Path] = None, +) -> Optional[LocalConfig]: + try: + raw_cfg, _ = config.read_from_file(raft_cluster_config_path, "raft-cluster-config") + except config.ConfigurationError: + return None + + try: + cfg = config.check(raw_cfg, manager_raft_cluster_config_iv) + if debug_enabled: + print("== Raft cluster configuration ==", file=sys.stderr) + print(pformat(cfg), file=sys.stderr) + except config.ConfigurationError as e: + print( + "ConfigurationError: Could not read or validate the raft cluster config:", + file=sys.stderr, + ) + print(pformat(e.invalid_data), file=sys.stderr) + raise click.Abort() + else: + return LocalConfig(cfg) + + class SharedConfig(AbstractConfig): ETCD_CONTAINER_REGISTRY_KEY: Final = "config/docker/registry" diff --git a/src/ai/backend/manager/server.py 
b/src/ai/backend/manager/server.py
index 5e55cf2a5b..48cc6851c4 100644
--- a/src/ai/backend/manager/server.py
+++ b/src/ai/backend/manager/server.py
@@ -22,6 +22,7 @@
     List,
     Mapping,
     MutableMapping,
+    Optional,
     Sequence,
     cast,
 )
@@ -76,7 +77,7 @@
     WebMiddleware,
     WebRequestHandler,
 )
-from .config import LocalConfig, SharedConfig, volume_config_iv
+from .config import LocalConfig, SharedConfig, load_raft_cluster_config, volume_config_iv
 from .config import load as load_config
 from .exceptions import InvalidArgument
 from .models import SessionRow
@@ -662,14 +663,20 @@ async def _force_terminate_hanging_sessions(
 @actxmgr
 async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]:
     register_custom_deserializer()
-    local_config = root_ctx.local_config
-    raft_configs = local_config.get("raft")
+    raft_configs = root_ctx.local_config.get("raft")
+    raft_cluster_configs = root_ctx.raft_cluster_config
 
     if raft_configs is not None:
-        other_peers = [{**peer, "myself": False} for peer in raft_configs["peers"]]
-        my_peers = [{**peer, "myself": True} for peer in raft_configs["myself"]]
+        assert raft_cluster_configs is not None
+
+        other_peers = [{**peer, "myself": False} for peer in raft_cluster_configs["peers"]["other"]]
+        my_peers = [{**peer, "myself": True} for peer in raft_cluster_configs["peers"]["myself"]]
         all_peers = sorted([*other_peers, *my_peers], key=lambda x: x["node-id"])
 
+        assert (
+            root_ctx.local_config["manager"]["num-proc"] <= len(my_peers)
+        ), "The number of raft peers (myself) should be greater than or equal to the number of processes"
+
         initial_peers = Peers({
             int(peer_config["node-id"]): Peer(
                 addr=f"{peer_config['host']}:{peer_config['port']}",
@@ -698,8 +705,8 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]:
             log_dir=raft_configs["log-dir"],
             save_compacted_logs=True,
             compacted_log_dir=raft_configs["log-dir"],
-            restore_wal_from=raft_configs["restore-wal-from"],
-            restore_wal_snapshot_from=raft_configs["restore-wal-snapshot-from"],
+            restore_wal_from=raft_cluster_configs["restore-wal-from"],
+            restore_wal_snapshot_from=raft_cluster_configs["restore-wal-snapshot-from"],
             initial_peers=initial_peers,
             raft_config=raft_core_config,
         )
@@ -835,6 +842,7 @@ def init_lock_factory(root_ctx: RootContext) -> DistributedLockFactory:
 def build_root_app(
     pidx: int,
     local_config: LocalConfig,
+    raft_cluster_config: Optional[LocalConfig] = None,
     *,
     cleanup_contexts: Sequence[CleanupContext] = None,
     subapp_pkgs: Sequence[str] = None,
@@ -853,6 +861,13 @@ def build_root_app(
     loop.set_exception_handler(global_exception_handler)
     app["_root.context"] = root_ctx
     root_ctx.local_config = local_config
+    root_ctx.raft_cluster_config = raft_cluster_config
+
+    if local_config.get("raft") is not None and raft_cluster_config is None:
+        raise FileNotFoundError(
+            "Raft configurations enabled but Raft cluster configuration file not found!"
+        )
+
     root_ctx.pidx = pidx
     root_ctx.cors_options = {
         "*": aiohttp_cors.ResourceOptions(
@@ -938,9 +953,8 @@ async def server_main(
     pidx: int,
     _args: List[Any],
 ) -> AsyncIterator[None]:
-    root_app = build_root_app(pidx, _args[0], subapp_pkgs=global_subapp_pkgs)
+    root_app = build_root_app(pidx, _args[0], _args[1], subapp_pkgs=global_subapp_pkgs)
     root_ctx: RootContext = root_app["_root.context"]
-    root_ctx.raft_ctx = RaftClusterContext()
 
     # Start aiomonitor.
@@ -1021,7 +1035,7 @@ async def server_main_logwrapper( ) -> AsyncIterator[None]: setproctitle(f"backend.ai: manager worker-{pidx}") - log_endpoint = _args[1] + log_endpoint = _args[2] logger = Logger(_args[0]["logging"], is_master=False, log_endpoint=log_endpoint) try: @@ -1041,6 +1055,13 @@ async def server_main_logwrapper( default=None, help="The config file path. (default: ./manager.toml and /etc/backend.ai/manager.toml)", ) +@click.option( + "--raft-cluster-config-path", + "--raft-cluster-config", + type=Path, + default=None, + help="The raft cluster config file path. (default: ./raft-cluster-config.toml and /etc/backend.ai/raft-cluster-config.toml)", +) @click.option( "--debug", is_flag=True, @@ -1057,12 +1078,14 @@ def main( ctx: click.Context, config_path: Path, log_level: LogSeverity, + raft_cluster_config_path: Path, debug: bool = False, ) -> None: """ Start the manager service as a foreground process. """ cfg = load_config(config_path, LogSeverity.DEBUG if debug else log_level) + raft_cluster_cfg = load_raft_cluster_config(debug, raft_cluster_config_path) if ctx.invoked_subcommand is None: cfg["manager"]["pid-file"].write_text(str(os.getpid())) @@ -1087,7 +1110,7 @@ def main( aiotools.start_server( server_main_logwrapper, num_workers=cfg["manager"]["num-proc"], - args=(cfg, log_endpoint), + args=(cfg, raft_cluster_cfg, log_endpoint), wait_timeout=5.0, ) finally: diff --git a/src/ai/backend/manager/types.py b/src/ai/backend/manager/types.py index b699c97c5c..b024a81a08 100644 --- a/src/ai/backend/manager/types.py +++ b/src/ai/backend/manager/types.py @@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Protocol import attr -from raftify import InitialRole from sqlalchemy.ext.asyncio import AsyncSession as SASession if TYPE_CHECKING: @@ -45,9 +44,9 @@ def __call__(self, lock_id: LockID, lifetime_hint: float) -> AbstractDistributed class RaftNodeInitialRole(str, enum.Enum): - LEADER = InitialRole.LEADER - VOTER = InitialRole.VOTER - LEARNER = InitialRole.LEARNER + LEADER = "leader" + VOTER = "voter" + LEARNER = "learner" class RaftLogLovel(str, enum.Enum): From cccc873002e15a3a7076b143f3ff615e45d3793e Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Tue, 13 Feb 2024 08:35:21 +0000 Subject: [PATCH 04/16] Update raftify --- .gitignore | 3 +++ python.lock | 18 +++++++++++++----- requirements.txt | 2 +- src/ai/backend/manager/raft/logger.py | 1 + 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index ac740cd825..6fbad845b2 100644 --- a/.gitignore +++ b/.gitignore @@ -142,3 +142,6 @@ docs/manager/rest-reference/openapi.json /DIST-INFO /INSTALL-INFO + +# Raft cluster config +raft-cluster-config.toml \ No newline at end of file diff --git a/python.lock b/python.lock index 627175ba93..ae8f8a4cd8 100644 --- a/python.lock +++ b/python.lock @@ -77,11 +77,15 @@ ======= // "pyzmq~=24.0.1", <<<<<<< HEAD +<<<<<<< HEAD // "raftify==0.1.42", >>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) ======= // "raftify==0.1.43", >>>>>>> 4fcd15f2 (Reflect new API of raftify) +======= +// "raftify==0.1.45", +>>>>>>> fa4423ad (Update raftify) // "redis[hiredis]==4.5.5", // "rich~=13.6", // "setproctitle~=1.3.2", @@ -3387,19 +3391,19 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "3e55cb4eedde0b6c9a740560b6d01de7a810a7e28fba46c23f9a23fd02f146dd", - "url": "https://files.pythonhosted.org/packages/3f/0b/54a9a0068e6b0c64b7066cea9d4ba74db0ee48a54558b43618450a421e9c/raftify-0.1.43-cp311-cp311-macosx_11_0_arm64.whl" + "hash": 
"4bc907d1258d738476f49d0a3f81d98a5f5b77c86bdfadcc4fa85cbf888558c8", + "url": "https://files.pythonhosted.org/packages/73/ea/5d39b23a88a90b804f6441cc6e04bca71b2054e46126ada75bfc0342d9c3/raftify-0.1.45-cp311-cp311-macosx_11_0_arm64.whl" }, { "algorithm": "sha256", - "hash": "f3607539656460e58be755852100b57886103ea3b6ec3ca19e2bf269b23af175", - "url": "https://files.pythonhosted.org/packages/50/fa/f87a219d8592bc7519b97923957dc854e7532a48119a7084a56e9c9d673c/raftify-0.1.43.tar.gz" + "hash": "1d845200960b43962ffe9f9c3e6d4668982f5980680fd492cffe3beef5ea4f2e", + "url": "https://files.pythonhosted.org/packages/d7/6c/3c8508b92b4eeb0c0829a4285bd5af682489b424496f633454e2a64b2dab/raftify-0.1.45.tar.gz" } ], "project_name": "raftify", "requires_dists": [], "requires_python": ">=3.10", - "version": "0.1.43" + "version": "0.1.45" }, { "artifacts": [ @@ -4669,12 +4673,16 @@ "pyzmq~=25.1.2", ======= "pyzmq~=24.0.1", +<<<<<<< HEAD <<<<<<< HEAD "raftify==0.1.42", >>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) ======= "raftify==0.1.43", >>>>>>> 4fcd15f2 (Reflect new API of raftify) +======= + "raftify==0.1.45", +>>>>>>> fa4423ad (Update raftify) "redis[hiredis]==4.5.5", "rich~=13.6", "setproctitle~=1.3.2", diff --git a/requirements.txt b/requirements.txt index 0bd34b2e9e..99500b954f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -95,4 +95,4 @@ backend.ai-krunner-alpine==5.1.0 backend.ai-krunner-static-gnu==4.1.1 etcd-client-py==0.2.4 -raftify==0.1.43 +raftify==0.1.45 diff --git a/src/ai/backend/manager/raft/logger.py b/src/ai/backend/manager/raft/logger.py index 0ba3ccf57f..419e4b0dda 100644 --- a/src/ai/backend/manager/raft/logger.py +++ b/src/ai/backend/manager/raft/logger.py @@ -22,3 +22,4 @@ def error(self, message): def fatal(self, message): self.logger.error(message) + assert False, "Fatal error occurred: " + message From c46340247060f4ed64d7d7da78f5f0344852eca8 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:55:26 +0000 Subject: [PATCH 05/16] Merge --- python.lock | 19 +++++++++------- requirements.txt | 2 +- src/ai/backend/manager/cli/__main__.py | 6 ++--- src/ai/backend/manager/config.py | 1 + src/ai/backend/manager/raft/utils.py | 7 ++++++ src/ai/backend/manager/server.py | 31 ++++++++++++++++++++++++-- 6 files changed, 52 insertions(+), 14 deletions(-) diff --git a/python.lock b/python.lock index ae8f8a4cd8..735de246c0 100644 --- a/python.lock +++ b/python.lock @@ -78,6 +78,7 @@ // "pyzmq~=24.0.1", <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD // "raftify==0.1.42", >>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) ======= @@ -86,6 +87,9 @@ ======= // "raftify==0.1.45", >>>>>>> fa4423ad (Update raftify) +======= +// "raftify==0.1.56", +>>>>>>> d23bbae6 (Implement multiple nodes joining after bootstrap done) // "redis[hiredis]==4.5.5", // "rich~=13.6", // "setproctitle~=1.3.2", @@ -3391,19 +3395,14 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "4bc907d1258d738476f49d0a3f81d98a5f5b77c86bdfadcc4fa85cbf888558c8", - "url": "https://files.pythonhosted.org/packages/73/ea/5d39b23a88a90b804f6441cc6e04bca71b2054e46126ada75bfc0342d9c3/raftify-0.1.45-cp311-cp311-macosx_11_0_arm64.whl" - }, - { - "algorithm": "sha256", - "hash": "1d845200960b43962ffe9f9c3e6d4668982f5980680fd492cffe3beef5ea4f2e", - "url": "https://files.pythonhosted.org/packages/d7/6c/3c8508b92b4eeb0c0829a4285bd5af682489b424496f633454e2a64b2dab/raftify-0.1.45.tar.gz" + "hash": "03a691dcdc67d835d888ab4a32bfd656b66101e9a6e5ab89743138741c0da552", + "url": 
"https://files.pythonhosted.org/packages/d7/64/eaa29217787fed5b30351225d4111cacfe05224b1180cdc0a9155c7a109f/raftify-0.1.56.tar.gz" } ], "project_name": "raftify", "requires_dists": [], "requires_python": ">=3.10", - "version": "0.1.45" + "version": "0.1.56" }, { "artifacts": [ @@ -4674,6 +4673,7 @@ ======= "pyzmq~=24.0.1", <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "raftify==0.1.42", >>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) @@ -4683,6 +4683,9 @@ ======= "raftify==0.1.45", >>>>>>> fa4423ad (Update raftify) +======= + "raftify==0.1.56", +>>>>>>> d23bbae6 (Implement multiple nodes joining after bootstrap done) "redis[hiredis]==4.5.5", "rich~=13.6", "setproctitle~=1.3.2", diff --git a/requirements.txt b/requirements.txt index 99500b954f..30c4336dca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -95,4 +95,4 @@ backend.ai-krunner-alpine==5.1.0 backend.ai-krunner-static-gnu==4.1.1 etcd-client-py==0.2.4 -raftify==0.1.45 +raftify==0.1.56 diff --git a/src/ai/backend/manager/cli/__main__.py b/src/ai/backend/manager/cli/__main__.py index cedcbdffd6..4c2ffe1bd7 100644 --- a/src/ai/backend/manager/cli/__main__.py +++ b/src/ai/backend/manager/cli/__main__.py @@ -346,13 +346,13 @@ async def inspect_node_status(cli_ctx: CLIContext) -> None: }) peers: dict[str, Any] | None = None - for _, peer_addr in initial_peers.items(): - raft_client = await RaftServiceClient.build(peer_addr) + for intial_peer in initial_peers.to_dict().values(): + raft_client = await RaftServiceClient.build(intial_peer.get_addr()) try: resp = await raft_client.get_peers() peers = json.loads(resp) except Exception as e: - print(f"Failed to getting peers from {peer_addr}: {e}") + print(f"Failed to getting peers from {intial_peer.get_addr()}: {e}") continue if peers is None: diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index 44759cca0d..61fe6b20e5 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -328,6 +328,7 @@ ) manager_raft_cluster_config_iv = t.Dict({ + t.Key("bootstrap-done", default=False): t.ToBool, t.Key("restore-wal-from", default=None): t.Int | t.Null, t.Key("restore-wal-snapshot-from", default=None): t.Int | t.Null, t.Key("peers"): t.Dict({ diff --git a/src/ai/backend/manager/raft/utils.py b/src/ai/backend/manager/raft/utils.py index ad0509185d..7a0ac5c640 100644 --- a/src/ai/backend/manager/raft/utils.py +++ b/src/ai/backend/manager/raft/utils.py @@ -44,6 +44,13 @@ async def size(request: web.Request) -> web.Response: return web.Response(text=size) +@routes.get("/leave_joint") +async def leave_joint(request: web.Request) -> web.Response: + raft: Raft = request.app["state"]["raft"] + await raft.get_raft_node().leave_joint() + return web.Response(text="OK") + + @routes.get("/put/{id}/{value}") async def put(request: web.Request) -> web.Response: raft: Raft = request.app["state"]["raft"] diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 48cc6851c4..3c6b8e1136 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -33,8 +33,8 @@ import click from aiohttp import web from aiotools import process_index +from raftify import ClusterJoinTicket, InitialRole, Peer, Peers, Raft, RaftServiceClient from raftify import Config as RaftConfig -from raftify import InitialRole, Peer, Peers, Raft from raftify import RaftConfig as RaftCoreConfig from setproctitle import setproctitle @@ -714,7 +714,7 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: node_id_offset = 
next((idx for idx, item in enumerate(all_peers) if item["myself"]), None) node_id = node_id_offset + process_index.get() + 1 - raft_addr = initial_peers.get(node_id) + raft_addr = initial_peers.get(node_id).get_addr() store = HashStore() @@ -732,6 +732,33 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: raft_cluster = root_ctx.raft_ctx.cluster raft_cluster.run() # type: ignore + if raft_cluster_configs["bootstrap-done"]: + # First follower manager execute join procedure + if node_id - node_id_offset == 1: + # TODO: Find leader_id by asking for leader_id to someone in initial_peers + leader_id, leader_addr = [ + (id_, peer.get_addr()) + for id_, peer in initial_peers.to_dict().items() + if peer.get_role() == InitialRole.LEADER + ][0] + + all_tickets = [ + ClusterJoinTicket( + peer["node-id"], + f"{peer['host']}:{peer['port']}", + leader_id, + leader_addr, + initial_peers, + ) + for peer in all_peers + ] + + await root_ctx.raft_ctx.cluster.join(all_tickets) + # TODO: Find a way to automatically close the leave_joint if possible + await asyncio.sleep(2) + client = await RaftServiceClient.build(leader_addr) + await client.leave_joint() + # Webserver only for raft testing asyncio.create_task( WebServer(f"127.0.0.1:6025{node_id}", {"raft": raft_cluster, "store": store}).run() From 2a41eee9a73d2345a4d53a7d81e6a5b8b9543814 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 26 Feb 2024 06:57:37 +0000 Subject: [PATCH 06/16] Insert assert statement --- src/ai/backend/manager/server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 3c6b8e1136..07b5c551bc 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -712,6 +712,7 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: ) node_id_offset = next((idx for idx, item in enumerate(all_peers) if item["myself"]), None) + assert node_id_offset is not None, '"peers.myself" not found in initial_peers!' 
node_id = node_id_offset + process_index.get() + 1 raft_addr = initial_peers.get(node_id).get_addr() From 867363ebbeb5cf3815d2060bea53c6da3166f4c8 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 26 Feb 2024 07:53:31 +0000 Subject: [PATCH 07/16] Create `raft-debug-webserver-enabled` flag --- src/ai/backend/manager/config.py | 1 + src/ai/backend/manager/server.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index 61fe6b20e5..1bb2c730d1 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -329,6 +329,7 @@ manager_raft_cluster_config_iv = t.Dict({ t.Key("bootstrap-done", default=False): t.ToBool, + t.Key("raft-debug-webserver-enabled", default=False): t.ToBool, t.Key("restore-wal-from", default=None): t.Int | t.Null, t.Key("restore-wal-snapshot-from", default=None): t.Int | t.Null, t.Key("peers"): t.Dict({ diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 07b5c551bc..7093fefc49 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -760,10 +760,12 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: client = await RaftServiceClient.build(leader_addr) await client.leave_joint() - # Webserver only for raft testing - asyncio.create_task( - WebServer(f"127.0.0.1:6025{node_id}", {"raft": raft_cluster, "store": store}).run() - ) + if raft_cluster_configs["raft-debug-webserver-enabled"]: + # Create webserver only for raft testing + asyncio.create_task( + WebServer(f"127.0.0.1:6025{node_id}", {"raft": raft_cluster, "store": store}).run() + ) + yield From 7fcc1c1b4f5adcdb961621ae7c8a07d82250178a Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:55:50 +0000 Subject: [PATCH 08/16] Merge --- python.lock | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/python.lock b/python.lock index 735de246c0..489bec97f6 100644 --- a/python.lock +++ b/python.lock @@ -953,6 +953,7 @@ "artifacts": [ { "algorithm": "sha256", +<<<<<<< HEAD "hash": "992e994c7e481a5d3259c699574882b79d631a46f7c369bea350b7ccb0651317", "url": "https://files.pythonhosted.org/packages/60/15/9aac35742c47578eb9a1aee4519fecebba5faec51eeb09f093eca9938567/boto3-1.34.61-py3-none-any.whl" }, @@ -960,22 +961,40 @@ "algorithm": "sha256", "hash": "4b40bf2c8494647c9e88c180537dc9fc0c1047a9fffbb1e5b0da6596f1e59b7b", "url": "https://files.pythonhosted.org/packages/0b/c1/5f220936875b62ba4dfc04eb24ab8d2e5e925e9baea3e174632739f9fa3a/boto3-1.34.61.tar.gz" +======= + "hash": "340c73f57fcca6f503403e2e13a0a4ad44bec218feee2e0896be612324394afd", + "url": "https://files.pythonhosted.org/packages/fe/da/2ac73f9db09018003dcc6e3cfad1029b08893de527b5af8e9032797090e9/boto3-1.34.53-py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "cd30261a782824ce543a628ae524480abb4ca6ab4e4a2631477e48baed43b5f2", + "url": "https://files.pythonhosted.org/packages/99/a7/4bde4b0ef2941cfad22a3527cf1fdace545d36cf5b7132613e0cb1e6a035/boto3-1.34.53.tar.gz" +>>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "boto3", "requires_dists": [ +<<<<<<< HEAD "botocore<1.35.0,>=1.34.61", +======= + "botocore<1.35.0,>=1.34.53", +>>>>>>> 6cebbda4 (Merge with main) "botocore[crt]<2.0a0,>=1.21.0; extra == \"crt\"", "jmespath<2.0.0,>=0.7.1", "s3transfer<0.11.0,>=0.10.0" ], "requires_python": ">=3.8", +<<<<<<< HEAD "version": "1.34.61" +======= + "version": "1.34.53" +>>>>>>> 6cebbda4 (Merge with main) 
}, { "artifacts": [ { "algorithm": "sha256", +<<<<<<< HEAD "hash": "079f3288d38f97fd5656c25c44a94bea0e7090b938abfdeea463eaadb210c4a0", "url": "https://files.pythonhosted.org/packages/1d/4f/79dc11e13879bec49b577319a073f346ff340b0d00ebbd669ace4325bbbd/botocore-1.34.61-py3-none-any.whl" }, @@ -983,6 +1002,15 @@ "algorithm": "sha256", "hash": "72df4af7e4e6392552c882d48c74e4be9bf7be4cd8d829711b312fbae13d7034", "url": "https://files.pythonhosted.org/packages/11/3b/71b9b5cba2f8c90f40500c2827ce2b0229dd3c6b86a49065dc40b8e4a059/botocore-1.34.61.tar.gz" +======= + "hash": "cbbcaddc35738d32df55d26ed5561cf3fa32751a6b22e7e342be87b5e3f55eec", + "url": "https://files.pythonhosted.org/packages/a7/1c/ad78551dcf5fd347a4597885e0f02961311c5abb51f43b9faa4633caac5c/botocore-1.34.53-py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "3d243781e994dfc5b20036d9fb92672bfaef4dbe388eaa79dae6440ea56c53eb", + "url": "https://files.pythonhosted.org/packages/1b/3c/ef2bbbc3d569aa46805ceacb4582b5501415eddbdbb1fa9eeea013e4a044/botocore-1.34.53.tar.gz" +>>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "botocore", @@ -994,7 +1022,11 @@ "urllib3<2.1,>=1.25.4; python_version >= \"3.10\"" ], "requires_python": ">=3.8", +<<<<<<< HEAD "version": "1.34.61" +======= + "version": "1.34.53" +>>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -2361,6 +2393,7 @@ "artifacts": [ { "algorithm": "sha256", +<<<<<<< HEAD "hash": "f085493f79efb0644f270a9bf2892843142d80d7174bbbd2f3713f2a589dc633", "url": "https://files.pythonhosted.org/packages/38/04/37055b7013dfaaf66e3a9a51e46857cc9be151476a891b995fa70da7e139/marshmallow-3.21.1-py3-none-any.whl" }, @@ -2368,6 +2401,15 @@ "algorithm": "sha256", "hash": "4e65e9e0d80fc9e609574b9983cf32579f305c718afb30d7233ab818571768c3", "url": "https://files.pythonhosted.org/packages/5b/17/1b117d1875d8287a85cc2d5e2effd3f31bd8afd9f142c7b8391b9d665f0c/marshmallow-3.21.1.tar.gz" +======= + "hash": "e7997f83571c7fd476042c2c188e4ee8a78900ca5e74bd9c8097afa56624e9bd", + "url": "https://files.pythonhosted.org/packages/f5/97/6e4ddd6713bba5ede1d18f3959d7bffde38e56f7f7ae7c031c9a3d746b95/marshmallow-3.21.0-py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "20f53be28c6e374a711a16165fb22a8dc6003e3f7cda1285e3ca777b9193885b", + "url": "https://files.pythonhosted.org/packages/86/14/0dec31a81b16d39b6cfcb5ddd7e560d46dc5ea0d1d1bca0bb275a679071f/marshmallow-3.21.0.tar.gz" +>>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "marshmallow", @@ -2386,7 +2428,11 @@ "tox; extra == \"dev\"" ], "requires_python": ">=3.8", +<<<<<<< HEAD "version": "3.21.1" +======= + "version": "3.21.0" +>>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -3167,6 +3213,7 @@ "artifacts": [ { "algorithm": "sha256", +<<<<<<< HEAD "hash": "2a8386cfc11fa9d2c50ee7b2a57e7d898ef90470a7a34c4b949ff59662bb78b7", "url": "https://files.pythonhosted.org/packages/4d/7e/c79cecfdb6aa85c6c2e3cf63afc56d0f165f24f5c66c03c695c4d9b84756/pytest-8.1.1-py3-none-any.whl" }, @@ -3174,6 +3221,15 @@ "algorithm": "sha256", "hash": "ac978141a75948948817d360297b7aae0fcb9d6ff6bc9ec6d514b85d5a65c044", "url": "https://files.pythonhosted.org/packages/30/b7/7d44bbc04c531dcc753056920e0988032e5871ac674b5a84cb979de6e7af/pytest-8.1.1.tar.gz" +======= + "hash": "edfaaef32ce5172d5466b5127b42e0d6d35ebbe4453f0e3505d96afd93f6b096", + "url": "https://files.pythonhosted.org/packages/a7/ea/d0ab9595a0d4b2320483e634123171deaf50885e29d442180efcbf2ed0b2/pytest-8.0.2-py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": 
"d4051d623a2e0b7e51960ba963193b09ce6daeb9759a451844a21e4ddedfc1bd", + "url": "https://files.pythonhosted.org/packages/3f/c0/238f25cb27495fdbaa5c48cef9886162e9df1f3d0e957fc8326d9c24fa2f/pytest-8.0.2.tar.gz" +>>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "pytest", @@ -3194,7 +3250,11 @@ "xmlschema; extra == \"testing\"" ], "requires_python": ">=3.8", +<<<<<<< HEAD "version": "8.1.1" +======= + "version": "8.0.2" +>>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -3216,6 +3276,7 @@ "artifacts": [ { "algorithm": "sha256", +<<<<<<< HEAD "hash": "a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", "url": "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl" }, @@ -3223,6 +3284,15 @@ "algorithm": "sha256", "hash": "37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", "url": "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz" +======= + "hash": "cbf2f1da5e6083ac2fbfd4da39a25f34312230110440f424a14c7558bb85d82e", + "url": "https://files.pythonhosted.org/packages/13/7f/98d6f9ca8b731506c85785bbb8806c01f5966a4df6d68c0d1cf3b16967e1/python_dateutil-2.9.0-py2.py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "78e73e19c63f5b20ffa567001531680d939dc042bf7850431877645523c66709", + "url": "https://files.pythonhosted.org/packages/d9/77/bd458a2e387e98f71de86dcc2ca2cab64489736004c80bc12b70da8b5488/python-dateutil-2.9.0.tar.gz" +>>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "python-dateutil", @@ -3230,7 +3300,11 @@ "six>=1.5" ], "requires_python": "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7", +<<<<<<< HEAD "version": "2.9.0.post0" +======= + "version": "2.9.0" +>>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -4165,6 +4239,7 @@ "artifacts": [ { "algorithm": "sha256", +<<<<<<< HEAD "hash": "6e8e8bfad34924067333232c93f7fc4b369856d8bea0d5c9d1808cb290ab1972", "url": "https://files.pythonhosted.org/packages/7d/7d/06b1d678521cebeab11cc0fb02986c901de783a40befcc32076bbe7fe278/types_pyOpenSSL-24.0.0.20240311-py3-none-any.whl" }, @@ -4172,6 +4247,15 @@ "algorithm": "sha256", "hash": "7bca00cfc4e7ef9c5d2663c6a1c068c35798e59670595439f6296e7ba3d58083", "url": "https://files.pythonhosted.org/packages/0b/27/4112745fb4f44d89c3260fc683fae69b2bb2c724e1e1994c3e24f21d24e8/types-pyOpenSSL-24.0.0.20240311.tar.gz" +======= + "hash": "a472cf877a873549175e81972f153f44e975302a3cf17381eb5f3d41ccfb75a4", + "url": "https://files.pythonhosted.org/packages/3b/be/90df9b4654cd43344da7ca6cf003d3b5c710b49bb89658372d13cdda686e/types_pyOpenSSL-24.0.0.20240228-py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "cd990717d8aa3743ef0e73e0f462e64b54d90c304249232d48fece4f0f7c3c6a", + "url": "https://files.pythonhosted.org/packages/01/d3/8e3f365204734e4772cc264f6c933ef6a261450bf1ea5172d5bbec8e634f/types-pyOpenSSL-24.0.0.20240228.tar.gz" +>>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "types-pyopenssl", @@ -4179,7 +4263,11 @@ "cryptography>=35.0.0" ], "requires_python": ">=3.8", +<<<<<<< HEAD "version": "24.0.0.20240311" +======= + "version": "24.0.0.20240228" +>>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -4242,6 +4330,7 @@ "artifacts": [ { "algorithm": "sha256", +<<<<<<< HEAD "hash": "7801245ecaf371d24f1154924c8f1f0efdc53977339bf79886b5b10890af6478", "url": 
"https://files.pythonhosted.org/packages/a6/bf/2c0de809f83077f4095b89e3dad36807d466c6bd951d2e24bf8a08c968dc/types_setuptools-69.1.0.20240310-py3-none-any.whl" }, @@ -4249,17 +4338,31 @@ "algorithm": "sha256", "hash": "fc0e1082f55c974611bce844b1e5beb2d1a895501f4a464e48305592a4268100", "url": "https://files.pythonhosted.org/packages/e6/5f/a48e5273ffded6e85749866edec3d8d283dc0c7ed8cd8fd6f1b98d280cfa/types-setuptools-69.1.0.20240310.tar.gz" +======= + "hash": "2033afa8efe3f566ec18997c4b614664b2ed9653160d941745389ad61a50d1f6", + "url": "https://files.pythonhosted.org/packages/18/d9/8b873c75ae13da3313bc6c54fd104905671a448cbcf52d35f58a522e0f35/types_setuptools-69.1.0.20240301-py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "f99cf5a7f5c281c55f16ba860da68cb2cd8f3b3a472f78ec8e744240fc3aa09e", + "url": "https://files.pythonhosted.org/packages/18/d5/a5ac54da0cda554620cd5354467cd4cf2ceb795e42e2b8d6e8e870f57c02/types-setuptools-69.1.0.20240301.tar.gz" +>>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "types-setuptools", "requires_dists": [], "requires_python": ">=3.8", +<<<<<<< HEAD "version": "69.1.0.20240310" +======= + "version": "69.1.0.20240301" +>>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ { "algorithm": "sha256", +<<<<<<< HEAD "hash": "abc0377990d38e9b37b3333dd115ec960ca9788d78f3d9c7eb3f778cfc6c925c", "url": "https://files.pythonhosted.org/packages/f8/2e/92578323e36deeb7baf82ad1612a49dfe5ed0882b44e8ddbe18cfdfd5534/types_six-1.16.21.20240311-py3-none-any.whl" }, @@ -4267,12 +4370,25 @@ "algorithm": "sha256", "hash": "b5a117193ba0dc7a66507925e95e140b2af52731402cdd71ef9f2a4348e01f60", "url": "https://files.pythonhosted.org/packages/b4/f1/53abd134a8838c6046b433cbe426aa51ccc70237cd196dfca735d7000984/types-six-1.16.21.20240311.tar.gz" +======= + "hash": "4d5bbf07e521f0cb52cc880de71047bc9b5c2a5059211811e15423872d403c4c", + "url": "https://files.pythonhosted.org/packages/be/91/568c535a99994b2089bc9f346a2cf21ea2e45924d525cfed52002c1db2a0/types_six-1.16.21.20240301-py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "c877c0fa3dbe696860e571bfc23f987ce084bf4de13dfed343ef61ed49826686", + "url": "https://files.pythonhosted.org/packages/17/f6/98c4c8aac02b6628940a54a43c0c9bbc41956a729fa8234bf404729cc6a8/types-six-1.16.21.20240301.tar.gz" +>>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "types-six", "requires_dists": [], "requires_python": ">=3.8", +<<<<<<< HEAD "version": "1.16.21.20240311" +======= + "version": "1.16.21.20240301" +>>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ From bf4e59e2c355c6cc527845a84d9cd0d95391263d Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:58:01 +0000 Subject: [PATCH 09/16] Merge --- python.lock | 330 ++++++++++++++-------------------------------------- 1 file changed, 90 insertions(+), 240 deletions(-) diff --git a/python.lock b/python.lock index 489bec97f6..77b6b0406c 100644 --- a/python.lock +++ b/python.lock @@ -72,24 +72,8 @@ // "python-dateutil>=2.8", // "python-dotenv~=0.20.0", // "python-json-logger>=2.0.1", -<<<<<<< HEAD // "pyzmq~=25.1.2", -======= -// "pyzmq~=24.0.1", -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD -// "raftify==0.1.42", ->>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) -======= -// "raftify==0.1.43", ->>>>>>> 4fcd15f2 (Reflect new API of raftify) -======= -// "raftify==0.1.45", ->>>>>>> fa4423ad (Update raftify) -======= // "raftify==0.1.56", ->>>>>>> d23bbae6 (Implement multiple nodes joining after bootstrap done) // "redis[hiredis]==4.5.5", // "rich~=13.6", // 
"setproctitle~=1.3.2", @@ -184,13 +168,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "a387b63da4ced6aad35b1dda2d09620ad608a1c7c0fb71efa07ebb4cd511928d", - "url": "https://files.pythonhosted.org/packages/1a/74/976abff30200cb0cab0bd076db074b8cdda9236ba885ebe3f4d91c7e074b/aiodns-3.1.1-py3-none-any.whl" + "hash": "e443c0c27b07da3174a109fd9e736d69058d808f144d3c9d56dbd1776964c5f5", + "url": "https://files.pythonhosted.org/packages/15/14/13c65b1bd59f7e707e0cc0964fbab45c003f90292ed267d159eeeeaa2224/aiodns-3.2.0-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "1073eac48185f7a4150cad7f96a5192d6911f12b4fb894de80a088508c9b3a99", - "url": "https://files.pythonhosted.org/packages/fa/10/4de99e6e67703d8f6b10ea92a4d2a6c5b96a9c0708b75389a00203387925/aiodns-3.1.1.tar.gz" + "hash": "62869b23409349c21b072883ec8998316b234c9a9e36675756e8e317e8768f72", + "url": "https://files.pythonhosted.org/packages/e7/84/41a6a2765abc124563f5380e76b9b24118977729e25a84112f8dfb2b33dc/aiodns-3.2.0.tar.gz" } ], "project_name": "aiodns", @@ -198,7 +182,7 @@ "pycares>=4.0.0" ], "requires_python": null, - "version": "3.1.1" + "version": "3.2.0" }, { "artifacts": [ @@ -953,64 +937,36 @@ "artifacts": [ { "algorithm": "sha256", -<<<<<<< HEAD - "hash": "992e994c7e481a5d3259c699574882b79d631a46f7c369bea350b7ccb0651317", - "url": "https://files.pythonhosted.org/packages/60/15/9aac35742c47578eb9a1aee4519fecebba5faec51eeb09f093eca9938567/boto3-1.34.61-py3-none-any.whl" + "hash": "71f551491fb12fe07727d371d5561c5919fdf33dbc1d4251c57940d267a53a9e", + "url": "https://files.pythonhosted.org/packages/fe/61/2561a979dabf221724b0de8d5ba9c6f42950fea689ebfca304e8ee943d68/boto3-1.34.74-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "4b40bf2c8494647c9e88c180537dc9fc0c1047a9fffbb1e5b0da6596f1e59b7b", - "url": "https://files.pythonhosted.org/packages/0b/c1/5f220936875b62ba4dfc04eb24ab8d2e5e925e9baea3e174632739f9fa3a/boto3-1.34.61.tar.gz" -======= - "hash": "340c73f57fcca6f503403e2e13a0a4ad44bec218feee2e0896be612324394afd", - "url": "https://files.pythonhosted.org/packages/fe/da/2ac73f9db09018003dcc6e3cfad1029b08893de527b5af8e9032797090e9/boto3-1.34.53-py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "cd30261a782824ce543a628ae524480abb4ca6ab4e4a2631477e48baed43b5f2", - "url": "https://files.pythonhosted.org/packages/99/a7/4bde4b0ef2941cfad22a3527cf1fdace545d36cf5b7132613e0cb1e6a035/boto3-1.34.53.tar.gz" ->>>>>>> 6cebbda4 (Merge with main) + "hash": "b703e22775561a748adc4576c30424b81abd2a00d3c6fb28eec2e5cde92c1eed", + "url": "https://files.pythonhosted.org/packages/91/99/7dbca7a34850d7457e1009df53f477ecf9901c93301b2a97201a21800e74/boto3-1.34.74.tar.gz" } ], "project_name": "boto3", "requires_dists": [ -<<<<<<< HEAD - "botocore<1.35.0,>=1.34.61", -======= - "botocore<1.35.0,>=1.34.53", ->>>>>>> 6cebbda4 (Merge with main) + "botocore<1.35.0,>=1.34.74", "botocore[crt]<2.0a0,>=1.21.0; extra == \"crt\"", "jmespath<2.0.0,>=0.7.1", "s3transfer<0.11.0,>=0.10.0" ], "requires_python": ">=3.8", -<<<<<<< HEAD - "version": "1.34.61" -======= - "version": "1.34.53" ->>>>>>> 6cebbda4 (Merge with main) + "version": "1.34.74" }, { "artifacts": [ { "algorithm": "sha256", -<<<<<<< HEAD - "hash": "079f3288d38f97fd5656c25c44a94bea0e7090b938abfdeea463eaadb210c4a0", - "url": "https://files.pythonhosted.org/packages/1d/4f/79dc11e13879bec49b577319a073f346ff340b0d00ebbd669ace4325bbbd/botocore-1.34.61-py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "72df4af7e4e6392552c882d48c74e4be9bf7be4cd8d829711b312fbae13d7034", - "url": 
"https://files.pythonhosted.org/packages/11/3b/71b9b5cba2f8c90f40500c2827ce2b0229dd3c6b86a49065dc40b8e4a059/botocore-1.34.61.tar.gz" -======= - "hash": "cbbcaddc35738d32df55d26ed5561cf3fa32751a6b22e7e342be87b5e3f55eec", - "url": "https://files.pythonhosted.org/packages/a7/1c/ad78551dcf5fd347a4597885e0f02961311c5abb51f43b9faa4633caac5c/botocore-1.34.53-py3-none-any.whl" + "hash": "5d2015b5d91d6c402c122783729ce995ed7283a746b0380957026dc2b3b75969", + "url": "https://files.pythonhosted.org/packages/08/03/33b2a745333c676c3ecd9627146c84ec600ad46794c352807f1ad0f5f3e5/botocore-1.34.74-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "3d243781e994dfc5b20036d9fb92672bfaef4dbe388eaa79dae6440ea56c53eb", - "url": "https://files.pythonhosted.org/packages/1b/3c/ef2bbbc3d569aa46805ceacb4582b5501415eddbdbb1fa9eeea013e4a044/botocore-1.34.53.tar.gz" ->>>>>>> 6cebbda4 (Merge with main) + "hash": "32bb519bae62483893330c18a0ea4fd09d1ffa32bc573cd8559c2d9a08fb8c5c", + "url": "https://files.pythonhosted.org/packages/08/0c/9e206f1568b8fc7ad6b5ebc2442d0c39ed80a19f7f297461260353019e66/botocore-1.34.74.tar.gz" } ], "project_name": "botocore", @@ -1018,15 +974,11 @@ "awscrt==0.19.19; extra == \"crt\"", "jmespath<2.0.0,>=0.7.1", "python-dateutil<3.0.0,>=2.1", - "urllib3<1.27,>=1.25.4; python_version < \"3.10\"", - "urllib3<2.1,>=1.25.4; python_version >= \"3.10\"" + "urllib3!=2.2.0,<3,>=1.25.4; python_version >= \"3.10\"", + "urllib3<1.27,>=1.25.4; python_version < \"3.10\"" ], "requires_python": ">=3.8", -<<<<<<< HEAD - "version": "1.34.61" -======= - "version": "1.34.53" ->>>>>>> 6cebbda4 (Merge with main) + "version": "1.34.74" }, { "artifacts": [ @@ -1638,13 +1590,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "9fd67bbcd40f16d9d42f950228e9cf02a2ded4ae49198b27432d0cded5a74c38", - "url": "https://files.pythonhosted.org/packages/92/94/35ba55b5011185ea1c995938e7851b25e6092f15658afa9263cd65a67dd4/google_auth-2.28.2-py2.py3-none-any.whl" + "hash": "d452ad095688cd52bae0ad6fafe027f6a6d6f560e810fec20914e17a09526415", + "url": "https://files.pythonhosted.org/packages/9e/8d/ddbcf81ec751d8ee5fd18ac11ff38a0e110f39dfbf105e6d9db69d556dd0/google_auth-2.29.0-py2.py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "80b8b4969aa9ed5938c7828308f20f035bc79f9d8fb8120bf9dc8db20b41ba30", - "url": "https://files.pythonhosted.org/packages/7a/10/b3a860c103832a6a3353b47f9ebcf49ebe7f0c8ccf5cd39e89cffe67d98e/google-auth-2.28.2.tar.gz" + "hash": "672dff332d073227550ffc7457868ac4218d6c500b155fe6cc17d2b13602c360", + "url": "https://files.pythonhosted.org/packages/18/b2/f14129111cfd61793609643a07ecb03651a71dd65c6974f63b0310ff4b45/google-auth-2.29.0.tar.gz" } ], "project_name": "google-auth", @@ -1662,7 +1614,7 @@ "rsa<5,>=3.1.4" ], "requires_python": ">=3.7", - "version": "2.28.2" + "version": "2.29.0" }, { "artifacts": [ @@ -2393,7 +2345,6 @@ "artifacts": [ { "algorithm": "sha256", -<<<<<<< HEAD "hash": "f085493f79efb0644f270a9bf2892843142d80d7174bbbd2f3713f2a589dc633", "url": "https://files.pythonhosted.org/packages/38/04/37055b7013dfaaf66e3a9a51e46857cc9be151476a891b995fa70da7e139/marshmallow-3.21.1-py3-none-any.whl" }, @@ -2401,15 +2352,6 @@ "algorithm": "sha256", "hash": "4e65e9e0d80fc9e609574b9983cf32579f305c718afb30d7233ab818571768c3", "url": "https://files.pythonhosted.org/packages/5b/17/1b117d1875d8287a85cc2d5e2effd3f31bd8afd9f142c7b8391b9d665f0c/marshmallow-3.21.1.tar.gz" -======= - "hash": "e7997f83571c7fd476042c2c188e4ee8a78900ca5e74bd9c8097afa56624e9bd", - "url": 
"https://files.pythonhosted.org/packages/f5/97/6e4ddd6713bba5ede1d18f3959d7bffde38e56f7f7ae7c031c9a3d746b95/marshmallow-3.21.0-py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "20f53be28c6e374a711a16165fb22a8dc6003e3f7cda1285e3ca777b9193885b", - "url": "https://files.pythonhosted.org/packages/86/14/0dec31a81b16d39b6cfcb5ddd7e560d46dc5ea0d1d1bca0bb275a679071f/marshmallow-3.21.0.tar.gz" ->>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "marshmallow", @@ -2428,11 +2370,7 @@ "tox; extra == \"dev\"" ], "requires_python": ">=3.8", -<<<<<<< HEAD "version": "3.21.1" -======= - "version": "3.21.0" ->>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -2906,39 +2844,39 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "4439847c58d40b1d0a573d07e3856e95333f1976294494c325775aeca506eb58", - "url": "https://files.pythonhosted.org/packages/d1/75/4686d2872bf2fc0b37917cbc8bbf0dd3a5cdb0990799be1b9cbf1e1eb733/pyasn1-0.5.1-py2.py3-none-any.whl" + "hash": "cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473", + "url": "https://files.pythonhosted.org/packages/23/7e/5f50d07d5e70a2addbccd90ac2950f81d1edd0783630651d9268d7f1db49/pyasn1-0.6.0-py2.py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "6d391a96e59b23130a5cfa74d6fd7f388dbbe26cc8f1edf39fdddf08d9d6676c", - "url": "https://files.pythonhosted.org/packages/ce/dc/996e5446a94627fe8192735c20300ca51535397e31e7097a3cc80ccf78b7/pyasn1-0.5.1.tar.gz" + "hash": "3a35ab2c4b5ef98e17dfdec8ab074046fbda76e281c5a706ccd82328cfc8f64c", + "url": "https://files.pythonhosted.org/packages/4a/a3/d2157f333900747f20984553aca98008b6dc843eb62f3a36030140ccec0d/pyasn1-0.6.0.tar.gz" } ], "project_name": "pyasn1", "requires_dists": [], - "requires_python": "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7", - "version": "0.5.1" + "requires_python": ">=3.8", + "version": "0.6.0" }, { "artifacts": [ { "algorithm": "sha256", - "hash": "d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d", - "url": "https://files.pythonhosted.org/packages/cd/8e/bea464350e1b8c6ed0da3a312659cb648804a08af6cacc6435867f74f8bd/pyasn1_modules-0.3.0-py2.py3-none-any.whl" + "hash": "be04f15b66c206eed667e0bb5ab27e2b1855ea54a842e5037738099e8ca4ae0b", + "url": "https://files.pythonhosted.org/packages/13/68/8906226b15ef38e71dc926c321d2fe99de8048e9098b5dfd38343011c886/pyasn1_modules-0.4.0-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c", - "url": "https://files.pythonhosted.org/packages/3b/e4/7dec823b1b5603c5b3c51e942d5d9e65efd6ff946e713a325ed4146d070f/pyasn1_modules-0.3.0.tar.gz" + "hash": "831dbcea1b177b28c9baddf4c6d1013c24c3accd14a1873fffaa6a2e905f17b6", + "url": "https://files.pythonhosted.org/packages/f7/00/e7bd1dec10667e3f2be602686537969a7ac92b0a7c5165be2e5875dc3971/pyasn1_modules-0.4.0.tar.gz" } ], "project_name": "pyasn1-modules", "requires_dists": [ - "pyasn1<0.6.0,>=0.4.6" + "pyasn1<0.7.0,>=0.4.6" ], - "requires_python": "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7", - "version": "0.3.0" + "requires_python": ">=3.8", + "version": "0.4.0" }, { "artifacts": [ @@ -3000,19 +2938,19 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9", - "url": "https://files.pythonhosted.org/packages/62/d5/5f610ebe421e85889f2e55e33b7f9a6795bd982198517d912eb1c76e1a53/pycparser-2.21-py2.py3-none-any.whl" + "hash": "c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", + "url": 
"https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206", - "url": "https://files.pythonhosted.org/packages/5e/0b/95d387f5f4433cb0f53ff7ad859bd2c6051051cebbb564f139a999ab46de/pycparser-2.21.tar.gz" + "hash": "491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6", + "url": "https://files.pythonhosted.org/packages/1d/b2/31537cf4b1ca988837256c910a668b553fceb8f069bedc4b1c826024b52c/pycparser-2.22.tar.gz" } ], "project_name": "pycparser", "requires_dists": [], - "requires_python": "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7", - "version": "2.21" + "requires_python": ">=3.8", + "version": "2.22" }, { "artifacts": [ @@ -3213,7 +3151,6 @@ "artifacts": [ { "algorithm": "sha256", -<<<<<<< HEAD "hash": "2a8386cfc11fa9d2c50ee7b2a57e7d898ef90470a7a34c4b949ff59662bb78b7", "url": "https://files.pythonhosted.org/packages/4d/7e/c79cecfdb6aa85c6c2e3cf63afc56d0f165f24f5c66c03c695c4d9b84756/pytest-8.1.1-py3-none-any.whl" }, @@ -3221,15 +3158,6 @@ "algorithm": "sha256", "hash": "ac978141a75948948817d360297b7aae0fcb9d6ff6bc9ec6d514b85d5a65c044", "url": "https://files.pythonhosted.org/packages/30/b7/7d44bbc04c531dcc753056920e0988032e5871ac674b5a84cb979de6e7af/pytest-8.1.1.tar.gz" -======= - "hash": "edfaaef32ce5172d5466b5127b42e0d6d35ebbe4453f0e3505d96afd93f6b096", - "url": "https://files.pythonhosted.org/packages/a7/ea/d0ab9595a0d4b2320483e634123171deaf50885e29d442180efcbf2ed0b2/pytest-8.0.2-py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "d4051d623a2e0b7e51960ba963193b09ce6daeb9759a451844a21e4ddedfc1bd", - "url": "https://files.pythonhosted.org/packages/3f/c0/238f25cb27495fdbaa5c48cef9886162e9df1f3d0e957fc8326d9c24fa2f/pytest-8.0.2.tar.gz" ->>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "pytest", @@ -3250,11 +3178,7 @@ "xmlschema; extra == \"testing\"" ], "requires_python": ">=3.8", -<<<<<<< HEAD "version": "8.1.1" -======= - "version": "8.0.2" ->>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -3276,7 +3200,6 @@ "artifacts": [ { "algorithm": "sha256", -<<<<<<< HEAD "hash": "a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", "url": "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl" }, @@ -3284,15 +3207,6 @@ "algorithm": "sha256", "hash": "37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", "url": "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz" -======= - "hash": "cbf2f1da5e6083ac2fbfd4da39a25f34312230110440f424a14c7558bb85d82e", - "url": "https://files.pythonhosted.org/packages/13/7f/98d6f9ca8b731506c85785bbb8806c01f5966a4df6d68c0d1cf3b16967e1/python_dateutil-2.9.0-py2.py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "78e73e19c63f5b20ffa567001531680d939dc042bf7850431877645523c66709", - "url": "https://files.pythonhosted.org/packages/d9/77/bd458a2e387e98f71de86dcc2ca2cab64489736004c80bc12b70da8b5488/python-dateutil-2.9.0.tar.gz" ->>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "python-dateutil", @@ -3300,11 +3214,7 @@ "six>=1.5" ], "requires_python": "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7", -<<<<<<< HEAD "version": "2.9.0.post0" -======= - "version": "2.9.0" ->>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -3482,21 +3392,21 @@ 
"artifacts": [ { "algorithm": "sha256", - "hash": "76ec784a5dd2afac3b7da8003329834cdd9824294c260027f8c8d2e4d0a78f43", - "url": "https://files.pythonhosted.org/packages/cd/14/730280df294e52e395a70111f4d9b07be94f5ba7a69db7eba3c324f113b2/readchar-4.0.5-py3-none-any.whl" + "hash": "b4b31dd35de4897be738f27e8f9f62426b5fedb54b648364987e30ae534b71bc", + "url": "https://files.pythonhosted.org/packages/86/db/aca9e5e6a53a499d61cbd78b3594d700f1e48a50ab6970a92a4d1251f8db/readchar-4.0.6-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "08a456c2d7c1888cde3f4688b542621b676eb38cd6cfed7eb6cb2e2905ddc826", - "url": "https://files.pythonhosted.org/packages/a1/57/439aaa28659e66265518232bf4291ae5568aa01cd9e0e0f6f8fe3b300e9e/readchar-4.0.5.tar.gz" + "hash": "e0dae942d3a746f8d5423f83dbad67efe704004baafe31b626477929faaee472", + "url": "https://files.pythonhosted.org/packages/ec/85/35c1a04aa52c432ec604b2816570fb0ab721cb7403191130b9c068c672c3/readchar-4.0.6.tar.gz" } ], "project_name": "readchar", "requires_dists": [ "setuptools>=41.0" ], - "requires_python": ">=3.7", - "version": "4.0.5" + "requires_python": ">=3.8", + "version": "4.0.6" }, { "artifacts": [ @@ -3553,13 +3463,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "7a3130d94a17520169e38db6c8d75f2c974643788465ecc2e4b36d288bf13033", - "url": "https://files.pythonhosted.org/packages/db/3a/457f30ab4e80b0e978686ccd43f17309e9fdc242d8619491a9156a19fda5/requests_oauthlib-1.4.0-py2.py3-none-any.whl" + "hash": "7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", + "url": "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "acee623221e4a39abcbb919312c8ff04bd44e7e417087fb4bd5e2a2f53d5e79a", - "url": "https://files.pythonhosted.org/packages/d5/5d/fd68baf0876774835068b557717932c44c6be42ee847bcc210bf389d4189/requests-oauthlib-1.4.0.tar.gz" + "hash": "b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", + "url": "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz" } ], "project_name": "requests-oauthlib", @@ -3568,8 +3478,8 @@ "oauthlib[signedtoken]>=3.0.0; extra == \"rsa\"", "requests>=2.0.0" ], - "requires_python": "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7", - "version": "1.4.0" + "requires_python": ">=3.4", + "version": "2.0.0" }, { "artifacts": [ @@ -3618,13 +3528,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e", - "url": "https://files.pythonhosted.org/packages/12/bb/7e7912e18cd558e7880d9b58ffc57300b2c28ffba9882b3a54ba5ce3ebc4/s3transfer-0.10.0-py3-none-any.whl" + "hash": "ceb252b11bcf87080fb7850a224fb6e05c8a776bab8f2b64b7f25b969464839d", + "url": "https://files.pythonhosted.org/packages/83/37/395cdb6ee92925fa211e55d8f07b9f93cf93f60d7d4ce5e66fd73f1ea986/s3transfer-0.10.1-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b", - "url": "https://files.pythonhosted.org/packages/a0/b5/4c570b08cb85fdcc65037b5229e00412583bb38d974efecb7ec3495f40ba/s3transfer-0.10.0.tar.gz" + "hash": "5683916b4c724f799e600f41dd9e10a9ff19871bf87623cc8f491cb4f5fa0a19", + "url": "https://files.pythonhosted.org/packages/83/bc/fb0c1f76517e3380eb142af8a9d6b969c150cfca1324cea7d965d8c66571/s3transfer-0.10.1.tar.gz" } ], "project_name": "s3transfer", @@ -3633,7 +3543,7 @@ 
"botocore[crt]<2.0a.0,>=1.33.2; extra == \"crt\"" ], "requires_python": ">=3.8", - "version": "0.10.0" + "version": "0.10.1" }, { "artifacts": [ @@ -3704,13 +3614,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56", - "url": "https://files.pythonhosted.org/packages/c0/7a/3da654f49c95d0cc6e9549a855b5818e66a917e852ec608e77550c8dc08b/setuptools-69.1.1-py3-none-any.whl" + "hash": "c21c49fb1042386df081cb5d86759792ab89efca84cf114889191cd09aacc80c", + "url": "https://files.pythonhosted.org/packages/92/e1/1c8bb3420105e70bdf357d57dd5567202b4ef8d27f810e98bb962d950834/setuptools-69.2.0-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8", - "url": "https://files.pythonhosted.org/packages/c8/1f/e026746e5885a83e1af99002ae63650b7c577af5c424d4c27edcf729ab44/setuptools-69.1.1.tar.gz" + "hash": "0ff4183f8f42cd8fa3acea16c45205521a4ef28f73c6391d8a25e92893134f2e", + "url": "https://files.pythonhosted.org/packages/4d/5b/dc575711b6b8f2f866131a40d053e30e962e633b332acf7cd2c24843d83d/setuptools-69.2.0.tar.gz" } ], "project_name": "setuptools", @@ -3719,8 +3629,8 @@ "build[virtualenv]>=1.0.3; extra == \"testing-integration\"", "filelock>=3.4.0; extra == \"testing\"", "filelock>=3.4.0; extra == \"testing-integration\"", - "flake8-2020; extra == \"testing\"", "furo; extra == \"docs\"", + "importlib-metadata; extra == \"testing\"", "ini2toml[lite]>=0.9; extra == \"testing\"", "jaraco.develop>=7.21; (python_version >= \"3.9\" and sys_platform != \"cygwin\") and extra == \"testing\"", "jaraco.envs>=2.2; extra == \"testing\"", @@ -3729,6 +3639,7 @@ "jaraco.path>=3.2.0; extra == \"testing\"", "jaraco.path>=3.2.0; extra == \"testing-integration\"", "jaraco.tidelift>=1.4; extra == \"docs\"", + "mypy==1.9; extra == \"testing\"", "packaging>=23.2; extra == \"testing\"", "packaging>=23.2; extra == \"testing-integration\"", "pip>=19.1; extra == \"testing\"", @@ -3742,8 +3653,8 @@ "pytest-perf; sys_platform != \"cygwin\" and extra == \"testing\"", "pytest-ruff>=0.2.1; sys_platform != \"cygwin\" and extra == \"testing\"", "pytest-timeout; extra == \"testing\"", - "pytest-xdist; extra == \"testing\"", "pytest-xdist; extra == \"testing-integration\"", + "pytest-xdist>=3; extra == \"testing\"", "pytest; extra == \"testing-integration\"", "pytest>=6; extra == \"testing\"", "rst.linker>=1.9; extra == \"docs\"", @@ -3756,6 +3667,7 @@ "sphinx>=3.5; extra == \"docs\"", "sphinxcontrib-towncrier; extra == \"docs\"", "tomli-w>=1.0.0; extra == \"testing\"", + "tomli; extra == \"testing\"", "tomli; extra == \"testing-integration\"", "virtualenv>=13.0.0; extra == \"testing\"", "virtualenv>=13.0.0; extra == \"testing-integration\"", @@ -3763,7 +3675,7 @@ "wheel; extra == \"testing-integration\"" ], "requires_python": ">=3.8", - "version": "69.1.1" + "version": "69.2.0" }, { "artifacts": [ @@ -4165,19 +4077,19 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "ed10a8002d88c94220597b77304cf1a1d8cf489c7143fc3ffa2c96488b20fec7", - "url": "https://files.pythonhosted.org/packages/89/aa/9b52b02f6a11bf6153dd04c15e8d3c3b2b7431ed5541876abaf7b2df8490/types_aiofiles-23.2.0.20240311-py3-none-any.whl" + "hash": "40c18c68956aaf1af6a37f997ae26ff6af124e52e16ec4f2473d51c720f79cd7", + "url": "https://files.pythonhosted.org/packages/0a/ac/56895d328250a8028ace67e22e58847ecb0da7faba4d3c02c97ec50b05c4/types_aiofiles-23.2.0.20240331-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": 
"208e6b090de732739ef74ab8f133c954479c8e77e614f276f9e475a0cc986430", - "url": "https://files.pythonhosted.org/packages/4d/36/2cd7f4e832bf4778e787f2550a99e77e38cc966295681a5d32ac56ab04b2/types-aiofiles-23.2.0.20240311.tar.gz" + "hash": "8dd2a90cdcf16f9cfc9b07d77cd8eb989977c6bab2eba5b65e5929632ac77f53", + "url": "https://files.pythonhosted.org/packages/f8/35/3fdcdafc518f9bc40f3d733595ad6d081a7d3e6cd254d1fa8a6476c27400/types-aiofiles-23.2.0.20240331.tar.gz" } ], "project_name": "types-aiofiles", "requires_dists": [], "requires_python": ">=3.8", - "version": "23.2.0.20240311" + "version": "23.2.0.20240331" }, { "artifacts": [ @@ -4239,7 +4151,6 @@ "artifacts": [ { "algorithm": "sha256", -<<<<<<< HEAD "hash": "6e8e8bfad34924067333232c93f7fc4b369856d8bea0d5c9d1808cb290ab1972", "url": "https://files.pythonhosted.org/packages/7d/7d/06b1d678521cebeab11cc0fb02986c901de783a40befcc32076bbe7fe278/types_pyOpenSSL-24.0.0.20240311-py3-none-any.whl" }, @@ -4247,15 +4158,6 @@ "algorithm": "sha256", "hash": "7bca00cfc4e7ef9c5d2663c6a1c068c35798e59670595439f6296e7ba3d58083", "url": "https://files.pythonhosted.org/packages/0b/27/4112745fb4f44d89c3260fc683fae69b2bb2c724e1e1994c3e24f21d24e8/types-pyOpenSSL-24.0.0.20240311.tar.gz" -======= - "hash": "a472cf877a873549175e81972f153f44e975302a3cf17381eb5f3d41ccfb75a4", - "url": "https://files.pythonhosted.org/packages/3b/be/90df9b4654cd43344da7ca6cf003d3b5c710b49bb89658372d13cdda686e/types_pyOpenSSL-24.0.0.20240228-py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "cd990717d8aa3743ef0e73e0f462e64b54d90c304249232d48fece4f0f7c3c6a", - "url": "https://files.pythonhosted.org/packages/01/d3/8e3f365204734e4772cc264f6c933ef6a261450bf1ea5172d5bbec8e634f/types-pyOpenSSL-24.0.0.20240228.tar.gz" ->>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "types-pyopenssl", @@ -4263,29 +4165,25 @@ "cryptography>=35.0.0" ], "requires_python": ">=3.8", -<<<<<<< HEAD "version": "24.0.0.20240311" -======= - "version": "24.0.0.20240228" ->>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ { "algorithm": "sha256", - "hash": "ef813da0809aca76472ca88807addbeea98b19339aebe56159ae2f4b4f70857a", - "url": "https://files.pythonhosted.org/packages/e8/19/e11b8098f5d7864a9950385760f1d8a68abd1e0b8ba89656d94c45cf9f93/types_python_dateutil-2.8.19.20240311-py3-none-any.whl" + "hash": "6b8cb66d960771ce5ff974e9dd45e38facb81718cc1e208b10b1baccbfdbee3b", + "url": "https://files.pythonhosted.org/packages/c7/1b/af4f4c4f3f7339a4b7eb3c0ab13416db98f8ac09de3399129ee5fdfa282b/types_python_dateutil-2.9.0.20240316-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "51178227bbd4cbec35dc9adffbf59d832f20e09842d7dcb8c73b169b8780b7cb", - "url": "https://files.pythonhosted.org/packages/fb/90/e37d402a07f5a93791fc2837ee14b6947989aed6dc7895c420eb93354aea/types-python-dateutil-2.8.19.20240311.tar.gz" + "hash": "5d2f2e240b86905e40944dd787db6da9263f0deabef1076ddaed797351ec0202", + "url": "https://files.pythonhosted.org/packages/61/c5/c3a4d72ffa8efc2e78f7897b1c69ec760553246b67d3ce8c4431fac5d4e3/types-python-dateutil-2.9.0.20240316.tar.gz" } ], "project_name": "types-python-dateutil", "requires_dists": [], "requires_python": ">=3.8", - "version": "2.8.19.20240311" + "version": "2.9.0.20240316" }, { "artifacts": [ @@ -4330,39 +4228,24 @@ "artifacts": [ { "algorithm": "sha256", -<<<<<<< HEAD - "hash": "7801245ecaf371d24f1154924c8f1f0efdc53977339bf79886b5b10890af6478", - "url": 
"https://files.pythonhosted.org/packages/a6/bf/2c0de809f83077f4095b89e3dad36807d466c6bd951d2e24bf8a08c968dc/types_setuptools-69.1.0.20240310-py3-none-any.whl" + "hash": "cf91ff7c87ab7bf0625c3f0d4d90427c9da68561f3b0feab77977aaf0bbf7531", + "url": "https://files.pythonhosted.org/packages/1f/22/904934a3344fa5f332ecab887003f3f033c1272432a4af877007b75b0bd3/types_setuptools-69.2.0.20240317-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "fc0e1082f55c974611bce844b1e5beb2d1a895501f4a464e48305592a4268100", - "url": "https://files.pythonhosted.org/packages/e6/5f/a48e5273ffded6e85749866edec3d8d283dc0c7ed8cd8fd6f1b98d280cfa/types-setuptools-69.1.0.20240310.tar.gz" -======= - "hash": "2033afa8efe3f566ec18997c4b614664b2ed9653160d941745389ad61a50d1f6", - "url": "https://files.pythonhosted.org/packages/18/d9/8b873c75ae13da3313bc6c54fd104905671a448cbcf52d35f58a522e0f35/types_setuptools-69.1.0.20240301-py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "f99cf5a7f5c281c55f16ba860da68cb2cd8f3b3a472f78ec8e744240fc3aa09e", - "url": "https://files.pythonhosted.org/packages/18/d5/a5ac54da0cda554620cd5354467cd4cf2ceb795e42e2b8d6e8e870f57c02/types-setuptools-69.1.0.20240301.tar.gz" ->>>>>>> 6cebbda4 (Merge with main) + "hash": "b607c4c48842ef3ee49dc0c7fe9c1bad75700b071e1018bb4d7e3ac492d47048", + "url": "https://files.pythonhosted.org/packages/2d/06/0de7b539346aaa8758b3c80375c4841dc2764ef92c5e743f1ebe9789da54/types-setuptools-69.2.0.20240317.tar.gz" } ], "project_name": "types-setuptools", "requires_dists": [], "requires_python": ">=3.8", -<<<<<<< HEAD - "version": "69.1.0.20240310" -======= - "version": "69.1.0.20240301" ->>>>>>> 6cebbda4 (Merge with main) + "version": "69.2.0.20240317" }, { "artifacts": [ { "algorithm": "sha256", -<<<<<<< HEAD "hash": "abc0377990d38e9b37b3333dd115ec960ca9788d78f3d9c7eb3f778cfc6c925c", "url": "https://files.pythonhosted.org/packages/f8/2e/92578323e36deeb7baf82ad1612a49dfe5ed0882b44e8ddbe18cfdfd5534/types_six-1.16.21.20240311-py3-none-any.whl" }, @@ -4370,25 +4253,12 @@ "algorithm": "sha256", "hash": "b5a117193ba0dc7a66507925e95e140b2af52731402cdd71ef9f2a4348e01f60", "url": "https://files.pythonhosted.org/packages/b4/f1/53abd134a8838c6046b433cbe426aa51ccc70237cd196dfca735d7000984/types-six-1.16.21.20240311.tar.gz" -======= - "hash": "4d5bbf07e521f0cb52cc880de71047bc9b5c2a5059211811e15423872d403c4c", - "url": "https://files.pythonhosted.org/packages/be/91/568c535a99994b2089bc9f346a2cf21ea2e45924d525cfed52002c1db2a0/types_six-1.16.21.20240301-py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "c877c0fa3dbe696860e571bfc23f987ce084bf4de13dfed343ef61ed49826686", - "url": "https://files.pythonhosted.org/packages/17/f6/98c4c8aac02b6628940a54a43c0c9bbc41956a729fa8234bf404729cc6a8/types-six-1.16.21.20240301.tar.gz" ->>>>>>> 6cebbda4 (Merge with main) } ], "project_name": "types-six", "requires_dists": [], "requires_python": ">=3.8", -<<<<<<< HEAD "version": "1.16.21.20240311" -======= - "version": "1.16.21.20240301" ->>>>>>> 6cebbda4 (Merge with main) }, { "artifacts": [ @@ -4474,29 +4344,25 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e", - "url": "https://files.pythonhosted.org/packages/d2/b2/b157855192a68541a91ba7b2bbcb91f1b4faa51f8bae38d8005c034be524/urllib3-2.0.7-py3-none-any.whl" + "hash": "450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d", + "url": 
"https://files.pythonhosted.org/packages/a2/73/a68704750a7679d0b6d3ad7aa8d4da8e14e151ae82e6fee774e6e0d05ec8/urllib3-2.2.1-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84", - "url": "https://files.pythonhosted.org/packages/af/47/b215df9f71b4fdba1025fc05a77db2ad243fa0926755a52c5e71659f4e3c/urllib3-2.0.7.tar.gz" + "hash": "d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19", + "url": "https://files.pythonhosted.org/packages/7a/50/7fd50a27caa0652cd4caf224aa87741ea41d3265ad13f010886167cfcc79/urllib3-2.2.1.tar.gz" } ], "project_name": "urllib3", "requires_dists": [ "brotli>=1.0.9; platform_python_implementation == \"CPython\" and extra == \"brotli\"", "brotlicffi>=0.8.0; platform_python_implementation != \"CPython\" and extra == \"brotli\"", - "certifi; extra == \"secure\"", - "cryptography>=1.9; extra == \"secure\"", - "idna>=2.0.0; extra == \"secure\"", - "pyopenssl>=17.1.0; extra == \"secure\"", + "h2<5,>=4; extra == \"h2\"", "pysocks!=1.5.7,<2.0,>=1.5.6; extra == \"socks\"", - "urllib3-secure-extra; extra == \"secure\"", "zstandard>=0.18.0; extra == \"zstd\"" ], - "requires_python": ">=3.7", - "version": "2.0.7" + "requires_python": ">=3.8", + "version": "2.2.1" }, { "artifacts": [ @@ -4784,24 +4650,8 @@ "python-dateutil>=2.8", "python-dotenv~=0.20.0", "python-json-logger>=2.0.1", -<<<<<<< HEAD "pyzmq~=25.1.2", -======= - "pyzmq~=24.0.1", -<<<<<<< HEAD -<<<<<<< HEAD -<<<<<<< HEAD - "raftify==0.1.42", ->>>>>>> a7ae2ac8 (Introduce raftify and RaftContext) -======= - "raftify==0.1.43", ->>>>>>> 4fcd15f2 (Reflect new API of raftify) -======= - "raftify==0.1.45", ->>>>>>> fa4423ad (Update raftify) -======= "raftify==0.1.56", ->>>>>>> d23bbae6 (Implement multiple nodes joining after bootstrap done) "redis[hiredis]==4.5.5", "rich~=13.6", "setproctitle~=1.3.2", From 5cbcb5fd8afd6b74205dfef001053db374192c7d Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:58:35 +0000 Subject: [PATCH 10/16] Merge --- python.lock | 42 ++++++++---- requirements.txt | 2 +- src/ai/backend/common/defs.py | 1 + src/ai/backend/manager/api/context.py | 1 + src/ai/backend/manager/cli/__main__.py | 48 ++++++++++++- src/ai/backend/manager/cli/context.py | 16 ++++- src/ai/backend/manager/config.py | 2 +- src/ai/backend/manager/raft/state_machine.py | 8 ++- src/ai/backend/manager/server.py | 71 +++++++++++--------- 9 files changed, 142 insertions(+), 49 deletions(-) diff --git a/python.lock b/python.lock index 77b6b0406c..d6136274ad 100644 --- a/python.lock +++ b/python.lock @@ -72,8 +72,8 @@ // "python-dateutil>=2.8", // "python-dotenv~=0.20.0", // "python-json-logger>=2.0.1", -// "pyzmq~=25.1.2", -// "raftify==0.1.56", +// "pyzmq~=24.0.1", +// "raftify==0.1.65", // "redis[hiredis]==4.5.5", // "rich~=13.6", // "setproctitle~=1.3.2", @@ -2840,6 +2840,24 @@ "requires_python": null, "version": "0.7.0" }, + { + "artifacts": [ + { + "algorithm": "sha256", + "hash": "cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473", + "url": "https://files.pythonhosted.org/packages/23/7e/5f50d07d5e70a2addbccd90ac2950f81d1edd0783630651d9268d7f1db49/pyasn1-0.6.0-py2.py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", + "url": "https://files.pythonhosted.org/packages/98/ff/fec109ceb715d2a6b4c4a85a61af3b40c723a961e8828319fbcb15b868dc/py-1.11.0.tar.gz" + } + ], + "project_name": "py", + "requires_dists": [], + "requires_python": 
"!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7", + "version": "1.11.0" + }, { "artifacts": [ { @@ -3379,14 +3397,14 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "03a691dcdc67d835d888ab4a32bfd656b66101e9a6e5ab89743138741c0da552", - "url": "https://files.pythonhosted.org/packages/d7/64/eaa29217787fed5b30351225d4111cacfe05224b1180cdc0a9155c7a109f/raftify-0.1.56.tar.gz" + "hash": "71abf9d3e3e829850cba5b013a42c6e9ef2d556585bf224de5aff75a182a93c2", + "url": "https://files.pythonhosted.org/packages/df/e0/49cb08a23f49a232d94fd3a32f534654750930e98cf85d074a9b4be3582b/raftify-0.1.65.tar.gz" } ], "project_name": "raftify", "requires_dists": [], "requires_python": ">=3.10", - "version": "0.1.56" + "version": "0.1.65" }, { "artifacts": [ @@ -3849,13 +3867,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "960a19df2319482918b4a58736d9552cdc1ab65d170ba0bc15273ce0e1922b7a", - "url": "https://files.pythonhosted.org/packages/8a/f0/ab4e1045af86f051ebcb64b964b00b3b52a1c99304f357dd2ea0af3ed1a4/textual-0.52.1-py3-none-any.whl" + "hash": "94aacf28dece20a44f0b94b087e17ff4ac961acd92e12e648f060fe2555b3adc", + "url": "https://files.pythonhosted.org/packages/55/2f/2e0a2c65c460f66f547c5ed3945c0896e9a786a204d0f5a4f24b1ec19612/textual-0.54.0-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "4232e5c2b423ed7c63baaeb6030355e14e1de1b9df096c9655b68a1e60e4de5f", - "url": "https://files.pythonhosted.org/packages/bb/ce/b224ccc05260871da8df640e7cd8ca0a5e38721fddb6733650195402841e/textual-0.52.1.tar.gz" + "hash": "0cfd134dde5ae49d64dd73bb32a2fb5a86d878d9caeacecaa1d640082f31124e", + "url": "https://files.pythonhosted.org/packages/68/2d/d13923f4172751a22d1f41969f6d37bce2ae00be477b62abc6d1d0ebc476/textual-0.54.0.tar.gz" } ], "project_name": "textual", @@ -3867,7 +3885,7 @@ "typing-extensions<5.0.0,>=4.4.0" ], "requires_python": "<4.0,>=3.8", - "version": "0.52.1" + "version": "0.54.0" }, { "artifacts": [ @@ -4650,8 +4668,8 @@ "python-dateutil>=2.8", "python-dotenv~=0.20.0", "python-json-logger>=2.0.1", - "pyzmq~=25.1.2", - "raftify==0.1.56", + "pyzmq~=24.0.1", + "raftify==0.1.65", "redis[hiredis]==4.5.5", "rich~=13.6", "setproctitle~=1.3.2", diff --git a/requirements.txt b/requirements.txt index 30c4336dca..68000f3d14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -95,4 +95,4 @@ backend.ai-krunner-alpine==5.1.0 backend.ai-krunner-static-gnu==4.1.1 etcd-client-py==0.2.4 -raftify==0.1.56 +raftify==0.1.65 diff --git a/src/ai/backend/common/defs.py b/src/ai/backend/common/defs.py index 9dadda4c90..ce4a269b41 100644 --- a/src/ai/backend/common/defs.py +++ b/src/ai/backend/common/defs.py @@ -7,6 +7,7 @@ REDIS_IMAGE_DB: Final = 3 REDIS_STREAM_DB: Final = 4 REDIS_STREAM_LOCK: Final = 5 +REDIS_RAFT_PENDING_JOIN_REQUESTS: Final = 6 DEFAULT_FILE_IO_TIMEOUT: Final = 10 diff --git a/src/ai/backend/manager/api/context.py b/src/ai/backend/manager/api/context.py index 1ace65ffbc..99cc9a4bb7 100644 --- a/src/ai/backend/manager/api/context.py +++ b/src/ai/backend/manager/api/context.py @@ -58,6 +58,7 @@ class RootContext(BaseContext): redis_image: RedisConnectionInfo redis_stream: RedisConnectionInfo redis_lock: RedisConnectionInfo + redis_raft_confchange_requests: RedisConnectionInfo shared_config: SharedConfig local_config: LocalConfig raft_cluster_config: Optional[LocalConfig] diff --git a/src/ai/backend/manager/cli/__main__.py b/src/ai/backend/manager/cli/__main__.py index 4c2ffe1bd7..4223b556a9 100644 --- a/src/ai/backend/manager/cli/__main__.py +++ b/src/ai/backend/manager/cli/__main__.py @@ -12,7 +12,16 @@ import 
click from more_itertools import chunked -from raftify import InitialRole, Peer, Peers, RaftServiceClient, cli_main +from raftify import ( + ConfChangeRequest, + ConfChangeSingle, + ConfChangeType, + InitialRole, + Peer, + Peers, + RaftServiceClient, + cli_main, +) from setproctitle import setproctitle from tabulate import tabulate @@ -397,10 +406,47 @@ async def handle_raft_cli_main(argv: list[str]): await cli_main(argv) +async def handle_apply_pending_confchanges(cli_ctx: CLIContext): + async with redis_ctx(cli_ctx) as redis_conn_set: + if raw_pending_join_request := await redis_conn_set.raft_confchange_requests.client.rpop( + "pending-requests" + ): + pending_join_request = json.loads(raw_pending_join_request) + leader_addr = pending_join_request[0]["leader_addr"] + + addrs = [] + changes = [] + for ticket in pending_join_request: + change = ConfChangeSingle() + change.set_node_id(ticket["reserved_id"]) + change.set_change_type(ConfChangeType.AddNode) + changes.append(change) + addrs.append(ticket["raft_addr"]) + + client = await RaftServiceClient.build(leader_addr) + await client.change_config(ConfChangeRequest(changes=changes, addrs=addrs)) + await redis_conn_set.raft_confchange_requests.client.delete("pending-requests") + + +async def handle_leave_joint(leader_addr: str): + client = await RaftServiceClient.build(leader_addr) + await client.leave_joint() + + @main.command() @click.pass_obj @click.argument("args", nargs=-1, type=click.UNPROCESSED) def raft(cli_ctx: CLIContext, args) -> None: + match next(iter(args), None): + case "apply-pending-confchanges": + print("Applying pending confchanges...") + asyncio.run(handle_apply_pending_confchanges(cli_ctx)) + return + case "leave-joint": + print("Apply leave joint.") + asyncio.run(handle_leave_joint(args[1])) + return + register_custom_deserializer() argv = sys.argv diff --git a/src/ai/backend/manager/cli/context.py b/src/ai/backend/manager/cli/context.py index baded4bd58..294acbe8ce 100644 --- a/src/ai/backend/manager/cli/context.py +++ b/src/ai/backend/manager/cli/context.py @@ -11,7 +11,13 @@ from ai.backend.common import redis_helper from ai.backend.common.config import redis_config_iv -from ai.backend.common.defs import REDIS_IMAGE_DB, REDIS_LIVE_DB, REDIS_STAT_DB, REDIS_STREAM_DB +from ai.backend.common.defs import ( + REDIS_IMAGE_DB, + REDIS_LIVE_DB, + REDIS_RAFT_PENDING_JOIN_REQUESTS, + REDIS_STAT_DB, + REDIS_STREAM_DB, +) from ai.backend.common.etcd import AsyncEtcd, ConfigScopes from ai.backend.common.exception import ConfigurationError from ai.backend.common.logging import AbstractLogger, LocalLogger @@ -111,6 +117,7 @@ class RedisConnectionSet: stat: RedisConnectionInfo image: RedisConnectionInfo stream: RedisConnectionInfo + raft_confchange_requests: RedisConnectionInfo @contextlib.asynccontextmanager @@ -145,13 +152,20 @@ async def redis_ctx(cli_ctx: CLIContext) -> AsyncIterator[RedisConnectionSet]: name="mgr_cli.stream", db=REDIS_STREAM_DB, ) + redis_raft_confchange_requests = redis_helper.get_redis_object( + shared_config.data["redis"], + name="mgr_cli.raft_confchange_requests", # raft configuration change requests + db=REDIS_RAFT_PENDING_JOIN_REQUESTS, + ) yield RedisConnectionSet( live=redis_live, stat=redis_stat, image=redis_image, stream=redis_stream, + raft_confchange_requests=redis_raft_confchange_requests, ) await redis_stream.close() await redis_image.close() await redis_stat.close() await redis_live.close() + await redis_raft_confchange_requests.close() diff --git a/src/ai/backend/manager/config.py 
b/src/ai/backend/manager/config.py index 1bb2c730d1..61ae9b6fea 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -328,7 +328,7 @@ ) manager_raft_cluster_config_iv = t.Dict({ - t.Key("bootstrap-done", default=False): t.ToBool, + t.Key("join-through-peer-addr", default=None): t.String | t.Null, t.Key("raft-debug-webserver-enabled", default=False): t.ToBool, t.Key("restore-wal-from", default=None): t.Int | t.Null, t.Key("restore-wal-snapshot-from", default=None): t.Int | t.Null, diff --git a/src/ai/backend/manager/raft/state_machine.py b/src/ai/backend/manager/raft/state_machine.py index e63b1434f8..2f48bb8a15 100644 --- a/src/ai/backend/manager/raft/state_machine.py +++ b/src/ai/backend/manager/raft/state_machine.py @@ -1,3 +1,4 @@ +import asyncio import pickle from typing import Optional @@ -29,6 +30,7 @@ class HashStore: def __init__(self): self._store = dict() + self._loop = asyncio.get_running_loop() def get(self, key: str) -> Optional[str]: return self._store.get(key) @@ -36,13 +38,13 @@ def get(self, key: str) -> Optional[str]: def as_dict(self) -> dict: return self._store - def apply(self, msg: bytes) -> bytes: + async def apply(self, msg: bytes) -> bytes: message = SetCommand.decode(msg) self._store[message.key] = message.value return msg - def snapshot(self) -> bytes: + async def snapshot(self) -> bytes: return pickle.dumps(self._store) - def restore(self, snapshot: bytes) -> None: + async def restore(self, snapshot: bytes) -> None: self._store = pickle.loads(snapshot) diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 7093fefc49..96ac920b2e 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -5,6 +5,7 @@ import grp import importlib import importlib.resources +import json import logging import os import pwd @@ -33,7 +34,7 @@ import click from aiohttp import web from aiotools import process_index -from raftify import ClusterJoinTicket, InitialRole, Peer, Peers, Raft, RaftServiceClient +from raftify import ClusterJoinTicket, InitialRole, Peer, Peers, Raft from raftify import Config as RaftConfig from raftify import RaftConfig as RaftCoreConfig from setproctitle import setproctitle @@ -45,6 +46,7 @@ from ai.backend.common.defs import ( REDIS_IMAGE_DB, REDIS_LIVE_DB, + REDIS_RAFT_PENDING_JOIN_REQUESTS, REDIS_STAT_DB, REDIS_STREAM_DB, REDIS_STREAM_LOCK, @@ -348,8 +350,6 @@ async def manager_status_ctx(root_ctx: RootContext) -> AsyncIterator[None]: @actxmgr async def redis_ctx(root_ctx: RootContext) -> AsyncIterator[None]: - root_ctx.shared_config.data["redis"] - root_ctx.redis_live = redis_helper.get_redis_object( root_ctx.shared_config.data["redis"], name="live", # tracking live status of various entities @@ -375,12 +375,18 @@ async def redis_ctx(root_ctx: RootContext) -> AsyncIterator[None]: name="lock", # distributed locks db=REDIS_STREAM_LOCK, ) + root_ctx.redis_raft_confchange_requests = redis_helper.get_redis_object( + root_ctx.shared_config.data["redis"], + name="raft_confchange_requests", # raft configuration change requests + db=REDIS_RAFT_PENDING_JOIN_REQUESTS, + ) for redis_info in ( root_ctx.redis_live, root_ctx.redis_stat, root_ctx.redis_image, root_ctx.redis_stream, root_ctx.redis_lock, + root_ctx.redis_raft_confchange_requests, ): await redis_helper.ping_redis_connection(redis_info.client) yield @@ -389,6 +395,7 @@ async def redis_ctx(root_ctx: RootContext) -> AsyncIterator[None]: await root_ctx.redis_stat.close() await root_ctx.redis_live.close() await 
root_ctx.redis_lock.close() + await root_ctx.redis_raft_confchange_requests.close() @actxmgr @@ -723,6 +730,37 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: logging.getLogger(f"{__spec__.name}.raft.node-{node_id}"), # type: ignore ) + if peer_addr := raft_cluster_configs["join-through-peer-addr"]: + # Join the cluster through the peer dynamically + + # TODO: Find leader_id by asking for leader_id to someone in initial_peers + leader_addr = peer_addr + + all_tickets = [ + ClusterJoinTicket( + peer["node-id"], + f"{peer['host']}:{peer['port']}", + leader_addr, + initial_peers, + ).to_dict() + for peer in all_peers + ] + + raft_logger.info("Cluster join request made to the redis queue.") + + await root_ctx.redis_raft_confchange_requests.client.rpush( + "pending-requests", json.dumps(all_tickets) + ) + + while True: + print("Waiting for the join request to be processed...") + await asyncio.sleep(1) + if ( + await root_ctx.redis_raft_confchange_requests.client.llen("pending-requests") + == 0 + ): + break + root_ctx.raft_ctx.cluster = Raft.bootstrap( node_id, raft_addr, @@ -733,33 +771,6 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: raft_cluster = root_ctx.raft_ctx.cluster raft_cluster.run() # type: ignore - if raft_cluster_configs["bootstrap-done"]: - # First follower manager execute join procedure - if node_id - node_id_offset == 1: - # TODO: Find leader_id by asking for leader_id to someone in initial_peers - leader_id, leader_addr = [ - (id_, peer.get_addr()) - for id_, peer in initial_peers.to_dict().items() - if peer.get_role() == InitialRole.LEADER - ][0] - - all_tickets = [ - ClusterJoinTicket( - peer["node-id"], - f"{peer['host']}:{peer['port']}", - leader_id, - leader_addr, - initial_peers, - ) - for peer in all_peers - ] - - await root_ctx.raft_ctx.cluster.join(all_tickets) - # TODO: Find a way to automatically close the leave_joint if possible - await asyncio.sleep(2) - client = await RaftServiceClient.build(leader_addr) - await client.leave_joint() - if raft_cluster_configs["raft-debug-webserver-enabled"]: # Create webserver only for raft testing asyncio.create_task( From 0948d0954e944e9d9aa3479a50346ce40986a7bf Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:15:51 +0000 Subject: [PATCH 11/16] Revert dynamic node attach --- src/ai/backend/common/defs.py | 1 - src/ai/backend/manager/api/context.py | 1 - src/ai/backend/manager/cli/__main__.py | 40 --------------------- src/ai/backend/manager/cli/context.py | 9 ----- src/ai/backend/manager/config.py | 3 +- src/ai/backend/manager/server.py | 50 +++----------------------- 6 files changed, 6 insertions(+), 98 deletions(-) diff --git a/src/ai/backend/common/defs.py b/src/ai/backend/common/defs.py index ce4a269b41..9dadda4c90 100644 --- a/src/ai/backend/common/defs.py +++ b/src/ai/backend/common/defs.py @@ -7,7 +7,6 @@ REDIS_IMAGE_DB: Final = 3 REDIS_STREAM_DB: Final = 4 REDIS_STREAM_LOCK: Final = 5 -REDIS_RAFT_PENDING_JOIN_REQUESTS: Final = 6 DEFAULT_FILE_IO_TIMEOUT: Final = 10 diff --git a/src/ai/backend/manager/api/context.py b/src/ai/backend/manager/api/context.py index 99cc9a4bb7..1ace65ffbc 100644 --- a/src/ai/backend/manager/api/context.py +++ b/src/ai/backend/manager/api/context.py @@ -58,7 +58,6 @@ class RootContext(BaseContext): redis_image: RedisConnectionInfo redis_stream: RedisConnectionInfo redis_lock: RedisConnectionInfo - redis_raft_confchange_requests: RedisConnectionInfo shared_config: SharedConfig local_config: LocalConfig raft_cluster_config: 
Optional[LocalConfig] diff --git a/src/ai/backend/manager/cli/__main__.py b/src/ai/backend/manager/cli/__main__.py index 4223b556a9..9acaa8a891 100644 --- a/src/ai/backend/manager/cli/__main__.py +++ b/src/ai/backend/manager/cli/__main__.py @@ -13,9 +13,6 @@ import click from more_itertools import chunked from raftify import ( - ConfChangeRequest, - ConfChangeSingle, - ConfChangeType, InitialRole, Peer, Peers, @@ -406,47 +403,10 @@ async def handle_raft_cli_main(argv: list[str]): await cli_main(argv) -async def handle_apply_pending_confchanges(cli_ctx: CLIContext): - async with redis_ctx(cli_ctx) as redis_conn_set: - if raw_pending_join_request := await redis_conn_set.raft_confchange_requests.client.rpop( - "pending-requests" - ): - pending_join_request = json.loads(raw_pending_join_request) - leader_addr = pending_join_request[0]["leader_addr"] - - addrs = [] - changes = [] - for ticket in pending_join_request: - change = ConfChangeSingle() - change.set_node_id(ticket["reserved_id"]) - change.set_change_type(ConfChangeType.AddNode) - changes.append(change) - addrs.append(ticket["raft_addr"]) - - client = await RaftServiceClient.build(leader_addr) - await client.change_config(ConfChangeRequest(changes=changes, addrs=addrs)) - await redis_conn_set.raft_confchange_requests.client.delete("pending-requests") - - -async def handle_leave_joint(leader_addr: str): - client = await RaftServiceClient.build(leader_addr) - await client.leave_joint() - - @main.command() @click.pass_obj @click.argument("args", nargs=-1, type=click.UNPROCESSED) def raft(cli_ctx: CLIContext, args) -> None: - match next(iter(args), None): - case "apply-pending-confchanges": - print("Applying pending confchanges...") - asyncio.run(handle_apply_pending_confchanges(cli_ctx)) - return - case "leave-joint": - print("Apply leave joint.") - asyncio.run(handle_leave_joint(args[1])) - return - register_custom_deserializer() argv = sys.argv diff --git a/src/ai/backend/manager/cli/context.py b/src/ai/backend/manager/cli/context.py index 294acbe8ce..6544a2c186 100644 --- a/src/ai/backend/manager/cli/context.py +++ b/src/ai/backend/manager/cli/context.py @@ -14,7 +14,6 @@ from ai.backend.common.defs import ( REDIS_IMAGE_DB, REDIS_LIVE_DB, - REDIS_RAFT_PENDING_JOIN_REQUESTS, REDIS_STAT_DB, REDIS_STREAM_DB, ) @@ -117,7 +116,6 @@ class RedisConnectionSet: stat: RedisConnectionInfo image: RedisConnectionInfo stream: RedisConnectionInfo - raft_confchange_requests: RedisConnectionInfo @contextlib.asynccontextmanager @@ -152,20 +150,13 @@ async def redis_ctx(cli_ctx: CLIContext) -> AsyncIterator[RedisConnectionSet]: name="mgr_cli.stream", db=REDIS_STREAM_DB, ) - redis_raft_confchange_requests = redis_helper.get_redis_object( - shared_config.data["redis"], - name="mgr_cli.raft_confchange_requests", # raft configuration change requests - db=REDIS_RAFT_PENDING_JOIN_REQUESTS, - ) yield RedisConnectionSet( live=redis_live, stat=redis_stat, image=redis_image, stream=redis_stream, - raft_confchange_requests=redis_raft_confchange_requests, ) await redis_stream.close() await redis_image.close() await redis_stat.close() await redis_live.close() - await redis_raft_confchange_requests.close() diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index 61ae9b6fea..44bed248c2 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -328,10 +328,9 @@ ) manager_raft_cluster_config_iv = t.Dict({ - t.Key("join-through-peer-addr", default=None): t.String | t.Null, - t.Key("raft-debug-webserver-enabled", 
default=False): t.ToBool, t.Key("restore-wal-from", default=None): t.Int | t.Null, t.Key("restore-wal-snapshot-from", default=None): t.Int | t.Null, + t.Key("raft-debug-webserver-enabled", default=False): t.ToBool, t.Key("peers"): t.Dict({ t.Key("myself"): t.List( t.Dict({ diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 96ac920b2e..50aab2f3ca 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -5,7 +5,6 @@ import grp import importlib import importlib.resources -import json import logging import os import pwd @@ -34,8 +33,8 @@ import click from aiohttp import web from aiotools import process_index -from raftify import ClusterJoinTicket, InitialRole, Peer, Peers, Raft from raftify import Config as RaftConfig +from raftify import InitialRole, Peer, Peers, Raft from raftify import RaftConfig as RaftCoreConfig from setproctitle import setproctitle @@ -46,7 +45,6 @@ from ai.backend.common.defs import ( REDIS_IMAGE_DB, REDIS_LIVE_DB, - REDIS_RAFT_PENDING_JOIN_REQUESTS, REDIS_STAT_DB, REDIS_STREAM_DB, REDIS_STREAM_LOCK, @@ -375,18 +373,12 @@ async def redis_ctx(root_ctx: RootContext) -> AsyncIterator[None]: name="lock", # distributed locks db=REDIS_STREAM_LOCK, ) - root_ctx.redis_raft_confchange_requests = redis_helper.get_redis_object( - root_ctx.shared_config.data["redis"], - name="raft_confchange_requests", # raft configuration change requests - db=REDIS_RAFT_PENDING_JOIN_REQUESTS, - ) for redis_info in ( root_ctx.redis_live, root_ctx.redis_stat, root_ctx.redis_image, root_ctx.redis_stream, root_ctx.redis_lock, - root_ctx.redis_raft_confchange_requests, ): await redis_helper.ping_redis_connection(redis_info.client) yield @@ -395,7 +387,6 @@ async def redis_ctx(root_ctx: RootContext) -> AsyncIterator[None]: await root_ctx.redis_stat.close() await root_ctx.redis_live.close() await root_ctx.redis_lock.close() - await root_ctx.redis_raft_confchange_requests.close() @actxmgr @@ -730,37 +721,6 @@ async def raft_ctx(root_ctx: RootContext) -> AsyncIterator[None]: logging.getLogger(f"{__spec__.name}.raft.node-{node_id}"), # type: ignore ) - if peer_addr := raft_cluster_configs["join-through-peer-addr"]: - # Join the cluster through the peer dynamically - - # TODO: Find leader_id by asking for leader_id to someone in initial_peers - leader_addr = peer_addr - - all_tickets = [ - ClusterJoinTicket( - peer["node-id"], - f"{peer['host']}:{peer['port']}", - leader_addr, - initial_peers, - ).to_dict() - for peer in all_peers - ] - - raft_logger.info("Cluster join request made to the redis queue.") - - await root_ctx.redis_raft_confchange_requests.client.rpush( - "pending-requests", json.dumps(all_tickets) - ) - - while True: - print("Waiting for the join request to be processed...") - await asyncio.sleep(1) - if ( - await root_ctx.redis_raft_confchange_requests.client.llen("pending-requests") - == 0 - ): - break - root_ctx.raft_ctx.cluster = Raft.bootstrap( node_id, raft_addr, @@ -1142,11 +1102,11 @@ def main( log.info("runtime: {0}", env_info()) log_config = logging.getLogger("ai.backend.manager.config") log_config.debug("debug mode enabled.") - if cfg["manager"]["event-loop"] == "uvloop": - import uvloop - uvloop.install() - log.info("Using uvloop as the event loop backend") + import uvloop + + uvloop.install() + log.info("Using uvloop as the event loop backend") try: aiotools.start_server( server_main_logwrapper, From 703f4a17b8430718c7aeb21910d252589002fcb7 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:16:29 
+0000 Subject: [PATCH 12/16] Update gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6fbad845b2..866e127f6f 100644 --- a/.gitignore +++ b/.gitignore @@ -144,4 +144,4 @@ docs/manager/rest-reference/openapi.json /INSTALL-INFO # Raft cluster config -raft-cluster-config.toml \ No newline at end of file +raft-cluster-config.toml From e99d50ee7f30bbe71a88c5da683ecf833c83b178 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:18:58 +0000 Subject: [PATCH 13/16] Delete unrelated changes --- src/ai/backend/manager/server.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 50aab2f3ca..887defd553 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -1103,10 +1103,11 @@ def main( log_config = logging.getLogger("ai.backend.manager.config") log_config.debug("debug mode enabled.") - import uvloop + if cfg["manager"]["event-loop"] == "uvloop": + import uvloop - uvloop.install() - log.info("Using uvloop as the event loop backend") + uvloop.install() + log.info("Using uvloop as the event loop backend") try: aiotools.start_server( server_main_logwrapper, From bced16b546a089270ae45ec8750bf79b9dff5771 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 07:42:30 +0000 Subject: [PATCH 14/16] Fix stale logic in mgr status command --- src/ai/backend/manager/cli/__main__.py | 9 ++++++++- src/ai/backend/manager/cli/context.py | 21 ++++++++++++++++++++- src/ai/backend/manager/config.py | 4 ++-- src/ai/backend/manager/server.py | 2 +- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/ai/backend/manager/cli/__main__.py b/src/ai/backend/manager/cli/__main__.py index 9acaa8a891..07da586ded 100644 --- a/src/ai/backend/manager/cli/__main__.py +++ b/src/ai/backend/manager/cli/__main__.py @@ -343,12 +343,19 @@ async def inspect_node_status(cli_ctx: CLIContext) -> None: headers = ["ENDPOINT", "NODE ID", "IS LEADER", "RAFT TERM", "RAFT APPLIED INDEX"] if raft_configs is not None: + raft_cluster_configs = cli_ctx.raft_cluster_config + assert raft_cluster_configs is not None + + other_peers = [{**peer, "myself": False} for peer in raft_cluster_configs["peers"]["other"]] + my_peers = [{**peer, "myself": True} for peer in raft_cluster_configs["peers"]["myself"]] + all_peers = sorted([*other_peers, *my_peers], key=lambda x: x["node-id"]) + initial_peers = Peers({ int(peer_config["node-id"]): Peer( addr=f"{peer_config['host']}:{peer_config['port']}", role=InitialRole.from_str(peer_config["role"]), ) - for peer_config in raft_configs["peers"] + for peer_config in all_peers }) peers: dict[str, Any] | None = None diff --git a/src/ai/backend/manager/cli/context.py b/src/ai/backend/manager/cli/context.py index 6544a2c186..b249ad5838 100644 --- a/src/ai/backend/manager/cli/context.py +++ b/src/ai/backend/manager/cli/context.py @@ -22,18 +22,20 @@ from ai.backend.common.logging import AbstractLogger, LocalLogger from ai.backend.common.types import LogSeverity, RedisConnectionInfo -from ..config import LocalConfig, SharedConfig +from ..config import LocalConfig, SharedConfig, load_raft_cluster_config from ..config import load as load_config class CLIContext: _local_config: LocalConfig | None + _raft_cluster_config: LocalConfig | None _logger: AbstractLogger def __init__(self, config_path: Path, log_level: LogSeverity) -> None: self.config_path = config_path self.log_level = log_level 
self._local_config = None + self._raft_cluster_config = None @property def local_config(self) -> LocalConfig: @@ -50,6 +52,23 @@ def local_config(self) -> LocalConfig: raise click.Abort() return self._local_config + @property + def raft_cluster_config(self) -> LocalConfig | None: + # Lazy-load the configuration only when requested. + try: + if self._raft_cluster_config is None: + self._raft_cluster_config = load_raft_cluster_config( + self.config_path, self.log_level + ) + except ConfigurationError as e: + print( + "ConfigurationError: Could not read or validate the manager raft cluster config:", + file=sys.stderr, + ) + print(pformat(e.invalid_data), file=sys.stderr) + raise click.Abort() + return self._raft_cluster_config + def __enter__(self) -> Self: # The "start-server" command is injected by ai.backend.cli from the entrypoint # and it has its own multi-process-aware logging initialization. diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index 44bed248c2..10dfe13157 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -615,8 +615,8 @@ def load( def load_raft_cluster_config( - debug_enabled: bool = False, raft_cluster_config_path: Optional[Path] = None, + log_level: LogSeverity = LogSeverity.INFO, ) -> Optional[LocalConfig]: try: raw_cfg, _ = config.read_from_file(raft_cluster_config_path, "raft-cluster-config") @@ -625,7 +625,7 @@ def load_raft_cluster_config( try: cfg = config.check(raw_cfg, manager_raft_cluster_config_iv) - if debug_enabled: + if log_level == LogSeverity.DEBUG: print("== Raft cluster configuration ==", file=sys.stderr) print(pformat(cfg), file=sys.stderr) except config.ConfigurationError as e: diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index 887defd553..c642fac1df 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -1086,7 +1086,7 @@ def main( Start the manager service as a foreground process. 
""" cfg = load_config(config_path, LogSeverity.DEBUG if debug else log_level) - raft_cluster_cfg = load_raft_cluster_config(debug, raft_cluster_config_path) + raft_cluster_cfg = load_raft_cluster_config(raft_cluster_config_path, log_level) if ctx.invoked_subcommand is None: cfg["manager"]["pid-file"].write_text(str(os.getpid())) From dc3e51f53a3abd99e6dc108d7c2066835461a5e3 Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 08:00:02 +0000 Subject: [PATCH 15/16] Update python.lock --- python.lock | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/python.lock b/python.lock index d6136274ad..fa08a5d679 100644 --- a/python.lock +++ b/python.lock @@ -72,7 +72,7 @@ // "python-dateutil>=2.8", // "python-dotenv~=0.20.0", // "python-json-logger>=2.0.1", -// "pyzmq~=24.0.1", +// "pyzmq~=25.1.2", // "raftify==0.1.65", // "redis[hiredis]==4.5.5", // "rich~=13.6", @@ -2840,24 +2840,6 @@ "requires_python": null, "version": "0.7.0" }, - { - "artifacts": [ - { - "algorithm": "sha256", - "hash": "cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473", - "url": "https://files.pythonhosted.org/packages/23/7e/5f50d07d5e70a2addbccd90ac2950f81d1edd0783630651d9268d7f1db49/pyasn1-0.6.0-py2.py3-none-any.whl" - }, - { - "algorithm": "sha256", - "hash": "51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", - "url": "https://files.pythonhosted.org/packages/98/ff/fec109ceb715d2a6b4c4a85a61af3b40c723a961e8828319fbcb15b868dc/py-1.11.0.tar.gz" - } - ], - "project_name": "py", - "requires_dists": [], - "requires_python": "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7", - "version": "1.11.0" - }, { "artifacts": [ { @@ -3867,13 +3849,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "94aacf28dece20a44f0b94b087e17ff4ac961acd92e12e648f060fe2555b3adc", - "url": "https://files.pythonhosted.org/packages/55/2f/2e0a2c65c460f66f547c5ed3945c0896e9a786a204d0f5a4f24b1ec19612/textual-0.54.0-py3-none-any.whl" + "hash": "960a19df2319482918b4a58736d9552cdc1ab65d170ba0bc15273ce0e1922b7a", + "url": "https://files.pythonhosted.org/packages/8a/f0/ab4e1045af86f051ebcb64b964b00b3b52a1c99304f357dd2ea0af3ed1a4/textual-0.52.1-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "0cfd134dde5ae49d64dd73bb32a2fb5a86d878d9caeacecaa1d640082f31124e", - "url": "https://files.pythonhosted.org/packages/68/2d/d13923f4172751a22d1f41969f6d37bce2ae00be477b62abc6d1d0ebc476/textual-0.54.0.tar.gz" + "hash": "4232e5c2b423ed7c63baaeb6030355e14e1de1b9df096c9655b68a1e60e4de5f", + "url": "https://files.pythonhosted.org/packages/bb/ce/b224ccc05260871da8df640e7cd8ca0a5e38721fddb6733650195402841e/textual-0.52.1.tar.gz" } ], "project_name": "textual", @@ -3885,7 +3867,7 @@ "typing-extensions<5.0.0,>=4.4.0" ], "requires_python": "<4.0,>=3.8", - "version": "0.54.0" + "version": "0.52.1" }, { "artifacts": [ @@ -4668,7 +4650,7 @@ "python-dateutil>=2.8", "python-dotenv~=0.20.0", "python-json-logger>=2.0.1", - "pyzmq~=24.0.1", + "pyzmq~=25.1.2", "raftify==0.1.65", "redis[hiredis]==4.5.5", "rich~=13.6", From 341b8871192c5b965eadd7b99870329850a4ed5c Mon Sep 17 00:00:00 2001 From: Gyubong Lee Date: Mon, 1 Apr 2024 17:00:40 +0900 Subject: [PATCH 16/16] Delete configs/manager/raft-cluster-config.toml --- configs/manager/raft-cluster-config.toml | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 configs/manager/raft-cluster-config.toml diff --git a/configs/manager/raft-cluster-config.toml b/configs/manager/raft-cluster-config.toml deleted file mode 100644 index 
63cdecf860..0000000000
--- a/configs/manager/raft-cluster-config.toml
+++ /dev/null
@@ -1,20 +0,0 @@
-#restore-wal-from = 1
-#restore-wal-snapshot-from = 1
-
-[[peers.myself]]
-host = "192.168.0.37"
-port = 60151
-node-id = 1
-role = "voter"
-
-[[peers.myself]]
-host = "192.168.0.37"
-port = 60152
-node-id = 2
-role = "voter"
-
-[[peers.myself]]
-host = "192.168.0.37"
-port = 60153
-node-id = 3
-role = "voter"