Skip to content

Commit

Permalink
api: better startup failure UX (#881)
Browse files Browse the repository at this point in the history
  • Loading branch information
AlpinDale authored Dec 12, 2024
1 parent db6a50f commit ce6e3d6
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 20 deletions.
24 changes: 18 additions & 6 deletions aphrodite/endpoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from contextlib import asynccontextmanager
from distutils.util import strtobool
from http import HTTPStatus
from typing import AsyncGenerator, AsyncIterator, List, Set, Tuple
from typing import AsyncGenerator, AsyncIterator, List, Optional, Set, Tuple

from fastapi import APIRouter, FastAPI, Request
from fastapi.exceptions import RequestValidationError
Expand Down Expand Up @@ -108,7 +108,14 @@ async def _force_log():


@asynccontextmanager
async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]:
async def build_async_engine_client(
args: Namespace) -> AsyncIterator[Optional[AsyncEngineClient]]:
"""
Create AsyncEngineClient, either:
- in-process using the AsyncAphrodite Directly
- multiprocess using AsyncAphrodite RPC
Returns the Client or None if the creation failed.
"""
# Context manager to handle async_engine_client lifecycle
# Ensures everything is shutdown and cleaned up on error/exit
global engine_args
Expand Down Expand Up @@ -168,11 +175,13 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]:
try:
await async_engine_client.setup()
break
except TimeoutError as e:
except TimeoutError:
if not rpc_server_process.is_alive():
raise RuntimeError(
"The server process died before "
"responding to the readiness probe") from e
logger.error(
"RPCServer process died before responding "
"to readiness probe")
yield None
return
yield async_engine_client
finally:
# Ensure rpc server process was terminated
Expand Down Expand Up @@ -759,6 +768,9 @@ async def init_app(
async def run_server(args, **uvicorn_kwargs) -> None:

async with build_async_engine_client(args) as async_engine_client:
# If None, creation of the client failed and we exit.
if async_engine_client is None:
return
app = await init_app(async_engine_client, args)

protocol = "https" if args.ssl_certfile else "http"
Expand Down
2 changes: 1 addition & 1 deletion aphrodite/server/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ async def dummy_shutdown() -> None:
port = uvicorn_kwargs["port"]
process = find_process_using_port(port)
if process is not None:
logger.debug(
logger.info(
f"port {port} is used by process {process} launched with "
f"command:\n{' '.join(process.cmdline())}")
logger.info("Gracefully stopping http server")
Expand Down
28 changes: 15 additions & 13 deletions tests/endpoints/openai/test_mp_api_server.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import time

import pytest

from aphrodite.common.utils import FlexibleArgumentParser
Expand All @@ -8,19 +10,19 @@
@pytest.mark.asyncio
async def test_mp_crash_detection():

with pytest.raises(RuntimeError) as excinfo:
parser = FlexibleArgumentParser(
description="Aphrodite's remote OpenAI server.")
parser = make_arg_parser(parser)
args = parser.parse_args([])
# use an invalid tensor_parallel_size to trigger the
# error in the server
args.tensor_parallel_size = 65536

async with build_async_engine_client(args):
pass
assert "The server process died before responding to the readiness probe"\
in str(excinfo.value)
parser = FlexibleArgumentParser(
description="Aphrodite's remote OpenAI server.")
parser = make_arg_parser(parser)
args = parser.parse_args([])
# use an invalid tensor_parallel_size to trigger the
# error in the server
args.tensor_parallel_size = 65536
start = time.perf_counter()
async with build_async_engine_client(args):
pass
end = time.perf_counter()
assert end - start < 60, ("Expected Aphrodite to gracefully shutdown in "
"<60s if there is an error in the startup.")


@pytest.mark.asyncio
Expand Down

0 comments on commit ce6e3d6

Please sign in to comment.