Merged
61 changes: 43 additions & 18 deletions fastdeploy/entrypoints/openai/api_server.py
@@ -22,13 +22,13 @@
 import traceback
 from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from multiprocessing import current_process

 import uvicorn
 import zmq
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse, Response, StreamingResponse
+from gunicorn.app.base import BaseApplication
 from opentelemetry import trace
 from prometheus_client import CONTENT_TYPE_LATEST

@@ -87,6 +87,21 @@
 llm_engine = None


+class StandaloneApplication(BaseApplication):
+    def __init__(self, app, options=None):
+        self.application = app
+        self.options = options or {}
+        super().__init__()
+
+    def load_config(self):
+        config = {key: value for key, value in self.options.items() if key in self.cfg.settings and value is not None}
+        for key, value in config.items():
+            self.cfg.set(key.lower(), value)
+
+    def load(self):
+        return self.application
+
+
 def load_engine():
     """
     load engine
@@ -95,10 +110,10 @@ def load_engine():
     if llm_engine is not None:
         return llm_engine

-    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
+    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}")
     engine_args = EngineArgs.from_cli_args(args)
     engine = LLMEngine.from_engine_args(engine_args)
-    if not engine.start(api_server_pid=os.getpid()):
+    if not engine.start(api_server_pid=args.port):
         api_server_logger.error("Failed to initialize FastDeploy LLM engine, service exit now!")
         return None

@@ -119,12 +134,12 @@ def load_data_service():
     global llm_engine
     if llm_engine is not None:
         return llm_engine
-    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
+    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}")
     engine_args = EngineArgs.from_cli_args(args)
     config = engine_args.create_engine_config()
     api_server_logger.info(f"local_data_parallel_id: {config.parallel_config}")
     expert_service = ExpertService(config, config.parallel_config.local_data_parallel_id)
-    if not expert_service.start(os.getpid(), config.parallel_config.local_data_parallel_id):
+    if not expert_service.start(args.port, config.parallel_config.local_data_parallel_id):
         api_server_logger.error("Failed to initialize FastDeploy LLM expert service, service exit now!")
         return None
     llm_engine = expert_service
@@ -136,13 +151,22 @@ async def lifespan(app: FastAPI):
"""
async context manager for FastAPI lifespan
"""
import logging

uvicorn_access = logging.getLogger("uvicorn.access")
uvicorn_access.handlers.clear()

# 使用 gunicorn 的格式
formatter = logging.Formatter("[%(asctime)s] [%(process)d] [INFO] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")

handler = logging.StreamHandler()
handler.setFormatter(formatter)
uvicorn_access.addHandler(handler)
uvicorn_access.propagate = False

if args.tokenizer is None:
args.tokenizer = args.model
if current_process().name != "MainProcess":
pid = os.getppid()
else:
pid = os.getpid()
pid = args.port
api_server_logger.info(f"{pid}")

if args.served_model_name is not None:
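
The lifespan hunk above swaps uvicorn's default access-log handlers for one that mimics gunicorn's `[timestamp] [pid] [INFO] message` layout, so output from the master process and the uvicorn workers looks uniform. A minimal standalone sketch of the same idea (plain `logging` only; the sample request line is made up for illustration):

    import logging

    # "uvicorn.access" is the logger uvicorn emits access lines on.
    access = logging.getLogger("uvicorn.access")
    access.handlers.clear()
    access.setLevel(logging.INFO)  # make sure INFO records are emitted

    # gunicorn-style layout: [timestamp] [pid] [INFO] message
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            "[%(asctime)s] [%(process)d] [INFO] %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    access.addHandler(handler)
    access.propagate = False  # keep the root logger from printing a second copy

    access.info('127.0.0.1 - "GET /healthz HTTP/1.1" 200')
    # -> [2026-02-03 10:00:00] [12345] [INFO] 127.0.0.1 - "GET /healthz HTTP/1.1" 200

Setting `propagate = False` matters with multiple workers: without it, any handler on the root logger would duplicate every access line.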
@@ -449,16 +473,17 @@ def launch_api_server() -> None:
api_server_logger.info(f"args: {args.__dict__}")
fd_start_span("FD_START")

options = {
"bind": f"{args.host}:{args.port}",
"workers": args.workers,
"worker_class": "uvicorn.workers.UvicornWorker",
"loglevel": "info",
"log_config": UVICORN_CONFIG,
"timeout_graceful_shutdown": args.timeout_graceful_shutdown,
}

try:
uvicorn.run(
app="fastdeploy.entrypoints.openai.api_server:app",
host=args.host,
port=args.port,
workers=args.workers,
log_config=UVICORN_CONFIG,
log_level="info",
timeout_graceful_shutdown=args.timeout_graceful_shutdown,
) # set log level to error to avoid log
StandaloneApplication(app, options).run()
except Exception as e:
api_server_logger.error(f"launch sync http server error, {e}, {str(traceback.format_exc())}")

1 change: 1 addition & 0 deletions requirements.txt
@@ -30,6 +30,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 modelscope
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
1 change: 1 addition & 0 deletions requirements_dcu.txt
@@ -28,6 +28,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
 opentelemetry-instrumentation-redis
1 change: 1 addition & 0 deletions requirements_iluvatar.txt
@@ -29,6 +29,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
 opentelemetry-instrumentation-redis
1 change: 1 addition & 0 deletions requirements_metaxgpu.txt
@@ -30,6 +30,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 modelscope
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0