Skip to content

Commit d8841b7

Browse files
ltd0924 and ltd0924 authored
[BugFix] fix workers=1 (#4364)
* [Feature] support prefix cache in DP * fix * Update common_engine.py * Update common_engine.py * Update common_engine.py * Update common_engine.py * [BugFix] fix workers more than 1 * fix * Update api_server.py * fix * Update api_server.py * fix --------- Co-authored-by: ltd0924 <[email protected]>
1 parent bcaa98f commit d8841b7

File tree

5 files changed

+47
-18
lines changed

5 files changed

+47
-18
lines changed

fastdeploy/entrypoints/openai/api_server.py

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222
import traceback
2323
from collections.abc import AsyncGenerator
2424
from contextlib import asynccontextmanager
25-
from multiprocessing import current_process
2625

2726
import uvicorn
2827
import zmq
2928
from fastapi import FastAPI, HTTPException, Request
3029
from fastapi.exceptions import RequestValidationError
3130
from fastapi.responses import JSONResponse, Response, StreamingResponse
31+
from gunicorn.app.base import BaseApplication
3232
from opentelemetry import trace
3333
from prometheus_client import CONTENT_TYPE_LATEST
3434

@@ -87,6 +87,21 @@
8787
llm_engine = None
8888

8989

90+
class StandaloneApplication(BaseApplication):
91+
def __init__(self, app, options=None):
92+
self.application = app
93+
self.options = options or {}
94+
super().__init__()
95+
96+
def load_config(self):
97+
config = {key: value for key, value in self.options.items() if key in self.cfg.settings and value is not None}
98+
for key, value in config.items():
99+
self.cfg.set(key.lower(), value)
100+
101+
def load(self):
102+
return self.application
103+
104+
90105
def load_engine():
91106
"""
92107
load engine
@@ -95,10 +110,10 @@ def load_engine():
95110
if llm_engine is not None:
96111
return llm_engine
97112

98-
api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
113+
api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}")
99114
engine_args = EngineArgs.from_cli_args(args)
100115
engine = LLMEngine.from_engine_args(engine_args)
101-
if not engine.start(api_server_pid=os.getpid()):
116+
if not engine.start(api_server_pid=args.port):
102117
api_server_logger.error("Failed to initialize FastDeploy LLM engine, service exit now!")
103118
return None
104119

@@ -119,12 +134,12 @@ def load_data_service():
119134
global llm_engine
120135
if llm_engine is not None:
121136
return llm_engine
122-
api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
137+
api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}")
123138
engine_args = EngineArgs.from_cli_args(args)
124139
config = engine_args.create_engine_config()
125140
api_server_logger.info(f"local_data_parallel_id: {config.parallel_config}")
126141
expert_service = ExpertService(config, config.parallel_config.local_data_parallel_id)
127-
if not expert_service.start(os.getpid(), config.parallel_config.local_data_parallel_id):
142+
if not expert_service.start(args.port, config.parallel_config.local_data_parallel_id):
128143
api_server_logger.error("Failed to initialize FastDeploy LLM expert service, service exit now!")
129144
return None
130145
llm_engine = expert_service
@@ -136,13 +151,22 @@ async def lifespan(app: FastAPI):
136151
"""
137152
async context manager for FastAPI lifespan
138153
"""
154+
import logging
155+
156+
uvicorn_access = logging.getLogger("uvicorn.access")
157+
uvicorn_access.handlers.clear()
158+
159+
# 使用 gunicorn 的格式
160+
formatter = logging.Formatter("[%(asctime)s] [%(process)d] [INFO] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
161+
162+
handler = logging.StreamHandler()
163+
handler.setFormatter(formatter)
164+
uvicorn_access.addHandler(handler)
165+
uvicorn_access.propagate = False
139166

140167
if args.tokenizer is None:
141168
args.tokenizer = args.model
142-
if current_process().name != "MainProcess":
143-
pid = os.getppid()
144-
else:
145-
pid = os.getpid()
169+
pid = args.port
146170
api_server_logger.info(f"{pid}")
147171

148172
if args.served_model_name is not None:
@@ -449,16 +473,17 @@ def launch_api_server() -> None:
449473
api_server_logger.info(f"args: {args.__dict__}")
450474
fd_start_span("FD_START")
451475

476+
options = {
477+
"bind": f"{args.host}:{args.port}",
478+
"workers": args.workers,
479+
"worker_class": "uvicorn.workers.UvicornWorker",
480+
"loglevel": "info",
481+
"log_config": UVICORN_CONFIG,
482+
"timeout_graceful_shutdown": args.timeout_graceful_shutdown,
483+
}
484+
452485
try:
453-
uvicorn.run(
454-
app="fastdeploy.entrypoints.openai.api_server:app",
455-
host=args.host,
456-
port=args.port,
457-
workers=args.workers,
458-
log_config=UVICORN_CONFIG,
459-
log_level="info",
460-
timeout_graceful_shutdown=args.timeout_graceful_shutdown,
461-
) # set log level to error to avoid log
486+
StandaloneApplication(app, options).run()
462487
except Exception as e:
463488
api_server_logger.error(f"launch sync http server error, {e}, {str(traceback.format_exc())}")
464489

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use-triton-in-paddle
3030
crcmod
3131
fastsafetensors==0.1.14
3232
msgpack
33+
gunicorn
3334
modelscope
3435
opentelemetry-api>=1.24.0
3536
opentelemetry-sdk>=1.24.0

requirements_dcu.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ use-triton-in-paddle
2828
crcmod
2929
fastsafetensors==0.1.14
3030
msgpack
31+
gunicorn
3132
opentelemetry-api>=1.24.0
3233
opentelemetry-sdk>=1.24.0
3334
opentelemetry-instrumentation-redis

requirements_iluvatar.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ use-triton-in-paddle
2929
crcmod
3030
fastsafetensors==0.1.14
3131
msgpack
32+
gunicorn
3233
opentelemetry-api>=1.24.0
3334
opentelemetry-sdk>=1.24.0
3435
opentelemetry-instrumentation-redis

requirements_metaxgpu.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use-triton-in-paddle
3030
crcmod
3131
fastsafetensors==0.1.14
3232
msgpack
33+
gunicorn
3334
modelscope
3435
opentelemetry-api>=1.24.0
3536
opentelemetry-sdk>=1.24.0

0 commit comments

Comments (0)