
Commit 9ec8c40

add endpoint /abort_request (#4092)
* add endpoint /abort_request
* add finish_reason abort
* enlarge num_instance
* add option --enable-abort-handling
* fix access to None req_metrics when sending abort_request
1 parent a6aa375 commit 9ec8c40

File tree

10 files changed (+79, -24 lines)


lmdeploy/cli/serve.py

Lines changed: 3 additions & 0 deletions
@@ -73,6 +73,7 @@ def add_parser_api_server():
         ArgumentHelper.max_log_len(parser)
         ArgumentHelper.disable_fastapi_docs(parser)
         ArgumentHelper.allow_terminate_by_client(parser)
+        ArgumentHelper.enable_abort_handling(parser)
         # chat template args
         ArgumentHelper.chat_template(parser)

@@ -266,6 +267,7 @@ def api_server(args):
                    allow_methods=args.allow_methods,
                    allow_headers=args.allow_headers,
                    allow_terminate_by_client=args.allow_terminate_by_client,
+                   enable_abort_handling=args.enable_abort_handling,
                    log_level=args.log_level.upper(),
                    api_keys=args.api_keys,
                    ssl=args.ssl,

@@ -293,6 +295,7 @@ def api_server(args):
                    allow_methods=args.allow_methods,
                    allow_headers=args.allow_headers,
                    allow_terminate_by_client=args.allow_terminate_by_client,
+                   enable_abort_handling=args.enable_abort_handling,
                    log_level=args.log_level.upper(),
                    api_keys=args.api_keys,
                    ssl=args.ssl,

lmdeploy/cli/utils.py

Lines changed: 10 additions & 0 deletions
@@ -454,6 +454,16 @@ def allow_terminate_by_client(parser):
                                    default=False,
                                    help='Enable server to be terminated by request from client')

+    @staticmethod
+    def enable_abort_handling(parser):
+        """Add --enable-abort-handling argument to configure server abort
+        request processing."""
+
+        return parser.add_argument('--enable-abort-handling',
+                                   action='store_true',
+                                   default=False,
+                                   help='Enable server to handle client abort requests')
+
     @staticmethod
     def cache_max_entry_count(parser):
         """Add argument cache_max_entry_count to parser."""

lmdeploy/metrics/metrics_processor.py

Lines changed: 3 additions & 1 deletion
@@ -122,7 +122,9 @@ async def _run_metrics_handler(self):
             outputs, req_state, iteration_stats = update_data

             # update request state according the engine events
-            req_state.update_from_events(outputs.req_metrics.engine_events)
+            if outputs and outputs.req_metrics:
+                # when users visit "/abort_request" endpoint, `req_metrics` might be None
+                req_state.update_from_events(outputs.req_metrics.engine_events)

             # update iteration stats based on outputs and request state.
             # some attributes of req_state will also be updated, e.g., lastest_token_time

lmdeploy/metrics/stats.py

Lines changed: 3 additions & 0 deletions
@@ -198,6 +198,9 @@ def update_from_output(self, outputs: EngineOutput, req_state: RequestState):
             outputs (EngineOutput): The output from the engine containing information about the current iteration.
             req_state (RequestState): The state of the request, including timestamps and token counts.
         """
+        if outputs.req_metrics is None:
+            # when users visit "/abort_request" endpoint, `req_metrics` might be None
+            return
         new_generation_tokens = len(outputs.token_ids)
         if new_generation_tokens == 0:
             return

lmdeploy/pytorch/engine/engine.py

Lines changed: 1 addition & 1 deletion
@@ -540,7 +540,7 @@ def _on_stop_session(self, reqs: List[Request], **kwargs):
             for seq in session.sequences.values():
                 _resp: Response = getattr(seq, 'resp', None)
                 if _resp is not None:
-                    _resp.type = ResponseType.FINISH
+                    _resp.type = ResponseType.CANCEL
                     self.req_manager.response(_resp)
             resp_type = ResponseType.SUCCESS
             if resp:

lmdeploy/pytorch/engine/mp_engine/base_worker.py

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ class EngineInstancePool:
     def __init__(self, engine):
         from lmdeploy.pytorch.engine import Engine
         self.engine: Engine = engine
-        self.num_instance = self.engine.engine_config.max_batch_size
+        # enlarge `num_instance`, otherwise an sequence cannot be stopped in time
+        self.num_instance = self.engine.engine_config.max_batch_size * 2
         self.pool = None

     def create_instance_pool(self, num_instance: int):

lmdeploy/serve/async_engine.py

Lines changed: 21 additions & 3 deletions
@@ -444,12 +444,26 @@ async def do_log_stats(self):
         for stat_logger in self.stat_loggers:
             stat_logger.log()

+    async def stop_all_session(self):
+        """Stop all running sessions."""
+        logger.info('stop all sessions')
+        tasks = []
+        session_ids = []
+        for session_id in list(self.id2inst.keys()):
+            generator = self.id2inst.get(session_id)
+            if generator:
+                session_ids.append(session_id)
+                tasks.append(generator.async_cancel(session_id))
+        await asyncio.gather(*tasks)
+        logger.info(f'all {len(session_ids)} sessions stopped')
+
     async def stop_session(self, session_id: int):
         """Stop a session by a session_id."""
         logger.info(f'stop session {session_id}')
         generator = self.id2inst.get(session_id)
         if generator:
             await generator.async_cancel(session_id)
+            logger.info(f'session {session_id} stopped')
         # else it's not running at all

     async def end_session(self, session_id: int):

@@ -855,7 +869,7 @@ def is_error(status):
                     break

                 output_len = len(outputs.token_ids)
-                if hit_stop_token:
+                if hit_stop_token or output_len == 0:
                     continue

                 # This assumes the engine will stop when stop token is hit

@@ -892,7 +906,11 @@ def is_error(status):
                 metrics_processor.increment_finished_requests()

                 if not is_error(outputs.status):
-                    finish_reason = 'stop' if outputs.token_ids[-1] in stop_ids else 'length'
+                    if outputs.status == ResponseType.CANCEL:
+                        finish_reason = 'abort'
+                    else:
+                        finish_reason = 'stop' if outputs.token_ids[-1] in stop_ids else 'length'
+
                     # utf-8 char at the end means it's a potential unfinished byte sequence
                     if not response.endswith('�'):
                         # avoid returning the last response twice

@@ -926,7 +944,7 @@ def is_error(status):
                     output_len = gen_len
                     self.id2step[session_id] += input_len + output_len
                 else:
-                    logger.error(f'session {session_id} finished, '
+                    logger.error(f'session {session_id} finished, {outputs.status}, '
                                  'reason "error"')
                     yield GenOut(response=f'internal error happened, status code {outputs.status}',
                                  history_token_len=self.id2step[session_id],

lmdeploy/serve/openai/api_client.py

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ def get_model_list(api_url: str, headers: dict = None):
     logger = get_logger('lmdeploy')
     if not response.ok:
         logger.error(f'Failed to get the model list: {api_url}'
-                     'returns {response.status_code}')
+                     f' returns {response.status_code}')
         return None
     elif not hasattr(response, 'text'):
         logger.warning('Failed to get the model list.')

lmdeploy/serve/openai/api_server.py

Lines changed: 21 additions & 13 deletions
@@ -32,7 +32,7 @@
 from lmdeploy.serve.async_engine import AsyncEngine
 from lmdeploy.serve.openai.harmony_utils import GptOssChatParser
 from lmdeploy.serve.openai.protocol import ChatCompletionResponse  # noqa: E501
-from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, ChatCompletionResponseChoice,
+from lmdeploy.serve.openai.protocol import (AbortRequest, ChatCompletionRequest, ChatCompletionResponseChoice,
                                             ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,
                                             ChatCompletionTokenLogprob, ChatMessage, ChoiceLogprobs, CompletionRequest,
                                             CompletionResponse, CompletionResponseChoice,

@@ -65,6 +65,7 @@ class VariableInterface:
     # following is for tool parsers
     tool_parser: Optional[ToolParser] = None
     allow_terminate_by_client: bool = False
+    enable_abort_handling: bool = False


 router = APIRouter()

@@ -954,18 +955,8 @@ async def generate(request: GenerateReqInput, raw_request: Request = None):
         do_preprocess=False,
     )

-    def create_finish_reason(finish_reason):
-        # TODO: add detail info
-        if not finish_reason:
-            return None
-        if finish_reason == 'length':
-            return dict(type='length')
-        if finish_reason == 'stop':
-            return dict(type='stop')
-        return dict(type='abort')
-
     def create_generate_response_json(res, text, output_ids, logprobs, finish_reason):
-        meta = GenerateReqMetaOutput(finish_reason=create_finish_reason(finish_reason),
+        meta = GenerateReqMetaOutput(finish_reason=dict(type=finish_reason) if finish_reason else None,
                                      output_token_logprobs=logprobs or None,
                                      prompt_tokens=res.input_token_len,
                                      completion_tokens=res.generate_token_len)

@@ -1004,7 +995,7 @@ async def _inner_call():
             for tok, tok_logprobs in zip(res.token_ids, res.logprobs):
                 logprobs.append((tok_logprobs[tok], tok))
             nonlocal response
-            meta = GenerateReqMetaOutput(finish_reason=create_finish_reason(res.finish_reason),
+            meta = GenerateReqMetaOutput(finish_reason=dict(type=res.finish_reason) if res.finish_reason else None,
                                          output_token_logprobs=logprobs or None,
                                          prompt_tokens=res.input_token_len,
                                          completion_tokens=res.generate_token_len)

@@ -1168,6 +1159,21 @@ async def free_cache(cache_free_request: DistServeCacheFreeRequest) -> JSONResponse:
 """ PD Disaggregation API End """


+@router.post('/abort_request')
+async def abort_request(request: AbortRequest, raw_request: Request = None):
+    """Abort an ongoing request."""
+    if not VariableInterface.enable_abort_handling:
+        return Response(
+            status_code=501,
+            content='This server does not support abort requests. Enable with --enable-abort-handling flag.')
+
+    if request.abort_all:
+        await VariableInterface.async_engine.stop_all_session()
+    else:
+        await VariableInterface.async_engine.stop_session(request.session_id)
+    return Response(status_code=200)
+
+
 @router.post('/v1/chat/interactive', dependencies=[Depends(check_api_key)])
 async def chat_interactive_v1(request: GenerateRequest, raw_request: Request = None):
     return create_error_response(

@@ -1332,6 +1338,7 @@ def serve(model_path: str,
           reasoning_parser: Optional[str] = None,
           tool_call_parser: Optional[str] = None,
           allow_terminate_by_client: bool = False,
+          enable_abort_handling: bool = False,
           **kwargs):
     """An example to perform model inference through the command line
     interface.

@@ -1390,6 +1397,7 @@ def serve(model_path: str,
     logger.setLevel(log_level)

     VariableInterface.allow_terminate_by_client = allow_terminate_by_client
+    VariableInterface.enable_abort_handling = enable_abort_handling
     if api_keys is not None:
         if isinstance(api_keys, str):
             api_keys = api_keys.split(',')
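
For reviewers who want to exercise the new route, a hedged client-side sketch follows; the server address is the assumed api_server default and the session id is a placeholder. The endpoint answers 501 unless the server was started with --enable-abort-handling.

# Sketch only: call the new /abort_request endpoint with the AbortRequest
# fields defined in protocol.py below. Address and session id are assumptions.
import requests

base_url = 'http://0.0.0.0:23333'  # assumed default api_server address

# abort one running session (session id is a placeholder)
r = requests.post(f'{base_url}/abort_request', json={'session_id': 42})
print(r.status_code)  # 200 on success, 501 if abort handling is disabled

# abort every running session
r = requests.post(f'{base_url}/abort_request', json={'abort_all': True})
print(r.status_code)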

lmdeploy/serve/openai/protocol.py

Lines changed: 14 additions & 4 deletions
@@ -256,7 +256,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
     logprobs: Optional[ChoiceLogprobs] = None
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error']] = None
+    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None


 class ChatCompletionStreamResponse(BaseModel):

@@ -314,7 +314,7 @@ class CompletionResponseChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     gen_tokens: Optional[List[int]] = None
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error']] = None
+    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None


 class CompletionResponse(BaseModel):

@@ -333,7 +333,7 @@ class CompletionResponseStreamChoice(BaseModel):
     text: str
     logprobs: Optional[LogProbs] = None
     gen_tokens: Optional[List[int]] = None
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error']] = None
+    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None


 class CompletionStreamResponse(BaseModel):

@@ -430,7 +430,7 @@ class GenerateResponse(BaseModel):
     tokens: int
     input_tokens: int
     history_tokens: int
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error']] = None
+    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None


 class UpdateParamsRequest(BaseModel):

@@ -478,3 +478,13 @@ class GenerateReqOutput(BaseModel):
     text: str
     output_ids: List[int]
     meta_info: GenerateReqMetaOutput
+
+
+class AbortRequest(BaseModel):
+    # Whether to abort all requests
+    abort_all: bool = False
+    # The finished reason data
+    finished_reason: Optional[Dict[str, Any]] = None
+    abort_message: Optional[str] = None
+    # The session ID to abort. If `abort_all` is True, this field is ignored.
+    session_id: Optional[int] = -1
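
Illustrative only: the request body shapes the new AbortRequest model accepts; the field values below are made up.

from lmdeploy.serve.openai.protocol import AbortRequest

one = AbortRequest(session_id=7)  # abort a single session
everything = AbortRequest(abort_all=True, abort_message='client shutdown')  # session_id is ignored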
