Color log formatter #1247

Merged · 5 commits · Mar 11, 2024
17 changes: 11 additions & 6 deletions lmdeploy/pytorch/engine/engine.py
@@ -9,7 +9,7 @@
from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig,
ResponseType)
from lmdeploy.tokenizer import Tokenizer
from lmdeploy.utils import get_logger, get_model
from lmdeploy.utils import get_logger, get_model, logging_timer

from ..adapter.adapter import ADAPTER_MANAGER, SchedulerAdapter
from ..check_env import check_env, check_model
@@ -404,6 +404,7 @@ def end_session(self, session_id: int):
"""Add new session."""
return end(self.req_sender, session_id)

@logging_timer('CreateModelInputs', logger)
@torch.inference_mode()
def create_model_inputs(self, messages: SeqList, adapters: AdapterList):
"""create model inputs from messages.
@@ -506,6 +507,8 @@ def _stopping_criteria(self, msg: SchedulerSequence, next_token_id: int):
"""

def _check_stop_word(sampling_param, next_token_id):
if sampling_param.ignore_eos:
return False
return (sampling_param.stop_words is not None
and next_token_id in sampling_param.stop_words)

@@ -527,6 +530,7 @@ def _check_session_len(msg, max_session_len):
return True
return False

@logging_timer('SamplingLogits', logger)
async def async_sampling_logits(self, logits: torch.Tensor,
running: SeqList, inputs: ModelInputs):
"""sampling logits."""
@@ -565,11 +569,13 @@ def _gather_history(seqs: SeqList, device: torch.device):
with torch.inference_mode(), torch.cuda.stream(self.stream):
logits = logits_processor(input_ids, split_logits)
next_token_ids = logits_processor.sampling(logits)
self.stream.synchronize()
await asyncio.get_event_loop().run_in_executor(None,
self.stream.synchronize)
next_token_ids = next_token_ids.cpu()

return next_token_ids, split_logits

@logging_timer('UpdateRunning', logger)
def update_running(self, running: SeqList, next_token_ids: torch.Tensor,
meta: Any):
"""update scheduler."""
@@ -593,6 +599,7 @@ def _can_output_token(self, token: torch.Tensor, msg: SchedulerSequence):

return True

@logging_timer('ModelForward', logger)
async def _async_model_forward(self, inputs: ModelInputs,
swap_in_map: Dict, swap_out_map: Dict):
"""model forward."""
@@ -699,6 +706,7 @@ async def __long_context_forward(inputs):
else:
return await __long_context_forward(inputs)

@logging_timer('AsyncStep', logger)
async def async_step(self, is_prefill: bool, return_logits: bool = False):
"""one step inference. Used to perform streaming chat.

@@ -714,6 +722,7 @@ async def async_step(self, is_prefill: bool, return_logits: bool = False):
adapters = schedule_output.adapters
if len(running) == 0:
return dict()
logger.debug(f'<AsyncStep>: batch_size={len(running)}')

inputs = self.create_model_inputs(running, adapters)

@@ -952,7 +961,6 @@ def _send_resp(step_tokens):
await asyncio.sleep(0.01)
continue

logger.debug('async_loop: RequestManager Step.')
self.req_manager.step()

# forward
@@ -961,16 +969,13 @@ def _send_resp(step_tokens):
is_prefill = not prefill_counter or not has_running
if is_prefill:
prefill_counter = prefill_interval
logger.debug('async_loop: Engine Step - '
f'prefilling: {is_prefill}')
with torch.inference_mode():
step_tokens: Dict[int,
InferOutput] = await self.async_step(
is_prefill=is_prefill)
prefill_counter -= 1

# send response
logger.debug('async_loop: Response.')
_send_resp(step_tokens)


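Note on the engine.py change above: awaiting `self.stream.synchronize` through the default executor keeps the blocking CUDA synchronize off the asyncio event loop, so other coroutines can keep making progress while the GPU finishes sampling. A minimal sketch of the pattern (the function name below is illustrative, not part of the PR):

```python
import asyncio

import torch


async def wait_for_stream(stream: torch.cuda.Stream) -> None:
    """Wait for a CUDA stream without blocking the event loop.

    ``stream.synchronize()`` is a blocking call; running it in the
    default thread-pool executor lets other coroutines keep running.
    """
    loop = asyncio.get_event_loop()
    await loop.run_in_executor(None, stream.synchronize)
```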
9 changes: 6 additions & 3 deletions lmdeploy/pytorch/engine/request.py
@@ -13,13 +13,12 @@


def _raise_exception_on_finish(task: asyncio.Task) -> None:
msg = ('Engine loop failed!')
try:
task.result()
except asyncio.CancelledError:
return
except Exception as exc:
raise RuntimeError(msg) from exc
except Exception as e:
logger.exception(f'Engine loop failed with error: {e}')


def _ignore_exception_on_finish(task: asyncio.Task) -> None:
@@ -158,6 +157,8 @@ def _resp_get(self):
except Empty:
continue
except Exception as e:
logger.exception(
f'sender[{self.sender_id}] get response failed: {e}')
raise e

async def _async_resp_get(self):
@@ -177,6 +178,8 @@ async def __no_threadsafe_get():
except asyncio.TimeoutError:
continue
except Exception as e:
logger.exception(
f'sender[{self.sender_id}] get response failed: {e}')
raise e

if self.is_thread_safe():
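Note on the request.py change above: the done callback now records the failure with `logger.exception` rather than re-raising, since an exception raised inside an `add_done_callback` handler is routed to the loop's exception handler instead of the caller. A hedged sketch of that pattern (the logger name is illustrative):

```python
import asyncio
import logging

logger = logging.getLogger('engine')


def _log_on_finish(task: asyncio.Task) -> None:
    """Done callback that logs a failed task instead of raising."""
    try:
        task.result()  # re-raises the task's exception, if any
    except asyncio.CancelledError:
        return  # cancellation is expected during shutdown
    except Exception as e:
        # logger.exception records the message plus the full traceback
        logger.exception(f'Engine loop failed with error: {e}')
```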
5 changes: 3 additions & 2 deletions lmdeploy/pytorch/models/patch.py
@@ -83,8 +83,9 @@ def _find_submodulename():
rewrite_qualname = _find_submodulename()

origin_qualname = f'{module_name}.{class_name}'
logger.debug(
f'Find rewrite of module {origin_qualname}: {rewrite_qualname}.')
if rewrite_qualname is not None:
logger.debug('Find rewrite of module\n'
f'{origin_qualname} <=> {rewrite_qualname}')
return rewrite_qualname


4 changes: 3 additions & 1 deletion lmdeploy/pytorch/paging/scheduler.py
@@ -4,7 +4,7 @@
from dataclasses import dataclass
from typing import Dict, List, Set, Union

from lmdeploy.utils import get_logger
from lmdeploy.utils import get_logger, logging_timer

from ..adapter.adapter import ADAPTER_MANAGER, SchedulerAdapter
from ..config import CacheConfig, SchedulerConfig
@@ -116,6 +116,7 @@ def add_adapter(self, adapter_path: str, adapter_name: str):
adapter) - self.block_manager.num_gpu_blocks
return adapter.build_weight_map(block_table)

@logging_timer('SchedulePrefilling', logger)
def _schedule_prefill(self):
"""Schedule for prefilling."""

@@ -206,6 +207,7 @@ def _deactive_adapter(adapter_name):
self.running += running
return running, swap_in_map, swap_out_map, copy_map

@logging_timer('ScheduleDecoding', logger)
def _schedule_decoding(self):
"""schedule decoding."""
assert len(self.running) != 0
81 changes: 80 additions & 1 deletion lmdeploy/utils.py
@@ -1,12 +1,48 @@
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import functools
import logging
import sys
import time
from contextlib import contextmanager
from logging import Logger, LogRecord
from typing import List, Optional

logger_initialized = {}


class _ASNI_COLOR:
BRIGHT_RED = '\033[91m'
RED = '\033[31m'
YELLOW = '\033[33m'
WHITE = '\033[37m'
GREEN = '\033[32m'


class ColorFormatter(logging.Formatter):

_LEVELNAME_COLOR_MAP = dict(CRITICAL=_ASNI_COLOR.BRIGHT_RED,
ERROR=_ASNI_COLOR.RED,
WARN=_ASNI_COLOR.YELLOW,
WARNING=_ASNI_COLOR.YELLOW,
INFO=_ASNI_COLOR.WHITE,
DEBUG=_ASNI_COLOR.GREEN)

_RESET_COLOR = '\033[0m'

def format(self, record: LogRecord):
"""format."""
if sys.platform == 'win32':
# windows does not support ASNI color
return super().format(record)
levelname = record.levelname
level_color = self._LEVELNAME_COLOR_MAP.get(levelname,
self._RESET_COLOR)
levelname = f'{level_color}{levelname}{self._RESET_COLOR}'
record.levelname = levelname
return super().format(record)


class FilterDuplicateWarning(logging.Filter):
"""Filter the repeated warning message.

@@ -85,7 +121,7 @@ def get_logger(
file_handler = logging.FileHandler(log_file, file_mode)
handlers.append(file_handler)

formatter = logging.Formatter(log_formatter)
formatter = ColorFormatter(log_formatter)
for handler in handlers:
handler.setFormatter(formatter)
handler.setLevel(log_level)
@@ -158,3 +194,46 @@ def get_model(pretrained_model_name_or_path: str,
model_path = snapshot_download(pretrained_model_name_or_path,
**download_kwargs)
return model_path


def logging_timer(op_name: str, logger: Logger, level: int = logging.DEBUG):
"""logging timer."""

@contextmanager
def __timer():
"""timer."""
start = time.perf_counter()
yield
end = time.perf_counter()
duration = (end - start) * 1000
logger.log(level, f'<{op_name}> take time: {duration:.2f} ms')

def __inner(func):
"""inner."""

@functools.wraps(func)
def __func_warpper(*args, **kwargs):
"""func warpper."""
if not logger.isEnabledFor(level):
return func(*args, **kwargs)
with __timer():
return func(*args, **kwargs)

@functools.wraps(func)
def __async_warpper(*args, **kwargs):
"""async warpper."""

async def __tmp():
if not logger.isEnabledFor(level):
return (await func(*args, **kwargs))
with __timer():
return (await func(*args, **kwargs))

return __tmp()

if asyncio.iscoroutinefunction(func):
return __async_warpper
else:
return __func_warpper

return __inner
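Putting the utils.py additions together: `get_logger` now installs `ColorFormatter` on its handlers (colors are skipped on Windows), and `logging_timer` wraps both sync and async callables so their wall-clock time is logged only when the target level is enabled. A small usage sketch, assuming `get_logger` accepts a `log_level` argument as in the hunk above; the `toy_step` function is purely illustrative:

```python
import asyncio
import logging

from lmdeploy.utils import get_logger, logging_timer

# Handlers created here get the new ColorFormatter attached.
logger = get_logger('lmdeploy', log_level=logging.DEBUG)


@logging_timer('ToyStep', logger)
async def toy_step(n: int) -> int:
    """Pretend inference step; logs '<ToyStep> take time: x.xx ms' at DEBUG."""
    await asyncio.sleep(0.01)
    return n * 2


print(asyncio.run(toy_step(3)))
```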