# Patch summary (three files under vllm/compilation/):
# - decorators.py: __init__ now keeps vllm_config on the instance and no longer
#   calls start_monitoring_torch_compile (previously done only for
#   CompilationLevel.PIECEWISE); __call__ calls
#   start_monitoring_torch_compile(self.vllm_config.compilation_config) right
#   after the "Unsupported dynamic dimensions" check.
# - monitor.py: start_monitoring_torch_compile stores a start stamp in the
#   module-level torch_compile_start_time; the end-of-compile log message is
#   reworded to "torch.compile takes %.2f s in total".
# - backends.py: the backend __call__ logs the time elapsed since that stamp as
#   "Dynamo bytecode transform time" and adds it to
#   compilation_configs.compilation_time.
# Review note: both ends of the measurement use time.perf_counter() instead of
# time.time(), because a wall-clock adjustment between the two reads would
# skew the duration. The stamp is only meaningful as a difference of two
# perf_counter() readings; the two sites must always use the same clock.
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 1206424ae1e3f..f002a8ff905b1 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -265,7 +265,13 @@ def configure_post_pass(self): def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: + # when dynamo calls the backend, it means the bytecode + # transform and analysis are done compilation_counter.num_graphs_seen += 1 + from .monitor import torch_compile_start_time + dynamo_time = time.perf_counter() - torch_compile_start_time + logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time) + self.compilation_configs.compilation_time += dynamo_time # we control the compilation process, each instance can only be # called once diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index a32dced57e5b3..938430fe2a501 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -145,6 +145,7 @@ def _support_torch_compile( def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) + self.vllm_config = vllm_config # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner # will handle the compilation, so we don't need to do anything here. 
self.do_not_compile = \ @@ -157,9 +158,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): TorchCompileWrapperWithCustomDispatcher.__init__( self, compilation_level=vllm_config.compilation_config.level) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE: - start_monitoring_torch_compile(vllm_config.compilation_config) - cls.__init__ = __init__ def __call__(self, *args, **kwargs): @@ -186,6 +184,8 @@ def __call__(self, *args, **kwargs): raise ValueError( "Unsupported dynamic dimensions" f" {dims} for argument {k} with type {type(arg)}.") + # here, it is the starting point of the `torch.compile` process + start_monitoring_torch_compile(self.vllm_config.compilation_config) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index f718e46423212..3348674b09af2 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,14 +1,19 @@ +import time + from vllm.config import CompilationConfig, CompilationLevel from vllm.logger import init_logger logger = init_logger(__name__) +torch_compile_start_time: float = 0.0 + def start_monitoring_torch_compile(compilation_config: CompilationConfig): - pass + global torch_compile_start_time + torch_compile_start_time = time.perf_counter() def end_monitoring_torch_compile(compilation_config: CompilationConfig): if compilation_config.level == CompilationLevel.PIECEWISE: - logger.info("graph compilation takes %.2f s in total", + logger.info("torch.compile takes %.2f s in total", compilation_config.compilation_time)