[torch.compile] add logging for compilation time (vllm-project#10941)
Signed-off-by: youkaichao <[email protected]>
Co-authored-by: Woosuk Kwon <[email protected]>
2 people authored and BKitor committed Dec 30, 2024
1 parent 2ad1390 commit 3f78eb4
Showing 6 changed files with 75 additions and 10 deletions.
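The diff below threads wall-clock timing through piecewise torch.compile: `wrap_inductor` records a start time before compiling the first piecewise graph and, after the last one, accumulates the elapsed seconds into `CompilationConfig.compilation_time`; a new `monitor` module then logs the total. The following is a minimal sketch of that timing pattern only — the function and variable names are illustrative stand-ins, not vLLM's API.

```python
import time

# Illustrative sketch of the pattern this commit adds: record the start time
# before the first graph, accumulate the elapsed time after the last one.
# `compile_fn` and `totals` are stand-ins, not vLLM names.
_compilation_start_time = 0.0


def timed_compile(compile_fn, graph_index: int, num_graphs: int,
                  totals: dict):
    global _compilation_start_time
    if graph_index == 0:
        # before compiling the first graph, record the start time
        _compilation_start_time = time.time()

    compiled = compile_fn()

    if graph_index == num_graphs - 1:
        # after compiling the last graph, accumulate the elapsed time
        elapsed = time.time() - _compilation_start_time
        totals["compilation_time"] = totals.get("compilation_time", 0.0) + elapsed
    return compiled


totals: dict = {}
for i in range(3):
    timed_compile(lambda: object(), i, num_graphs=3, totals=totals)
print(totals["compilation_time"])  # total seconds across the three "graphs"
```
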
56 changes: 46 additions & 10 deletions vllm/compilation/backends.py
@@ -1,5 +1,6 @@
import copy
import dataclasses
import time
from contextlib import ExitStack
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple
from unittest.mock import patch
@@ -14,6 +15,7 @@

from .counter import compilation_counter
from .inductor_pass import InductorPass
from .monitor import end_monitoring_torch_compile
from .pass_manager import PostGradPassManager

logger = init_logger(__name__)
@@ -22,20 +24,21 @@
def wrap_inductor(graph,
example_inputs,
additional_inductor_config,
do_logging=False,
compilation_config: CompilationConfig,
graph_index: int = 0,
num_graphs: int = 1,
runtime_shape: Optional[int] = None,
use_inductor: bool = True):
if graph_index == 0:
# before compiling the first graph, record the start time
global compilation_start_time
compilation_start_time = time.time()

if not use_inductor:
return graph

compilation_counter.num_inductor_compilations += 1

if do_logging:
if runtime_shape is None:
logger.info("Compiling a graph for general shape")
else:
logger.info("Compiling a graph for shape %s", runtime_shape)

from torch._inductor import config
current_config = config.shallow_copy_dict()
from torch._inductor.compile_fx import compile_fx
@@ -52,7 +55,23 @@ def wrap_inductor(graph,
# inductor can inplace modify the graph, so we need to copy it
# see https://github.com/pytorch/pytorch/issues/138980
graph = copy.deepcopy(graph)
return compile_fx(graph, example_inputs, config_patches=current_config)
compiled_graph = compile_fx(graph,
example_inputs,
config_patches=current_config)

# after compiling the last graph, record the end time
if graph_index == num_graphs - 1:
now = time.time()
elapsed = now - compilation_start_time
compilation_config.compilation_time += elapsed
if runtime_shape is None:
logger.info("Compiling a graph for general shape takes %.2f s",
elapsed)
else:
logger.info("Compiling a graph for shape %s takes %.2f s",
runtime_shape, elapsed)

return compiled_graph


@dataclasses.dataclass
@@ -114,6 +133,8 @@ def split_graph(graph: fx.GraphModule,
# we share the global graph pool among all the backends
global_graph_pool = None

compilation_start_time = 0.0


class PiecewiseCompileInterpreter(torch.fx.Interpreter):
"""Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.
@@ -157,12 +178,15 @@ def call_module(self, target: torch.fx.node.Target,
sym_shape_indices = [
i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
]
global compilation_start_time
compiled_graph_for_general_shape = wrap_inductor(
submod,
args,
self.compilation_configs.inductor_compile_config,
self.compilation_configs,
graph_index=index,
num_graphs=len(self.compile_submod_names),
runtime_shape=None,
do_logging=index == 0,
use_inductor=self.compilation_configs.use_inductor)

self.module.__dict__[target] = PiecewiseBackend(
@@ -379,6 +403,8 @@ def __init__(self, graph: fx.GraphModule,
# the entries for different shapes that we need to either
# compile or capture cudagraph
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union(
self.capture_sizes)
for shape in self.compile_sizes.union(self.capture_sizes):
self.concrete_size_entries[shape] = ConcreteSizeEntry(
runtime_shape=shape,
@@ -389,6 +415,9 @@
def __call__(self, *args) -> Any:
if not self.first_run_finished:
self.first_run_finished = True
# no specific sizes to compile
if self.is_last_graph and not self.to_be_compiled_sizes:
end_monitoring_torch_compile(self.compilation_configs)
return self.compiled_graph_for_general_shape(*args)

runtime_shape = args[self.sym_shape_indices[0]]
@@ -403,15 +432,22 @@ def __call__(self, *args) -> Any:

if entry.need_to_compile and not entry.compiled:
entry.compiled = True
self.to_be_compiled_sizes.remove(runtime_shape)
# args are real arguments
entry.runnable = wrap_inductor(
self.graph,
args,
self.compilation_configs.inductor_compile_config,
self.compilation_configs,
graph_index=self.piecewise_compile_index,
num_graphs=self.total_piecewise_compiles,
runtime_shape=runtime_shape,
do_logging=self.is_first_graph,
use_inductor=self.compilation_configs.use_inductor)

# finished compilations for all required shapes
if self.is_last_graph and not self.to_be_compiled_sizes:
end_monitoring_torch_compile(self.compilation_configs)

if not entry.use_cudagraph:
return entry.runnable(*args)

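Besides the timing itself, the backends.py change keeps a per-backend `to_be_compiled_sizes` set and calls `end_monitoring_torch_compile` once the last piecewise graph has no pending shapes left. A hedged, self-contained sketch of that bookkeeping — the class and callback below are invented for illustration:

```python
from typing import Callable, Set


class ShapeCompletionTracker:
    """Illustrative stand-in for the pending-shape bookkeeping above."""

    def __init__(self, sizes: Set[int], is_last_graph: bool,
                 on_done: Callable[[], None]):
        self.to_be_compiled_sizes = set(sizes)
        self.is_last_graph = is_last_graph
        self.on_done = on_done

    def mark_compiled(self, runtime_shape: int) -> None:
        # mirrors `self.to_be_compiled_sizes.remove(runtime_shape)` in the diff
        self.to_be_compiled_sizes.discard(runtime_shape)
        # finished compilations for all required shapes
        if self.is_last_graph and not self.to_be_compiled_sizes:
            self.on_done()


tracker = ShapeCompletionTracker(
    {1, 8}, is_last_graph=True,
    on_done=lambda: print("all required shapes compiled"))
tracker.mark_compiled(1)
tracker.mark_compiled(8)  # prints "all required shapes compiled"
```
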
5 changes: 5 additions & 0 deletions vllm/compilation/decorators.py
@@ -11,6 +11,8 @@
from vllm.sequence import IntermediateTensors
from vllm.utils import supports_dynamo

from .monitor import start_monitoring_torch_compile

logger = init_logger(__name__)

_T = TypeVar("_T", bound=type[nn.Module])
@@ -155,6 +157,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
TorchCompileWrapperWithCustomDispatcher.__init__(
self, compilation_level=vllm_config.compilation_config.level)

if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE:
start_monitoring_torch_compile(vllm_config.compilation_config)

cls.__init__ = __init__

def __call__(self, *args, **kwargs):
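The decorators.py hunk wires monitoring into model construction: the `__init__` that the decorator installs also calls `start_monitoring_torch_compile` when the compilation level is PIECEWISE. A rough sketch of that hook pattern, with stand-in names rather than vLLM's actual decorator:

```python
PIECEWISE = 3  # stand-in for CompilationLevel.PIECEWISE


def add_monitoring_hook(cls, level: int, start_monitoring):
    """Patch cls.__init__ so monitoring starts when piecewise compile is on."""
    original_init = cls.__init__

    def __init__(self, *args, **kwargs):
        original_init(self, *args, **kwargs)
        if level == PIECEWISE:
            # mirrors start_monitoring_torch_compile(...) in the diff
            start_monitoring()

    cls.__init__ = __init__
    return cls


class Model:

    def __init__(self):
        self.ready = True


Model = add_monitoring_hook(
    Model, PIECEWISE, lambda: print("start monitoring torch.compile"))
Model()  # prints "start monitoring torch.compile"
```
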
14 changes: 14 additions & 0 deletions vllm/compilation/monitor.py
@@ -0,0 +1,14 @@
from vllm.config import CompilationConfig, CompilationLevel
from vllm.logger import init_logger

logger = init_logger(__name__)


def start_monitoring_torch_compile(compilation_config: CompilationConfig):
pass


def end_monitoring_torch_compile(compilation_config: CompilationConfig):
if compilation_config.level == CompilationLevel.PIECEWISE:
logger.info("graph compilation takes %.2f s in total",
compilation_config.compilation_time)
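The new monitor module is intentionally small: `start_monitoring_torch_compile` is a placeholder for now, and `end_monitoring_torch_compile` logs the total accumulated by `wrap_inductor`. A self-contained usage sketch with a stubbed config object (the stub below is not vLLM's `CompilationConfig`):

```python
import logging
from dataclasses import dataclass

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

PIECEWISE = 3  # stand-in for CompilationLevel.PIECEWISE


@dataclass
class ConfigStub:
    level: int = PIECEWISE
    compilation_time: float = 0.0


def start_monitoring_torch_compile(config: ConfigStub) -> None:
    pass  # reserved for future bookkeeping


def end_monitoring_torch_compile(config: ConfigStub) -> None:
    if config.level == PIECEWISE:
        logger.info("graph compilation takes %.2f s in total",
                    config.compilation_time)


cfg = ConfigStub()
start_monitoring_torch_compile(cfg)
cfg.compilation_time += 4.2  # accumulated by wrap_inductor in backends.py
end_monitoring_torch_compile(cfg)  # logs: graph compilation takes 4.20 s in total
```
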
2 changes: 2 additions & 0 deletions vllm/config.py
@@ -2281,6 +2281,7 @@ def model_post_init(self, __context: Any) -> None:
# keep track of enabled and disabled custom ops
enabled_custom_ops: Counter[str] = PrivateAttr
disabled_custom_ops: Counter[str] = PrivateAttr
compilation_time: float = PrivateAttr

# Per-model forward context
# Mainly used to store attention cls
@@ -2319,6 +2320,7 @@ def model_post_init(self, __context: Any) -> None:
self.enabled_custom_ops = Counter()
self.disabled_custom_ops = Counter()
self.static_forward_context = {}
self.compilation_time = 0.0

def init_backend(self) -> Union[str, Callable]:
if self.level == CompilationLevel.NO_COMPILATION:
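In config.py, `compilation_time` is declared as a pydantic `PrivateAttr` and reset to `0.0` in `model_post_init`, so the counter lives only at runtime and stays out of the serialized config. A small sketch of that idea on a minimal pydantic model (this assumes pydantic v2 and is not vLLM's `CompilationConfig`):

```python
from typing import Any

from pydantic import BaseModel, PrivateAttr


class ConfigSketch(BaseModel):
    """Minimal stand-in for a config carrying a runtime-only counter."""
    level: int = 0
    _compilation_time: float = PrivateAttr(default=0.0)

    def model_post_init(self, __context: Any) -> None:
        # mirrors `self.compilation_time = 0.0` in the diff
        self._compilation_time = 0.0


cfg = ConfigSketch(level=3)
cfg._compilation_time += 1.5  # accumulated by the compile wrapper
print(cfg.model_dump())       # {'level': 3} -- the private attr is excluded
```
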
4 changes: 4 additions & 0 deletions vllm/engine/llm_engine.py
@@ -473,6 +473,7 @@ def _initialize_kv_caches(self) -> None:
The workers will determine the number of blocks in both the GPU cache
and the swap CPU cache.
"""
start = time.time()
num_gpu_blocks, num_cpu_blocks = (
self.model_executor.determine_num_available_blocks())

@@ -488,6 +489,9 @@
self.cache_config.num_cpu_blocks = num_cpu_blocks

self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
elapsed = time.time() - start
logger.info(("init engine (profile, create kv cache, "
"warmup model) took %.2f seconds"), elapsed)

@classmethod
def _get_executor_cls(cls,
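Both engine variants (llm_engine.py above and v1/engine/core.py below) bracket `_initialize_kv_caches` with two `time.time()` calls and log the elapsed seconds. The same measurement is shown here as a small context manager, purely as a hedged alternative sketch — the diff inlines the calls rather than using a helper:

```python
import logging
import time
from contextlib import contextmanager

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@contextmanager
def log_elapsed(what: str):
    # time the enclosed block and log the elapsed wall-clock seconds
    start = time.time()
    yield
    logger.info("%s took %.2f seconds", what, time.time() - start)


with log_elapsed("init engine (profile, create kv cache, warmup model)"):
    time.sleep(0.1)  # stand-in for profiling + KV-cache creation + warmup
```
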
4 changes: 4 additions & 0 deletions vllm/v1/engine/core.py
@@ -67,6 +67,7 @@ def __init__(

def _initialize_kv_caches(self,
cache_config: CacheConfig) -> Tuple[int, int]:
start = time.time()
num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks(
)

@@ -80,6 +81,9 @@

num_cpu_blocks = 0
self.model_executor.initialize_cache(num_gpu_blocks)
elapsed = time.time() - start
logger.info(("init engine (profile, create kv cache, "
"warmup model) took %.2f seconds"), elapsed)
return num_gpu_blocks, num_cpu_blocks

def add_request(self, request: EngineCoreRequest):
