Added GPU profiling

ilya-lavrenov committed Mar 12, 2024
1 parent 228e3c0 commit 797f1b1
Showing 4 changed files with 164 additions and 82 deletions.
20 changes: 9 additions & 11 deletions vllm/engine/llm_engine.py
@@ -19,7 +19,7 @@
                             SequenceGroupOutput, SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                                TokenizerGroup)
-from vllm.utils import Counter, is_openvino, is_openvino_optimum_intel
+from vllm.utils import Counter
 
 logger = init_logger(__name__)
 _LOCAL_LOGGING_INTERVAL_SEC = 5
@@ -120,9 +120,10 @@ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
         # Create the engine configs.
         engine_configs = engine_args.create_engine_configs()
         parallel_config = engine_configs[2]
+        device_config = engine_configs[4]
 
         # Initialize the cluster and specify the executor class.
-        if is_openvino() or is_openvino_optimum_intel():
+        if device_config.is_openvino:
             from vllm.executor.openvino_executor import OpenVINOExecutor
             executor_class = OpenVINOExecutor
         elif parallel_config.worker_use_ray:
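
Side note on the hunk above: executor selection now keys off the device configuration taken from engine_configs[4] rather than the module-level is_openvino()/is_openvino_optimum_intel() helpers, which is also why the vllm.utils import shrinks in the first hunk. The DeviceConfig class itself is not part of this diff, so the following is only a sketch of how an is_openvino property could be exposed; the device_type field and the accepted device strings are assumptions:

# Hypothetical sketch -- the real DeviceConfig is defined outside this diff and may differ.
class DeviceConfig:
    def __init__(self, device: str = "auto") -> None:
        # Accepted device strings ("cuda", "cpu", "openvino", ...) are assumed here.
        self.device_type = device

    @property
    def is_openvino(self) -> bool:
        # True when the engine should be driven by OpenVINOExecutor.
        return self.device_type == "openvino"

With a flag like this on the config object, the branch in from_engine_args stays a plain attribute check and no longer depends on import-time environment probing.
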
@@ -634,16 +635,13 @@ def _get_stats(self,
         now = time.monotonic()
 
         # KV Cache Usage in %.
-        num_total_gpu = self.cache_config.num_gpu_blocks
+        num_total_gpu = max(1, self.cache_config.num_gpu_blocks)
         num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
-        gpu_cache_usage = (1.0 - (num_free_gpu / num_total_gpu)) if num_total_gpu > 0 else 0.0
-
-        num_total_cpu = self.cache_config.num_cpu_blocks
-        cpu_cache_usage = 0.
-        if num_total_cpu > 0:
-            num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks(
-            )
-            cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)
+        gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)
+
+        num_total_cpu = max(1, self.cache_config.num_cpu_blocks)
+        num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks()
+        cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)
 
         # Scheduler State
         num_running = len(self.scheduler.running)
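
Side note on the hunk above: instead of guarding each division separately, both block counts are clamped with max(1, ...), so the usage ratios stay well defined even before cache profiling has assigned any blocks. A quick worked check with assumed degenerate values (the real counts come from cache_config and the scheduler's block manager):

# Illustrative numbers only: behaviour of the clamped ratio when no blocks exist yet.
num_gpu_blocks = 0                       # assumed: profiling has not populated the cache config
num_total_gpu = max(1, num_gpu_blocks)   # clamped to 1, so the division below cannot raise
num_free_gpu = 0                         # assumed: an empty allocator reports zero free blocks
gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)
print(gpu_cache_usage)                   # 1.0 -- the removed guard reported 0.0 in this case

Under these assumptions the degenerate case now reads as 100% rather than 0% usage, while the normal path (non-zero block counts) computes the same values as before in three fewer lines.
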