Add more metrics to high level profiler (#63)
* Add more detailed event names to profiler

* Add more profiler stats

* separate prompt and decode batch utilization

* Add more metrics

* revert engine/metrics.py changes

* un-singletonify (what a funny word) habana profiler

* formatting

* add batch block utilization metric

* fix division by zero

* fix batch_block_utilization formula

* minor refactors
kzawora-intel authored Jun 28, 2024
1 parent 4a45bbf commit 20eafe9
Showing 1 changed file with 72 additions and 10 deletions.
82 changes: 72 additions & 10 deletions vllm/worker/habana_model_runner.py
@@ -241,6 +241,7 @@ def __init__(
         self.scheduler_config = scheduler_config
         self.lora_config = lora_config
         self.load_config = load_config
+        self.cache_config = cache_config
         self.is_driver_worker = is_driver_worker
         self.profiler = Profiler()

@@ -267,6 +268,9 @@ def __init__(
         self.lora_manager: LRUCacheWorkerLoRAManager = None
         self.model: torch.nn.Module = None

+        # Profiler stats
+        self.profiler_counter_helper = HabanaProfilerCounterHelper()
+
         self._setup_buckets()

     def load_model(self) -> None:
@@ -876,19 +880,18 @@ def execute_model(
         output.outputs = output.outputs[:real_batch_size]
         htorch.core.mark_step()

-        if self.is_driver_worker:
+        if self.is_driver_worker and self.profiler.enabled:
             # Stop recording 'execute_model' event
             self.profiler.end()
             event_end = self.profiler.get_timestamp_us()
-            duration = event_end - event_start
-            throughput = batch_size_padded / (duration / 1e6)
-            throughput_effective = real_batch_size / (duration / 1e6)
-            counters = {
-                'batch_size': batch_size_padded,
-                'batch_size_effective': real_batch_size,
-                'throughput': throughput,
-                'throughput_effective': throughput_effective
-            }
+            counters = self.profiler_counter_helper.get_counter_dict(
+                cache_config=self.cache_config,
+                duration=event_end-event_start,
+                seq_len=seq_len,
+                batch_size_padded=batch_size_padded,
+                real_batch_size=real_batch_size,
+                seq_group_metadata_list=seq_group_metadata_list,
+                is_prompt=is_prompt)
             self.profiler.record_counter(event_start, counters)

         return output
@@ -1014,3 +1017,62 @@ def vocab_size(self) -> int:

 def _maybe_wrap_in_hpu_graph(model):
     return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model)
+
+
+class HabanaProfilerCounterHelper():
+    def __init__(self):
+        self.niter = 0
+        self.average_real_throughput = None
+        self.logged_once = False
+
+    def get_counter_dict(self, cache_config, duration, seq_len, batch_size_padded, real_batch_size, seq_group_metadata_list, is_prompt):
+        throughput = batch_size_padded / (duration / 1e6)
+        throughput_effective = real_batch_size / (duration / 1e6)
+        real_seq_lens = [len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()]
+        real_max_seq_len = max(real_seq_lens)
+        real_num_tokens = sum(real_seq_lens)
+        padded_num_tokens = batch_size_padded * seq_len
+        batch_token_utilization = real_num_tokens / padded_num_tokens
+        if self.average_real_throughput is None:
+            self.average_real_throughput = throughput_effective
+        else:  # https://www.heikohoffmann.de/htmlthesis/node134.html
+            self.average_real_throughput = self.average_real_throughput + 1/(self.niter+1) * (throughput_effective-self.average_real_throughput)
+        phase = "prompt" if is_prompt else "decode"
+        counters = {
+            f'{phase}_bucket_batch_size': batch_size_padded,
+            f'{phase}_batch_size': real_batch_size,
+            f'{phase}_bucket_seq_len': seq_len,
+            f'{phase}_seq_len': real_max_seq_len,
+            f'{phase}_bucket_gen_throughput': throughput,
+            f'{phase}_real_gen_throughput': throughput_effective,
+            f'{phase}_batch_token_utilization': batch_token_utilization,
+            'average_real_throughput': self.average_real_throughput,
+            'engine_iteration': self.niter,
+        }
+        self.niter += 1
+        if is_prompt:
+            prompt_seq_lens = [len(seq_data.prompt_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()]
+            prompt_bucket_in_throughput = (seq_len*batch_size_padded) / (duration / 1e6)
+            prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6)
+            counters[f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput
+            counters[f'{phase}_real_in_throughput'] = prompt_real_in_throughput
+
+        # KV cache might not be created yet (e.g. for profiling run)
+        if cache_config.num_gpu_blocks is not None and cache_config.num_gpu_blocks != 0:
+            cache_num_blocks_used = [math.ceil(sl/cache_config.block_size) for sl in real_seq_lens]
+            cache_total_num_blocks_used = sum(cache_num_blocks_used)
+            num_cache_blocks = cache_config.num_gpu_blocks
+            cache_total_num_free_blocks = num_cache_blocks - cache_total_num_blocks_used
+            cache_computed_utilization = cache_total_num_blocks_used / num_cache_blocks
+            max_blocks_per_seq = math.ceil(seq_len/cache_config.block_size)
+            batch_block_utilization = cache_total_num_blocks_used / (batch_size_padded * max_blocks_per_seq)
+            counters['cache_num_blocks_used'] = cache_total_num_blocks_used
+            counters['cache_num_free_blocks'] = cache_total_num_free_blocks
+            counters['cache_computed_utilization'] = cache_computed_utilization
+            counters[f'{phase}_batch_block_utilization'] = batch_block_utilization
+        if not self.logged_once:
+            counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks
+            counters['const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization
+            counters['const_block_size'] = cache_config.block_size
+            self.logged_once = True
+        return counters
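To make the new counters concrete, here is a minimal standalone sketch of how the HabanaProfilerCounterHelper defined above could be driven by hand, assuming the class is in scope. The SimpleNamespace stand-ins for the cache config and sequence-group metadata are illustrative assumptions that expose only the attributes get_counter_dict reads; in the runner itself, execute_model passes the real CacheConfig and SequenceGroupMetadata objects and forwards the returned dict to self.profiler.record_counter.

from types import SimpleNamespace

# Hypothetical stand-ins exposing only the attributes get_counter_dict reads;
# the real objects are vLLM's CacheConfig and SequenceGroupMetadata.
cache_config = SimpleNamespace(num_gpu_blocks=1024, block_size=128,
                               gpu_memory_utilization=0.9)
seq_data = SimpleNamespace(prompt_token_ids=list(range(100)),
                           output_token_ids=list(range(20)))
seq_group_metadata = SimpleNamespace(seq_data={0: seq_data})

helper = HabanaProfilerCounterHelper()
counters = helper.get_counter_dict(
    cache_config=cache_config,
    duration=15_000,            # event duration in microseconds (15 ms)
    seq_len=128,                # padded (bucketed) sequence length
    batch_size_padded=4,        # bucketed batch size
    real_batch_size=1,          # actual number of sequence groups
    seq_group_metadata_list=[seq_group_metadata],
    is_prompt=False)            # decode phase

# With these inputs: decode_bucket_gen_throughput = 4 / 0.015 ~ 267 seq/s,
# decode_real_gen_throughput = 1 / 0.015 ~ 67 seq/s,
# decode_batch_token_utilization = 120 / (4 * 128) ~ 0.23,
# decode_batch_block_utilization = 1 / (4 * 1) = 0.25.
for name, value in sorted(counters.items()):
    print(f'{name}: {value}')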

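The average_real_throughput counter uses the incremental mean recurrence cited in the code above (avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n), so no history of per-iteration throughputs has to be stored. A short sketch with made-up samples, checking that the recurrence matches the plain arithmetic mean:

# Incremental mean, mirroring the update applied to self.average_real_throughput.
# The throughput samples below are made up for illustration.
samples = [120.0, 135.0, 128.0, 142.0]

running = None
for niter, x in enumerate(samples):  # niter plays the role of self.niter
    if running is None:
        running = x
    else:
        running = running + 1 / (niter + 1) * (x - running)

# The recurrence reproduces the plain arithmetic mean without storing history.
assert abs(running - sum(samples) / len(samples)) < 1e-9
print(running)  # ~131.25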