Add new Prometheus metric: Model weight GPU memory usage #15

Merged · 4 commits · Jul 26, 2024

Changes from all commits
8 changes: 6 additions & 2 deletions src/happy_vllm/model/model_base.py
@@ -24,17 +24,19 @@
import logging
from pathlib import Path
from argparse import Namespace
from prometheus_client import Gauge
from transformers import AutoTokenizer
from typing import Any, Tuple, Union, List
from vllm.entrypoints.logger import RequestLogger
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.transformers_utils.tokenizer_group.tokenizer_group import TokenizerGroup
from lmformatenforcer.integrations.transformers import build_token_enforcer_tokenizer_data

from happy_vllm import utils
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat


logger = logging.getLogger(__name__)

@@ -78,6 +80,8 @@ async def _load_model(self, args: Namespace, **kwargs) -> None:
if args.model_name != "TEST MODEL":
engine_args = AsyncEngineArgs.from_cli_args(args)
self._model = AsyncLLMEngine.from_engine_args(engine_args) # type: ignore
model_consumed_memory = Gauge("model_memory_usage", "Model Consumed GPU Memory in GB ")
model_consumed_memory.set(round(self._model.engine.model_executor.driver_worker.model_runner.model_memory_usage/float(2**30),2)) # type: ignore
if isinstance(self._model.engine.tokenizer, TokenizerGroup): # type: ignore
self._tokenizer = self._model.engine.tokenizer.tokenizer # type: ignore
else:
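
Below is a minimal, self-contained sketch of the Prometheus pattern this diff applies: register a Gauge once at startup and set it to the model's weight memory in GiB. The metric name and unit conversion match the PR; the memory value and the standalone HTTP server are assumptions for illustration only, since in happy_vllm the value comes from vLLM's internal model runner (self._model.engine.model_executor.driver_worker.model_runner.model_memory_usage) and the gauge is exposed through the server's existing metrics endpoint rather than a separate port.

from prometheus_client import Gauge, start_http_server
import time

# Register the gauge once at startup; registering the same metric name twice
# in the default registry raises a "Duplicated timeseries" error.
model_consumed_memory = Gauge("model_memory_usage", "Model Consumed GPU Memory in GB")

def report_model_memory(model_memory_usage_bytes: int) -> None:
    # Convert bytes to GiB (2**30 bytes) and round to two decimals, as the PR does.
    model_consumed_memory.set(round(model_memory_usage_bytes / float(2**30), 2))

if __name__ == "__main__":
    start_http_server(8000)            # serve /metrics on port 8000 in a background thread
    report_model_memory(15 * 2**30)    # placeholder value: pretend the weights use 15 GiB
    time.sleep(60)                     # keep the process alive so the endpoint can be scraped
    # curl http://localhost:8000/metrics now shows, among the default metrics:
    # model_memory_usage 15.0

Once the gauge is set, a scrape of the metrics endpoint returns a model_memory_usage sample that dashboards or alerting rules can track alongside vLLM's built-in metrics.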