Config: Expose auto GPU split reserve config
The GPU reserve is used as a VRAM buffer to prevent GPU overflow
when automatically deciding how to load a model on multiple GPUs.
Make this configurable.

Signed-off-by: kingbri <[email protected]>
bdashore3 committed Feb 9, 2024
1 parent 43bba52 commit 2f568ff
Showing 3 changed files with 26 additions and 10 deletions.
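The new `autosplit_reserve` option takes a list of per-GPU reserve sizes in megabytes. The container converts each entry to bytes and pads the list with zeros so it covers every visible GPU before passing it to the autosplit loader as `reserve_vram`. Below is a minimal, self-contained sketch of that conversion; the helper name and the example GPU counts are illustrative and not part of the commit.

from typing import List

def build_autosplit_reserve(reserve_mb: List[float], gpu_count: int) -> List[float]:
    """Convert per-GPU reserve values from MB to bytes, then pad with zeros up to gpu_count."""
    reserve_bytes = [value * 1024**2 for value in reserve_mb]
    return reserve_bytes + [0] * (gpu_count - len(reserve_bytes))

# Default config on a three-GPU machine: only GPU 0 keeps the 96 MB buffer.
print(build_autosplit_reserve([96], gpu_count=3))       # [100663296, 0, 0]

# Reserving 512 MB on GPU 0 and 96 MB on GPU 1.
print(build_autosplit_reserve([512, 96], gpu_count=3))  # [536870912, 100663296, 0]

With the old hard-coded constant, only the first device ever received a buffer; the list form lets each GPU carry its own reserve.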
1 change: 1 addition & 0 deletions OAI/types/model.py
@@ -71,6 +71,7 @@ class ModelLoadRequest(BaseModel):
examples=[4096],
)
gpu_split_auto: Optional[bool] = True
autosplit_reserve: Optional[List[float]] = [96]
gpu_split: Optional[List[float]] = Field(
default_factory=list, examples=[[24.0, 20.0]]
)
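For API clients, the new field rides alongside the existing GPU split options in the model load request and defaults to [96]. A hedged sketch of a load call using it follows; the endpoint path, port, and the "name" field are assumptions for illustration and do not appear in this hunk.

import requests

# Hypothetical load payload; only gpu_split_auto and autosplit_reserve come from this commit.
payload = {
    "name": "my-exl2-model",
    "gpu_split_auto": True,
    "autosplit_reserve": [512, 96],  # MB reserved on GPU 0 and GPU 1 during autosplit
}
requests.post("http://localhost:5000/v1/model/load", json=payload)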
31 changes: 21 additions & 10 deletions backends/exllamav2/model.py
@@ -28,13 +28,11 @@

logger = init_logger(__name__)

# Bytes to reserve on first device when loading with auto split
AUTO_SPLIT_RESERVE_BYTES = 96 * 1024**2


class ExllamaV2Container:
"""The model container class for ExLlamaV2 models."""

# Exl2 vars
config: Optional[ExLlamaV2Config] = None
draft_config: Optional[ExLlamaV2Config] = None
model: Optional[ExLlamaV2] = None
@@ -44,13 +42,16 @@ class ExllamaV2Container:
tokenizer: Optional[ExLlamaV2Tokenizer] = None
generator: Optional[ExLlamaV2StreamingGenerator] = None
prompt_template: Optional[PromptTemplate] = None
active_loras: List[ExLlamaV2Lora] = []

# Internal config vars
cache_fp8: bool = False
gpu_split_auto: bool = True
gpu_split: Optional[list] = None
use_cfg: bool = False

active_loras: List[ExLlamaV2Lora] = []
# GPU split vars
gpu_split: Optional[list] = None
gpu_split_auto: bool = True
autosplit_reserve: List[float] = [96 * 1024**2]

def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
"""
@@ -109,7 +110,13 @@ def progress(loaded_modules: int, total_modules: int,
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
self.gpu_split = kwargs.get("gpu_split")

# Auto GPU split parameters
self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
self.autosplit_reserve = list(
map(lambda value: value * 1024**2, autosplit_reserve_megabytes)
)
else:
self.gpu_split_auto = False
logger.info("Disabling GPU split because one GPU is in use.")
@@ -336,17 +343,22 @@ def progress(loaded_modules: int, total_modules: int)
# Load tokenizer
self.tokenizer = ExLlamaV2Tokenizer(self.config)

# Calculate autosplit reserve for all GPUs
gpu_count = torch.cuda.device_count()
autosplit_reserve = self.autosplit_reserve + [0] * (
gpu_count - len(self.autosplit_reserve)
)

# Load draft model if a config is present
if self.draft_config:
self.draft_model = ExLlamaV2(self.draft_config)
if not self.quiet:
logger.info("Loading draft model: " + self.draft_config.model_dir)

self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
yield from self.draft_model.load_autosplit_gen(
self.draft_cache,
reserve_vram=reserve,
reserve_vram=autosplit_reserve,
last_id_only=True,
callback_gen=progress_callback,
)
@@ -385,10 +397,9 @@ def progress(loaded_modules: int, total_modules: int)
if self.gpu_split_auto:
logger.info("Loading with autosplit")

reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
for value in self.model.load_autosplit_gen(
self.cache,
reserve_vram=reserve,
reserve_vram=autosplit_reserve,
last_id_only=True,
callback_gen=progress_callback,
):
4 changes: 4 additions & 0 deletions config_sample.yml
@@ -76,6 +76,10 @@ model:
# NOTE: Not parsed for single GPU users
#gpu_split_auto: True

# Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0)
# This is represented as an array of MB per GPU used
#autosplit_reserve: [96]

# An integer array of GBs of vram to split between GPUs (default: [])
# NOTE: Not parsed for single GPU users
#gpu_split: [20.6, 24]
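The same keys flow from the parsed config into the container as keyword arguments, where the megabyte values are converted to bytes as shown in backends/exllamav2/model.py above. A sketch of constructing the container programmatically, assuming the module path mirrors the file layout and using a placeholder model directory; the actual load happens in the container's load routine, which is not shown here.

import pathlib

from backends.exllamav2.model import ExllamaV2Container  # module path assumed from the file layout

# Placeholder path; a real EXL2 model directory is required for an actual load.
model_dir = pathlib.Path("/models/my-exl2-model")

container = ExllamaV2Container(
    model_dir,
    gpu_split_auto=True,          # let exllamav2 decide the per-GPU split
    autosplit_reserve=[512, 96],  # MB held back on GPU 0 and GPU 1 before splitting
)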
