Model: Auto-detect a one GPU setup and fix gpu_split_auto

It makes more sense to use the GPU split parameters only when the user
has more than one GPU. Otherwise, turn off gpu_split and gpu_split_auto
and save the user some VRAM.

Signed-off-by: kingbri <[email protected]>
kingbri1 committed Feb 7, 2024
1 parent 849179d commit c0ad647
Showing 3 changed files with 15 additions and 9 deletions.
OAI/types/model.py (2 changes: 1 addition & 1 deletion)
@@ -70,7 +70,7 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
-    gpu_split_auto: Optional[bool] = False
+    gpu_split_auto: Optional[bool] = True
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
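With gpu_split_auto now defaulting to True, a load request that omits the field opts into automatic splitting. A minimal sketch of the effect on API callers, assuming only the two fields visible in this diff (the "name" value below is a placeholder, and any other required fields of ModelLoadRequest are omitted):

from OAI.types.model import ModelLoadRequest

# Sketch only: supply whatever other required fields the real schema has.
req = ModelLoadRequest(name="MyModel-exl2")  # "name" is a hypothetical example
assert req.gpu_split_auto is True   # was False before this commit
assert req.gpu_split == []          # unchanged: still defaults to an empty list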
backends/exllamav2/model.py (15 changes: 10 additions & 5 deletions)
@@ -103,8 +103,15 @@ def progress(loaded_modules: int, total_modules: int,
         self.quiet = quiet

         self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
-        self.gpu_split = kwargs.get("gpu_split")
-        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False)
+
+        # Turn off GPU split if the user is using 1 GPU
+        gpu_count = torch.cuda.device_count()
+        if gpu_count > 1:
+            self.gpu_split = kwargs.get("gpu_split")
+            self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        else:
+            self.gpu_split_auto = False
+            logger.info("Disabling GPU split because one GPU is in use.")

         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -354,9 +361,7 @@ def progress(loaded_modules: int, total_modules: int)
         # Load model with manual split
         # Entrypoint for single GPU users
         if not self.gpu_split_auto:
-            logger.info(
-                "Loading with a manual GPU split (or a one GPU setup)"
-            )
+            logger.info("Loading with a manual GPU split (or a one GPU setup)")

             for value in self.model.load_gen(
                 self.gpu_split,
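Restated outside the class, the decision introduced above boils down to the following sketch (illustrative only; resolve_gpu_split is not a function in the repo):

import torch

def resolve_gpu_split(kwargs):
    """Honor split settings only when more than one CUDA device is visible."""
    if torch.cuda.device_count() > 1:
        gpu_split_auto = kwargs.get("gpu_split_auto")
        # Mirror unwrap(): a missing/None value falls back to the new default, True
        return kwargs.get("gpu_split"), True if gpu_split_auto is None else gpu_split_auto
    # Single-GPU setup: ignore both settings and load without a split
    return None, False

Since gpu_split_auto is forced to False on a single GPU, those users now go through the manual-split entrypoint shown in the second hunk.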
config_sample.yml (7 changes: 4 additions & 3 deletions)
@@ -68,11 +68,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)
   #override_base_seq_len:

-  # Automatically allocate resources to GPUs (default: False)
-  # WARNING: Will use more VRAM for single GPU users
-  #gpu_split_auto: False
+  # Automatically allocate resources to GPUs (default: True)
+  # NOTE: Not parsed for single GPU users
+  #gpu_split_auto: True

   # An integer array of GBs of vram to split between GPUs (default: [])
+  # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]

   # Rope scale (default: 1.0)
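For reference, this is roughly how the two keys travel from the config file into the loader kwargs consumed above (a sketch assuming PyYAML and a config.yml copied from config_sample.yml; tabbyAPI's actual config plumbing may differ):

import yaml

with open("config.yml") as f:
    config = yaml.safe_load(f) or {}

model_cfg = config.get("model") or {}
loader_kwargs = {
    # A missing key passes None through; the loader then falls back to True
    "gpu_split_auto": model_cfg.get("gpu_split_auto"),
    "gpu_split": model_cfg.get("gpu_split"),
}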
