Config: Expose auto GPU split reserve config
The GPU reserve is used as a VRAM buffer to prevent GPU overflow
when automatically deciding how to load a model on multiple GPUs.
Make this configurable.

Signed-off-by: kingbri <[email protected]>
bdashore3 committed Feb 9, 2024
1 parent 43bba52 commit 2f568ff
Showing 3 changed files with 26 additions and 10 deletions.
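The new `autosplit_reserve` option takes a list of per-GPU reserve sizes in megabytes. The container converts each entry to bytes and pads the list with zeros so it covers every visible GPU before passing it to the autosplit loader as `reserve_vram`. Below is a minimal, self-contained sketch of that conversion; the helper name and the example GPU counts are illustrative and not part of the commit.

from typing import List

def build_autosplit_reserve(reserve_mb: List[float], gpu_count: int) -> List[float]:
    """Convert per-GPU reserve values from MB to bytes, then pad with zeros up to gpu_count."""
    reserve_bytes = [value * 1024**2 for value in reserve_mb]
    return reserve_bytes + [0] * (gpu_count - len(reserve_bytes))

# Default config on a three-GPU machine: only GPU 0 keeps the 96 MB buffer.
print(build_autosplit_reserve([96], gpu_count=3))       # [100663296, 0, 0]

# Reserving 512 MB on GPU 0 and 96 MB on GPU 1.
print(build_autosplit_reserve([512, 96], gpu_count=3))  # [536870912, 100663296, 0]

With the old hard-coded constant, only the first device ever received a buffer; the list form lets each GPU carry its own reserve.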
1 change: 1 addition & 0 deletions OAI/types/model.py
@@ -71,6 +71,7 @@ class ModelLoadRequest(BaseModel):
examples=[4096],
)
gpu_split_auto: Optional[bool] = True
autosplit_reserve: Optional[List[float]] = [96]
gpu_split: Optional[List[float]] = Field(
default_factory=list, examples=[[24.0, 20.0]]
)
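For API clients, the new field rides alongside the existing GPU split options in the model load request and defaults to [96]. A hedged sketch of a load call using it follows; the endpoint path, port, and the "name" field are assumptions for illustration and do not appear in this hunk.

import requests

# Hypothetical load payload; only gpu_split_auto and autosplit_reserve come from this commit.
payload = {
    "name": "my-exl2-model",
    "gpu_split_auto": True,
    "autosplit_reserve": [512, 96],  # MB reserved on GPU 0 and GPU 1 during autosplit
}
requests.post("http://localhost:5000/v1/model/load", json=payload)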
31 changes: 21 additions & 10 deletions backends/exllamav2/model.py
@@ -28,13 +28,11 @@

logger = init_logger(__name__)

# Bytes to reserve on first device when loading with auto split
AUTO_SPLIT_RESERVE_BYTES = 96 * 1024**2


class ExllamaV2Container:
"""The model container class for ExLlamaV2 models."""

# Exl2 vars
config: Optional[ExLlamaV2Config] = None
draft_config: Optional[ExLlamaV2Config] = None
model: Optional[ExLlamaV2] = None
@@ -44,13 +42,16 @@ class ExllamaV2Container:
tokenizer: Optional[ExLlamaV2Tokenizer] = None
generator: Optional[ExLlamaV2StreamingGenerator] = None
prompt_template: Optional[PromptTemplate] = None
active_loras: List[ExLlamaV2Lora] = []

# Internal config vars
cache_fp8: bool = False
gpu_split_auto: bool = True
gpu_split: Optional[list] = None
use_cfg: bool = False

active_loras: List[ExLlamaV2Lora] = []
# GPU split vars
gpu_split: Optional[list] = None
gpu_split_auto: bool = True
autosplit_reserve: List[float] = [96 * 1024**2]

def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
"""
@@ -109,7 +110,13 @@ def progress(loaded_modules: int, total_modules: int,
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
self.gpu_split = kwargs.get("gpu_split")

# Auto GPU split parameters
self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
self.autosplit_reserve = list(
map(lambda value: value * 1024**2, autosplit_reserve_megabytes)
)
else:
self.gpu_split_auto = False
logger.info("Disabling GPU split because one GPU is in use.")
@@ -336,17 +343,22 @@ def progress(loaded_modules: int, total_modules: int)
# Load tokenizer
self.tokenizer = ExLlamaV2Tokenizer(self.config)

# Calculate autosplit reserve for all GPUs
gpu_count = torch.cuda.device_count()
autosplit_reserve = self.autosplit_reserve + [0] * (
gpu_count - len(self.autosplit_reserve)
)

# Load draft model if a config is present
if self.draft_config:
self.draft_model = ExLlamaV2(self.draft_config)
if not self.quiet:
logger.info("Loading draft model: " + self.draft_config.model_dir)

self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
yield from self.draft_model.load_autosplit_gen(
self.draft_cache,
reserve_vram=reserve,
reserve_vram=autosplit_reserve,
last_id_only=True,
callback_gen=progress_callback,
)
@@ -385,10 +397,9 @@ def progress(loaded_modules: int, total_modules: int)
if self.gpu_split_auto:
logger.info("Loading with autosplit")

reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
for value in self.model.load_autosplit_gen(
self.cache,
reserve_vram=reserve,
reserve_vram=autosplit_reserve,
last_id_only=True,
callback_gen=progress_callback,
):
4 changes: 4 additions & 0 deletions config_sample.yml
@@ -76,6 +76,10 @@ model:
# NOTE: Not parsed for single GPU users
#gpu_split_auto: True

# Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0)
# This is represented as an array of MB per GPU used
#autosplit_reserve: [96]

# An integer array of GBs of vram to split between GPUs (default: [])
# NOTE: Not parsed for single GPU users
#gpu_split: [20.6, 24]
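The same keys flow from the parsed config into the container as keyword arguments, where the megabyte values are converted to bytes as shown in backends/exllamav2/model.py above. A sketch of constructing the container programmatically, assuming the module path mirrors the file layout and using a placeholder model directory; the actual load happens in the container's load routine, which is not shown here.

import pathlib

from backends.exllamav2.model import ExllamaV2Container  # module path assumed from the file layout

# Placeholder path; a real EXL2 model directory is required for an actual load.
model_dir = pathlib.Path("/models/my-exl2-model")

container = ExllamaV2Container(
    model_dir,
    gpu_split_auto=True,          # let exllamav2 decide the per-GPU split
    autosplit_reserve=[512, 96],  # MB held back on GPU 0 and GPU 1 before splitting
)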
