Model: Auto-detect a one GPU setup and fix gpu_split_auto

It makes more sense to use the GPU split parameters only when the user
has more than one GPU. Otherwise, turn off gpu_split and gpu_split_auto
and save the user some VRAM.

Signed-off-by: kingbri <[email protected]>
kingbri1 committed Feb 7, 2024
1 parent 849179d commit c0ad647
Showing 3 changed files with 15 additions and 9 deletions.
OAI/types/model.py (2 changes: 1 addition & 1 deletion)
@@ -70,7 +70,7 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
-    gpu_split_auto: Optional[bool] = False
+    gpu_split_auto: Optional[bool] = True
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
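With gpu_split_auto now defaulting to True, a load request that omits the field opts into automatic splitting. A minimal sketch of the effect on API callers, assuming only the two fields visible in this diff (the "name" value below is a placeholder, and any other required fields of ModelLoadRequest are omitted):

from OAI.types.model import ModelLoadRequest

# Sketch only: supply whatever other required fields the real schema has.
req = ModelLoadRequest(name="MyModel-exl2")  # "name" is a hypothetical example
assert req.gpu_split_auto is True   # was False before this commit
assert req.gpu_split == []          # unchanged: still defaults to an empty list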
backends/exllamav2/model.py (15 changes: 10 additions & 5 deletions)
@@ -103,8 +103,15 @@ def progress(loaded_modules: int, total_modules: int,
         self.quiet = quiet

         self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
-        self.gpu_split = kwargs.get("gpu_split")
-        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False)
+
+        # Turn off GPU split if the user is using 1 GPU
+        gpu_count = torch.cuda.device_count()
+        if gpu_count > 1:
+            self.gpu_split = kwargs.get("gpu_split")
+            self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        else:
+            self.gpu_split_auto = False
+            logger.info("Disabling GPU split because one GPU is in use.")

         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -354,9 +361,7 @@ def progress(loaded_modules: int, total_modules: int)
         # Load model with manual split
         # Entrypoint for single GPU users
         if not self.gpu_split_auto:
-            logger.info(
-                "Loading with a manual GPU split (or a one GPU setup)"
-            )
+            logger.info("Loading with a manual GPU split (or a one GPU setup)")

             for value in self.model.load_gen(
                 self.gpu_split,
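Restated outside the class, the decision introduced above boils down to the following sketch (illustrative only; resolve_gpu_split is not a function in the repo):

import torch

def resolve_gpu_split(kwargs):
    """Honor split settings only when more than one CUDA device is visible."""
    if torch.cuda.device_count() > 1:
        gpu_split_auto = kwargs.get("gpu_split_auto")
        # Mirror unwrap(): a missing/None value falls back to the new default, True
        return kwargs.get("gpu_split"), True if gpu_split_auto is None else gpu_split_auto
    # Single-GPU setup: ignore both settings and load without a split
    return None, False

Since gpu_split_auto is forced to False on a single GPU, those users now go through the manual-split entrypoint shown in the second hunk.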
config_sample.yml (7 changes: 4 additions & 3 deletions)
@@ -68,11 +68,12 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)
   #override_base_seq_len:

-  # Automatically allocate resources to GPUs (default: False)
-  # WARNING: Will use more VRAM for single GPU users
-  #gpu_split_auto: False
+  # Automatically allocate resources to GPUs (default: True)
+  # NOTE: Not parsed for single GPU users
+  #gpu_split_auto: True

   # An integer array of GBs of vram to split between GPUs (default: [])
+  # NOTE: Not parsed for single GPU users
   #gpu_split: [20.6, 24]

   # Rope scale (default: 1.0)
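For reference, this is roughly how the two keys travel from the config file into the loader kwargs consumed above (a sketch assuming PyYAML and a config.yml copied from config_sample.yml; tabbyAPI's actual config plumbing may differ):

import yaml

with open("config.yml") as f:
    config = yaml.safe_load(f) or {}

model_cfg = config.get("model") or {}
loader_kwargs = {
    # A missing key passes None through; the loader then falls back to True
    "gpu_split_auto": model_cfg.get("gpu_split_auto"),
    "gpu_split": model_cfg.get("gpu_split"),
}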
