diff --git a/OAI/types/model.py b/OAI/types/model.py
index c0ddd5df..22db757e 100644
--- a/OAI/types/model.py
+++ b/OAI/types/model.py
@@ -70,7 +70,7 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
-    gpu_split_auto: Optional[bool] = True
+    gpu_split_auto: Optional[bool] = False
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 2088121c..615c3e1d 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -104,7 +104,7 @@ def progress(loaded_modules: int, total_modules: int,
 
         self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
         self.gpu_split = kwargs.get("gpu_split")
-        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False)
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -347,16 +347,22 @@ def progress(loaded_modules: int, total_modules: int)
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
         self.draft_model.forward(input_ids, cache=self.cache, preprocess_only=True)
 
-        # Load model
         self.model = ExLlamaV2(self.config)
         if not self.quiet:
             logger.info("Loading model: " + self.config.model_dir)
 
+        # Load model with manual split
+        # Entrypoint for single GPU users
         if not self.gpu_split_auto:
+            logger.info(
+                "Loading with a manual GPU split (or a one GPU setup)"
+            )
+
             for value in self.model.load_gen(
-                self.gpu_split, callback_gen=progress_callback
+                self.gpu_split,
+                callback_gen=progress_callback,
             ):
-                if isinstance(value, str):
+                if value:
                     yield value
 
         batch_size = 2 if self.use_cfg else 1
@@ -369,14 +375,19 @@ def progress(loaded_modules: int, total_modules: int)
             self.model, lazy=self.gpu_split_auto, batch_size=batch_size
         )
 
+        # Load model with autosplit
         if self.gpu_split_auto:
+            logger.info("Loading with autosplit")
+
             reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
-            yield from self.model.load_autosplit_gen(
+            for value in self.model.load_autosplit_gen(
                 self.cache,
                 reserve_vram=reserve,
                 last_id_only=True,
                 callback_gen=progress_callback,
-            )
+            ):
+                if value:
+                    yield value
 
         # Test VRAM allocation with a full-length forward pass
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
@@ -395,6 +406,11 @@ def progress(loaded_modules: int, total_modules: int)
         self.generator.return_probabilities = True
         self.generator.return_logits = True
 
+        # Clean up any extra vram usage from torch and cuda
+        # (Helps reduce VRAM bottlenecking on Windows)
+        gc.collect()
+        torch.cuda.empty_cache()
+
         logger.info("Model successfully loaded.")
 
     def unload(self, loras_only: bool = False):
diff --git a/config_sample.yml b/config_sample.yml
index 3e6f60f3..ff357540 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -68,8 +68,9 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)
   #override_base_seq_len:
 
-  # Automatically allocate resources to GPUs (default: True)
-  #gpu_split_auto: True
+  # Automatically allocate resources to GPUs (default: False)
+  # WARNING: Will use more VRAM for single GPU users
+  #gpu_split_auto: False
 
   # An integer array of GBs of vram to split between GPUs (default: [])
   #gpu_split: [20.6, 24]
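
With this change gpu_split_auto now defaults to False, so multi-GPU setups that relied on the old automatic allocation must opt back in explicitly. A minimal config.yml sketch of the two options (keys and example GB values taken from config_sample.yml above; the split values are placeholders to adjust for your own cards):

model:
  # Opt back in to automatic VRAM allocation across GPUs (the old default behavior)
  gpu_split_auto: True

  # Or leave gpu_split_auto at False and give a manual per-GPU split in GB instead
  #gpu_split: [20.6, 24]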