diff --git a/OAI/types/model.py b/OAI/types/model.py
index c0ddd5df..22db757e 100644
--- a/OAI/types/model.py
+++ b/OAI/types/model.py
@@ -70,7 +70,7 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
-    gpu_split_auto: Optional[bool] = True
+    gpu_split_auto: Optional[bool] = False
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 2088121c..615c3e1d 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -104,7 +104,7 @@ def progress(loaded_modules: int, total_modules: int,
 
         self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
         self.gpu_split = kwargs.get("gpu_split")
-        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False)
 
         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -347,16 +347,22 @@ def progress(loaded_modules: int, total_modules: int)
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
         self.draft_model.forward(input_ids, cache=self.cache, preprocess_only=True)
 
-        # Load model
         self.model = ExLlamaV2(self.config)
         if not self.quiet:
             logger.info("Loading model: " + self.config.model_dir)
 
+        # Load model with manual split
+        # Entrypoint for single GPU users
         if not self.gpu_split_auto:
+            logger.info(
+                "Loading with a manual GPU split (or a one GPU setup)"
+            )
+
             for value in self.model.load_gen(
-                self.gpu_split, callback_gen=progress_callback
+                self.gpu_split,
+                callback_gen=progress_callback,
             ):
-                if isinstance(value, str):
+                if value:
                     yield value
 
         batch_size = 2 if self.use_cfg else 1
@@ -369,14 +375,19 @@ def progress(loaded_modules: int, total_modules: int)
             self.model, lazy=self.gpu_split_auto, batch_size=batch_size
         )
 
+        # Load model with autosplit
         if self.gpu_split_auto:
+            logger.info("Loading with autosplit")
+
             reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
-            yield from self.model.load_autosplit_gen(
+            for value in self.model.load_autosplit_gen(
                 self.cache,
                 reserve_vram=reserve,
                 last_id_only=True,
                 callback_gen=progress_callback,
-            )
+            ):
+                if value:
+                    yield value
 
         # Test VRAM allocation with a full-length forward pass
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
@@ -395,6 +406,11 @@ def progress(loaded_modules: int, total_modules: int)
         self.generator.return_probabilities = True
         self.generator.return_logits = True
 
+        # Clean up any extra vram usage from torch and cuda
+        # (Helps reduce VRAM bottlenecking on Windows)
+        gc.collect()
+        torch.cuda.empty_cache()
+
         logger.info("Model successfully loaded.")
 
     def unload(self, loras_only: bool = False):
diff --git a/config_sample.yml b/config_sample.yml
index 3e6f60f3..ff357540 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -68,8 +68,9 @@ model:
   # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)
   #override_base_seq_len:
 
-  # Automatically allocate resources to GPUs (default: True)
-  #gpu_split_auto: True
+  # Automatically allocate resources to GPUs (default: False)
+  # WARNING: Will use more VRAM for single GPU users
+  #gpu_split_auto: False
 
   # An integer array of GBs of vram to split between GPUs (default: [])
   #gpu_split: [20.6, 24]
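
With this change gpu_split_auto now defaults to False, so multi-GPU setups that relied on the old automatic allocation must opt back in explicitly. A minimal config.yml sketch of the two options (keys and example GB values taken from config_sample.yml above; the split values are placeholders to adjust for your own cards):

model:
  # Opt back in to automatic VRAM allocation across GPUs (the old default behavior)
  gpu_split_auto: True

  # Or leave gpu_split_auto at False and give a manual per-GPU split in GB instead
  #gpu_split: [20.6, 24]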