diff --git a/OAI/types/model.py b/OAI/types/model.py
index c0ddd5df..71b8ab3d 100644
--- a/OAI/types/model.py
+++ b/OAI/types/model.py
@@ -71,6 +71,7 @@ class ModelLoadRequest(BaseModel):
         examples=[4096],
     )
     gpu_split_auto: Optional[bool] = True
+    autosplit_reserve: Optional[List[float]] = [96]
     gpu_split: Optional[List[float]] = Field(
         default_factory=list, examples=[[24.0, 20.0]]
     )
diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 74a02f69..5d1fc1a7 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -28,13 +28,11 @@
 logger = init_logger(__name__)
 
-# Bytes to reserve on first device when loading with auto split
-AUTO_SPLIT_RESERVE_BYTES = 96 * 1024**2
-
 
 class ExllamaV2Container:
     """The model container class for ExLlamaV2 models."""
 
+    # Exl2 vars
     config: Optional[ExLlamaV2Config] = None
     draft_config: Optional[ExLlamaV2Config] = None
     model: Optional[ExLlamaV2] = None
@@ -44,13 +42,16 @@ class ExllamaV2Container:
     tokenizer: Optional[ExLlamaV2Tokenizer] = None
     generator: Optional[ExLlamaV2StreamingGenerator] = None
     prompt_template: Optional[PromptTemplate] = None
+    active_loras: List[ExLlamaV2Lora] = []
 
+    # Internal config vars
     cache_fp8: bool = False
-    gpu_split_auto: bool = True
-    gpu_split: Optional[list] = None
     use_cfg: bool = False
-    active_loras: List[ExLlamaV2Lora] = []
 
+    # GPU split vars
+    gpu_split: Optional[list] = None
+    gpu_split_auto: bool = True
+    autosplit_reserve: List[float] = [96 * 1024**2]
 
     def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
         """
@@ -109,7 +110,13 @@ def progress(loaded_modules: int, total_modules: int,
         gpu_count = torch.cuda.device_count()
         if gpu_count > 1:
             self.gpu_split = kwargs.get("gpu_split")
+
+            # Auto GPU split parameters
             self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+            autosplit_reserve_megabytes = unwrap(kwargs.get("autosplit_reserve"), [96])
+            self.autosplit_reserve = list(
+                map(lambda value: value * 1024**2, autosplit_reserve_megabytes)
+            )
         else:
             self.gpu_split_auto = False
             logger.info("Disabling GPU split because one GPU is in use.")
@@ -336,6 +343,12 @@ def progress(loaded_modules: int, total_modules: int)
         # Load tokenizer
         self.tokenizer = ExLlamaV2Tokenizer(self.config)
 
+        # Calculate autosplit reserve for all GPUs
+        gpu_count = torch.cuda.device_count()
+        autosplit_reserve = self.autosplit_reserve + [0] * (
+            gpu_count - len(self.autosplit_reserve)
+        )
+
         # Load draft model if a config is present
         if self.draft_config:
             self.draft_model = ExLlamaV2(self.draft_config)
@@ -343,10 +356,9 @@ def progress(loaded_modules: int, total_modules: int)
             logger.info("Loading draft model: " + self.draft_config.model_dir)
 
             self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy=True)
-            reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
             yield from self.draft_model.load_autosplit_gen(
                 self.draft_cache,
-                reserve_vram=reserve,
+                reserve_vram=autosplit_reserve,
                 last_id_only=True,
                 callback_gen=progress_callback,
             )
@@ -385,10 +397,9 @@ def progress(loaded_modules: int, total_modules: int)
         if self.gpu_split_auto:
             logger.info("Loading with autosplit")
 
-            reserve = [AUTO_SPLIT_RESERVE_BYTES] + [0] * 16
             for value in self.model.load_autosplit_gen(
                 self.cache,
-                reserve_vram=reserve,
+                reserve_vram=autosplit_reserve,
                 last_id_only=True,
                 callback_gen=progress_callback,
             ):
diff --git a/config_sample.yml b/config_sample.yml
index 23b71917..a8750b68 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -76,6 +76,10 @@ model:
   # NOTE: Not parsed for single GPU users
   #gpu_split_auto: True
 
+  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0)
+  # This is represented as an array of MB per GPU used
+  #autosplit_reserve: [96]
+
  # An integer array of GBs of vram to split between GPUs (default: [])
  # NOTE: Not parsed for single GPU users
  #gpu_split: [20.6, 24]
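For reviewers, here is a minimal sketch (not part of the patch) of what the new `autosplit_reserve` option does end to end: the per-GPU megabyte values from the config are converted to bytes, the list is padded with zeros so every detected GPU has an entry, and the result is passed as `reserve_vram` to ExLlamaV2's `load_autosplit_gen`, replacing the old fixed 96 MB reserve on GPU 0 only. The standalone helper name `build_reserve_list` below is hypothetical; it just mirrors the logic in `ExllamaV2Container`.

```python
import torch


def build_reserve_list(autosplit_reserve_mb: list) -> list:
    """Hypothetical helper mirroring the patch's logic: convert per-GPU MB
    values to bytes and pad with zeros so each visible GPU has an entry."""
    # MB -> bytes, matching the patch's `value * 1024**2` mapping
    reserve_bytes = [int(value * 1024**2) for value in autosplit_reserve_mb]

    # Pad to the number of detected GPUs; extra devices reserve nothing
    gpu_count = torch.cuda.device_count()
    reserve_bytes += [0] * (gpu_count - len(reserve_bytes))
    return reserve_bytes


if __name__ == "__main__":
    # e.g. `autosplit_reserve: [96]` on a 2-GPU host -> [100663296, 0]
    print(build_reserve_list([96]))
```

Each entry in the resulting list is the number of bytes held back on the corresponding device while the autosplit loader fills GPUs in order.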