diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 8381a8c..0f9ba3a 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -129,8 +129,27 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
         # Check if the model arch is compatible with various exl2 features
         self.config.arch_compat_overrides()
 
+        # Create the hf_config
+        self.hf_config = await HuggingFaceConfig.from_file(model_directory)
+
+        # Load generation config overrides
+        generation_config_path = model_directory / "generation_config.json"
+        if generation_config_path.exists():
+            try:
+                self.generation_config = await GenerationConfig.from_file(
+                    generation_config_path.parent
+                )
+            except Exception:
+                logger.error(traceback.format_exc())
+                logger.warning(
+                    "Skipping generation config load because of an unexpected error."
+                )
+
+        # Apply a model's config overrides while respecting user settings
+        kwargs = await self.set_model_overrides(**kwargs)
+
         # Prepare the draft model config if necessary
-        draft_args = unwrap(kwargs.get("draft"), {})
+        draft_args = unwrap(kwargs.get("draft_model"), {})
         draft_model_name = draft_args.get("draft_model_name")
         enable_draft = draft_args and draft_model_name
 
@@ -154,25 +173,6 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
             self.draft_config.model_dir = str(draft_model_path.resolve())
             self.draft_config.prepare()
 
-        # Create the hf_config
-        self.hf_config = await HuggingFaceConfig.from_file(model_directory)
-
-        # Load generation config overrides
-        generation_config_path = model_directory / "generation_config.json"
-        if generation_config_path.exists():
-            try:
-                self.generation_config = await GenerationConfig.from_file(
-                    generation_config_path.parent
-                )
-            except Exception:
-                logger.error(traceback.format_exc())
-                logger.warning(
-                    "Skipping generation config load because of an unexpected error."
-                )
-
-        # Apply a model's config overrides while respecting user settings
-        kwargs = await self.set_model_overrides(**kwargs)
-
         # MARK: User configuration
 
         # Get cache mode
@@ -251,9 +251,6 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
         else:
             self.config.scale_alpha_value = rope_alpha
 
-        # Enable fasttensors loading if present
-        self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
-
         # Set max batch size to the config override
         self.max_batch_size = unwrap(kwargs.get("max_batch_size"))
 
@@ -341,9 +338,6 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
 
         # Set user-configured draft model values
         if enable_draft:
-            # Fetch from the updated kwargs
-            draft_args = unwrap(kwargs.get("draft"), {})
-
             self.draft_config.max_seq_len = self.config.max_seq_len
 
             self.draft_config.scale_pos_emb = unwrap(
@@ -387,9 +381,12 @@ async def set_model_overrides(self, **kwargs):
             override_args = unwrap(yaml.load(contents), {})
 
             # Merge draft overrides beforehand
-            draft_override_args = unwrap(override_args.get("draft"), {})
-            if self.draft_config and draft_override_args:
-                kwargs["draft"] = {**draft_override_args, **kwargs.get("draft")}
+            draft_override_args = unwrap(override_args.get("draft_model"), {})
+            if draft_override_args:
+                kwargs["draft_model"] = {
+                    **draft_override_args,
+                    **unwrap(kwargs.get("draft_model"), {}),
+                }
 
             # Merge the override and model kwargs
             merged_kwargs = {**override_args, **kwargs}
diff --git a/backends/exllamav2/version.py b/backends/exllamav2/version.py
index cdaeaeb..3197599 100644
--- a/backends/exllamav2/version.py
+++ b/backends/exllamav2/version.py
@@ -25,7 +25,7 @@ def check_exllama_version():
     if not dependencies.exllamav2:
         raise SystemExit(("Exllamav2 is not installed.\n" + install_message))
 
-    required_version = version.parse("0.2.2")
+    required_version = version.parse("0.2.3")
     current_version = version.parse(package_version("exllamav2").split("+")[0])
 
     unsupported_message = (
diff --git a/common/config_models.py b/common/config_models.py
index 79d774f..a31d6a5 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -290,13 +290,6 @@ class ModelConfig(BaseConfigModel):
         ),
         ge=1,
     )
-    fasttensors: Optional[bool] = Field(
-        False,
-        description=(
-            "Enables fasttensors to possibly increase model loading speeds "
-            "(default: False)."
-        ),
-    )
 
     _metadata: Metadata = PrivateAttr(Metadata())
     model_config = ConfigDict(protected_namespaces=())
diff --git a/config_sample.yml b/config_sample.yml
index 507d7d5..771d7e6 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -135,9 +135,6 @@ model:
   # WARNING: Don't set this unless you know what you're doing!
   num_experts_per_token:
 
-  # Enables fasttensors to possibly increase model loading speeds (default: False).
-  fasttensors: false
-
 # Options for draft models (speculative decoding)
 # This will use more VRAM!
 draft_model:
diff --git a/endpoints/OAI/utils/completion.py b/endpoints/OAI/utils/completion.py
index f032999..e939525 100644
--- a/endpoints/OAI/utils/completion.py
+++ b/endpoints/OAI/utils/completion.py
@@ -148,8 +148,11 @@ async def load_inline_model(model_name: str, request: Request):
 
         return
 
-    # Load the model
-    await model.load_model(model_path)
+    # Load the model and also add draft dir
+    await model.load_model(
+        model_path,
+        draft_model=config.draft_model.model_dump(include={"draft_model_dir"}),
+    )
 
 
 async def stream_generate_completion(
diff --git a/endpoints/core/router.py b/endpoints/core/router.py
index 2c60cd7..f2b4247 100644
--- a/endpoints/core/router.py
+++ b/endpoints/core/router.py
@@ -123,7 +123,7 @@ async def load_model(data: ModelLoadRequest) -> ModelLoadResponse:
     """Loads a model into the model container. This returns an SSE stream."""
 
     # Verify request parameters
-    if not data.name:
+    if not data.model_name:
         error_message = handle_request_error(
             "A model name was not provided for load.",
             exc_info=False,
@@ -132,11 +132,11 @@ async def load_model(data: ModelLoadRequest) -> ModelLoadResponse:
         raise HTTPException(400, error_message)
 
     model_path = pathlib.Path(config.model.model_dir)
-    model_path = model_path / data.name
+    model_path = model_path / data.model_name
 
     draft_model_path = None
-    if data.draft:
-        if not data.draft.draft_model_name:
+    if data.draft_model:
+        if not data.draft_model.draft_model_name:
             error_message = handle_request_error(
                 "Could not find the draft model name for model load.",
                 exc_info=False,
@@ -301,7 +301,7 @@ async def load_embedding_model(
     request: Request, data: EmbeddingModelLoadRequest
 ) -> ModelLoadResponse:
     # Verify request parameters
-    if not data.name:
+    if not data.embedding_model_name:
         error_message = handle_request_error(
             "A model name was not provided for load.",
             exc_info=False,
@@ -310,7 +310,7 @@ async def load_embedding_model(
         raise HTTPException(400, error_message)
 
     embedding_model_dir = pathlib.Path(config.embeddings.embedding_model_dir)
-    embedding_model_path = embedding_model_dir / data.name
+    embedding_model_path = embedding_model_dir / data.embedding_model_name
 
     if not embedding_model_path.exists():
         error_message = handle_request_error(
@@ -441,7 +441,7 @@ async def list_templates(request: Request) -> TemplateList:
 async def switch_template(data: TemplateSwitchRequest):
     """Switch the currently loaded template."""
 
-    if not data.name:
+    if not data.prompt_template_name:
         error_message = handle_request_error(
             "New template name not found.",
             exc_info=False,
@@ -450,11 +450,12 @@ async def switch_template(data: TemplateSwitchRequest):
         raise HTTPException(400, error_message)
 
     try:
-        template_path = pathlib.Path("templates") / data.name
+        template_path = pathlib.Path("templates") / data.prompt_template_name
         model.container.prompt_template = await PromptTemplate.from_file(template_path)
     except FileNotFoundError as e:
         error_message = handle_request_error(
-            f"The template name {data.name} doesn't exist. Check the spelling?",
+            f"The template name {data.prompt_template_name} doesn't exist. "
+            + "Check the spelling?",
             exc_info=False,
         ).error.message
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index b169162..58ab0dc 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -1,6 +1,6 @@
 """Contains model card types."""
 
-from pydantic import BaseModel, Field, ConfigDict
+from pydantic import AliasChoices, BaseModel, Field, ConfigDict
 from time import time
 from typing import List, Literal, Optional, Union
 
@@ -48,7 +48,10 @@ class DraftModelLoadRequest(BaseModel):
     """Represents a draft model load request."""
 
     # Required
-    draft_model_name: str
+    draft_model_name: str = Field(
+        alias=AliasChoices("draft_model_name", "name"),
+        description="Aliases: name",
+    )
 
     # Config arguments
     draft_rope_scale: Optional[float] = None
@@ -63,8 +66,14 @@ class ModelLoadRequest(BaseModel):
     """Represents a model load request."""
 
+    # Avoids pydantic namespace warning
+    model_config = ConfigDict(protected_namespaces=[])
+
     # Required
-    name: str
+    model_name: str = Field(
+        alias=AliasChoices("model_name", "name"),
+        description="Aliases: name",
+    )
 
     # Config arguments
@@ -106,15 +115,20 @@ class ModelLoadRequest(BaseModel):
     chunk_size: Optional[int] = None
     prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
-    fasttensors: Optional[bool] = None
 
     # Non-config arguments
-    draft: Optional[DraftModelLoadRequest] = None
+    draft_model: Optional[DraftModelLoadRequest] = Field(
+        default=None,
+        alias=AliasChoices("draft_model", "draft"),
+    )
     skip_queue: Optional[bool] = False
 
 
 class EmbeddingModelLoadRequest(BaseModel):
-    name: str
+    embedding_model_name: str = Field(
+        alias=AliasChoices("embedding_model_name", "name"),
+        description="Aliases: name",
+    )
 
     # Set default from the config
     embeddings_device: Optional[str] = Field(config.embeddings.embeddings_device)
diff --git a/endpoints/core/types/template.py b/endpoints/core/types/template.py
index d72d621..010c9db 100644
--- a/endpoints/core/types/template.py
+++ b/endpoints/core/types/template.py
@@ -1,4 +1,4 @@
-from pydantic import BaseModel, Field
+from pydantic import AliasChoices, BaseModel, Field
 from typing import List
 
 
@@ -12,4 +12,7 @@ class TemplateList(BaseModel):
 class TemplateSwitchRequest(BaseModel):
     """Request to switch a template."""
 
-    name: str
+    prompt_template_name: str = Field(
+        alias=AliasChoices("prompt_template_name", "name"),
+        description="Aliases: name",
+    )
diff --git a/endpoints/core/utils/model.py b/endpoints/core/utils/model.py
index d151fdd..973337d 100644
--- a/endpoints/core/utils/model.py
+++ b/endpoints/core/utils/model.py
@@ -104,7 +104,7 @@ async def stream_model_load(
 
     # Set the draft model path if it exists
    if draft_model_path:
-        load_data["draft"]["draft_model_dir"] = draft_model_path
+        load_data["draft_model"]["draft_model_dir"] = draft_model_path
 
     load_status = model.load_model_gen(
         model_path, skip_wait=data.skip_queue, **load_data
diff --git a/main.py b/main.py
index 06db5d5..e17e2f8 100644
--- a/main.py
+++ b/main.py
@@ -69,8 +69,8 @@ async def entrypoint_async():
         # TODO: remove model_dump()
         await model.load_model(
             model_path.resolve(),
-            **config.model.model_dump(),
-            draft=config.draft_model.model_dump(),
+            **config.model.model_dump(exclude_none=True),
+            draft_model=config.draft_model.model_dump(exclude_none=True),
         )
 
     # Load loras after loading the model
diff --git a/pyproject.toml b/pyproject.toml
index e4120d2..81f8bf2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,12 +68,12 @@ cu121 = [
[ "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", # Exl2 - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", # Windows FA2 from https://github.com/bdashore3/flash-attention/releases "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", @@ -95,12 +95,12 @@ cu118 = [ "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", # Exl2 - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", - "exllamav2 @ 
https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", @@ -119,9 +119,9 @@ amd = [ "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'", # Exl2 - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.2/exllamav2-0.2.2+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == 
'3.12'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'", ] # MARK: Ruff options