diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index a0a7f0e2..eaa431c0 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -170,7 +170,6 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
 
         if enable_draft:
             self.draft_config = ExLlamaV2Config()
-            self.draft_config.no_flash_attn = self.config.no_flash_attn
             draft_model_path = pathlib.Path(
                 unwrap(draft_args.get("draft_model_dir"), "models")
             )
@@ -264,6 +263,8 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
             or not supports_paged_attn()
         ):
             self.config.no_flash_attn = True
+            if self.draft_config:
+                self.draft_config.no_flash_attn = True
             self.paged = False
             self.max_batch_size = 1
             torch.backends.cuda.enable_flash_sdp(False)
@@ -332,9 +333,20 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
         if num_experts_override:
             self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
 
-        # Make sure chunk size is >= 16 and <= max seq length
+        # Make sure chunk size is >= 256, keep near or below max seq len
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
-        chunk_size = sorted((16, user_chunk_size, self.config.max_seq_len))[1]
+        chunk_size = sorted((256, user_chunk_size, self.config.max_seq_len))[1]
+        chunk_remainder = chunk_size % 256
+        if chunk_remainder != 0:
+            rounded_chunk_size = int(256 * ((chunk_size - chunk_remainder) / 256 + 1))
+
+            logger.warning(
+                f"The given chunk size ({chunk_size}) is "
+                "not a multiple of 256.\n"
+                "Overriding chunk_size with an overestimated value of "
+                f"{rounded_chunk_size} tokens."
+            )
+            chunk_size = rounded_chunk_size
         self.config.max_input_len = chunk_size
         self.config.max_attention_size = chunk_size**2
 
diff --git a/common/config_models.py b/common/config_models.py
index b8e7606b..f7f0addf 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -148,14 +148,25 @@ class ModelConfig(BaseConfigModel):
         False,
         description=(
             "Allow direct loading of models "
-            "from a completion or chat completion request (default: False)."
+            "from a completion or chat completion request (default: False).\n"
+            "This method of loading is strict by default.\n"
+            "Enable dummy models to add exceptions for invalid model names."
         ),
     )
     use_dummy_models: Optional[bool] = Field(
         False,
         description=(
-            "Sends dummy model names when the models endpoint is queried.\n"
-            "Enable this if the client is looking for specific OAI models."
+            "Sends dummy model names when the models endpoint is queried. "
+            "(default: False)\n"
+            "Enable this if the client is looking for specific OAI models.\n"
+        ),
+    )
+    dummy_model_names: List[str] = Field(
+        default=["gpt-3.5-turbo"],
+        description=(
+            "A list of fake model names that are sent via the /v1/models endpoint. "
+            '(default: ["gpt-3.5-turbo"])\n'
+            "Also used as bypasses for strict mode if inline_model_loading is true."
         ),
     )
     model_name: Optional[str] = Field(
diff --git a/config_sample.yml b/config_sample.yml
index 48e58d9a..ebea5a1d 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -52,12 +52,18 @@ model:
   model_dir: models
 
   # Allow direct loading of models from a completion or chat completion request (default: False).
+  # This method of loading is strict by default.
+  # Enable dummy models to add exceptions for invalid model names.
   inline_model_loading: false
 
-  # Sends dummy model names when the models endpoint is queried.
+  # Sends dummy model names when the models endpoint is queried. (default: False)
   # Enable this if the client is looking for specific OAI models.
   use_dummy_models: false
 
+  # A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"])
+  # Also used as bypasses for strict mode if inline_model_loading is true.
+  dummy_model_names: ["gpt-3.5-turbo"]
+
   # An initial model to load.
   # Make sure the model is located in the model directory!
   # REQUIRED: This must be filled out to load a model on startup.
diff --git a/docker/Dockerfile b/docker/Dockerfile
index f3587cca..96173715 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -19,8 +19,8 @@ WORKDIR /app
 
 # Get requirements
 COPY pyproject.toml .
 
-# Install packages specified in pyproject.toml cu121
-RUN pip3 install --no-cache-dir .[cu121]
+# Install packages specified in pyproject.toml cu121, extras
+RUN pip3 install --no-cache-dir .[cu121,extras]
 
 RUN rm pyproject.toml
diff --git a/endpoints/OAI/types/embedding.py b/endpoints/OAI/types/embedding.py
index 7d5779fa..7937d503 100644
--- a/endpoints/OAI/types/embedding.py
+++ b/endpoints/OAI/types/embedding.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from pydantic import BaseModel, Field
 
@@ -10,7 +10,7 @@ class UsageInfo(BaseModel):
 
 
 class EmbeddingsRequest(BaseModel):
-    input: List[str] = Field(
+    input: Union[str, List[str]] = Field(
         ..., description="List of input texts to generate embeddings for."
     )
     encoding_format: str = Field(
diff --git a/endpoints/OAI/utils/completion.py b/endpoints/OAI/utils/completion.py
index 65ff0d32..e7981766 100644
--- a/endpoints/OAI/utils/completion.py
+++ b/endpoints/OAI/utils/completion.py
@@ -121,21 +121,41 @@ async def load_inline_model(model_name: str, request: Request):
     ):
         return
 
-    # Inline model loading isn't enabled or the user isn't an admin
-    if not get_key_permission(request) == "admin":
-        error_message = handle_request_error(
-            f"Unable to switch model to {model_name} because "
-            + "an admin key isn't provided",
-            exc_info=False,
-        ).error.message
+    # Return if inline loading is disabled
+    # Also warn if an admin key is used
+    if not config.model.inline_model_loading:
+        if get_key_permission(request) == "admin":
+            logger.warning(
+                f"Unable to switch model to {model_name} because "
+                '"inline_model_loading" is not True in config.yml.'
+            )
+
+        return
 
-        raise HTTPException(401, error_message)
+    is_dummy_model = (
+        config.model.use_dummy_models and model_name in config.model.dummy_model_names
+    )
 
-    if not config.model.inline_model_loading:
-        logger.warning(
-            f"Unable to switch model to {model_name} because "
-            '"inline_model_loading" is not True in config.yml.'
-        )
+    # Error if an invalid key is passed
+    # If a dummy model is provided, don't error
+    if get_key_permission(request) != "admin":
+        if not is_dummy_model:
+            error_message = handle_request_error(
+                f"Unable to switch model to {model_name} because "
+                + "an admin key isn't provided",
+                exc_info=False,
+            ).error.message
+
+            raise HTTPException(401, error_message)
+        else:
+            return
+
+    # Start inline loading
+    # Past here, user is assumed to be admin
+
+    # Skip if the model is a dummy
+    if is_dummy_model:
+        logger.warning(f"Dummy model {model_name} provided. Skipping inline load.")
 
         return
 
diff --git a/endpoints/core/router.py b/endpoints/core/router.py
index d7837f86..64450f49 100644
--- a/endpoints/core/router.py
+++ b/endpoints/core/router.py
@@ -40,6 +40,7 @@ from endpoints.core.utils.model import (
     get_current_model,
     get_current_model_list,
+    get_dummy_models,
     get_model_list,
     stream_model_load,
 )
@@ -83,7 +84,7 @@ async def list_models(request: Request) -> ModelList:
         models = await get_current_model_list()
 
     if config.model.use_dummy_models:
-        models.data.insert(0, ModelCard(id="gpt-3.5-turbo"))
+        models.data[:0] = get_dummy_models()
 
     return models
 
diff --git a/endpoints/core/utils/model.py b/endpoints/core/utils/model.py
index 973337d0..c2c209b0 100644
--- a/endpoints/core/utils/model.py
+++ b/endpoints/core/utils/model.py
@@ -92,6 +92,13 @@ def get_current_model():
     return model_card
 
 
+def get_dummy_models():
+    if config.model.dummy_model_names:
+        return [ModelCard(id=dummy_id) for dummy_id in config.model.dummy_model_names]
+    else:
+        return [ModelCard(id="gpt-3.5-turbo")]
+
+
 async def stream_model_load(
     data: ModelLoadRequest,
     model_path: pathlib.Path,
diff --git a/pyproject.toml b/pyproject.toml
index ca4b511d..d09129ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,12 +69,12 @@ cu121 = [
     "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Windows FA2 from https://github.com/bdashore3/flash-attention/releases
     "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
@@ -96,12 +96,12 @@ cu118 = [
     "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
@@ -120,9 +120,9 @@ amd = [
     "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 ]
 
 # MARK: Ruff options
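# --- Note (not part of the patch): a minimal standalone sketch of the new chunk_size
# handling added to backends/exllamav2/model.py above. The helper name
# `round_chunk_size` is hypothetical and exists only for illustration; the real logic
# lives inline in the create() classmethod and also emits the warning shown in the diff.
def round_chunk_size(user_chunk_size: int, max_seq_len: int) -> int:
    # Clamp the requested size into [256, max_seq_len] by taking the middle value
    chunk_size = sorted((256, user_chunk_size, max_seq_len))[1]

    # Round up to the next multiple of 256, matching the "overestimated value" in the patch
    remainder = chunk_size % 256
    if remainder != 0:
        chunk_size = chunk_size - remainder + 256

    return chunk_size


# Example behavior under these assumptions:
# round_chunk_size(2048, 8192) -> 2048 (already a multiple of 256)
# round_chunk_size(3000, 8192) -> 3072 (rounded up with a warning in the real code)
# round_chunk_size(100, 8192)  -> 256  (clamped to the new minimum)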
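# --- Note (not part of the patch): a condensed sketch of the permission flow that
# load_inline_model() in endpoints/OAI/utils/completion.py follows after this change.
# `inline_load_action` is a hypothetical pure function used only to illustrate the
# branching; the real code raises HTTPException or proceeds to load the model instead.
def inline_load_action(
    inline_model_loading: bool,
    use_dummy_models: bool,
    dummy_model_names: list,
    key_permission: str,
    model_name: str,
) -> str:
    # Inline loading disabled: never load (admin keys only get a config.yml warning)
    if not inline_model_loading:
        return "skip"

    # Dummy names act as bypasses for strict mode
    is_dummy = use_dummy_models and model_name in dummy_model_names

    # Non-admin keys are rejected unless the requested name is a dummy model
    if key_permission != "admin":
        return "skip" if is_dummy else "error_401"

    # Admin keys: dummy names are skipped, anything else triggers an inline load
    return "skip" if is_dummy else "load"


# Example behavior under these assumptions:
assert inline_load_action(True, True, ["gpt-3.5-turbo"], "user", "gpt-3.5-turbo") == "skip"
assert inline_load_action(True, True, ["gpt-3.5-turbo"], "user", "other-model") == "error_401"
assert inline_load_action(True, False, ["gpt-3.5-turbo"], "admin", "other-model") == "load"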