From 9530f8c8c76354b3bf5d527603b2dfece529c770 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Mon, 11 Nov 2024 12:09:27 -0500
Subject: [PATCH 1/8] Model: Add support for chat_template.json

HuggingFace separated the chat template into a standalone
chat_template.json file in the newest transformers versions.

Signed-off-by: kingbri
---
 backends/exllamav2/model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 6f17570..c7d2069 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -389,6 +389,10 @@ async def find_prompt_template(self, prompt_template_name, model_directory):
         logger.info("Attempting to load a prompt template if present.")
 
         find_template_functions = [
+            lambda: PromptTemplate.from_model_json(
+                pathlib.Path(self.config.model_dir) / "chat_template.json",
+                key="chat_template",
+            ),
             lambda: PromptTemplate.from_model_json(
                 pathlib.Path(self.config.model_dir) / "tokenizer_config.json",
                 key="chat_template",
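
For reference, chat_template.json stores the template under the same top-level
"chat_template" key that the loader already reads from tokenizer_config.json,
which is why both lookups above pass key="chat_template". A minimal sketch of
the resulting lookup order (the helper below is illustrative, not code from
this patch):

    import json
    import pathlib

    def read_chat_template(model_dir: str) -> str | None:
        """Return the raw Jinja chat template, newest file convention first."""
        for name in ("chat_template.json", "tokenizer_config.json"):
            path = pathlib.Path(model_dir) / name
            if path.exists():
                template = json.loads(path.read_text()).get("chat_template")
                if template:
                    return template  # e.g. "{% for message in messages %}..."
        return None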

From 69838e92ca48a115123e29ef8d3e0d10b6c162e6 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Wed, 13 Nov 2024 22:16:11 -0500
Subject: [PATCH 2/8] Dependencies: Update ExllamaV2 v0.2.4

Signed-off-by: kingbri
---
 pyproject.toml | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 81f8bf2..dc54ebe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,12 +68,12 @@ cu121 = [
     "torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Windows FA2 from https://github.com/bdashore3/flash-attention/releases
     "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
@@ -95,12 +95,12 @@ cu118 = [
     "torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
 
     # Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
     "flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
@@ -119,9 +119,9 @@ amd = [
     "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 
     # Exl2
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
-    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
+    "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
 ]
 
 # MARK: Ruff options

From 101ebd658a2f59ec7caf00c68b7bb375f743137c Mon Sep 17 00:00:00 2001
From: kingbri
Date: Fri, 15 Nov 2024 18:16:48 -0500
Subject: [PATCH 3/8] Docker: Add extras to dockerfile

Adds support for all features when pulling the image

Signed-off-by: kingbri
---
 docker/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index f3587cc..9617371 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -19,8 +19,8 @@ WORKDIR /app
 # Get requirements
 COPY pyproject.toml .
 
-# Install packages specified in pyproject.toml cu121
-RUN pip3 install --no-cache-dir .[cu121]
+# Install packages specified in pyproject.toml cu121, extras
+RUN pip3 install --no-cache-dir .[cu121,extras]
 
 RUN rm pyproject.toml

From 37cc701137165b2f3e8d4302ec6cb29c55d2b6a9 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Fri, 15 Nov 2024 20:35:18 -0800
Subject: [PATCH 4/8] Model: Enforce chunk_size as multiple of 256

---
 backends/exllamav2/model.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index c7d2069..dc68b22 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -321,9 +321,20 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
         if num_experts_override:
             self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
 
-        # Make sure chunk size is >= 16 and <= max seq length
+        # Make sure chunk size is >= 256, keep near or below max seq len
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
-        chunk_size = sorted((16, user_chunk_size, self.config.max_seq_len))[1]
+        chunk_size = sorted((256, user_chunk_size, self.config.max_seq_len))[1]
+        chunk_remainder = chunk_size % 256
+        if chunk_remainder != 0:
+            rounded_chunk_size = int(256 * ((chunk_size - chunk_remainder) / 256 + 1))
+
+            logger.warning(
+                f"The given chunk size ({chunk_size}) is "
+                "not a multiple of 256.\n"
+                "Overriding chunk_size with an overestimated value of "
+                f"{rounded_chunk_size} tokens."
+            )
+            chunk_size = rounded_chunk_size
 
         self.config.max_input_len = chunk_size
         self.config.max_attention_size = chunk_size**2
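
To make the rounding rule above concrete, here is a small standalone sketch of
the same arithmetic (not part of the patch): any chunk size that is not already
a multiple of 256 is bumped up to the next multiple, never rounded down.

    def round_up_chunk(chunk_size: int) -> int:
        # Mirrors the overestimation logic in the patch
        remainder = chunk_size % 256
        if remainder == 0:
            return chunk_size
        return int(256 * ((chunk_size - remainder) / 256 + 1))

    assert round_up_chunk(2048) == 2048  # already aligned, unchanged
    assert round_up_chunk(2000) == 2048  # bumped to the next multiple of 256
    assert round_up_chunk(300) == 512    # bumped, never truncated down to 256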

From 5bb46df3c3b12f2b6d88a47ce04ead8f53e37a58 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Fri, 15 Nov 2024 21:04:25 -0800
Subject: [PATCH 5/8] Model: Fix draft model non-FA2 fallback

---
 backends/exllamav2/model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index c7d2069..44e354f 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -159,7 +159,6 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
 
         if enable_draft:
             self.draft_config = ExLlamaV2Config()
-            self.draft_config.no_flash_attn = self.config.no_flash_attn
             draft_model_path = pathlib.Path(
                 unwrap(draft_args.get("draft_model_dir"), "models")
             )
@@ -253,6 +252,8 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
             or not supports_paged_attn()
         ):
             self.config.no_flash_attn = True
+            if self.draft_config:
+                self.draft_config.no_flash_attn = True
             self.paged = False
             self.max_batch_size = 1
             torch.backends.cuda.enable_flash_sdp(False)

From f9fffd42e0b8de61f775a12db0a5dfa9831b27a1 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Sat, 16 Nov 2024 23:28:44 -0500
Subject: [PATCH 6/8] OAI: Fix inline model loading errors when disabled

The admin key check was running even if inline loading was disabled.
Fix this bug, but also preserve the existing permission system when
inline loading is enabled.

Signed-off-by: kingbri
---
 endpoints/OAI/utils/completion.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/endpoints/OAI/utils/completion.py b/endpoints/OAI/utils/completion.py
index e939525..cfcdeba 100644
--- a/endpoints/OAI/utils/completion.py
+++ b/endpoints/OAI/utils/completion.py
@@ -119,8 +119,19 @@ async def load_inline_model(model_name: str, request: Request):
     ):
         return
 
-    # Inline model loading isn't enabled or the user isn't an admin
-    if not get_key_permission(request) == "admin":
+    # Return if inline loading is disabled
+    # Also warn if an admin key is used
+    if not config.model.inline_model_loading:
+        if get_key_permission(request) == "admin":
+            logger.warning(
+                f"Unable to switch model to {model_name} because "
+                '"inline_model_loading" is not True in config.yml.'
+            )
+
+        return
+
+    # Error if an invalid key is passed
+    if get_key_permission(request) != "admin":
         error_message = handle_request_error(
             f"Unable to switch model to {model_name} because "
             + "an admin key isn't provided",
@@ -129,14 +140,8 @@ async def load_inline_model(model_name: str, request: Request):
 
         raise HTTPException(401, error_message)
 
-    if not config.model.inline_model_loading:
-        logger.warning(
-            f"Unable to switch model to {model_name} because "
-            '"inline_model_loading" is not True in config.yml.'
-        )
-
-        return
-
+    # Start inline loading
+    # Past here, user is assumed to be admin
     model_path = pathlib.Path(config.model.model_dir)
     model_path = model_path / model_name
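
For context, inline loading is triggered purely by the "model" field of a
normal completion request. A rough sketch of the request flow this patch fixes
(the URL, port, and header name are assumptions about a local setup, not part
of the patch):

    import requests

    resp = requests.post(
        "http://127.0.0.1:5000/v1/chat/completions",
        headers={"x-admin-key": "<admin key>"},  # admin keys may switch models inline
        json={
            "model": "MyModel-exl2",  # requested model; loaded inline when enabled
            "messages": [{"role": "user", "content": "Hello!"}],
        },
    )

With inline_model_loading set to false, this request now proceeds with the
currently loaded model (logging a warning for admin keys) instead of failing
the admin-key check first.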

From b94c646210d56539cf0814afd2d99051825cc55c Mon Sep 17 00:00:00 2001
From: kingbri
Date: Sat, 16 Nov 2024 23:48:31 -0500
Subject: [PATCH 7/8] Embeddings: Add string input as an option

Used in OAI's API

Signed-off-by: kingbri
---
 endpoints/OAI/types/embedding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/endpoints/OAI/types/embedding.py b/endpoints/OAI/types/embedding.py
index 7d5779f..7937d50 100644
--- a/endpoints/OAI/types/embedding.py
+++ b/endpoints/OAI/types/embedding.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from pydantic import BaseModel, Field
 
@@ -10,7 +10,7 @@ class UsageInfo(BaseModel):
 
 
 class EmbeddingsRequest(BaseModel):
-    input: List[str] = Field(
+    input: Union[str, List[str]] = Field(
         ..., description="List of input texts to generate embeddings for."
     )
     encoding_format: str = Field(
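
As a usage illustration (not part of the patch), both request shapes below now
validate against the widened EmbeddingsRequest.input field; the endpoint URL
and model name are placeholders for a local setup:

    import requests

    url = "http://127.0.0.1:5000/v1/embeddings"
    single = {"model": "my-embedding-model", "input": "The quick brown fox"}
    batch = {"model": "my-embedding-model", "input": ["first text", "second text"]}

    for payload in (single, batch):
        requests.post(url, json=payload)  # both forms are accepted after this change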

From bd9e78e19e19550d91b73881bf85b6e29249f7bb Mon Sep 17 00:00:00 2001
From: kingbri
Date: Sun, 17 Nov 2024 21:12:38 -0500
Subject: [PATCH 8/8] API: Add inline exception for dummy models

If a request sends a dummy model name, it shouldn't error, since the
server is catering to clients that expect specific OAI model names.
This is a problem with inline model loading because these names would
error by default. Therefore, add an exception when the provided name is
one of the dummy model names (which also double as bypasses for strict
inline loading). The dummy model names weren't configurable before, so
add a new option to specify them; the default remains gpt-3.5-turbo.

Signed-off-by: kingbri
---
 common/config_models.py           | 17 ++++++++++++++---
 config_sample.yml                 |  8 +++++++-
 endpoints/OAI/utils/completion.py | 27 +++++++++++++++++++++------
 endpoints/core/router.py          |  3 ++-
 endpoints/core/utils/model.py     |  7 +++++++
 5 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/common/config_models.py b/common/config_models.py
index 40b4109..b113194 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -141,14 +141,25 @@ class ModelConfig(BaseConfigModel):
         False,
         description=(
             "Allow direct loading of models "
-            "from a completion or chat completion request (default: False)."
+            "from a completion or chat completion request (default: False).\n"
+            "This method of loading is strict by default.\n"
+            "Enable dummy models to add exceptions for invalid model names."
         ),
     )
     use_dummy_models: Optional[bool] = Field(
         False,
         description=(
-            "Sends dummy model names when the models endpoint is queried.\n"
-            "Enable this if the client is looking for specific OAI models."
+            "Sends dummy model names when the models endpoint is queried. "
+            "(default: False)\n"
+            "Enable this if the client is looking for specific OAI models.\n"
+        ),
+    )
+    dummy_model_names: List[str] = Field(
+        default=["gpt-3.5-turbo"],
+        description=(
+            "A list of fake model names that are sent via the /v1/models endpoint. "
+            '(default: ["gpt-3.5-turbo"])\n'
+            "Also used as bypasses for strict mode if inline_model_loading is true."
         ),
     )
     model_name: Optional[str] = Field(
diff --git a/config_sample.yml b/config_sample.yml
index 83f2fc7..39593db 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -49,12 +49,18 @@ model:
   model_dir: models
 
   # Allow direct loading of models from a completion or chat completion request (default: False).
+  # This method of loading is strict by default.
+  # Enable dummy models to add exceptions for invalid model names.
   inline_model_loading: false
 
-  # Sends dummy model names when the models endpoint is queried.
+  # Sends dummy model names when the models endpoint is queried. (default: False)
   # Enable this if the client is looking for specific OAI models.
   use_dummy_models: false
 
+  # A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"])
+  # Also used as bypasses for strict mode if inline_model_loading is true.
+  dummy_model_names: ["gpt-3.5-turbo"]
+
   # An initial model to load.
   # Make sure the model is located in the model directory!
   # REQUIRED: This must be filled out to load a model on startup.
diff --git a/endpoints/OAI/utils/completion.py b/endpoints/OAI/utils/completion.py
index cfcdeba..9fd8b90 100644
--- a/endpoints/OAI/utils/completion.py
+++ b/endpoints/OAI/utils/completion.py
@@ -130,18 +130,33 @@ async def load_inline_model(model_name: str, request: Request):
 
         return
 
+    is_dummy_model = (
+        config.model.use_dummy_models and model_name in config.model.dummy_model_names
+    )
+
     # Error if an invalid key is passed
+    # If a dummy model is provided, don't error
     if get_key_permission(request) != "admin":
-        error_message = handle_request_error(
-            f"Unable to switch model to {model_name} because "
-            + "an admin key isn't provided",
-            exc_info=False,
-        ).error.message
+        if not is_dummy_model:
+            error_message = handle_request_error(
+                f"Unable to switch model to {model_name} because "
+                + "an admin key isn't provided",
+                exc_info=False,
+            ).error.message
 
-        raise HTTPException(401, error_message)
+            raise HTTPException(401, error_message)
+        else:
+            return
 
     # Start inline loading
     # Past here, user is assumed to be admin
+
+    # Skip if the model is a dummy
+    if is_dummy_model:
+        logger.warning(f"Dummy model {model_name} provided. Skipping inline load.")
+
+        return
+
     model_path = pathlib.Path(config.model.model_dir)
     model_path = model_path / model_name
 
diff --git a/endpoints/core/router.py b/endpoints/core/router.py
index f2b4247..597930b 100644
--- a/endpoints/core/router.py
+++ b/endpoints/core/router.py
@@ -39,6 +39,7 @@ from endpoints.core.utils.model import (
     get_current_model,
     get_current_model_list,
+    get_dummy_models,
     get_model_list,
     stream_model_load,
 )
@@ -82,7 +83,7 @@ async def list_models(request: Request) -> ModelList:
         models = await get_current_model_list()
 
     if config.model.use_dummy_models:
-        models.data.insert(0, ModelCard(id="gpt-3.5-turbo"))
+        models.data[:0] = get_dummy_models()
 
     return models
 
diff --git a/endpoints/core/utils/model.py b/endpoints/core/utils/model.py
index 973337d..c2c209b 100644
--- a/endpoints/core/utils/model.py
+++ b/endpoints/core/utils/model.py
@@ -92,6 +92,13 @@ def get_current_model():
     return model_card
 
 
+def get_dummy_models():
+    if config.model.dummy_model_names:
+        return [ModelCard(id=dummy_id) for dummy_id in config.model.dummy_model_names]
+    else:
+        return [ModelCard(id="gpt-3.5-turbo")]
+
+
 async def stream_model_load(
     data: ModelLoadRequest,
     model_path: pathlib.Path,
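
Putting the pieces together, an end-to-end sketch (not part of the patch
series) of how the new option behaves; the config excerpt, server address,
header name, and model names are assumptions about a local setup:

    import requests

    # Assumed config.yml excerpt:
    #   model:
    #     inline_model_loading: true
    #     use_dummy_models: true
    #     dummy_model_names: ["gpt-3.5-turbo", "gpt-4o"]

    base = "http://127.0.0.1:5000"
    models = requests.get(f"{base}/v1/models", headers={"x-api-key": "<key>"}).json()
    print([m["id"] for m in models["data"]])
    # The dummy names are prepended to the real model list, and a request that
    # names one of them skips inline loading instead of erroring.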