
Commit

Merge branch 'main' into vision
bdashore3 committed Nov 20, 2024
2 parents 731a345 + a69f860 commit 0fadb1e
Showing 9 changed files with 97 additions and 40 deletions.
18 changes: 15 additions & 3 deletions backends/exllamav2/model.py
@@ -170,7 +170,6 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):

if enable_draft:
self.draft_config = ExLlamaV2Config()
self.draft_config.no_flash_attn = self.config.no_flash_attn
draft_model_path = pathlib.Path(
unwrap(draft_args.get("draft_model_dir"), "models")
)
@@ -264,6 +263,8 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
or not supports_paged_attn()
):
self.config.no_flash_attn = True
if self.draft_config:
self.draft_config.no_flash_attn = True
self.paged = False
self.max_batch_size = 1
torch.backends.cuda.enable_flash_sdp(False)
@@ -332,9 +333,20 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
if num_experts_override:
self.config.num_experts_per_token = kwargs.get("num_experts_per_token")

# Make sure chunk size is >= 16 and <= max seq length
# Make sure chunk size is >= 256, keep near or below max seq len
user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
chunk_size = sorted((16, user_chunk_size, self.config.max_seq_len))[1]
chunk_size = sorted((256, user_chunk_size, self.config.max_seq_len))[1]
chunk_remainder = chunk_size % 256
if chunk_remainder != 0:
rounded_chunk_size = int(256 * ((chunk_size - chunk_remainder) / 256 + 1))

logger.warning(
f"The given chunk size ({chunk_size}) is "
"not a multiple of 256.\n"
"Overriding chunk_size with an overestimated value of "
f"{rounded_chunk_size} tokens."
)
chunk_size = rounded_chunk_size
self.config.max_input_len = chunk_size
self.config.max_attention_size = chunk_size**2

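For reference, here is a minimal standalone sketch of the chunk-size handling introduced above (the 256 floor plus round-up-to-a-multiple-of-256). The helper name and the example values are illustrative only and are not part of the commit.

def clamp_chunk_size(user_chunk_size: int, max_seq_len: int) -> int:
    # Clamp to the middle value of (256, user value, max sequence length),
    # mirroring sorted((256, user_chunk_size, max_seq_len))[1] from the diff.
    chunk_size = sorted((256, user_chunk_size, max_seq_len))[1]

    # Round up to the next multiple of 256, as the new warning path does.
    remainder = chunk_size % 256
    if remainder != 0:
        chunk_size = int(256 * ((chunk_size - remainder) / 256 + 1))

    return chunk_size

# Example: a user chunk size of 3000 with max_seq_len 4096 clamps to 3000,
# then rounds up to 3072 (12 * 256).
print(clamp_chunk_size(3000, 4096))  # 3072
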
17 changes: 14 additions & 3 deletions common/config_models.py
@@ -148,14 +148,25 @@ class ModelConfig(BaseConfigModel):
False,
description=(
"Allow direct loading of models "
"from a completion or chat completion request (default: False)."
"from a completion or chat completion request (default: False).\n"
"This method of loading is strict by default.\n"
"Enable dummy models to add exceptions for invalid model names."
),
)
use_dummy_models: Optional[bool] = Field(
False,
description=(
"Sends dummy model names when the models endpoint is queried.\n"
"Enable this if the client is looking for specific OAI models."
"Sends dummy model names when the models endpoint is queried. "
"(default: False)\n"
"Enable this if the client is looking for specific OAI models.\n"
),
)
dummy_model_names: List[str] = Field(
default=["gpt-3.5-turbo"],
description=(
"A list of fake model names that are sent via the /v1/models endpoint. "
'(default: ["gpt-3.5-turbo"])\n'
"Also used as bypasses for strict mode if inline_model_loading is true."
),
)
model_name: Optional[str] = Field(
8 changes: 7 additions & 1 deletion config_sample.yml
@@ -52,12 +52,18 @@ model:
model_dir: models

# Allow direct loading of models from a completion or chat completion request (default: False).
# This method of loading is strict by default.
# Enable dummy models to add exceptions for invalid model names.
inline_model_loading: false

# Sends dummy model names when the models endpoint is queried.
# Sends dummy model names when the models endpoint is queried. (default: False)
# Enable this if the client is looking for specific OAI models.
use_dummy_models: false

# A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"])
# Also used as bypasses for strict mode if inline_model_loading is true.
dummy_model_names: ["gpt-3.5-turbo"]

# An initial model to load.
# Make sure the model is located in the model directory!
# REQUIRED: This must be filled out to load a model on startup.
4 changes: 2 additions & 2 deletions docker/Dockerfile
@@ -19,8 +19,8 @@ WORKDIR /app
# Get requirements
COPY pyproject.toml .

# Install packages specified in pyproject.toml cu121
RUN pip3 install --no-cache-dir .[cu121]
# Install packages specified in pyproject.toml cu121, extras
RUN pip3 install --no-cache-dir .[cu121,extras]

RUN rm pyproject.toml

4 changes: 2 additions & 2 deletions endpoints/OAI/types/embedding.py
@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import List, Optional, Union

from pydantic import BaseModel, Field

@@ -10,7 +10,7 @@ class UsageInfo(BaseModel):


class EmbeddingsRequest(BaseModel):
input: List[str] = Field(
input: Union[str, List[str]] = Field(
..., description="List of input texts to generate embeddings for."
)
encoding_format: str = Field(
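As a quick illustration of what the widened input type accepts, here is a simplified stand-in for the request model (only the changed field is mirrored; the real class defines additional fields):

from typing import List, Union

from pydantic import BaseModel, Field


class EmbeddingsRequestSketch(BaseModel):
    # Mirrors the changed field: a single string or a list of strings.
    input: Union[str, List[str]] = Field(
        ..., description="List of input texts to generate embeddings for."
    )


# Both forms now validate instead of only the list form.
single = EmbeddingsRequestSketch(input="hello world")
batch = EmbeddingsRequestSketch(input=["hello", "world"])
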
46 changes: 33 additions & 13 deletions endpoints/OAI/utils/completion.py
@@ -121,21 +121,41 @@ async def load_inline_model(model_name: str, request: Request):
):
return

# Inline model loading isn't enabled or the user isn't an admin
if not get_key_permission(request) == "admin":
error_message = handle_request_error(
f"Unable to switch model to {model_name} because "
+ "an admin key isn't provided",
exc_info=False,
).error.message
# Return if inline loading is disabled
# Also warn if an admin key is used
if not config.model.inline_model_loading:
if get_key_permission(request) == "admin":
logger.warning(
f"Unable to switch model to {model_name} because "
'"inline_model_loading" is not True in config.yml.'
)

return

raise HTTPException(401, error_message)
is_dummy_model = (
config.model.use_dummy_models and model_name in config.model.dummy_model_names
)

if not config.model.inline_model_loading:
logger.warning(
f"Unable to switch model to {model_name} because "
'"inline_model_loading" is not True in config.yml.'
)
# Error if an invalid key is passed
# If a dummy model is provided, don't error
if get_key_permission(request) != "admin":
if not is_dummy_model:
error_message = handle_request_error(
f"Unable to switch model to {model_name} because "
+ "an admin key isn't provided",
exc_info=False,
).error.message

raise HTTPException(401, error_message)
else:
return

# Start inline loading
# Past here, user is assumed to be admin

# Skip if the model is a dummy
if is_dummy_model:
logger.warning(f"Dummy model {model_name} provided. Skipping inline load.")

return

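To make the reworked gating above easier to follow, here is a rough sketch of the decision order, with plain booleans standing in for the config flag, key permission, and dummy-model checks. It is an illustration of the control flow, not the function from the commit, and PermissionError stands in for the HTTP 401 response.

def should_inline_load(
    inline_model_loading: bool, is_admin: bool, is_dummy_model: bool
) -> bool:
    # Inline loading disabled: skip (the real code also warns when the key is admin).
    if not inline_model_loading:
        return False

    # Non-admin keys: dummy model names pass through silently, anything else errors.
    if not is_admin:
        if not is_dummy_model:
            raise PermissionError("an admin key isn't provided")
        return False

    # Admin keys: dummy models are skipped with a warning, real models proceed to load.
    return not is_dummy_model
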
3 changes: 2 additions & 1 deletion endpoints/core/router.py
@@ -40,6 +40,7 @@
from endpoints.core.utils.model import (
get_current_model,
get_current_model_list,
get_dummy_models,
get_model_list,
stream_model_load,
)
@@ -83,7 +84,7 @@ async def list_models(request: Request) -> ModelList:
models = await get_current_model_list()

if config.model.use_dummy_models:
models.data.insert(0, ModelCard(id="gpt-3.5-turbo"))
models.data[:0] = get_dummy_models()

return models

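As a usage note, here is a small example of querying the models endpoint once use_dummy_models and dummy_model_names are configured. The host, port, and header name below are assumptions about a typical local deployment and are not defined by this commit.

import requests  # assumed to be installed in the client environment

# Assumed local server address and API key header; adjust for your deployment.
response = requests.get(
    "http://127.0.0.1:5000/v1/models",
    headers={"x-api-key": "<your-api-key>"},
)

# With use_dummy_models enabled, the configured dummy_model_names
# (default: ["gpt-3.5-turbo"]) are prepended ahead of the real model list.
for model in response.json()["data"]:
    print(model["id"])
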
7 changes: 7 additions & 0 deletions endpoints/core/utils/model.py
@@ -92,6 +92,13 @@ def get_current_model():
return model_card


def get_dummy_models():
if config.model.dummy_model_names:
return [ModelCard(id=dummy_id) for dummy_id in config.model.dummy_model_names]
else:
return [ModelCard(id="gpt-3.5-turbo")]


async def stream_model_load(
data: ModelLoadRequest,
model_path: pathlib.Path,
30 changes: 15 additions & 15 deletions pyproject.toml
@@ -69,12 +69,12 @@ cu121 = [
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Windows FA2 from https://github.com/bdashore3/flash-attention/releases
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
@@ -96,12 +96,12 @@ cu118 = [
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",

# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
@@ -120,9 +120,9 @@ amd = [
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",

# Exl2
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
]

# MARK: Ruff options
