From 4aebe8a2a578ca7478fd82106d21da83bb99d3a4 Mon Sep 17 00:00:00 2001
From: kingbri
Date: Fri, 30 Aug 2024 12:45:09 -0400
Subject: [PATCH] Config: Use an explicit "auto" value for rope_alpha

Using "auto" for rope alpha removes ambiguity on how to explicitly
enable automatic rope calculation. The same behavior of None -> auto
calculate still exists, but can be overwritten if a model's
tabby_config.yml includes `rope_alpha`.

Signed-off-by: kingbri
---
 backends/exllamav2/model.py   | 29 +++++++++++++++++++----------
 common/args.py                | 24 +++++++++++++++++++++++-
 common/model.py               |  1 +
 config_sample.yml             |  3 ++-
 endpoints/core/types/model.py | 10 +++++-----
 5 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 3dfa84de..ab15836f 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -249,10 +249,13 @@ def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
             kwargs.get("rope_scale"), self.config.scale_pos_emb
         )
 
-        # Automatically calculate rope alpha
-        self.config.scale_alpha_value = unwrap(
-            kwargs.get("rope_alpha"), self.calculate_rope_alpha(base_seq_len)
-        )
+        # Sets rope alpha value.
+        # Automatically calculate if unset or defined as an "auto" literal.
+        rope_alpha = unwrap(kwargs.get("rope_alpha"), "auto")
+        if rope_alpha == "auto":
+            self.config.scale_alpha_value = self.calculate_rope_alpha(base_seq_len)
+        else:
+            self.config.scale_alpha_value = rope_alpha
 
         # Enable fasttensors loading if present
         self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
@@ -344,16 +347,22 @@ def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
 
         # Set user-configured draft model values
         if enable_draft:
+            self.draft_config.max_seq_len = self.config.max_seq_len
+
             self.draft_config.scale_pos_emb = unwrap(
                 draft_args.get("draft_rope_scale"), 1.0
             )
 
-            # Automatically calculate draft rope alpha
-            self.draft_config.scale_alpha_value = unwrap(
-                draft_args.get("draft_rope_alpha"),
-                self.calculate_rope_alpha(self.draft_config.max_seq_len),
-            )
-            self.draft_config.max_seq_len = self.config.max_seq_len
+            # Set draft rope alpha. Follows same behavior as model rope alpha.
+            draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto")
+            if draft_rope_alpha == "auto":
+                self.draft_config.scale_alpha_value = self.calculate_rope_alpha(
+                    self.draft_config.max_seq_len
+                )
+            else:
+                self.draft_config.scale_alpha_value = draft_rope_alpha
+
+            # Set draft cache mode
             self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
 
         if chunk_size:
diff --git a/common/args.py b/common/args.py
index b879ec32..4755c39c 100644
--- a/common/args.py
+++ b/common/args.py
@@ -13,6 +13,24 @@ def str_to_bool(value):
     raise ValueError(f"{value} is not a valid boolean value")
 
 
+def argument_with_auto(value):
+    """
+    Argparse type wrapper for any argument that has an automatic option.
+
+    Ex. rope_alpha
+    """
+
+    if value == "auto":
+        return "auto"
+
+    try:
+        return float(value)
+    except ValueError as ex:
+        raise argparse.ArgumentTypeError(
+            'This argument only takes a type of float or "auto"'
+        ) from ex
+
+
 def init_argparser():
     """Creates an argument parser that any function can use"""
 
@@ -133,7 +151,11 @@ def add_model_args(parser: argparse.ArgumentParser):
     model_group.add_argument(
         "--rope-scale", type=float, help="Sets rope_scale or compress_pos_emb"
     )
-    model_group.add_argument("--rope-alpha", type=float, help="Sets rope_alpha for NTK")
+    model_group.add_argument(
+        "--rope-alpha",
+        type=argument_with_auto,
+        help="Sets rope_alpha for NTK",
+    )
     model_group.add_argument(
         "--cache-mode",
         type=str,
diff --git a/common/model.py b/common/model.py
index 0f5ab55a..97bac055 100644
--- a/common/model.py
+++ b/common/model.py
@@ -149,6 +149,7 @@ async def unload_embedding_model():
     embeddings_container = None
 
 
+# FIXME: Maybe make this a one-time function instead of a dynamic default
 def get_config_default(key: str, model_type: str = "model"):
     """Fetches a default value from model config if allowed by the user."""
 
diff --git a/config_sample.yml b/config_sample.yml
index 2c18080a..85bb1df4 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -135,7 +135,8 @@ model:
 
   # Rope alpha (default: 1.0)
   # Same thing as alpha_value
-  # Leave blank to automatically calculate alpha
+  # Set to "auto" to automatically calculate
+  # Leave blank to pull the value from the model
   #rope_alpha: 1.0
 
   # Enable different cache modes for VRAM savings (slight performance hit).
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index dfbb0337..154a9065 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -2,7 +2,7 @@
 
 from pydantic import BaseModel, Field, ConfigDict
 from time import time
-from typing import List, Optional
+from typing import List, Literal, Optional, Union
 
 from common.gen_logging import GenLogPreferences
 from common.model import get_config_default
@@ -56,8 +56,8 @@ class DraftModelLoadRequest(BaseModel):
             "draft_rope_scale", model_type="draft"
         )
     )
-    draft_rope_alpha: Optional[float] = Field(
-        description="Automatically calculated if not present",
+    draft_rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
+        description='Automatically calculated if set to "auto"',
         default_factory=lambda: get_config_default(
             "draft_rope_alpha", model_type="draft"
         ),
@@ -114,8 +114,8 @@ class ModelLoadRequest(BaseModel):
         default_factory=lambda: get_config_default("rope_scale"),
         examples=[1.0],
     )
-    rope_alpha: Optional[float] = Field(
-        description="Automatically calculated if not present",
+    rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
+        description='Automatically calculated if set to "auto"',
         default_factory=lambda: get_config_default("rope_alpha"),
         examples=[1.0],
    )
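
As an illustration outside the patch itself, here is a minimal, self-contained sketch of the parsing behavior the new argument_with_auto type is meant to give --rope-alpha: the literal "auto" passes through unchanged, numeric strings become floats, and anything else is rejected with an argparse error. The throwaway parser below is assumed for demonstration only and is not tabbyAPI's real init_argparser(); the type function is copied verbatim from the common/args.py hunk above so the snippet runs on its own.

import argparse


def argument_with_auto(value):
    # Copy of the argparse type added to common/args.py by this patch
    if value == "auto":
        return "auto"

    try:
        return float(value)
    except ValueError as ex:
        raise argparse.ArgumentTypeError(
            'This argument only takes a type of float or "auto"'
        ) from ex


# Hypothetical stand-in parser used only to exercise the type above
parser = argparse.ArgumentParser()
parser.add_argument("--rope-alpha", type=argument_with_auto)

print(parser.parse_args(["--rope-alpha", "auto"]).rope_alpha)  # auto
print(parser.parse_args(["--rope-alpha", "2.5"]).rope_alpha)   # 2.5
# A value like "--rope-alpha fast" exits with:
# error: argument --rope-alpha: This argument only takes a type of float or "auto"

The parsed value then follows the same branch as a config-supplied rope_alpha in backends/exllamav2/model.py: the "auto" literal routes to calculate_rope_alpha(), while a float is assigned to scale_alpha_value directly.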