From e094f3e3dac0c6f5222265d6b696f94626b588b5 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sat, 13 Jul 2024 14:23:16 +0200 Subject: [PATCH 01/11] Cleanup architecture definitions --- exllamav2/architecture.py | 375 ++++++++++++-------------------------- 1 file changed, 121 insertions(+), 254 deletions(-) diff --git a/exllamav2/architecture.py b/exllamav2/architecture.py index 9bb7eca3..0d7e8e7f 100644 --- a/exllamav2/architecture.py +++ b/exllamav2/architecture.py @@ -106,28 +106,80 @@ def __init__(self, arch_string, read_config): self.arch_string = arch_string arch_recognized = False - self.expect_keys = [] # Keys to expect in model dict - self.layer_keys = [] # Keys to expect in model dict, per layer + # Keys to expect in model dict + self.expect_keys = [] + # Keys to expect in model dict, per layer + self.layer_keys = [] + + # Map tensors in HF model to standard keys + self.keymap = None + + # Fused tensors + self.fused_qkv_key = None self.fused_mlp_key_12 = None self.fused_mlp_key_3 = None + + # Alternate packing scheme for fused QKV tensor (InternLM2 quirk) + self.fused_qkv_altpack = False + + # Learned position embeddings self.learned_pos_emb_key = None + # Default multiplier for MLP inner dim (GPT2 quirk) self.default_inner_dim_mult = None - self.orig_weights_transposed = False + + # Compute logit scale from `dim_model_base` key in config.json (MiniCPM quirk) self.logit_scale_basedim = False + # Tensors are transposed in original model weights + self.orig_weights_transposed = False + + # Post norm keys self.norm_key_1_post = None self.norm_key_2_post = None + # SWA required by architecture self.swa = False self.alternating_swa = False + # Model only works with eager attention self.eager_attn_only = False + + # Clamp hidden states to FP16 range self.clamp_hidden_states = False + + # Upcast hidden state to FP32 before adding to residual stream self.residual_stream_fp32 = False - self.fused_qkv_altpack = False + # Expect bias for linear layers + self.attention_bias_qkv = False + self.attention_bias_o = False + self.mlp_bias = False + + # Use gated MLP + self.mlp_gate = True + + # Use block-sparse MLP + self.is_moe = False + + # Normalize embeddings (Gemma quirk) + self.normalize_embeddings = False + + # Constant bias for layernorm (Gemma quirk) + self.norm_constant_bias = 0 + + # Use parallel decoder blocks (Cohere quirk) + self.parallel_decoder_blocks = False + + # Model is incoherent without BOS at the start of the context + self.requires_bos = False + + # Use MQA, effectively num_key_valu_heads = 1 (GPTBigCode quirk) + self.mqa = False + + # Scale attn weights (GPT2 quirk, not important for inference) + self.scale_attn_weights = False # Mistral @@ -140,28 +192,15 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "rmsnorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.mlp_act_func = "silu" + self.norm = "rmsnorm" self.rope_style = RopeStyle.NEOX - self.keymap = None - 
self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False # Mixtral @@ -174,29 +213,17 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".block_sparse_moe.experts.*.w1" self.mlp_key_up = ".block_sparse_moe.experts.*.w3" self.mlp_key_down = ".block_sparse_moe.experts.*.w2" self.mlp_key_expert_gate = ".block_sparse_moe.gate" - self.mlp_act_func = "silu" - self.is_moe = True - self.norm = "rmsnorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.mlp_act_func = "silu" + self.norm = "rmsnorm" self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False + self.is_moe = True # Yi @@ -209,28 +236,15 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "rmsnorm" - self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".ln1" self.norm_key_2 = ".ln2" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.norm = "rmsnorm" + self.lm_head_key = "lm_head" self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False # Orion @@ -243,28 +257,15 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "layernorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.mlp_act_func = "silu" + self.norm = "layernorm" self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False # Qwen2 (1.5) @@ -277,28 +278,16 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = True - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "rmsnorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.mlp_act_func = "silu" + self.norm = 
"rmsnorm" self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False + self.attention_bias_qkv = True # Gemma @@ -311,28 +300,18 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_gemma self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "gelu" - self.is_moe = False - self.norm = "rmsnorm" self.lm_head_key = "model.embed_tokens" - self.normalize_embeddings = True self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" + self.mlp_act_func = "gelu" + self.norm = "rmsnorm" + self.rope_style = RopeStyle.NEOX + self.normalize_embeddings = True self.norm_constant_bias = 1 - self.parallel_decoder_blocks = False self.requires_bos = True - self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False # Gemma2 @@ -345,30 +324,20 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_gemma self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "gelu" - self.is_moe = False - self.norm = "rmsnorm" self.lm_head_key = "model.embed_tokens" - self.normalize_embeddings = True self.norm_key_1 = ".input_layernorm" self.norm_key_1_post = ".post_attention_layernorm" self.norm_key_2 = ".pre_feedforward_layernorm" self.norm_key_2_post = ".post_feedforward_layernorm" + self.mlp_act_func = "gelu" + self.norm = "rmsnorm" + self.rope_style = RopeStyle.NEOX + self.normalize_embeddings = True self.norm_constant_bias = 1 - self.parallel_decoder_blocks = False self.requires_bos = True - self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False self.pre_post_layernorm = True self.alternating_swa = True self.residual_stream_fp32 = True @@ -384,27 +353,18 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_starcoder2 self.norm_eps_key = "norm_epsilon" - self.attention_bias_qkv = True - self.attention_bias_o = True - self.mlp_bias = True - self.mlp_gate = False self.mlp_key_up = ".mlp.c_fc" self.mlp_key_down = ".mlp.c_proj" - self.mlp_act_func = "gelu" - self.is_moe = False - self.norm = "layernorm" self.lm_head_key = "model.embed_tokens" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.mlp_act_func = "gelu" + self.norm = "layernorm" self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False + self.attention_bias_qkv = True + self.attention_bias_o = True + self.mlp_bias = True + self.mlp_gate = False # GemMoE @@ -418,29 +378,20 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_gemma self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = 
".block_sparse_moe.experts.*.w1" self.mlp_key_up = ".block_sparse_moe.experts.*.w3" self.mlp_key_down = ".block_sparse_moe.experts.*.w2" self.mlp_key_expert_gate = ".block_sparse_moe.gate" - self.mlp_act_func = "gelu" - self.is_moe = True - self.norm = "rmsnorm" self.lm_head_key = "model.embed_tokens" - self.normalize_embeddings = True self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" + self.mlp_act_func = "gelu" + self.norm = "rmsnorm" + self.rope_style = RopeStyle.NEOX + self.normalize_embeddings = True self.norm_constant_bias = 1 - self.parallel_decoder_blocks = False + self.is_moe = True self.requires_bos = True - self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False # Cohere @@ -453,28 +404,17 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_gemma self.norm_eps_key = "layer_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "layernorm" self.lm_head_key = "model.embed_tokens" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = None - self.norm_constant_bias = 0 + self.mlp_act_func = "silu" + self.norm = "layernorm" + self.rope_style = RopeStyle.GPTJ self.parallel_decoder_blocks = True self.requires_bos = True - self.rope_style = RopeStyle.GPTJ - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False # DBRX @@ -488,29 +428,18 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = None - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".block_sparse_moe.experts.*.w1" self.mlp_key_up = ".block_sparse_moe.experts.*.v1" self.mlp_key_down = ".block_sparse_moe.experts.*.w2" self.mlp_key_expert_gate = ".block_sparse_moe.gate" - self.mlp_act_func = "silu" - self.is_moe = True - self.norm = "layernorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.mlp_act_func = "silu" + self.norm = "layernorm" self.rope_style = RopeStyle.NEOX - self.keymap = dbrx_keymap self.fused_qkv_key = "Wqkv" - self.mqa = False - self.scale_attn_weights = False + self.is_moe = True # Phi3 @@ -523,34 +452,23 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "rmsnorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False - self.rope_style = RopeStyle.NEOX - self.keymap = None self.fused_qkv_key = "qkv_proj" self.fused_mlp_key_12 = "gate_up_proj" - self.mqa = False - 
self.scale_attn_weights = False + self.mlp_act_func = "silu" + self.norm = "rmsnorm" + self.rope_style = RopeStyle.NEOX # GPTBigCode if arch_string == "GPTBigCodeForCausalLM": arch_recognized = True + self.keymap = bigcode_keymap self.layer_keys += \ layer_keys_gpt2_norms + \ layer_keys_gpt2_attn + \ @@ -558,34 +476,28 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_gpt2 self.norm_eps_key = "layer_norm_epsilon" - self.attention_bias_qkv = True - self.attention_bias_o = True - self.mlp_bias = True - self.mlp_gate = False self.mlp_key_gate = None self.mlp_key_up = ".mlp.c_fc" self.mlp_key_down = ".mlp.c_proj" - self.mlp_act_func = "gelu" - self.is_moe = False - self.norm = "layernorm" self.lm_head_key = "model.embed_tokens" - self.normalize_embeddings = False self.norm_key_1 = ".ln_1" self.norm_key_2 = ".ln_2" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False - self.rope_style = RopeStyle.NONE - self.keymap = bigcode_keymap self.fused_qkv_key = "c_attn" - self.mqa = True self.learned_pos_emb_key = "model.wpe" - self.scale_attn_weights = True + self.mlp_act_func = "gelu" + self.norm = "layernorm" + self.rope_style = RopeStyle.NONE + self.mqa = True + self.attention_bias_qkv = True + self.attention_bias_o = True + self.mlp_bias = True + self.mlp_gate = False # GPT2 if arch_string == "GPT2LMHeadModel": arch_recognized = True + self.keymap = gpt2_keymap self.layer_keys += \ layer_keys_gpt2_norms + \ layer_keys_gpt2_attn + \ @@ -593,31 +505,23 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_gpt2 self.norm_eps_key = "layer_norm_epsilon" - self.attention_bias_qkv = True - self.attention_bias_o = True - self.mlp_bias = True - self.mlp_gate = False self.mlp_key_gate = None self.mlp_key_up = ".mlp.c_fc" self.mlp_key_down = ".mlp.c_proj" - self.mlp_act_func = "gelu" - self.is_moe = False - self.norm = "layernorm" self.lm_head_key = "model.embed_tokens" - self.normalize_embeddings = False self.norm_key_1 = ".ln_1" self.norm_key_2 = ".ln_2" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False - self.rope_style = RopeStyle.NONE - self.keymap = gpt2_keymap self.fused_qkv_key = "c_attn" - self.mqa = False self.learned_pos_emb_key = "model.wpe" - self.scale_attn_weights = True + self.mlp_act_func = "gelu" + self.norm = "layernorm" + self.rope_style = RopeStyle.NONE self.default_inner_dim_mult = 4 self.orig_weights_transposed = True + self.attention_bias_qkv = True + self.attention_bias_o = True + self.mlp_bias = True + self.mlp_gate = False # MiniCPM @@ -630,34 +534,22 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "rmsnorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.mlp_act_func = "silu" + self.norm = "rmsnorm" self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False self.logit_scale_basedim = True # InternLM2 if 
arch_string == "InternLM2ForCausalLM": arch_recognized = True + self.keymap = internlm2_keymap self.layer_keys += \ layer_keys_internlm2_norms + \ layer_keys_internlm2_attn + \ @@ -665,29 +557,17 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".feed_forward.w1" self.mlp_key_up = ".feed_forward.w3" self.mlp_key_down = ".feed_forward.w2" - self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "rmsnorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".attention_norm" self.norm_key_2 = ".ffn_norm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False - self.rope_style = RopeStyle.NEOX - self.keymap = internlm2_keymap self.fused_qkv_key = "wqkv" + self.mlp_act_func = "silu" + self.norm = "rmsnorm" + self.rope_style = RopeStyle.NEOX self.fused_qkv_altpack = True - self.mqa = False - self.scale_attn_weights = False # Llama (default + fallback) @@ -703,28 +583,15 @@ def __init__(self, arch_string, read_config): self.expect_keys += \ expect_keys_llama self.norm_eps_key = "rms_norm_eps" - self.attention_bias_qkv = False - self.attention_bias_o = False - self.mlp_bias = False - self.mlp_gate = True self.mlp_key_gate = ".mlp.gate_proj" self.mlp_key_up = ".mlp.up_proj" self.mlp_key_down = ".mlp.down_proj" - self.mlp_act_func = "silu" - self.is_moe = False - self.norm = "rmsnorm" self.lm_head_key = "lm_head" - self.normalize_embeddings = False self.norm_key_1 = ".input_layernorm" self.norm_key_2 = ".post_attention_layernorm" - self.norm_constant_bias = 0 - self.parallel_decoder_blocks = False - self.requires_bos = False + self.mlp_act_func = "silu" + self.norm = "rmsnorm" self.rope_style = RopeStyle.NEOX - self.keymap = None - self.fused_qkv_key = None - self.mqa = False - self.scale_attn_weights = False # Arch overrides From c91893ac6e5d5c5a01d47fc53c4131a7d61f9be6 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 14 Jul 2024 05:54:18 +0200 Subject: [PATCH 02/11] Add --fast_safetensors option to quantizer script --- exllamav2/conversion/convert_exl2.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/exllamav2/conversion/convert_exl2.py b/exllamav2/conversion/convert_exl2.py index 2f56cf77..920386c9 100644 --- a/exllamav2/conversion/convert_exl2.py +++ b/exllamav2/conversion/convert_exl2.py @@ -31,6 +31,7 @@ parser.add_argument("-ml", "--measurement_length", type = int, default = 2048, help = "Max no. tokens per sample when measuring") parser.add_argument("-so", "--status_output", action = "store_true", help = "Include machine-parseable status updates in console output") parser.add_argument("-hsol", "--hidden_state_offload_layers", type = int, default = 0, help = "Number of hidden/target states to keep in VRAM. Speed-up but increases VRAM usage") +parser.add_argument("-fst", "--fast_safetensors", action = "store_true", help = "Use fast-safetensors to load layers of the unquantized model. 
This can help alleviate some out-of-memory issues, especially on Windows.") args = parser.parse_args() @@ -112,6 +113,7 @@ def save_job(): "rope_scale": args.rope_scale, "rope_alpha": args.rope_alpha, "output_measurement": output_measurement, + "fast_safetensors": args.fast_safetensors, "progress": "begin"} if args.measurement is not None: @@ -160,6 +162,8 @@ def save_job(): else: print(f" -- Measurement will be saved to {job['output_measurement']}") print(f" !! Conversion script will end after measurement pass") +if job.get("fast_safetensors"): + print(f" -- Enabled fast_safetensors option.") if job['rope_scale']: print(f" -- RoPE scale: {job['rope_scale']:.2f}") if job['rope_alpha']: print(f" -- RoPE alpha: {job['rope_alpha']:.2f}") @@ -190,6 +194,10 @@ def save_job(): tokenizer = ExLlamaV2Tokenizer(config) +# Set fast_safetensors in config + +if job.get("fast_safetensors"): config.fasttensors = True + # Set scaling for input model if job["rope_scale"] is not None: config.scale_pos_emb = job["rope_scale"] From 3ffcc74d82ac99f4c3e030b12e81d532949a1989 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 14 Jul 2024 08:01:28 +0200 Subject: [PATCH 03/11] Support Index architecture --- exllamav2/architecture.py | 21 +++++++++++++++++++++ exllamav2/config.py | 5 +++++ exllamav2/linear.py | 12 +++++++++++- exllamav2/model.py | 3 ++- 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/exllamav2/architecture.py b/exllamav2/architecture.py index 0d7e8e7f..ff7524fc 100644 --- a/exllamav2/architecture.py +++ b/exllamav2/architecture.py @@ -569,6 +569,27 @@ def __init__(self, arch_string, read_config): self.rope_style = RopeStyle.NEOX self.fused_qkv_altpack = True + # Index + + if arch_string == "IndexForCausalLM": + arch_recognized = True + self.layer_keys += \ + layer_keys_llama_norms + \ + layer_keys_llama_attn + \ + layer_keys_llama_mlp + self.expect_keys += \ + expect_keys_llama + self.norm_eps_key = "rms_norm_eps" + self.mlp_key_gate = ".mlp.gate_proj" + self.mlp_key_up = ".mlp.up_proj" + self.mlp_key_down = ".mlp.down_proj" + self.lm_head_key = "lm_head" + self.norm_key_1 = ".input_layernorm" + self.norm_key_2 = ".post_attention_layernorm" + self.mlp_act_func = "silu" + self.norm = "rmsnorm" + self.rope_style = RopeStyle.NEOX + # Llama (default + fallback) if arch_string != "LlamaForCausalLM" and not arch_recognized: diff --git a/exllamav2/config.py b/exllamav2/config.py index 162ef9d2..ed73ea72 100644 --- a/exllamav2/config.py +++ b/exllamav2/config.py @@ -104,6 +104,7 @@ class ExLlamaV2Config: final_logit_softcapping: float | None attn_logit_softcapping: float | None sliding_window: int + norm_head: int | None checkpoint_fused_mlp: bool @@ -251,6 +252,10 @@ def prepare(self, no_tensors: bool = False): self.attn_logit_softcapping = read(read_config, float, "attn_logit_softcapping", None) self.final_logit_softcapping = read(read_config, float, "final_logit_softcapping", None) + # Normalize weights in head layer + + self.norm_head = read(read_config, int, "norm_head", None) + # Positional embeddings self.rotary_embedding_base = read(read_config, float, ["rope_theta", "attn_config->rope_theta"], 10000.0) diff --git a/exllamav2/linear.py b/exllamav2/linear.py index 9858b3f1..e6755b95 100644 --- a/exllamav2/linear.py +++ b/exllamav2/linear.py @@ -54,7 +54,8 @@ def __init__(self, f_beg: int = None, f_end: int = None, is_sub_module: bool = True, - altpack_qkv: bool = False): + altpack_qkv: bool = False, + normalize_unq: bool = False): 
super().__init__(model, key) self.is_sub_module = is_sub_module @@ -89,6 +90,7 @@ def __init__(self, self.altpack_qkv = altpack_qkv self.assumed_footprint = in_features * (out_features + self.padding) * 2 + 128 + self.normalize_unq = normalize_unq @torch.inference_mode @@ -125,6 +127,8 @@ def load(self, elif isinstance(w, nn.Parameter): assert not self.has_bias, self.key + " has no bias tensor but bias is expected" + if self.normalize_unq: + w = self.normalize(w) if self.padding > 0: w = nn.Parameter(F.pad(w.data, (0, 0, 0, self.padding)).contiguous()) if not self.model.config.load_in_q4 or not ".layers." in self.key: self.linear = nn.Linear(self.in_features, self.out_features, self.has_bias, device = "meta", dtype = torch.float16) @@ -138,6 +142,8 @@ def load(self, elif isinstance(w, tuple): assert self.has_bias, self.key + " has bias tensor but bias is not expected" + if self.normalize_unq: + w = self.normalize(w[0]), w[1] ww = w[0] wb = w[1] if self.padding > 0: @@ -154,6 +160,10 @@ def load(self, self.fp16_bias = wb + def normalize(self, w: torch.Tensor): + return nn.functional.normalize(w) + + def matrix_shape(self): return self.in_features, self.out_features diff --git a/exllamav2/model.py b/exllamav2/model.py index d21031df..4d875f59 100644 --- a/exllamav2/model.py +++ b/exllamav2/model.py @@ -250,7 +250,8 @@ def __init__(self, config: ExLlamaV2Config, lazy_load = False): False, max_out_len = self.config.max_output_len, prescale = self.config.logit_scale, - is_sub_module = False) + is_sub_module = False, + normalize_unq = bool(self.config.norm_head)) if self.config.arch.lm_head_key != "lm_head": head.alt_key = self.config.arch.lm_head_key self.modules += [head] From c5e214cd7bc876f6ec777e6859d6fa1e840e7e60 Mon Sep 17 00:00:00 2001 From: Brian Dashore Date: Mon, 15 Jul 2024 19:27:57 -0400 Subject: [PATCH 04/11] dynamic_async: Add breakpoint for when a job is cancelled (#551) Previously, when the job cancel function was called, the job itself wasn't notified that it's cancelled. This caused a deadlock in the calling function due to waiting forever in a queue. Therefore, add a signal to break out of the generation loop on a call to cancel. 
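
For illustration, a caller-side sketch of the intended flow (the generator setup, `prompt_ids` and the stop condition are assumptions for the example, not part of this patch):

    from exllamav2.generator import ExLlamaV2DynamicJobAsync

    async def consume(generator, prompt_ids):
        job = ExLlamaV2DynamicJobAsync(generator, input_ids = prompt_ids, max_new_tokens = 200)
        async for result in job:
            if result.get("text", "").endswith("STOP"):  # hypothetical caller-side condition
                # Sets job.cancelled, so the iterator breaks out on its next pass
                # instead of blocking forever on queue.get()
                await job.cancel()
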
Signed-off-by: kingbri --- exllamav2/generator/dynamic_async.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/exllamav2/generator/dynamic_async.py b/exllamav2/generator/dynamic_async.py index 8e006822..ae82c619 100644 --- a/exllamav2/generator/dynamic_async.py +++ b/exllamav2/generator/dynamic_async.py @@ -75,6 +75,7 @@ class ExLlamaV2DynamicJobAsync: job: ExLlamaV2DynamicJob queue: asyncio.Queue generator: ExLlamaV2DynamicGeneratorAsync + cancelled: bool = False def __init__(self, generator: ExLlamaV2DynamicGeneratorAsync, *args: object, **kwargs: object): self.generator = generator @@ -87,6 +88,10 @@ async def put_result(self, result): async def __aiter__(self): while True: + # Get out if the job is cancelled + if self.cancelled: + break + result = await self.queue.get() if isinstance(result, Exception): raise result @@ -96,3 +101,4 @@ async def __aiter__(self): async def cancel(self): await self.generator.cancel(self) + self.cancelled = True From 0cf9729d44e307fe7249cdb8e335f2082a64a845 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Wed, 17 Jul 2024 01:39:25 +0200 Subject: [PATCH 05/11] Dynamic gen: Identify specific stop condition when ending generation --- exllamav2/generator/dynamic.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/exllamav2/generator/dynamic.py b/exllamav2/generator/dynamic.py index 52924421..7fd64a08 100644 --- a/exllamav2/generator/dynamic.py +++ b/exllamav2/generator/dynamic.py @@ -21,7 +21,7 @@ import itertools from dataclasses import dataclass # import xxhash -# from line_profiler import profile +from line_profiler import profile # TODO: # - ExLlamaV2StreamingGenerator wrapper @@ -893,6 +893,11 @@ def iterate(self) -> list[dict]: "stop_string" "max_new_tokens" "end_filter" + optional, if "eos_reason" == "stop_token": + "eos_triggering_token_id": int + "eos_triggering_token_str": str + optional, if "eos_reason" == "stop_string": + "eos_triggering_string": str "full_completion": str - full text completion "new_tokens": int - number of tokens generated "time_enqueued": float - time from job was enqueued until it started, in seconds @@ -1849,7 +1854,9 @@ def emit( eos_reason: str = None, emit_held = False, suppressed_text = None, - suppressed_tokens = None + suppressed_tokens = None, + stop_token: int = None, + stop_string: str = None ): r = { "job": self, @@ -1860,6 +1867,15 @@ def emit( if eos_reason is not None: r.update({ "eos_reason": eos_reason }) + if eos_reason == "stop_token": + id_to_piece = self.generator.tokenizer.get_id_to_piece_list(True) + r.update({ + "eos_triggering_token_id": stop_token, + "eos_triggering_token_str": id_to_piece[stop_token] + }) + pass + if eos_reason == "stop_string": + r.update({ "eos_triggering_string": stop_string }) if emit_held: if self.held_text != "": @@ -1913,7 +1929,7 @@ def emit( # End on stop tokens if next_token.item() in self.stop_tokens: - return emit(results, emit_eos = True, eos_reason = "stop_token") + return emit(results, emit_eos = True, eos_reason = "stop_token", stop_token = next_token.item()) # Decode and buffer output @@ -2032,8 +2048,12 @@ def rewind_checkpoint(): self.stop_strings_utf32_buffer ) if match >= 0: + held = self.held_text[match:] self.held_text = self.held_text[:match] - return emit(results, emit_eos = True, emit_held = True, eos_reason = "stop_string") + for s in self.stop_strings: + if held.startswith(s): + return emit(results, emit_eos = True, emit_held = True, eos_reason = 
"stop_string", stop_string = s) + assert False, "Detected stop string but couldn't identify it (logic error)" if match == -2: return emit(results) From bcf1ee380d0ea23cbe528260ff6a71f37e5b37c7 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Thu, 18 Jul 2024 14:01:39 +0200 Subject: [PATCH 06/11] Support GPTQ models with gptq_v2 checkpoint_format --- exllamav2/config.py | 6 ++++++ exllamav2/ext.py | 6 +++++- exllamav2/linear.py | 7 +++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/exllamav2/config.py b/exllamav2/config.py index ed73ea72..6296052e 100644 --- a/exllamav2/config.py +++ b/exllamav2/config.py @@ -107,6 +107,7 @@ class ExLlamaV2Config: norm_head: int | None checkpoint_fused_mlp: bool + checkpoint_offset_qzeros: bool def __init__(self, @@ -287,6 +288,11 @@ def prepare(self, no_tensors: bool = False): # if scaling_type == "yarn": # self.scale_alpha_value = factor + # Checkpoint format (for GPTQ models) + + checkpoint_format = read(read_config, str, ["quantization_config->checkpoint_format"], None) + self.checkpoint_offset_qzeros = (checkpoint_format == "gptq_v2") + # Create map of model tensors if no_tensors: return diff --git a/exllamav2/ext.py b/exllamav2/ext.py index 53681510..aff67db0 100644 --- a/exllamav2/ext.py +++ b/exllamav2/ext.py @@ -320,7 +320,8 @@ def make_q_matrix(w: dict, temp_dq: torch.Tensor, key: str = None, prescale: float = 1, - max_dq_rows = 0): + max_dq_rows = 0, + offset_qzeros: bool = False): # EXL2 @@ -354,6 +355,9 @@ def make_q_matrix(w: dict, if prescale != 1: w["scales"] *= prescale if w["scales"].dtype == torch.float: w["scales"] = w["scales"].half() + if offset_qzeros: + w["qzeros"] -= 0b00010001000100010001000100010001 + # GPTQ with g_idx (act_order) if "g_idx" in w and not (w["g_idx"] == 0).all().item(): diff --git a/exllamav2/linear.py b/exllamav2/linear.py index e6755b95..8a6ba7de 100644 --- a/exllamav2/linear.py +++ b/exllamav2/linear.py @@ -98,13 +98,15 @@ def load(self, w: dict | nn.Parameter | tuple | None = None, device_tensors: bool = True): + cfg = self.model.config + if self.f_key: w = self.load_weight_fused(self.f_key, self.f_beg, self.f_end, self.in_features, self.out_features, self.altpack_qkv) if w is None: w = self.load_weight() # Load quantized linear layer from dictionary if isinstance(w, dict): - assert not self.model.config.load_in_q4, "Can't load quantized layer in Q4 mode" + assert not cfg.load_in_q4, "Can't load quantized layer in Q4 mode" if self.has_bias: assert "bias" in w, self.key + " has no bias but bias expected" else: @@ -119,7 +121,8 @@ def load(self, self.q_handle = ext.make_q_matrix(w, self.temp_dq, prescale = self.prescale, - max_dq_rows = self.model.config.max_dq_size // self.out_features) + max_dq_rows = cfg.max_dq_size // self.out_features, + offset_qzeros = cfg.checkpoint_offset_qzeros) self.prev_prescale = self.prescale self.prescale = 1 From 304e0211f6c717c36d4332977889cb3695fa0139 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sat, 20 Jul 2024 01:40:52 +0200 Subject: [PATCH 07/11] Remove line_profiler import --- exllamav2/generator/dynamic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exllamav2/generator/dynamic.py b/exllamav2/generator/dynamic.py index 7fd64a08..da579e57 100644 --- a/exllamav2/generator/dynamic.py +++ b/exllamav2/generator/dynamic.py @@ -21,7 +21,7 @@ import itertools from dataclasses import dataclass # import xxhash -from line_profiler import profile +# from 
line_profiler import profile # TODO: # - ExLlamaV2StreamingGenerator wrapper From 81cd6b70bc2108e40f79a09fdb977f1b8c9c5e77 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 21 Jul 2024 10:25:16 +0200 Subject: [PATCH 08/11] Dynamic gen: Return held output with last results --- exllamav2/generator/dynamic.py | 38 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/exllamav2/generator/dynamic.py b/exllamav2/generator/dynamic.py index da579e57..1802fa57 100644 --- a/exllamav2/generator/dynamic.py +++ b/exllamav2/generator/dynamic.py @@ -1856,7 +1856,8 @@ def emit( suppressed_text = None, suppressed_tokens = None, stop_token: int = None, - stop_string: str = None + stop_string: str = None, + rem_held_text: str = None ): r = { "job": self, @@ -1919,6 +1920,22 @@ def emit( "accepted_draft_tokens": self.accepted_draft_tokens, "rejected_draft_tokens": self.rejected_draft_tokens }) + if eos_reason == "stop_string": + self.held_text = rem_held_text + rh = {} + if self.held_text: + rh.update({ "text": self.held_text }) + if self.held_tokens: + rh.update({ "token_ids": self.held_tokens.torch().clone() }) + if self.held_probs: + rh.update({ "token_probs": self.held_probs.torch().clone() }) + if self.held_k_tokens: + rh.update({ "top_k_tokens": self.held_k_tokens.torch().clone() }) + rh.update({ "top_k_probs": self.held_k_probs.torch().clone() }) + if self.held_logits: + rh.update({ "logits": self.held_logits.torch().clone() }) + if rh: + r.update({ "held": rh }) if self.identifier is not None: r.update({ "identifier": self.identifier }) @@ -1926,11 +1943,6 @@ def emit( results.append(r) return emit_eos, next_token - # End on stop tokens - - if next_token.item() in self.stop_tokens: - return emit(results, emit_eos = True, eos_reason = "stop_token", stop_token = next_token.item()) - # Decode and buffer output id_to_piece = self.generator.tokenizer.get_id_to_piece_list(self.decode_special_tokens) @@ -1950,6 +1962,11 @@ def emit( if self.return_logits: self.held_logits.append(logits[:1, :, :]) + # End on stop tokens + + if next_token.item() in self.stop_tokens: + return emit(results, emit_eos = True, eos_reason = "stop_token", stop_token = next_token.item()) + # Stop if we reach max_new_tokens if self.new_tokens >= self.max_new_tokens - self.generator.num_draft_tokens: @@ -2052,7 +2069,14 @@ def rewind_checkpoint(): self.held_text = self.held_text[:match] for s in self.stop_strings: if held.startswith(s): - return emit(results, emit_eos = True, emit_held = True, eos_reason = "stop_string", stop_string = s) + return emit( + results, + emit_eos = True, + emit_held = True, + eos_reason = "stop_string", + stop_string = s, + rem_held_text = held + ) assert False, "Detected stop string but couldn't identify it (logic error)" if match == -2: return emit(results) From 46a803fad163253eb4f256bdc09927d75fd83447 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:54:28 +0200 Subject: [PATCH 09/11] Ignore potential junk tensors in LoRA --- exllamav2/lora.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exllamav2/lora.py b/exllamav2/lora.py index 1dd99cbe..b5675935 100644 --- a/exllamav2/lora.py +++ b/exllamav2/lora.py @@ -81,6 +81,8 @@ def __init__(self, f = load_file(self.lora_path, map_location = "cpu") for key in f.keys(): + if any(key.endswith(x) for x in [".original_module.weight", ".modules_to_save.weight"]): + continue tensor = f[key] # Find target From 
05d13528b96084e53f64d601e56a03cf17adb45c Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:56:05 +0200 Subject: [PATCH 10/11] Add RoPE scaling for Llama3.1 --- exllamav2/config.py | 29 ++++++++++++++++++++++------- exllamav2/model.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/exllamav2/config.py b/exllamav2/config.py index 6296052e..4dbcd5d3 100644 --- a/exllamav2/config.py +++ b/exllamav2/config.py @@ -10,7 +10,9 @@ T = TypeVar('T') no_default = object() -def read(input_dict: dict[str, Any], expected_type: type, keys: str | list[str], default = no_default) -> T: +def read(input_dict: dict[str, Any], expected_type: type | list[type], keys: str | list[str], default = no_default) -> T: + + expected_types = expected_type if isinstance(expected_type, list) else [expected_type] if isinstance(keys, str): keys = [keys] @@ -34,10 +36,10 @@ def read(input_dict: dict[str, Any], expected_type: type, keys: str | list[str], if expected_type == int and isinstance(x, float) and x == int(x): x = int(x) - if isinstance(x, expected_type): - return cast(T, x) - else: - raise TypeError(f"Value for {key} is not of expected type {expected_type}") + for t in expected_types: + if isinstance(x, t): + return cast(T, x) + raise TypeError(f"Value for {key} is not of expected type {expected_type}") if default != no_default: return default raise ValueError(f"Missing any of the following keys: {keys}") @@ -105,7 +107,10 @@ class ExLlamaV2Config: attn_logit_softcapping: float | None sliding_window: int norm_head: int | None - + l3_rope_factor: float | None + l3_rope_low_freq_factor: float | None + l3_rope_high_freq_factor: float | None + l3_rope_original_max_position_embeddings: int | None checkpoint_fused_mlp: bool checkpoint_offset_qzeros: bool @@ -191,10 +196,13 @@ def prepare(self, no_tensors: bool = False): # Vocab params self.bos_token_id = read(read_config, int, "bos_token_id", None) # 1 - self.eos_token_id = read(read_config, int, "eos_token_id", None) # 2 + self.eos_token_id = read(read_config, [int, list], "eos_token_id", None) # 2 self.pad_token_id = read(read_config, int, "pad_token_id", None) # 0 self.vocab_size = read(read_config, int, "vocab_size") + if isinstance(self.eos_token_id, list): + self.eos_token_id = self.eos_token_id[0] # TODO: Figure out a way to maybe use all the EOS tokens somehow + # Standard params self.initializer_range = read(read_config, float, ["initializer_range"]) @@ -287,6 +295,13 @@ def prepare(self, no_tensors: bool = False): self.alt_rope_method = "su" # if scaling_type == "yarn": # self.scale_alpha_value = factor + rope_type = rs.get("rope_type", None) + if rope_type == "llama3": + self.alt_rope_method = "llama3" + self.l3_rope_factor = rs["factor"] + self.l3_rope_low_freq_factor = rs["low_freq_factor"] + self.l3_rope_high_freq_factor = rs["high_freq_factor"] + self.l3_rope_original_max_position_embeddings = rs["original_max_position_embeddings"] # Checkpoint format (for GPTQ models) diff --git a/exllamav2/model.py b/exllamav2/model.py index 4d875f59..065f79f2 100644 --- a/exllamav2/model.py +++ b/exllamav2/model.py @@ -129,6 +129,31 @@ def get_scratch_slice(self, size_bytes): return scratch_slice + @staticmethod + def _apply_scaling( + freqs: torch.Tensor, + scale_factor: float = 8, + low_freq_factor: float = 1, + high_freq_factor: float = 4, + old_context_len: int = 8192, # original llama3 length + ): + low_freq_wavelen = old_context_len / 
low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scale_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) + return torch.tensor(new_freqs, dtype = freqs.dtype, device = freqs.device) + + def prepare_sincos(self): device = _torch_device(self.device_idx) @@ -163,6 +188,19 @@ def prepare_sincos(self): inv_freq = 1.0 / (ext_factors * base ** (torch.arange(0, head_dim, 2, device = device).float() / head_dim)) + # Llama 3.1 + + elif cfg.alt_rope_method == "llama3": + + inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = device).float() / head_dim)) + inv_freq = self._apply_scaling( + inv_freq, + cfg.l3_rope_factor, + cfg.l3_rope_low_freq_factor, + cfg.l3_rope_high_freq_factor, + cfg.l3_rope_original_max_position_embeddings, + ) + # Regular else: From 4bbd969f033aa731fbf28c7191502157c7a8766e Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Wed, 24 Jul 2024 08:30:31 +0200 Subject: [PATCH 11/11] Bump to 0.1.8 --- exllamav2/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exllamav2/version.py b/exllamav2/version.py index 283b03a0..a23ef3f4 100644 --- a/exllamav2/version.py +++ b/exllamav2/version.py @@ -1 +1 @@ -__version__ = "0.1.7" \ No newline at end of file +__version__ = "0.1.8" \ No newline at end of file
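
Consumer-side sketch of the richer stop-condition reporting added in patches 05 and 08 (the surrounding model/generator/job setup is assumed and omitted; only result keys introduced by those patches are read):

    while generator.num_remaining_jobs():
        for result in generator.iterate():
            reason = result.get("eos_reason")
            if reason == "stop_token":
                # Patch 05: the specific token that ended generation is now reported
                print("stopped on token:", result.get("eos_triggering_token_str"))
            elif reason == "stop_string":
                print("stopped on string:", result.get("eos_triggering_string"))
                # Patch 08: output held back while matching the stop string is returned too
                held = result.get("held", {})
                if "text" in held:
                    print("held text:", held["text"])
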