From 2b2ab6fb6f5d9d524a85411c837b0e8bf76bc59c Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 12 Oct 2024 02:48:51 +0200 Subject: [PATCH] New quand strategy / FTYPE IQ3_XL 4bpw Intermediary FTYPE mixed between IQ3_M and IQ4_XS at 4bpw. Transposed loyally from @Ikawrakow's new FTYPE IQ3_KL on ik_llama.cpp so it can be trusted, except for attn_k.weight that I chose to IQ4_XS when GQA is present, because it makes no sense whatsoever in such case to have a key head smaller than attn_output.weight or half of the FFNs. The XL suffix is chosen in the eventuality of the emergence of an IQ3_M/L GGML_TYPE close to 4bpw, with a related FTYPE. In such case, the proposed mixed FTYPE could be replaced. This very FTYPE answers to the demand of many users, me included, to have an intermediary between IQ3_M and IQ4_XS, separated by 0.5bpw, and preventing the users to get the best quality in many cases (70b on 36GiB VRAM, 123b on 64GiB VRAM). In an ulterior PR that I can provide, this in order to not multiply the FTYPES, IQ2_M could be eliminated, IQ2_S elevated to replace it and make sense in term of nomenclature (currently, IQ2_S is an IQ2_XS+, and IQ2_M uses IQ2_S GGML_TYPE), and IQ2_XS having a little boost to compensate for the potential disappearance of the old IQ2_S FTYPE, which is tbh first in line to be sacrificed. Note : I allow myself to PR this because I amused myself quite extensively with the quant strategies, and already revamped for my own use LCCP's quant strategies, some of it being already PRed on this repo as a demo. --- examples/quantize/quantize.cpp | 1 + gguf-py/gguf/constants.py | 1 + include/llama.h | 1 + src/llama.cpp | 29 ++++++++++++++++++++++------- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index b989932107dba..12e95bd5bb79d 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -38,6 +38,7 @@ static const std::vector QUANT_OPTIONS = { { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, + { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 4 bpw non-linear quantization mix",}, { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e08617ba240b3..4ceb90cda4bc6 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1443,6 +1443,7 @@ class LlamaFileType(IntEnum): MOSTLY_Q4_0_8_8 = 35 # except 1d tensors MOSTLY_TQ1_0 = 36 # except 1d tensors MOSTLY_TQ2_0 = 37 # except 1d tensors + MOSTLY_IQ3_XL = 38 # except 1d tensors GUESSED = 1024 # not specified in the model file diff --git a/include/llama.h b/include/llama.h index 4f8f6d23d8c1c..23c7927363ca6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -175,6 +175,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_XL = 38, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index da7afb1ee5b46..67a7207880240 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5285,6 +5285,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S and IQ4_XS mix - 4 bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; @@ -18011,8 +18012,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -18042,11 +18043,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = GGML_TYPE_IQ2_S; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } } else if (name.find("attn_q.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { @@ -18074,6 +18078,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) { new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + new_type = use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } @@ -18115,6 +18122,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ) new_type = GGML_TYPE_IQ4_XS; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; @@ -18133,6 +18141,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && use_more_bits(i_layer, n_layer)) { + new_type = GGML_TYPE_IQ4_XS; + } ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { @@ -18141,6 +18152,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && use_more_bits(i_layer, n_layer)) { + new_type = GGML_TYPE_IQ4_XS; + } ++qs.i_ffn_up; } @@ -18282,7 +18296,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break; case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; @@ -18290,8 +18304,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_IQ3_S: + case LLAMA_FTYPE_MOSTLY_IQ3_M: + case LLAMA_FTYPE_MOSTLY_IQ3_XL: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;