From 1d921c07e6aa6f87a6995ad6660cb47c9605a0cc Mon Sep 17 00:00:00 2001
From: Holden <34213478+hodlen@users.noreply.github.com>
Date: Mon, 18 Dec 2023 16:20:48 +0800
Subject: [PATCH] convert models with sparse threshold

---
 convert.py                  | 33 ++++++++++++++++++++++++++++++++-
 gguf-py/gguf/constants.py   |  3 +++
 gguf-py/gguf/gguf_writer.py |  3 +++
 llama.cpp                   |  5 +++++
 requirements.txt            |  2 +-
 5 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/convert.py b/convert.py
index fd220eb3..e07bf16d 100755
--- a/convert.py
+++ b/convert.py
@@ -3,6 +3,7 @@
 
 import argparse
 import concurrent.futures
+import dataclasses
 import enum
 import faulthandler
 import functools
@@ -138,6 +139,28 @@ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
 # hparams loading
 #
 
+@dataclass
+class PredictorParams:
+    sparse_threshold: float | None = None
+
+    @staticmethod
+    def loadPredictorJson(model: LazyModel, config_path: Path) -> PredictorParams:
+        config = json.load(open(config_path))
+        return PredictorParams(
+            sparse_threshold = config.get("sparse_threshold"),
+        )
+
+    @staticmethod
+    def load(model_plus: ModelPlus) -> PredictorParams:
+        config_path = model_plus.paths[0].parent / "config.json"
+
+        if config_path.exists():
+            params = PredictorParams.loadPredictorJson(model_plus.model, config_path)
+        else:
+            params = PredictorParams()
+
+        return params
+
 @dataclass
 class Params:
     n_vocab: int
@@ -160,6 +183,9 @@ class Params:
     # path to the directory containing the model files
     path_model: Path | None = None
 
+    # MLP predictor parameters
+    predictor_params: PredictorParams = dataclasses.field(default_factory=PredictorParams)
+
     @staticmethod
     def guessed(model: LazyModel) -> Params:
         # try transformer naming first
@@ -843,6 +869,9 @@ def add_meta_arch(self, params: Params) -> None:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
 
+        if params.predictor_params.sparse_threshold is not None:
+            self.gguf.add_sparse_threshold(params.predictor_params.sparse_threshold)
+
     def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []
         scores = []
@@ -1181,10 +1210,13 @@ def main(args_in: list[str] | None = None) -> None:
 
     if not args.vocab_only:
         model_plus = load_some_model(args.model)
+        params = Params.load(model_plus)
         mlp_predictor_plus = load_mlp_model(args.mlp_model)
+        params.predictor_params = PredictorParams.load(mlp_predictor_plus)
         model_plus = merge_multifile_models([model_plus, mlp_predictor_plus])
     else:
         model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
+        params = Params.load(model_plus)
 
     if args.dump:
         do_dump_model(model_plus)
@@ -1193,7 +1225,6 @@ def main(args_in: list[str] | None = None) -> None:
     if args.bigendian:
         endianess = gguf.GGUFEndian.BIG
 
-    params = Params.load(model_plus)
     if params.n_ctx == -1:
         if args.ctx is None:
             raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index cb31e527..6e90d34f 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -70,6 +70,9 @@ class Tokenizer:
         ADD_EOS          = "tokenizer.ggml.add_eos_token"
         HF_JSON          = "tokenizer.huggingface.json"
         RWKV             = "tokenizer.rwkv.world"
+
+    class PowerInfer:
+        SPARSE_THRESHOLD = "powerinfer.sparse_threshold"
 
 
 #
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index c3b8c588..0483d7ba 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -399,6 +399,9 @@ def add_add_bos_token(self, value: bool) -> None:
     def add_add_eos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_EOS, value)
 
+    def add_sparse_threshold(self, value: float) -> None:
+        self.add_float32(Keys.PowerInfer.SPARSE_THRESHOLD, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
diff --git a/llama.cpp b/llama.cpp
index 57108a51..b4d63bae 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -314,6 +314,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_PAD_ID,          "tokenizer.ggml.padding_token_id" },
     { LLM_KV_TOKENIZER_HF_JSON,         "tokenizer.huggingface.json"      },
     { LLM_KV_TOKENIZER_RWKV,            "tokenizer.rwkv.world"            },
+
+    { LLM_KV_SPARSE_THRESHOLD,          "powerinfer.sparse_threshold"     },
 };
 
 struct LLM_KV {
@@ -2624,6 +2626,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
     if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
     if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    // sparse inference
+    LLAMA_LOG_INFO("%s: sparse_pred_threshold = %.2f\n", __func__, hparams.sparse_pred_threshold);
 }
 
 
diff --git a/requirements.txt b/requirements.txt
index 81c909d0..2b737d98 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 numpy==1.24.4
 sentencepiece==0.1.98
-gguf>=0.1.0
+-e ./gguf-py
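
-- 
Usage note (appended after the signature delimiter, so it is not part of the
commit): the key added here can be exercised end to end with the patched
gguf-py, installed editable via the new requirements.txt entry. A minimal
sketch in Python; the output path and threshold value are illustrative, not
taken from the patch:

    from gguf import GGUFWriter

    # Write a GGUF file whose metadata carries the new PowerInfer key.
    writer = GGUFWriter("sparse-demo.gguf", arch="llama")
    writer.add_sparse_threshold(0.5)  # stored as float32 under "powerinfer.sparse_threshold"
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.close()

On the C++ side, llama.cpp resolves the same string through
LLM_KV_SPARSE_THRESHOLD, and llm_load_print_meta() reports it as
sparse_pred_threshold when the model is loaded.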