From 1d921c07e6aa6f87a6995ad6660cb47c9605a0cc Mon Sep 17 00:00:00 2001
From: Holden <34213478+hodlen@users.noreply.github.com>
Date: Mon, 18 Dec 2023 16:20:48 +0800
Subject: [PATCH] convert models with sparse threshold

---
 convert.py                  | 33 ++++++++++++++++++++++++++++++++-
 gguf-py/gguf/constants.py   |  3 +++
 gguf-py/gguf/gguf_writer.py |  3 +++
 llama.cpp                   |  5 +++++
 requirements.txt            |  2 +-
 5 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/convert.py b/convert.py
index fd220eb3..e07bf16d 100755
--- a/convert.py
+++ b/convert.py
@@ -3,6 +3,7 @@
 
 import argparse
 import concurrent.futures
+import dataclasses
 import enum
 import faulthandler
 import functools
@@ -138,6 +139,28 @@ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
 # hparams loading
 #
 
+@dataclass
+class PredictorParams:
+    sparse_threshold: float | None = None
+
+    @staticmethod
+    def loadPredictorJson(model: LazyModel, config_path: Path) -> PredictorParams:
+        config = json.load(open(config_path))
+        return PredictorParams(
+            sparse_threshold = config.get("sparse_threshold"),
+        )
+
+    @staticmethod
+    def load(model_plus: ModelPlus) -> PredictorParams:
+        config_path = model_plus.paths[0].parent / "config.json"
+
+        if config_path.exists():
+            params = PredictorParams.loadPredictorJson(model_plus.model, config_path)
+        else:
+            params = PredictorParams()
+
+        return params
+
 @dataclass
 class Params:
     n_vocab: int
@@ -160,6 +183,9 @@ class Params:
     # path to the directory containing the model files
     path_model: Path | None = None
 
+    # MLP predictor parameters
+    predictor_params: PredictorParams = dataclasses.field(default_factory=PredictorParams)
+
     @staticmethod
     def guessed(model: LazyModel) -> Params:
         # try transformer naming first
@@ -843,6 +869,9 @@ def add_meta_arch(self, params: Params) -> None:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
 
+        if params.predictor_params.sparse_threshold is not None:
+            self.gguf.add_sparse_threshold(params.predictor_params.sparse_threshold)
+
     def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []
         scores = []
@@ -1181,10 +1210,13 @@ def main(args_in: list[str] | None = None) -> None:
 
     if not args.vocab_only:
         model_plus = load_some_model(args.model)
+        params = Params.load(model_plus)
         mlp_predictor_plus = load_mlp_model(args.mlp_model)
+        params.predictor_params = PredictorParams.load(mlp_predictor_plus)
         model_plus = merge_multifile_models([model_plus, mlp_predictor_plus])
     else:
         model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
+        params = Params.load(model_plus)
 
     if args.dump:
         do_dump_model(model_plus)
@@ -1193,7 +1225,6 @@ def main(args_in: list[str] | None = None) -> None:
     if args.bigendian:
         endianess = gguf.GGUFEndian.BIG
 
-    params = Params.load(model_plus)
     if params.n_ctx == -1:
         if args.ctx is None:
             raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index cb31e527..6e90d34f 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -70,6 +70,9 @@ class Tokenizer:
         ADD_EOS          = "tokenizer.ggml.add_eos_token"
         HF_JSON          = "tokenizer.huggingface.json"
         RWKV             = "tokenizer.rwkv.world"
+
+    class PowerInfer:
+        SPARSE_THRESHOLD = "powerinfer.sparse_threshold"
 
 
 #
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index c3b8c588..0483d7ba 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -399,6 +399,9 @@ def add_add_bos_token(self, value: bool) -> None:
     def add_add_eos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_EOS, value)
 
+    def add_sparse_threshold(self, value: float) -> None:
+        self.add_float32(Keys.PowerInfer.SPARSE_THRESHOLD, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
diff --git a/llama.cpp b/llama.cpp
index 57108a51..b4d63bae 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -314,6 +314,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_PAD_ID,          "tokenizer.ggml.padding_token_id" },
     { LLM_KV_TOKENIZER_HF_JSON,         "tokenizer.huggingface.json"      },
     { LLM_KV_TOKENIZER_RWKV,            "tokenizer.rwkv.world"            },
+
+    { LLM_KV_SPARSE_THRESHOLD,          "powerinfer.sparse_threshold"     },
 };
 
 struct LLM_KV {
@@ -2624,6 +2626,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
     if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
     if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    // sparse inference
+    LLAMA_LOG_INFO("%s: sparse_pred_threshold = %.2f\n", __func__, hparams.sparse_pred_threshold);
 }
 
 
diff --git a/requirements.txt b/requirements.txt
index 81c909d0..2b737d98 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 numpy==1.24.4
 sentencepiece==0.1.98
-gguf>=0.1.0
+-e ./gguf-py
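
-- 
Usage note (appended after the signature delimiter, so it is not part of the
commit): the key added here can be exercised end to end with the patched
gguf-py, installed editable via the new requirements.txt entry. A minimal
sketch in Python; the output path and threshold value are illustrative, not
taken from the patch:

    from gguf import GGUFWriter

    # Write a GGUF file whose metadata carries the new PowerInfer key.
    writer = GGUFWriter("sparse-demo.gguf", arch="llama")
    writer.add_sparse_threshold(0.5)  # stored as float32 under "powerinfer.sparse_threshold"
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.close()

On the C++ side, llama.cpp resolves the same string through
LLM_KV_SPARSE_THRESHOLD, and llm_load_print_meta() reports it as
sparse_pred_threshold when the model is loaded.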