From fbca2f27fc7fa9aa4a8ad0357478fdb908472908 Mon Sep 17 00:00:00 2001 From: fairydreaming <166155368+fairydreaming@users.noreply.github.com> Date: Fri, 24 May 2024 14:31:13 +0200 Subject: [PATCH 01/11] Add support for ArcticForCausalLM (#7020) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * common : increase max number of experts to 128 * common : add tensor LLM_TENSOR_FFN_NORM_EXPS for normalization before MoE that runs in parallel to attention + ffn * gguf-py : add architecture-specific block mappings that override selected general block mappings * convert-hf : add model conversion support for ArcticForCausalLM * convert-hf : use added_tokens_decoder from tokenizer_config.json to redefine tokens from SentencePiece model (only for ArcticForCausalLM) * llama : add inference support for LLM_ARCH_ARCTIC --------- Co-authored-by: Stanisław Szymczyk --- convert-hf-to-gguf.py | 151 ++++++++++++++++ gguf-py/gguf/constants.py | 25 +++ gguf-py/gguf/tensor_mapping.py | 19 ++- llama.cpp | 304 ++++++++++++++++++++++++++++----- 4 files changed, 456 insertions(+), 43 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5a00a5e89accb..998877c26da19 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2466,6 +2466,157 @@ def set_vocab(self, *args, **kwargs): self.gguf_writer.add_add_eos_token(True) +@Model.register("ArcticForCausalLM") +class ArcticModel(Model): + model_arch = gguf.MODEL_ARCH.ARCTIC + + def set_vocab(self): + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + logger.error(f'Error: Missing {tokenizer_path}') + sys.exit(1) + + # Read the whole vocabulary from the tokenizer.model file + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + + if "added_tokens_decoder" in tokenizer_config_json: + added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] + for token_id, token_json in added_tokens_decoder.items(): + token_id = int(token_id) + if (token_id >= vocab_size): + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + token_content = token_json["content"] + token_type = SentencePieceTokenTypes.USER_DEFINED + token_score = -10000.0 + + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model + if ("special" in token_json) and token_json["special"]: + if token_content == tokenizer_config_json["unk_token"]: + token_type = SentencePieceTokenTypes.UNKNOWN + else: + token_type = SentencePieceTokenTypes.CONTROL + token_score = 0.0 + + logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})") + tokens[token_id] = token_content.encode("utf-8") + toktypes[token_id] = token_type + scores[token_id] = token_score + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 67e23dcc14840..c9ae259e1d627 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() + ARCTIC = auto() class MODEL_TENSOR(IntEnum): @@ -167,6 +168,7 @@ class MODEL_TENSOR(IntEnum): FFN_DOWN = auto() FFN_UP = auto() FFN_ACT = auto() + FFN_NORM_EXP = auto() FFN_GATE_EXP = auto() FFN_DOWN_EXP = auto() FFN_UP_EXP = auto() @@ -218,6 +220,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.ARCTIC: "arctic", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -251,6 +254,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", @@ -732,6 +736,27 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.ARCTIC: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_NORM_EXP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 8e1cac9152f55..8b1b21d78bb09 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -244,6 +244,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc11", # nomic-bert "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "model.layers.{bid}.residual_mlp.w3", # arctic ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -272,6 +273,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc12", # nomic-bert "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -306,6 +308,7 @@ class TensorNameMap: "encoder.layers.{bid}.mlp.fc2", # nomic-bert "model.layers.{bid}.mlp.c_proj", # starcoder2 "encoder.layer.{bid}.mlp.wo", # jina-bert-v2 + "model.layers.{bid}.residual_mlp.w2", # arctic ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -382,6 +385,18 @@ class TensorNameMap: ), } + # architecture-specific block mappings + arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = { + MODEL_ARCH.ARCTIC: { + MODEL_TENSOR.FFN_NORM: ( + "model.layers.{bid}.residual_layernorm", + ), + MODEL_TENSOR.FFN_NORM_EXP: ( + "model.layers.{bid}.post_attention_layernorm", + ), + }, + } + mapping: dict[str, tuple[MODEL_TENSOR, str]] def __init__(self, arch: MODEL_ARCH, n_blocks: int): @@ -393,12 +408,14 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int): self.mapping[tensor_name] = (tensor, tensor_name) for key in keys: self.mapping[key] = (tensor, tensor_name) + if arch in self.arch_block_mappings_cfg: + self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch]) for bid in range(n_blocks): for tensor, keys in self.block_mappings_cfg.items(): if tensor not in MODEL_TENSORS[arch]: continue # TODO: make this configurable - n_experts = 60 + n_experts = 128 for xid in range(n_experts): tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) self.mapping[tensor_name] = (tensor, tensor_name) diff --git a/llama.cpp b/llama.cpp index 15c66077525a7..3c9fe15bb4596 100644 --- a/llama.cpp +++ b/llama.cpp @@ -103,7 +103,7 @@ #endif #define LLAMA_MAX_NODES 8192 -#define LLAMA_MAX_EXPERTS 60 +#define LLAMA_MAX_EXPERTS 128 // // logging @@ -221,6 +221,7 @@ enum llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, + LLM_ARCH_ARCTIC, LLM_ARCH_UNKNOWN, }; @@ -257,6 +258,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -455,6 +457,7 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility LLM_TENSOR_FFN_GATE_EXP, LLM_TENSOR_FFN_UP_EXP, + LLM_TENSOR_FFN_NORM_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, // merged experts LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_UP_EXPS, @@ -1032,6 +1035,28 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_ARCTIC, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1732,6 +1757,7 @@ enum e_model { MODEL_8x7B, MODEL_8x22B, MODEL_16x12B, + MODEL_10B_128x3_66B, }; static const size_t kiB = 1024; @@ -1907,6 +1933,7 @@ struct llama_layer { struct ggml_tensor * ffn_norm_b; struct ggml_tensor * layer_out_norm; struct ggml_tensor * layer_out_norm_b; + struct ggml_tensor * ffn_norm_exps; // ff struct ggml_tensor * ffn_gate; // w1 @@ -3781,47 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { - case MODEL_14M: return "14M"; - case MODEL_17M: return "17M"; - case MODEL_22M: return "22M"; - case MODEL_33M: return "33M"; - case MODEL_70M: return "70M"; - case MODEL_109M: return "109M"; - case MODEL_137M: return "137M"; - case MODEL_160M: return "160M"; - case MODEL_335M: return "335M"; - case MODEL_410M: return "410M"; - case MODEL_0_5B: return "0.5B"; - case MODEL_1B: return "1B"; - case MODEL_1_4B: return "1.4B"; - case MODEL_2B: return "2B"; - case MODEL_2_8B: return "2.8B"; - case MODEL_3B: return "3B"; - case MODEL_4B: return "4B"; - case MODEL_6_9B: return "6.9B"; - case MODEL_7B: return "7B"; - case MODEL_8B: return "8B"; - case MODEL_12B: return "12B"; - case MODEL_13B: return "13B"; - case MODEL_14B: return "14B"; - case MODEL_15B: return "15B"; - case MODEL_20B: return "20B"; - case MODEL_30B: return "30B"; - case MODEL_34B: return "34B"; - case MODEL_35B: return "35B"; - case MODEL_40B: return "40B"; - case MODEL_65B: return "65B"; - case MODEL_70B: return "70B"; - case MODEL_314B: return "314B"; - case MODEL_SMALL: return "0.1B"; - case MODEL_MEDIUM: return "0.4B"; - case MODEL_LARGE: return "0.8B"; - case MODEL_XL: return "1.5B"; - case MODEL_A2_7B: return "A2.7B"; - case MODEL_8x7B: return "8x7B"; - case MODEL_8x22B: return "8x22B"; - case MODEL_16x12B: return "16x12B"; - default: return "?B"; + case MODEL_14M: return "14M"; + case MODEL_17M: return "17M"; + case MODEL_22M: return "22M"; + case MODEL_33M: return "33M"; + case MODEL_70M: return "70M"; + case MODEL_109M: return "109M"; + case MODEL_137M: return "137M"; + case MODEL_160M: return "160M"; + case MODEL_335M: return "335M"; + case MODEL_410M: return "410M"; + case MODEL_0_5B: return "0.5B"; + case MODEL_1B: return "1B"; + case MODEL_1_4B: return "1.4B"; + case MODEL_2B: return "2B"; + case MODEL_2_8B: return "2.8B"; + case MODEL_3B: return "3B"; + case MODEL_4B: return "4B"; + case MODEL_6_9B: return "6.9B"; + case MODEL_7B: return "7B"; + case MODEL_8B: return "8B"; + case MODEL_12B: return "12B"; + case MODEL_13B: return "13B"; + case MODEL_14B: return "14B"; + case MODEL_15B: return "15B"; + case MODEL_20B: return "20B"; + case MODEL_30B: return "30B"; + case MODEL_34B: return "34B"; + case MODEL_35B: return "35B"; + case MODEL_40B: return "40B"; + case MODEL_65B: return "65B"; + case MODEL_70B: return "70B"; + case MODEL_314B: return "314B"; + case MODEL_SMALL: return "0.1B"; + case MODEL_MEDIUM: return "0.4B"; + case MODEL_LARGE: return "0.8B"; + case MODEL_XL: return "1.5B"; + case MODEL_A2_7B: return "A2.7B"; + case MODEL_8x7B: return "8x7B"; + case MODEL_8x22B: return "8x22B"; + case MODEL_16x12B: return "16x12B"; + case MODEL_10B_128x3_66B: return "10B+128x3.66B"; + default: return "?B"; } } @@ -4343,6 +4371,19 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_ARCTIC: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + if (hparams.n_expert == 128) { + switch (hparams.n_layer) { + case 35: model.type = e_model::MODEL_10B_128x3_66B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } else { + model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -6129,6 +6170,46 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; + case LLM_ARCH_ARCTIC: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -10790,6 +10871,140 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_arctic() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); + cb(ffn_out, "ffn_out", il); + + // MoE + cur = llm_build_norm(ctx0, inpSA, hparams, + model.layers[il].ffn_norm_exps, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm_exps", il); + + cur = llm_build_moe_ffn(ctx0, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + cb, il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_out); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -11004,6 +11219,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_gptneox(); } break; + case LLM_ARCH_ARCTIC: + { + result = llm.build_arctic(); + } break; default: GGML_ASSERT(false); } @@ -16015,6 +16234,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: case LLM_ARCH_OLMO: + case LLM_ARCH_ARCTIC: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 From 27891f6db03de6e3fd5941983838c29bef253352 Mon Sep 17 00:00:00 2001 From: Brian Date: Fri, 24 May 2024 23:47:56 +1000 Subject: [PATCH 02/11] docker.yml: disable light-intel and server-intel test (#7515) * docker.yml: disable light-intel test * docker.yml: disable server-intel test --- .github/workflows/docker.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9b03d19bc77c6..c2838cbd9e73e 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -42,8 +42,9 @@ jobs: - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" } + # TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507 + #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" } + #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo uses: actions/checkout@v4 From d041d2ceaaf50e058622d92921b3e680ffa4e9e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 May 2024 18:59:06 +0300 Subject: [PATCH 03/11] flake.lock: Update (#7232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'flake-parts': 'github:hercules-ci/flake-parts/e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e?narHash=sha256-yzcRNDoyVP7%2BSCNX0wmuDju1NUCt8Dz9%2BlyUXEI0dbI%3D' (2024-05-02) → 'github:hercules-ci/flake-parts/8dc45382d5206bd292f9c2768b8058a8fd8311d9?narHash=sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78%3D' (2024-05-16) • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/63c3a29ca82437c87573e4c6919b09a24ea61b0f?narHash=sha256-4cPymbty65RvF1DWQfc%2BBc8B233A1BWxJnNULJKQ1EY%3D' (2024-05-02) → 'github:NixOS/nixpkgs/4a6b83b05df1a8bd7d99095ec4b4d271f2956b64?narHash=sha256-%2BNpbZRCRisUHKQJZF3CT%2Bxn14ZZQO%2BKjxIIanH3Pvn4%3D' (2024-05-17) Co-authored-by: github-actions[bot] --- flake.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flake.lock b/flake.lock index c9ead0bf70cb4..451dfd32f4db8 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1714641030, - "narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=", + "lastModified": 1715865404, + "narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e", + "rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1714635257, - "narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=", + "lastModified": 1715961556, + "narHash": "sha256-+NpbZRCRisUHKQJZF3CT+xn14ZZQO+KjxIIanH3Pvn4=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f", + "rev": "4a6b83b05df1a8bd7d99095ec4b4d271f2956b64", "type": "github" }, "original": { From b83bab15a5d2a1e7807d09613a9b34309d86cfaa Mon Sep 17 00:00:00 2001 From: compilade Date: Fri, 24 May 2024 21:11:48 -0400 Subject: [PATCH 04/11] gguf-py : fix and simplify quantized shape round-trip (#7483) * gguf-py : fix and simplify quantized shape round-trip * gguf-py : remove unused import --- convert-hf-to-gguf.py | 7 +++---- gguf-py/gguf/gguf_reader.py | 6 +++++- gguf-py/gguf/gguf_writer.py | 8 +++----- gguf-py/gguf/quants.py | 16 +++++++++++++++- gguf-py/scripts/gguf-new-metadata.py | 4 +--- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 998877c26da19..51549ac72f8e7 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -313,11 +313,10 @@ def write_tensors(self): data = data.astype(np.float32) data_qtype = gguf.GGMLQuantizationType.F32 - block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype] + shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape + # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"""{{{', '.join(str(n) for n in reversed( - (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size)) - )}}}""" + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" # n_dims is implicit in the shape logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 21b089f8a2937..e48bc00c388c8 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -12,6 +12,8 @@ import numpy as np import numpy.typing as npt +from .quants import quant_shape_to_byte_shape + if __name__ == "__main__": import sys from pathlib import Path @@ -251,6 +253,7 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: tensor_names.add(tensor_name) ggml_type = GGMLQuantizationType(raw_dtype[0]) n_elems = int(np.prod(dims)) + np_dims = tuple(reversed(dims.tolist())) block_size, type_size = GGML_QUANT_SIZES[ggml_type] n_bytes = n_elems * type_size // block_size data_offs = int(start_offs + offset_tensor[0]) @@ -279,6 +282,7 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: else: item_count = n_bytes item_type = np.uint8 + np_dims = quant_shape_to_byte_shape(np_dims, ggml_type) tensors.append(ReaderTensor( name = tensor_name, tensor_type = ggml_type, @@ -286,7 +290,7 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: n_elements = n_elems, n_bytes = n_bytes, data_offset = data_offs, - data = self._get(data_offs, item_type, item_count), + data = self._get(data_offs, item_type, item_count).reshape(np_dims), field = field, )) self.tensors = tensors diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8b41b54eaa5a6..c194dd5dd1e65 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -13,7 +13,6 @@ import numpy as np from .constants import ( - GGML_QUANT_SIZES, GGUF_DEFAULT_ALIGNMENT, GGUF_MAGIC, GGUF_VERSION, @@ -26,6 +25,8 @@ TokenType, ) +from .quants import quant_shape_from_byte_shape + logger = logging.getLogger(__name__) @@ -229,10 +230,7 @@ def add_tensor_info( else: dtype = raw_dtype if tensor_dtype == np.uint8: - block_size, type_size = GGML_QUANT_SIZES[raw_dtype] - if tensor_shape[-1] % type_size != 0: - raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})") - tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,) + tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) n_dims = len(tensor_shape) self.ti_data += self._pack("I", n_dims) for i in range(n_dims): diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index e7fc0eae3f64b..b22eec1661ce7 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Callable +from typing import Callable, Sequence from numpy.typing import DTypeLike @@ -9,6 +9,20 @@ import numpy as np +def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): + block_size, type_size = GGML_QUANT_SIZES[quant_type] + if shape[-1] % block_size != 0: + raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})") + return (*shape[:-1], shape[-1] // block_size * type_size) + + +def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): + block_size, type_size = GGML_QUANT_SIZES[quant_type] + if shape[-1] % type_size != 0: + raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})") + return (*shape[:-1], shape[-1] // type_size * block_size) + + # same as ggml_compute_fp32_to_bf16 in ggml-impl.h def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray: n = n.astype(np.float32, copy=False).view(np.int32) diff --git a/gguf-py/scripts/gguf-new-metadata.py b/gguf-py/scripts/gguf-new-metadata.py index 63d3c5d8fdcf4..c9f1927f6a0be 100755 --- a/gguf-py/scripts/gguf-new-metadata.py +++ b/gguf-py/scripts/gguf-new-metadata.py @@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new for tensor in reader.tensors: total_bytes += tensor.n_bytes - # Dimensions are written in reverse order, so flip them first - shape = np.flipud(tensor.shape).tolist() - writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type) + writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type) bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) From 57684331fc2d685f7d1f5775af0b9e47d1829833 Mon Sep 17 00:00:00 2001 From: Mikko Juola Date: Fri, 24 May 2024 18:14:42 -0700 Subject: [PATCH 05/11] Make tokenize CLI tool have nicer command line arguments. (#6188) * Make tokenizer.cpp CLI tool nicer. Before this commit, tokenize was a simple CLI tool like this: tokenize MODEL_FILENAME PROMPT [--ids] This simple tool loads the model, takes the prompt, and shows the tokens llama.cpp is interpreting. This changeset makes the tokenize more sophisticated, and more useful for debugging and troubleshooting: tokenize [-m, --model MODEL_FILENAME] [--ids] [--stdin] [--prompt] [-f, --file] [--no-bos] [--log-disable] It also behaves nicer on Windows now, interpreting and rendering Unicode from command line arguments and pipes no matter what code page the user has set on their terminal. * style fix: strlen(str) == 0 --> *str == 0 * Simplify tokenize.cpp; by getting rid of handling positional style arguments. It must now be invoked with long --model, --prompt etc. arguments only. Shortens the code. * tokenize.cpp: iostream header no longer required --------- Co-authored-by: Georgi Gerganov Co-authored-by: brian khuu --- examples/tokenize/tokenize.cpp | 368 ++++++++++++++++++++++++++++++++- 1 file changed, 359 insertions(+), 9 deletions(-) diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 8b1baea800cc8..54c9834afb1b9 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -3,40 +3,390 @@ #include #include +#include #include #include -int main(int argc, char ** argv) { - if (argc < 3 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]); +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include +#include // For CommandLineToArgvW +#endif + +static void print_usage_information(const char * argv0, FILE * stream) { + fprintf(stream, "usage: %s [options]\n\n", argv0); + fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); + fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); + fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); + fprintf(stream, "to control the behavior of the tokenizer.\n\n"); + fprintf(stream, " The possible options are:\n"); + fprintf(stream, "\n"); + fprintf(stream, " -h, --help print this help and exit\n"); + fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); + fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); + fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); + fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); + fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); + fprintf(stream, " --stdin read prompt from standard input.\n"); + fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); +} + +static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) text; + (void) user_data; +} + +static std::string read_prompt_from_file(const char * filepath, bool & success) { + success = false; + + std::ifstream in(filepath, std::ios::binary); + if (!in) { + fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno)); + return std::string(); + } + // do not assume the file is seekable (e.g. /dev/stdin) + std::stringstream buffer; + buffer << in.rdbuf(); + if (in.fail()) { + fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno)); + return std::string(); + } + + success = true; + return buffer.str(); +} + +// +// Function: ingest_args(...) -> vector +// +// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded +// strings, as an STL vector. +// +// In particular, it handles character encoding shenanigans on Windows. +// +// Note: raw_argc and raw_argv are not actually read at all on Windows. +// On Windows we call GetCommandLineW to get the arguments in wchar_t +// format, ignoring the regular argc/argv arguments to main(). +// +// TODO: potential opportunity to roll common stuff into common/console.cpp +// in relation to Windows wchar_t shenanigans. +static std::vector ingest_args(int raw_argc, char ** raw_argv) { + std::vector argv; + + // Handle Windows, if given non-ASCII arguments. + // We convert wchar_t arguments into UTF-8 char* on this platform. + // Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters + // without throwing tantrums. +#if defined(_WIN32) + int argc; + const LPWSTR cmdline_wargv = GetCommandLineW(); + LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc); + + // silence unused arg warnings + (void) raw_argc; + (void) raw_argv; + + for (int i = 0; i < argc; ++i) { + int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL); + char * output_buf = (char *) calloc(length_needed+1, sizeof(char)); + GGML_ASSERT(output_buf); + + WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL); + output_buf[length_needed] = '\0'; + + argv.push_back(output_buf); + free(output_buf); + } + + LocalFree((HLOCAL) wargv); +#else + int argc = raw_argc; + for (int i = 0; i < argc; ++i) { + argv.push_back(raw_argv[i]); + } +#endif + + GGML_ASSERT((unsigned int) argc == argv.size()); + + return argv; +} + +// +// Function: write_utf8_cstr_to_stdout(const char *) -> +// +// writes a string to standard output; taking into account that on Windows +// to display correctly you have to use special handling. Works even if the +// user has not set a unicode code page on a Windows cmd.exe. +// +// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something +// a human-readable is written instead. +// +// On non-Windows systems, simply printfs() the string. +static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) { + invalid_utf8 = false; + +#if defined(_WIN32) + // Are we in a console? + HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); + DWORD dwMode = 0; + + // According to Microsoft docs: + // "WriteConsole fails if it is used with a standard handle that is redirected to a file." + // Also according to the docs, you can use GetConsoleMode to check for that. + if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) { + printf("%s", str); + return; + } + + // MultiByteToWideChar reports an error if str is empty, don't report + // them as invalid_utf8. + if (*str == 0) { + return; + } + int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0); + if (length_needed == 0) { + DWORD err = GetLastError(); + if (err == ERROR_NO_UNICODE_TRANSLATION) { + invalid_utf8 = true; + int len = strlen(str); + printf("<"); + for (int i = 0; i < len; ++i) { + if (i > 0) { + printf(" "); + } + printf("%02x", (uint8_t) str[i]); + } + printf(">"); + return; + } + GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way."); + } + + LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr)); + GGML_ASSERT(wstr); + + MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed); + WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL); + + free(wstr); +#else + // TODO: reporting invalid_utf8 would be useful on non-Windows too. + // printf will silently just write bad unicode. + printf("%s", str); +#endif +} + +int main(int raw_argc, char ** raw_argv) { + const std::vector argv = ingest_args(raw_argc, raw_argv); + const int argc = argv.size(); + + if (argc <= 1) { + print_usage_information(argv[0].c_str(), stderr); + return 1; + } + + ////// + // Read out all the command line arguments. + ////// + + // variables where to put any arguments we see. + bool printing_ids = false; + bool no_bos = false; + bool disable_logging = false; + const char * model_path = NULL; + const char * prompt_path = NULL; + const char * prompt_arg = NULL; + + // track which arguments were explicitly given + // used for sanity checking down the line + bool model_path_set = false; + bool prompt_path_set = false; + bool prompt_set = false; + bool stdin_set = false; + + int iarg = 1; + for (; iarg < argc; ++iarg) { + std::string arg{argv[iarg]}; + if (arg == "-h" || arg == "--help") { + print_usage_information(argv[0].c_str(), stdout); + return 0; + } + else if (arg == "--ids") { + printing_ids = true; + } + else if (arg == "-m" || arg == "--model") { + if (model_path_set) { + fprintf(stderr, "Error: -m or --model specified multiple times.\n"); + return 1; + } + model_path = argv[++iarg].c_str(); + model_path_set = true; + } + else if (arg == "--no-bos") { + no_bos = true; + } + else if (arg == "-p" || arg == "--prompt") { + if (prompt_set) { + fprintf(stderr, "Error: -p or --prompt specified multiple times.\n"); + return 1; + } + prompt_arg = argv[++iarg].c_str(); + prompt_set = true; + } + else if (arg == "-f" || arg == "--file") { + if (prompt_path_set) { + fprintf(stderr, "Error: -f or --file specified multiple times.\n"); + return 1; + } + prompt_path = argv[++iarg].c_str(); + prompt_path_set = true; + } + else if (arg == "--stdin") { + stdin_set = true; + } + else if (arg == "--log-disable") { + disable_logging = true; + } + else { + fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str()); + return 1; + } + } + + ////// + // Sanity check the command line arguments. + ////// + + // Check that we have the required stuff set. + if (model_path_set && model_path == NULL) { + fprintf(stderr, "Error: --model requires an argument.\n"); + return 1; + } + if (!model_path_set) { + fprintf(stderr, "Error: must specify --model.\n"); + return 1; + } + if (prompt_path_set && prompt_path == NULL) { + fprintf(stderr, "Error: --file requires an argument.\n"); + return 1; + } + if (prompt_set && prompt_arg == NULL) { + fprintf(stderr, "Error: --prompt requires an argument.\n"); + return 1; + } + const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set); + if (prompts_set > 1) { + fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n"); + return 1; + } + // Must have some prompt. + if (prompts_set == 0) { + fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n"); return 1; } - const char * model_path = argv[1]; - const char * prompt = argv[2]; + GGML_ASSERT(model_path); + GGML_ASSERT(prompt_path || prompt_arg || stdin_set); - const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids"; + ////// + // Figure out where will the prompt come from. + ////// + + std::string prompt; + if (prompt_path_set) { + bool success = false; + prompt = read_prompt_from_file(prompt_path, success); + if (!success) { + return 1; + } + } else if (prompt_set) { + prompt = prompt_arg; + } else { + GGML_ASSERT(stdin_set); + // we read stdin *after* loading model (early exit if model cannot + // be loaded, which can be a nicer user experience) + } + + ////// + // Start actually doing the tokenizing stuff. + ////// + +#ifdef LOG_DISABLE_LOGS + disable_logging = true; +#endif + + if (disable_logging) { + llama_log_set(llama_log_callback_null, NULL); + } llama_backend_init(); llama_model_params model_params = llama_model_default_params(); model_params.vocab_only = true; llama_model * model = llama_load_model_from_file(model_path, model_params); + if (!model) { + fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path); + return 1; + } llama_context_params ctx_params = llama_context_default_params(); llama_context * ctx = llama_new_context_with_model(model, ctx_params); + if (!ctx) { + fprintf(stderr, "Error: could not create context.\n"); + return 1; + } + + // read entire prompt from stdin? + if (stdin_set) { + GGML_ASSERT(!prompt_path_set && !prompt_set); + + std::stringstream stdin_buffer; + stdin_buffer << std::cin.rdbuf(); + if (std::cin.fail()) { + fprintf(stderr, "Error: could not read the entire standard input.\n"); + return 1; + } + + prompt = stdin_buffer.str(); + } + + const bool model_wants_add_bos = llama_should_add_bos_token(model); + const bool add_bos = model_wants_add_bos && !no_bos; std::vector tokens; + tokens = ::llama_tokenize(model, prompt, add_bos, true); - tokens = ::llama_tokenize(model, prompt, true, true); + if (printing_ids) { + printf("["); + } for (int i = 0; i < (int) tokens.size(); i++) { if (printing_ids) { - printf("%d\n", tokens[i]); + if (i > 0) { + printf(", "); + } + printf("%d", tokens[i]); } else { - printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str()); + bool invalid_utf8 = false; + printf("%6d -> '", tokens[i]); + write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); + if (invalid_utf8) { + printf("' (utf-8 decode failure)\n"); + } else { + printf("'\n"); + } } } + if (printing_ids) { + printf("]\n"); + } + + // silence valgrind + llama_free(ctx); + llama_free_model(model); + return 0; } From 902184dd3a9d6685e752b19027a48423742531db Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 25 May 2024 05:30:59 +0200 Subject: [PATCH 06/11] fix missing slash in `fs_get_cache_directory()` (#7503) * fix missing slash in fs_get_cache_directory() * use LOCALAPPDATA for fs_get_cache_directory() * better code style --- common/common.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 7500e08ff1be4..401d72bac00ce 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1855,11 +1855,15 @@ bool fs_create_directory_with_parents(const std::string & path) { std::string fs_get_cache_directory() { std::string cache_directory = ""; + auto ensure_trailing_slash = [](std::string p) { + // Make sure to add trailing slash + if (p.back() != DIRECTORY_SEPARATOR) { + p += DIRECTORY_SEPARATOR; + } + return p; + }; if (getenv("LLAMA_CACHE")) { cache_directory = std::getenv("LLAMA_CACHE"); - if (cache_directory.back() != DIRECTORY_SEPARATOR) { - cache_directory += DIRECTORY_SEPARATOR; - } } else { #ifdef __linux__ if (std::getenv("XDG_CACHE_HOME")) { @@ -1870,12 +1874,12 @@ std::string fs_get_cache_directory() { #elif defined(__APPLE__) cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); #elif defined(_WIN32) - cache_directory = std::getenv("APPDATA"); + cache_directory = std::getenv("LOCALAPPDATA"); #endif // __linux__ + cache_directory = ensure_trailing_slash(cache_directory); cache_directory += "llama.cpp"; - cache_directory += DIRECTORY_SEPARATOR; } - return cache_directory; + return ensure_trailing_slash(cache_directory); } From 9791f402580838d7f8543ae7bc633ef265e436f0 Mon Sep 17 00:00:00 2001 From: Elton Kola Date: Sat, 25 May 2024 04:11:33 -0400 Subject: [PATCH 07/11] android : module (#7502) * move ndk code to a new library * add gradle file --- examples/llama.android/app/build.gradle.kts | 25 +------ .../java/com/example/llama/MainViewModel.kt | 13 ++-- examples/llama.android/build.gradle.kts | 1 + examples/llama.android/llama/.gitignore | 1 + .../src/main/cpp => llama}/CMakeLists.txt | 2 +- examples/llama.android/llama/build.gradle.kts | 68 +++++++++++++++++++ .../llama.android/llama/consumer-rules.pro | 0 .../llama.android/llama/proguard-rules.pro | 21 ++++++ .../llama/cpp/ExampleInstrumentedTest.kt | 24 +++++++ .../llama/src/main/AndroidManifest.xml | 4 ++ .../llama/src/main/cpp/CMakeLists.txt | 49 +++++++++++++ .../src/main/cpp/llama-android.cpp | 28 ++++---- .../java/android/llama/cpp/LLamaAndroid.kt} | 8 +-- .../java/android/llama/cpp/ExampleUnitTest.kt | 17 +++++ examples/llama.android/settings.gradle.kts | 1 + 15 files changed, 213 insertions(+), 49 deletions(-) create mode 100644 examples/llama.android/llama/.gitignore rename examples/llama.android/{app/src/main/cpp => llama}/CMakeLists.txt (98%) create mode 100644 examples/llama.android/llama/build.gradle.kts create mode 100644 examples/llama.android/llama/consumer-rules.pro create mode 100644 examples/llama.android/llama/proguard-rules.pro create mode 100644 examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt create mode 100644 examples/llama.android/llama/src/main/AndroidManifest.xml create mode 100644 examples/llama.android/llama/src/main/cpp/CMakeLists.txt rename examples/llama.android/{app => llama}/src/main/cpp/llama-android.cpp (92%) rename examples/llama.android/{app/src/main/java/com/example/llama/Llm.kt => llama/src/main/java/android/llama/cpp/LLamaAndroid.kt} (97%) create mode 100644 examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index d42140efe8168..8d1b37195efd4 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -7,8 +7,6 @@ android { namespace = "com.example.llama" compileSdk = 34 - ndkVersion = "26.1.10909125" - defaultConfig { applicationId = "com.example.llama" minSdk = 33 @@ -20,17 +18,6 @@ android { vectorDrawables { useSupportLibrary = true } - ndk { - // Add NDK properties if wanted, e.g. - // abiFilters += listOf("arm64-v8a") - } - externalNativeBuild { - cmake { - arguments += "-DCMAKE_BUILD_TYPE=Release" - cppFlags += listOf() - arguments += listOf() - } - } } buildTypes { @@ -55,17 +42,6 @@ android { composeOptions { kotlinCompilerExtensionVersion = "1.5.1" } - packaging { - resources { - excludes += "/META-INF/{AL2.0,LGPL2.1}" - } - } - externalNativeBuild { - cmake { - path = file("src/main/cpp/CMakeLists.txt") - version = "3.22.1" - } - } } dependencies { @@ -78,6 +54,7 @@ dependencies { implementation("androidx.compose.ui:ui-graphics") implementation("androidx.compose.ui:ui-tooling-preview") implementation("androidx.compose.material3:material3") + implementation(project(":llama")) testImplementation("junit:junit:4.13.2") androidTestImplementation("androidx.test.ext:junit:1.1.5") androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt index be95e22218332..45ac29938f441 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt @@ -1,5 +1,6 @@ package com.example.llama +import android.llama.cpp.LLamaAndroid import android.util.Log import androidx.compose.runtime.getValue import androidx.compose.runtime.mutableStateOf @@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope import kotlinx.coroutines.flow.catch import kotlinx.coroutines.launch -class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { +class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() { companion object { @JvmStatic private val NanosPerSecond = 1_000_000_000.0 @@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { viewModelScope.launch { try { - llm.unload() + llamaAndroid.unload() } catch (exc: IllegalStateException) { messages += exc.message!! } @@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { messages += "" viewModelScope.launch { - llm.send(text) + llamaAndroid.send(text) .catch { Log.e(tag, "send() failed", it) messages += it.message!! @@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { viewModelScope.launch { try { val start = System.nanoTime() - val warmupResult = llm.bench(pp, tg, pl, nr) + val warmupResult = llamaAndroid.bench(pp, tg, pl, nr) val end = System.nanoTime() messages += warmupResult @@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { return@launch } - messages += llm.bench(512, 128, 1, 3) + messages += llamaAndroid.bench(512, 128, 1, 3) } catch (exc: IllegalStateException) { Log.e(tag, "bench() failed", exc) messages += exc.message!! @@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { fun load(pathToModel: String) { viewModelScope.launch { try { - llm.load(pathToModel) + llamaAndroid.load(pathToModel) messages += "Loaded $pathToModel" } catch (exc: IllegalStateException) { Log.e(tag, "load() failed", exc) diff --git a/examples/llama.android/build.gradle.kts b/examples/llama.android/build.gradle.kts index 50ebc821122f6..acd1ada7d9b1a 100644 --- a/examples/llama.android/build.gradle.kts +++ b/examples/llama.android/build.gradle.kts @@ -2,4 +2,5 @@ plugins { id("com.android.application") version "8.2.0" apply false id("org.jetbrains.kotlin.android") version "1.9.0" apply false + id("com.android.library") version "8.2.0" apply false } diff --git a/examples/llama.android/llama/.gitignore b/examples/llama.android/llama/.gitignore new file mode 100644 index 0000000000000..796b96d1c4023 --- /dev/null +++ b/examples/llama.android/llama/.gitignore @@ -0,0 +1 @@ +/build diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt similarity index 98% rename from examples/llama.android/app/src/main/cpp/CMakeLists.txt rename to examples/llama.android/llama/CMakeLists.txt index 4536974a5c50c..a5618cac05849 100644 --- a/examples/llama.android/app/src/main/cpp/CMakeLists.txt +++ b/examples/llama.android/llama/CMakeLists.txt @@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama) # used in the AndroidManifest.xml file. add_library(${CMAKE_PROJECT_NAME} SHARED # List C/C++ source files with relative paths to this CMakeLists.txt. - llama-android.cpp) + llama-android.cpp) # Specifies libraries CMake should link to your target library. You # can link libraries from various origins, such as libraries defined in this diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts new file mode 100644 index 0000000000000..0a3806172f05f --- /dev/null +++ b/examples/llama.android/llama/build.gradle.kts @@ -0,0 +1,68 @@ +plugins { + id("com.android.library") + id("org.jetbrains.kotlin.android") +} + +android { + namespace = "android.llama.cpp" + compileSdk = 34 + + defaultConfig { + minSdk = 33 + + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + consumerProguardFiles("consumer-rules.pro") + ndk { + // Add NDK properties if wanted, e.g. + // abiFilters += listOf("arm64-v8a") + } + externalNativeBuild { + cmake { + arguments += "-DCMAKE_BUILD_TYPE=Release" + cppFlags += listOf() + arguments += listOf() + + cppFlags("") + } + } + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + externalNativeBuild { + cmake { + path("src/main/cpp/CMakeLists.txt") + version = "3.22.1" + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } + kotlinOptions { + jvmTarget = "1.8" + } + + packaging { + resources { + excludes += "/META-INF/{AL2.0,LGPL2.1}" + } + } +} + +dependencies { + + implementation("androidx.core:core-ktx:1.12.0") + implementation("androidx.appcompat:appcompat:1.6.1") + implementation("com.google.android.material:material:1.11.0") + testImplementation("junit:junit:4.13.2") + androidTestImplementation("androidx.test.ext:junit:1.1.5") + androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") +} diff --git a/examples/llama.android/llama/consumer-rules.pro b/examples/llama.android/llama/consumer-rules.pro new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/examples/llama.android/llama/proguard-rules.pro b/examples/llama.android/llama/proguard-rules.pro new file mode 100644 index 0000000000000..f1b424510da51 --- /dev/null +++ b/examples/llama.android/llama/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile diff --git a/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt new file mode 100644 index 0000000000000..05d6ab5d2dd23 --- /dev/null +++ b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt @@ -0,0 +1,24 @@ +package android.llama.cpp + +import androidx.test.platform.app.InstrumentationRegistry +import androidx.test.ext.junit.runners.AndroidJUnit4 + +import org.junit.Test +import org.junit.runner.RunWith + +import org.junit.Assert.* + +/** + * Instrumented test, which will execute on an Android device. + * + * See [testing documentation](http://d.android.com/tools/testing). + */ +@RunWith(AndroidJUnit4::class) +class ExampleInstrumentedTest { + @Test + fun useAppContext() { + // Context of the app under test. + val appContext = InstrumentationRegistry.getInstrumentation().targetContext + assertEquals("android.llama.cpp.test", appContext.packageName) + } +} diff --git a/examples/llama.android/llama/src/main/AndroidManifest.xml b/examples/llama.android/llama/src/main/AndroidManifest.xml new file mode 100644 index 0000000000000..8bdb7e14b389a --- /dev/null +++ b/examples/llama.android/llama/src/main/AndroidManifest.xml @@ -0,0 +1,4 @@ + + + + diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt new file mode 100644 index 0000000000000..42ebaad49a560 --- /dev/null +++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt @@ -0,0 +1,49 @@ +# For more information about using CMake with Android Studio, read the +# documentation: https://d.android.com/studio/projects/add-native-code.html. +# For more examples on how to use CMake, see https://github.com/android/ndk-samples. + +# Sets the minimum CMake version required for this project. +cmake_minimum_required(VERSION 3.22.1) + +# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, +# Since this is the top level CMakeLists.txt, the project name is also accessible +# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level +# build script scope). +project("llama-android") + +include(FetchContent) +FetchContent_Declare( + llama + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG master +) + +# Also provides "common" +FetchContent_MakeAvailable(llama) + +# Creates and names a library, sets it as either STATIC +# or SHARED, and provides the relative paths to its source code. +# You can define multiple libraries, and CMake builds them for you. +# Gradle automatically packages shared libraries with your APK. +# +# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define +# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} +# is preferred for the same purpose. +# +# In order to load a library into your app from Java/Kotlin, you must call +# System.loadLibrary() and pass the name of the library defined here; +# for GameActivity/NativeActivity derived applications, the same library name must be +# used in the AndroidManifest.xml file. +add_library(${CMAKE_PROJECT_NAME} SHARED + # List C/C++ source files with relative paths to this CMakeLists.txt. + llama-android.cpp) + +# Specifies libraries CMake should link to your target library. You +# can link libraries from various origins, such as libraries defined in this +# build script, prebuilt third-party libraries, or Android system libraries. +target_link_libraries(${CMAKE_PROJECT_NAME} + # List libraries link to the target library + llama + common + android + log) diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp similarity index 92% rename from examples/llama.android/app/src/main/cpp/llama-android.cpp rename to examples/llama.android/llama/src/main/cpp/llama-android.cpp index 4af9de3038359..874158ef0f98f 100644 --- a/examples/llama.android/app/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) { extern "C" JNIEXPORT jlong JNICALL -Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) { +Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) { llama_model_params model_params = llama_model_default_params(); auto path_to_model = env->GetStringUTFChars(filename, 0); @@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) { extern "C" JNIEXPORT void JNICALL -Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) { +Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) { llama_free_model(reinterpret_cast(model)); } extern "C" JNIEXPORT jlong JNICALL -Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) { +Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) { auto model = reinterpret_cast(jmodel); if (!model) { @@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) { extern "C" JNIEXPORT void JNICALL -Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) { +Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) { llama_free(reinterpret_cast(context)); } extern "C" JNIEXPORT void JNICALL -Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) { +Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) { llama_backend_free(); } extern "C" JNIEXPORT void JNICALL -Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) { +Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) { llama_log_set(log_callback, NULL); } extern "C" JNIEXPORT jstring JNICALL -Java_com_example_llama_Llm_bench_1model( +Java_android_llama_cpp_LLamaAndroid_bench_1model( JNIEnv *env, jobject, jlong context_pointer, @@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model( extern "C" JNIEXPORT void JNICALL -Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { +Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { llama_batch_free(*reinterpret_cast(batch_pointer)); } extern "C" JNIEXPORT jlong JNICALL -Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { +Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { // Source: Copy of llama.cpp:llama_batch_init but heap-allocated. @@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb extern "C" JNIEXPORT void JNICALL -Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) { +Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) { llama_backend_init(); } extern "C" JNIEXPORT jstring JNICALL -Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) { +Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) { return env->NewStringUTF(llama_print_system_info()); } extern "C" JNIEXPORT jint JNICALL -Java_com_example_llama_Llm_completion_1init( +Java_android_llama_cpp_LLamaAndroid_completion_1init( JNIEnv *env, jobject, jlong context_pointer, @@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init( extern "C" JNIEXPORT jstring JNICALL -Java_com_example_llama_Llm_completion_1loop( +Java_android_llama_cpp_LLamaAndroid_completion_1loop( JNIEnv * env, jobject, jlong context_pointer, @@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop( extern "C" JNIEXPORT void JNICALL -Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { +Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { llama_kv_cache_clear(reinterpret_cast(context)); } diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt similarity index 97% rename from examples/llama.android/app/src/main/java/com/example/llama/Llm.kt rename to examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt index d86afee379083..6c63e54e0d908 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt +++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt @@ -1,4 +1,4 @@ -package com.example.llama +package android.llama.cpp import android.util.Log import kotlinx.coroutines.CoroutineDispatcher @@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext import java.util.concurrent.Executors import kotlin.concurrent.thread -class Llm { +class LLamaAndroid { private val tag: String? = this::class.simpleName private val threadLocalState: ThreadLocal = ThreadLocal.withInitial { State.Idle } @@ -165,8 +165,8 @@ class Llm { } // Enforce only one instance of Llm. - private val _instance: Llm = Llm() + private val _instance: LLamaAndroid = LLamaAndroid() - fun instance(): Llm = _instance + fun instance(): LLamaAndroid = _instance } } diff --git a/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt new file mode 100644 index 0000000000000..cbbb974d32266 --- /dev/null +++ b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt @@ -0,0 +1,17 @@ +package android.llama.cpp + +import org.junit.Test + +import org.junit.Assert.* + +/** + * Example local unit test, which will execute on the development machine (host). + * + * See [testing documentation](http://d.android.com/tools/testing). + */ +class ExampleUnitTest { + @Test + fun addition_isCorrect() { + assertEquals(4, 2 + 2) + } +} diff --git a/examples/llama.android/settings.gradle.kts b/examples/llama.android/settings.gradle.kts index 2ba32c4fafc5c..c7c1a034a45b8 100644 --- a/examples/llama.android/settings.gradle.kts +++ b/examples/llama.android/settings.gradle.kts @@ -15,3 +15,4 @@ dependencyResolutionManagement { rootProject.name = "LlamaAndroid" include(":app") +include(":llama") From faa0e6979a11dcb731e9d778ad42ceaa0302015e Mon Sep 17 00:00:00 2001 From: "Masaya, Kato" <62578291+msy-kato@users.noreply.github.com> Date: Sat, 25 May 2024 17:42:31 +0900 Subject: [PATCH 08/11] ggml: aarch64: SVE kernels for q8_0_q8_0, q4_0_q8_0 vector dot (#7433) * Add SVE support for q4_0_q8_0 q8_0_q8_0 * remove ifdef --- CMakeLists.txt | 4 +++ common/common.cpp | 1 + ggml-impl.h | 4 +++ ggml-quants.c | 66 +++++++++++++++++++++++++++++++++++++++++++++-- ggml.c | 10 +++++++ ggml.h | 1 + llama.cpp | 1 + 7 files changed, 85 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef02ff66967f3..c5add8239c2bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ else() set(INS_ENB ON) endif() +option(LLAMA_SVE "llama: enable SVE" OFF) option(LLAMA_AVX "llama: enable AVX" ${INS_ENB}) option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB}) option(LLAMA_AVX512 "llama: enable AVX512" OFF) @@ -1040,6 +1041,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR # Raspberry Pi 3, 4, Zero 2 (32-bit) list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() + if (LLAMA_SVE) + list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) + endif() endif() elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND diff --git a/common/common.cpp b/common/common.cpp index 401d72bac00ce..c6459038560f1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2844,6 +2844,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false"); fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); diff --git a/ggml-impl.h b/ggml-impl.h index 362d40f4d1d8b..5e77471f332f4 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -144,6 +144,10 @@ extern "C" { #endif #endif +#if defined(__ARM_FEATURE_SVE) +#include +#endif + // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t diff --git a/ggml-quants.c b/ggml-quants.c index bb01ce93cb969..4f2c7224c3e75 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3813,7 +3813,44 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r return; } #endif -#if defined(__ARM_NEON) +#if defined(__ARM_FEATURE_SVE) + const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); + const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); + + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); +#elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -5384,7 +5421,32 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r return; } #endif -#if defined(__ARM_NEON) +#if defined(__ARM_FEATURE_SVE) + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); + + assert(nb % 2 == 0); // TODO: handle odd nb + + for (int i = 0; i < nb; i += 2) { + const block_q8_0 * restrict x0 = &x[i + 0]; + const block_q8_0 * restrict x1 = &x[i + 1]; + const block_q8_0 * restrict y0 = &y[i + 0]; + const block_q8_0 * restrict y1 = &y[i + 1]; + + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); +#elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); diff --git a/ggml.c b/ggml.c index 9e72b7a765dba..5145ceec9f4b2 100644 --- a/ggml.c +++ b/ggml.c @@ -22742,6 +22742,16 @@ int ggml_cpu_has_neon(void) { #endif } +int ggml_cpu_has_sve(void) { +#if defined(__ARM_FEATURE_SVE) + // TODO: Currently, SVE 256 bit is only supported. + GGML_ASSERT(svcntb() == QK8_0); + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_arm_fma(void) { #if defined(__ARM_FEATURE_FMA) return 1; diff --git a/ggml.h b/ggml.h index be81e0c52316b..f803ba7241fe1 100644 --- a/ggml.h +++ b/ggml.h @@ -2404,6 +2404,7 @@ extern "C" { GGML_API int ggml_cpu_has_avx512_bf16(void); GGML_API int ggml_cpu_has_fma (void); GGML_API int ggml_cpu_has_neon (void); + GGML_API int ggml_cpu_has_sve (void); GGML_API int ggml_cpu_has_arm_fma (void); GGML_API int ggml_cpu_has_metal (void); GGML_API int ggml_cpu_has_f16c (void); diff --git a/llama.cpp b/llama.cpp index 3c9fe15bb4596..85cb3140d945b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -18337,6 +18337,7 @@ const char * llama_print_system_info(void) { s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | "; s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; From 00c63907931bb08a0ed2b7e38cf44dd290143cb9 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 25 May 2024 05:04:03 -0400 Subject: [PATCH 09/11] main : don't print special tokens with --grammar (#6923) * main : don't print special tokens with --grammar The CLI interface was recently changed to print special control tokens like the stop message one. This token shouldn't be printed if the grammar flag was passed, unless the grammar specifies it, because that breaks shell-scriptability. * main: use seperate stream for control characters * main: use dprintf and add --ctrl-token-no-out and --ctrl-token-fd-out * main: dprintf isn't part of the IEEE POSIX standard. Just use write(). * main: remove --ctrl-token-fd-out in favor for fcntl() based detection * common.cpp: accidentally removed --interactive-first * main: only merge stdout and control token if not in conversation or grammar mode * main: rejig control token descriptor handling * main: must check pipe status on very top of program * main: renamed --no-special from --ctrl-token-no-out and other refactoring * main: refactor ctrl_token_no_out --> no_special * llama: rename llama_token_is_control_token() to llama_token_is_control() * main: remove special token file descriptor feature (#5) --------- Co-authored-by: Brian --- common/common.cpp | 5 +++++ common/common.h | 1 + examples/main/main.cpp | 20 +++++++++++++++++--- llama.cpp | 4 ++++ llama.h | 3 +++ 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c6459038560f1..781f2166bb66a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.interactive_specials = true; return true; } + if (arg == "--no-special") { + params.no_special = true; + return true; + } if (arg == "--embedding") { params.embedding = true; return true; @@ -1364,6 +1368,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); + printf(" --no-special control tokens output disabled\n"); printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); diff --git a/common/common.h b/common/common.h index f68f3c2979b94..5388f6b68973c 100644 --- a/common/common.h +++ b/common/common.h @@ -146,6 +146,7 @@ struct gpt_params { bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode + bool no_special = false; // disable control token output bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 09fa85fce0ee3..ac35772f1e133 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -740,18 +740,32 @@ int main(int argc, char ** argv) { // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation); - printf("%s", token_str.c_str()); + const std::string token_str = llama_token_to_piece(ctx, id); + + // Console/Stream Output + if (!llama_token_is_control(llama_get_model(ctx), id)) { + // Stream Output Token To Standard Output + fprintf(stdout, "%s", token_str.c_str()); + } else if (!params.no_special && !params.conversation) { + // Stream Control Token To Standard Output Stream + fprintf(stdout, "%s", token_str.c_str()); + } + // Record Displayed Tokens To Log + // Note: Generated tokens are created one by one hence this check if (embd.size() > 1) { + // Incoming Requested Tokens input_tokens.push_back(id); } else { + // Outgoing Generated Tokens output_tokens.push_back(id); output_ss << token_str; } + + fflush(stdout); } - fflush(stdout); } + // reset color to default if there is no pending user input if (input_echo && (int) embd_inp.size() == n_consumed) { console::set_display(console::reset); diff --git a/llama.cpp b/llama.cpp index 85cb3140d945b..989d27b9dfb3a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17861,6 +17861,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) { ); } +bool llama_token_is_control(const struct llama_model * model, llama_token token) { + return llama_is_control_token(model->vocab, token); +} + llama_token llama_token_bos(const struct llama_model * model) { return model->vocab.special_bos_id; } diff --git a/llama.h b/llama.h index 16cece5db0e78..16676269dd38a 100644 --- a/llama.h +++ b/llama.h @@ -823,6 +823,9 @@ extern "C" { // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); + // Identify if Token Id is a control token or a render-able token + LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token); + // Special tokens LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence From 3cbd23ed88c03a27e1eb6090ac4a8186ca9ac29a Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 25 May 2024 19:30:42 +1000 Subject: [PATCH 10/11] labeler: added Apple Metal detector (+Kompute) (#7529) * labeler: added Apple Metal detector [no ci] * labeler: add Kompute to detector [no ci] --- .github/labeler.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index a67f78044c46a..97d739b5811e8 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,5 +1,16 @@ # https://github.com/actions/labeler - +Kompute: + - changed-files: + - any-glob-to-any-file: + - ggml-kompute.h + - ggml-kompute.cpp + - README-kompute.md +Apple Metal: + - changed-files: + - any-glob-to-any-file: + - ggml-metal.h + - ggml-metal.cpp + - README-metal.md SYCL: - changed-files: - any-glob-to-any-file: @@ -9,6 +20,7 @@ SYCL: Nvidia GPU: - changed-files: - any-glob-to-any-file: + - ggml-cuda.h - ggml-cuda/** Vulkan: - changed-files: From 9588f196b1d7b21bdff013fcf958c249576b2619 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 May 2024 15:21:30 +0300 Subject: [PATCH 11/11] train : change default FA argument (#7528) --- common/train.cpp | 2 +- examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 2d41a1d29a83c..fef1e57c94655 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() { params.custom_n_ctx = false; - params.use_flash = true; + params.use_flash = false; params.use_checkpointing = true; params.sample_start = ""; diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 746c3fbef8412..8ca9f8915916c 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -774,7 +774,7 @@ static struct train_params get_default_train_params() { params.samples_start_after_nl = false; params.use_adam = true; - params.use_flash = true; + params.use_flash = false; params.use_scratch = true; // only adam