diff --git a/include/llama.h b/include/llama.h index 7db12a35b7804..51cc73af65f1c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -471,7 +471,7 @@ extern "C" { LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); // TODO: remove const? LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); - LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); + LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 3ce36886c0e1f..b448614e471d6 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -4,14 +4,13 @@ #include "llama-mmap.h" #include "llama-model.h" -#include #include #include #include // vec -struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const { +ggml_tensor * llama_adapter_cvec::tensor_for(int il) const { if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { return nullptr; } @@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const { return tensors[il]; } -struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const { +ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const { ggml_tensor * layer_dir = tensor_for(il); if (layer_dir != nullptr) { cur = ggml_add(ctx, cur, layer_dir); @@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { - struct ggml_init_params params = { + ggml_init_params params = { /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, @@ -135,7 +134,7 @@ bool llama_adapter_cvec::apply( // lora -llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) { +llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) { const std::string name(w->name); const auto pos = ab_map.find(name); @@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * return nullptr; } -static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) { +static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) { LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); ggml_context * ctx_init; - struct gguf_init_params meta_gguf_params = { + gguf_init_params meta_gguf_params = { /* .no_alloc = */ true, /* .ctx = */ &ctx_init, }; @@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char auto it = ctx_map.find(buft); if (it == ctx_map.end()) { // add a new context - struct ggml_init_params params = { + ggml_init_params params = { /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, @@ -264,7 +263,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)"); } - 
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); + ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); // validate tensor shape if (is_token_embd) { // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd() @@ -281,8 +280,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char } // save tensor to adapter - struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); - struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); ggml_set_name(tensor_a, w.a->name); ggml_set_name(tensor_b, w.b->name); adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b); @@ -308,7 +307,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char { llama_file gguf_file(path_lora, "rb"); std::vector read_buf; - auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { + auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) { size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name)); size_t size = ggml_nbytes(orig); read_buf.resize(size); @@ -327,8 +326,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); } -struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) { - struct llama_adapter_lora * adapter = new llama_adapter_lora(); +llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) { + llama_adapter_lora * adapter = new llama_adapter_lora(); try { llama_adapter_lora_init_impl(*model, path_lora, *adapter); @@ -342,6 +341,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, return nullptr; } -void llama_adapter_lora_free(struct llama_adapter_lora * adapter) { +void llama_adapter_lora_free(llama_adapter_lora * adapter) { delete adapter; } diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 57fda8d598ffe..65824e972765b 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -15,9 +15,9 @@ // struct llama_adapter_cvec { - struct ggml_tensor * tensor_for(int il) const; + ggml_tensor * tensor_for(int il) const; - struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const; + ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const; bool apply( const llama_model & model, @@ -36,7 +36,7 @@ struct llama_adapter_cvec { std::vector ctxs; std::vector bufs; - std::vector tensors; // per layer + std::vector tensors; // per layer }; // @@ -44,8 +44,8 @@ struct llama_adapter_cvec { // struct llama_adapter_lora_weight { - struct ggml_tensor * a = nullptr; - struct ggml_tensor * b = nullptr; + ggml_tensor * a = nullptr; + ggml_tensor * b = nullptr; // get actual scale based on rank and alpha float get_scale(float alpha, float adapter_scale) const { @@ -55,12 +55,12 @@ struct llama_adapter_lora_weight { } llama_adapter_lora_weight() = default; - llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {} + llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {} }; struct llama_adapter_lora { // map tensor name to lora_a_b - std::unordered_map ab_map; + std::unordered_map 
ab_map; std::vector ctxs; std::vector bufs; @@ -70,7 +70,7 @@ struct llama_adapter_lora { llama_adapter_lora() = default; ~llama_adapter_lora() = default; - llama_adapter_lora_weight * get_weight(struct ggml_tensor * w); + llama_adapter_lora_weight * get_weight(ggml_tensor * w); }; -using llama_adapter_loras = std::unordered_map; +using llama_adapter_loras = std::unordered_map; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 387e344620877..0a43a3af8e003 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1591,7 +1591,7 @@ int32_t llama_context::graph_max_nodes() const { } ggml_cgraph * llama_context::graph_init() { - struct ggml_init_params params = { + ggml_init_params params = { /*.mem_size =*/ buf_compute_meta.size(), /*.mem_buffer =*/ buf_compute_meta.data(), /*.no_alloc =*/ true, @@ -1625,7 +1625,7 @@ llm_graph_result_ptr llama_context::graph_build( }, gf, gtype); } -enum ggml_status llama_context::graph_compute( +ggml_status llama_context::graph_compute( ggml_cgraph * gf, bool batched) { int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; @@ -2288,29 +2288,29 @@ llama_context * llama_init_from_model( } // deprecated -struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { +llama_context * llama_new_context_with_model( + llama_model * model, + llama_context_params params) { return llama_init_from_model(model, params); } -void llama_free(struct llama_context * ctx) { +void llama_free(llama_context * ctx) { delete ctx; } -uint32_t llama_n_ctx(const struct llama_context * ctx) { +uint32_t llama_n_ctx(const llama_context * ctx) { return ctx->n_ctx(); } -uint32_t llama_n_batch(const struct llama_context * ctx) { +uint32_t llama_n_batch(const llama_context * ctx) { return ctx->n_batch(); } -uint32_t llama_n_ubatch(const struct llama_context * ctx) { +uint32_t llama_n_ubatch(const llama_context * ctx) { return ctx->n_ubatch(); } -uint32_t llama_n_seq_max(const struct llama_context * ctx) { +uint32_t llama_n_seq_max(const llama_context * ctx) { return ctx->n_seq_max(); } @@ -2331,69 +2331,69 @@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { } void llama_attach_threadpool( - struct llama_context * ctx, - ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch) { + llama_context * ctx, + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { ctx->attach_threadpool(threadpool, threadpool_batch); } -void llama_detach_threadpool(struct llama_context * ctx) { +void llama_detach_threadpool(llama_context * ctx) { ctx->detach_threadpool(); } -void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { +void llama_set_n_threads(llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { ctx->set_n_threads(n_threads, n_threads_batch); } -int32_t llama_n_threads(struct llama_context * ctx) { +int32_t llama_n_threads(llama_context * ctx) { return ctx->n_threads(); } -int32_t llama_n_threads_batch(struct llama_context * ctx) { +int32_t llama_n_threads_batch(llama_context * ctx) { return ctx->n_threads_batch(); } -void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { +void llama_set_abort_callback(llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { ctx->set_abort_callback(abort_callback, abort_callback_data); } -void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { +void 
llama_set_embeddings(llama_context * ctx, bool embeddings) { ctx->set_embeddings(embeddings); } -void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { +void llama_set_causal_attn(llama_context * ctx, bool causal_attn) { ctx->set_causal_attn(causal_attn); } -void llama_synchronize(struct llama_context * ctx) { +void llama_synchronize(llama_context * ctx) { ctx->synchronize(); } -float * llama_get_logits(struct llama_context * ctx) { +float * llama_get_logits(llama_context * ctx) { ctx->synchronize(); return ctx->get_logits(); } -float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { +float * llama_get_logits_ith(llama_context * ctx, int32_t i) { ctx->synchronize(); return ctx->get_logits_ith(i); } -float * llama_get_embeddings(struct llama_context * ctx) { +float * llama_get_embeddings(llama_context * ctx) { ctx->synchronize(); return ctx->get_embeddings(); } -float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { +float * llama_get_embeddings_ith(llama_context * ctx, int32_t i) { ctx->synchronize(); return ctx->get_embeddings_ith(i); } -float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { +float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) { ctx->synchronize(); return ctx->get_embeddings_seq(seq_id); @@ -2402,8 +2402,8 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id // llama adapter API int32_t llama_set_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter, + llama_context * ctx, + llama_adapter_lora * adapter, float scale) { ctx->set_adapter_lora(adapter, scale); @@ -2411,19 +2411,19 @@ int32_t llama_set_adapter_lora( } int32_t llama_rm_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter) { + llama_context * ctx, + llama_adapter_lora * adapter) { bool res = ctx->rm_adapter_lora(adapter); return res ? 
0 : -1; } -void llama_clear_adapter_lora(struct llama_context * ctx) { +void llama_clear_adapter_lora(llama_context * ctx) { ctx->clear_adapter_lora(); } int32_t llama_apply_adapter_cvec( - struct llama_context * ctx, + llama_context * ctx, const float * data, size_t len, int32_t n_embd, @@ -2438,7 +2438,7 @@ int32_t llama_apply_adapter_cvec( // kv cache view // -struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { +llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { const auto * kv = ctx->get_kv_self(); if (kv == nullptr) { LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__); @@ -2609,50 +2609,50 @@ void llama_kv_cache_update(llama_context * ctx) { // llama state API // deprecated -size_t llama_get_state_size(struct llama_context * ctx) { +size_t llama_get_state_size(llama_context * ctx) { return llama_state_get_size(ctx); } // deprecated -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { +size_t llama_copy_state_data(llama_context * ctx, uint8_t * dst) { return llama_state_get_data(ctx, dst, -1); } // deprecated -size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { +size_t llama_set_state_data(llama_context * ctx, const uint8_t * src) { return llama_state_set_data(ctx, src, -1); } // deprecated -bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +bool llama_load_session_file(llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); } // deprecated -bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { +bool llama_save_session_file(llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { return llama_state_save_file(ctx, path_session, tokens, n_token_count); } // Returns the *actual* size of the state. // Intended to be used when saving to state to a buffer. 
-size_t llama_state_get_size(struct llama_context * ctx) { +size_t llama_state_get_size(llama_context * ctx) { return ctx->state_get_size(); } -size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) { +size_t llama_state_get_data(llama_context * ctx, uint8_t * dst, size_t size) { ctx->synchronize(); return ctx->state_get_data(dst, size); } // Sets the state reading from the specified source address -size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) { +size_t llama_state_set_data(llama_context * ctx, const uint8_t * src, size_t size) { ctx->synchronize(); return ctx->state_set_data(src, size); } -bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +bool llama_state_load_file(llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { ctx->synchronize(); try { @@ -2663,7 +2663,7 @@ bool llama_state_load_file(struct llama_context * ctx, const char * path_session } } -bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { +bool llama_state_save_file(llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { ctx->synchronize(); try { @@ -2674,23 +2674,23 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session } } -size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) { +size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) { return ctx->state_seq_get_size(seq_id); } -size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { +size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) { ctx->synchronize(); return ctx->state_seq_get_data(seq_id, dst, size); } -size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) { +size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) { ctx->synchronize(); return ctx->state_seq_set_data(seq_id, src, size); } -size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { +size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { ctx->synchronize(); try { @@ -2701,7 +2701,7 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa } } -size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +size_t llama_state_seq_load_file(llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { ctx->synchronize(); try { @@ -2715,8 +2715,8 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa /// int32_t llama_encode( - struct llama_context * ctx, - struct llama_batch batch) { + llama_context * ctx, + llama_batch batch) { const int ret = ctx->encode(batch); if (ret != 0) { LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret); @@ -2726,8 +2726,8 @@ int32_t 
llama_encode( } int32_t llama_decode( - struct llama_context * ctx, - struct llama_batch batch) { + llama_context * ctx, + llama_batch batch) { const int ret = ctx->decode(batch); if (ret != 0) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); diff --git a/src/llama-context.h b/src/llama-context.h index 194e88b2c1574..71d702e8baeeb 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -156,7 +156,7 @@ struct llama_context { llm_graph_type gtype); // returns the result of ggml_backend_sched_graph_compute_async execution - enum ggml_status graph_compute( + ggml_status graph_compute( ggml_cgraph * gf, bool batched); diff --git a/src/llama-graph.h b/src/llama-graph.h index 4ada95d53e82e..b7a66d1898736 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -16,7 +16,6 @@ struct ggml_tensor; struct llama_ubatch; struct llama_cparams; -struct llama_layer; class llama_memory_i; class llama_kv_cache_unified; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index eefbce342336e..14c8933b4d6c4 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -49,7 +49,7 @@ bool llama_kv_cache_unified::init( auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { - struct ggml_init_params params = { + ggml_init_params params = { /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, @@ -450,8 +450,8 @@ bool llama_kv_cache_unified::get_can_shift() const { return can_shift; } -struct llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( - const struct llama_ubatch & ubatch) { +llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( + const llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = ubatch.n_seqs; const uint32_t n_seq_tokens = ubatch.n_seq_tokens; @@ -1335,8 +1335,8 @@ bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { // kv cache view // -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max) { - struct llama_kv_cache_view result = { +llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) { + llama_kv_cache_view result = { /*.n_cells = */ 0, /*.n_seq_max = */ n_seq_max, /*.token_count = */ 0, @@ -1350,7 +1350,7 @@ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache return result; } -void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { +void llama_kv_cache_view_free(llama_kv_cache_view * view) { if (view->cells != nullptr) { free(view->cells); view->cells = nullptr; @@ -1361,7 +1361,7 @@ void llama_kv_cache_view_free(struct llama_kv_cache_view * view) { } } -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache * kv) { +void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) { // TODO: rework this in the future, for now quick hack const llama_kv_cache_unified * kvu = dynamic_cast(kv); if (kvu == nullptr) { @@ -1371,9 +1371,9 @@ void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) { view->n_cells = int32_t(kvu->size); - void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells); + void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells"); - view->cells = (struct 
llama_kv_cache_view_cell *)p; + view->cells = (llama_kv_cache_view_cell *)p; p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells); GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences"); view->cells_sequences = (llama_seq_id *)p; diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 15e8876f95a6b..0a7ff8a4ea3e6 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -160,14 +160,14 @@ class llama_kv_cache_unified : public llama_kv_cache { std::vector cells; - std::vector k_l; // per layer - std::vector v_l; + std::vector k_l; // per layer + std::vector v_l; private: ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; - std::vector ctxs; + std::vector ctxs; std::vector bufs; void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; @@ -209,7 +209,7 @@ struct llama_kv_slot_restorer { } // saves a slot information for future restoration - void save(const struct llama_kv_cache_slot_info & slot) { + void save(const llama_kv_cache_slot_info & slot) { if (slot) { do_restore = true; if (slot.boundaries.first != slot.boundaries.second) { @@ -282,6 +282,6 @@ bool llama_kv_cache_can_shift(const llama_kv_cache * kv); // kv cache view // -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); +llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max); -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache * kv); +void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f8f2fa27df7d8..090b8193df00c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -307,7 +307,7 @@ static buft_list_t make_cpu_buft_list(const std::vector & de } // GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU -static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) { +static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode split_mode, const float * tensor_split) { buft_list_t buft_list; // add the device split buffer type if requested and available @@ -374,7 +374,7 @@ struct llama_model::impl { std::vector dev_layer; }; -llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique()) { +llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique()) { } llama_model::~llama_model() {} @@ -396,7 +396,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { - enum gguf_type type = gguf_get_kv_type(ctx, i); + gguf_type type = gguf_get_kv_type(ctx, i); if (type == GGUF_TYPE_ARRAY) { continue; } @@ -3706,7 +3706,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); - LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func)); + LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); 
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); } @@ -3782,9 +3782,9 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { }); } -const struct ggml_tensor * llama_model::get_tensor(const char * name) const { +const ggml_tensor * llama_model::get_tensor(const char * name) const { auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(), - [name](const std::pair & it) { + [name](const std::pair & it) { return it.first == name; }); if (it == tensors_by_name.end()) { @@ -3801,19 +3801,19 @@ struct llm_build_llama : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -3827,21 +3827,21 @@ struct llm_build_llama : public llm_graph_context { ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -3869,7 +3869,7 @@ struct llm_build_llama : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -3879,7 +3879,7 @@ struct llm_build_llama : public llm_graph_context { cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -3964,19 +3964,19 @@ struct llm_build_deci : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_head = hparams.n_head(il); @@ -4001,21 +4001,21 @@ struct llm_build_deci : public llm_graph_context { ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4043,7 +4043,7 @@ struct llm_build_deci : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -4054,7 +4054,7 @@ struct llm_build_deci : public llm_graph_context { } // modified to support attention-free layer of Llama-3_1-Nemotron-51B - struct ggml_tensor * ffn_inp = cur; + ggml_tensor * ffn_inp = cur; if (n_head > 0) { ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -4122,18 +4122,18 @@ struct llm_build_baichuan : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; + ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -4142,13 +4142,13 @@ struct llm_build_baichuan : public llm_graph_context { // self-attention { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -4181,12 +4181,12 @@ struct llm_build_baichuan : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -4240,18 +4240,18 @@ struct llm_build_xverse : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -4260,13 +4260,13 @@ struct llm_build_xverse : public llm_graph_context { // self-attention { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4290,12 +4290,12 @@ struct llm_build_xverse : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -4348,18 +4348,18 @@ struct llm_build_falcon : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = 
build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; + ggml_tensor * attn_norm; attn_norm = build_norm(inpL, model.layers[il].attn_norm, @@ -4383,9 +4383,9 @@ struct llm_build_falcon : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -4414,13 +4414,13 @@ struct llm_build_falcon : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids); } - struct ggml_tensor * ffn_inp = cur; + ggml_tensor * ffn_inp = cur; // feed forward { @@ -4470,8 +4470,8 @@ struct llm_build_grok : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); @@ -4479,12 +4479,12 @@ struct llm_build_grok : public llm_graph_context { inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -4496,21 +4496,21 @@ struct llm_build_grok : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4538,7 +4538,7 @@ struct llm_build_grok : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * 
inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -4552,7 +4552,7 @@ struct llm_build_grok : public llm_graph_context { cb(cur, "attn_out_norm", il); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -4627,18 +4627,18 @@ struct llm_build_dbrx : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -4648,9 +4648,9 @@ struct llm_build_dbrx : public llm_graph_context { // self-attention { - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -4687,12 +4687,12 @@ struct llm_build_dbrx : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -4751,17 +4751,17 @@ struct llm_build_starcoder : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); - struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); inpL = ggml_add(ctx0, inpL, pos); @@ -4782,9 +4782,9 @@ struct llm_build_starcoder : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(Qcur, "Qcur", il); cb(Kcur, 
"Kcur", il); @@ -4799,13 +4799,13 @@ struct llm_build_starcoder : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); // FF @@ -4857,15 +4857,15 @@ struct llm_build_refact : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -4874,13 +4874,13 @@ struct llm_build_refact : public llm_graph_context { // self-attention { - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -4896,12 +4896,12 @@ struct llm_build_refact : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -4955,9 +4955,9 @@ struct llm_build_bert : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * inp_pos = nullptr; + ggml_tensor * cur; + ggml_tensor * inpL; + ggml_tensor * inp_pos = nullptr; if (model.arch != LLM_ARCH_JINA_BERT_V2) { inp_pos = build_inp_pos(); @@ -4967,7 +4967,7 @@ struct llm_build_bert : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); // token types are hardcoded to zero ("Sentence A") - struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); + ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); inpL = ggml_add(ctx0, inpL, type_row0); if (model.arch == LLM_ARCH_BERT) { inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); @@ -4982,11 +4982,11 @@ struct llm_build_bert : public llm_graph_context { // iterate layers for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur = inpL; + ggml_tensor * cur = inpL; - struct ggml_tensor * Qcur; - struct ggml_tensor * Kcur; - struct ggml_tensor * Vcur; + ggml_tensor * Qcur; + ggml_tensor * Kcur; + ggml_tensor * Vcur; // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { @@ -5050,7 +5050,7 @@ struct llm_build_bert : public llm_graph_context { if (il == n_layer - 1 && 
pooling_type == LLAMA_POOLING_TYPE_NONE) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } @@ -5066,7 +5066,7 @@ struct llm_build_bert : public llm_graph_context { cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); } - struct ggml_tensor * ffn_inp = cur; + ggml_tensor * ffn_inp = cur; cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -5120,8 +5120,8 @@ struct llm_build_bloom : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); @@ -5148,9 +5148,9 @@ struct llm_build_bloom : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -5165,13 +5165,13 @@ struct llm_build_bloom : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); // FF @@ -5224,9 +5224,9 @@ struct llm_build_mpt : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * pos; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * pos; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); @@ -5234,7 +5234,7 @@ struct llm_build_mpt : public llm_graph_context { if (model.pos_embd) { // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5243,7 +5243,7 @@ struct llm_build_mpt : public llm_graph_context { } for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * attn_norm; + ggml_tensor * attn_norm; attn_norm = build_norm(inpL, model.layers[il].attn_norm, @@ -5268,9 +5268,9 @@ struct llm_build_mpt : public llm_graph_context { cb(cur, "wqkv_clamped", il); } - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct 
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -5307,13 +5307,13 @@ struct llm_build_mpt : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // Add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); // feed forward @@ -5366,13 +5366,13 @@ struct llm_build_stablelm : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -5384,26 +5384,26 @@ struct llm_build_stablelm : public llm_graph_context { LLM_NORM, il); cb(cur, "attn_norm", il); - struct ggml_tensor * inpSA = cur; + ggml_tensor * inpSA = cur; // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5452,13 +5452,13 @@ struct llm_build_stablelm : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -5517,18 +5517,18 @@ struct llm_build_qwen : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il 
= 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -5543,9 +5543,9 @@ struct llm_build_qwen : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -5574,12 +5574,12 @@ struct llm_build_qwen : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward forward @@ -5633,18 +5633,18 @@ struct llm_build_qwen2 : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -5655,17 +5655,17 @@ struct llm_build_qwen2 : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -5691,12 +5691,12 @@ struct llm_build_qwen2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // 
feed-forward network @@ -5748,13 +5748,13 @@ struct llm_build_qwen2vl : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -5762,7 +5762,7 @@ struct llm_build_qwen2vl : public llm_graph_context { std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -5773,17 +5773,17 @@ struct llm_build_qwen2vl : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -5811,12 +5811,12 @@ struct llm_build_qwen2vl : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -5868,18 +5868,18 @@ struct llm_build_qwen2moe : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -5890,17 +5890,17 @@ struct llm_build_qwen2moe : public llm_graph_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = 
ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -5926,12 +5926,12 @@ struct llm_build_qwen2moe : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // MoE branch @@ -6015,15 +6015,15 @@ struct llm_build_phi2 : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * attn_norm_output; - struct ggml_tensor * ffn_output; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * attn_norm_output; + ggml_tensor * ffn_output; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -6036,9 +6036,9 @@ struct llm_build_phi2 : public llm_graph_context { // self-attention { - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); @@ -6087,7 +6087,7 @@ struct llm_build_phi2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids); @@ -6141,13 +6141,13 @@ struct llm_build_phi3 : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, true); @@ -6159,15 +6159,15 @@ struct llm_build_phi3 : public llm_graph_context { // rope freq factors for 128k context ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); - struct ggml_tensor* attn_norm_output = build_norm(inpL, + ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM_RMS, il); cb(attn_norm_output, "attn_norm", il); - struct ggml_tensor * Qcur = nullptr; - struct ggml_tensor * Kcur = nullptr; - struct ggml_tensor * Vcur = nullptr; + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); @@ -6211,8 +6211,8 @@ struct llm_build_phi3 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor* inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + ggml_tensor* inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); residual = ggml_get_rows(ctx0, residual, inp_out_ids); 
} @@ -6287,13 +6287,13 @@ struct llm_build_plamo : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -6305,18 +6305,18 @@ struct llm_build_plamo : public llm_graph_context { LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - struct ggml_tensor * attention_norm = cur; + ggml_tensor * attention_norm = cur; // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -6335,13 +6335,13 @@ struct llm_build_plamo : public llm_graph_context { model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } - struct ggml_tensor * sa_out = cur; + ggml_tensor * sa_out = cur; cur = attention_norm; if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); @@ -6394,14 +6394,14 @@ struct llm_build_gpt2 : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * pos; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * pos; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -6426,9 +6426,9 @@ struct llm_build_gpt2 : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -6443,13 +6443,13 @@ struct llm_build_gpt2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = 
build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); // FF @@ -6503,13 +6503,13 @@ struct llm_build_codeshell : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -6528,22 +6528,22 @@ struct llm_build_codeshell : public llm_graph_context { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(tmpq, "tmpq", il); cb(tmpk, "tmpk", il); cb(Vcur, "Vcur", il); - struct ggml_tensor * Qcur = ggml_rope_ext( + ggml_tensor * Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_rope_ext( + ggml_tensor * Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow @@ -6557,13 +6557,13 @@ struct llm_build_codeshell : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // add the input - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); // FF @@ -6616,18 +6616,18 @@ struct llm_build_orion : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -6638,21 +6638,21 @@ struct llm_build_orion : public 
llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -6680,12 +6680,12 @@ struct llm_build_orion : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -6737,18 +6737,18 @@ struct llm_build_internlm2 : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -6759,21 +6759,21 @@ struct llm_build_internlm2 : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -6801,12 +6801,12 @@ struct llm_build_internlm2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -6863,8 +6863,8 @@ struct llm_build_minicpm3 : public llm_graph_context { const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - 
hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); @@ -6873,12 +6873,12 @@ struct llm_build_minicpm3 : public llm_graph_context { cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); @@ -6890,7 +6890,7 @@ struct llm_build_minicpm3 : public llm_graph_context { // self_attention { - struct ggml_tensor * q = NULL; + ggml_tensor * q = NULL; // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); @@ -6905,31 +6905,31 @@ struct llm_build_minicpm3 : public llm_graph_context { cb(q, "q", il); // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, hparams.n_embd_head_k), ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0); cb(q_nope, "q_nope", il); // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, hparams.n_embd_head_k), ggml_row_size(q->type, hparams.n_embd_head_k * n_head), ggml_row_size(q->type, n_embd_head_qk_nope)); cb(q_pe, "q_pe", il); // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); cb(kv_pe_compresseed, "kv_pe_compresseed", il); // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, kv_pe_compresseed->nb[1], 0); cb(kv_compressed, "kv_compressed", il); // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, kv_pe_compresseed->nb[1], kv_pe_compresseed->nb[1], ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); @@ -6943,18 +6943,18 @@ struct llm_build_minicpm3 : public llm_graph_context { cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); cb(kv, "kv", il); // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, 
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), 0); cb(k_nope, "k_nope", il); // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), ggml_row_size(kv->type, (n_embd_head_qk_nope))); @@ -6985,10 +6985,10 @@ struct llm_build_minicpm3 : public llm_graph_context { ); cb(k_pe, "k_pe", il); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); cb(q_states, "q_states", il); - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); cur = build_attn(inp_attn, gf, @@ -6998,7 +6998,7 @@ struct llm_build_minicpm3 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -7008,7 +7008,7 @@ struct llm_build_minicpm3 : public llm_graph_context { cur = ggml_scale(ctx0, cur, scale_res); cb(cur, "hidden_scaled", il); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -7068,8 +7068,8 @@ struct llm_build_gemma : public llm_graph_context { llm_build_gemma(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head_k = hparams.n_embd_head_k; - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); @@ -7077,7 +7077,7 @@ struct llm_build_gemma : public llm_graph_context { cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -7091,13 +7091,13 @@ struct llm_build_gemma : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -7122,12 +7122,12 @@ struct llm_build_gemma : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } - struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); 
cb(sa_out, "sa_out", il); cur = build_norm(sa_out, @@ -7178,8 +7178,8 @@ struct llm_build_gemma2 : public llm_graph_context { llm_build_gemma2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head_k = hparams.n_embd_head_k; - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); @@ -7187,7 +7187,7 @@ struct llm_build_gemma2 : public llm_graph_context { cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, true); @@ -7201,13 +7201,13 @@ struct llm_build_gemma2 : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -7243,12 +7243,12 @@ struct llm_build_gemma2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } - struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); cur = build_norm(sa_out, @@ -7313,18 +7313,18 @@ struct llm_build_starcoder2 : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -7335,21 +7335,21 @@ struct llm_build_starcoder2 : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7377,12 +7377,12 @@ struct llm_build_starcoder2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct 
ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -7432,14 +7432,14 @@ struct llm_build_mamba : public llm_graph_context { const llama_model & model; llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + ggml_tensor * state_copy = build_inp_s_copy(); + ggml_tensor * state_mask = build_inp_s_mask(); for (int il = 0; il < n_layer; ++il) { // norm @@ -7453,7 +7453,7 @@ struct llm_build_mamba : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } @@ -7626,13 +7626,13 @@ struct llm_build_command_r : public llm_graph_context { const float f_logit_scale = hparams.f_logit_scale; - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -7643,26 +7643,26 @@ struct llm_build_command_r : public llm_graph_context { model.layers[il].attn_norm, NULL, LLM_NORM, il); cb(cur, "attn_norm", il); - struct ggml_tensor * ffn_inp = cur; + ggml_tensor * ffn_inp = cur; // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7715,13 +7715,13 @@ struct llm_build_command_r : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); } - struct ggml_tensor * attn_out = cur; + ggml_tensor * attn_out = cur; // feed-forward network { @@ -7776,13 +7776,13 @@ struct llm_build_cohere2 : public llm_graph_context { const float f_logit_scale = hparams.f_logit_scale; - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * 
cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, true); @@ -7797,7 +7797,7 @@ struct llm_build_cohere2 : public llm_graph_context { // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); cb(cur, "attn_norm", il); - struct ggml_tensor * ffn_inp = cur; + ggml_tensor * ffn_inp = cur; // self-attention { @@ -7805,21 +7805,21 @@ struct llm_build_cohere2 : public llm_graph_context { ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7852,13 +7852,13 @@ struct llm_build_cohere2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); - ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); } - struct ggml_tensor * attn_out = cur; + ggml_tensor * attn_out = cur; // feed-forward network { @@ -7913,18 +7913,18 @@ struct llm_build_olmo : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -7935,21 +7935,21 @@ struct llm_build_olmo : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor 
* Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -7977,12 +7977,12 @@ struct llm_build_olmo : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -8035,31 +8035,31 @@ struct llm_build_olmo2 : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; cur = inpL; // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, @@ -8099,12 +8099,12 @@ struct llm_build_olmo2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -8161,18 +8161,18 @@ struct llm_build_olmoe : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -8183,13 +8183,13 @@ struct llm_build_olmoe : public llm_graph_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, 
cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, @@ -8224,12 +8224,12 @@ struct llm_build_olmoe : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // MoE branch @@ -8285,12 +8285,12 @@ struct llm_build_openelm : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -8300,7 +8300,7 @@ struct llm_build_openelm : public llm_graph_context { const int64_t n_head_qkv = 2*n_head_kv + n_head; cur = inpL; - struct ggml_tensor * residual = cur; + ggml_tensor * residual = cur; // norm cur = build_norm(inpL, @@ -8315,13 +8315,13 @@ struct llm_build_openelm : public llm_graph_context { cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); cb(Vcur, "Vcur", il); Qcur = build_norm(Qcur, @@ -8356,12 +8356,12 @@ struct llm_build_openelm : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); residual = ggml_get_rows(ctx0, residual, inp_out_ids); cur = ggml_get_rows(ctx0, cur, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); cb(ffn_inp, "ffn_inp", il); // feed-forward network @@ -8414,13 +8414,13 @@ struct llm_build_gptneox : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); @@ -8439,9 +8439,9 @@ struct llm_build_gptneox : public llm_graph_context { cur = ggml_add(ctx0, cur, 
model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -8468,7 +8468,7 @@ struct llm_build_gptneox : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } @@ -8478,7 +8478,7 @@ struct llm_build_gptneox : public llm_graph_context { // attention and ffn are computed in parallel // x = x + attn(ln1(x)) + ffn(ln2(x)) - struct ggml_tensor * attn_out = cur; + ggml_tensor * attn_out = cur; cur = build_norm(inpL, model.layers[il].ffn_norm, @@ -8509,7 +8509,7 @@ struct llm_build_gptneox : public llm_graph_context { // x = x + attn(ln1(x)) // x = x + ffn(ln2(x)) - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, @@ -8560,18 +8560,18 @@ struct llm_build_arctic : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -8582,13 +8582,13 @@ struct llm_build_arctic : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -8612,12 +8612,12 @@ struct llm_build_arctic : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // 
feed-forward network @@ -8634,7 +8634,7 @@ struct llm_build_arctic : public llm_graph_context { LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); - struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); + ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp); cb(ffn_out, "ffn_out", il); // MoE @@ -8692,20 +8692,20 @@ struct llm_build_deepseek : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -8719,21 +8719,21 @@ struct llm_build_deepseek : public llm_graph_context { ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8761,13 +8761,13 @@ struct llm_build_deepseek : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, @@ -8856,19 +8856,19 @@ struct llm_build_deepseek2 : public llm_graph_context { const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; // norm cur = build_norm(inpL, @@ -8878,7 +8878,7 @@ struct llm_build_deepseek2 : public llm_graph_context { // self_attention { - struct ggml_tensor * q = NULL; + ggml_tensor * q = NULL; if (!is_lite) { // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); @@ -8898,31 +8898,31 @@ struct 
llm_build_deepseek2 : public llm_graph_context { } // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, hparams.n_embd_head_k), ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0); cb(q_nope, "q_nope", il); // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, hparams.n_embd_head_k), ggml_row_size(q->type, hparams.n_embd_head_k * n_head), ggml_row_size(q->type, n_embd_head_qk_nope)); cb(q_pe, "q_pe", il); // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); cb(kv_pe_compresseed, "kv_pe_compresseed", il); // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, kv_pe_compresseed->nb[1], 0); cb(kv_compressed, "kv_compressed", il); // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, kv_pe_compresseed->nb[1], kv_pe_compresseed->nb[1], ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); @@ -8936,18 +8936,18 @@ struct llm_build_deepseek2 : public llm_graph_context { cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); cb(kv, "kv", il); // split into {n_head * n_embd_head_qk_nope, n_tokens} - struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), 0); cb(k_nope, "k_nope", il); // and {n_head * n_embd_head_v, n_tokens} - struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), ggml_row_size(kv->type, (n_embd_head_qk_nope))); @@ -8978,10 +8978,10 @@ struct llm_build_deepseek2 : public llm_graph_context { ); cb(k_pe, "k_pe", il); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); cb(q_states, "q_states", il); - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, 
k_pe, q_pe), 0); cb(k_states, "k_states", il); cur = build_attn(inp_attn, gf, @@ -8991,12 +8991,12 @@ struct llm_build_deepseek2 : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); cur = build_norm(ffn_inp, @@ -9024,7 +9024,7 @@ struct llm_build_deepseek2 : public llm_graph_context { n_expert, n_expert_used, LLM_FFN_SILU, hparams.expert_weights_norm, true, hparams.expert_weights_scale, - (enum llama_expert_gating_func_type) hparams.expert_gating_func, + (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il); @@ -9077,18 +9077,18 @@ struct llm_build_bitnet : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = build_attn_inp_kv_unified(true, false); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -9098,7 +9098,7 @@ struct llm_build_bitnet : public llm_graph_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); if (model.layers[il].wq_scale) { Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); } @@ -9109,7 +9109,7 @@ struct llm_build_bitnet : public llm_graph_context { } // B1.K - struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); if (model.layers[il].wk_scale) { Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); } @@ -9120,7 +9120,7 @@ struct llm_build_bitnet : public llm_graph_context { } // B1.V - struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); if (model.layers[il].wv_scale) { Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); } @@ -9165,12 +9165,12 @@ struct llm_build_bitnet : public llm_graph_context { if (il == n_layer - 1) { // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); // feed-forward forward @@ -9231,17 +9231,17 @@ struct llm_build_t5_enc : public llm_graph_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + ggml_tensor * cur; + ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc(); + ggml_tensor * pos_bucket_enc = build_inp_pos_bucket_enc(); auto * inp_attn = build_attn_inp_no_cache(); for (int il = 0; il < n_layer; ++il) { - struct 
ggml_tensor * inpSA = inpL;
+            ggml_tensor * inpSA = inpL;
 
             // norm
             cur = build_norm(inpL,
@@ -9251,21 +9251,21 @@ struct llm_build_t5_enc : public llm_graph_context {
 
             // self-attention
             {
-                struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur);
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur);
                 cb(Vcur, "Vcur", il);
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-                struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
-                struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
+                ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+                ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b);
 
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo_enc, nullptr,
@@ -9275,12 +9275,12 @@ struct llm_build_t5_enc : public llm_graph_context {
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
@@ -9333,13 +9333,13 @@ struct llm_build_t5_dec : public llm_graph_context {
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
-        struct ggml_tensor * embd_enc = build_inp_cross_embd();
-        struct ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
+        ggml_tensor * embd_enc = build_inp_cross_embd();
+        ggml_tensor * pos_bucket_dec = build_inp_pos_bucket_dec();
 
         const int64_t n_outputs_enc = embd_enc->ne[1];
 
@@ -9347,7 +9347,7 @@ struct llm_build_t5_dec : public llm_graph_context {
         auto * inp_attn_cross = build_attn_inp_cross();
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
+            ggml_tensor * inpSA = inpL;
 
             // norm
             cur = build_norm(inpL,
@@ -9357,21 +9357,21 @@ struct llm_build_t5_dec : public llm_graph_context {
 
             // self-attention
             {
-                struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-                struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
-                struct ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
+                ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+                ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);
 
                 cur = build_attn(inp_attn_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
@@ -9382,7 +9382,7 @@ struct llm_build_t5_dec : public llm_graph_context {
             cur = ggml_add(ctx0, cur, inpSA);
             cb(cur, "cross_inp", il);
 
-            struct ggml_tensor * inpCA = cur;
+            ggml_tensor * inpCA = cur;
 
             // norm
             cur = build_norm(cur,
@@ -9392,13 +9392,13 @@ struct llm_build_t5_dec : public llm_graph_context {
 
             // cross-attention
             {
-                struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur);
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc);
                 cb(Vcur, "Vcur", il);
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -9410,22 +9410,22 @@ struct llm_build_t5_dec : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, 1.0f, il);
                 cb(cur, "kqv_out", il);
 
-                //struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
-                //struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+                //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                //ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
 
-                //struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+                //ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
                 //cb(kq, "kq", il);
 
                 //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
                 //cb(kq, "kq_soft_max_ext", il);
 
-                //struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+                //ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
                 //cb(v, "v", il);
 
-                //struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+                //ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
                 //cb(kqv, "kqv", il);
 
-                //struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+                //ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
                 //cb(kqv_merged, "kqv_merged", il);
 
                 //cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
@@ -9439,13 +9439,13 @@ struct llm_build_t5_dec : public llm_graph_context {
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
             }
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
@@ -9504,8 +9504,8 @@ struct llm_build_jais : public llm_graph_context {
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
@@ -9526,9 +9526,9 @@ struct llm_build_jais : public llm_graph_context {
                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                 cb(cur, "bqkv", il);
 
-                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
-                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
+                ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
+                ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
+                ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
 
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -9543,13 +9543,13 @@ struct llm_build_jais : public llm_graph_context {
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
 
             // add the input
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
 
             // FF
@@ -9597,18 +9597,18 @@ struct llm_build_chatglm : public llm_graph_context {
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        ggml_tensor * inp_pos = build_inp_pos();
 
         auto * inp_attn = build_attn_inp_kv_unified(true, false);
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
+            ggml_tensor * inpSA = inpL;
 
             cur = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -9618,9 +9618,9 @@ struct llm_build_chatglm : public llm_graph_context {
 
             // self-attention
             {
-                struct ggml_tensor * Qcur = nullptr;
-                struct ggml_tensor * Kcur = nullptr;
-                struct ggml_tensor * Vcur = nullptr;
+                ggml_tensor * Qcur = nullptr;
+                ggml_tensor * Kcur = nullptr;
+                ggml_tensor * Vcur = nullptr;
 
                 if (model.layers[il].wqkv == nullptr) {
                     Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9674,13 +9674,13 @@ struct llm_build_chatglm : public llm_graph_context {
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
             // Add the input
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // FF
@@ -9729,18 +9729,18 @@ struct llm_build_nemotron : public llm_graph_context {
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         //GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        ggml_tensor * inp_pos = build_inp_pos();
 
         auto * inp_attn = build_attn_inp_kv_unified(true, false);
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
+            ggml_tensor * inpSA = inpL;
 
             // norm
             cur = build_norm(inpL,
@@ -9752,21 +9752,21 @@ struct llm_build_nemotron : public llm_graph_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                     cb(Qcur, "Qcur", il);
                 }
 
-                struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 if (model.layers[il].bk) {
                     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                     cb(Kcur, "Kcur", il);
                 }
 
-                struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 if (model.layers[il].bv) {
                     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -9794,12 +9794,12 @@ struct llm_build_nemotron : public llm_graph_context {
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
@@ -9852,18 +9852,18 @@ struct llm_build_exaone : public llm_graph_context {
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        ggml_tensor * inp_pos = build_inp_pos();
 
         auto * inp_attn = build_attn_inp_kv_unified(true, false);
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
+            ggml_tensor * inpSA = inpL;
 
             // norm
             cur = build_norm(inpL,
@@ -9877,21 +9877,21 @@ struct llm_build_exaone : public llm_graph_context {
                 ggml_tensor * rope_factors = static_cast(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                     cb(Qcur, "Qcur", il);
                 }
 
-                struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 if (model.layers[il].bk) {
                     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                     cb(Kcur, "Kcur", il);
                 }
 
-                struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 if (model.layers[il].bv) {
                     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -9919,12 +9919,12 @@ struct llm_build_exaone : public llm_graph_context {
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
@@ -10179,14 +10179,14 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
     llm_build_rwkv6(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
         GGML_ASSERT(hparams.token_shift_count == 2);
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
-        struct ggml_tensor * state_copy = build_inp_s_copy();
-        struct ggml_tensor * state_mask = build_inp_s_mask();
+        ggml_tensor * state_copy = build_inp_s_copy();
+        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10195,17 +10195,17 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
         for (int il = 0; il < n_layer; ++il) {
             const llama_layer * layer = &model.layers[il];
 
-            struct ggml_tensor * token_shift = build_rwkv_token_shift_load(
+            ggml_tensor * token_shift = build_rwkv_token_shift_load(
                     gf, state_copy, state_mask, ubatch, il
                     );
 
-            struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
-            struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
+            ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+            ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
 
-            struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
+            ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
             cb(att_norm, "attn_norm", il);
 
-            struct ggml_tensor * x_prev = ggml_concat(
+            ggml_tensor * x_prev = ggml_concat(
                     ctx0,
                     att_shift,
                     ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
@@ -10214,10 +10214,10 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
 
             cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
 
-            struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
+            ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
             cb(ffn_norm, "ffn_norm", il);
 
             x_prev = ggml_concat(
@@ -10250,7 +10250,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
 
         cur = inpL;
 
-        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
 
         cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
@@ -10274,13 +10274,13 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
     llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) {
         GGML_ASSERT(n_embd == hparams.n_embd_k_s());
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
-        struct ggml_tensor * state_copy = build_inp_s_copy();
-        struct ggml_tensor * state_mask = build_inp_s_mask();
+        ggml_tensor * state_copy = build_inp_s_copy();
+        ggml_tensor * state_mask = build_inp_s_mask();
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10291,14 +10291,14 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
         for (int il = 0; il < n_layer; ++il) {
             const llama_layer * layer = &model.layers[il];
 
-            struct ggml_tensor * token_shift = build_rwkv_token_shift_load(
+            ggml_tensor * token_shift = build_rwkv_token_shift_load(
                     gf, state_copy, state_mask, ubatch, il
                     );
 
-            struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
+            ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
             cb(att_norm, "attn_norm", il);
 
-            struct ggml_tensor * x_prev = ggml_concat(
+            ggml_tensor * x_prev = ggml_concat(
                     ctx0,
                     token_shift,
                     ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
@@ -10310,7 +10310,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
             token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
             ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
@@ -10337,7 +10337,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
         }
 
         cur = inpL;
-        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
         cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
 
@@ -10368,18 +10368,18 @@ struct llm_build_chameleon : public llm_graph_context {
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        ggml_tensor * inp_pos = build_inp_pos();
 
         auto * inp_attn = build_attn_inp_kv_unified(true, false);
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
+            ggml_tensor * inpSA = inpL;
 
             // norm
             if (hparams.swin_norm) {
@@ -10394,13 +10394,13 @@ struct llm_build_chameleon : public llm_graph_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
 
                 if (model.layers[il].attn_q_norm) {
@@ -10458,12 +10458,12 @@ struct llm_build_chameleon : public llm_graph_context {
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
             // feed-forward network
@@ -10519,7 +10519,7 @@ struct llm_build_chameleon : public llm_graph_context {
             int num_img_tokens = img_token_end_idx - img_token_start_idx;
             // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
             // which ensures that text token values are always at least larger than image token values
-            struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
+            ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
             img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
             cb(img_logits, "img_logits", -1);
 
@@ -10534,8 +10534,8 @@ struct llm_build_chameleon : public llm_graph_context {
 
 struct llm_build_wavtokenizer_dec : public llm_graph_context {
     llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
 
         inpL = build_inp_embd(model.tok_embd);
 
@@ -10585,9 +10585,9 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
                         layer.attn_norm_b,
                         LLM_NORM_GROUP, 0);
 
-                struct ggml_tensor * q;
-                struct ggml_tensor * k;
-                struct ggml_tensor * v;
+                ggml_tensor * q;
+                ggml_tensor * k;
+                ggml_tensor * v;
 
                 q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
                 k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
@@ -10600,7 +10600,7 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
                 q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
                 k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
 
-                struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+                ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
 
                 kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
 
@@ -10955,8 +10955,8 @@ llm_graph_result_ptr llama_model::build_graph(
 // interface implementation
 //
 
-struct llama_model_params llama_model_default_params() {
-    struct llama_model_params result = {
+llama_model_params llama_model_default_params() {
+    llama_model_params result = {
        /*.devices =*/ nullptr,
        /*.n_gpu_layers =*/ 0,
        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
@@ -10979,59 +10979,59 @@ struct llama_model_params llama_model_default_params() {
     return result;
 }
 
-const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model) {
+const llama_vocab * llama_model_get_vocab(const llama_model * model) {
     return &model->vocab;
 }
 
-void llama_free_model(struct llama_model * model) {
+void llama_free_model(llama_model * model) {
    llama_model_free(model);
 }
 
-void llama_model_free(struct llama_model * model) {
+void llama_model_free(llama_model * model) {
    delete model;
 }
 
-int32_t llama_model_n_ctx_train(const struct llama_model * model) {
+int32_t llama_model_n_ctx_train(const llama_model * model) {
    return model->hparams.n_ctx_train;
 }
 
-int32_t llama_model_n_embd(const struct llama_model * model) {
+int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
 }
 
-int32_t llama_model_n_layer(const struct llama_model * model) {
+int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
 }
 
-int32_t llama_model_n_head(const struct llama_model * model) {
+int32_t llama_model_n_head(const llama_model * model) {
    return model->hparams.n_head();
 }
 
-int32_t llama_model_n_head_kv(const struct llama_model * model) {
+int32_t llama_model_n_head_kv(const llama_model * model) {
    return model->hparams.n_head_kv();
 }
 
 // deprecated
-int32_t llama_n_ctx_train(const struct llama_model * model) {
+int32_t llama_n_ctx_train(const llama_model * model) {
    return llama_model_n_ctx_train(model);
 }
 
 // deprecated
-int32_t llama_n_embd(const struct llama_model * model) {
+int32_t llama_n_embd(const llama_model * model) {
    return llama_model_n_embd(model);
 }
 
 // deprecated
-int32_t llama_n_layer(const struct llama_model * model) {
+int32_t llama_n_layer(const llama_model * model) {
    return llama_model_n_layer(model);
 }
 
 // deprecated
-int32_t llama_n_head(const struct llama_model * model) {
+int32_t llama_n_head(const llama_model * model) {
    return llama_model_n_head(model);
 }
 
-enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
+llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
         case LLM_ARCH_GPT2:
@@ -11109,11 +11109,11 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
     return LLAMA_ROPE_TYPE_NONE;
 }
 
-float llama_model_rope_freq_scale_train(const struct llama_model * model) {
+float llama_model_rope_freq_scale_train(const llama_model * model) {
    return model->hparams.rope_freq_scale_train;
 }
 
-int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+int32_t llama_model_meta_val_str(const llama_model * model, const char * key, char * buf, size_t buf_size) {
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
         if (buf_size > 0) {
@@ -11124,11 +11124,11 @@ int32_t llama_model_meta_val_str(const struct llama_model * model, const char *
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }
 
-int32_t llama_model_meta_count(const struct llama_model * model) {
+int32_t llama_model_meta_count(const llama_model * model) {
    return (int)model->gguf_kv.size();
 }
 
-int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
             buf[0] = '\0';
@@ -11140,7 +11140,7 @@ int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, c
     return snprintf(buf, buf_size, "%s", it->first.c_str());
 }
 
-int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
+int32_t llama_model_meta_val_str_by_index(const llama_model * model, int32_t i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
             buf[0] = '\0';
@@ -11152,15 +11152,15 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
     return snprintf(buf, buf_size, "%s", it->second.c_str());
 }
 
-int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s", model->desc().c_str());
 }
 
-uint64_t llama_model_size(const struct llama_model * model) {
+uint64_t llama_model_size(const llama_model * model) {
    return model->size();
 }
 
-const char * llama_model_chat_template(const struct llama_model * model, const char * name) {
+const char * llama_model_chat_template(const llama_model * model, const char * name) {
     const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
@@ -11171,11 +11171,11 @@ const char * llama_model_chat_template(const struct llama_model * model, const c
     return it->second.c_str();
 }
 
-uint64_t llama_model_n_params(const struct llama_model * model) {
+uint64_t llama_model_n_params(const llama_model * model) {
    return model->n_elements();
 }
 
-bool llama_model_has_encoder(const struct llama_model * model) {
+bool llama_model_has_encoder(const llama_model * model) {
     switch (model->arch) {
         case LLM_ARCH_T5:        return true;
         case LLM_ARCH_T5ENCODER: return true;
@@ -11183,23 +11183,23 @@ bool llama_model_has_encoder(const struct llama_model * model) {
     }
 }
 
-bool llama_model_has_decoder(const struct llama_model * model) {
+bool llama_model_has_decoder(const llama_model * model) {
     switch (model->arch) {
         case LLM_ARCH_T5ENCODER: return false;
         default:                 return true;
     }
 }
 
-llama_token llama_model_decoder_start_token(const struct llama_model * model) {
+llama_token llama_model_decoder_start_token(const llama_model * model) {
    return model->hparams.dec_start_token_id;
 }
 
-bool llama_model_is_recurrent(const struct llama_model * model) {
+bool llama_model_is_recurrent(const llama_model * model) {
     switch (model->arch) {
-        case LLM_ARCH_MAMBA:  return true;
-        case LLM_ARCH_RWKV6:  return true;
-        case LLM_ARCH_RWKV6QWEN2: return true;
-        default: return false;
+        case LLM_ARCH_MAMBA:      return true;
+        case LLM_ARCH_RWKV6:      return true;
+        case LLM_ARCH_RWKV6QWEN2: return true;
+        default:                  return false;
    }
 }
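
Reviewer note (not part of the patch): the hunks above only drop the redundant `struct`/`enum` keywords from the C API implementations, so callers are unaffected. As a sanity check, here is a minimal, hypothetical usage sketch that exercises a few of the accessors whose definitions are touched above; it assumes an already-loaded `llama_model *` and only calls functions whose signatures appear in this diff (plus standard C headers).

    // print_model_info.cpp -- illustrative only, not part of this change
    #include <cinttypes>
    #include <cstdio>

    #include "llama.h"

    static void print_model_info(const llama_model * model) {
        // basic shape / size queries
        printf("params    : %" PRIu64 "\n", llama_model_n_params(model));
        printf("n_embd    : %" PRId32 "\n", llama_model_n_embd(model));
        printf("n_layer   : %" PRId32 "\n", llama_model_n_layer(model));

        // architecture capabilities
        printf("encoder   : %s\n", llama_model_has_encoder(model)  ? "yes" : "no");
        printf("decoder   : %s\n", llama_model_has_decoder(model)  ? "yes" : "no");
        printf("recurrent : %s\n", llama_model_is_recurrent(model) ? "yes" : "no");

        // nullptr selects the default chat template; the result may be NULL
        // if the GGUF metadata does not contain one
        const char * tmpl = llama_model_chat_template(model, /*name =*/ nullptr);
        printf("chat tmpl : %s\n", tmpl ? tmpl : "(none)");
    }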