From 2d69bf85f22ff99c580e7b3f9b56b27b0fdefc49 Mon Sep 17 00:00:00 2001 From: Theia Vogel Date: Tue, 12 Mar 2024 03:54:22 -0700 Subject: [PATCH] New simplified llama.h API, and GPU offloading for control vectors --- common/common.cpp | 195 +++++++++++++++++++++++++++---- common/common.h | 9 ++ llama.cpp | 290 ++++++++++++++-------------------------------- llama.h | 38 +++--- 4 files changed, 282 insertions(+), 250 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6f8d49cf1259ee..493eb55b26ff7c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1393,32 +1393,26 @@ std::tuple llama_init_from_gpt_par if (layer_start == 0) layer_start = 1; if (layer_end == 0) layer_end = 31; - struct llama_control_vector * vector = nullptr; - - for (const auto& t : params.control_vectors) { - std::string path; - float strength; - std::tie(path, strength) = t; - - fprintf(stderr, "%s: loading control vector from %s\n", __func__, path.c_str()); - struct llama_control_vector * temp = llama_control_vector_load(path.c_str()); - if (temp == nullptr) { - fprintf(stderr, "%s: error: failed to load control vector from %s\n", __func__, path.c_str()); - llama_free(lctx); - llama_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - llama_control_vector_scale(temp, strength); - - if (vector == nullptr) { - vector = temp; - } else { - llama_control_vector_add(vector, temp); - llama_control_vector_free(temp); - } + std::vector control_vector; + int n_embd; + std::tie(control_vector, n_embd) = llama_control_vector_load(params.control_vectors); + if (n_embd == -1) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } - llama_apply_control_vector(lctx, vector, layer_start, layer_end); + int err = llama_control_vector_apply(lctx, + control_vector.data(), + control_vector.size(), + n_embd, + layer_start, + layer_end); + if (err) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } } for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { @@ -1937,3 +1931,156 @@ void llama_embd_normalize(const float * inp, float * out, int n) { } } +// +// Control vector utils +// + +static std::tuple, int> llama_control_vector_load_one(const std::string & path, float strength) { + int n_tensors; + size_t n_bytes = 0; + uint32_t max_direction_layer = 0; + int n_embd = -1; + + // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer + { + struct ggml_init_params meta_params = { + /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ true, + }; + ggml_context * meta_ctx = ggml_init(meta_params); + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &meta_ctx, + }; + struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path.c_str(), meta_gguf_params); + if (!meta_ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str()); + ggml_free(meta_ctx); + return std::make_tuple(std::vector(), -1); + } + + n_tensors = gguf_get_n_tensors(meta_ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); + + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + uint32_t layer = std::stoi(name.substr(dotpos + 1)); + if (layer == 0) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return std::make_tuple(std::vector(), -1); + } + if (layer > max_direction_layer) { + max_direction_layer = layer; + } + } catch (...) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return std::make_tuple(std::vector(), -1); + } + } + + struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); + if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, path.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return std::make_tuple(std::vector(), -1); + } + if (n_embd == -1) { + n_embd = ggml_nelements(tensor_meta); + } else if (ggml_nelements(tensor_meta) != n_embd) { + fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, path.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return std::make_tuple(std::vector(), -1); + } + n_bytes += ggml_nbytes(tensor_meta); + } + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + } + + if (n_tensors == 0) { + fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, path.c_str()); + return std::make_tuple(std::vector(), -1); + } + + // load and scale tensors into final control vector context + struct ggml_init_params ggml_params = { + /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(path.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, path.c_str()); + ggml_free(ctx); + return std::make_tuple(std::vector(), -1); + } + + std::vector vector; + for (uint32_t i = 1; i < max_direction_layer; i++) { + std::string name = "direction." + std::to_string(i); + ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + if (tensor) { + const float * data = (const float *) tensor->data; + for (int i = 0; i < n_embd; i++) { + vector.push_back(data[i] * strength); + } + } else { + vector.insert(vector.end(), n_embd, 0.); // as a filler + } + } + + return std::make_tuple(vector, n_embd); +} + +std::tuple, int> llama_control_vector_load(const std::vector> & vectors) { + std::vector vector; + int n_embd = -1; + + for (const auto& pair : vectors) { + std::string path; + float strength; + std::tie(path, strength) = pair; + + std::vector v; + int v_n_embd; + std::tie(v, v_n_embd) = llama_control_vector_load_one(path, strength); + + if (v_n_embd == -1) { + return std::make_tuple(std::vector(), -1); + } + if (n_embd != -1 && (n_embd != v_n_embd || v.size() != vector.size())) { + fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, path.c_str()); + return std::make_tuple(std::vector(), -1); + } + + if (n_embd == -1) { + vector = std::move(v); + n_embd = v_n_embd; + } else { + for (size_t i = 0; i < vector.size(); i++) { + vector[i] += v[i]; + } + } + } + + if (n_embd == -1) { + fprintf(stderr, "%s: no vectors passed\n", __func__); + } + return std::make_tuple(vector, n_embd); +} diff --git a/common/common.h b/common/common.h index 28f7ccccfa393e..2ea867553f8f98 100644 --- a/common/common.h +++ b/common/common.h @@ -270,3 +270,12 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40 void llama_embd_normalize(const float * inp, float * out, int n); +// +// Control vector utils +// + +// Load control vectors from a tuple of {path, strength}, scale each by strength, and add them together. +// Returns a tuple of {concatenated vector data (n_emnd x n_layer), n_embd} +// On error, returns a tuple of {empty, -1} +std::tuple, int> llama_control_vector_load( + const std::vector> & vectors); diff --git a/llama.cpp b/llama.cpp index 6ec671e5de156e..ee7c463544cb80 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1884,6 +1884,31 @@ struct llama_kv_cache { } }; +struct llama_control_vector { + std::vector tensors; // per layer + std::vector ctxs; + std::vector bufs; + + int32_t layer_start = 0; + int32_t layer_end = 0; + + ggml_tensor * tensor_for(int il) const { + if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) { + return nullptr; + } + return tensors[il]; + } + + ~llama_control_vector() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + } +}; + struct llama_vocab { using id = int32_t; using token = std::string; @@ -2092,9 +2117,8 @@ struct llama_context { struct ggml_tensor * inp_s_mask; // F32 [kv_size] struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] - struct llama_control_vector * control_vector; - int32_t control_vector_layer_start; - int32_t control_vector_layer_end; + // control vectors + struct llama_control_vector cvec; #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -5420,8 +5444,6 @@ static struct ggml_tensor * llm_build_kv( return cur; } -ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il); - struct llm_build_context { const llama_model & model; const llama_context & lctx; @@ -5778,11 +5800,9 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - if (lctx.control_vector != nullptr && il >= lctx.control_vector_layer_start && il <= lctx.control_vector_layer_end) { - ggml_tensor * layer_dir = get_control_vector_layer_tensor(lctx.control_vector, il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); } cb(cur, "l_out", il); @@ -13197,227 +13217,93 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } } -struct llama_control_vector { - struct ggml_context * ctx; - std::vector tensors; - - llama_control_vector() : ctx(nullptr) {} - - ~llama_control_vector() { - if (this->ctx) { - ggml_free(this->ctx); - } - } -}; +static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { + GGML_ASSERT(cvec.tensors.empty()); + GGML_ASSERT(cvec.ctxs.empty()); + GGML_ASSERT(cvec.bufs.empty()); -ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il) { - if (!vector->ctx || il > vector->tensors.size()) { - return nullptr; + // count layer buffer types + std::map buft_layer_count; + for (int64_t i = 0; i < model.hparams.n_layer; i++) { + buft_layer_count[model.buft_layer[i].buft]++; } - return vector->tensors[il]; -} - -struct llama_control_vector * llama_control_vector_load(const char * path) { - struct llama_control_vector * vector = new llama_control_vector(); - int n_tensors; - size_t n_bytes = 0; - uint32_t max_direction_layer = 0; - - // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer - { - struct ggml_init_params meta_params = { - /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ true, - }; - ggml_context * meta_ctx = ggml_init(meta_params); - struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ true, - /* .ctx = */ &meta_ctx, + // allocate contexts + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ n_layers * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, }; - struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path, meta_gguf_params); - if (!meta_ctx_gguf) { - LLAMA_LOG_ERROR("%s: failed to load control vector\n", __func__); - ggml_free(meta_ctx); - return nullptr; - } - - n_tensors = gguf_get_n_tensors(meta_ctx_gguf); - for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); - - // split on '.' - size_t dotpos = name.find('.'); - if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { - try { - uint32_t layer = std::stoi(name.substr(dotpos + 1)); - if (layer == 0) { - LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return nullptr; - } - if (layer > max_direction_layer) { - max_direction_layer = layer; - } - } catch (...) { - LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return nullptr; - } - } - - struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); - if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { - LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return nullptr; - } - n_bytes += ggml_nbytes(tensor_meta); + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__); + return 1; } - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); + ctx_map[it.first] = ctx; } - // load and scale tensors into final control vector context - struct ggml_init_params ggml_params = { - /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ false, - }; - struct ggml_context * ctx = ggml_init(ggml_params); - - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx, - }; - struct gguf_context * ctx_gguf = gguf_init_from_file(path, params); - if (!ctx_gguf) { - LLAMA_LOG_ERROR("%s: failed to load control vector\n", __func__); - ggml_free(ctx); - return nullptr; + // make tensors + cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0 + for (size_t il = 1; il < model.hparams.n_layer; il++) { + struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft); + ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd); + cvec.tensors.push_back(tensor); } - vector->ctx = ctx; - vector->tensors.push_back(nullptr); // there's never a direction vector for 0 - for (uint32_t i = 1; i < max_direction_layer; i++) { - std::string name = format("direction.%d", i); - ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); - if (tensor) { - vector->tensors.push_back(tensor); - // LLAMA_LOG_INFO("%s: found control vector tensor: t[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(tensor), tensor->name, tensor->data); - } else { - vector->tensors.push_back(nullptr); // as a filler + // allocate tensors / buffers and zero + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__); + return false; } + ggml_backend_buffer_clear(buf, 0); + cvec.ctxs.push_back(ctx); + cvec.bufs.push_back(buf); } - return vector; + return true; } -struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector) { - struct llama_control_vector * new_vector = new llama_control_vector(); - if (vector->ctx == nullptr) { - return new_vector; - } - struct ggml_init_params ggml_params = { - /* .mem_size = */ ggml_get_mem_size(vector->ctx), - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ false, - }; - - struct ggml_context * ctx = ggml_init(ggml_params); +int32_t llama_control_vector_apply(struct llama_context * lctx, float * data, size_t len, int n_embd, int32_t il_start, int32_t il_end) { + const llama_model & model = lctx->model; + llama_control_vector & cvec = lctx->cvec; - for (ggml_tensor * tensor : vector->tensors) { - if (tensor == nullptr) { - new_vector->tensors.push_back(nullptr); - } else { - ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor); - new_vector->tensors.push_back(new_tensor); - } - } - - new_vector->ctx = ctx; - return new_vector; -} - -int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength) { - if (vector->ctx == nullptr) { - LLAMA_LOG_ERROR("%s: attempted to scale unloaded control vector\n", __func__); + if (n_embd != (int) model.hparams.n_embd) { + LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); return 1; } - for (ggml_tensor * tensor : vector->tensors) { - if (tensor == nullptr) continue; - for (int j = 0; (int64_t)j < ggml_nelements(tensor); j++) { - float v = ggml_get_f32_1d(tensor, j); - ggml_set_f32_1d(tensor, j, v * strength); + if (cvec.tensors.empty()) { + if (!llama_control_vector_init(cvec, model)) { + return 1; } } - return 0; -} + cvec.layer_start = il_start; + cvec.layer_end = il_end; -int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other) { - if (vector->ctx == nullptr || other->ctx == nullptr) { - LLAMA_LOG_ERROR("%s: attempted to add with an unloaded control vector\n", __func__); - return 1; - } - - size_t size = std::max(vector->tensors.size(), other->tensors.size()); - for (size_t i = 0; i < size; i++) { - if (i >= vector->tensors.size()) { - vector->tensors.push_back(nullptr); + for (size_t il = 1; il < model.hparams.n_layer; il++) { + if (il >= cvec.tensors.size() || cvec.tensors[il] == nullptr) { + continue; } - - ggml_tensor * other_tensor = i < other->tensors.size() ? other->tensors[i] : nullptr; - if (other_tensor != nullptr) { - if (vector->tensors[i] == nullptr) { - ggml_tensor * new_tensor = ggml_dup_tensor(vector->ctx, other_tensor); - vector->tensors[i] = new_tensor; - } else { - ggml_tensor * this_tensor = vector->tensors[i]; - size_t this_nelements = ggml_nelements(this_tensor); - size_t other_nelements = ggml_nelements(other_tensor); - - if (this_nelements != other_nelements) { - LLAMA_LOG_ERROR("%s: attempted to add control vectors of incompatible dimension: %zu != %zu\n", __func__, this_nelements, other_nelements); - return 1; - } - - for (size_t j = 0; j < this_nelements; j++) { - float a = ggml_get_f32_1d(this_tensor, j); - float b = ggml_get_f32_1d(other_tensor, j); - ggml_set_f32_1d(this_tensor, j, a + b); - } - } + size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present + if (off + n_embd <= len) { + ggml_backend_tensor_set(cvec.tensors[il], + data + off, + 0, + n_embd * ggml_element_size(cvec.tensors[il])); } } return 0; } -void llama_control_vector_free(struct llama_control_vector * vector) { - delete vector; -} - -void llama_apply_control_vector( - struct llama_context * lctx, - struct llama_control_vector * vector, - int32_t control_vector_layer_start, - int32_t control_vector_layer_end -) { - lctx->control_vector = vector; - lctx->control_vector_layer_start = control_vector_layer_start; - lctx->control_vector_layer_end = control_vector_layer_end; -} - -void llama_clear_control_vector(struct llama_context * lctx) { - lctx->control_vector = nullptr; -} - struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) { struct llama_kv_cache_view result = { /*.n_cells = */ 0, diff --git a/llama.h b/llama.h index 89ca02e022b847..bc855fb6ff6b0f 100644 --- a/llama.h +++ b/llama.h @@ -4,7 +4,6 @@ #include "ggml.h" #include "ggml-backend.h" -#include #include #include #include @@ -437,29 +436,20 @@ extern "C" { float scale, const char * path_base_model, int32_t n_threads); - - struct llama_control_vector; - - LLAMA_API struct llama_control_vector * llama_control_vector_load(const char * path); - LLAMA_API struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector); - LLAMA_API int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength); - LLAMA_API int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other); - LLAMA_API void llama_control_vector_free(struct llama_control_vector * vector); - LLAMA_API void llama_apply_control_vector( - struct llama_context * lctx, - struct llama_control_vector * vector, - int32_t control_vector_layer_start, - int32_t control_vector_layer_end); - LLAMA_API void llama_clear_control_vector(struct llama_context * lctx); - - - // Apply a control vector to a model context - LLAMA_API int32_t llama_load_control_vector_from_file( - struct llama_context * lctx, - const char * control_vector_path, - float strength, - int32_t layer_start, - int32_t layer_end); + + // Apply a loaded control vector to a llama_context, or if data is NULL, clear + // the currently loaded vector. + // n_embd should be the size of a single layer's control, and data should point + // to an n_embd x n_layers buffer starting from layer 1. + // il_start and il_end are the layer range the vector should apply to (both inclusive) + // See llama_control_vector_load in common to load a control vector. + LLAMA_API int32_t llama_control_vector_apply( + struct llama_context * lctx, + float * data, + size_t len, + int n_embd, + int32_t il_start, + int32_t il_end); // // KV cache