diff --git a/examples/merge/config.example.txt b/examples/merge/config.example.txt
index 631fd4c5068e4..d2ec9f2329b6b 100644
--- a/examples/merge/config.example.txt
+++ b/examples/merge/config.example.txt
@@ -7,17 +7,19 @@
 # Supported verbs:
 # - linear: merge linearly, parameters: source_layer,source_layer,t
 # - slerp: spherical linear interpolation, parameters: source_layer,source_layer,scale,scale
-# - repeat: repeat a layer in the same output model (to reduce file size)
-#
-# For example:
-#
+# - copy: copy a layer from one of the two source models, parameters: source_model,source_layer
+
+
+#########################
+# Example:
+
 # This is the first layer of output model:
 # For all tensors, we want slerp(model[0].layer[0], model[1].layer[0], 0.1)
 # Except for "attn_output" tensor that we want t=0.5 instead t=0.1
 
 output layer 0
-all slerp 0,0,0.9
-attn_output slerp 0,0,0.9
+all slerp 0,0,0.1
+attn_output slerp 0,0,0.5
 
 # For next layer, we want: model[0].layer[1]*0.6 + model[1].layer[1]*0.4
 # Except for "attn_output" tensor that we want to use slerp with t=0.9
@@ -26,13 +28,96 @@ output layer 1
 all linear 1,1,0.6,0.4
 attn_output slerp 1,1,0.9
 
-output layer 2
-all linear 2,2,1.0,0.0
+# For next layer, we want to copy from model[0].layer[2]
 
-# repeat the first layers defined earlier in this file
+output layer 2
+all copy 0,2
 
 output layer 3
-all repeat 0
+all copy 0,3
+
+# For next layer, we want to copy from model[1].layer[4]
 
 output layer 4
-all repeat 1
+all copy 1,4
+
+output layer 5
+all copy 1,5
+
+output layer 6
+all linear 6,6,0.1,0.9
+
+output layer 7
+all linear 7,7,0.1,0.9
+
+output layer 8
+all linear 8,8,0.1,0.9
+
+output layer 9
+all linear 9,9,0.1,0.9
+
+output layer 10
+all linear 10,10,0.1,0.9
+
+output layer 11
+all linear 11,11,0.1,0.9
+
+output layer 12
+all linear 12,12,0.1,0.9
+
+output layer 13
+all linear 13,13,0.3333,0.6666
+
+output layer 14
+all linear 14,14,0.3333,0.6666
+
+output layer 15
+all linear 15,15,0.3333,0.6666
+
+output layer 16
+all linear 16,16,0.3333,0.6666
+
+output layer 17
+all linear 17,17,0.3333,0.6666
+
+output layer 18
+all linear 18,18,0.3333,0.6666
+
+output layer 19
+all linear 19,19,0.3333,0.6666
+
+output layer 20
+all slerp 20,20,0.8
+
+output layer 21
+all slerp 21,21,0.8
+
+output layer 22
+all slerp 22,22,0.8
+
+output layer 23
+all slerp 23,23,0.8
+
+output layer 24
+all slerp 24,24,0.8
+
+output layer 25
+all slerp 25,25,0.8
+
+output layer 26
+all slerp 26,26,0.8
+
+output layer 27
+all slerp 27,27,0.8
+
+output layer 28
+all slerp 28,28,0.8
+
+output layer 29
+all slerp 29,29,0.8
+
+output layer 30
+all slerp 30,30,0.8
+
+output layer 31
+all slerp 31,31,0.8
diff --git a/examples/merge/parser.hpp b/examples/merge/parser.hpp
index 356c20250c515..64f7d0e607887 100644
--- a/examples/merge/parser.hpp
+++ b/examples/merge/parser.hpp
@@ -125,7 +125,8 @@ static std::vector<struct llama_merge_inst> parse_config(std::string & config_pa
             struct llama_merge_inst ins;
             ins.method = LLAMA_MERGE_COPY;
             strcpy(ins.name, name.c_str());
-            strcpy(ins.srcs[0], name.c_str());
+            strcpy(ins.srcs[0], name.c_str()); // always take the first model
+            strcpy(ins.srcs[1], "");
             instructions.push_back(ins);
         } else { // tensor belong to layer
@@ -177,7 +178,7 @@ static std::vector<struct llama_merge_inst> parse_config(std::string & config_pa
 
         auto parts = str_split(line, " ");
         if (parts.size() != 3) {
-            raise_err(i_line, "does not follow format: \"target (space) verb (space) arguments\"");
+            raise_err(i_line, "does not follow format: \"target (space) verb (space) parameters\"");
         }
 
         auto target = parts[0];
@@ -197,7 +198,7 @@ static std::vector<struct llama_merge_inst> parse_config(std::string & config_pa
 
         auto linear = [&](struct llama_merge_inst & ins, std::string unit) {
             if (params.size() != 4) {
-                raise_err(i_line, "verb \"linear\" requires exactly 4 params");
+                raise_err(i_line, "verb \"linear\" requires exactly 4 parameters");
             }
             ins.method = LLAMA_MERGE_LINEAR;
             int src0 = std::stoi(params[0]);
@@ -211,7 +212,7 @@ static std::vector<struct llama_merge_inst> parse_config(std::string & config_pa
 
         auto slerp = [&](struct llama_merge_inst & ins, std::string unit) {
             if (params.size() != 3) {
-                raise_err(i_line, "verb \"slerp\" requires exactly 3 params");
+                raise_err(i_line, "verb \"slerp\" requires exactly 3 parameters");
             }
             ins.method = LLAMA_MERGE_SLERP;
             int src0 = std::stoi(params[0]);
@@ -222,14 +223,33 @@ static std::vector<struct llama_merge_inst> parse_config(std::string & config_pa
             is_layer_empty = false;
         };
 
-        auto repeat = [&](struct llama_merge_inst & ins, std::string unit) {
+        /*auto repeat = [&](struct llama_merge_inst & ins, std::string unit) {
             if (params.size() != 1) {
-                raise_err(i_line, "verb \"repeat\" requires exactly 1 param");
+                raise_err(i_line, "verb \"repeat\" requires exactly 1 parameter");
             }
             ins.method = LLAMA_MERGE_REPEAT;
             int src0 = std::stoi(params[0]);
             strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
             is_layer_empty = false;
+        };*/
+
+        auto copy = [&](struct llama_merge_inst & ins, std::string unit) {
+            if (params.size() != 2) {
+                raise_err(i_line, "verb \"copy\" requires exactly 2 parameters");
+            }
+            ins.method = LLAMA_MERGE_COPY;
+            int model = std::stoi(params[0]);
+            int layer = std::stoi(params[1]);
+            if (model == 0) {
+                strcpy(ins.srcs[0], get_tensor_name(layer, unit).c_str());
+                strcpy(ins.srcs[1], "");
+            } else if (model == 1) {
+                strcpy(ins.srcs[0], "");
+                strcpy(ins.srcs[1], get_tensor_name(layer, unit).c_str());
+            } else {
+                raise_err(i_line, "can only copy from model 0 or 1");
+            }
+            is_layer_empty = false;
         };
 
         auto apply_verb = [&](struct llama_merge_inst & ins, std::string unit) {
@@ -238,12 +258,16 @@ static std::vector<struct llama_merge_inst> parse_config(std::string & config_pa
             } else if (verb == "slerp") {
                 slerp(ins, unit);
             } else if (verb == "repeat") {
-                repeat(ins, unit);
+                // repeat(ins, unit);
+                raise_err(i_line, "repeat is currently not supported");
+            } else if (verb == "copy") {
+                copy(ins, unit);
             } else {
                 raise_err(i_line, "invalid verb: " + verb);
             }
         };
 
+        // TODO: what if user does not use "all"? we may miss some tensors?
         if (target == "all") {
             for (auto & u : units) {
                 apply_verb(layer[u], u);
diff --git a/llama.cpp b/llama.cpp
index e77a1380602ce..102657ac95600 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11358,14 +11358,12 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
 #else
     constexpr bool use_mmap = false;
 #endif
-/*
     // std::move doesn't work with llama_model and llama_model_loader, why?
     std::vector<std::unique_ptr<llama_model>> models;
     std::vector<std::unique_ptr<llama_model_loader>> mls;
     std::vector<std::vector<uint8_t>> buf_in;
     std::vector<std::vector<uint8_t>> buf_out;
     std::set<std::string> ref_names; // list of ref_name per layer
-    int max_input_layers = 0; // number of layers that the input model has
     std::vector<struct ggml_tensor *> output_tensors;
 
     // output file
@@ -11373,21 +11371,6 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
     std::ofstream fout(config->output_path, std::ios::binary);
     fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
-    // get layer index from tensor name, for example "blk.x.attn_norm.weight"
-    // returns -1 if it is non-layer
-    auto get_i_layer = [&](std::string tensor_name) -> int {
-        int i_layer = -1;
-        return sscanf(tensor_name.c_str(), "blk.%d.", &i_layer) == 1 ? i_layer : -1;
-    };
-
-    // get new tensor name by i_layer and ref_name, for example "blk.x.attn_norm.weight"
-    auto get_name = [&](int i_layer, std::string ref_name) -> std::string {
-        ref_name.erase(0, ref_name.find(".", 4)); // delete the "blk.x" part
-        std::stringstream ss;
-        ss << "blk." << i_layer << ref_name;
-        return ss.str();
-    };
-
     // remember to call before exit
     auto clean_up = [&]() {
         fout.close();
@@ -11398,7 +11381,8 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
     };
 
     // load the input models
-    for (size_t i = 0; i < config->n_models; i++) {
+    static const size_t n_models = 2;
+    for (size_t i = 0; i < n_models; i++) {
         auto model = std::unique_ptr<llama_model>(new llama_model());
         auto ml = std::unique_ptr<llama_model_loader>(new llama_model_loader(config->model_paths[i], use_mmap, NULL));
         ml->init_mapping(false);
@@ -11415,6 +11399,12 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
         mls.push_back(std::move(ml));
     }
 
+    // for verb copy, we want to get the source tensor
+    auto get_src_tensor_for_copy = [&](const struct llama_merge_inst ins, size_t & i_model) {
+        i_model = std::string(ins.srcs[0]).empty() ? 1 : 0;
+        return mls[i_model]->get_tensor_meta(ins.srcs[i_model]);
+    };
+
     // construct metadata
     {
         // copy the KV pairs from the input file
@@ -11424,42 +11414,60 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
         std::stringstream ss;
         ss << mls[0]->get_arch_name() << ".block_count";
         gguf_set_val_u32(ctx_out, ss.str().c_str(), config->n_layers);
-        printf("====> Set new value of %s = %ld\n", ss.str().c_str(), config->n_layers);
-
-        // read input layers, process firstly non-layer tensors (embedding, output,...)
-        for (int i = 0; i < mls[0]->n_tensors; i++) {
-            struct ggml_tensor * meta = mls[0]->get_tensor_meta(i);
-            int i_layer = get_i_layer(ggml_get_name(meta));
-            if (i_layer < 0) {
-                // populate data for non-layer tensors (embedding, output,...)
-                struct ggml_tensor * out_tensor = (struct ggml_tensor *) malloc(GGML_TENSOR_SIZE);
-                memcpy(out_tensor, meta, GGML_TENSOR_SIZE); // copy metadata (shape, type,...)
-                gguf_add_tensor(ctx_out, out_tensor);
-                output_tensors.push_back(out_tensor);
-            } else {
-                max_input_layers = std::max(i_layer, max_input_layers);
-                if (i_layer == 0) {
-                    // only extract names of one layer, assuming all layers have the same structure
-                    ref_names.insert(ggml_get_name(meta));
+        LLAMA_LOG_INFO("====> Set new value of %s = %ld\n", ss.str().c_str(), config->n_layers);
+
+        // populate metadata for output tensors
+        auto push_tensor = [&](struct ggml_tensor * ref, const char * name) {
+            struct ggml_tensor * out_tensor = (struct ggml_tensor *) malloc(GGML_TENSOR_SIZE);
+            if (ref != nullptr) {
+                // copy metadata (shape, type,...)
+                memcpy(out_tensor, ref, GGML_TENSOR_SIZE);
+            }
+            ggml_set_name(out_tensor, name);
+            gguf_add_tensor(ctx_out, out_tensor);
+            output_tensors.push_back(out_tensor);
+        };
+        for (size_t i = 0; i < config->n_insts; i++) {
+            const struct llama_merge_inst ins = config->insts[i];
+            struct ggml_tensor * t0;
+            struct ggml_tensor * t1;
+            // TODO: reject non-requantize-able type (one that requires imatrix)
+            if (ins.method == LLAMA_MERGE_COPY) {
+                // simply copy from model A
+                size_t i_model;
+                t0 = get_src_tensor_for_copy(ins, i_model);
+                push_tensor(t0, ins.name);
+            } else if (ins.method == LLAMA_MERGE_LINEAR || ins.method == LLAMA_MERGE_SLERP) {
+                t0 = mls[0]->get_tensor_meta(ins.srcs[0]);
+                t1 = mls[1]->get_tensor_meta(ins.srcs[1]);
+                if (llama_format_tensor_shape(t0) != llama_format_tensor_shape(t1)) {
+                    LLAMA_LOG_ERROR("some tensors do not have the same shape\n");
+                    clean_up();
+                    return -1;
                 }
+                push_tensor(t0, ins.name);
+            } else if (ins.method == LLAMA_MERGE_REPEAT) {
+                // TODO: in theory, we can point 2 tensors to the same offset, but here we're unable to do that, because offset is currently managed by gguf_add_tensor()
+                GGML_ASSERT(false);
+                /*t0 = nullptr;
+                std::string search_tensor(ins.srcs[0]);
+                for (auto & tensor : output_tensors) {
+                    if (std::string(ggml_get_name(tensor)) == search_tensor) {
+                        t0 = tensor;
+                        break;
+                    }
+                }
+                if (t0 == nullptr) {
+                    LLAMA_LOG_ERROR("cannot find source tensor to repeat");
+                    clean_up();
+                    return -1;
+                }
+                push_tensor(t0, ins.name);*/
+            } else {
+                GGML_ASSERT(false); // should never happen
             }
         }
 
-        // populate layers metadata for output model
-        for (size_t i_layer = 0; i_layer < config->n_layers; i_layer++) {
-            for (auto & ref_name : ref_names) {
-                // create new tensor, because new model may have more tensors than input model
-                struct ggml_tensor * out_tensor = (struct ggml_tensor *) malloc(GGML_TENSOR_SIZE);
-                struct ggml_tensor * ref_tensor = mls[0]->get_tensor_meta(ref_name.c_str()); // get ref tensor from layer 0
-                memcpy(out_tensor, ref_tensor, GGML_TENSOR_SIZE); // copy metadata (shape, type,...)
-                ggml_set_name(out_tensor, get_name(i_layer, ref_name).c_str()); // set the correct name (with correct i_layer)
-                output_tensors.push_back(out_tensor);
-                gguf_add_tensor(ctx_out, out_tensor);
-                // TODO: reject non-requantize-able type (one that requires imatrix)
-            }
-            // TODO: how to reuse tensor (duplicated layers)? we can play with ctx->infos[tensor_idx].offset
-        }
-
         const size_t meta_size = gguf_get_meta_size(ctx_out);
 
         LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
@@ -11481,8 +11489,11 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
     };
 
     size_t n_done = 0;
-    size_t n_curr = 0;
-    auto log_step = [&](const struct ggml_tensor * tensor) {
+    auto write_output_tensor = [&](const struct ggml_tensor * tensor, void * data) {
+        // write tensor data + padding
+        const size_t len = ggml_nbytes(tensor);
+        fout.write((const char *) data, len);
+        zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
         n_done++;
         LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], input type = %6s\n", n_done, output_tensors.size(), ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type));
     };
@@ -11491,84 +11502,82 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
 
-    // process non-layer output tensor
-    for (auto & out_tensor : output_tensors) {
-        std::string name = ggml_get_name(out_tensor);
-        int i_layer_out = get_i_layer(name.c_str());
-        std::vector<no_init<uint8_t>> buf;
-        if (i_layer_out >= 0) {
-            continue;
-        }
-        n_curr++;
-        struct ggml_tensor * in_tensor = mls[0]->get_tensor_meta(name.c_str());
-        if (in_tensor == nullptr) {
-            LLAMA_LOG_ERROR("Cannot find layer name %s from base model\n", name.c_str());
-            clean_up();
-            return -1;
-        }
-        read_tensor_data(in_tensor, *mls[0], buf); // read from first model
-
-        // write tensor data + padding
-        const size_t write_size = ggml_nbytes(out_tensor);
-        fout.write((const char *) in_tensor->data, write_size);
-        zeros(fout, GGML_PAD(write_size, GGUF_DEFAULT_ALIGNMENT) - write_size);
-        log_step(out_tensor);
-    }
-
     // TODO: allow user to set n_threads
     const int n_threads = std::thread::hardware_concurrency();
     std::vector<std::thread> workers;
     workers.reserve(n_threads);
 
-    // process tensors associated to layer
-    for (auto & out_tensor : output_tensors) {
+    // process instructions one by one
+    GGML_ASSERT(config->n_insts == output_tensors.size());
+    for (size_t i = 0; i < config->n_insts; i++) {
+        const struct llama_merge_inst ins = config->insts[i];
+        struct ggml_tensor * t0;
+        struct ggml_tensor * t1;
+        struct ggml_tensor * out_tensor = output_tensors[i];
         const size_t n_elements = ggml_nelements(out_tensor);
-        std::vector<no_init<uint8_t>> in_buf;
-        std::vector<no_init<float>> f32_in_buf; // dequant it internally
+        std::vector<no_init<uint8_t>> in_buf0;
+        std::vector<no_init<float>> f32_in_buf0; // dequant it internally
+        std::vector<no_init<uint8_t>> in_buf1;
+        std::vector<no_init<float>> f32_in_buf1; // dequant it internally
        std::vector<float> f32_out_buf(n_elements, 0.0); // do not resize!
        std::vector<uint8_t> out_buf(ggml_nbytes(out_tensor)); // do not resize!
 
-        std::string out_name = ggml_get_name(out_tensor);
-        int i_layer_out = get_i_layer(out_name.c_str());
-        auto layer = config->layers[i_layer_out];
-
-        if (i_layer_out < 0) {
-            continue; // skip non-layer tensors
+        if (ins.method == LLAMA_MERGE_COPY) {
+            LLAMA_LOG_INFO("copy\n");
+            size_t i_model;
+            t0 = get_src_tensor_for_copy(ins, i_model);
+            read_tensor_data(t0, *mls[i_model], in_buf0);
+            write_output_tensor(out_tensor, t0->data);
+            continue;
         }
 
-        for (size_t i_model = 0; i_model < config->n_models; i_model++) {
-            int src_layer = layer.srcs[i_model]; // source layer
-            float scale = layer.scales[i_model];
-            std::string src_name = get_name(src_layer, out_name); // find the correct tensor based on src_layer
-            struct ggml_tensor * in_tensor = mls[i_model]->get_tensor_meta(src_name.c_str());
-            if (in_tensor == nullptr) {
-                LLAMA_LOG_ERROR("Cannot find layer name %s from model %ld\n", src_name.c_str(), i_model + 1);
-                clean_up();
-                return -1; // stop
-            }
-            read_tensor_data(in_tensor, *mls[i_model], in_buf);
-            // dequant the tensor to FP32
+        // dequantize the tensor to FP32
+        auto dequantize = [&](struct ggml_tensor * in_tensor, std::vector<no_init<float>> & f32_in_buf) {
             if (in_tensor->type != GGML_TYPE_F32) {
-                //LLAMA_LOG_ERROR("dequant ");
+                LLAMA_LOG_INFO("dequant ");
                 llama_convert_tensor_internal(in_tensor, f32_in_buf, workers, n_elements, n_threads);
             } else {
                 // if we already have f32, just copy it
-                //LLAMA_LOG_ERROR("f32_copy ");
+                LLAMA_LOG_INFO("f32_copy ");
                 f32_in_buf.resize(n_elements);
                 memcpy((void *) f32_in_buf.data(), in_tensor->data, n_elements * sizeof(float));
             }
-            // do the calculation
-            //LLAMA_LOG_ERROR("calc ");
+        };
+
+        // load data and dequantize
+        if (ins.method == LLAMA_MERGE_LINEAR || ins.method == LLAMA_MERGE_SLERP) {
+            t0 = mls[0]->get_tensor_meta(ins.srcs[0]);
+            t1 = mls[1]->get_tensor_meta(ins.srcs[1]);
+            read_tensor_data(t0, *mls[0], in_buf0);
+            read_tensor_data(t1, *mls[1], in_buf1);
+            dequantize(t0, f32_in_buf0);
+            dequantize(t1, f32_in_buf1);
+        }
+
+        if (ins.method == LLAMA_MERGE_LINEAR) {
+            LLAMA_LOG_INFO("linear ");
+            float * in0 = (float *) f32_in_buf0.data();
+            float * in1 = (float *) f32_in_buf1.data();
+            float * dest = (float *) f32_out_buf.data();
+            for (size_t i = 0; i < n_elements; i++) {
+                dest[i] = in0[i] * ins.scales[0] + in1[i] * ins.scales[1];
+            }
+        }
+
+        if (ins.method == LLAMA_MERGE_SLERP) {
+            LLAMA_LOG_INFO("slerp ");
+            float * in0 = (float *) f32_in_buf0.data();
+            float * in1 = (float *) f32_in_buf1.data();
+            float * dest = (float *) f32_out_buf.data();
             for (size_t i = 0; i < n_elements; i++) {
-                float * in = (float *) f32_in_buf.data();
-                float * dest = (float *) f32_out_buf.data();
-                dest[i] += in[i] * scale;
+                //dest[i] = in0[i] * ins.t + in1[i] * 0;
+                dest[i] = in0[i];
             }
         }
 
         // re-quantize it
-        //LLAMA_LOG_ERROR("requant\n");
         {
+            LLAMA_LOG_INFO("requant\n");
            std::array<int64_t, 1 << 4> hist_cur = {};
             const int n_per_row = out_tensor->ne[0];
             const int n_rows = n_elements / n_per_row;
@@ -11588,16 +11597,10 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
             GGML_ASSERT(new_size == out_buf.size());
         }
 
-        // write tensor to file
-        {
-            LLAMA_LOG_ERROR("===> INPUT [layer %d] %f %f %f\n", i_layer_out, f32_in_buf[0].value, f32_in_buf[1].value, f32_in_buf[2].value);
-            LLAMA_LOG_ERROR("===> OUTPUT [layer %d] %f %f %f\n", i_layer_out, f32_out_buf[0], f32_out_buf[1], f32_out_buf[2]);
-            // my turn, write the result!
-            // write tensor data + padding
-            fout.write((const char *) out_buf.data(), out_buf.size());
-            zeros(fout, GGML_PAD(out_buf.size(), GGUF_DEFAULT_ALIGNMENT) - out_buf.size());
-            log_step(out_tensor);
-        }
+        LLAMA_LOG_INFO("===> INPUT %f %f %f\n", f32_in_buf0[0].value, f32_in_buf0[1].value, f32_in_buf0[2].value);
+        LLAMA_LOG_INFO("===> OUTPUT %f %f %f\n", f32_out_buf[0], f32_out_buf[1], f32_out_buf[2]);
+
+        write_output_tensor(out_tensor, out_buf.data());
     }
 
     // go back to beginning of file and write the updated meta data
@@ -11610,7 +11613,6 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
     }
 
     clean_up();
-*/
 
     return 0;
 }
diff --git a/llama.h b/llama.h
index a6fb7b8fa7646..b2ff920da4a89 100644
--- a/llama.h
+++ b/llama.h
@@ -330,7 +330,7 @@ extern "C" {
     enum llama_merge_method {
         LLAMA_MERGE_LINEAR,
         LLAMA_MERGE_SLERP,
-        LLAMA_MERGE_REPEAT,
+        LLAMA_MERGE_REPEAT, // doesn't work for now
        LLAMA_MERGE_COPY,
     };
 
@@ -339,7 +339,7 @@ extern "C" {
         char name[GGML_MAX_NAME]; // name of output tensor
         enum llama_merge_method method;
         // we only support 2 models for now
-        char srcs[2][GGML_MAX_NAME]; // name of input tensors
+        char srcs[2][GGML_MAX_NAME]; // name of input tensors. if method == copy, only one src is non-empty
         float scales[2]; // for linear method
         float t; // for slerp method
     };
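
Note on the SLERP branch in llama.cpp above: the interpolation line is commented out and the output currently just copies the first input (`dest[i] = in0[i]`). For reference, below is a minimal sketch of what a full spherical linear interpolation over the two dequantized f32 buffers could look like. This is an illustration only, not code from this patch; treating each tensor as a single flattened vector and falling back to plain linear interpolation for (nearly) parallel inputs are assumptions made here, and the helper name slerp_f32 is hypothetical:

    #include <cmath>
    #include <cstddef>

    // slerp(a, b, t) over two flattened f32 tensors of n elements;
    // t plays the role of ins.t in the patch above
    static void slerp_f32(const float * a, const float * b, float * dst, size_t n, float t) {
        double dot = 0.0, norm_a = 0.0, norm_b = 0.0;
        for (size_t i = 0; i < n; i++) {
            dot    += (double) a[i] * b[i];
            norm_a += (double) a[i] * a[i];
            norm_b += (double) b[i] * b[i];
        }
        norm_a = std::sqrt(norm_a);
        norm_b = std::sqrt(norm_b);
        // cosine of the angle between the two vectors, clamped to [-1, 1]
        double cos_omega = (norm_a > 0.0 && norm_b > 0.0) ? dot / (norm_a * norm_b) : 1.0;
        cos_omega = cos_omega >  1.0 ?  1.0 : cos_omega;
        cos_omega = cos_omega < -1.0 ? -1.0 : cos_omega;
        const double omega     = std::acos(cos_omega);
        const double sin_omega = std::sin(omega);
        if (sin_omega < 1e-6) {
            // (nearly) parallel inputs: slerp degenerates to a plain lerp
            for (size_t i = 0; i < n; i++) {
                dst[i] = (1.0f - t) * a[i] + t * b[i];
            }
            return;
        }
        const double w0 = std::sin((1.0 - t) * omega) / sin_omega;
        const double w1 = std::sin(t * omega) / sin_omega;
        for (size_t i = 0; i < n; i++) {
            dst[i] = (float) (w0 * a[i] + w1 * b[i]);
        }
    }

With such a helper, the SLERP branch could call slerp_f32(in0, in1, dest, n_elements, ins.t) instead of copying in0 into dest.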