From 2f3e5c38f5e50b2dbe51cd9d8b3a3f9b0256faba Mon Sep 17 00:00:00 2001
From: Holden <34213478+hodlen@users.noreply.github.com>
Date: Sat, 16 Dec 2023 00:40:55 +0800
Subject: [PATCH] add gpu index opts and update doc commands

---
 README.md                    | 53 +++++++++++++++++++-----------------
 common/common.cpp            | 11 ++++----
 common/common.h              |  2 +-
 examples/batched/batched.cpp | 10 +++----
 llama.cpp                    |  2 +-
 llama.h                      |  2 +-
 scripts/export-gpu-split.py  |  2 +-
 7 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index 03296808..d13f6b2b 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,6 @@ The SparseLLM Team is currently converting the Mistral-7B model to a sparser ver
 
 - [Installation](##setup--installation)
 - [Model Weights](##model-weights)
-- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
 
 ## Setup & Installation
 ### Get the Code
@@ -60,28 +59,24 @@ git clone https://github.com/hodlen/PowerInfer
 cd PowerInfer
 ```
 ### Build
-In order to build PowerInfer you have two different options.
-
-- Using `make`:
-  - On Linux or MacOS:
-    ```bash
-    make
-    ```
-- Using `CMake`:
-  - If you have one GPU:
-    ```bash
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_CUBLAS=ON
-    cmake --build . --config Release
-    ```
-  - If you just CPU:
-    ```bash
-    mkdir build
-    cd build
-    cmake ..
-    cmake --build . --config Release
-    ```
+To build PowerInfer, you have two options. Run the following commands from the root directory of the project.
+
+Using `make` on Linux or macOS:
+```bash
+make
+```
+
+Using `CMake`:
+* If you have one GPU:
+```bash
+cmake -S . -B build -DLLAMA_CUBLAS=ON
+cmake --build build --config Release
+```
+* If you only have a CPU:
+```bash
+cmake -S . -B build
+cmake --build build --config Release
+```
 
 ## Model Weights
 
@@ -96,11 +91,19 @@ In order to build PowerInfer you have two different options.
 ```bash
 ./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
 ```
-- If you have CPU with one consumer grade GPU:
+- If you have a CPU with one GPU:
 ```bash
- ./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
+./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
 ```
+For now, running with a GPU requires an offline-generated "GPU index" file to split FFNs onto the GPU. To try it, generate the GPU index file with the following command:
+```bash
+python scripts/export-gpu-split.py $(activation_count_path) $(output_idx_path) solver
+```
+Then, run PowerInfer with the GPU index using the following command:
+```bash
+./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt) --gpu-index $(split_path)
+```
 
 ## Evaluation
 
diff --git a/common/common.cpp b/common/common.cpp
index 429f313a..936e114c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -471,12 +471,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
-        } else if (arg == "--mlp-adapter") {
+        } else if (arg == "--gpu-index") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mlp_adapter = argv[i];
+            params.gpu_index = argv[i];
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -970,9 +970,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (llama_use_sparse_inference(model)) {
         fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
-        if (!params.mlp_adapter.empty()) {
-            fprintf(stderr, "%s: warning: --mlp-adapter is deprecated and has no effect\n", __func__);
-            int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
+        if (!params.gpu_index.empty()) {
+            int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
             if (err != 0) {
                 fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
                 llama_free_model(model);
@@ -1358,7 +1357,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
         fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
-    fprintf(stream, "mlp_adapter: %s\n", params.mlp_adapter.c_str());
+    fprintf(stream, "gpu_index: %s\n", params.gpu_index.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
diff --git a/common/common.h b/common/common.h
index 20497018..c16eb86e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -91,7 +91,7 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
-    std::string mlp_adapter = ""; // sparse activation mlp adapter path
+    std::string gpu_index = ""; // path to the offline-generated GPU index used to split FFNs onto the GPU
 
     int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index b9a355b0..fcfe9140 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -49,11 +49,11 @@ int main(int argc, char ** argv) {
     }
 
     if (argc >= 8) {
-        params.mlp_adapter = argv[7];
+        params.gpu_index = argv[7];
     }
 
-    printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, mlp_adapter = %s\n",
-        params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.mlp_adapter.c_str());
+    printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, gpu_index = %s\n",
+        params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.gpu_index.c_str());
 
     if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
@@ -76,8 +76,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    if (!params.mlp_adapter.empty()) {
-        int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
+    if (!params.gpu_index.empty()) {
+        int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
             llama_free_model(model);
diff --git a/llama.cpp b/llama.cpp
index e2e466d0..5814a1ea 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9660,7 +9660,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }
 
-int llama_model_apply_mlp_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
+int llama_model_apply_gpu_idx_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
     llama_mlp_model_loader * mlp_ml = new llama_mlp_model_loader(path_mlp, use_mmap);
     if (mlp_ml -> apply_tensors_to_base_model(model) > 0) {
         LLAMA_LOG_ERROR("%s: failed to apply mlp adapter\n", __func__);
diff --git a/llama.h b/llama.h
index 1d73b522..0ab02cca 100644
--- a/llama.h
+++ b/llama.h
@@ -342,7 +342,7 @@ extern "C" {
             const char * path_base_model,
             int n_threads);
 
-    LLAMA_API int llama_model_apply_mlp_from_file(
+    LLAMA_API int llama_model_apply_gpu_idx_from_file(
             struct llama_model * model,
             const char * path_mlp,
             bool use_mmap);
diff --git a/scripts/export-gpu-split.py b/scripts/export-gpu-split.py
index c028f6a3..30f44fe8 100644
--- a/scripts/export-gpu-split.py
+++ b/scripts/export-gpu-split.py
@@ -134,7 +134,7 @@ def main(predictors_path: str, output_path: str, solver_path: str):
     parser.add_argument(
         "output_path",
         help="path to the output GGML adapter",
-        default="./ggml-mlp-adapters.bin",
+        default="./gpu-index.bin",
     )
     parser.add_argument("solver", help="path to the solver")
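
For downstream code built against this branch, here is a minimal sketch of how the renamed `llama_model_apply_gpu_idx_from_file` API can be called, mirroring the pattern in `examples/batched/batched.cpp` above. The model and index paths are placeholders, and the surrounding setup calls follow the stock llama.cpp C API in `llama.h`, so treat it as illustrative rather than a drop-in program.

```cpp
// Illustrative sketch only: paths are placeholders, setup calls follow llama.h.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init(/*numa=*/false);

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("/PATH/TO/MODEL", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Apply the offline-generated GPU index (the call renamed from
    // llama_model_apply_mlp_from_file in this patch); the last argument
    // enables mmap, as in examples/batched/batched.cpp.
    if (llama_model_apply_gpu_idx_from_file(model, "/PATH/TO/gpu-index.bin", true) != 0) {
        fprintf(stderr, "failed to apply GPU index\n");
        llama_free_model(model);
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```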