Add gpu index opts and update doc commands #2

Merged: 1 commit, Dec 15, 2023
53 changes: 28 additions & 25 deletions README.md
@@ -50,7 +50,6 @@ The SparseLLM Team is currently converting the Mistral-7B model to a sparser ver

- [Installation](##setup--installation)
- [Model Weights](##model-weights)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)

## Setup & Installation
### Get the Code
@@ -60,28 +59,24 @@ git clone https://github.com/hodlen/PowerInfer
cd PowerInfer
```
### Build
In order to build PowerInfer you have two different options.

- Using `make`:
- On Linux or MacOS:
```bash
make
```
- Using `CMake`:
- If you have one GPU:
```bash
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release
```
- If you just CPU:
```bash
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
To build PowerInfer you have two options. Run these commands from the root directory of the project; a quick post-build sanity check is sketched after the build options.

Using `make` on Linux or MacOS:
```bash
make
```

Using `CMake`:
* If you have one GPU:
```bash
cmake -S . -B build -DLLAMA_CUBLAS=ON
cmake --build build --config Release
```
* If you have only a CPU:
```bash
cmake -S . -B build
cmake --build build --config Release
```
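
A quick post-build sanity check (a suggested step, assuming the CMake build layout used by the run commands below):
```bash
# The main binary should exist and print its help text.
ls build/bin/main
./build/bin/main -h
```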

## Model Weights

@@ -96,11 +91,19 @@ In order to build PowerInfer you have two different options.
```bash
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
```
- If you have CPU with one consumer grade GPU:
- If you have a CPU with one GPU:
```bash
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt)
```

For now, GPU inference requires an offline-generated “GPU index” file that decides how to split the FFN weights between GPU and CPU. To try it, generate the GPU index file with the following command:
```bash
python scripts/export-gpu-split.py $(activation_count_path) $(output_idx_path) solver
```
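
For example (all paths here are hypothetical placeholders; the activation statistics and solver locations depend on your setup):
```bash
# Hypothetical paths for illustration only.
python scripts/export-gpu-split.py ./activation ./gpu-index.bin ./solver
```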
Then run PowerInfer with the GPU index using the following command:
```bash
./build/bin/main -m /PATH/TO/MODEL -n $(output_token_count) -t $(thread_num) -p $(prompt) --gpu-index $(split_path)
```
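
A filled-in example (the model path, token count, thread count, prompt, and index path are placeholders, not tested values):
```bash
# Placeholder values for illustration only.
./build/bin/main -m ./models/powerinfer-7b.gguf -n 128 -t 8 -p "Once upon a time" --gpu-index ./gpu-index.bin
```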

## Evaluation

11 changes: 5 additions & 6 deletions common/common.cpp
@@ -471,12 +471,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.lora_base = argv[i];
} else if (arg == "--mlp-adapter") {
} else if (arg == "--gpu-index") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mlp_adapter = argv[i];
params.gpu_index = argv[i];
} else if (arg == "--mmproj") {
if (++i >= argc) {
invalid_param = true;
@@ -970,9 +970,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

if (llama_use_sparse_inference(model)) {
fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
if (!params.mlp_adapter.empty()) {
fprintf(stderr, "%s: warning: --mlp-adapter is deprecated and has no effect\n", __func__);
int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
if (!params.gpu_index.empty()) {
int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
llama_free_model(model);
@@ -1358,7 +1357,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "mlp_adapter: %s\n", params.mlp_adapter.c_str());
fprintf(stream, "gpu_index: %s\n", params.gpu_index.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
2 changes: 1 addition & 1 deletion common/common.h
@@ -91,7 +91,7 @@ struct gpt_params {
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter

std::string mlp_adapter = ""; // sparse activation mlp adapter path
std::string gpu_index = ""; // sparse activation mlp adapter path

int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
10 changes: 5 additions & 5 deletions examples/batched/batched.cpp
@@ -49,11 +49,11 @@ int main(int argc, char ** argv) {
}

if (argc >= 8) {
params.mlp_adapter = argv[7];
params.gpu_index = argv[7];
}

printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, mlp_adapter = %s\n",
params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.mlp_adapter.c_str());
printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, gpu_index = %s\n",
params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.gpu_index.c_str());

if (params.prompt.empty()) {
params.prompt = "Hello my name is";
@@ -76,8 +76,8 @@ int main(int argc, char ** argv) {
return 1;
}

if (!params.mlp_adapter.empty()) {
int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
if (!params.gpu_index.empty()) {
int err = llama_model_apply_gpu_idx_from_file(model, params.gpu_index.c_str(), true);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
llama_free_model(model);
2 changes: 1 addition & 1 deletion llama.cpp
@@ -9660,7 +9660,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
}
}

int llama_model_apply_mlp_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
int llama_model_apply_gpu_idx_from_file(struct llama_model * model, const char * path_mlp, bool use_mmap) {
llama_mlp_model_loader * mlp_ml = new llama_mlp_model_loader(path_mlp, use_mmap);
if (mlp_ml -> apply_tensors_to_base_model(model) > 0) {
LLAMA_LOG_ERROR("%s: failed to apply mlp adapter\n", __func__);
2 changes: 1 addition & 1 deletion llama.h
@@ -342,7 +342,7 @@ extern "C" {
const char * path_base_model,
int n_threads);

LLAMA_API int llama_model_apply_mlp_from_file(
LLAMA_API int llama_model_apply_gpu_idx_from_file(
struct llama_model * model,
const char * path_mlp,
bool use_mmap);
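
For reference, a minimal caller-side sketch of the renamed entry point. This is an illustration based on the declaration above and the `examples/batched` usage in this PR; the backend-init and model-loading calls are assumptions about the surrounding llama.cpp-style API, not part of this change:

```cpp
// Sketch only: assumes llama.h as modified in this PR and a llama.cpp-style loader API.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(false);  // assumed init call; signature may differ across versions

    // Model loading shown schematically; exact calls depend on the llama.cpp version in use.
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("./model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // New in this PR: apply an offline-generated GPU index (formerly the "MLP adapter").
    if (llama_model_apply_gpu_idx_from_file(model, "./gpu-index.bin", /*use_mmap=*/true) != 0) {
        fprintf(stderr, "failed to apply GPU index\n");
        llama_free_model(model);
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```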
2 changes: 1 addition & 1 deletion scripts/export-gpu-split.py
@@ -134,7 +134,7 @@ def main(predictors_path: str, output_path: str, solver_path: str):
parser.add_argument(
"output_path",
help="path to the output GGML adapter",
default="./ggml-mlp-adapters.bin",
default="./gpu-index.bin",
)
parser.add_argument("solver", help="path to the solver")
