From a2909ee59c10631a88119dde45fbddce9344f4fd Mon Sep 17 00:00:00 2001
From: Qiqi
Date: Tue, 10 Dec 2024 16:23:21 +0800
Subject: [PATCH] fix a bug when calculating `neuron_cap` before invoking the
 solver

`slice_size` should be the byte size of one neuron's row (`ne[0]` elements) in each FFN matrix; with an FFN gate, each neuron spans three such rows (up, gate, down), so the per-slice factor is 3 (otherwise 2) and `neuron_cap` is simply how many neurons fit into the allocatable VRAM.

For example, with ReluLLaMA-7B on an NVIDIA GeForce RTX 2080 Ti (11264 MiB), where `ffn_up`, `ffn_gate`, and `ffn_down` are all [4096, 11008]:

`env CUDA_VISIBLE_DEVICES=0 ./build/bin/main -m ./ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf -n 128 -t 8 -p "Once upon a time"`

- before the fix:
  `slice_size=22016`
  `vram_bytes_per_slice=99072`
  `vram_allocatable_bytes=4212178944`
  `neuron_cap=170064`
- after the fix:
  `slice_size=8192`
  `vram_bytes_per_slice=24576`
  `vram_allocatable_bytes=4212178944`
  `neuron_cap=171394`
---
 llama.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 3ae9e946..fa6642f8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3076,10 +3076,10 @@ static bool llm_load_gpu_split_with_budget(llama_model_loader & ml, llama_model
     // Calculate solver parameters
     ggml_tensor * ffn_up = model.layers[0].ffn_up;
     ggml_tensor * ffn_gate = model.layers[0].ffn_gate;
-    int slice_size = ffn_up->ne[1] * ggml_type_size(ffn_up->type) / ggml_blck_size(ffn_up->type);
+    int slice_size = ffn_up->ne[0] * ggml_type_size(ffn_up->type) / ggml_blck_size(ffn_up->type);
     // For model arch with FFN gate, the gate is also sliced, otherwise only the up and down matrices are sliced
-    int vram_bytes_per_slice = slice_size * (ffn_gate ? 4.5 : 2); // TODO: why 4.5, not 3?
-    int neuron_cap = floor((double)vram_allocatable_bytes / vram_bytes_per_slice) * 4;
+    int vram_bytes_per_slice = slice_size * (ffn_gate ? 3 : 2);
+    int neuron_cap = floor((double)vram_allocatable_bytes / vram_bytes_per_slice);
 
     LLAMA_LOG_INFO("invoking powerinfer Python module to generate gpu split for %.2f MiB of VRAM\n", vram_allocatable_bytes / 1024.0 / 1024.0);
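
As a sanity check, here is a minimal standalone sketch (not part of the patch) that reproduces the numbers quoted above, assuming fp16 FFN weights (`ggml_type_size` = 2, `ggml_blck_size` = 1) and the quoted `vram_allocatable_bytes`:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    // Shapes and sizes quoted above (ReluLLaMA-7B, fp16 FFN weights) -- assumptions for illustration.
    const int64_t ne0 = 4096;       // ffn_up->ne[0]: elements in one neuron's row
    const int64_t ne1 = 11008;      // ffn_up->ne[1]: number of FFN neurons
    const int64_t type_size = 2;    // bytes per fp16 element
    const int64_t blck_size = 1;    // fp16 is not block-quantized
    const int64_t vram_allocatable_bytes = 4212178944LL;
    const bool has_gate = true;     // LLaMA's FFN has a gate matrix

    // Before the fix: wrong dimension, ad-hoc 4.5x factor, extra * 4
    int slice_size_old = ne1 * type_size / blck_size;                              // 22016
    int bytes_per_slice_old = slice_size_old * (has_gate ? 4.5 : 2);               // 99072
    int cap_old = floor((double)vram_allocatable_bytes / bytes_per_slice_old) * 4; // 170064

    // After the fix: one slice = one neuron's row, 3 sliced matrices (up/gate/down)
    int slice_size_new = ne0 * type_size / blck_size;                              // 8192
    int bytes_per_slice_new = slice_size_new * (has_gate ? 3 : 2);                 // 24576
    int cap_new = floor((double)vram_allocatable_bytes / bytes_per_slice_new);     // 171394

    printf("before: slice_size=%d vram_bytes_per_slice=%d neuron_cap=%d\n",
           slice_size_old, bytes_per_slice_old, cap_old);
    printf("after:  slice_size=%d vram_bytes_per_slice=%d neuron_cap=%d\n",
           slice_size_new, bytes_per_slice_new, cap_new);
    return 0;
}
```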