From a2909ee59c10631a88119dde45fbddce9344f4fd Mon Sep 17 00:00:00 2001
From: Qiqi
Date: Tue, 10 Dec 2024 16:23:21 +0800
Subject: [PATCH] fix a bug when calculating `neuron_cap` before invoking the
 solver

`slice_size` should be the byte size of one neuron's row (`ne[0]` elements) in each FFN matrix; with an FFN gate, each neuron spans three such rows (up, gate, down), so the per-slice factor is 3 (otherwise 2) and `neuron_cap` is simply how many neurons fit into the allocatable VRAM.

For example, with ReluLLaMA-7B on an NVIDIA GeForce RTX 2080 Ti (11264 MiB), where `ffn_up`, `ffn_gate`, and `ffn_down` are all [4096, 11008]:

`env CUDA_VISIBLE_DEVICES=0 ./build/bin/main -m ./ReluLLaMA-7B/llama-7b-relu.powerinfer.gguf -n 128 -t 8 -p "Once upon a time"`

- before the fix:
  `slice_size=22016`
  `vram_bytes_per_slice=99072`
  `vram_allocatable_bytes=4212178944`
  `neuron_cap=170064`
- after the fix:
  `slice_size=8192`
  `vram_bytes_per_slice=24576`
  `vram_allocatable_bytes=4212178944`
  `neuron_cap=171394`
---
 llama.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 3ae9e946..fa6642f8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3076,10 +3076,10 @@ static bool llm_load_gpu_split_with_budget(llama_model_loader & ml, llama_model
     // Calculate solver parameters
     ggml_tensor * ffn_up = model.layers[0].ffn_up;
     ggml_tensor * ffn_gate = model.layers[0].ffn_gate;
-    int slice_size = ffn_up->ne[1] * ggml_type_size(ffn_up->type) / ggml_blck_size(ffn_up->type);
+    int slice_size = ffn_up->ne[0] * ggml_type_size(ffn_up->type) / ggml_blck_size(ffn_up->type);
     // For model arch with FFN gate, the gate is also sliced, otherwise only the up and down matrices are sliced
-    int vram_bytes_per_slice = slice_size * (ffn_gate ? 4.5 : 2); // TODO: why 4.5, not 3?
-    int neuron_cap = floor((double)vram_allocatable_bytes / vram_bytes_per_slice) * 4;
+    int vram_bytes_per_slice = slice_size * (ffn_gate ? 3 : 2);
+    int neuron_cap = floor((double)vram_allocatable_bytes / vram_bytes_per_slice);
 
     LLAMA_LOG_INFO("invoking powerinfer Python module to generate gpu split for %.2f MiB of VRAM\n", vram_allocatable_bytes / 1024.0 / 1024.0);
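
As a sanity check, here is a minimal standalone sketch (not part of the patch) that reproduces the numbers quoted above, assuming fp16 FFN weights (`ggml_type_size` = 2, `ggml_blck_size` = 1) and the quoted `vram_allocatable_bytes`:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    // Shapes and sizes quoted above (ReluLLaMA-7B, fp16 FFN weights) -- assumptions for illustration.
    const int64_t ne0 = 4096;       // ffn_up->ne[0]: elements in one neuron's row
    const int64_t ne1 = 11008;      // ffn_up->ne[1]: number of FFN neurons
    const int64_t type_size = 2;    // bytes per fp16 element
    const int64_t blck_size = 1;    // fp16 is not block-quantized
    const int64_t vram_allocatable_bytes = 4212178944LL;
    const bool has_gate = true;     // LLaMA's FFN has a gate matrix

    // Before the fix: wrong dimension, ad-hoc 4.5x factor, extra * 4
    int slice_size_old = ne1 * type_size / blck_size;                              // 22016
    int bytes_per_slice_old = slice_size_old * (has_gate ? 4.5 : 2);               // 99072
    int cap_old = floor((double)vram_allocatable_bytes / bytes_per_slice_old) * 4; // 170064

    // After the fix: one slice = one neuron's row, 3 sliced matrices (up/gate/down)
    int slice_size_new = ne0 * type_size / blck_size;                              // 8192
    int bytes_per_slice_new = slice_size_new * (has_gate ? 3 : 2);                 // 24576
    int cap_new = floor((double)vram_allocatable_bytes / bytes_per_slice_new);     // 171394

    printf("before: slice_size=%d vram_bytes_per_slice=%d neuron_cap=%d\n",
           slice_size_old, bytes_per_slice_old, cap_old);
    printf("after:  slice_size=%d vram_bytes_per_slice=%d neuron_cap=%d\n",
           slice_size_new, bytes_per_slice_new, cap_new);
    return 0;
}
```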