fix: failed to assign output buffer
Signed-off-by: thxCode <[email protected]>
thxCode committed Aug 30, 2024
1 parent a66b9d3 · commit 95b2217
Showing 2 changed files with 24 additions and 18 deletions.
README.md (30 changes: 15 additions & 15 deletions)
@@ -517,7 +517,7 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam
+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+
| LAYERS (I + T + O) | UMA        | NONUMA     | LAYERS (T + O) | UMA     | NONUMA    | LAYERS (T + O) | UMA       | NONUMA    |
+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+
-| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 36 + 0         | 144 MiB | 17.87 GiB | 44 + 1         | 22.01 GiB | 22.44 GiB |
+| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 36 + 0         | 144 MiB | 17.79 GiB | 44 + 1         | 22.01 GiB | 22.51 GiB |
+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+

```
@@ -528,8 +528,8 @@ resource consumption:
| Host                  | Available RAM | Request RAM | Available VRAM | Request VRAM | Result     |
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host1                 | ENOUGH        | 388.08 MiB  |                |              | :thumbsup: |
-| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 17.87 GiB    |            |
-| host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 22.44 GiB    |            |
+| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 17.79 GiB    |            |
+| host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 22.51 GiB    |            |

It appears that running the model on `host1` alone is not feasible.
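
The verdict above is just a per-device comparison of requested versus available memory: a placement is viable only if every device's non-UMA request fits. A minimal sketch of that check, with the names and GiB figures taken from the table (the `device` struct is illustrative, not gguf-parser-go's API):

```go
package main

import "fmt"

// device pairs an accelerator's available memory with the estimate's
// non-UMA request, both in GiB. Illustrative only, not gguf-parser-go's API.
type device struct {
	name      string
	available float64
	request   float64
}

func main() {
	devices := []device{
		{"host1 (NVIDIA 4080 0)", 8, 17.79},
		{"host1 (NVIDIA 4080 1)", 10, 22.51},
	}
	feasible := true
	for _, d := range devices {
		ok := d.request <= d.available
		fmt.Printf("%s: request %.2f GiB, available %.2f GiB, fits: %v\n",
			d.name, d.request, d.available, ok)
		feasible = feasible && ok
	}
	fmt.Println("feasible on host1 alone:", feasible) // false
}
```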

@@ -570,7 +570,7 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam
+--------------------+------------+------------+----------------+----------+----------+----------------+-----------+-----------+----------------+-----------+-----------+----------------+----------+----------+
| LAYERS (I + T + O) | UMA        | NONUMA     | LAYERS (T + O) | UMA      | NONUMA   | LAYERS (T + O) | UMA       | NONUMA    | LAYERS (T + O) | UMA       | NONUMA    | LAYERS (T + O) | UMA      | NONUMA   |
+--------------------+------------+------------+----------------+----------+----------+----------------+-----------+-----------+----------------+-----------+-----------+----------------+----------+----------+
-| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 18 + 0         | 8.85 GiB | 9.37 GiB | 23 + 0         | 10.88 GiB | 11.32 GiB | 27 + 0         | 12.75 GiB | 13.19 GiB | 12 + 1         | 6.87 GiB | 7.31 GiB |
+| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 18 + 0         | 8.85 GiB | 9.28 GiB | 23 + 0         | 10.88 GiB | 11.32 GiB | 27 + 0         | 12.75 GiB | 13.19 GiB | 12 + 1         | 6.87 GiB | 7.38 GiB |
+--------------------+------------+------------+----------------+----------+----------+----------------+-----------+-----------+----------------+-----------+-----------+----------------+----------+----------+

```
@@ -581,7 +581,7 @@ following resource consumption:
| Host                  | Available RAM | Request RAM | Available VRAM | Request VRAM | Result     |
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host4                 | 11 GiB        | 388.08 MiB  |                |              | :thumbsup: |
-| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 9.37 GiB     |            |
+| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 9.28 GiB     |            |
| host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 11.32 GiB    |            |
| host2 (NVIDIA 4090)   |               |             | 12 GiB         | 13.19 GiB    |            |
| host3 (Apple M1 Max)  | ENOUGH        |             | 6 GiB          | 6.87 GiB     |            |
@@ -615,15 +615,15 @@ flowchart TD

```shell
$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="6,11,12,8,10" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --in-short
-+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| ESTIMATE |
-+----------------------------------------------+------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+
-| RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 | VRAM 4 |
-+--------------------+------------+------------+----------------+--------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
-| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
-+--------------------+------------+------------+----------------+--------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
-| 1 + 0 + 0 | 238.08 MiB | 388.08 MiB | 11 + 0 | 44 MiB | 6.08 GiB | 19 + 0 | 8.96 GiB | 9.39 GiB | 20 + 0 | 9.47 GiB | 9.90 GiB | 14 + 0 | 6.63 GiB | 7.07 GiB | 16 + 1 | 8.74 GiB | 9.18 GiB |
-+--------------------+------------+------------+----------------+--------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ESTIMATE |
++----------------------------------------------+----------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+
+| RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 | VRAM 4 |
++--------------------+------------+------------+----------------+--------+--------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
+| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
++--------------------+------------+------------+----------------+--------+--------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
+| 1 + 0 + 0 | 238.08 MiB | 388.08 MiB | 11 + 0 | 44 MiB | 6 GiB | 19 + 0 | 8.96 GiB | 9.39 GiB | 20 + 0 | 9.47 GiB | 9.90 GiB | 14 + 0 | 6.63 GiB | 7.07 GiB | 16 + 1 | 8.74 GiB | 9.25 GiB |
++--------------------+------------+------------+----------------+--------+--------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+

```

@@ -637,7 +637,7 @@ following resource consumption:
| host4                 | 11 GiB        | 9.39 GiB    |                |              | :thumbsup: |
| host1 (NVIDIA 4080 1) |               |             | 12 GiB         | 9.90 GiB     | :thumbsup: |
| host2 (NVIDIA 4080 0) |               |             | 8 GiB          | 7.07 GiB     | :thumbsup: |
-| host3 (NVIDIA 4080 1) |               |             | 10 GiB         | 9.18 GiB     | :thumbsup: |
+| host3 (NVIDIA 4080 1) |               |             | 10 GiB         | 9.25 GiB     | :thumbsup: |

Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`.
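
For context, the `--tensor-split="6,11,12,8,10"` ratios above apportion the 81 offloaded layers (80 transformer layers plus the output layer) roughly in proportion to each device's share. A rough sketch of that proportional assignment; the exact rounding gguf-parser/llama.cpp applies differs slightly, so the counts here only approximate the 11/19/20/14/17 placement shown in the table:

```go
package main

import "fmt"

// Proportional layer assignment from --tensor-split ratios. The real
// splitting logic rounds differently, so this is an approximation only.
func main() {
	split := []float64{6, 11, 12, 8, 10}
	const layers = 81 // 80 transformer layers + 1 output layer

	var total float64
	for _, s := range split {
		total += s
	}
	assigned := 0
	for i, s := range split {
		n := int(float64(layers)*s/total + 0.5) // round to nearest
		if i == len(split)-1 {
			n = layers - assigned // remainder goes to the last device
		}
		assigned += n
		fmt.Printf("VRAM %d: ~%d layers\n", i, n)
	}
}
```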

file_estimate.go (12 changes: 9 additions & 3 deletions)
@@ -546,7 +546,7 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...LLaMACppRunEstimateOption) (e LL
        }
        // Finally, get the usage of output layer.
        if a.Type == "model" {
-            outInc := inpEmbd
+            var outInc uint64
            if a.Architecture == "mamba" {
                outInc += inpSMask + inpSSeq
            }
@@ -557,10 +557,16 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...LLaMACppRunEstimateOption) (e LL
                rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
                outInc += rs
            }
+            idx := 0 // Default to the main host's RAM.
            if !fullOffload {
-                outInc += uint64(e.Devices[0].Weight.Output)
+                if len(e.Devices) != len(o.RPCServers)+1 { // If the main host has a GPU.
+                    outInc += uint64(e.Devices[0].Weight.Output)
+                    idx = o.MainGPUIndex + 1
+                }
+            } else {
+                idx = len(e.Devices) - 1 // The last device is the output device.
+            }
-            e.Devices[o.MainGPUIndex+1].Computation.Output += GGUFBytesScalar(outInc)
+            e.Devices[idx].Computation.Output += GGUFBytesScalar(outInc)
        }
    }
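
In short, the fix stops charging the output buffer unconditionally to `e.Devices[o.MainGPUIndex+1]` and instead picks the device that actually hosts the output layer: the main host's RAM by default, the main GPU when the host has one and offload is partial, and the last device when offload is full. A self-contained sketch of that selection logic (types pared down to just what the decision needs):

```go
package main

import "fmt"

// outputDeviceIndex mirrors the selection this commit introduces.
// Index 0 is the main host's RAM; GPU and RPC devices follow it.
func outputDeviceIndex(numDevices, numRPCServers, mainGPUIndex int, fullOffload bool) int {
	idx := 0 // Default to the main host's RAM.
	if !fullOffload {
		if numDevices != numRPCServers+1 { // The main host has a GPU of its own.
			idx = mainGPUIndex + 1
		}
	} else {
		idx = numDevices - 1 // The last device holds the output layer.
	}
	return idx
}

func main() {
	// Partial offload over RPC servers only: output stays in host RAM.
	fmt.Println(outputDeviceIndex(5, 4, 0, false)) // 0
	// Partial offload with a local GPU: charge the main GPU.
	fmt.Println(outputDeviceIndex(2, 0, 0, false)) // 1
	// Full offload across five devices: the last device pays for it.
	fmt.Println(outputDeviceIndex(6, 5, 0, true)) // 5
}
```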
