refactor: adjust estimate
Signed-off-by: thxCode <[email protected]>
thxCode committed Aug 27, 2024
1 parent 94139f2 commit 798028e
Showing 2 changed files with 37 additions and 38 deletions.
46 changes: 23 additions & 23 deletions README.md
@@ -190,7 +190,7 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8
| | | | | | | | | +------------+------------+-----------+-----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+
| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 33 (32 + 1) | Yes | 174.54 MiB | 324.54 MiB | 24.94 GiB | 27.41 GiB |
| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 33 (32 + 1) | Yes | 277.10 MiB | 427.10 MiB | 24.94 GiB | 27.41 GiB |
+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+

$ # Retrieve the model's metadata via split file
@@ -267,7 +267,7 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode
| | | | | | | | | +------------+------------+--------+----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+----------+
| llama | 8192 | 2048 / 512 | Disabled | Enabled | No | Supported | 33 (32 + 1) | Yes | 184.85 MiB | 334.85 MiB | 1 GiB | 7.78 GiB |
| llama | 8192 | 2048 / 512 | Disabled | Enabled | No | Supported | 33 (32 + 1) | Yes | 184.85 MiB | 334.85 MiB | 1 GiB | 7.88 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+----------+

$ # Retrieve the model's metadata via split file
@@ -515,12 +515,12 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam
| | | | | | | | | +------------+------------+------------+-----------+-----------+-----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+-----------+-----------+-----------+
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 270.08 MiB | 420.08 MiB | 192.52 MiB | 24.34 GiB | 16.53 GiB | 16.78 GiB |
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 270.08 MiB | 420.08 MiB | 192.52 MiB | 24.34 GiB | 15.73 GiB | 16.78 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+-----------+-----------+-----------+

```

Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF model` on `host1` has the following
Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` has the following
resource consumption:

| Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result |
@@ -560,16 +560,16 @@ flowchart TD
```

```shell
gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="15,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052"
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+----------------------+-----------------------+---------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 |
| | | | | | | | | +------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 302.08 MiB | 452.08 MiB | 14.69 GiB | 14.93 GiB | 9.95 GiB | 10.20 GiB | 11.37 GiB | 11.61 GiB | 6.61 GiB | 6.86 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+
$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="15,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052"
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+---------------------+-----------------------+---------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 |
| | | | | | | | | +------------+------------+-----------+-----------+----------+----------+-----------+-----------+----------+----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+----------+-----------+-----------+----------+----------+
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 302.08 MiB | 452.08 MiB | 14.69 GiB | 14.93 GiB | 9.15 GiB | 9.32 GiB | 10.57 GiB | 10.72 GiB | 5.81 GiB | 6.81 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+----------+-----------+-----------+----------+----------+

```

@@ -580,9 +580,9 @@ following resource consumption:
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host4 | 11 GiB | 452.08 MiB | | | :thumbsup: |
| host1 (NVIDIA 4080 0) | | | 15 GiB | 14.93 GiB | :thumbsup: |
| host1 (NVIDIA 4080 1) | | | 10 GiB | 10.20 GiB | |
| host2 (NVIDIA 4090) | | | 12 GiB | 11.61 GiB | :thumbsup: |
| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 6.86 GiB | |
| host1 (NVIDIA 4080 1) | | | 10 GiB | 9.32 GiB | :thumbsup: |
| host2 (NVIDIA 4090) | | | 12 GiB | 10.72 GiB | :thumbsup: |
| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 6.81 GiB | |

It seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`.
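
The `:thumbsup:` column in these tables reduces to a plain capacity comparison per host: the estimate's requested RAM/VRAM must not exceed what the host has available. The sketch below is illustrative only — the `hostCheck` type and the hard-coded capacities are assumptions for this example, not part of gguf-parser's API; the request figures are the NONUMA values from the table above.

```go
package main

import "fmt"

// hostCheck pairs a host's available capacity with the estimate's request,
// both in GiB; this mirrors the columns of the table above.
type hostCheck struct {
	name      string
	available float64
	request   float64
}

func main() {
	checks := []hostCheck{
		{"host1 (NVIDIA 4080 0)", 15, 14.93},
		{"host1 (NVIDIA 4080 1)", 10, 9.32},
		{"host2 (NVIDIA 4090)", 12, 10.72},
		{"host3 (Apple M1 Max)", 6, 6.81},
	}
	for _, c := range checks {
		// A host only qualifies when the requested memory fits into what it offers.
		fmt.Printf("%-22s request %.2f GiB / available %.2f GiB -> fits: %v\n",
			c.name, c.request, c.available, c.request <= c.available)
	}
}
```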

@@ -612,15 +612,15 @@ flowchart TD
```

```shell
gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="6,11,12,15,10" --rpc="host4:50052,host2:50052,host1:50052,host1:50053"
$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="6,11,12,15,10" --rpc="host4:50052,host2:50052,host1:50052,host1:50053"
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+----------------------+---------------------+---------------------+-----------------------+---------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 | VRAM 4 |
| | | | | | | | | +------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 318.08 MiB | 468.08 MiB | 36.52 MiB | 5.92 GiB | 9.04 GiB | 9.29 GiB | 9.04 GiB | 9.29 GiB | 11.82 GiB | 12.07 GiB | 8.03 GiB | 8.27 GiB |
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 318.08 MiB | 468.08 MiB | 36.52 MiB | 5.92 GiB | 8.24 GiB | 8.42 GiB | 8.24 GiB | 8.42 GiB | 11.02 GiB | 11.17 GiB | 7.23 GiB | 8.22 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+

```
@@ -632,10 +632,10 @@ following resource consumption:
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host3 (Apple M1 Max) | ENOUGH | 318.08 MiB | | | :thumbsup: |
| host3 (Apple M1 Max) | | | 6 GiB | 36.52 MiB | :thumbsup: |
| host4 | 11 GiB | 9.29 GiB | | | :thumbsup: |
| host1 (NVIDIA 4080 1) | | | 12 GiB | 9.29 GiB | :thumbsup: |
| host2 (NVIDIA 4080 0) | | | 15 GiB | 12.07 GiB | :thumbsup: |
| host3 (NVIDIA 4080 1) | | | 10 GiB | 8.27 GiB | :thumbsup: |
| host4 | 11 GiB | 8.42 GiB | | | :thumbsup: |
| host1 (NVIDIA 4080 1) | | | 12 GiB | 8.42 GiB | :thumbsup: |
| host2 (NVIDIA 4080 0) | | | 15 GiB | 11.17 GiB | :thumbsup: |
| host3 (NVIDIA 4080 1) | | | 10 GiB | 8.22 GiB | :thumbsup: |

Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`.
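
Reordering `--tensor-split` (together with the matching `--rpc` endpoints) is what moves load between devices: each ratio is taken as a share of the offloadable layers. The helper below is a rough, hypothetical sketch of such a proportional split — it is not gguf-parser's actual assignment logic, which also accounts for the output layer and per-device overheads.

```go
package main

import "fmt"

// splitLayers divides totalLayers across devices in proportion to ratios,
// handing any rounding remainder to the last device. This is only an
// approximation of how tensor-split style ratios could be applied.
func splitLayers(totalLayers int, ratios []float64) []int {
	var sum float64
	for _, r := range ratios {
		sum += r
	}
	out := make([]int, len(ratios))
	assigned := 0
	for i, r := range ratios {
		out[i] = int(float64(totalLayers) * r / sum)
		assigned += out[i]
	}
	out[len(out)-1] += totalLayers - assigned // rounding remainder goes to the last device
	return out
}

func main() {
	// The second placement above: --tensor-split="6,11,12,15,10" over 80 offloadable layers.
	fmt.Println(splitLayers(80, []float64{6, 11, 12, 15, 10})) // e.g. [8 16 17 22 17]
}
```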

29 changes: 14 additions & 15 deletions file_estimate.go
@@ -364,12 +364,9 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
} else if a.AttentionCausal {
op = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */
}
e.Devices[0].Weight.Output = op
if fullOffload {
for i := range e.Devices[1:] {
e.Devices[i+1].Weight.Output = op
}
} else {
e.Devices[0].Weight.Output = op
e.Devices[len(e.Devices)-1].Weight.Output = op
}
}

@@ -470,10 +467,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
ssmInc += rs
}
cp := GGUFBytesScalar(convInc + ssmInc)
for i, d := range e.Devices[1:] {
if d.LastLayer < 0 && (i == 0 && !d.Remote) {
continue
}
for i := range e.Devices[1:] {
e.Devices[i+1].Computation.Compute = cp
}
case a.Type == "model":
@@ -537,10 +531,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
e.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc)
}
cp := GGUFBytesScalar(max(offloadAttnInc, ffnInc))
for i, d := range e.Devices[1:] {
if d.LastLayer < 0 && (i == 0 && !d.Remote) {
continue
}
for i := range e.Devices[1:] {
e.Devices[i+1].Computation.Compute = cp
}
// Special case: we cannot use mmap for splitting expert weights in MoE.
@@ -676,13 +667,21 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, nonUMARamFootprint, no
ems.VRAMs[i].UMA = fp + wg + kv + /* cp */ 0
if !e.NoMMap && mmap {
ems.VRAMs[i].UMA -= wg
if i > 0 || v.Remote {
ems.VRAMs[i].UMA += wg + cp
if i > 0 && v.LastLayer >= 0 || v.Remote {
ems.VRAMs[i].UMA += wg + cp - v.Weight.Output
}
}

// NonUMA.
ems.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + kv + cp
if i > 0 {
switch {
case v.LastLayer < 0:
ems.VRAMs[i].NonUMA -= wg + cp
case v.Remote && wg > kv:
ems.VRAMs[i].NonUMA -= kv
}
}
}
}

