refactor: adjust estimate
Signed-off-by: thxCode <[email protected]>
thxCode committed Aug 27, 2024
1 parent 94139f2 commit 798028e
Showing 2 changed files with 37 additions and 38 deletions.
46 changes: 23 additions & 23 deletions README.md
@@ -190,7 +190,7 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8
| | | | | | | | | +------------+------------+-----------+-----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+
| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 33 (32 + 1) | Yes | 174.54 MiB | 324.54 MiB | 24.94 GiB | 27.41 GiB |
| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 33 (32 + 1) | Yes | 277.10 MiB | 427.10 MiB | 24.94 GiB | 27.41 GiB |
+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+

$ # Retrieve the model's metadata via split file
@@ -267,7 +267,7 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode
| | | | | | | | | +------------+------------+--------+----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+----------+
| llama | 8192 | 2048 / 512 | Disabled | Enabled | No | Supported | 33 (32 + 1) | Yes | 184.85 MiB | 334.85 MiB | 1 GiB | 7.78 GiB |
| llama | 8192 | 2048 / 512 | Disabled | Enabled | No | Supported | 33 (32 + 1) | Yes | 184.85 MiB | 334.85 MiB | 1 GiB | 7.88 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+----------+

$ # Retrieve the model's metadata via split file
@@ -515,12 +515,12 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam
| | | | | | | | | +------------+------------+------------+-----------+-----------+-----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+-----------+-----------+-----------+
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 270.08 MiB | 420.08 MiB | 192.52 MiB | 24.34 GiB | 16.53 GiB | 16.78 GiB |
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 270.08 MiB | 420.08 MiB | 192.52 MiB | 24.34 GiB | 15.73 GiB | 16.78 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+-----------+-----------+-----------+

```

Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF model` on `host1` has the following
Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` has the following
resource consumption:

| Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result |
@@ -560,16 +560,16 @@ flowchart TD
```

```shell
gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="15,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052"
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+----------------------+-----------------------+---------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 |
| | | | | | | | | +------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 302.08 MiB | 452.08 MiB | 14.69 GiB | 14.93 GiB | 9.95 GiB | 10.20 GiB | 11.37 GiB | 11.61 GiB | 6.61 GiB | 6.86 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+
$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="15,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052"
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+---------------------+-----------------------+---------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 |
| | | | | | | | | +------------+------------+-----------+-----------+----------+----------+-----------+-----------+----------+----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+----------+-----------+-----------+----------+----------+
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 302.08 MiB | 452.08 MiB | 14.69 GiB | 14.93 GiB | 9.15 GiB | 9.32 GiB | 10.57 GiB | 10.72 GiB | 5.81 GiB | 6.81 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+----------+-----------+-----------+----------+----------+

```

@@ -580,9 +580,9 @@ following resource consumption:
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host4 | 11 GiB | 452.08 MiB | | | :thumbsup: |
| host1 (NVIDIA 4080 0) | | | 15 GiB | 14.93 GiB | :thumbsup: |
| host1 (NVIDIA 4080 1) | | | 10 GiB | 10.20 GiB | |
| host2 (NVIDIA 4090) | | | 12 GiB | 11.61 GiB | :thumbsup: |
| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 6.86 GiB | |
| host1 (NVIDIA 4080 1) | | | 10 GiB | 9.32 GiB | :thumbsup: |
| host2 (NVIDIA 4090) | | | 12 GiB | 10.72 GiB | :thumbsup: |
| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 6.81 GiB | |

It seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`.
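
The `:thumbsup:` column in these tables reduces to a plain capacity comparison per host: the estimate's requested RAM/VRAM must not exceed what the host has available. The sketch below is illustrative only — the `hostCheck` type and the hard-coded capacities are assumptions for this example, not part of gguf-parser's API; the request figures are the NONUMA values from the table above.

```go
package main

import "fmt"

// hostCheck pairs a host's available capacity with the estimate's request,
// both in GiB; this mirrors the columns of the table above.
type hostCheck struct {
	name      string
	available float64
	request   float64
}

func main() {
	checks := []hostCheck{
		{"host1 (NVIDIA 4080 0)", 15, 14.93},
		{"host1 (NVIDIA 4080 1)", 10, 9.32},
		{"host2 (NVIDIA 4090)", 12, 10.72},
		{"host3 (Apple M1 Max)", 6, 6.81},
	}
	for _, c := range checks {
		// A host only qualifies when the requested memory fits into what it offers.
		fmt.Printf("%-22s request %.2f GiB / available %.2f GiB -> fits: %v\n",
			c.name, c.request, c.available, c.request <= c.available)
	}
}
```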

@@ -612,15 +612,15 @@ flowchart TD
```

```shell
gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="6,11,12,15,10" --rpc="host4:50052,host2:50052,host1:50052,host1:50053"
$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="6,11,12,15,10" --rpc="host4:50052,host2:50052,host1:50052,host1:50053"
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+----------------------+---------------------+---------------------+-----------------------+---------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 | VRAM 4 |
| | | | | | | | | +------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+
| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 318.08 MiB | 468.08 MiB | 36.52 MiB | 5.92 GiB | 9.04 GiB | 9.29 GiB | 9.04 GiB | 9.29 GiB | 11.82 GiB | 12.07 GiB | 8.03 GiB | 8.27 GiB |
| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 318.08 MiB | 468.08 MiB | 36.52 MiB | 5.92 GiB | 8.24 GiB | 8.42 GiB | 8.24 GiB | 8.42 GiB | 11.02 GiB | 11.17 GiB | 7.23 GiB | 8.22 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+

```
@@ -632,10 +632,10 @@ following resource consumption:
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host3 (Apple M1 Max) | ENOUGH | 318.08 MiB | | | :thumbsup: |
| host3 (Apple M1 Max) | | | 6 GiB | 36.52 MiB | :thumbsup: |
| host4 | 11 GiB | 9.29 GiB | | | :thumbsup: |
| host1 (NVIDIA 4080 1) | | | 12 GiB | 9.29 GiB | :thumbsup: |
| host2 (NVIDIA 4080 0) | | | 15 GiB | 12.07 GiB | :thumbsup: |
| host3 (NVIDIA 4080 1) | | | 10 GiB | 8.27 GiB | :thumbsup: |
| host4 | 11 GiB | 8.42 GiB | | | :thumbsup: |
| host1 (NVIDIA 4080 1) | | | 12 GiB | 8.42 GiB | :thumbsup: |
| host2 (NVIDIA 4080 0) | | | 15 GiB | 11.17 GiB | :thumbsup: |
| host3 (NVIDIA 4080 1) | | | 10 GiB | 8.22 GiB | :thumbsup: |

Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`.
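
Reordering `--tensor-split` (together with the matching `--rpc` endpoints) is what moves load between devices: each ratio is taken as a share of the offloadable layers. The helper below is a rough, hypothetical sketch of such a proportional split — it is not gguf-parser's actual assignment logic, which also accounts for the output layer and per-device overheads.

```go
package main

import "fmt"

// splitLayers divides totalLayers across devices in proportion to ratios,
// handing any rounding remainder to the last device. This is only an
// approximation of how tensor-split style ratios could be applied.
func splitLayers(totalLayers int, ratios []float64) []int {
	var sum float64
	for _, r := range ratios {
		sum += r
	}
	out := make([]int, len(ratios))
	assigned := 0
	for i, r := range ratios {
		out[i] = int(float64(totalLayers) * r / sum)
		assigned += out[i]
	}
	out[len(out)-1] += totalLayers - assigned // rounding remainder goes to the last device
	return out
}

func main() {
	// The second placement above: --tensor-split="6,11,12,15,10" over 80 offloadable layers.
	fmt.Println(splitLayers(80, []float64{6, 11, 12, 15, 10})) // e.g. [8 16 17 22 17]
}
```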

29 changes: 14 additions & 15 deletions file_estimate.go
@@ -364,12 +364,9 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
} else if a.AttentionCausal {
op = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */
}
e.Devices[0].Weight.Output = op
if fullOffload {
for i := range e.Devices[1:] {
e.Devices[i+1].Weight.Output = op
}
} else {
e.Devices[0].Weight.Output = op
e.Devices[len(e.Devices)-1].Weight.Output = op
}
}

@@ -470,10 +467,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
ssmInc += rs
}
cp := GGUFBytesScalar(convInc + ssmInc)
for i, d := range e.Devices[1:] {
if d.LastLayer < 0 && (i == 0 && !d.Remote) {
continue
}
for i := range e.Devices[1:] {
e.Devices[i+1].Computation.Compute = cp
}
case a.Type == "model":
@@ -537,10 +531,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
e.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc)
}
cp := GGUFBytesScalar(max(offloadAttnInc, ffnInc))
for i, d := range e.Devices[1:] {
if d.LastLayer < 0 && (i == 0 && !d.Remote) {
continue
}
for i := range e.Devices[1:] {
e.Devices[i+1].Computation.Compute = cp
}
// Special case: we cannot use mmap for splitting expert weights in MoE.
@@ -676,13 +667,21 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, nonUMARamFootprint, no
ems.VRAMs[i].UMA = fp + wg + kv + /* cp */ 0
if !e.NoMMap && mmap {
ems.VRAMs[i].UMA -= wg
if i > 0 || v.Remote {
ems.VRAMs[i].UMA += wg + cp
if i > 0 && v.LastLayer >= 0 || v.Remote {
ems.VRAMs[i].UMA += wg + cp - v.Weight.Output
}
}

// NonUMA.
ems.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + kv + cp
if i > 0 {
switch {
case v.LastLayer < 0:
ems.VRAMs[i].NonUMA -= wg + cp
case v.Remote && wg > kv:
ems.VRAMs[i].NonUMA -= kv
}
}
}
}

