From 5c1f28dd50d68f598525ddf00585eb131ee2b736 Mon Sep 17 00:00:00 2001 From: thxCode Date: Mon, 26 Aug 2024 14:17:05 +0800 Subject: [PATCH] refactor: estimate across multiple devices Signed-off-by: thxCode --- README.md | 540 ++++++++++++++------ cmd/gguf-parser/README.md | 11 +- cmd/gguf-parser/main.go | 394 +++++++------- file_architecture.go | 124 ++++- file_estimate.go | 141 ++--- file_estimate_option.go | 42 +- file_model.go => file_metadata.go | 40 +- file_model_test.go => file_metadata_test.go | 10 +- file_tokenizer.go | 6 +- 9 files changed, 837 insertions(+), 471 deletions(-) rename file_model.go => file_metadata.go (90%) rename file_model_test.go => file_metadata_test.go (81%) diff --git a/README.md b/README.md index 4c91682..6adc4ef 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,9 @@ GGUF Parser helps in reviewing and estimating the usage of a GGUF format model w * [From HuggingFace](#parse-from-huggingface) * [From ModelScope](#parse-from-modelscope) * [From Ollama Library](#parse-from-ollama-library) + * [None Model](#parse-none-model) + [Estimate](#estimate) + * [Across Multiple GPU devices](#estimate-across-multiple-gpu-devices) * [Full Layers Offload (default)](#full-layers-offload-default) * [Zero Layers Offload](#zero-layers-offload) * [Specific Layers Offload](#specific-layers-offload) @@ -46,6 +48,10 @@ GGUF Parser helps in reviewing and estimating the usage of a GGUF format model w ## Notes +- Since v0.8.0, GGUF Parser distinguishes the remote devices from `--tensor-split` via `--rpc`. + + For one host multiple GPU devices, you can use `--tensor-split` to get the estimated memory usage of each GPU. + + For multiple hosts multiple GPU devices, you can use `--tensor-split` and `--rpc` to get the estimated memory + usage of each GPU. - The table result `DISTRIBUTABLE` indicates the GGUF file supports distribution inference or not, if the file doesn't support distribution inference, you can not offload it with [RPC servers](https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc). @@ -53,7 +59,7 @@ GGUF Parser helps in reviewing and estimating the usage of a GGUF format model w which suffixes with something like `-00001-of-00009.gguf`. - The `UMA` column indicates the memory usage of Apple macOS only. To estimate the macOS memory usage, you can sum the `UMA` results of `RAM` and `VRAM 0` columns. -- Since v0.7.0, GGUF Parser supports estimating the usage of multiple GPUs. +- Since v0.7.0, GGUF Parser supports estimating the usage of multiple GPU devices. + The table result `RAM` means the system memory usage when running [LLaMA.Cpp](https://github.com/ggerganov/llama.cpp) or LLaMA.Cpp like application. + The `VRAM 0` columns means the first visible GPU memory usage when serving the GGUF file. @@ -73,13 +79,13 @@ or `go install github.com/gpustack/gguf-parser-go/cmd/gguf-parser@latest`. 
```shell $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf" -+-----------------------------------------------------------------------------------+ -| MODEL | -+-------+-------+----------------+---------------+----------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+-------+----------------+---------------+----------+------------+----------+ -| jeffq | llama | IQ3_XXS/Q5_K_M | true | 4.78 GiB | 7.24 B | 5.67 bpw | -+-------+-------+----------------+---------------+----------+------------+----------+ ++-------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------+-------+----------------+---------------+----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------+-------+----------------+---------------+----------+------------+----------+ +| model | jeffq | llama | IQ3_XXS/Q5_K_M | true | 4.78 GiB | 7.24 B | 5.67 bpw | ++-------+-------+-------+----------------+---------------+----------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -104,20 +110,20 @@ $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistra | | | | | | | | | +------------+------------+--------+-----------+ | | | | | | | | | | UMA | NONUMA | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ -| llama | 32768 | 2048 / 512 | Disabled | Supported | No | Supported | 33 (32 + 1) | Yes | 176.25 MiB | 326.25 MiB | 4 GiB | 11.16 GiB | +| llama | 32768 | 2048 / 512 | Disabled | Enabled | No | Supported | 33 (32 + 1) | Yes | 176.25 MiB | 326.25 MiB | 4 GiB | 11.16 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ $ # Retrieve the model's metadata via split file, $ # which needs all split files has been downloaded. 
$ gguf-parser --path="~/.cache/lm-studio/models/Qwen/Qwen2-72B-Instruct-GGUF/qwen2-72b-instruct-q6_k-00001-of-00002.gguf" -+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| MODEL | -+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+--------------+---------------+-----------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+--------------+---------------+-----------+------------+----------+ -| 72b.5000B--cmix31-base100w-cpt32k_mega_v1_reflection_4_identity_2_if_ondare_beta0.09_lr_1e-6_bs128_epoch2-72B.qwen2B-bf16-mp8-pp4-lr-1e-6-minlr-1e-9-bs-128-seqlen-4096-step1350 | qwen2 | IQ1_S/Q6_K | true | 59.92 GiB | 72.71 B | 7.08 bpw | -+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+--------------+---------------+-----------+------------+----------+ ++------------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| model | 72b.5000B--cmix31-ba... 
| qwen2 | IQ1_S/Q6_K | true | 59.92 GiB | 72.71 B | 7.08 bpw | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -142,7 +148,7 @@ $ gguf-parser --path="~/.cache/lm-studio/models/Qwen/Qwen2-72B-Instruct-GGUF/qwe | | | | | | | | | +------------+------------+--------+-----------+ | | | | | | | | | | UMA | NONUMA | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ -| qwen2 | 32768 | 2048 / 512 | Disabled | Supported | No | Not Supported | 81 (80 + 1) | Yes | 307.38 MiB | 457.38 MiB | 10 GiB | 73.47 GiB | +| qwen2 | 32768 | 2048 / 512 | Disabled | Enabled | No | Not Supported | 81 (80 + 1) | Yes | 307.38 MiB | 457.38 MiB | 10 GiB | 73.47 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ ``` @@ -151,13 +157,13 @@ $ gguf-parser --path="~/.cache/lm-studio/models/Qwen/Qwen2-72B-Instruct-GGUF/qwe ```shell $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF/resolve/main/Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" -+----------------------------------------------------------------------------------+ -| MODEL | -+----------+-------+--------------+---------------+--------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+----------+-------+--------------+---------------+--------+------------+----------+ -| emozilla | llama | Q4_K/Q3_K_M | true | 21 GiB | 46.70 B | 3.86 bpw | -+----------+-------+--------------+---------------+--------+------------+----------+ ++------------------------------------------------------------------------------------------+ +| METADATA | ++-------+----------+-------+--------------+---------------+--------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+----------+-------+--------------+---------------+--------+------------+----------+ +| model | emozilla | llama | Q4_K/Q3_K_M | true | 21 GiB | 46.70 B | 3.86 bpw | ++-------+----------+-------+--------------+---------------+--------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -188,13 +194,13 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8 $ # Retrieve the model's metadata via split file $ gguf-parser --url="https://huggingface.co/MaziyarPanahi/Meta-Llama-3.1-405B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf" -+----------------------------------------------------------------------------------------------------------------------------+ -| MODEL | -+------------------------------------------------+-------+--------------+---------------+------------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+------------------------------------------------+-------+--------------+---------------+------------+------------+----------+ -| Models Meta Llama Meta Llama 3.1 405B Instruct | llama 
| Q2_K | true | 140.81 GiB | 410.08 B | 2.95 bpw | -+------------------------------------------------+-------+--------------+---------------+------------+------------+----------+ ++-------------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+------------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+------------+------------+----------+ +| model | Models Meta Llama Me... | llama | Q2_K | true | 140.81 GiB | 410.08 B | 2.95 bpw | ++-------+-------------------------+-------+--------------+---------------+------------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -219,7 +225,7 @@ $ gguf-parser --url="https://huggingface.co/MaziyarPanahi/Meta-Llama-3.1-405B-In | | | | | | | | | +------------+------------+---------+------------+ | | | | | | | | | | UMA | NONUMA | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ -| llama | 131072 | 2048 / 512 | Disabled | Supported | No | Supported | 127 (126 + 1) | Yes | 684.53 MiB | 834.53 MiB | 126 GiB | 299.79 GiB | +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Supported | 127 (126 + 1) | Yes | 684.53 MiB | 834.53 MiB | 126 GiB | 299.79 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ ``` @@ -228,13 +234,13 @@ $ gguf-parser --url="https://huggingface.co/MaziyarPanahi/Meta-Llama-3.1-405B-In ```shell $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-model-Q5_K_M.gguf" --hf-mmproj-file="mmproj-model-f16.gguf" -+-----------------------------------------------------------------------------------+ -| MODEL | -+-------+-------+----------------+---------------+----------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+-------+----------------+---------------+----------+------------+----------+ -| model | llama | IQ3_XXS/Q5_K_M | true | 5.33 GiB | 8.03 B | 5.70 bpw | -+-------+-------+----------------+---------------+----------+------------+----------+ ++-------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------+-------+----------------+---------------+----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------+-------+----------------+---------------+----------+------------+----------+ +| model | model | llama | IQ3_XXS/Q5_K_M | true | 5.33 GiB | 8.03 B | 5.70 bpw | ++-------+-------+-------+----------------+---------------+----------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -259,19 +265,19 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode | | | | | | | | | +------------+------------+--------+----------+ | | | | | | | | | | UMA | 
NONUMA | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+----------+ -| llama | 8192 | 2048 / 512 | Disabled | Supported | No | Supported | 33 (32 + 1) | Yes | 184.85 MiB | 334.85 MiB | 1 GiB | 7.78 GiB | +| llama | 8192 | 2048 / 512 | Disabled | Enabled | No | Supported | 33 (32 + 1) | Yes | 184.85 MiB | 334.85 MiB | 1 GiB | 7.78 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+----------+ $ # Retrieve the model's metadata via split file $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" -+---------------------------------------------------------------------------------------------------------+ -| MODEL | -+------------------------------+-------+--------------+---------------+-----------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+------------------------------+-------+--------------+---------------+-----------+------------+----------+ -| Meta-Llama-3.1-405B-Instruct | llama | IQ1_M | true | 88.61 GiB | 410.08 B | 1.86 bpw | -+------------------------------+-------+--------------+---------------+-----------+------------+----------+ ++------------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| model | Meta-Llama-3.1-405B-... 
| llama | IQ1_M | true | 88.61 GiB | 410.08 B | 1.86 bpw | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -296,7 +302,7 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 | | | | | | | | | +------------+------------+---------+------------+ | | | | | | | | | | UMA | NONUMA | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ -| llama | 131072 | 2048 / 512 | Disabled | Supported | No | Supported | 127 (126 + 1) | Yes | 684.53 MiB | 834.53 MiB | 126 GiB | 247.59 GiB | +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Supported | 127 (126 + 1) | Yes | 684.53 MiB | 834.53 MiB | 126 GiB | 247.59 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ ``` @@ -305,13 +311,13 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 ```shell $ gguf-parser --ms-repo="shaowenchen/chinese-alpaca-2-13b-16k-gguf" --ms-file="chinese-alpaca-2-13b-16k.Q5_K.gguf" -+----------------------------------------------------------------------------------+ -| MODEL | -+------+-------+----------------+---------------+----------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+------+-------+----------------+---------------+----------+------------+----------+ -| .. | llama | IQ3_XXS/Q5_K_M | true | 8.76 GiB | 13.25 B | 5.68 bpw | -+------+-------+----------------+---------------+----------+------------+----------+ ++------------------------------------------------------------------------------------------+ +| METADATA | ++-------+------+-------+----------------+---------------+----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+------+-------+----------------+---------------+----------+------------+----------+ +| model | .. 
| llama | IQ3_XXS/Q5_K_M | true | 8.76 GiB | 13.25 B | 5.68 bpw | ++-------+------+-------+----------------+---------------+----------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -336,7 +342,7 @@ $ gguf-parser --ms-repo="shaowenchen/chinese-alpaca-2-13b-16k-gguf" --ms-file="c | | | | | | | | | +------------+------------+-----------+-----------+ | | | | | | | | | | UMA | NONUMA | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ -| llama | 16384 | 2048 / 512 | Disabled | Supported | No | Supported | 41 (40 + 1) | Yes | 154.95 MiB | 304.95 MiB | 12.50 GiB | 22.96 GiB | +| llama | 16384 | 2048 / 512 | Disabled | Enabled | No | Supported | 41 (40 + 1) | Yes | 154.95 MiB | 304.95 MiB | 12.50 GiB | 22.96 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ ``` @@ -345,13 +351,13 @@ $ gguf-parser --ms-repo="shaowenchen/chinese-alpaca-2-13b-16k-gguf" --ms-file="c ```shell $ gguf-parser --ol-model="llama3.1" -+------------------------------------------------------------------------------------------------------+ -| MODEL | -+----------------------------+-------+--------------+---------------+----------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+----------------------------+-------+--------------+---------------+----------+------------+----------+ -| Meta Llama 3.1 8B Instruct | llama | Q4_0 | true | 4.33 GiB | 8.03 B | 4.64 bpw | -+----------------------------+-------+--------------+---------------+----------+------------+----------+ ++-----------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +| model | Meta Llama 3.1 8B In... 
| llama | Q4_0 | true | 4.33 GiB | 8.03 B | 4.64 bpw | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -376,20 +382,20 @@ $ gguf-parser --ol-model="llama3.1" | | | | | | | | | +------------+------------+--------+-----------+ | | | | | | | | | | UMA | NONUMA | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ -| llama | 131072 | 2048 / 512 | Disabled | Supported | No | Supported | 33 (32 + 1) | Yes | 411.62 MiB | 561.62 MiB | 16 GiB | 29.08 GiB | +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Supported | 33 (32 + 1) | Yes | 411.62 MiB | 561.62 MiB | 16 GiB | 29.08 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ $ # Ollama Model includes the preset params and other artifacts, like multimodal projectors or LoRA adapters, $ # you can get the usage of Ollama running by using `--ol-usage` option. $ gguf-parser --ol-model="llama3.1" --ol-usage -+------------------------------------------------------------------------------------------------------+ -| MODEL | -+----------------------------+-------+--------------+---------------+----------+------------+----------+ -| NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+----------------------------+-------+--------------+---------------+----------+------------+----------+ -| Meta Llama 3.1 8B Instruct | llama | Q4_0 | true | 4.33 GiB | 8.03 B | 4.64 bpw | -+----------------------------+-------+--------------+---------------+----------+------------+----------+ ++-----------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +| model | Meta Llama 3.1 8B In... 
| llama | Q4_0 | true | 4.33 GiB | 8.03 B | 4.64 bpw | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +---------------------------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | @@ -414,58 +420,268 @@ $ gguf-parser --ol-model="llama3.1" --ol-usage | | | | | | | | | +------------+------------+------------+----------+ | | | | | | | | | | UMA | NONUMA | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+----------+ -| llama | 2048 | 2048 / 512 | Disabled | Supported | No | Supported | 33 (32 + 1) | Yes | 159.62 MiB | 309.62 MiB | 256.50 MiB | 4.82 GiB | +| llama | 2048 | 2048 / 512 | Disabled | Enabled | No | Supported | 33 (32 + 1) | Yes | 159.62 MiB | 309.62 MiB | 256.50 MiB | 4.82 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+----------+ ``` +#### Parse None Model + +```shell +$ # Parse Multi-Modal Projector +$ gguf-parser --hf-repo="xtuner/llava-llama-3-8b-v1_1-gguf" --hf-file="llava-llama-3-8b-v1_1-mmproj-f16.gguf" ++-----------------------------------------------------------------------------------------------------------------+ +| METADATA | ++-----------+-------------------------+------+--------------+---------------+------------+------------+-----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-----------+-------------------------+------+--------------+---------------+------------+------------+-----------+ +| projector | openai/clip-vit-larg... 
| clip | F16 | true | 595.49 MiB | 311.89 M | 16.02 bpw |
++-----------+-------------------------+------+--------------+---------------+------------+------------+-----------+
+
++----------------------------------------------------------------------+
+| ARCHITECTURE |
++----------------+---------------+--------+------------------+---------+
+| PROJECTOR TYPE | EMBEDDING LEN | LAYERS | FEED FORWARD LEN | ENCODER |
++----------------+---------------+--------+------------------+---------+
+| mlp | 1024 | 23 | 4096 | Vision |
++----------------+---------------+--------+------------------+---------+
+
+$ # Parse LoRA Adapter
+$ gguf-parser --hf-repo="ngxson/test_gguf_lora_adapter" --hf-file="lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"
++---------------------------------------------------------------------------------------------+
+| METADATA |
++---------+------+-------+--------------+---------------+------------+------------+-----------+
+| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
++---------+------+-------+--------------+---------------+------------+------------+-----------+
+| adapter | N/A | llama | F16 | true | 168.08 MiB | 88.12 M | 16.00 bpw |
++---------+------+-------+--------------+---------------+------------+------------+-----------+
+
++---------------------------+
+| ARCHITECTURE |
++--------------+------------+
+| ADAPTER TYPE | LORA ALPHA |
++--------------+------------+
+| lora | 32 |
++--------------+------------+
+
+```
+
 ### Estimate
 
+#### Estimate Across Multiple GPU Devices
+
+Imagine you're preparing to run
+the [hierholzer/Llama-3.1-70B-Instruct-GGUF](https://huggingface.co/hierholzer/Llama-3.1-70B-Instruct-GGUF) model file
+across several hosts in your local network. Some of these hosts are equipped with GPU devices, while others do not have
+any GPU capabilities.
+
+```mermaid
+flowchart TD
+    subgraph host4["Windows 11 (host4)"]
+        ram40(["11GiB RAM remaining"])
+    end
+    subgraph host3["Apple macOS (host3)"]
+        gpu10["Apple M1 Max (6GiB VRAM remaining)"]
+    end
+    subgraph host2["Windows 11 (host2)"]
+        gpu20["NVIDIA 4090 (12GiB VRAM remaining)"]
+    end
+    subgraph host1["Ubuntu (host1)"]
+        gpu30["NVIDIA 4080 0 (15GiB VRAM remaining)"]
+        gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
+    end
+```
+
+##### Single Host Multiple GPU Devices
+
+Let's assume you plan to run the model on `host1` only.
+
+```mermaid
+flowchart TD
+    subgraph host1["Ubuntu (host1)"]
+        gpu30["NVIDIA 4080 0 (15GiB VRAM remaining)"]
+        gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
+    end
+```
+
+```shell
+$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="15,10"
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ESTIMATE |
++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+------------------------+-----------------------+
+| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 |
+| | | | | | | | | +------------+------------+------------+-----------+-----------+-----------+
+| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+-----------+-----------+-----------+
+| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 270.08 MiB | 420.08 MiB | 192.52 MiB | 24.34 GiB | 16.53 GiB | 16.78 GiB |
++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+-----------+-----------+-----------+
+
+```
+
+Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` results in the
+following resource consumption:
+
+| Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result |
+|-----------------------|---------------|-------------|----------------|--------------|------------|
+| host1 | | 420.08 MiB | | | :thumbsup: |
+| host1 (NVIDIA 4080 0) | | | 15 GiB | 24.34 GiB | |
+| host1 (NVIDIA 4080 1) | | | 10 GiB | 16.78 GiB | |
+
+It appears that running the model on `host1` alone is not feasible.
+
+##### Multiple Hosts Multiple GPU Devices
+
+Next, let's consider the scenario where you plan to run the model on `host4`, while offloading all layers to `host1`,
+`host2`, and `host3`.
+ +```mermaid +flowchart TD + host4 -->|TCP| gpu10 + host4 -->|TCP| gpu20 + host4 -->|TCP| gpu30 + host4 -->|TCP| gpu31 + + subgraph host4["Windows 11 (host4)"] + ram40(["11GiB RAM remaining"]) + end + subgraph host3["Apple macOS (host3)"] + gpu10["Apple M1 Max (6GiB VRAM remaining)"] + end + subgraph host2["Windows 11 (host2)"] + gpu20["NVIDIA 4090 (12GiB VRAM remaining)"] + end + subgraph host1["Ubuntu (host1)"] + gpu30["NVIDIA 4080 0 (15GiB VRAM remaining)"] + gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"] + end +``` + +```shell +gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="15,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052" ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+----------------------+-----------------------+---------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 | +| | | | | | | | | +------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+ +| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+ +| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 302.08 MiB | 452.08 MiB | 14.69 GiB | 14.93 GiB | 9.95 GiB | 10.20 GiB | 11.37 GiB | 11.61 GiB | 6.61 GiB | 6.86 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+----------+-----------+-----------+-----------+----------+----------+ + +``` + +According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host4` results in the +following resource consumption: + +| Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | +|-----------------------|---------------|-------------|----------------|--------------|------------| +| host4 | 11 GiB | 452.08 MiB | | | :thumbsup: | +| host1 (NVIDIA 4080 0) | | | 15 GiB | 14.93 GiB | :thumbsup: | +| host1 (NVIDIA 4080 1) | | | 10 GiB | 10.20 GiB | | +| host2 (NVIDIA 4090) | | | 12 GiB | 11.61 GiB | :thumbsup: | +| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 6.86 GiB | | + +It seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`. + +We should consider a different approach: running the model on `host3` while offloading all layers to `host1`, `host2`, +and `host4`. 
+
+```mermaid
+flowchart TD
+    host3 -->|TCP| ram40
+    host3 -->|TCP| gpu20
+    host3 -->|TCP| gpu30
+    host3 -->|TCP| gpu31
+
+    subgraph host4["Windows 11 (host4)"]
+        ram40(["11GiB RAM remaining"])
+    end
+    subgraph host3["Apple macOS (host3)"]
+        gpu10["Apple M1 Max (6GiB VRAM remaining)"]
+    end
+    subgraph host2["Windows 11 (host2)"]
+        gpu20["NVIDIA 4090 (12GiB VRAM remaining)"]
+    end
+    subgraph host1["Ubuntu (host1)"]
+        gpu30["NVIDIA 4080 0 (15GiB VRAM remaining)"]
+        gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
+    end
+```
+
+```shell
+gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="6,11,12,15,10" --rpc="host4:50052,host2:50052,host1:50052,host1:50053"
++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| ESTIMATE |
++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+----------------------+---------------------+---------------------+-----------------------+---------------------+
+| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | VRAM 3 | VRAM 4 |
+| | | | | | | | | +------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+
+| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+
+| llama | 1024 | 2048 / 512 | Disabled | Enabled | No | Supported | 81 (80 + 1) | Yes | 318.08 MiB | 468.08 MiB | 36.52 MiB | 5.92 GiB | 9.04 GiB | 9.29 GiB | 9.04 GiB | 9.29 GiB | 11.82 GiB | 12.07 GiB | 8.03 GiB | 8.27 GiB |
++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+-----------+----------+----------+----------+----------+----------+-----------+-----------+----------+----------+
+
+```
+
+According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host3` results in the
+following resource consumption:
+
+| Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result |
+|-----------------------|---------------|-------------|----------------|--------------|------------|
+| host3 (Apple M1 Max) | ENOUGH | 318.08 MiB | | | :thumbsup: |
+| host3 (Apple M1 Max) | | | 6 GiB | 36.52 MiB | :thumbsup: |
+| host4 | 11 GiB | 9.29 GiB | | | :thumbsup: |
+| host2 (NVIDIA 4090) | | | 12 GiB | 9.29 GiB | :thumbsup: |
+| host1 (NVIDIA 4080 0) | | | 15 GiB | 12.07 GiB | :thumbsup: |
+| host1 (NVIDIA 4080 1) | | | 10 GiB | 8.27 GiB | :thumbsup: |
+
+Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`.
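+
+If you prefer to feed these figures into your own placement logic instead of reading the tables, the same estimate can
+also be emitted as JSON via `--json`. The sketch below is illustrative only: it re-runs the last estimate with the
+flags already shown above and assumes the JSON output carries the same per-device RAM/VRAM figures as the table.
+
+```shell
+# Illustrative sketch: re-run the multi-device estimate, but emit JSON for scripting
+# instead of an ASCII table. All flags are gguf-parser options used elsewhere in this
+# guide; adjust the --tensor-split / --rpc lists to match your own topology.
+gguf-parser \
+  --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" \
+  --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" \
+  --ctx-size=1024 \
+  --tensor-split="6,11,12,15,10" \
+  --rpc="host4:50052,host2:50052,host1:50052,host1:50053" \
+  --skip-metadata --skip-architecture --skip-tokenizer \
+  --json
+```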
+ #### Full Layers Offload (default) ```shell -$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer -+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | +------------+------------+-----------+-----------+ -| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ -| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 33 (32 + 1) | Yes | 174.54 MiB | 324.54 MiB | 24.94 GiB | 27.41 GiB | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+----------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | +------------+------------+---------+------------+ +| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Supported | 127 (126 + 1) | Yes | 684.53 MiB | 834.53 MiB | 126 GiB | 247.59 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ ``` #### Zero Layers Offload ```shell -$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers=0 -+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------------------+-------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY 
| DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | +-----------+-----------+--------+----------+ -| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------+-----------+--------+----------+ -| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 0 | No | 25.11 GiB | 25.25 GiB | 0 B | 2.39 GiB | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------+-----------+--------+----------+ +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers=0 ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+--------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | +------------+------------+--------+-----------+ +| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Supported | 0 | No | 126.64 GiB | 126.78 GiB | 0 B | 33.34 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ ``` #### Specific Layers Offload ```shell -$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers=10 -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------------------+----------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | +-----------+-----------+----------+-----------+ -| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------+-----------+----------+-----------+ -| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 10 | No | 17.38 GiB | 17.52 GiB | 7.73 GiB | 10.19 GiB | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------+-----------+----------+-----------+ +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" 
--hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers=10 ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+--------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | +------------+------------+--------+-----------+ +| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Supported | 10 | No | 116.64 GiB | 116.78 GiB | 10 GiB | 50.39 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+--------+-----------+ ``` @@ -476,16 +692,16 @@ By default, the context size retrieved from the model's metadata. Use `--ctx-size` to specify the context size. ```shell -$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --ctx-size=4096 -+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | +------------+------------+-----------+-----------+ -| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ -| llama | 4096 | 2048 / 512 | Disabled | Not Supported | No | Supported | 33 (32 + 1) | Yes | 118.54 MiB | 268.54 MiB | 21.44 GiB | 21.99 GiB | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=4096 ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+----------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | 
DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | +------------+------------+----------+-----------+ +| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+----------+-----------+ +| llama | 4096 | 2048 / 512 | Disabled | Enabled | No | Supported | 127 (126 + 1) | Yes | 436.53 MiB | 586.53 MiB | 3.94 GiB | 93.31 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+----------+-----------+ ``` @@ -501,16 +717,16 @@ Please note that not all models support Flash Attention, if the model does not s Disabled" even if you enable it. ```shell -$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --flash-attention -+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | +------------+------------+-----------+-----------+ -| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ -| llama | 32768 | 2048 / 512 | Enabled | Not Supported | No | Supported | 33 (32 + 1) | Yes | 158.54 MiB | 308.54 MiB | 24.94 GiB | 25.43 GiB | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --flash-attention ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+----------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | +------------+------------+---------+------------+ +| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ +| llama | 131072 | 2048 / 512 | Enabled | Enabled | No | Supported | 127 (126 + 1) | Yes | 620.53 MiB | 770.53 MiB | 126 GiB | 215.70 GiB | 
++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ ``` @@ -527,16 +743,16 @@ Please note that some models require loading the whole weight into memory, if th LOAD" shows "Not Supported". ```shell -$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers=10 --no-mmap -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------------------+----------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | +-----------+-----------+----------+-----------+ -| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------+-----------+----------+-----------+ -| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 10 | No | 17.38 GiB | 17.52 GiB | 7.73 GiB | 10.19 GiB | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-----------+-----------+----------+-----------+ +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --no-mmap ++------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+-------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | +------------+------------+------------+------------+ +| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+------------+ +| llama | 131072 | 2048 / 512 | Disabled | Disabled | No | Supported | 127 (126 + 1) | Yes | 684.53 MiB | 834.53 MiB | 213.97 GiB | 247.59 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+------------+------------+ ``` @@ -545,30 +761,42 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- Use `--gpu-layers-step` to get the proper offload layers number when the model is too large to fit into the GPUs memory. 
```shell -$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers-step=5 -+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+-------------------------+-----------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | +------------+------------+-----------+-----------+ -| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ -| llama | 32768 | 2048 / 512 | Disabled | Not Supported | No | Supported | 0 | No | 25.11 GiB | 25.25 GiB | 0 B | 2.39 GiB | -| | | | | | | +----------------+ +------------+------------+-----------+-----------+ -| | | | | | | | 5 | | 21.24 GiB | 21.39 GiB | 3.86 GiB | 6.33 GiB | -| | | | | | | +----------------+ +------------+------------+-----------+-----------+ -| | | | | | | | 10 | | 17.38 GiB | 17.52 GiB | 7.73 GiB | 10.19 GiB | -| | | | | | | +----------------+ +------------+------------+-----------+-----------+ -| | | | | | | | 15 | | 13.51 GiB | 13.66 GiB | 11.59 GiB | 14.06 GiB | -| | | | | | | +----------------+ +------------+------------+-----------+-----------+ -| | | | | | | | 20 | | 9.65 GiB | 9.79 GiB | 15.46 GiB | 17.92 GiB | -| | | | | | | +----------------+ +------------+------------+-----------+-----------+ -| | | | | | | | 25 | | 5.78 GiB | 5.93 GiB | 19.32 GiB | 21.79 GiB | -| | | | | | | +----------------+ +------------+------------+-----------+-----------+ -| | | | | | | | 30 | | 1.92 GiB | 2.06 GiB | 23.19 GiB | 25.65 GiB | -| | | | | | | +----------------+----------------+------------+------------+-----------+-----------+ -| | | | | | | | 33 (32 + 1) | Yes | 174.54 MiB | 324.54 MiB | 24.94 GiB | 27.41 GiB | -+-------+--------------+--------------------+-----------------+---------------+----------------+---------------+----------------+----------------+------------+------------+-----------+-----------+ +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers-step=10 ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+-------------------------+----------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | +------------+------------+---------+------------+ +| | | | | | | | | | UMA | NONUMA | UMA | NONUMA | 
++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Supported | 0 | No | 126.64 GiB | 126.78 GiB | 0 B | 33.34 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 10 | | 116.64 GiB | 116.78 GiB | 10 GiB | 50.39 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 20 | | 106.64 GiB | 106.78 GiB | 20 GiB | 67.16 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 30 | | 96.64 GiB | 96.78 GiB | 30 GiB | 83.93 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 40 | | 86.64 GiB | 86.78 GiB | 40 GiB | 100.69 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 50 | | 76.64 GiB | 76.78 GiB | 50 GiB | 117.46 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 60 | | 66.64 GiB | 66.78 GiB | 60 GiB | 134.23 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 70 | | 56.64 GiB | 56.78 GiB | 70 GiB | 151 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 80 | | 46.64 GiB | 46.78 GiB | 80 GiB | 167.77 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 90 | | 36.64 GiB | 36.78 GiB | 90 GiB | 184.54 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 100 | | 26.64 GiB | 26.78 GiB | 100 GiB | 201.31 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 110 | | 16.64 GiB | 16.78 GiB | 110 GiB | 218.08 GiB | +| | | | | | | +----------------+ +------------+------------+---------+------------+ +| | | | | | | | 120 | | 6.64 GiB | 6.78 GiB | 120 GiB | 235.64 GiB | +| | | | | | | +----------------+----------------+------------+------------+---------+------------+ +| | | | | | | | 127 (126 + 1) | Yes | 684.53 MiB | 834.53 MiB | 126 GiB | 247.59 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+---------------+----------------+----------------+------------+------------+---------+------------+ ``` diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md index daeaaf8..80b2fa2 100644 --- a/cmd/gguf-parser/README.md +++ b/cmd/gguf-parser/README.md @@ -37,8 +37,9 @@ GLOBAL OPTIONS: --no-mmap Specify disabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false) --parallel-size value, --parallel value, --np value Specify the number of parallel sequences to decode, which is used to estimate the usage. (default: 1) --platform-footprint value Specify the platform footprint(RAM,VRAM) of running host in MiB, which is used to estimate the NonUMA usage, default is 150,250. Different platform always gets different RAM and VRAM footprints, for example, within CUDA, 'cudaMemGetInfo' would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. 
(default: "150,250") + --rpc value Specify the RPC servers, which is used to estimate the usage, it is a comma-separated list of host:port. Woks with --tensor-split. --split-mode value, --sm value Specify how to split the model across multiple devices, which is used to estimate the usage, select from [layer, row, none]. Since gguf-parser always estimates the usage of VRAM, "none" is meaningless here, keep for compatibility. (default: "layer") - --tensor-split value, --ts value Specify the fraction of the model to offload to each device, which is used to estimate the usage, it is a comma-separated list of integer. Since gguf-parser cannot recognize the host GPU devices or RPC servers, must explicitly set --tensor-split to indicate how many devices are used. + --tensor-split value, --ts value Specify the fraction of the model to offload to each device, which is used to estimate the usage, it is a comma-separated list of integer. Since gguf-parser cannot recognize the host GPU devices or RPC servers, must explicitly set --tensor-split to indicate how many devices are used. To declare the devices belong to RPC servers, set --rpc please. --ubatch-size value, --ub value Specify the physical maximum batch size, which is used to estimate the usage. (default: 512) Load @@ -95,10 +96,10 @@ GLOBAL OPTIONS: --json-pretty Works with --json, to output pretty format JSON. (default: true) --raw Output the GGUF file information as JSON only, skip anything. (default: false) --raw-output value Works with --raw, to save the result to the file - --skip-architecture Skip to display architecture metadata. (default: false) - --skip-estimate Skip to estimate. (default: false) - --skip-model Skip to display model metadata. (default: false) - --skip-tokenizer Skip to display tokenizer metadata. (default: false) + --skip-architecture Skip to display architecture. (default: false) + --skip-estimate Skip to estimate. By default, gguf-parser always estimates the file which types with "model". (default: false) + --skip-metadata Skip to display metadata. (default: false) + --skip-tokenizer Skip to display tokenizer. By default, gguf-parser always displays the tokenizer of the file which types with "model". (default: false) ``` diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 20f2390..5d83305 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -20,6 +20,7 @@ import ( "github.com/urfave/cli/v2" . "github.com/gpustack/gguf-parser-go" // nolint: stylecheck + "net" ) var Version = "v0.0.0" @@ -412,7 +413,18 @@ func main() { "which is used to estimate the usage, " + "it is a comma-separated list of integer. " + "Since gguf-parser cannot recognize the host GPU devices or RPC servers, " + - "must explicitly set --tensor-split to indicate how many devices are used.", + "must explicitly set --tensor-split to indicate how many devices are used. " + + "To declare the devices belong to RPC servers, set --rpc please.", + }, + &cli.StringFlag{ + Destination: &rpcServers, + Value: rpcServers, + Category: "Estimate", + Name: "rpc", + Usage: "Specify the RPC servers, " + + "which is used to estimate the usage, " + + "it is a comma-separated list of host:port. 
" + + "Woks with --tensor-split.", }, &cli.UintFlag{ Destination: &mainGPU, @@ -490,32 +502,34 @@ func main() { Usage: "Works with --raw, to save the result to the file", }, &cli.BoolFlag{ - Destination: &skipModel, - Value: skipModel, + Destination: &skipMetadata, + Value: skipMetadata, Category: "Output", - Name: "skip-model", - Usage: "Skip to display model metadata.", + Name: "skip-metadata", + Usage: "Skip to display metadata.", }, &cli.BoolFlag{ Destination: &skipArchitecture, Value: skipArchitecture, Category: "Output", Name: "skip-architecture", - Usage: "Skip to display architecture metadata.", + Usage: "Skip to display architecture.", }, &cli.BoolFlag{ Destination: &skipTokenizer, Value: skipTokenizer, Category: "Output", Name: "skip-tokenizer", - Usage: "Skip to display tokenizer metadata.", + Usage: "Skip to display tokenizer. " + + "By default, gguf-parser always displays the tokenizer of the file which types with \"model\".", }, &cli.BoolFlag{ Destination: &skipEstimate, Value: skipEstimate, Category: "Output", Name: "skip-estimate", - Usage: "Skip to estimate.", + Usage: "Skip to estimate. " + + "By default, gguf-parser always estimates the file which types with \"model\".", }, &cli.BoolFlag{ Destination: &inMib, @@ -594,6 +608,7 @@ var ( splitMode = "layer" tensorSplit string mainGPU uint + rpcServers string platformFootprint = "150,250" noMMap bool offloadLayers = -1 @@ -602,7 +617,7 @@ var ( // output options raw bool rawOutput string - skipModel bool + skipMetadata bool skipArchitecture bool skipTokenizer bool skipEstimate bool @@ -689,12 +704,13 @@ func mainAction(c *cli.Context) error { eopts = append(eopts, WithSplitMode(LLaMACppSplitModeLayer)) } if tensorSplit != "" { - ss := strings.Split(tensorSplit, ",") + tss := strings.Split(tensorSplit, ",") var vs float64 - vv := make([]float64, len(ss)) - vf := make([]float64, len(ss)) - for i, s := range ss { - v, err := strconv.ParseFloat(strings.TrimSpace(s), 64) + vv := make([]float64, len(tss)) + vf := make([]float64, len(tss)) + for i, s := range tss { + s = strings.TrimSpace(s) + v, err := strconv.ParseFloat(s, 64) if err != nil { return errors.New("--tensor-split has invalid integer") } @@ -710,11 +726,26 @@ func mainAction(c *cli.Context) error { } else { return errors.New("--main-gpu must be less than item size of --tensor-split") } + if rpcServers != "" { + rss := strings.Split(rpcServers, ",") + if len(rss) > len(tss) { + return errors.New("--rpc has more items than --tensor-split") + } + rpc := make([]string, len(rss)) + for i, s := range rss { + s = strings.TrimSpace(s) + if _, _, err := net.SplitHostPort(s); err != nil { + return errors.New("--rpc has invalid host:port") + } + rpc[i] = s + } + eopts = append(eopts, WithRPCServers(rpc)) + } } // Parse GGUF file. - var gf, mmpgf, dftgf *GGUFFile + var gf, projgf, dftgf *GGUFFile { var err error @@ -761,7 +792,7 @@ func mainAction(c *cli.Context) error { { mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.projector$`)) if len(mls) > 0 { - mmpgf, err = ParseGGUFFileRemote(ctx, mls[len(mls)-1].BlobURL().String(), ropts...) + projgf, err = ParseGGUFFileRemote(ctx, mls[len(mls)-1].BlobURL().String(), ropts...) } } } @@ -770,16 +801,16 @@ func mainAction(c *cli.Context) error { return fmt.Errorf("failed to parse GGUF file: %w", err) } - // MultimodalProjector model. + // Projector model. switch { case mmprojPath != "": - mmpgf, err = ParseGGUFFile(mmprojPath, ropts...) + projgf, err = ParseGGUFFile(mmprojPath, ropts...) 
case mmprojUrl != "": - mmpgf, err = ParseGGUFFileRemote(ctx, mmprojUrl, ropts...) + projgf, err = ParseGGUFFileRemote(ctx, mmprojUrl, ropts...) case hfRepo != "" && hfMMProjFile != "": - mmpgf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfMMProjFile, ropts...) + projgf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfMMProjFile, ropts...) case msRepo != "" && msMMProjFile != "": - mmpgf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msMMProjFile, ropts...) + projgf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msMMProjFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse multimodal projector GGUF file: %w", err) @@ -822,27 +853,21 @@ func mainAction(c *cli.Context) error { // Otherwise, display the metadata and estimate the usage. var ( - m GGUFModelMetadata - a GGUFArchitectureMetadata - t GGUFTokenizerMetadata + m GGUFMetadata + a GGUFArchitecture + t GGUFTokenizer e LLaMACppUsageEstimate ) - if !skipModel { - m = gf.Model() + if !skipMetadata { + m = gf.Metadata() } - if !skipArchitecture && !skipEstimate { + if !skipArchitecture { a = gf.Architecture() } - if !skipTokenizer && !skipEstimate { + if !skipTokenizer { t = gf.Tokenizer() } if !skipEstimate { - if mmpgf != nil { - meopts := eopts[:len(eopts):len(eopts)] - me := mmpgf.EstimateLLaMACppUsage(meopts...) - eopts = append(eopts, WithMultimodalProjector(&me)) - } - if dftgf != nil { deopts := eopts[:len(eopts):len(eopts)] if offloadLayersDraft >= 0 { @@ -852,6 +877,12 @@ func mainAction(c *cli.Context) error { eopts = append(eopts, WithDrafter(&de)) } + if projgf != nil { + peopts := eopts[:len(eopts):len(eopts)] + me := projgf.EstimateLLaMACppUsage(peopts...) + eopts = append(eopts, WithProjector(&me)) + } + deopts := eopts[:len(eopts):len(eopts)] if offloadLayers >= 0 { deopts = append(deopts, WithOffloadLayers(uint64(offloadLayers))) @@ -881,8 +912,8 @@ func mainAction(c *cli.Context) error { if inJson { o := map[string]any{} - if !skipModel { - o["model"] = m + if !skipMetadata { + o["metadata"] = m } if !skipArchitecture { o["architecture"] = a @@ -890,35 +921,33 @@ func mainAction(c *cli.Context) error { if !skipTokenizer && t.Model != "" { o["tokenizer"] = t } - if !skipEstimate { + if !skipEstimate && e.Type == "model" { es := e.Summarize(mmap, platformRAM, platformVRAM) - if e.Architecture != "clip" { - switch { - case offloadLayersStep > e.OffloadLayers: - offloadLayersStep = e.OffloadLayers - case offloadLayersStep <= 0: - offloadLayersStep = e.OffloadLayers + switch { + case offloadLayersStep > e.OffloadLayers: + offloadLayersStep = e.OffloadLayers + case offloadLayersStep <= 0: + offloadLayersStep = e.OffloadLayers + } + if offloadLayersStep < e.OffloadLayers { + cnt := e.OffloadLayers/offloadLayersStep + 1 + if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { + cnt++ } - if offloadLayersStep < e.OffloadLayers { - cnt := e.OffloadLayers/offloadLayersStep + 1 - if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { - cnt++ - } - ess := make([]LLaMACppUsageEstimateMemorySummary, cnt) - var wg sync.WaitGroup - for i := 0; i < cap(ess); i++ { - wg.Add(1) - go func(i int) { - defer wg.Done() - eopts := eopts[:len(eopts):len(eopts)] - eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) - ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) - }(i) - } - wg.Wait() - ess[cap(ess)-1] = es.Memory[0] - es.Memory = ess + ess := make([]LLaMACppUsageEstimateMemorySummary, cnt) + var wg sync.WaitGroup + for i := 0; i < cap(ess); i++ 
{ + wg.Add(1) + go func(i int) { + defer wg.Done() + eopts := eopts[:len(eopts):len(eopts)] + eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) + ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) + }(i) } + wg.Wait() + ess[cap(ess)-1] = es.Memory[0] + es.Memory = ess } o["estimate"] = es } @@ -936,11 +965,12 @@ func mainAction(c *cli.Context) error { InMiBytes = inMib - if !skipModel { + if !skipMetadata { tprint( - "MODEL", + "Metadata", [][]any{ { + "Type", "Name", "Arch", "Quantization", @@ -952,7 +982,8 @@ func mainAction(c *cli.Context) error { }, [][]any{ { - m.Name, + m.Type, + sprintf(tenary(len(m.Name) == 0, "N/A", tenary(len([]rune(m.Name)) <= 20, m.Name, string([]rune(m.Name)[:20])+"..."))), m.Architecture, sprintf(m.FileType), sprintf(m.LittleEndian), @@ -968,7 +999,37 @@ func mainAction(c *cli.Context) error { hd []any bd []any ) - if a.Architecture != "clip" { + switch a.Type { + case "projector": + hd = []any{ + "Projector Type", + "Embedding Len", + "Layers", + "Feed Forward Len", + "Encoder", + } + bd = []any{ + sprintf(a.ClipProjectorType), + sprintf(a.EmbeddingLength), + sprintf(a.BlockCount), + sprintf(a.FeedForwardLength), + sprintf(tenary(a.ClipHasTextEncoder, tenary(a.ClipHasVisionEncoder, "Text & Vision", "Text"), tenary(a.ClipHasVisionEncoder, "Vision", "N/A"))), + } + case "adapter": + hd = []any{ + "Adapter Type", + } + bd = []any{ + sprintf(a.AdapterType), + } + if a.AdapterType == "lora" { + hd = append(hd, "LoRA Alpha") + bd = append(bd, sprintf(a.AdapterLoRAAlpha)) + } else { + hd = append(hd, "ControlVector Layers") + bd = append(bd, sprintf(a.AdapterControlVectorLayerCount)) + } + default: hd = []any{ "Max Context Len", "Embedding Len", @@ -991,21 +1052,6 @@ func mainAction(c *cli.Context) error { sprintf(a.ExpertCount), sprintf(a.VocabularyLength), } - } else { - hd = []any{ - "Embedding Len", - "Layers", - "Feed Forward Len", - "Encoder", - "LLaVA MultimodalProjector", - } - bd = []any{ - sprintf(a.EmbeddingLength), - sprintf(a.BlockCount), - sprintf(a.FeedForwardLength), - sprintf(tenary(a.ClipHasTextEncoder, tenary(a.ClipHasVisionEncoder, "Text & Vision", "Text"), tenary(a.ClipHasVisionEncoder, "Vision", "N/A"))), - sprintf(tenary(a.ClipHasLLaVaProjector, a.ClipProjectorType, "N/A")), - } } tprint( "ARCHITECTURE", @@ -1048,128 +1094,90 @@ func mainAction(c *cli.Context) error { }) } - if !skipEstimate { + if !skipEstimate && e.Type == "model" { var ( hds [][]any bds [][]any ) es := e.Summarize(mmap, platformRAM, platformVRAM) - if e.Architecture != "clip" { - hds = [][]any{ - { - "Arch", - "Context Size", - "Batch Size (L / P)", - "Flash Attention", - "MMap Load", - "Embedding Only", - "Distributable", - "Offload Layers", - "Full Offloaded", - "RAM", - "RAM", - }, - { - "Arch", - "Context Size", - "Batch Size (L / P)", - "Flash Attention", - "MMap Load", - "Embedding Only", - "Distributable", - "Offload Layers", - "Full Offloaded", - "UMA", - "NonUMA", - }, - } - for i := range es.Memory[0].VRAMs { - hds[0] = append(hds[0], fmt.Sprintf("VRAM %d", i), fmt.Sprintf("VRAM %d", i)) - hds[1] = append(hds[1], "UMA", "NonUMA") - } - - switch { - case offloadLayersStep > e.OffloadLayers: - offloadLayersStep = e.OffloadLayers - case offloadLayersStep <= 0: - offloadLayersStep = e.OffloadLayers - } - if offloadLayersStep < e.OffloadLayers { - cnt := e.OffloadLayers/offloadLayersStep + 1 - if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { - cnt++ - } - ess := 
make([]LLaMACppUsageEstimateMemorySummary, cnt) - var wg sync.WaitGroup - for i := 0; i < cap(ess); i++ { - wg.Add(1) - go func(i int) { - defer wg.Done() - eopts := eopts[:len(eopts):len(eopts)] - eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) - ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) - }(i) - } - wg.Wait() - ess[cap(ess)-1] = es.Memory[0] - es.Memory = ess - } + hds = [][]any{ + { + "Arch", + "Context Size", + "Batch Size (L / P)", + "Flash Attention", + "MMap Load", + "Embedding Only", + "Distributable", + "Offload Layers", + "Full Offloaded", + "RAM", + "RAM", + }, + { + "Arch", + "Context Size", + "Batch Size (L / P)", + "Flash Attention", + "MMap Load", + "Embedding Only", + "Distributable", + "Offload Layers", + "Full Offloaded", + "UMA", + "NonUMA", + }, + } + for i := range es.Memory[0].VRAMs { + hds[0] = append(hds[0], fmt.Sprintf("VRAM %d", i), fmt.Sprintf("VRAM %d", i)) + hds[1] = append(hds[1], "UMA", "NonUMA") + } - bds = make([][]any, len(es.Memory)) - for i := range es.Memory { - bds[i] = []any{ - sprintf(es.Architecture), - sprintf(es.ContextSize), - sprintf("%d / %d", es.LogicalBatchSize, es.PhysicalBatchSize), - sprintf(tenary(es.FlashAttention, "Enabled", "Disabled")), - sprintf(tenary(!es.NoMMap, "Supported", "Not Supported")), - sprintf(tenary(es.EmbeddingOnly, "Yes", "No")), - sprintf(tenary(es.Distributable, "Supported", "Not Supported")), - sprintf(tenary(es.Memory[i].FullOffloaded, sprintf("%d (%d + 1)", - es.Memory[i].OffloadLayers, es.Memory[i].OffloadLayers-1), es.Memory[i].OffloadLayers)), - sprintf(tenary(es.Memory[i].FullOffloaded, "Yes", "No")), - sprintf(es.Memory[i].RAM.UMA), - sprintf(es.Memory[i].RAM.NonUMA), - } - for _, v := range es.Memory[i].VRAMs { - bds[i] = append(bds[i], - sprintf(v.UMA), - sprintf(v.NonUMA)) - } - } - } else { - hds = [][]any{ - { - "Arch", - "Offload Layers", - "Full Offloaded", - "RAM", - "RAM", - }, - { - "Arch", - "Offload Layers", - "Full Offloaded", - "UMA", - "NonUMA", - }, + switch { + case offloadLayersStep > e.OffloadLayers: + offloadLayersStep = e.OffloadLayers + case offloadLayersStep <= 0: + offloadLayersStep = e.OffloadLayers + } + if offloadLayersStep < e.OffloadLayers { + cnt := e.OffloadLayers/offloadLayersStep + 1 + if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { + cnt++ } - for i := range es.Memory[0].VRAMs { - hds[0] = append(hds[0], fmt.Sprintf("VRAM %d", i), fmt.Sprintf("VRAM %d", i)) - hds[1] = append(hds[1], "UMA", "NonUMA") + ess := make([]LLaMACppUsageEstimateMemorySummary, cnt) + var wg sync.WaitGroup + for i := 0; i < cap(ess); i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + eopts := eopts[:len(eopts):len(eopts)] + eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) + ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) + }(i) } + wg.Wait() + ess[cap(ess)-1] = es.Memory[0] + es.Memory = ess + } - bds = [][]any{ - { - sprintf(es.Architecture), - sprintf(es.Memory[0].OffloadLayers), - sprintf(tenary(es.Memory[0].FullOffloaded, "Yes", "No")), - sprintf(es.Memory[0].RAM.UMA), - sprintf(es.Memory[0].RAM.NonUMA), - }, + bds = make([][]any, len(es.Memory)) + for i := range es.Memory { + bds[i] = []any{ + sprintf(es.Architecture), + sprintf(es.ContextSize), + sprintf("%d / %d", es.LogicalBatchSize, es.PhysicalBatchSize), + sprintf(tenary(flashAttention, tenary(es.FlashAttention, "Enabled", "Not Supported"), "Disabled")), + sprintf(tenary(mmap, 
tenary(!es.NoMMap, "Enabled", "Not Supported"), "Disabled")), + sprintf(tenary(es.EmbeddingOnly, "Yes", "No")), + sprintf(tenary(es.Distributable, "Supported", "Not Supported")), + sprintf(tenary(es.Memory[i].FullOffloaded, sprintf("%d (%d + 1)", + es.Memory[i].OffloadLayers, es.Memory[i].OffloadLayers-1), es.Memory[i].OffloadLayers)), + sprintf(tenary(es.Memory[i].FullOffloaded, "Yes", "No")), + sprintf(es.Memory[i].RAM.UMA), + sprintf(es.Memory[i].RAM.NonUMA), } - for _, v := range es.Memory[0].VRAMs { - bds[0] = append(bds[0], + for _, v := range es.Memory[i].VRAMs { + bds[i] = append(bds[i], sprintf(v.UMA), sprintf(v.NonUMA)) } diff --git a/file_architecture.go b/file_architecture.go index 87126c7..4052021 100644 --- a/file_architecture.go +++ b/file_architecture.go @@ -1,9 +1,12 @@ package gguf_parser -// GGUFArchitectureMetadata represents the architecture metadata of a GGUF file. -type GGUFArchitectureMetadata struct { +// GGUFArchitecture represents the architecture metadata of a GGUF file. +type GGUFArchitecture struct { /* Basic */ + // Type describes the type of the file, + // default is "model". + Type string `json:"type"` // Architecture describes what architecture this model implements. // // All lowercase ASCII, with only [a-z0-9]+ characters allowed. @@ -14,13 +17,13 @@ type GGUFArchitectureMetadata struct { // Architectures, like RWKV, // that are not reliant on transformer-style attention may be able to handle larger inputs, // but this is not guaranteed. - MaximumContextLength uint64 `json:"maximumContextLength"` + MaximumContextLength uint64 `json:"maximumContextLength,omitempty"` // EmbeddingLength(n_embd) is the length of the embedding layer. - EmbeddingLength uint64 `json:"embeddingLength"` + EmbeddingLength uint64 `json:"embeddingLength,omitempty"` // BlockCount(n_layer) is the number of blocks of attention and feed-forward layers, // i.e. the bulk of the LLM. // This does not include the input or embedding layers. - BlockCount uint64 `json:"blockCount"` + BlockCount uint64 `json:"blockCount,omitempty"` // FeedForwardLength(n_ff) is the length of the feed-forward layer. FeedForwardLength uint64 `json:"feedForwardLength,omitempty"` // ExpertFeedForwardLength(expert_feed_forward_length) is the length of the feed-forward layer in the expert model. @@ -51,11 +54,11 @@ type GGUFArchitectureMetadata struct { // AttentionKeyLength(n_embd_head_k) is the size of a key head. // // Defaults to `EmbeddingLength / AttentionHeadCount`. - AttentionKeyLength uint32 `json:"attentionKeyLength"` + AttentionKeyLength uint32 `json:"attentionKeyLength,omitempty"` // AttentionValueLength(n_embd_head_v) is the size of a value head. // // Defaults to `EmbeddingLength / AttentionHeadCount`. - AttentionValueLength uint32 `json:"attentionValueLength"` + AttentionValueLength uint32 `json:"attentionValueLength,omitempty"` // AttentionCausal is true if the attention is causal. AttentionCausal bool `json:"attentionCausal,omitempty"` // RoPEDimensionCount is the number of dimensions in the RoPE(Rotary Positional Encoding). @@ -81,7 +84,7 @@ type GGUFArchitectureMetadata struct { // VocabularyLength is the size of the vocabulary. // // VocabularyLength is the same as the tokenizer's token size. - VocabularyLength uint64 `json:"vocabularyLength"` + VocabularyLength uint64 `json:"vocabularyLength,omitempty"` /* Appendix */ @@ -100,35 +103,67 @@ type GGUFArchitectureMetadata struct { // // Only used when Architecture is "clip". 
ClipHasVisionEncoder bool `json:"clipHasVisionEncoder,omitempty"` - // ClipHasLLaVaProjector indicates whether the clip model has LLaVa projector or not. - // - // Only used when Architecture is "clip". - ClipHasLLaVaProjector bool `json:"clipHasLLaVaProjector,omitempty"` // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". ClipProjectorType string `json:"clipProjectorType,omitempty"` + + // AdapterType is the type of the adapter. + AdapterType string `json:"adapterType,omitempty"` + // AdapterLoRAAlpha is the alpha value of the LoRA adapter. + // + // Only used when AdapterType is "lora". + AdapterLoRAAlpha float32 `json:"adapterLoRAAlpha,omitempty"` + // AdapterControlVectorLayerCount is the number of layers in the control vector. + // + // Only used when Architecture is "control_vector". + AdapterControlVectorLayerCount uint32 `json:"adapterControlVectorLayerCount,omitempty"` } // Architecture returns the architecture metadata of the GGUF file. -func (gf *GGUFFile) Architecture() (ga GGUFArchitectureMetadata) { - arch := "llama" - if v, ok := gf.Header.MetadataKV.Get("general.architecture"); ok { - arch = v.ValueString() +func (gf *GGUFFile) Architecture() (ga GGUFArchitecture) { + var ( + generalTypeKey = "general.type" + generalArchitectureKey = "general.architecture" + + controlVectorModelHintKey = "controlvector.model_hint" + ) + m, _ := gf.Header.MetadataKV.Index([]string{ + generalTypeKey, + generalArchitectureKey, + controlVectorModelHintKey, + }) + + typ, arch := "model", "llama" // nolint: goconst + { + if v, ok := m[generalTypeKey]; ok { + typ = v.ValueString() + } + if v, ok := m[generalArchitectureKey]; ok { + arch = v.ValueString() + } } - if arch == "clip" { + switch { + case arch == "clip": return gf.clipArchitecture() + case arch == "controlvector": + arch = "llama" + if v, ok := m[controlVectorModelHintKey]; ok { + arch = v.ValueString() + } + return gf.adapterArchitecture(arch) + case typ == "adapter": + return gf.adapterArchitecture(arch) } - return gf.transformArchitecture(arch) + return gf.modelArchitecture(arch) } -func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitectureMetadata) { +func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { var ( - hasTextEncoderKey = "clip.has_text_encoder" - hasVisionEncoderKey = "clip.has_vision_encoder" - hasLLaVaProjectorKey = "clip.has_llava_projector" - projectorTypeKey = "clip.projector_type" + hasTextEncoderKey = "clip.has_text_encoder" + hasVisionEncoderKey = "clip.has_vision_encoder" + projectorTypeKey = "clip.projector_type" textEmbeddingLengthKey = "clip.text.embedding_length" textBlockCountKey = "clip.text.block_count" @@ -143,12 +178,12 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitectureMetadata) { visionAttentionLayerNormRMSEpsilonKey = "clip.vision.attention.layer_norm_epsilon" ) + ga.Type = "projector" ga.Architecture = "clip" m, _ := gf.Header.MetadataKV.Index([]string{ hasTextEncoderKey, hasVisionEncoderKey, - hasLLaVaProjectorKey, projectorTypeKey, textEmbeddingLengthKey, textBlockCountKey, @@ -168,9 +203,6 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitectureMetadata) { if v, ok := m[hasVisionEncoderKey]; ok { ga.ClipHasVisionEncoder = v.ValueBool() } - if v, ok := m[hasLLaVaProjectorKey]; ok { - ga.ClipHasLLaVaProjector = v.ValueBool() - } if v, ok := m[projectorTypeKey]; ok { ga.ClipProjectorType = v.ValueString() } else { @@ -228,7 +260,42 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitectureMetadata) { 
return ga } -func (gf *GGUFFile) transformArchitecture(arch string) (ga GGUFArchitectureMetadata) { +func (gf *GGUFFile) adapterArchitecture(arch string) (ga GGUFArchitecture) { + var ( + typeKey = "adapter.type" + + loraAlphaKey = "adapter.lora.alpha" + + controlVectorLayerCountKey = "adapter.control_vector.layer_count" + controlVectorLayerCountKey2 = "control_vector.layer_count" + ) + + ga.Type = "adapter" + ga.Architecture = arch + + m, _ := gf.Header.MetadataKV.Index([]string{ + typeKey, + loraAlphaKey, + controlVectorLayerCountKey, + controlVectorLayerCountKey2, + }) + + if v, ok := m[typeKey]; ok { + ga.AdapterType = v.ValueString() + } + if v, ok := m[loraAlphaKey]; ok { + ga.AdapterLoRAAlpha = ValueNumeric[float32](v) + } + if v, ok := m[controlVectorLayerCountKey]; ok { + ga.AdapterControlVectorLayerCount = ValueNumeric[uint32](v) + } else if v, ok := m[controlVectorLayerCountKey2]; ok { + ga.AdapterControlVectorLayerCount = ValueNumeric[uint32](v) + } + + return ga +} + +func (gf *GGUFFile) modelArchitecture(arch string) (ga GGUFArchitecture) { var ( contextLengthKey = arch + ".context_length" embeddingLengthKey = arch + ".embedding_length" @@ -269,6 +336,7 @@ func (gf *GGUFFile) transformArchitecture(arch string) (ga GGUFArchitectureMetad tokenizerGGMLTokensKey = "tokenizer.ggml.tokens" ) + ga.Type = "model" ga.Architecture = arch m, _ := gf.Header.MetadataKV.Index([]string{ diff --git a/file_estimate.go b/file_estimate.go index 41b5384..9ca54e2 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -11,7 +11,9 @@ import ( type ( // LLaMACppUsageEstimate represents the estimated result of loading the GGUF file in llama.cpp. LLaMACppUsageEstimate struct { - // Architecture describes what architecture this model implements. + // Type describes what type this GGUF file is. + Type string `json:"type"` + // Architecture describes what architecture this GGUF file implements. Architecture string `json:"architecture"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. @@ -39,10 +41,12 @@ type ( // Devices represents the memory usage for running the GGUF file, // the first device is the CPU, and the rest are GPUs. Devices []LLaMACppMemoryUsage `json:"devices"` - // MultimodalProjector is the memory usage of multimodal projector. - MultimodalProjector *LLaMACppUsageEstimate `json:"multimodalProjector,omitempty"` // Drafter is the memory usage of drafter. Drafter *LLaMACppUsageEstimate `json:"drafter,omitempty"` + // Projector is the memory usage of multimodal projector. + Projector *LLaMACppUsageEstimate `json:"projector,omitempty"` + // Adapters is the memory usage of adapters. + Adapters []LLaMACppUsageEstimate `json:"adapters,omitempty"` } // LLaMACppMemoryUsage represents the memory usage for expanding the GGUF file in llama.cpp. @@ -51,6 +55,9 @@ type ( HandleLayers uint64 `json:"handleLayers"` // LastLayer is the index of the last layer the device can handle. LastLayer int `json:"latestLayer"` + // Remote is the flag to indicate whether the device is remote, + // true for remote. + Remote bool `json:"remote"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Weight is the memory usage of loading weights. @@ -136,10 +143,10 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( e.Devices[i].LastLayer = -1 } - // Architecture and tokenizer metadata. + // Metadata. 
var ( - a GGUFArchitectureMetadata - t GGUFTokenizerMetadata + a GGUFArchitecture + t GGUFTokenizer ) if o.Architecture != nil { a = *o.Architecture @@ -151,10 +158,11 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } else { t = gf.Tokenizer() } + e.Type = a.Type e.Architecture = a.Architecture // Flash attention. - { + if a.Type == "model" { // Quantization requires flash attention, // see https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L16055-L16058. if *o.CacheValueType > GGMLTypeF16 && !o.FlashAttention { @@ -170,7 +178,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } // Embedding. - if !a.AttentionCausal { + if a.Type == "model" && !a.AttentionCausal { e.EmbeddingOnly = true o.PhysicalBatchSize = o.LogicalBatchSize } @@ -178,19 +186,22 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( // Distributable, // see https://github.com/ggerganov/llama.cpp/blob/a07c32ea54850c989f0ef6989da5b955b77b7172/ggml/src/ggml-rpc.cpp#L391-L397. { - e.Distributable = true - for i := range gf.TensorInfos { - if t, ok := gf.TensorInfos[i].Type.Trait(); ok && !t.Quantized { - continue - } - if len(gf.TensorInfos[i].Dimensions) == 0 { - continue - } - if gf.TensorInfos[i].Dimensions[0]%512 == 0 { - continue + e.Distributable = false + if a.Type == "model" { + e.Distributable = true + for i := range gf.TensorInfos { + if t, ok := gf.TensorInfos[i].Type.Trait(); ok && !t.Quantized { + continue + } + if len(gf.TensorInfos[i].Dimensions) == 0 { + continue + } + if gf.TensorInfos[i].Dimensions[0]%512 == 0 { + continue + } + e.Distributable = false + break } - e.Distributable = false - break } } @@ -233,7 +244,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( // For mamba, // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129. - if a.Architecture == "mamba" { + if a.Type == "model" && a.Architecture == "mamba" { nKV = nParallel o.CacheKeyType = ptr.To(GGMLTypeF32) o.CacheValueType = ptr.To(GGMLTypeF32) @@ -253,9 +264,9 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( fullOffload, partialOffload, zeroOffload bool ) { - // For clip, + // For none model, // see https://github.com/ggerganov/llama.cpp/blob/148ec970b62c3c5ae0a8bfdaad2fc237aaae350d/examples/llava/clip.cpp#L994-L1008. - if a.Architecture == "clip" { + if a.Type != "model" { o.OffloadLayers = ptr.To(a.BlockCount + 1) // Clip means full offload. } switch v := o.OffloadLayers; { @@ -318,10 +329,10 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( // Weight. { // Compute. 
- switch a.Architecture { - case "clip": - e.Devices[1].Weight.Compute = GGUFBytesScalar(ls.Bytes()) + switch a.Type { default: + e.Devices[1].Weight.Compute = GGUFBytesScalar(ls.Bytes()) + case "model": for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { switch { case i < int(nLoadLayers): @@ -338,6 +349,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } e.Devices[j+1].HandleLayers += 1 e.Devices[j+1].LastLayer = i + e.Devices[j+1].Remote = len(o.TensorSplitFraction)-len(o.RPCServers) <= j e.Devices[j+1].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) } } @@ -411,10 +423,8 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( inpSMask = GGMLTypeF32.RowSizeOf([]uint64{1, nKV}) // F32 [1, n_kv] inpSSeq = GGMLTypeI32.RowSizeOf([]uint64{nKV, nBatch}) // I32 [n_kv, n_batch] ) - switch a.Architecture { - case "clip": - // NOP. - case "mamba": + switch { + case a.Type == "model" && a.Architecture == "mamba": e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpSMask + inpSSeq + inpOutIds) if !zeroOffload { v := GGUFBytesScalar(inpEmbd + inpSMask + inpSSeq + inpOutIds) @@ -422,7 +432,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( e.Devices[i+1].Computation.Input += v } } - default: + case a.Type == "model": e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) if !zeroOffload { v := GGUFBytesScalar(inpEmbd + inpPos + inpKQMask + inpOutIds) @@ -435,10 +445,8 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( // the allocated memory can be reused for the next layer. // So, we only consider the usage of the largest layer, // which is the last layer by default. - switch a.Architecture { - case "clip": - // NOP. - case "mamba": + switch { + case a.Type == "model" && a.Architecture == "mamba": convInc := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingKeyGQA, nKV}) // F32 [n_embd_key_gqa, n_kv] reshape for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) { if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") { @@ -463,12 +471,12 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } cp := GGUFBytesScalar(convInc + ssmInc) for i, d := range e.Devices[1:] { - if d.LastLayer < 0 && i != 0 { + if d.LastLayer < 0 && (i == 0 && !d.Remote) { continue } e.Devices[i+1].Computation.Compute = cp } - default: + case a.Type == "model": loadAttnInc, offloadAttnInc := uint64(0), uint64(0) if o.FlashAttention { // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387. @@ -530,7 +538,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } cp := GGUFBytesScalar(max(offloadAttnInc, ffnInc)) for i, d := range e.Devices[1:] { - if d.LastLayer < 0 && i != 0 { + if d.LastLayer < 0 && (i == 0 && !d.Remote) { continue } e.Devices[i+1].Computation.Compute = cp @@ -541,10 +549,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } } // Finally, get the usage of output layer. - switch a.Architecture { - case "clip": - // NOP. - default: + if a.Type == "model" { outInc := inpEmbd if a.Architecture == "mamba" { outInc += inpSMask + inpSSeq @@ -561,12 +566,15 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } } - // Multimodal projector. 
- e.MultimodalProjector = o.MultimodalProjector - // Drafter. e.Drafter = o.Drafter + // Projector. + e.Projector = o.Projector + + // Adapters. + e.Adapters = o.Adapters + return e } @@ -580,7 +588,9 @@ type ( /* Appendix */ - // Architecture describes what architecture this model implements. + // Type describes what type this GGUF file is. + Type string `json:"type"` + // Architecture describes what architecture this GGUF file implements. Architecture string `json:"architecture"` // ContextSize is the size of the context. ContextSize uint64 `json:"contextSize"` @@ -666,10 +676,7 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, nonUMARamFootprint, no ems.VRAMs[i].UMA = fp + wg + kv + /* cp */ 0 if !e.NoMMap && mmap { ems.VRAMs[i].UMA -= wg - // NB(thxCode): the weight add back for the following reasons: - // - UMA treats as one device. - // - RPC server will load all weights and computation. - if i > 0 { + if i > 0 || v.Remote { ems.VRAMs[i].UMA += wg + cp } } @@ -679,17 +686,6 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, nonUMARamFootprint, no } } - // MultimodalProjector. - if e.MultimodalProjector != nil { - cems := e.MultimodalProjector.SummarizeMemory(mmap, 0, 0) - ems.RAM.UMA += cems.RAM.UMA - ems.RAM.NonUMA += cems.RAM.NonUMA - for i, v := range cems.VRAMs { - ems.VRAMs[i].UMA += v.UMA - ems.VRAMs[i].NonUMA += v.NonUMA - } - } - // Drafter. if e.Drafter != nil { dmes := e.Drafter.SummarizeMemory(mmap, 0, 0) @@ -701,6 +697,28 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, nonUMARamFootprint, no } } + // Projector. + if e.Projector != nil { + cems := e.Projector.SummarizeMemory(mmap, 0, 0) + ems.RAM.UMA += cems.RAM.UMA + ems.RAM.NonUMA += cems.RAM.NonUMA + for i, v := range cems.VRAMs { + ems.VRAMs[i].UMA += v.UMA + ems.VRAMs[i].NonUMA += v.NonUMA + } + } + + // Adapters. + for i := range e.Adapters { + aems := e.Adapters[i].SummarizeMemory(mmap, 0, 0) + ems.RAM.UMA += aems.RAM.UMA + ems.RAM.NonUMA += aems.RAM.NonUMA + for j, v := range aems.VRAMs { + ems.VRAMs[j].UMA += v.UMA + ems.VRAMs[j].NonUMA += v.NonUMA + } + } + return ems } @@ -713,6 +731,7 @@ func (e LLaMACppUsageEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVr } // Just copy from the original estimate. + es.Type = e.Type es.Architecture = e.Architecture es.ContextSize = e.ContextSize es.FlashAttention = e.FlashAttention diff --git a/file_estimate_option.go b/file_estimate_option.go index 42a0fd5..d915729 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -8,8 +8,8 @@ import ( type ( _LLaMACppUsageEstimateOptions struct { - Architecture *GGUFArchitectureMetadata - Tokenizer *GGUFTokenizerMetadata + Architecture *GGUFArchitecture + Tokenizer *GGUFTokenizer ContextSize *int32 InMaxContextSize bool LogicalBatchSize *int32 @@ -23,16 +23,18 @@ type ( SplitMode LLaMACppSplitMode TensorSplitFraction []float64 MainGPUIndex int - MultimodalProjector *LLaMACppUsageEstimate + RPCServers []string + Projector *LLaMACppUsageEstimate Drafter *LLaMACppUsageEstimate + Adapters []LLaMACppUsageEstimate } LLaMACppUsageEstimateOption func(*_LLaMACppUsageEstimateOptions) ) // WithArchitecture sets the architecture for the estimate. // -// Allows reusing the same GGUFArchitectureMetadata for multiple estimates. -func WithArchitecture(arch GGUFArchitectureMetadata) LLaMACppUsageEstimateOption { +// Allows reusing the same GGUFArchitecture for multiple estimates. 
+func WithArchitecture(arch GGUFArchitecture) LLaMACppUsageEstimateOption { return func(o *_LLaMACppUsageEstimateOptions) { o.Architecture = &arch } @@ -40,8 +42,8 @@ func WithArchitecture(arch GGUFArchitectureMetadata) LLaMACppUsageEstimateOption // WithTokenizer sets the tokenizer for the estimate. // -// Allows reusing the same GGUFTokenizerMetadata for multiple estimates. -func WithTokenizer(tokenizer GGUFTokenizerMetadata) LLaMACppUsageEstimateOption { +// Allows reusing the same GGUFTokenizer for multiple estimates. +func WithTokenizer(tokenizer GGUFTokenizer) LLaMACppUsageEstimateOption { return func(o *_LLaMACppUsageEstimateOptions) { o.Tokenizer = &tokenizer } @@ -199,10 +201,13 @@ func WithMainGPUIndex(di int) LLaMACppUsageEstimateOption { } } -// WithMultimodalProjector sets the multimodal projector estimate usage. -func WithMultimodalProjector(mmp *LLaMACppUsageEstimate) LLaMACppUsageEstimateOption { +// WithRPCServers sets the RPC servers for the estimate. +func WithRPCServers(srvs []string) LLaMACppUsageEstimateOption { return func(o *_LLaMACppUsageEstimateOptions) { - o.MultimodalProjector = mmp + if len(srvs) == 0 { + return + } + o.RPCServers = srvs } } @@ -212,3 +217,20 @@ func WithDrafter(dft *LLaMACppUsageEstimate) LLaMACppUsageEstimateOption { o.Drafter = dft } } + +// WithProjector sets the multimodal projector estimate usage. +func WithProjector(prj *LLaMACppUsageEstimate) LLaMACppUsageEstimateOption { + return func(o *_LLaMACppUsageEstimateOptions) { + o.Projector = prj + } +} + +// WithAdapters sets the adapters estimate usage. +func WithAdapters(adp []LLaMACppUsageEstimate) LLaMACppUsageEstimateOption { + return func(o *_LLaMACppUsageEstimateOptions) { + if len(adp) == 0 { + return + } + o.Adapters = adp + } +} diff --git a/file_model.go b/file_metadata.go similarity index 90% rename from file_model.go rename to file_metadata.go index b9d4b58..a66c4cb 100644 --- a/file_model.go +++ b/file_metadata.go @@ -5,11 +5,14 @@ import ( "strings" ) -// GGUFModelMetadata represents the model metadata of a GGUF file. -type GGUFModelMetadata struct { +// GGUFMetadata represents the model metadata of a GGUF file. +type GGUFMetadata struct { /* Basic */ - // Architecture describes what architecture this model implements. + // Type describes the type of the GGUF file, + // default is "model". + Type string `json:"type"` + // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII, with only [a-z0-9]+ characters allowed. Architecture string `json:"architecture"` @@ -30,9 +33,9 @@ type GGUFModelMetadata struct { Alignment uint32 `json:"alignment"` // Name to the model. // - // This should be a human-readable name that can be used to identify the model. + // This should be a human-readable name that can be used to identify the GGUF file. // It should be unique within the community that the model is defined in. - Name string `json:"name"` + Name string `json:"name,omitempty"` // Author to the model. Author string `json:"author,omitempty"` // URL to the model's homepage. @@ -57,9 +60,9 @@ type GGUFModelMetadata struct { FileSize GGUFBytesScalar `json:"fileSize"` // Size is the model size. Size GGUFBytesScalar `json:"size"` - // Parameters is the parameters of the model. + // Parameters is the parameters of the GGUF file. Parameters GGUFParametersScalar `json:"parameters"` - // BitsPerWeight is the bits per weight of the model. + // BitsPerWeight is the bits per weight of the GGUF file. 
BitsPerWeight GGUFBitsPerWeightScalar `json:"bitsPerWeight"` } @@ -105,9 +108,10 @@ const ( _GGUFFileTypeCount // Unknown ) -// Model returns the model metadata of the GGUF file. -func (gf *GGUFFile) Model() (gm GGUFModelMetadata) { +// Metadata returns the metadata of the GGUF file. +func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { const ( + typeKey = "general.type" architectureKey = "general.architecture" quantizationKey = "general.quantization_version" alignmentKey = "general.alignment" @@ -117,11 +121,14 @@ func (gf *GGUFFile) Model() (gm GGUFModelMetadata) { descriptionKey = "general.description" licenseKey = "general.license" fileTypeKey = "general.file_type" + + controlVectorModelHintKey = "controlvector.model_hint" ) gm.FileType = _GGUFFileTypeCount m, _ := gf.Header.MetadataKV.Index([]string{ + typeKey, architectureKey, quantizationKey, alignmentKey, @@ -131,10 +138,23 @@ func (gf *GGUFFile) Model() (gm GGUFModelMetadata) { descriptionKey, licenseKey, fileTypeKey, + controlVectorModelHintKey, }) - if v, ok := m[architectureKey]; ok { + if v, ok := m[typeKey]; ok { + gm.Type = v.ValueString() + } else if _, ok = m[controlVectorModelHintKey]; ok { + gm.Type = "adapter" + } else { + gm.Type = "model" + } + if v, ok := m[controlVectorModelHintKey]; ok { gm.Architecture = v.ValueString() + } else if v, ok = m[architectureKey]; ok { + gm.Architecture = v.ValueString() + if gm.Architecture == "clip" { + gm.Type = "projector" + } } else { gm.Architecture = "llama" } diff --git a/file_model_test.go b/file_metadata_test.go similarity index 81% rename from file_model_test.go rename to file_metadata_test.go index 5400944..831359f 100644 --- a/file_model_test.go +++ b/file_metadata_test.go @@ -10,7 +10,7 @@ import ( "github.com/stretchr/testify/assert" ) -func TestGGUFFile_Model(t *testing.T) { +func TestGGUFFile_Metadata(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( @@ -23,10 +23,10 @@ func TestGGUFFile_Model(t *testing.T) { return } - t.Log("\n", spew.Sdump(f.Model()), "\n") + t.Log("\n", spew.Sdump(f.Metadata()), "\n") } -func BenchmarkGGUFFile_Model(b *testing.B) { +func BenchmarkGGUFFile_Metadata(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") @@ -43,7 +43,7 @@ func BenchmarkGGUFFile_Model(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - _ = f.Model() + _ = f.Metadata() } } @@ -74,7 +74,7 @@ func TestGGUFFile_guessFileType(t *testing.T) { t.Fatal(err) return } - assert.Equal(t, gf.Model().FileType.String(), gf.guessFileType().String(), tc+" file type should be equal") + assert.Equal(t, gf.Metadata().FileType.String(), gf.guessFileType().String(), tc+" file type should be equal") }) } } diff --git a/file_tokenizer.go b/file_tokenizer.go index 6daa1d6..311b1c4 100644 --- a/file_tokenizer.go +++ b/file_tokenizer.go @@ -1,7 +1,7 @@ package gguf_parser -// GGUFTokenizerMetadata represents the tokenizer metadata of a GGUF file. -type GGUFTokenizerMetadata struct { +// GGUFTokenizer represents the tokenizer metadata of a GGUF file. +type GGUFTokenizer struct { /* Basic */ // Model is the model of the tokenizer. @@ -50,7 +50,7 @@ type GGUFTokenizerMetadata struct { } // Tokenizer returns the tokenizer metadata of a GGUF file. -func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizerMetadata) { +func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizer) { const ( modelKey = "tokenizer.ggml.model" tokensKey = "tokenizer.ggml.tokens"