
Commit 35d72bd

mengniwang95 and Mengni Wang authored

Scout example (#2301)

* add initial files (Signed-off-by: Mengni Wang <[email protected]>)
* update example (Signed-off-by: Mengni Wang <[email protected]>)
* Update README.md
* Update requirements.txt
* Update run_benchmark.sh
* Update setup.sh
* Update README.md
* Update requirements.txt
* Update run_benchmark.sh
* Update README.md

Signed-off-by: Mengni Wang <[email protected]>
Co-authored-by: Mengni Wang <[email protected]>
1 parent e9cbe6e · commit 35d72bd

File tree

* README.md
* requirements.txt
* run_benchmark.sh
* run_quant.sh
* setup.sh

5 files changed: +169 −0 lines changed

5 files changed

+169
-0
lines changed
README.md
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# Step-by-Step

This example quantizes the Llama 4 Scout model and validates the accuracy of the quantized model.

# Prerequisite

## 1. Environment

```shell
docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia/pytorch:25.05-py3 /bin/bash
docker exec -it llama4 bash
git clone https://github.com/intel/neural-compressor.git
cd neural-compressor/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4
bash setup.sh
```
## 2. Prepare Model

```shell
hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-17B-16E-Instruct
```
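
Note: the Llama 4 Scout weights are gated on Hugging Face, so if the download above fails with an authorization error you likely need to accept the license on the model page and log in first (assuming the `hf` CLI installed by `setup.sh`; older `huggingface_hub` versions use `huggingface-cli login` instead):

```shell
hf auth login
```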
# Run

## 1. Quantization

```bash
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/
```
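
For reference, with the `llama4_mxfp4` topology `run_quant.sh` (shown later in this diff) resolves to roughly the following auto-round CLI call. This is a sketch of the script's defaults (`iters=0`, output directory `saved_results`), not an additional step:

```bash
python3 -m auto_round \
    --model Llama-4-Scout-17B-16E-Instruct/ \
    --iters 0 \
    --fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert \
    --scheme MXFP4 \
    --format "llm_compressor" \
    --output_dir saved_results
```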
## 2. Benchmark

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=piqa --batch_size=1 --tp_size=4
```
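
Multimodal tasks use the same entry point; as `run_benchmark.sh` below shows, task names containing `chartqa` or `mmmu_val` switch the backend to `vllm-vlm` and apply the chat template automatically. A sketch (the task choice here is illustrative):

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=llama4_mxfp4 --input_model=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/ --tasks=chartqa --batch_size=1 --tp_size=4
```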
requirements.txt
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
auto-round @ git+https://github.com/intel/[email protected]
lm-eval==0.4.9.1
setuptools_scm
torchao==0.12.0
triton==3.3.1
run_benchmark.sh
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
#!/bin/bash
set -x

function main {
  init_params "$@"
  run_benchmark

}

# init params
function init_params {
  for var in "$@"
  do
    case $var in
      --input_model=*)
          input_model=$(echo $var |cut -f2 -d=)
      ;;
      --topology=*)
          topology=$(echo $var |cut -f2 -d=)
      ;;
      --tasks=*)
          tasks=$(echo $var |cut -f2 -d=)
      ;;
      --tp_size=*)
          tp_size=$(echo $var |cut -f2 -d=)
      ;;
      --batch_size=*)
          batch_size=$(echo $var |cut -f2 -d=)
      ;;
    esac
  done

}

# run_benchmark
function run_benchmark {

    extra_model_args=""
    extra_cmd=""
    batch_size=${batch_size:=1}

    # vLLM engine arguments and generation settings for the MXFP4 Llama 4 recipe
    if [ "${topology}" = "llama4_mxfp4" ]; then
        extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
        extra_cmd="--gen_kwargs max_gen_toks=2048"
    fi

    # Multimodal tasks need the vLLM vision-language backend and the chat template
    if [[ "${tasks}" == *"chartqa"* || "${tasks}" == *"mmmu_val"* ]]; then
        model="vllm-vlm"
        extra_cmd=${extra_cmd}" --apply_chat_template"
    else
        model="vllm"
    fi

    # Run lm_eval on the vLLM backend; the env vars adjust NCCL and vLLM worker/compile behavior
    NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
    lm_eval --model ${model} \
        --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True \
        --tasks ${tasks} \
        --batch_size ${batch_size} \
        ${extra_cmd}
}

main "$@"
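
For the README's piqa example, the script above resolves to roughly this lm_eval invocation (a sketch; the argument values are substituted from the `llama4_mxfp4` branch of the script):

```bash
NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
lm_eval --model vllm \
    --model_args pretrained=saved_results/Llama-4-Scout-17B-16E-Instruct-w4g32/,tensor_parallel_size=4,max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7,enable_expert_parallel=True \
    --tasks piqa \
    --batch_size 1 \
    --gen_kwargs max_gen_toks=2048
```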
run_quant.sh
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
#!/bin/bash
set -x

function main {

  init_params "$@"
  run_tuning

}

# init params
function init_params {
  for var in "$@"
  do
    case $var in
      --topology=*)
          topology=$(echo $var |cut -f2 -d=)
      ;;
      --iters=*)
          iters=$(echo $var |cut -f2 -d=)
      ;;
      --dataset_location=*)
          dataset_location=$(echo $var |cut -f2 -d=)
      ;;
      --input_model=*)
          input_model=$(echo $var |cut -f2 -d=)
      ;;
      --output_model=*)
          tuned_checkpoint=$(echo $var |cut -f2 -d=)
      ;;
      *)
          echo "Error: No such parameter: ${var}"
          exit 1
      ;;
    esac
  done

}

# run_tuning
function run_tuning {
    extra_cmd=""
    tuned_checkpoint=${tuned_checkpoint:="saved_results"}
    iters=${iters:=0}    # default: 0 tuning iterations

    # Keep sensitive layers in higher precision; quantize the remaining layers to MXFP4
    if [ "${topology}" = "llama4_mxfp4" ]; then
        extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
    fi

    python3 -m auto_round \
        --model ${input_model} \
        --iters ${iters} \
        --format "llm_compressor" \
        --output_dir ${tuned_checkpoint} \
        ${extra_cmd}
}

main "$@"
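
A hypothetical invocation that overrides the defaults above (the iteration count and output directory are illustrative values, not taken from the example's README):

```bash
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh \
    --topology=llama4_mxfp4 \
    --input_model=Llama-4-Scout-17B-16E-Instruct/ \
    --iters=200 \
    --output_model=saved_results_tuned
```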
setup.sh
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
pip install -r requirements.txt
pip install setuptools --upgrade
pip install packaging --upgrade
pip install -U "huggingface_hub[cli]"
git clone -b mxfp4 https://github.com/mengniwang95/vllm-fork.git
cd vllm-fork
VLLM_USE_PRECOMPILED=1 pip install . -vvv --no-build-isolation
cd ..
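
A quick sanity check after setup (a minimal sketch; it only assumes the vLLM fork and torchao installed cleanly):

```bash
python3 -c "import vllm, torchao; print(vllm.__version__, torchao.__version__)"
```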
