# [Model] Add Llama-SwiftKV model #11023

Status: Open. Wants to merge 14 commits into base branch `main`.
**`examples/swiftkv/README.md`** (new file, 79 additions)
# SwiftKV on vLLM

SwiftKV is a technique developed by Snowflake AI Research that reduces computational overhead during prompt processing by combining model rewiring and knowledge-preserving self-distillation.

For more details, see:

- [Blog post](https://www.snowflake.com/engineering-blog/swiftkv-llm-compute-reduction)
- [Paper](https://arxiv.org/abs/2410.03960)
- [Hugging Face model collection](https://huggingface.co/collections/Snowflake/swiftkv-models-674f7d7474eb789e185d31cb)
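The core trick (called SingleInputKV in the paper) is that during prefill, the KV cache for the later layers is projected directly from an earlier layer's hidden states, so those later layers never run a forward pass over the prompt tokens. The following toy sketch illustrates the idea only; it is not vLLM's or Snowflake's implementation, and the tiny dimensions, random weights, and `skip_from` split point are all made up for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
d, n_layers, skip_from = 8, 4, 2  # toy sizes; real models are far larger

# Per-layer weights for a toy "transformer": one mixing matrix plus KV projections.
W = [rng.standard_normal((d, d)) / np.sqrt(d) for _ in range(n_layers)]
Wk = [rng.standard_normal((d, d)) / np.sqrt(d) for _ in range(n_layers)]
Wv = [rng.standard_normal((d, d)) / np.sqrt(d) for _ in range(n_layers)]


def prefill_baseline(x):
    """Standard prefill: every layer transforms x and computes its own KV."""
    kv = []
    for layer in range(n_layers):
        kv.append((x @ Wk[layer], x @ Wv[layer]))
        x = np.tanh(x @ W[layer])
    return kv


def prefill_swiftkv(x):
    """SingleInputKV-style prefill: layers >= skip_from project their KV
    directly from the hidden states of layer skip_from, so they skip their
    own forward pass over the prompt tokens entirely."""
    kv = []
    for layer in range(skip_from):
        kv.append((x @ Wk[layer], x @ Wv[layer]))
        x = np.tanh(x @ W[layer])
    for layer in range(skip_from, n_layers):  # no per-layer transform of x here
        kv.append((x @ Wk[layer], x @ Wv[layer]))
    return kv


prompt = rng.standard_normal((5, d))  # 5 prompt tokens
kv = prefill_swiftkv(prompt)
assert len(kv) == n_layers and kv[-1][0].shape == (5, d)
```

The early layers produce exactly the same KV as the baseline; the later layers' KV is an approximation, which is why the released models are self-distilled to recover quality.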

## Quickstart

Run an example conversation using [Snowflake/Llama-3.1-SwiftKV-8B-Instruct](https://huggingface.co/Snowflake/Llama-3.1-SwiftKV-8B-Instruct):
```console
$ python examples/swiftkv/offline_inference_swiftkv.py

...

The Importance of Higher Education

Higher education is a vital component of an individual's life, providing numerous benefits that extend beyond the acquisition of knowledge and skills. It plays a significant role in shaping an individual's future, career prospects, and overall well-being. In this essay, we will explore the importance of higher education and its far-reaching implications on individuals, society, and the economy.

...
```

## Running Accuracy Evaluations

To evaluate the Llama-3.1-SwiftKV models, we use the [LM-Eval fork by NeuralMagic](https://github.com/neuralmagic/lm-evaluation-harness.git):

```console
$ pip install git+https://github.com/neuralmagic/lm-evaluation-harness.git@llama_3.1_instruct
```

Run evaluation on Llama-3.1-SwiftKV-8B-Instruct:

```console
$ bash examples/swiftkv/run_eval_8b.sh
```

Run evaluation on Llama-3.1-SwiftKV-405B-Instruct-FP8:

```console
$ bash examples/swiftkv/run_eval_405b_fp8.sh
```

## Running Performance Benchmarks

**Llama-3.1-SwiftKV-8B-Instruct**

```console
$ python benchmarks/benchmark_throughput.py \
--input-len 2000 --output-len 256 \
--model Snowflake/Llama-3.1-SwiftKV-8B-Instruct \
--gpu-memory-utilization 0.95 \
--enable-chunked-prefill \
--max-num-batched-tokens 2048 \
--max-num-seqs 512

...

Throughput: 11.36 requests/s, 25635.51 total tokens/s, 2908.99 output tokens/s
```

**Llama-3.1-SwiftKV-405B-Instruct-FP8**

```console
$ python benchmarks/benchmark_throughput.py \
--input-len 2000 --output-len 256 \
--model Snowflake/Llama-3.1-SwiftKV-405B-Instruct-FP8 \
--gpu-memory-utilization 0.95 \
--enable-chunked-prefill \
--max-num-batched-tokens 2048 \
--max-num-seqs 512 \
--tensor-parallel-size 8

...

Throughput: 3.21 requests/s, 7233.37 total tokens/s, 820.81 output tokens/s
```
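As a quick sanity check (plain arithmetic on the figures above, not a new measurement), the reported token rates are consistent with requests/s multiplied by the per-request token counts:

```python
# Reported 8B benchmark settings and result, taken from the run above.
input_len, output_len = 2000, 256
req_per_s = 11.36

# Each request contributes input_len + output_len total tokens,
# and output_len output tokens.
total_tok_per_s = req_per_s * (input_len + output_len)
output_tok_per_s = req_per_s * output_len

# Close to the reported 25635.51 total and 2908.99 output tokens/s;
# small differences come from measurement granularity.
print(f"{total_tok_per_s:.0f} total tokens/s, {output_tok_per_s:.0f} output tokens/s")
```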
**`examples/swiftkv/offline_inference_swiftkv.py`** (new file, 26 additions)

```python
from vllm import LLM, SamplingParams

llm = LLM(model="Snowflake/Llama-3.1-SwiftKV-8B-Instruct")

print("=" * 80)

conversation = [
    {
        "role": "user",
        "content": "Hello",
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?",
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]

sampling_params = SamplingParams(temperature=0.1, max_tokens=800)

outputs = llm.chat(conversation, sampling_params=sampling_params)

print(outputs[0].outputs[0].text)
```
**`examples/swiftkv/run_eval_405b_fp8.sh`** (new file, 47 additions)

```bash
#!/usr/bin/env bash

MODEL=Snowflake/Llama-3.1-SwiftKV-405B-Instruct-FP8

EVAL_CMD=$(cat <<EOF
python -m lm_eval \
    --model vllm \
    --model_args pretrained=${MODEL},dtype=auto,max_model_len=4096,enable_chunked_prefill=True,tensor_parallel_size=8 \
    --gen_kwargs max_gen_toks=1024 \
    --batch_size auto \
    --output_path ./swiftkv-eval
EOF
)

${EVAL_CMD} \
    --tasks truthfulqa_mc2 \
    --num_fewshot 0

${EVAL_CMD} \
    --tasks winogrande \
    --num_fewshot 5

${EVAL_CMD} \
    --tasks hellaswag \
    --num_fewshot 10

${EVAL_CMD} \
    --tasks arc_challenge_llama_3.1_instruct \
    --apply_chat_template \
    --num_fewshot 0

${EVAL_CMD} \
    --tasks gsm8k_cot_llama_3.1_instruct \
    --fewshot_as_multiturn \
    --apply_chat_template \
    --num_fewshot 8

${EVAL_CMD} \
    --tasks mmlu_llama_3.1_instruct \
    --fewshot_as_multiturn \
    --apply_chat_template \
    --num_fewshot 5

${EVAL_CMD} \
    --tasks mmlu_cot_0shot_llama_3.1_instruct \
    --apply_chat_template \
    --num_fewshot 0
```
**`examples/swiftkv/run_eval_8b.sh`** (new file, 47 additions)

```bash
#!/usr/bin/env bash

MODEL=Snowflake/Llama-3.1-SwiftKV-8B-Instruct

EVAL_CMD=$(cat <<EOF
python -m lm_eval \
    --model vllm \
    --model_args pretrained=${MODEL},dtype=auto,max_model_len=4096,enable_chunked_prefill=True \
    --gen_kwargs max_gen_toks=1024 \
    --batch_size auto \
    --output_path ./swiftkv-eval
EOF
)

${EVAL_CMD} \
    --tasks truthfulqa_mc2 \
    --num_fewshot 0

${EVAL_CMD} \
    --tasks winogrande \
    --num_fewshot 5

${EVAL_CMD} \
    --tasks hellaswag \
    --num_fewshot 10

${EVAL_CMD} \
    --tasks arc_challenge_llama_3.1_instruct \
    --apply_chat_template \
    --num_fewshot 0

${EVAL_CMD} \
    --tasks gsm8k_cot_llama_3.1_instruct \
    --fewshot_as_multiturn \
    --apply_chat_template \
    --num_fewshot 8

${EVAL_CMD} \
    --tasks mmlu_llama_3.1_instruct \
    --fewshot_as_multiturn \
    --apply_chat_template \
    --num_fewshot 5

${EVAL_CMD} \
    --tasks mmlu_cot_0shot_llama_3.1_instruct \
    --apply_chat_template \
    --num_fewshot 0
```
**`tests/swiftkv/__init__.py`** (new empty file)
**`tests/swiftkv/test_llama_fp8.py`** (new file, 55 additions)

```python
import pytest

import vllm
from tests.utils import multi_gpu_test
from vllm.sampling_params import SamplingParams

MODELS = ["Snowflake/Llama-3.1-SwiftKV-8B-Instruct-FP8"]
CONVERSATIONS = [
    [{"role": "user", "content": "Hello!"}],
    [{"role": "user", "content": "Who is the president of the United States?"}],
    [{"role": "user", "content": "What is the capital of France?"}],
    [{"role": "user", "content": "What is the future of AI?"}],
]
EXPECTED_OUTPUTS = [
    "Hello! How can I assist you today?",
    "As of my cut-off knowledge in December 2023, the President of the United "
    "States is Joe",
    "The capital of France is Paris.",
    "The future of AI is vast and rapidly evolving, with numerous potential "
    "developments and applications on the horizon.",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tensor_parallel_size", [1, 2])
@multi_gpu_test(num_gpus=2)
def test_model(model, enforce_eager, tensor_parallel_size) -> None:
    llm = vllm.LLM(
        model,
        enforce_eager=enforce_eager,
        enable_chunked_prefill=True,
        tensor_parallel_size=tensor_parallel_size,
    )
    sampling_params = SamplingParams(temperature=0.0, max_tokens=20)

    for idx, conversation in enumerate(CONVERSATIONS):
        outputs = llm.chat(
            conversation,
            sampling_params=sampling_params,
            use_tqdm=False,
        )
        assert outputs[0].outputs[0].text == EXPECTED_OUTPUTS[idx]
```