support TP in qwen2 bnb (vllm-project#9574)
Signed-off-by: Alvant <[email protected]>
chenqianfzh authored and Alvant committed Oct 26, 2024
1 parent 0bdbb81 commit 96441ea
Showing 1 changed file with 14 additions and 0 deletions.

vllm/model_executor/models/qwen2.py
@@ -364,6 +364,20 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     ]
     embedding_modules = {}
     embedding_padding_modules = []
+
+    # BitsAndBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
     bitsandbytes_stacked_params_mapping = {
         # shard_name, weight_name, index
         "q_proj": ("qkv_proj", 0),
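The new column_parallel_weights_modules attribute is what makes BitsAndBytes quantization work under tensor parallelism here: down_proj and o_proj are row-parallel layers, so their weights must be sliced along the last (column) dimension rather than the first. Below is a minimal, self-contained sketch of that sharding rule; it is not vLLM's actual loader, and the tp_shard helper and its signature are hypothetical names for illustration.

import torch

# Mirrors column_parallel_weights_modules from the diff above.
COLUMN_PARALLEL_WEIGHTS = [".down_proj.", ".o_proj."]

def tp_shard(name: str, weight: torch.Tensor,
             tp_rank: int, tp_size: int) -> torch.Tensor:
    """Return this rank's slice of `weight` under tensor parallelism.

    Hypothetical helper: vLLM's real BitsAndBytes path operates on
    quantized state, but the sharding dimension is chosen the same way.
    """
    if any(tag in name for tag in COLUMN_PARALLEL_WEIGHTS):
        # Row-parallel layers (down_proj, o_proj): split the input
        # ("column") dimension, dim=-1.
        dim = -1
    else:
        # The remaining target modules (gate/up/q/k/v projections) are
        # column-parallel: split the output dimension, dim=0.
        dim = 0
    shard_size = weight.shape[dim] // tp_size
    return weight.narrow(dim, tp_rank * shard_size, shard_size)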

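For context, bitsandbytes_stacked_params_mapping (truncated in the diff above) routes separate checkpoint weights into the fused parameters vLLM allocates. A hedged sketch of that renaming step follows; the remap_checkpoint_name helper is illustrative, and the k/v/gate/up entries are assumed to follow the q_proj pattern shown in the diff, as they do elsewhere in vLLM.

# Illustrative only: the q_proj entry appears in the diff; the other
# entries are assumed to follow the same (fused_name, shard_index)
# pattern used across vLLM models.
STACKED_PARAMS_MAPPING = {
    "q_proj": ("qkv_proj", 0),
    "k_proj": ("qkv_proj", 1),
    "v_proj": ("qkv_proj", 2),
    "gate_proj": ("gate_up_proj", 0),
    "up_proj": ("gate_up_proj", 1),
}

def remap_checkpoint_name(name: str) -> tuple[str, int | None]:
    """Map an HF checkpoint weight name to (vllm_param_name, shard_index).

    Hypothetical helper, e.g.
    'model.layers.0.self_attn.q_proj.weight'
      -> ('model.layers.0.self_attn.qkv_proj.weight', 0)
    """
    for shard_name, (fused_name, index) in STACKED_PARAMS_MAPPING.items():
        if shard_name in name:
            return name.replace(shard_name, fused_name), index
    return name, None  # not a stacked parameter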