diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index cb04cc4850951..23eb1482ffef1 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -364,6 +364,20 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ] embedding_modules = {} embedding_padding_modules = [] + + # BitsAndBytes specific attributes + default_bitsandbytes_target_modules = [ + ".gate_proj.", + ".down_proj.", + ".up_proj.", + ".q_proj.", + ".k_proj.", + ".v_proj.", + ".o_proj.", + ] + + # in TP, these weights are partitioned along the column dimension (dim=-1) + column_parallel_weights_modules = [".down_proj.", ".o_proj."] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0),