[Misc] Further reduce BNB static variable (vllm-project#10597)
Signed-off-by: Jee Jee Li <[email protected]>
Signed-off-by: Andrew Feldman <[email protected]>
jeejeelee authored and afeldman-nm committed Dec 2, 2024
1 parent 89c4f78 commit a809ee1
Showing 14 changed files with 131 additions and 219 deletions.
218 changes: 130 additions & 88 deletions vllm/model_executor/model_loader/loader.py

Large diffs are not rendered by default.
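
Since the loader.py diff is not rendered above, the following is a minimal, hypothetical sketch of the direction the commit title suggests: instead of every model class carrying a static default_bitsandbytes_target_modules list, the loader can derive candidate target modules from the instantiated model by walking its linear submodules. The helper name infer_bnb_target_modules and the plain-PyTorch traversal are assumptions for illustration only, not the code actually added to loader.py in this commit.

# Hypothetical sketch only -- not the actual loader.py change in this commit.
# Idea: infer BitsAndBytes target modules from the model itself instead of
# reading a per-model static `default_bitsandbytes_target_modules` list.
import torch.nn as nn

def infer_bnb_target_modules(model: nn.Module) -> list[str]:
    """Collect the trailing name of every linear submodule in the model.

    A real loader would also need to respect
    `bitsandbytes_stacked_params_mapping` and skip modules that must stay
    unquantized (e.g. the LM head); those details are omitted here.
    """
    targets = set()
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            # e.g. "model.layers.0.self_attn.q_proj" -> "q_proj"
            targets.add(name.split(".")[-1])
    return sorted(targets)
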

8 changes: 0 additions & 8 deletions vllm/model_executor/models/baichuan.py
@@ -351,14 +351,6 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
embedding_padding_modules = []

# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".W_pack.",
".o_proj.",
".down_proj.",
".up_proj.",
".gate_proj.",
".up_proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"gate_proj": ("gate_up_proj", 0),
6 changes: 0 additions & 6 deletions vllm/model_executor/models/falcon.py
@@ -412,12 +412,6 @@ class FalconForCausalLM(nn.Module, SupportsPP):

# BitandBytes specific attributes
bitsandbytes_stacked_params_mapping = {}
default_bitsandbytes_target_modules = [
".query_key_value.",
".dense.",
".dense_h_to_4h.",
".dense_4h_to_h.",
]

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
9 changes: 0 additions & 9 deletions vllm/model_executor/models/gemma.py
@@ -350,15 +350,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"down_proj",
]
# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
9 changes: 0 additions & 9 deletions vllm/model_executor/models/gemma2.py
@@ -386,15 +386,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
embedding_padding_modules = []

# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
15 changes: 0 additions & 15 deletions vllm/model_executor/models/idefics3.py
@@ -656,21 +656,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
]

# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
# vision_model
".fc1.",
".fc2.",
".out_proj.",
# connector
".proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
9 changes: 0 additions & 9 deletions vllm/model_executor/models/llama.py
@@ -463,15 +463,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
embedding_padding_modules = ["lm_head"]

# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
34 changes: 0 additions & 34 deletions vllm/model_executor/models/minicpmv.py
@@ -822,25 +822,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
]

# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
# vision encoder
".fc1.",
".fc2.",
# Currently, vllm does not support BNB quantization for the `out_proj`
# of the resampler, so it's necessary to distinguish between the
# vision encoder and the resampler's out_proj. The same applies to
# MiniCPMV2_6.
".self_attn.out_proj.", # vision encoder out_proj
# resampler
".kv_proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
@@ -964,21 +945,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
]

# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
# vision encoder
".fc1.",
".fc2.",
".self_attn.out_proj.",
# resampler
".kv_proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
14 changes: 0 additions & 14 deletions vllm/model_executor/models/mllama.py
@@ -1104,20 +1104,6 @@ def forward(
@INPUT_REGISTRY.register_input_processor(input_processor_for_mllama)
class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
".fc1.",
".fc2.",
# The `multi_modal_projector` is at the top level of the model,
# so we can't add a dot in front of it.
"multi_modal_projector."
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
3 changes: 0 additions & 3 deletions vllm/model_executor/models/opt.py
@@ -337,9 +337,6 @@ class OPTForCausalLM(nn.Module, SupportsPP):
"k_proj": ("qkv_proj", 1),
"v_proj": ("qkv_proj", 2),
}
default_bitsandbytes_target_modules = [
".q_proj.", ".k_proj.", ".v_proj.", ".out_proj.", ".fc1.", ".fc2."
]

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
3 changes: 0 additions & 3 deletions vllm/model_executor/models/phi.py
@@ -286,9 +286,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
"k_proj": ("qkv_proj", 1),
"v_proj": ("qkv_proj", 2),
}
default_bitsandbytes_target_modules = [
".q_proj.", ".k_proj.", ".v_proj.", ".fc1.", ".fc2.", ".dense."
]

embedding_modules = {}
embedding_padding_modules = []
6 changes: 0 additions & 6 deletions vllm/model_executor/models/phi3.py
@@ -16,11 +16,5 @@ class Phi3ForCausalLM(LlamaForCausalLM):
}

# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_up_proj.",
".down_proj.",
".qkv_proj.",
".o_proj.",
]
# Initialize an empty dict when there is no stacked parameter mapping.
bitsandbytes_stacked_params_mapping = {}
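
Each model keeps its bitsandbytes_stacked_params_mapping, whose entries follow the shard_name: (weight_name, index) convention noted in the diffs above. As a rough, hypothetical illustration of how such a mapping can be consumed when matching checkpoint shard names to fused parameters (the helper below is not vLLM's API):

# Hypothetical illustration of reading a stacked-params mapping; not vLLM code.
stacked_params_mapping = {
    # shard_name: (fused_weight_name, shard_index)
    "q_proj": ("qkv_proj", 0),
    "k_proj": ("qkv_proj", 1),
    "v_proj": ("qkv_proj", 2),
    "gate_proj": ("gate_up_proj", 0),
    "up_proj": ("gate_up_proj", 1),
}

def resolve_fused_param(checkpoint_name: str):
    """Map e.g. 'layers.0.self_attn.q_proj' to ('layers.0.self_attn.qkv_proj', 0)."""
    for shard_name, (fused_name, shard_idx) in stacked_params_mapping.items():
        if shard_name in checkpoint_name:
            return checkpoint_name.replace(shard_name, fused_name), shard_idx
    # Not part of a fused weight; load under its original name.
    return checkpoint_name, None
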
7 changes: 1 addition & 6 deletions vllm/model_executor/models/qwen.py
@@ -1028,12 +1028,7 @@ class QWenLLM(QWenBaseModel):
embedding_modules = {}
embedding_padding_modules = []

default_bitsandbytes_target_modules = [
".c_attn.",
".c_proj.",
".w1.",
".w2.",
]
# BitandBytes specific attributes
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"w2": ("gate_up_proj", 0),
9 changes: 0 additions & 9 deletions vllm/model_executor/models/qwen2.py
@@ -419,15 +419,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
embedding_padding_modules = []

# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
".down_proj.",
".up_proj.",
".q_proj.",
".k_proj.",
".v_proj.",
".o_proj.",
]
bitsandbytes_stacked_params_mapping = {
# shard_name, weight_name, index
"q_proj": ("qkv_proj", 0),
