Commit c3334dc

Merge branch 'main' into embedding-adapter
Signed-off-by: DarkLight1337 <[email protected]>
DarkLight1337 committed Nov 30, 2024
2 parents f7d8c05 + e7cfc4e commit c3334dc
Showing 16 changed files with 219 additions and 178 deletions (only five of the file diffs are reproduced below).
28 changes: 1 addition & 27 deletions vllm/config.py
@@ -418,17 +418,11 @@ def _parse_quant_hf_config(self):

def _verify_quantization(self) -> None:
supported_quantization = QUANTIZATION_METHODS
rocm_supported_quantization = [
"awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
"fbgemm_fp8", "gguf"
]
optimized_quantization_methods = [
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
"awq_marlin", "fbgemm_fp8", "compressed_tensors",
"compressed-tensors", "experts_int8"
]
tpu_supported_quantization = ["tpu_int8"]
neuron_supported_quantization = ["neuron_quant"]
if self.quantization is not None:
self.quantization = self.quantization.lower()

@@ -463,32 +457,12 @@ def _verify_quantization(self) -> None:
raise ValueError(
f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}.")
if current_platform.is_rocm(
) and self.quantization not in rocm_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in ROCm.")
if current_platform.is_tpu(
) and self.quantization not in tpu_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in TPU Backend.")
current_platform.verify_quantization(self.quantization)
if self.quantization not in optimized_quantization_methods:
logger.warning(
"%s quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.", self.quantization)
if (self.quantization == "awq" and current_platform.is_rocm()
and not envs.VLLM_USE_TRITON_AWQ):
logger.warning(
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
" is not set, enabling VLLM_USE_TRITON_AWQ.")
envs.VLLM_USE_TRITON_AWQ = True
if current_platform.is_neuron(
) and self.quantization not in neuron_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in Neuron Backend.")

def _verify_cuda_graph(self) -> None:
if self.max_seq_len_to_capture is None:
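
The config.py change above collapses the per-backend quantization checks into a single current_platform.verify_quantization(self.quantization) call, so each platform owns its own allow-list and any backend-specific side effects. A minimal sketch of how such a hook can be structured, purely illustrative since the real vLLM Platform interface may differ in names and details:

from typing import ClassVar


class BasePlatform:
    # An empty list means "no platform-specific restriction".
    supported_quantization: ClassVar[list[str]] = []

    @classmethod
    def verify_quantization(cls, quant: str) -> None:
        if cls.supported_quantization and quant not in cls.supported_quantization:
            raise ValueError(f"{quant} quantization is currently not "
                             f"supported on {cls.__name__}.")


class TpuLikePlatform(BasePlatform):
    supported_quantization = ["tpu_int8"]


class RocmLikePlatform(BasePlatform):
    supported_quantization = [
        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
        "fbgemm_fp8", "gguf"
    ]

    @classmethod
    def verify_quantization(cls, quant: str) -> None:
        super().verify_quantization(quant)
        # Backend-specific side effects (e.g. enabling VLLM_USE_TRITON_AWQ for
        # AWQ on ROCm) can live here instead of in ModelConfig.
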
6 changes: 4 additions & 2 deletions vllm/executor/neuron_executor.py
@@ -29,11 +29,13 @@ def _init_worker(self):
wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
self.driver_worker = wrapper.init_worker(
wrapper.init_worker(
vllm_config=self.vllm_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method)
distributed_init_method=distributed_init_method,
)
self.driver_worker = wrapper.worker
self.driver_worker.init_device()
self.driver_worker.load_model()

3 changes: 2 additions & 1 deletion vllm/executor/openvino_executor.py
@@ -36,7 +36,7 @@ def _init_worker(self):

distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
self.driver_worker = wrapper.init_worker(
wrapper.init_worker(
ov_core=ov.Core(),
vllm_config=self.vllm_config,
local_rank=0,
@@ -45,6 +45,7 @@ def _init_worker(self):
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=True,
)
self.driver_worker = wrapper.worker
self.driver_worker.init_device()
self.driver_worker.load_model()

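
Both executor changes above follow the same pattern: WorkerWrapperBase.init_worker no longer returns the worker, so the executor reads it back from wrapper.worker after initialization. A minimal sketch of that calling convention with a simplified stand-in wrapper (the real WorkerWrapperBase takes vLLM-specific arguments and lazily imports the worker class):

class DummyWorker:
    """Placeholder for a backend-specific worker."""

    def __init__(self, **kwargs) -> None:
        self.kwargs = kwargs

    def init_device(self) -> None:
        print("init_device:", self.kwargs)

    def load_model(self) -> None:
        print("load_model")


class WorkerWrapper:
    """Simplified stand-in for WorkerWrapperBase: init_worker() constructs the
    worker and stores it on the wrapper instead of returning it."""

    def __init__(self) -> None:
        self.worker = None

    def init_worker(self, **kwargs) -> None:
        self.worker = DummyWorker(**kwargs)


# Caller side, mirroring the neuron/openvino executor diffs above:
wrapper = WorkerWrapper()
wrapper.init_worker(local_rank=0, rank=0)
driver_worker = wrapper.worker  # read from the wrapper rather than returned
driver_worker.init_device()
driver_worker.load_model()
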
92 changes: 47 additions & 45 deletions vllm/model_executor/models/idefics3.py
@@ -267,54 +267,56 @@ def input_processor_for_idefics3(ctx: InputContext,
n_images_in_text = []

text = inputs.get("prompt")
if text is not None:
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, "
"or a list of strings")

fake_image_token = processor.fake_image_token.content
image_token = processor.image_token.content
global_img_token = processor.global_image_tag

prompt_strings = []
for sample, sample_rows, sample_cols in zip(text, image_rows,
image_cols):
n_images_in_text.append(sample.count(image_token))

# Replace the image token with fake tokens around the expanded
# image token sequence of length `image_seq_len`
image_prompt_strings = []
for n_rows, n_cols in zip(sample_rows, sample_cols):
image_prompt_string = _get_image_prompt_string(
n_rows,
n_cols,
processor.image_seq_len,
image_token=image_token,
fake_token_around_image=fake_image_token,
global_img_token=global_img_token,
)
image_prompt_strings.append(image_prompt_string)

split_sample = sample.split(image_token)
if len(split_sample) == 0:
raise ValueError(
"The image token should be present in the text.")
if text is None:
prompt_token_ids = inputs.get("prompt_token_ids", [])
assert prompt_token_ids
text = tokenizer.decode(prompt_token_ids)

if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, "
"or a list of strings")

fake_image_token = processor.fake_image_token.content
image_token = processor.image_token.content
global_img_token = processor.global_image_tag

prompt_strings = []
for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
n_images_in_text.append(sample.count(image_token))

# Replace the image token with fake tokens around the expanded
# image token sequence of length `image_seq_len`
image_prompt_strings = []
for n_rows, n_cols in zip(sample_rows, sample_cols):
image_prompt_string = _get_image_prompt_string(
n_rows,
n_cols,
processor.image_seq_len,
image_token=image_token,
fake_token_around_image=fake_image_token,
global_img_token=global_img_token,
)
image_prompt_strings.append(image_prompt_string)

# Place in the image prompt strings where the image tokens are
sample = split_sample[0]
for i, image_prompt_string in enumerate(image_prompt_strings):
sample += image_prompt_string + split_sample[i + 1]
prompt_strings.append(sample)
split_sample = sample.split(image_token)
if len(split_sample) == 0:
raise ValueError("The image token should be present in the text.")

prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids
# Place in the image prompt strings where the image tokens are
sample = split_sample[0]
for i, image_prompt_string in enumerate(image_prompt_strings):
sample += image_prompt_string + split_sample[i + 1]
prompt_strings.append(sample)

return token_inputs(
prompt_token_ids=prompt_token_ids,
prompt=prompt_strings[0],
multi_modal_data=multi_modal_data,
)
prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids

return token_inputs(
prompt_token_ids=prompt_token_ids,
prompt=prompt_strings[0],
multi_modal_data=multi_modal_data,
)


def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int:
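
The main behavioral change in idefics3.py is the new fallback at the top of the processor: when a request carries only prompt_token_ids and no prompt text, the token IDs are decoded back to text so the image-token expansion still runs instead of being skipped. A minimal sketch of just that fallback, using a hypothetical helper name and a Hugging Face-style tokenizer.decode:

from typing import Any, Mapping


def resolve_prompt_text(inputs: Mapping[str, Any], tokenizer) -> str:
    """Return the prompt text, decoding prompt_token_ids when no text is given.

    Mirrors the `if text is None:` branch in the diff above; this helper name
    does not exist in vLLM and is only for illustration.
    """
    text = inputs.get("prompt")
    if text is None:
        prompt_token_ids = inputs.get("prompt_token_ids", [])
        assert prompt_token_ids, "need either a prompt or prompt_token_ids"
        # Decoding recovers a string containing the image placeholder tokens,
        # so the same expansion logic can be applied uniformly afterwards.
        text = tokenizer.decode(prompt_token_ids)
    return text
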
17 changes: 16 additions & 1 deletion vllm/model_executor/models/llama.py
@@ -51,7 +51,8 @@
from vllm.sequence import IntermediateTensors

from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix)

@@ -111,6 +112,7 @@ def __init__(
prefix: str = "",
) -> None:
super().__init__()
layer_idx = extract_layer_index(prefix)
self.hidden_size = hidden_size
tp_size = get_tensor_model_parallel_world_size()
self.total_num_heads = num_heads
@@ -165,13 +167,26 @@ def __init__(
rope_scaling=rope_scaling,
is_neox_style=is_neox_style,
)

if hasattr(config, "interleaved_sliding_window"):
if isinstance(config.interleaved_sliding_window, int):
sliding_window = config.interleaved_sliding_window
elif isinstance(config.interleaved_sliding_window, list):
sw_idx = layer_idx % len(config.interleaved_sliding_window)
sliding_window = config.interleaved_sliding_window[sw_idx]
else:
raise ValueError(f"{type(sliding_window)} is not supported.")
else:
sliding_window = None

self.attn = Attention(
self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
cache_config=cache_config,
quant_config=quant_config,
per_layer_sliding_window=sliding_window,
prefix=f"{prefix}.attn",
)

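
The llama.py change wires up optional per-layer sliding-window attention: extract_layer_index(prefix) recovers the layer number from the module prefix, and config.interleaved_sliding_window (a single int, or a list cycled by layer index) determines the window passed to Attention via per_layer_sliding_window. A small self-contained sketch of that selection logic; the function name and the example pattern are illustrative only:

from typing import Optional, Sequence, Union


def select_sliding_window(
    layer_idx: int,
    interleaved_sliding_window: Union[int, Sequence[Optional[int]], None],
) -> Optional[int]:
    """Pick the sliding-window size for one decoder layer.

    A single int applies to every layer; a sequence is cycled over the layer
    index, which is how an interleaved local/global attention pattern can be
    expressed. None disables sliding-window attention for that layer.
    """
    if interleaved_sliding_window is None:
        return None
    if isinstance(interleaved_sliding_window, int):
        return interleaved_sliding_window
    sw_idx = layer_idx % len(interleaved_sliding_window)
    return interleaved_sliding_window[sw_idx]


# With a hypothetical 4-layer repeating pattern, three windowed layers are
# followed by one full-attention layer:
pattern = [4096, 4096, 4096, None]
assert select_sliding_window(2, pattern) == 4096
assert select_sliding_window(3, pattern) is None
assert select_sliding_window(7, pattern) is None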