Commit c3334dc

Merge branch 'main' into embedding-adapter
Signed-off-by: DarkLight1337 <[email protected]>
DarkLight1337 committed Nov 30, 2024
2 parents f7d8c05 + e7cfc4e commit c3334dc
Showing 16 changed files with 219 additions and 178 deletions (only five of the file diffs are reproduced below).
28 changes: 1 addition & 27 deletions vllm/config.py
@@ -418,17 +418,11 @@ def _parse_quant_hf_config(self):

def _verify_quantization(self) -> None:
supported_quantization = QUANTIZATION_METHODS
rocm_supported_quantization = [
"awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
"fbgemm_fp8", "gguf"
]
optimized_quantization_methods = [
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
"awq_marlin", "fbgemm_fp8", "compressed_tensors",
"compressed-tensors", "experts_int8"
]
tpu_supported_quantization = ["tpu_int8"]
neuron_supported_quantization = ["neuron_quant"]
if self.quantization is not None:
self.quantization = self.quantization.lower()

@@ -463,32 +457,12 @@ def _verify_quantization(self) -> None:
raise ValueError(
f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}.")
if current_platform.is_rocm(
) and self.quantization not in rocm_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in ROCm.")
if current_platform.is_tpu(
) and self.quantization not in tpu_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in TPU Backend.")
current_platform.verify_quantization(self.quantization)
if self.quantization not in optimized_quantization_methods:
logger.warning(
"%s quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models.", self.quantization)
if (self.quantization == "awq" and current_platform.is_rocm()
and not envs.VLLM_USE_TRITON_AWQ):
logger.warning(
"Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ"
" is not set, enabling VLLM_USE_TRITON_AWQ.")
envs.VLLM_USE_TRITON_AWQ = True
if current_platform.is_neuron(
) and self.quantization not in neuron_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in Neuron Backend.")

def _verify_cuda_graph(self) -> None:
if self.max_seq_len_to_capture is None:
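
The config.py change above collapses the per-backend quantization checks into a single current_platform.verify_quantization(self.quantization) call, so each platform owns its own allow-list and any backend-specific side effects. A minimal sketch of how such a hook can be structured, purely illustrative since the real vLLM Platform interface may differ in names and details:

from typing import ClassVar


class BasePlatform:
    # An empty list means "no platform-specific restriction".
    supported_quantization: ClassVar[list[str]] = []

    @classmethod
    def verify_quantization(cls, quant: str) -> None:
        if cls.supported_quantization and quant not in cls.supported_quantization:
            raise ValueError(f"{quant} quantization is currently not "
                             f"supported on {cls.__name__}.")


class TpuLikePlatform(BasePlatform):
    supported_quantization = ["tpu_int8"]


class RocmLikePlatform(BasePlatform):
    supported_quantization = [
        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
        "fbgemm_fp8", "gguf"
    ]

    @classmethod
    def verify_quantization(cls, quant: str) -> None:
        super().verify_quantization(quant)
        # Backend-specific side effects (e.g. enabling VLLM_USE_TRITON_AWQ for
        # AWQ on ROCm) can live here instead of in ModelConfig.
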
6 changes: 4 additions & 2 deletions vllm/executor/neuron_executor.py
@@ -29,11 +29,13 @@ def _init_worker(self):
wrapper = WorkerWrapperBase(vllm_config=self.vllm_config)
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
self.driver_worker = wrapper.init_worker(
wrapper.init_worker(
vllm_config=self.vllm_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method)
distributed_init_method=distributed_init_method,
)
self.driver_worker = wrapper.worker
self.driver_worker.init_device()
self.driver_worker.load_model()

3 changes: 2 additions & 1 deletion vllm/executor/openvino_executor.py
@@ -36,7 +36,7 @@ def _init_worker(self):

distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
self.driver_worker = wrapper.init_worker(
wrapper.init_worker(
ov_core=ov.Core(),
vllm_config=self.vllm_config,
local_rank=0,
@@ -45,6 +45,7 @@ def _init_worker(self):
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=True,
)
self.driver_worker = wrapper.worker
self.driver_worker.init_device()
self.driver_worker.load_model()

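
Both executor changes above follow the same pattern: WorkerWrapperBase.init_worker no longer returns the worker, so the executor reads it back from wrapper.worker after initialization. A minimal sketch of that calling convention with a simplified stand-in wrapper (the real WorkerWrapperBase takes vLLM-specific arguments and lazily imports the worker class):

class DummyWorker:
    """Placeholder for a backend-specific worker."""

    def __init__(self, **kwargs) -> None:
        self.kwargs = kwargs

    def init_device(self) -> None:
        print("init_device:", self.kwargs)

    def load_model(self) -> None:
        print("load_model")


class WorkerWrapper:
    """Simplified stand-in for WorkerWrapperBase: init_worker() constructs the
    worker and stores it on the wrapper instead of returning it."""

    def __init__(self) -> None:
        self.worker = None

    def init_worker(self, **kwargs) -> None:
        self.worker = DummyWorker(**kwargs)


# Caller side, mirroring the neuron/openvino executor diffs above:
wrapper = WorkerWrapper()
wrapper.init_worker(local_rank=0, rank=0)
driver_worker = wrapper.worker  # read from the wrapper rather than returned
driver_worker.init_device()
driver_worker.load_model()
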
92 changes: 47 additions & 45 deletions vllm/model_executor/models/idefics3.py
@@ -267,54 +267,56 @@ def input_processor_for_idefics3(ctx: InputContext,
n_images_in_text = []

text = inputs.get("prompt")
if text is not None:
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, "
"or a list of strings")

fake_image_token = processor.fake_image_token.content
image_token = processor.image_token.content
global_img_token = processor.global_image_tag

prompt_strings = []
for sample, sample_rows, sample_cols in zip(text, image_rows,
image_cols):
n_images_in_text.append(sample.count(image_token))

# Replace the image token with fake tokens around the expanded
# image token sequence of length `image_seq_len`
image_prompt_strings = []
for n_rows, n_cols in zip(sample_rows, sample_cols):
image_prompt_string = _get_image_prompt_string(
n_rows,
n_cols,
processor.image_seq_len,
image_token=image_token,
fake_token_around_image=fake_image_token,
global_img_token=global_img_token,
)
image_prompt_strings.append(image_prompt_string)

split_sample = sample.split(image_token)
if len(split_sample) == 0:
raise ValueError(
"The image token should be present in the text.")
if text is None:
prompt_token_ids = inputs.get("prompt_token_ids", [])
assert prompt_token_ids
text = tokenizer.decode(prompt_token_ids)

if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, "
"or a list of strings")

fake_image_token = processor.fake_image_token.content
image_token = processor.image_token.content
global_img_token = processor.global_image_tag

prompt_strings = []
for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
n_images_in_text.append(sample.count(image_token))

# Replace the image token with fake tokens around the expanded
# image token sequence of length `image_seq_len`
image_prompt_strings = []
for n_rows, n_cols in zip(sample_rows, sample_cols):
image_prompt_string = _get_image_prompt_string(
n_rows,
n_cols,
processor.image_seq_len,
image_token=image_token,
fake_token_around_image=fake_image_token,
global_img_token=global_img_token,
)
image_prompt_strings.append(image_prompt_string)

# Place in the image prompt strings where the image tokens are
sample = split_sample[0]
for i, image_prompt_string in enumerate(image_prompt_strings):
sample += image_prompt_string + split_sample[i + 1]
prompt_strings.append(sample)
split_sample = sample.split(image_token)
if len(split_sample) == 0:
raise ValueError("The image token should be present in the text.")

prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids
# Place in the image prompt strings where the image tokens are
sample = split_sample[0]
for i, image_prompt_string in enumerate(image_prompt_strings):
sample += image_prompt_string + split_sample[i + 1]
prompt_strings.append(sample)

return token_inputs(
prompt_token_ids=prompt_token_ids,
prompt=prompt_strings[0],
multi_modal_data=multi_modal_data,
)
prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids

return token_inputs(
prompt_token_ids=prompt_token_ids,
prompt=prompt_strings[0],
multi_modal_data=multi_modal_data,
)


def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int:
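
The main behavioral change in idefics3.py is the new fallback at the top of the processor: when a request carries only prompt_token_ids and no prompt text, the token IDs are decoded back to text so the image-token expansion still runs instead of being skipped. A minimal sketch of just that fallback, using a hypothetical helper name and a Hugging Face-style tokenizer.decode:

from typing import Any, Mapping


def resolve_prompt_text(inputs: Mapping[str, Any], tokenizer) -> str:
    """Return the prompt text, decoding prompt_token_ids when no text is given.

    Mirrors the `if text is None:` branch in the diff above; this helper name
    does not exist in vLLM and is only for illustration.
    """
    text = inputs.get("prompt")
    if text is None:
        prompt_token_ids = inputs.get("prompt_token_ids", [])
        assert prompt_token_ids, "need either a prompt or prompt_token_ids"
        # Decoding recovers a string containing the image placeholder tokens,
        # so the same expansion logic can be applied uniformly afterwards.
        text = tokenizer.decode(prompt_token_ids)
    return text
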
17 changes: 16 additions & 1 deletion vllm/model_executor/models/llama.py
@@ -51,7 +51,8 @@
from vllm.sequence import IntermediateTensors

from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix)

@@ -111,6 +112,7 @@ def __init__(
prefix: str = "",
) -> None:
super().__init__()
layer_idx = extract_layer_index(prefix)
self.hidden_size = hidden_size
tp_size = get_tensor_model_parallel_world_size()
self.total_num_heads = num_heads
@@ -165,13 +167,26 @@ def __init__(
rope_scaling=rope_scaling,
is_neox_style=is_neox_style,
)

if hasattr(config, "interleaved_sliding_window"):
if isinstance(config.interleaved_sliding_window, int):
sliding_window = config.interleaved_sliding_window
elif isinstance(config.interleaved_sliding_window, list):
sw_idx = layer_idx % len(config.interleaved_sliding_window)
sliding_window = config.interleaved_sliding_window[sw_idx]
else:
raise ValueError(f"{type(sliding_window)} is not supported.")
else:
sliding_window = None

self.attn = Attention(
self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
cache_config=cache_config,
quant_config=quant_config,
per_layer_sliding_window=sliding_window,
prefix=f"{prefix}.attn",
)

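
The llama.py change wires up optional per-layer sliding-window attention: extract_layer_index(prefix) recovers the layer number from the module prefix, and config.interleaved_sliding_window (a single int, or a list cycled by layer index) determines the window passed to Attention via per_layer_sliding_window. A small self-contained sketch of that selection logic; the function name and the example pattern are illustrative only:

from typing import Optional, Sequence, Union


def select_sliding_window(
    layer_idx: int,
    interleaved_sliding_window: Union[int, Sequence[Optional[int]], None],
) -> Optional[int]:
    """Pick the sliding-window size for one decoder layer.

    A single int applies to every layer; a sequence is cycled over the layer
    index, which is how an interleaved local/global attention pattern can be
    expressed. None disables sliding-window attention for that layer.
    """
    if interleaved_sliding_window is None:
        return None
    if isinstance(interleaved_sliding_window, int):
        return interleaved_sliding_window
    sw_idx = layer_idx % len(interleaved_sliding_window)
    return interleaved_sliding_window[sw_idx]


# With a hypothetical 4-layer repeating pattern, three windowed layers are
# followed by one full-attention layer:
pattern = [4096, 4096, 4096, None]
assert select_sliding_window(2, pattern) == 4096
assert select_sliding_window(3, pattern) is None
assert select_sliding_window(7, pattern) is None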