diff --git a/tests/conftest.py b/tests/conftest.py
index 15affc4776a47..5be3d2e225670 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -14,9 +14,8 @@
 import torch.nn.functional as F
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import (AutoModelForCausalLM,
-                          AutoModelForSequenceClassification, AutoTokenizer,
-                          AutoConfig, BatchEncoding, BatchFeature)
+from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
+                          BatchFeature)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 
 from tests.models.utils import (TokensTextLogprobs,
@@ -272,16 +271,6 @@ def __init__(
                 ).to(dtype=torch_dtype))
         else:
             model_kwargs = model_kwargs if model_kwargs is not None else {}
-            config = AutoConfig.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                trust_remote_code=True,
-            )
-            arch = config.architectures
-            if len(arch) > 0:
-                cls_type = arch[0].split("For")[-1]
-                auto_cls = eval(f"AutoModelFor{cls_type}")
-
             self.model = self.wrap_device(
                 auto_cls.from_pretrained(
                     model_name,
diff --git a/tests/models/decoder_only/language/test_cls_models.py b/tests/models/decoder_only/language/test_cls_models.py
index 352f8a4e74188..daf6461b38dcb 100644
--- a/tests/models/decoder_only/language/test_cls_models.py
+++ b/tests/models/decoder_only/language/test_cls_models.py
@@ -7,10 +7,9 @@
 """
 import pytest
 import torch
+from transformers import AutoModelForSequenceClassification
 
-CLASSIFICATION_MODELS = [
-    "jason9693/Qwen2.5-1.5B-apeach"
-]
+CLASSIFICATION_MODELS = ["jason9693/Qwen2.5-1.5B-apeach"]
 
 
 @pytest.mark.parametrize("model", CLASSIFICATION_MODELS)
@@ -22,7 +21,9 @@ def test_classification_models(
     model: str,
     dtype: str,
 ) -> None:
-    with hf_runner(model, dtype=dtype) as hf_model:
+    with hf_runner(model,
+                   dtype=dtype,
+                   auto_cls=AutoModelForSequenceClassification) as hf_model:
         hf_outputs = hf_model.classify(example_prompts)
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index f5e6d394329c7..91605417e730f 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -92,7 +92,8 @@
     "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"),
     "MistralModel": ("llama", "LlamaEmbeddingModel"),
     "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
-    "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"),
+    "Qwen2ForSequenceClassification": (
+        "qwen2_cls", "Qwen2ForSequenceClassification"),
     # [Multimodal]
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
 }