From f3ce9b0bbb83c80c8a5f0d5d98109cfc0e4989bd Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 23 Oct 2024 14:24:32 -0600 Subject: [PATCH 1/3] fix crash with llama 3.2 vision models and guided decoding Signed-off-by: Travis Johnson Co-authored-by: pavlo-ruban --- .../guided_decoding/outlines_logits_processors.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index c28bd71c9f682..dc821e7269d6d 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -79,6 +79,10 @@ def __call__(self, input_ids: List[int], mask = torch.full((scores.shape[-1], ), -math.inf, device=scores.device) + # the tokenizer may support more token ids than the model can generate, + # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256 + # but scores.shape == torch.Size([128256]) + allowed_tokens = [t for t in allowed_tokens if t < scores.shape[-1]] mask[allowed_tokens] = 0 scores.add_(mask) return scores From 0a58760193c052b460ca0ed568681147497726ae Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 23 Oct 2024 17:05:57 -0600 Subject: [PATCH 2/3] vectorize allowed token check Signed-off-by: Travis Johnson --- .../guided_decoding/outlines_logits_processors.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index dc821e7269d6d..cb8ae37ba48f3 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -15,11 +15,11 @@ # limitations under the License. import copy import json -import math from collections import defaultdict from functools import lru_cache from typing import Callable, DefaultDict, Dict, List, Union +import numpy as np import torch from lark import Lark from outlines import grammars @@ -77,12 +77,14 @@ def __call__(self, input_ids: List[int], f"Unsupported instruction type {type(instruction)}") mask = torch.full((scores.shape[-1], ), - -math.inf, + -torch.inf, device=scores.device) - # the tokenizer may support more token ids than the model can generate, + # The tokenizer may support more token ids than the model can generate, # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256 # but scores.shape == torch.Size([128256]) - allowed_tokens = [t for t in allowed_tokens if t < scores.shape[-1]] + # Using NumPy is faster for filtering token ids + allowed_tokens = np.array(allowed_tokens) + allowed_tokens = allowed_tokens[allowed_tokens < scores.shape[-1]] mask[allowed_tokens] = 0 scores.add_(mask) return scores From dcefbf0c85f729433bbb9348663e29d61a5d7268 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 24 Oct 2024 17:52:50 -0700 Subject: [PATCH 3/3] further optimization --- .../guided_decoding/outlines_logits_processors.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index cb8ae37ba48f3..e1309c31f77e7 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -83,9 +83,11 @@ def __call__(self, input_ids: List[int], # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256 # but scores.shape == torch.Size([128256]) # Using NumPy is faster for filtering token ids - allowed_tokens = np.array(allowed_tokens) - allowed_tokens = allowed_tokens[allowed_tokens < scores.shape[-1]] - mask[allowed_tokens] = 0 + allowed_tokens = np.array(allowed_tokens, dtype=np.int64) + allowed_tokens = torch.tensor(allowed_tokens, device=scores.device) + allowed_tokens = allowed_tokens.masked_select( + allowed_tokens < scores.shape[-1]) + mask.index_fill_(0, allowed_tokens, 0) scores.add_(mask) return scores