From f3ce9b0bbb83c80c8a5f0d5d98109cfc0e4989bd Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Wed, 23 Oct 2024 14:24:32 -0600
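Subject: [PATCH 1/3] fix crash with llama 3.2 vision models and guided
 decoding

The tokenizer can define token ids that fall outside the model's logits,
e.g. Llama 3.2 Vision has an `<|image|>` token with id 128256 while the
scores tensor has only 128256 entries. Drop such ids from the allowed
tokens before indexing the mask so the index stays in bounds.

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>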
Co-authored-by: pavlo-ruban <pavlo.ruban@servicenow.com>
---
 .../guided_decoding/outlines_logits_processors.py             | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index c28bd71c9f682..dc821e7269d6d 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -79,6 +79,10 @@ def __call__(self, input_ids: List[int],
         mask = torch.full((scores.shape[-1], ),
                           -math.inf,
                           device=scores.device)
+        # the tokenizer may support more token ids than the model can generate,
+        # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256
+        # but scores.shape == torch.Size([128256])
+        allowed_tokens = [t for t in allowed_tokens if t < scores.shape[-1]]
         mask[allowed_tokens] = 0
         scores.add_(mask)
         return scores

From 0a58760193c052b460ca0ed568681147497726ae Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Wed, 23 Oct 2024 17:05:57 -0600
Subject: [PATCH 2/3] vectorize allowed token check

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 .../guided_decoding/outlines_logits_processors.py      | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index dc821e7269d6d..cb8ae37ba48f3 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -15,11 +15,11 @@
 # limitations under the License.
 import copy
 import json
-import math
 from collections import defaultdict
 from functools import lru_cache
 from typing import Callable, DefaultDict, Dict, List, Union
 
+import numpy as np
 import torch
 from lark import Lark
 from outlines import grammars
@@ -77,12 +77,14 @@ def __call__(self, input_ids: List[int],
                 f"Unsupported instruction type {type(instruction)}")
 
         mask = torch.full((scores.shape[-1], ),
-                          -math.inf,
+                          -torch.inf,
                           device=scores.device)
-        # the tokenizer may support more token ids than the model can generate,
+        # The tokenizer may support more token ids than the model can generate,
         # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256
         # but scores.shape == torch.Size([128256])
-        allowed_tokens = [t for t in allowed_tokens if t < scores.shape[-1]]
+        # Using NumPy is faster for filtering token ids
+        allowed_tokens = np.array(allowed_tokens)
+        allowed_tokens = allowed_tokens[allowed_tokens < scores.shape[-1]]
         mask[allowed_tokens] = 0
         scores.add_(mask)
         return scores

From dcefbf0c85f729433bbb9348663e29d61a5d7268 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Thu, 24 Oct 2024 17:52:50 -0700
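Subject: [PATCH 3/3] further optimization

Build the allowed token ids as an int64 tensor on the scores device,
filter out-of-range ids with masked_select, and zero the mask entries
with index_fill_ rather than indexing the mask with a NumPy array.

---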
 .../guided_decoding/outlines_logits_processors.py         | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index cb8ae37ba48f3..e1309c31f77e7 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -83,9 +83,11 @@ def __call__(self, input_ids: List[int],
         # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256
         # but scores.shape == torch.Size([128256])
         # Using NumPy is faster for filtering token ids
-        allowed_tokens = np.array(allowed_tokens)
-        allowed_tokens = allowed_tokens[allowed_tokens < scores.shape[-1]]
-        mask[allowed_tokens] = 0
+        allowed_tokens = np.array(allowed_tokens, dtype=np.int64)
+        allowed_tokens = torch.tensor(allowed_tokens, device=scores.device)
+        allowed_tokens = allowed_tokens.masked_select(
+            allowed_tokens < scores.shape[-1])
+        mask.index_fill_(0, allowed_tokens, 0)
         scores.add_(mask)
         return scores