From 1875e93e604255529092858166a22b4ad1a779b1 Mon Sep 17 00:00:00 2001
From: "Brandon T. Willard" <brandonwillard@users.noreply.github.com>
Date: Tue, 10 Sep 2024 12:30:18 -0500
Subject: [PATCH] Set tokenizer versions in tests

---
 tests/fsm/test_regex.py                     | 17 ++++++++++-------
 tests/generate/test_integration_llamacpp.py |  5 ++++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/tests/fsm/test_regex.py b/tests/fsm/test_regex.py
index f7aa4ae67..824588b22 100644
--- a/tests/fsm/test_regex.py
+++ b/tests/fsm/test_regex.py
@@ -402,15 +402,18 @@ def test_create_fsm_index_end_to_end_multi_byte():
 
 
 @pytest.mark.parametrize(
-    "hf_tokenizer_uri",
+    "hf_tokenizer_uri, revision",
     [
-        "gpt2",
-        "microsoft/phi-2",
-        "Qwen/Qwen1.5-0.5B-Chat",
-        "NousResearch/Hermes-2-Pro-Llama-3-8B",
+        ("openai-community/gpt2", "607a30d783dfa663caf39e06633721c8d4cfcd7e"),
+        ("microsoft/phi-2", "ef382358ec9e382308935a992d908de099b64c23"),
+        ("Qwen/Qwen1.5-0.5B-Chat", "4d14e384a4b037942bb3f3016665157c8bcb70ea"),
+        (
+            "NousResearch/Hermes-2-Pro-Llama-3-8B",
+            "783fd50eb82d7f57758de033861f54d62dde234f",
+        ),
     ],
 )
-def test_create_fsm_index_tokenizer(hf_tokenizer_uri):
+def test_create_fsm_index_tokenizer(hf_tokenizer_uri, revision):
     # The combined regular expressions of a lexer state in a Python grammar
     regex_str = "(?:(?:[0-9](?:(?:_)?[0-9])*(?:e|E)(?:(?:\\+|\\-))?[0-9](?:(?:_)?[0-9])*|(?:[0-9](?:(?:_)?[0-9])*\\.(?:[0-9](?:(?:_)?[0-9])*)?|\\.[0-9](?:(?:_)?[0-9])*)(?:(?:e|E)(?:(?:\\+|\\-))?[0-9](?:(?:_)?[0-9])*)?)|[0-9](?:(?:_)?[0-9])*)(?:J|j)|(?:[0-9](?:(?:_)?[0-9])*(?:e|E)(?:(?:\\+|\\-))?[0-9](?:(?:_)?[0-9])*|(?:[0-9](?:(?:_)?[0-9])*\\.(?:[0-9](?:(?:_)?[0-9])*)?|\\.[0-9](?:(?:_)?[0-9])*)(?:(?:e|E)(?:(?:\\+|\\-))?[0-9](?:(?:_)?[0-9])*)?)|0(?:x|X)(?:(?:_)?(?:[0-9]|[a-f]|[A-F]))+|0(?:b|B)(?:(?:_)?[0-1])+|0(?:o|O)(?:(?:_)?[0-7])+|(?:(?i:([ubf]?r?|r[ubf])('([^\\\\']|.)*?'))|(?i:([ubf]?r?|r[ubf])(\"([^\\\"]|.)*?\")))|(?:(?:\r?\n[\t ]*|#[^\n]*))+|[1-9](?:(?:_)?[0-9])*|\\\\[\t \x0c]*\r?\n|continue|nonlocal|assert|global|import|lambda|return|async|await|break|class|False|match|raise|while|yield|case|from|None|pass|True|with|def|del|for|not|try|if|[^\\W\\d]\\w*|#[^\n]*|[\t \x0c]+|\\.\\.\\.|@|\\{|\\(|\\[|\\-|\\+|\\*|\\~"
 
@@ -425,7 +428,7 @@ def test_create_fsm_index_tokenizer(hf_tokenizer_uri):
     num_bytes_fsm_states = len(bytes_fsm.states)
     assert num_bytes_fsm_states == 235
 
-    tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_uri)
+    tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_uri, revision=revision)
     tokenizer = TransformerTokenizer(tokenizer)
 
     states_to_token_subsets, empty_token_ids = create_fsm_index_tokenizer(
diff --git a/tests/generate/test_integration_llamacpp.py b/tests/generate/test_integration_llamacpp.py
index 3469dcbc0..0a98f0226 100644
--- a/tests/generate/test_integration_llamacpp.py
+++ b/tests/generate/test_integration_llamacpp.py
@@ -329,6 +329,9 @@ def test_RegexGuide_caching(model, temp_cache_dir):
     assert structured != structured_2
 
 
+@pytest.mark.xfail(
+    reason="Some versions of the Hermes-2-Pro-Llama-3 model have a broken config"
+)
 def test_tokenizer_vocabulary_decode_sanity():
     """Assert the decoded newline token (198) is the same as the normalized vocab token"""
     import llama_cpp
@@ -337,7 +340,7 @@ def test_tokenizer_vocabulary_decode_sanity():
         "bartowski/Meta-Llama-3-8B-Instruct-GGUF",
         "Meta-Llama-3-8B-Instruct-IQ1_M.gguf",
         tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-            "NousResearch/Hermes-2-Pro-Llama-3-8B"
+            "NousResearch/Hermes-2-Pro-Llama-3-8B",
         ),
     )
     tokenizer = generate.regex(model, "a").logits_processor.tokenizer