[https://nvbugs/5375594][fix] fix oom issue on structural_tag test case (#6838)

nv-guomingz · syuoni · web-flow · commit 3e46624f098f · 2025-08-13T10:09:35.000-04:00
Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Co-authored-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -43,7 +43,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bf
 examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931)
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624)
 examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371)
-test_e2e.py::test_openai_chat_structural_tag_example SKIP (https://nvbugspro.nvidia.com/bug/5375594)
 cpp/test_e2e.py::test_model[fp8-chatglm-90] SKIP (https://nvbugs/5034830)
 full:B200_PCIe/unittest/trt/functional SKIP (Disable for Blackwell)
 full:B200_PCIe/unittest/trt/quantization SKIP (Disable for Blackwell)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_json.py b/tests/unittest/llmapi/apps/_test_openai_chat_json.py
@@ -26,11 +26,7 @@ def temp_extra_llm_api_options_file(request):
     temp_dir = tempfile.gettempdir()
     temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
     try:
-        extra_llm_api_options_dict = {
-            "guided_decoding_backend": "xgrammar",
-            "disable_overlap_scheduler":
-            True,  # Guided decoding is not supported with overlap scheduler
-        }
+        extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
 
         with open(temp_file_path, "w") as f:
             yaml.dump(extra_llm_api_options_dict, f)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -1,25 +1,28 @@
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/aae6927be06dedbda39c6b0c30f6aa3242b84388/tests/entrypoints/openai/test_chat.py
+import json
 import os
+import re
 import tempfile
 
+import jsonschema
 import openai
 import pytest
 import yaml
 
-from ..test_llm import get_model_path, similar
+from ..test_llm import get_model_path
 from .openai_server import RemoteOpenAIServer
 
 pytestmark = pytest.mark.threadleak(enabled=False)
 
 
-@pytest.fixture(scope="module", ids=["TinyLlama-1.1B-Chat"])
+@pytest.fixture(scope="module")
 def model_name():
     return "llama-3.1-model/Llama-3.1-8B-Instruct"
 
 
 @pytest.fixture(scope="module")
-def temp_extra_llm_api_options_file(request):
+def temp_extra_llm_api_options_file():
     temp_dir = tempfile.gettempdir()
     temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
     try:
@@ -37,7 +40,12 @@ def temp_extra_llm_api_options_file(request):
 @pytest.fixture(scope="module")
 def server(model_name: str, temp_extra_llm_api_options_file: str):
     model_path = get_model_path(model_name)
-    args = ["--extra_llm_api_options", temp_extra_llm_api_options_file]
+
+    # Use small max_batch_size/max_seq_len/max_num_tokens to avoid OOM on A10/A30 GPUs.
+    args = [
+        "--max_batch_size=8", "--max_seq_len=1024", "--max_num_tokens=1024",
+        f"--extra_llm_api_options={temp_extra_llm_api_options_file}"
+    ]
     with RemoteOpenAIServer(model_path, args) as remote_server:
         yield remote_server
 
@@ -112,12 +120,7 @@ def tool_get_current_date():
 
 def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
                              tool_get_current_weather, tool_get_current_date):
-    messages = [
-        {
-            "role":
-            "system",
-            "content":
-            f"""
+    system_prompt = f"""
 # Tool Instructions
 - Always execute python code in messages that you share.
 - When looking for real time information use relevant functions if available else fallback to brave_search
@@ -140,20 +143,24 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
 - Only call one function at a time
 - Put the entire function call reply on one line
 - Always add your sources when using search results to answer the user query
-You are a helpful assistant.""",
+You are a helpful assistant."""
+    user_prompt = "You are in New York. Please get the current date and time, and the weather."
+
+    messages = [
+        {
+            "role": "system",
+            "content": system_prompt,
         },
         {
-            "role":
-            "user",
-            "content":
-            "You are in New York. Please get the current date and time, and the weather.",
+            "role": "user",
+            "content": user_prompt,
         },
     ]
 
     chat_completion = client.chat.completions.create(
         model=model_name,
         messages=messages,
-        max_completion_tokens=100,
+        max_completion_tokens=256,
         response_format={
             "type":
             "structural_tag",
@@ -173,11 +180,18 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
             "triggers": ["<function="],
         },
     )
-    assert chat_completion.id is not None
-    assert len(chat_completion.choices) == 1
+
     message = chat_completion.choices[0].message
     assert message.content is not None
     assert message.role == "assistant"
 
-    reference = '<function=get_current_date>{"timezone": "America/New_York"}</function>\n<function=get_current_weather>{"city": "New York", "state": "NY", "unit": "fahrenheit"}</function>\n\nSources:\n- get_current_date function\n- get_current_weather function'
-    assert similar(chat_completion.choices[0].message.content, reference)
+    match = re.search(r'<function=get_current_weather>([\S\s]+?)</function>',
+                      message.content)
+    params = json.loads(match.group(1))
+    jsonschema.validate(params,
+                        tool_get_current_weather["function"]["parameters"])
+
+    match = re.search(r'<function=get_current_date>([\S\s]+?)</function>',
+                      message.content)
+    params = json.loads(match.group(1))
+    jsonschema.validate(params, tool_get_current_date["function"]["parameters"])