vllm-project · fabianlim · Dec 5, 2024 · Dec 5, 2024 · Dec 5, 2024 · Dec 8, 2024
@@ -6,7 +6,8 @@
 
 from ...utils import check_outputs_equal
 
-MODELS = ["ai21labs/Jamba-tiny-dev"]
+# The tests in this file are run against each of these hybrid models.
+MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-fms/Bamba-9.8b-1.8T-hf"]
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -140,7 +141,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [15])
 def test_parallel_sampling(
     vllm_runner,
@@ -243,17 +244,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
     dtype: str,
     example_prompts,
 ) -> None:
-    # This test is for verifying that the Jamba inner state management doesn't
+    # This test is for verifying that the hybrid inner state management doesn't
     # collapse in case where the number of incoming requests and
     # finished_requests_ids is larger than the maximum mamba block capacity.
-    # This could generally happen due to the fact that Jamba does support
+    # This could generally happen because hybrid models support a
     # statelessness mechanism where it can cleanup new incoming requests in
     # a single step.
     try:
         with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
             vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
     except ValueError:
-        pytest.fail("Jamba inner state wasn't cleaned up properly between"
+        pytest.fail("Hybrid inner state wasn't cleaned up properly between"
                     "steps finished requests registered unnecessarily ")
 
 
@@ -265,14 +266,14 @@ def test_state_cleanup(
     dtype: str,
     example_prompts,
 ) -> None:
-    # This test is for verifying that the Jamba state is cleaned up between
+    # This test is for verifying that the hybrid state is cleaned up between
     # steps, If its not cleaned, an error would be expected.
     try:
         with vllm_runner(model, dtype=dtype) as vllm_model:
             for _ in range(10):
                 vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
     except ValueError:
-        pytest.fail("Jamba inner state wasn't cleaned up between states, "
+        pytest.fail("Hybrid inner state wasn't cleaned up between states, "
                     "could be related to finished_requests_ids")
 
 
@@ -318,7 +319,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [64])
-def test_jamba_distributed_produces_identical_generation(
+def test_hybrid_distributed_produces_identical_generation(
         vllm_runner, model: str, dtype: str, max_tokens: int,
         example_prompts) -> None: