diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 0fe88e792520a..3c2ca1bddd906 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -19,7 +19,7 @@ MODELS = [ "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-1B", ] TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index c3e3835aff0af..51aec8c873d12 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -16,7 +16,7 @@ MODELS = [ "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-1B", ] diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index a5df5639cf948..d7f36a7812802 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -2,5 +2,5 @@ def test_cpu_offload(): - compare_two_settings("meta-llama/Llama-2-7b-hf", [], - ["--cpu-offload-gb", "4"]) + compare_two_settings("meta-llama/Llama-3.2-1B", [], + ["--cpu-offload-gb", "1"]) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index b6ec7413978f4..77c56d91d0a8b 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -13,8 +13,7 @@ @pytest.mark.parametrize( "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", [ - ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate", - True), + ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True), ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", ["--quantization", "compressed-tensors" ], 1, 1, "FLASH_ATTN", "generate", True), diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index b57348a4d9a58..fc66386fd2d2a 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -8,7 +8,7 @@ def test_chat(): - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") prompt1 = "Explain the concept of entropy." messages = [ @@ -26,7 +26,7 @@ def test_chat(): def test_multi_chat(): - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") prompt1 = "Explain the concept of entropy." prompt2 = "Explain what among us is." diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index a29747603622b..d1aebbd70d256 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -16,9 +16,6 @@ # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, but we're not testing -# generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 25ab91ef69333..6fcc92022855b 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -6,7 +6,7 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "meta-llama/Llama-3.2-1B" @pytest.mark.asyncio diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index f5d9569046a63..2412da5037ece 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -46,9 +46,10 @@ def test_filter_subtensors(): @pytest.fixture(scope="module") def llama_2_7b_files(): with TemporaryDirectory() as cache_dir: - input_dir = snapshot_download("meta-llama/Llama-2-7b-hf", + input_dir = snapshot_download("meta-llama/Llama-3.2-1B", cache_dir=cache_dir, - ignore_patterns="*.bin*") + ignore_patterns=["*.bin*", "original/*"]) + yield input_dir @@ -58,9 +59,12 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): # Dump worker states to output directory llm_sharded_writer.llm_engine.model_executor.save_sharded_state( path=output_dir) + # Copy metadata files to output directory for file in os.listdir(input_dir): - if not any(file.endswith(ext) for ext in weights_patterns): + if not any( + file.endswith(ext) and not os.path.isdir(file) + for ext in weights_patterns): shutil.copy(f"{input_dir}/{file}", output_dir)