diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 0fe88e792520a..3c2ca1bddd906 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -19,7 +19,7 @@
 
 MODELS = [
     "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-3.2-1B",
 ]
 
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index c3e3835aff0af..51aec8c873d12 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -16,7 +16,7 @@
 
 MODELS = [
     "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-3.2-1B",
 ]
 
 
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index a5df5639cf948..d7f36a7812802 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -2,5 +2,5 @@
 
 
 def test_cpu_offload():
-    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
-                         ["--cpu-offload-gb", "4"])
+    compare_two_settings("meta-llama/Llama-3.2-1B", [],
+                         ["--cpu-offload-gb", "1"])
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index b6ec7413978f4..77c56d91d0a8b 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -13,8 +13,7 @@
 @pytest.mark.parametrize(
     "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
     [
-        ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate",
-         True),
+        ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True),
         ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
          ["--quantization", "compressed-tensors"
           ], 1, 1, "FLASH_ATTN", "generate", True),
diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index b57348a4d9a58..fc66386fd2d2a 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -8,7 +8,7 @@
 
 
 def test_chat():
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
 
     prompt1 = "Explain the concept of entropy."
     messages = [
@@ -26,7 +26,7 @@ def test_chat():
 
 
 def test_multi_chat():
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
 
     prompt1 = "Explain the concept of entropy."
     prompt2 = "Explain what among us is."
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index a29747603622b..d1aebbd70d256 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -16,9 +16,6 @@
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index 25ab91ef69333..6fcc92022855b 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -6,7 +6,7 @@
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "meta-llama/Llama-3.2-1B"
 
 
 @pytest.mark.asyncio
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index f5d9569046a63..2412da5037ece 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -46,9 +46,10 @@ def test_filter_subtensors():
 @pytest.fixture(scope="module")
 def llama_2_7b_files():
     with TemporaryDirectory() as cache_dir:
-        input_dir = snapshot_download("meta-llama/Llama-2-7b-hf",
+        input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
                                       cache_dir=cache_dir,
-                                      ignore_patterns="*.bin*")
+                                      ignore_patterns=["*.bin*", "original/*"])
+
         yield input_dir
 
 
@@ -58,9 +59,12 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
     # Dump worker states to output directory
     llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
         path=output_dir)
+
     # Copy metadata files to output directory
     for file in os.listdir(input_dir):
-        if not any(file.endswith(ext) for ext in weights_patterns):
+        # shutil.copy() cannot copy directories, so skip them explicitly.
+        is_dir = os.path.isdir(os.path.join(input_dir, file))
+        if not is_dir and not any(file.endswith(e) for e in weights_patterns):
             shutil.copy(f"{input_dir}/{file}", output_dir)