diff --git a/docs/source/examples/configs.rst b/docs/source/examples/configs.rst
index ef13092d63..5c00459c05 100644
--- a/docs/source/examples/configs.rst
+++ b/docs/source/examples/configs.rst
@@ -102,7 +102,7 @@ keyword arguments not specified in the config if we'd like:
 
     # Tokenizer is needed for the dataset, configure it first
     tokenizer:
-      _component_: torchtune.models.llama2_tokenizer
+      _component_: torchtune.models.llama2.llama2_tokenizer
       path: /tmp/tokenizer.model
 
     dataset:
diff --git a/docs/source/examples/finetune_llm.rst b/docs/source/examples/finetune_llm.rst
index 440ed9d429..c7b9e09eb3 100644
--- a/docs/source/examples/finetune_llm.rst
+++ b/docs/source/examples/finetune_llm.rst
@@ -30,7 +30,7 @@ An example config for training the Llama 7B model using the Alpaca dataset looks
 
     # Tokenizer
     tokenizer:
-      _component_: torchtune.models.llama2_tokenizer
+      _component_: torchtune.models.llama2.llama2_tokenizer
       path: /tmp/tokenizer.model
 
     # Dataset
@@ -40,7 +40,7 @@ An example config for training the Llama 7B model using the Alpaca dataset looks
 
     # Model Arguments
     model:
-      _component_: torchtune.models.llama2_7b
+      _component_: torchtune.models.llama2.llama2_7b
     model_checkpoint: /tmp/llama2-7b
 
     # Fine-tuning arguments
diff --git a/docs/source/examples/first_finetune_tutorial.rst b/docs/source/examples/first_finetune_tutorial.rst
index 5c0c4dd547..87b2cc302b 100644
--- a/docs/source/examples/first_finetune_tutorial.rst
+++ b/docs/source/examples/first_finetune_tutorial.rst
@@ -97,7 +97,7 @@ lowering the epochs to 1 so you can see results sooner, and updating the learnin
 
   # Tokenizer
   tokenizer:
-    _component_: torchtune.models.llama2_tokenizer
+    _component_: torchtune.models.llama2.llama2_tokenizer
     path: /tmp/tokenizer.model
 
   # Dataset
@@ -108,7 +108,7 @@ lowering the epochs to 1 so you can see results sooner, and updating the learnin
 
   # Model Arguments
   model:
-    _component_: torchtune.models.llama2_7b
+    _component_: torchtune.models.llama2.llama2_7b
   model_checkpoint: /tmp/llama2/native_pytorch_model.pt
 
   # Fine-tuning arguments
diff --git a/recipes/configs/alpaca_llama2_full_finetune.yaml b/recipes/configs/alpaca_llama2_full_finetune.yaml
index a2318ac51d..5598ed5021 100644
--- a/recipes/configs/alpaca_llama2_full_finetune.yaml
+++ b/recipes/configs/alpaca_llama2_full_finetune.yaml
@@ -5,7 +5,7 @@
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.llama2_tokenizer
+  _component_: torchtune.models.llama2.llama2_tokenizer
   path: /tmp/llama2/tokenizer.model
 
 # Dataset
@@ -17,7 +17,7 @@ shuffle: True
 
 # Model Arguments
 model:
-  _component_: torchtune.models.llama2_7b
+  _component_: torchtune.models.llama2.llama2_7b
 model_checkpoint: /tmp/llama2_native
 
 # Fine-tuning arguments
diff --git a/recipes/configs/alpaca_llama2_generate.yaml b/recipes/configs/alpaca_llama2_generate.yaml
index 2dacb09125..2c4a3f9781 100644
--- a/recipes/configs/alpaca_llama2_generate.yaml
+++ b/recipes/configs/alpaca_llama2_generate.yaml
@@ -5,12 +5,12 @@
 
 # Model arguments
 model:
-  _component_: torchtune.models.llama2_7b
+  _component_: torchtune.models.llama2.llama2_7b
 model_checkpoint: /tmp/llama2_native
 
 # Tokenizer arguments
 tokenizer:
-  _component_: torchtune.models.llama2_tokenizer
+  _component_: torchtune.models.llama2.llama2_tokenizer
   path: /tmp/llama2/tokenizer.model
 
 # Generation arguments
diff --git a/recipes/configs/alpaca_llama2_lora_finetune.yaml b/recipes/configs/alpaca_llama2_lora_finetune.yaml
index 8b0560fbde..02f41f8aa2 100644
--- a/recipes/configs/alpaca_llama2_lora_finetune.yaml
+++ b/recipes/configs/alpaca_llama2_lora_finetune.yaml
@@ -5,7 +5,7 @@
 
 # Model Arguments
 model:
-  _component_: torchtune.models.lora_llama2_7b
+  _component_: torchtune.models.llama2.lora_llama2_7b
   lora_attn_modules: ['q_proj', 'v_proj']
   lora_rank: 8
   lora_alpha: 16
@@ -15,7 +15,7 @@ lora_checkpoint: null
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.llama2_tokenizer
+  _component_: torchtune.models.llama2.llama2_tokenizer
   path: /tmp/llama2/tokenizer.model
 
 # Dataset and Sampler
diff --git a/recipes/tests/test_alpaca_generate.py b/recipes/tests/test_alpaca_generate.py
index a2c0a2a65d..08a7d5fdd8 100644
--- a/recipes/tests/test_alpaca_generate.py
+++ b/recipes/tests/test_alpaca_generate.py
@@ -36,19 +36,19 @@ class TestAlpacaGenerateRecipe:
     def _fetch_ckpt_model_path(self, ckpt) -> str:
         if ckpt == "small_test_ckpt":
             return "/tmp/test-artifacts/small-ckpt-01242024"
-        if ckpt == "llama2_7b":
+        if ckpt == "llama2.llama2_7b":
             return "/tmp/test-artifacts/llama2-7b-01242024"
         raise ValueError(f"Unknown ckpt {ckpt}")
 
     def test_alpaca_generate(self, capsys, pytestconfig):
         large_scale = pytestconfig.getoption("--large-scale")
-        ckpt = "llama2_7b" if large_scale else "small_test_ckpt"
+        ckpt = "llama2.llama2_7b" if large_scale else "small_test_ckpt"
 
         kwargs_values = {
             "model": {"_component_": f"torchtune.models.{ckpt}"},
             "model_checkpoint": self._fetch_ckpt_model_path(ckpt),
             "tokenizer": {
-                "_component_": "torchtune.models.llama2_tokenizer",
+                "_component_": "torchtune.models.llama2.llama2_tokenizer",
                 "path": "/tmp/test-artifacts/tokenizer.model",
             },
             "instruction": "Answer the question.",
diff --git a/recipes/tests/test_full_finetune.py b/recipes/tests/test_full_finetune.py
index d08ed24d79..7f4c849a85 100644
--- a/recipes/tests/test_full_finetune.py
+++ b/recipes/tests/test_full_finetune.py
@@ -51,13 +51,13 @@ def _fetch_expected_loss_values(self, ckpt) -> Dict[str, float]:
         }
         if ckpt == "small_test_ckpt":
             return small_test_ckpt_loss_values
-        if ckpt == "llama2_7b":
+        if ckpt == "llama2.llama2_7b":
             return llama2_7b_ckpt_loss_values
         raise ValueError(f"Unknown ckpt {ckpt}")
 
     def test_loss(self, capsys, pytestconfig):
         large_scale = pytestconfig.getoption("--large-scale")
-        ckpt = "llama2_7b" if large_scale else "small_test_ckpt"
+        ckpt = "llama2.llama2_7b" if large_scale else "small_test_ckpt"
         expected_loss_values = self._fetch_expected_loss_values(ckpt)
 
         kwargs_values = default_recipe_kwargs(ckpt)
@@ -93,7 +93,7 @@ def test_training_state_on_resume(self):
                     "model": {"_component_": f"torchtune.models.{model_ckpt}"},
                     "model_checkpoint": fetch_ckpt_model_path(model_ckpt),
                     "tokenizer": {
-                        "_component_": "torchtune.models.llama2_tokenizer",
+                        "_component_": "torchtune.models.llama2.llama2_tokenizer",
                         "path": "/tmp/test-artifacts/tokenizer.model",
                     },
                     "epochs": 4,
@@ -127,7 +127,7 @@ def test_training_state_on_resume(self):
                     "model": {"_component_": f"torchtune.models.{model_ckpt}"},
                     "model_checkpoint": os.path.join(tmpdirname, "model_2.ckpt"),
                     "tokenizer": {
-                        "_component_": "torchtune.models.llama2_tokenizer",
+                        "_component_": "torchtune.models.llama2.llama2_tokenizer",
                         "path": "/tmp/test-artifacts/tokenizer.model",
                     },
                     "epochs": 4,
@@ -228,7 +228,7 @@ def test_gradient_accumulation(
             "model": {"_component_": f"torchtune.models.{model_ckpt}"},
             "model_checkpoint": None,
             "tokenizer": {
-                "_component_": "torchtune.models.llama2_tokenizer",
+                "_component_": "torchtune.models.llama2.llama2_tokenizer",
                 "path": "/tmp/test-artifacts/tokenizer.model",
             },
             "batch_size": full_batch_size,
diff --git a/recipes/tests/utils.py b/recipes/tests/utils.py
index 8a4d742323..db6625841a 100644
--- a/recipes/tests/utils.py
+++ b/recipes/tests/utils.py
@@ -89,7 +89,7 @@ def default_recipe_kwargs(ckpt):
         "model": {"_component_": f"torchtune.models.{ckpt}"},
         "model_checkpoint": fetch_ckpt_model_path(ckpt),
         "tokenizer": {
-            "_component_": "torchtune.models.llama2_tokenizer",
+            "_component_": "torchtune.models.llama2.llama2_tokenizer",
             "path": "/tmp/test-artifacts/tokenizer.model",
         },
         "batch_size": 8,
diff --git a/tests/torchtune/config/test_utils.py b/tests/torchtune/config/test_utils.py
index c4e3fc044a..6a27bfdfd4 100644
--- a/tests/torchtune/config/test_utils.py
+++ b/tests/torchtune/config/test_utils.py
@@ -13,7 +13,7 @@ def test_get_component_from_path(self):
         good_paths = [
             "torchtune",  # Test single module without dot
             "torchtune.models",  # Test dotpath for a module
-            "torchtune.models.llama2_7b",  # Test dotpath for an object
+            "torchtune.models.llama2.llama2_7b",  # Test dotpath for an object
         ]
         for path in good_paths:
             _ = _get_component_from_path(path)
diff --git a/torchtune/models/__init__.py b/torchtune/models/__init__.py
new file mode 100644
index 0000000000..a4fcc101f4
--- /dev/null
+++ b/torchtune/models/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torchtune.models import llama2
+
+__all__ = [
+    "llama2",  # referenced in configs as torchtune.models.llama2.<component>
+]