From ef4018181e9c2e29d001af0ee8acaeb68de9c13c Mon Sep 17 00:00:00 2001
From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Date: Wed, 23 Oct 2024 03:23:31 +0200
Subject: [PATCH 1/7] updates for PyTorch 2.5 (#408)

* updated Dockerfile

* updated MHA implementations for PT 2.5

* fixed typo

* update installation instruction

* Update setup/03_optional-docker-environment/.devcontainer/Dockerfile

---------

Co-authored-by: rasbt <mail@sebastianraschka.com>
---
 .../mha-implementations.ipynb                 | 76 +++++++------------
 .../.devcontainer/Dockerfile                  |  7 +-
 2 files changed, 32 insertions(+), 51 deletions(-)
diff --git a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
index fb490da3..a1d074be 100644
--- a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
+++ b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
@@ -22,50 +22,6 @@
     "</table>"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "1HABx0Hr3PDD",
-   "metadata": {
-    "id": "1HABx0Hr3PDD"
-   },
-   "source": [
-    "Uncomment and execute the following code cell to install the dependencies:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "qPnVNAOxwy5s",
-   "metadata": {
-    "id": "qPnVNAOxwy5s"
-   },
-   "outputs": [],
-   "source": [
-    "# pip install -r https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/requirements.txt"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "LYLcq3403Yq6",
-   "metadata": {
-    "id": "LYLcq3403Yq6"
-   },
-   "source": [
-    "Uncomment and execute the following code cell to install the PyTorch nightly dependency if you want to run the FlexAttention benchmarks (this is required because FlexAttention is not yet included in the latest PyTorch release):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "gAgYvxm_xVct",
-   "metadata": {
-    "id": "gAgYvxm_xVct"
-   },
-   "outputs": [],
-   "source": [
-    "# pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 -U"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "6f678e62-7bcb-4405-86ae-dce94f494303",
@@ -119,6 +75,28 @@
     "embeddings = torch.randn((batch_size, context_len, embed_dim), device=device)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "LYLcq3403Yq6",
+   "metadata": {
+    "id": "LYLcq3403Yq6"
+   },
+   "source": [
+    "- To run all the code in this notebook, please ensure you update to at least PyTorch 2.5 (FlexAttention is not included in earlier PyTorch releases)\n",
+    "If the code cell above shows a PyTorch version lower than 2.5, you can upgrade your PyTorch installation by uncommenting and running the following code cell (Please note that PyTorch 2.5 requires Python 3.9 or later)\n",
+    "- For more specific instructions and CUDA versions, please refer to the official installation guide at https://pytorch.org."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1db27f43-86f4-478f-89df-fbc2182a129b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pip install --upgrade torch torchvision torchaudio"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "2f9bb1b6-a1e5-4e0a-884d-0f31b374a8d6",
@@ -964,16 +942,16 @@
     "## 9) Using PyTorch's FlexAttention\n",
     "\n",
     "- See [FlexAttention: The Flexibility of PyTorch with the Performance of FlashAttention](https://pytorch.org/blog/flexattention/) to learn more about FlexAttention\n",
-    "- This is currently only supported in PyTorch 2.5 (nightly), which you can install on a CPU machine via\n",
+    "- This is supported starting from PyTorch 2.5, which you can install on a CPU machine via\n",
     "\n",
     "    ```bash\n",
-    "    pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu -U\n",
+    "    pip install torch torchvision torchaudio\n",
     "    ```\n",
     "\n",
-    "- To install PyTorch nighly on a GPU machine, use the following (for more information, also see the installation menu on [pytorch.org](https://pytorch.org/))\n",
+    "- To install PyTorch on a GPU machine, use the following (for more information, also see the installation menu on [pytorch.org](https://pytorch.org/))\n",
     "\n",
     "    ```bash\n",
-    "    pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 -U\n",
+    "    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124\n",
     "    ```"
    ]
   },
@@ -2001,7 +1979,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/setup/03_optional-docker-environment/.devcontainer/Dockerfile b/setup/03_optional-docker-environment/.devcontainer/Dockerfile
index 7e0f697d..79194330 100644
--- a/setup/03_optional-docker-environment/.devcontainer/Dockerfile
+++ b/setup/03_optional-docker-environment/.devcontainer/Dockerfile
@@ -1,5 +1,7 @@
-FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
+# Install PyTorch 2.5 with CUDA 12.4
+FROM pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
 
+# Install Ubuntu packages
 RUN apt-get update && \
     apt-get upgrade -y && \
     apt-get install -y rsync && \
@@ -7,6 +9,7 @@ RUN apt-get update && \
     apt-get install -y curl && \
     rm -rf /var/lib/apt/lists/*
 
+# Install Python packages
 COPY requirements.txt requirements.txt
-
+RUN pip install --no-cache-dir --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt

From 4f9c9fb703c66ccda9535bb0acd63f20388604ec Mon Sep 17 00:00:00 2001
From: Sebastian Raschka <mail@sebastianraschka.com>
Date: Wed, 23 Oct 2024 07:48:33 -0500
Subject: [PATCH 2/7] Update tests.py

---
 ch05/07_gpt_to_llama/tests/tests.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/ch05/07_gpt_to_llama/tests/tests.py b/ch05/07_gpt_to_llama/tests/tests.py
index e753ae74..6620b4ea 100644
--- a/ch05/07_gpt_to_llama/tests/tests.py
+++ b/ch05/07_gpt_to_llama/tests/tests.py
@@ -1,3 +1,10 @@
+# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+# Source for "Build a Large Language Model From Scratch"
+#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
+# Code: https://github.com/rasbt/LLMs-from-scratch
+
+# File for internal use (unit tests)
+
 import io
 import os
 import sys
@@ -8,14 +15,6 @@
 from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
 
 
-# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
-# Source for "Build a Large Language Model From Scratch"
-#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
-# Code: https://github.com/rasbt/LLMs-from-scratch
-
-# File for internal use (unit tests)
-
-
 @pytest.fixture(scope="module")
 def notebook():
     def import_definitions_from_notebook(notebooks):

From 7cd6a670ed16f657d81f03a78e70684cf0b4f7f7 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka <mail@sebastianraschka.com>
Date: Wed, 23 Oct 2024 18:07:49 -0500
Subject: [PATCH 3/7] RoPE updates (#412)

* RoPE updates

* Apply suggestions from code review

* updates

* updates

* updates
---
 .../converting-gpt-to-llama2.ipynb            |   8 +-
 .../converting-llama2-to-llama3.ipynb         |   8 +-
 ch05/07_gpt_to_llama/standalone-llama32.ipynb |   4 +-
 ch05/07_gpt_to_llama/tests/Untitled.ipynb     |  74 +++++++++++
 .../tests/test-requirements-extra.txt         |   3 +-
 ch05/07_gpt_to_llama/tests/tests.py           | 118 +++++++++++++++++-
 6 files changed, 202 insertions(+), 13 deletions(-)
 create mode 100644 ch05/07_gpt_to_llama/tests/Untitled.ipynb

diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
index e8c5bf68..e7f459ea 100644
--- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
+++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
@@ -426,7 +426,7 @@
     "    assert head_dim % 2 == 0, \"Embedding dimension must be even\"\n",
     "\n",
     "    # Compute the inverse frequencies\n",
-    "    inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim // 2) / (head_dim // 2)))\n",
+    "    inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))\n",
     "\n",
     "    # Generate position indices\n",
     "    positions = torch.arange(context_length)\n",
@@ -493,8 +493,8 @@
     "\n",
     "# Dummy query and key tensors\n",
     "torch.manual_seed(123)\n",
-    "queries = torch.randn(batch_size, context_len, num_heads, head_dim)\n",
-    "keys = torch.randn(batch_size, context_len, num_heads, head_dim)\n",
+    "queries = torch.randn(batch_size, num_heads, context_len, head_dim)\n",
+    "keys = torch.randn(batch_size, num_heads, context_len, head_dim)\n",
     "\n",
     "# Apply rotary position embeddings\n",
     "queries_rot = compute_rope(queries, cos, sin)\n",
@@ -1691,7 +1691,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.6"
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
index 4b4459fc..bf62d9fc 100644
--- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
+++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
@@ -278,7 +278,7 @@
     "    assert head_dim % 2 == 0, \"Embedding dimension must be even\"\n",
     "\n",
     "    # Compute the inverse frequencies\n",
-    "    inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim // 2) / (head_dim // 2)))\n",
+    "    inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))\n",
     "\n",
     "    ################################ NEW ###############################################\n",
     "    # Frequency adjustments\n",
@@ -383,8 +383,8 @@
     "\n",
     "# Dummy query and key tensors\n",
     "torch.manual_seed(123)\n",
-    "queries = torch.randn(batch_size, llama_3_context_len, num_heads, head_dim)\n",
-    "keys = torch.randn(batch_size, llama_3_context_len, num_heads, head_dim)\n",
+    "queries = torch.randn(batch_size, num_heads, llama_3_context_len, head_dim)\n",
+    "keys = torch.randn(batch_size, num_heads, llama_3_context_len, head_dim)\n",
     "\n",
     "# Apply rotary position embeddings\n",
     "queries_rot = compute_rope(queries, cos, sin)\n",
@@ -2701,7 +2701,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.6"
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
index 4201f959..b3d80c9e 100644
--- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb
+++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
@@ -133,7 +133,7 @@
     "    assert head_dim % 2 == 0, \"Embedding dimension must be even\"\n",
     "\n",
     "    # Compute the inverse frequencies\n",
-    "    inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim // 2) / (head_dim // 2)))\n",
+    "    inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))\n",
     "\n",
     "    # Frequency adjustments\n",
     "    if freq_config is not None:\n",
@@ -1061,7 +1061,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.6"
   }
  },
  "nbformat": 4,
diff --git a/ch05/07_gpt_to_llama/tests/Untitled.ipynb b/ch05/07_gpt_to_llama/tests/Untitled.ipynb
new file mode 100644
index 00000000..1375a9e9
--- /dev/null
+++ b/ch05/07_gpt_to_llama/tests/Untitled.ipynb
@@ -0,0 +1,74 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "40d2405d-ee10-44ad-b20e-cf32078f926a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True | head dim: 1, tensor([]), tensor([])\n",
+      "True | head dim: 2, tensor([1.]), tensor([1.])\n",
+      "True | head dim: 3, tensor([1.]), tensor([1.])\n",
+      "True | head dim: 4, tensor([1.0000, 0.0100]), tensor([1.0000, 0.0100])\n",
+      "False | head dim: 5, tensor([1.0000, 0.0100]), tensor([1.0000, 0.0251])\n",
+      "True | head dim: 6, tensor([1.0000, 0.0464, 0.0022]), tensor([1.0000, 0.0464, 0.0022])\n",
+      "False | head dim: 7, tensor([1.0000, 0.0464, 0.0022]), tensor([1.0000, 0.0720, 0.0052])\n",
+      "True | head dim: 8, tensor([1.0000, 0.1000, 0.0100, 0.0010]), tensor([1.0000, 0.1000, 0.0100, 0.0010])\n",
+      "False | head dim: 9, tensor([1.0000, 0.1000, 0.0100, 0.0010]), tensor([1.0000, 0.1292, 0.0167, 0.0022])\n",
+      "True | head dim: 10, tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04]), tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04])\n",
+      "False | head dim: 11, tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04]), tensor([1.0000, 0.1874, 0.0351, 0.0066, 0.0012])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "\n",
+    "theta_base = 10_000\n",
+    "\n",
+    "for head_dim in range(1, 12):\n",
+    "\n",
+    "    before = 1.0 / (theta_base ** (torch.arange(0, head_dim // 2) / (head_dim // 2)))\n",
+    "    after = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))\n",
+    "    \n",
+    "    s = f\"{torch.equal(before, after)} | head dim: {head_dim}, {before}, {after}\"\n",
+    "    print(s)\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0abfbf38-93a4-4994-8e7e-a543477268a8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt b/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt
index 8828ccea..2b9fd336 100644
--- a/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt
+++ b/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt
@@ -1 +1,2 @@
-transformers>=4.44.2
\ No newline at end of file
+transformers>=4.44.2
+litgpt>=0.5.0
\ No newline at end of file
diff --git a/ch05/07_gpt_to_llama/tests/tests.py b/ch05/07_gpt_to_llama/tests/tests.py
index 6620b4ea..395f9ec3 100644
--- a/ch05/07_gpt_to_llama/tests/tests.py
+++ b/ch05/07_gpt_to_llama/tests/tests.py
@@ -10,11 +10,82 @@
 import sys
 import types
 import nbformat
+from typing import Optional, Tuple
 import torch
 import pytest
 from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
 
 
+# LitGPT code from https://github.com/Lightning-AI/litgpt/blob/main/litgpt/model.py
+# LitGPT is licensed under Apache v2: https://github.com/Lightning-AI/litgpt/blob/main/LICENSE
+def litgpt_build_rope_cache(
+    seq_len: int,
+    n_elem: int,
+    device: Optional[torch.device] = None,
+    base: int = 10000,
+    condense_ratio: int = 1,
+    extra_config: Optional[dict] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Build the cosine and sine caches for rotary position embeddings (RoPE).
+
+    Args:
+        seq_len (int): Sequence length.
+        n_elem (int): Number of elements (head dimension).
+        device (torch.device, optional): Device for tensor allocations.
+        base (int, optional): Base for computing inverse frequencies.
+        condense_ratio (int, optional): Divisor applied to the position indices.
+        extra_config (dict, optional): Frequency-adjustment parameters (used by Llama 3.1 and 3.2); see the keys read below.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Cosine and sine caches, each of shape (seq_len, n_elem) for even n_elem.
+    """
+
+    # Compute the inverse frequencies theta (one per pair of head dimensions)
+    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))
+
+    if extra_config is not None:
+        orig_context_len = extra_config["original_max_seq_len"]
+        factor = extra_config["factor"]
+        low_freq_factor = extra_config["low_freq_factor"]
+        high_freq_factor = extra_config["high_freq_factor"]
+
+        wavelen = 2 * torch.pi / theta
+        ratio = orig_context_len / wavelen
+        smooth_factor = (ratio - low_freq_factor) / (high_freq_factor - low_freq_factor)
+        smooth_factor = torch.clamp(smooth_factor, min=0.0, max=1.0)
+
+        # Blend theta / factor and theta by smooth_factor instead of using masked indexing
+        adjusted_theta = (1 - smooth_factor) * (theta / factor) + smooth_factor * theta
+        theta = adjusted_theta
+
+    # Create position indices `[0, 1, ..., seq_len - 1]`, divided by condense_ratio
+    seq_idx = torch.arange(seq_len, device=device) / condense_ratio
+
+    # Outer product of position index and $\theta_i$, repeated twice along the last dimension
+    idx_theta = torch.outer(seq_idx, theta).repeat(1, 2)
+
+    return torch.cos(idx_theta), torch.sin(idx_theta)
+
+
+# LitGPT code from https://github.com/Lightning-AI/litgpt/blob/main/litgpt/model.py
+# LitGPT is licensed under Apache v2: https://github.com/Lightning-AI/litgpt/blob/main/LICENSE
+def litgpt_apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+    head_size = x.size(-1)
+    x1 = x[..., : head_size // 2]  # (B, nh, T, hs/2)
+    x2 = x[..., head_size // 2:]  # (B, nh, T, hs/2)
+    rotated = torch.cat((-x2, x1), dim=-1)  # (B, nh, T, hs)
+    if cos.dim() > 1:
+        # batch dimensions must align
+        # sin/cos are (..., T, hs), so we unsqueeze -3 to add the nh axis
+        # we count from back because all of apply_rope does
+        cos = cos.unsqueeze(-3)
+        sin = sin.unsqueeze(-3)
+
+    roped = (x * cos) + (rotated * sin)
+    return roped.to(dtype=x.dtype)
+
+
 @pytest.fixture(scope="module")
 def notebook():
     def import_definitions_from_notebook(notebooks):
@@ -84,21 +155,30 @@ def test_rope_llama2(notebook):
     queries_rot = this_nb.compute_rope(queries, cos, sin)
     keys_rot = this_nb.compute_rope(keys, cos, sin)
 
+    # Generate reference RoPE via HF
     rot_emb = LlamaRotaryEmbedding(
         dim=head_dim,
         max_position_embeddings=context_len,
         base=10_000
     )
-
     position_ids = torch.arange(context_len, dtype=torch.long).unsqueeze(0)
     ref_cos, ref_sin = rot_emb(queries, position_ids)
     ref_queries_rot, ref_keys_rot = apply_rotary_pos_emb(queries, keys, ref_cos, ref_sin)
-
     torch.testing.assert_close(sin, ref_sin.squeeze(0))
     torch.testing.assert_close(cos, ref_cos.squeeze(0))
     torch.testing.assert_close(keys_rot, ref_keys_rot)
     torch.testing.assert_close(queries_rot, ref_queries_rot)
 
+    # Generate reference RoPE via LitGPT
+    litgpt_cos, litgpt_sin = litgpt_build_rope_cache(context_len, n_elem=head_dim, base=10_000)
+    litgpt_queries_rot = litgpt_apply_rope(queries, litgpt_cos, litgpt_sin)
+    litgpt_keys_rot = litgpt_apply_rope(keys, litgpt_cos, litgpt_sin)
+
+    torch.testing.assert_close(sin, litgpt_sin)
+    torch.testing.assert_close(cos, litgpt_cos)
+    torch.testing.assert_close(keys_rot, litgpt_keys_rot)
+    torch.testing.assert_close(queries_rot, litgpt_queries_rot)
+
 
 def test_rope_llama3(notebook):
 
@@ -128,6 +208,7 @@ def test_rope_llama3(notebook):
     queries_rot = nb1.compute_rope(queries, cos, sin)
     keys_rot = nb1.compute_rope(keys, cos, sin)
 
+    # Generate reference RoPE via HF
     rot_emb = LlamaRotaryEmbedding(
         dim=head_dim,
         max_position_embeddings=context_len,
@@ -143,6 +224,16 @@ def test_rope_llama3(notebook):
     torch.testing.assert_close(keys_rot, ref_keys_rot)
     torch.testing.assert_close(queries_rot, ref_queries_rot)
 
+    # Generate reference RoPE via LitGPT
+    litgpt_cos, litgpt_sin = litgpt_build_rope_cache(context_len, n_elem=head_dim, base=theta_base)
+    litgpt_queries_rot = litgpt_apply_rope(queries, litgpt_cos, litgpt_sin)
+    litgpt_keys_rot = litgpt_apply_rope(keys, litgpt_cos, litgpt_sin)
+
+    torch.testing.assert_close(sin, litgpt_sin)
+    torch.testing.assert_close(cos, litgpt_cos)
+    torch.testing.assert_close(keys_rot, litgpt_keys_rot)
+    torch.testing.assert_close(queries_rot, litgpt_queries_rot)
+
 
 def test_rope_llama3_12(notebook):
 
@@ -180,6 +271,7 @@ def test_rope_llama3_12(notebook):
     queries_rot = nb1.compute_rope(queries, cos, sin)
     keys_rot = nb1.compute_rope(keys, cos, sin)
 
+    # Generate reference RoPE via HF
     hf_rope_params = {
         "factor": 8.0,
         "low_freq_factor": 1.0,
@@ -210,6 +302,28 @@ class RoPEConfig:
     torch.testing.assert_close(keys_rot, ref_keys_rot)
     torch.testing.assert_close(queries_rot, ref_queries_rot)
 
+    # Generate reference RoPE via LitGPT
+    litgpt_rope_config = {
+        "factor": 8.0,
+        "low_freq_factor": 1.0,
+        "high_freq_factor": 4.0,
+        "original_max_seq_len": 8192
+    }
+
+    litgpt_cos, litgpt_sin = litgpt_build_rope_cache(
+        context_len,
+        n_elem=head_dim,
+        base=rope_theta,
+        extra_config=litgpt_rope_config
+    )
+    litgpt_queries_rot = litgpt_apply_rope(queries, litgpt_cos, litgpt_sin)
+    litgpt_keys_rot = litgpt_apply_rope(keys, litgpt_cos, litgpt_sin)
+
+    torch.testing.assert_close(sin, litgpt_sin)
+    torch.testing.assert_close(cos, litgpt_cos)
+    torch.testing.assert_close(keys_rot, litgpt_keys_rot)
+    torch.testing.assert_close(queries_rot, litgpt_queries_rot)
+
 
 def test_silu(notebook):
     example_batch = torch.randn(2, 3, 4)

From e1dfd2cb7a43e987233dcc22049d4d2947824dc4 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka <mail@sebastianraschka.com>
Date: Wed, 23 Oct 2024 19:19:58 -0500
Subject: [PATCH 4/7] Update test-requirements-extra.txt

---
 ch05/07_gpt_to_llama/tests/test-requirements-extra.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt b/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt
index 2b9fd336..4f423290 100644
--- a/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt
+++ b/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt
@@ -1,2 +1,2 @@
+pytest>=8.1.1
 transformers>=4.44.2
-litgpt>=0.5.0
\ No newline at end of file

From d38083c40199fe7a8e3d9a5b4b20dc209371bd3d Mon Sep 17 00:00:00 2001
From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Date: Thu, 24 Oct 2024 14:40:08 +0200
Subject: [PATCH 5/7] Updated Llama 2 to 3 paths (#413)

* llama 2 and 3 path fixes

* updated llama 3, 3.1 and 3.2 paths

* updated .gitignore

* Typo fix

---------

Co-authored-by: Sebastian Raschka <mail@sebastianraschka.com>
---
 .gitignore                                    | 11 ++++++----
 .../converting-gpt-to-llama2.ipynb            |  2 +-
 .../converting-llama2-to-llama3.ipynb         | 20 +++++++++----------
 ch05/07_gpt_to_llama/standalone-llama32.ipynb | 10 +++++-----
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index f91cc5ce..4385a0ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,12 +35,15 @@ ch05/01_main-chapter-code/model.pth
 ch05/01_main-chapter-code/model_and_optimizer.pth
 ch05/03_bonus_pretraining_on_gutenberg/model_checkpoints
 ch05/06_user_interface/gpt2
+ch05/07_gpt_to_llama/.cache
 ch05/07_gpt_to_llama/Llama-2-7b
 ch05/07_gpt_to_llama/Llama-2-7b-chat
-ch05/07_gpt_to_llama/.cache
-ch05/07_gpt_to_llama/llama3-files
-ch05/07_gpt_to_llama/llama31-files
-ch05/07_gpt_to_llama/llama32-files
+ch05/07_gpt_to_llama/Llama-3-8B
+ch05/07_gpt_to_llama/Llama-3-8B-Instruct
+ch05/07_gpt_to_llama/Llama-3.1-8B
+ch05/07_gpt_to_llama/Llama-3.1-8B-Instruct
+ch05/07_gpt_to_llama/Llama-3.2-1B
+ch05/07_gpt_to_llama/Llama-3.2-1B-Instruct
 
 ch06/01_main-chapter-code/gpt2
 ch06/02_bonus_additional-experiments/gpt2
diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
index e7f459ea..1ff5a42b 100644
--- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
+++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
@@ -1189,7 +1189,7 @@
     "tokenizer_file = hf_hub_download(\n",
     "    repo_id=\"meta-llama/Llama-2-7b\",\n",
     "    filename=\"tokenizer.model\",\n",
-    "    local_dir=\"Llama-2-7B\"\n",
+    "    local_dir=\"Llama-2-7b\"\n",
     ")"
    ]
   },
diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
index bf62d9fc..1c0dc349 100644
--- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
+++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
@@ -1252,7 +1252,7 @@
     "tokenizer_file_path = hf_hub_download(\n",
     "    repo_id=\"meta-llama/Meta-Llama-3-8B\",\n",
     "    filename=\"original/tokenizer.model\",\n",
-    "    local_dir=\"llama3-files\"\n",
+    "    local_dir=\"Llama-3-8B\"\n",
     ")"
    ]
   },
@@ -1458,7 +1458,7 @@
     "    weights_file = hf_hub_download(\n",
     "        repo_id=\"meta-llama/Meta-Llama-3-8B\",\n",
     "        filename=f\"model-0000{i}-of-00004.safetensors\",\n",
-    "        local_dir=\"llama3-files\"\n",
+    "        local_dir=\"Llama-3-8B\"\n",
     "    )\n",
     "    current_weights = load_file(weights_file)\n",
     "    combined_weights.update(current_weights)"
@@ -1677,7 +1677,7 @@
     "id": "akyo7WNyF_YL"
    },
    "source": [
-    "- Above, we used the pretrained base model; if you want to use a model capable of following instructions, use the `\"meta-llama/Llama-3-8b-Instruct\"` model instead, as shown below"
+    "- Above, we used the pretrained base model; if you want to use a model capable of following instructions, use the `\"meta-llama/Meta-Llama-3-8B-Instruct\"` model instead, as shown below"
    ]
   },
   {
@@ -1824,7 +1824,7 @@
     "    weights_file = hf_hub_download(\n",
     "        repo_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
     "        filename=f\"model-0000{i}-of-00004.safetensors\",\n",
-    "        local_dir=\"llama3-files\"\n",
+    "        local_dir=\"Llama-3-8B-Instruct\"\n",
     "    )\n",
     "    current_weights = load_file(weights_file)\n",
     "    combined_weights.update(current_weights)\n",
@@ -2157,7 +2157,7 @@
     "tokenizer_file_path = hf_hub_download(\n",
     "    repo_id=\"meta-llama/Llama-3.1-8B\",\n",
     "    filename=\"original/tokenizer.model\",\n",
-    "    local_dir=\"llama31-files\"\n",
+    "    local_dir=\"Llama-3.1-8B\"\n",
     ")\n",
     "\n",
     "tokenizer = Tokenizer(tokenizer_file_path)"
@@ -2313,7 +2313,7 @@
     "    weights_file = hf_hub_download(\n",
     "        repo_id=\"meta-llama/Llama-3.1-8B\",\n",
     "        filename=f\"model-0000{i}-of-00004.safetensors\",\n",
-    "        local_dir=\"llama31-files\"\n",
+    "        local_dir=\"Llama-3.1-8B\"\n",
     "    )\n",
     "    current_weights = load_file(weights_file)\n",
     "    combined_weights.update(current_weights)\n",
@@ -2512,7 +2512,7 @@
     "tokenizer_file_path = hf_hub_download(\n",
     "    repo_id=\"meta-llama/Llama-3.2-1B\",\n",
     "    filename=\"original/tokenizer.model\",\n",
-    "    local_dir=\"llama32-files\"\n",
+    "    local_dir=\"Llama-3.2-1B\"\n",
     ")\n",
     "\n",
     "tokenizer = Tokenizer(tokenizer_file_path)"
@@ -2589,7 +2589,7 @@
     "weights_file = hf_hub_download(\n",
     "    repo_id=\"meta-llama/Llama-3.2-1B\",\n",
     "    filename=f\"model.safetensors\",\n",
-    "    local_dir=\"llama32-files\"\n",
+    "    local_dir=\"Llama-3.2-1B\"\n",
     ")\n",
     "current_weights = load_file(weights_file)\n",
     "\n",
@@ -2687,7 +2687,7 @@
    "provenance": []
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "pt",
    "language": "python",
    "name": "python3"
   },
@@ -2701,7 +2701,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.9"
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
index b3d80c9e..dd8fdf5f 100644
--- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb
+++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
@@ -733,7 +733,7 @@
     "tokenizer_file_path = hf_hub_download(\n",
     "    repo_id=f\"meta-llama/Llama-3.2-{LLAMA_SIZE_STR}-Instruct\",\n",
     "    filename=\"original/tokenizer.model\",\n",
-    "    local_dir=\"llama32-files\"\n",
+    "    local_dir=\"Llama-3.2-1B-Instruct\"\n",
     ")"
    ]
   },
@@ -860,7 +860,7 @@
     "    weights_file = hf_hub_download(\n",
     "        repo_id=f\"meta-llama/Llama-3.2-{LLAMA_SIZE_STR}-Instruct\",\n",
     "        filename=f\"model.safetensors\",\n",
-    "        local_dir=\"llama32-files\"\n",
+    "        local_dir=\"Llama-3.2-1B-Instruct\"\n",
     "    )\n",
     "    combined_weights = load_file(weights_file)\n",
     "\n",
@@ -871,7 +871,7 @@
     "        weights_file = hf_hub_download(\n",
     "            repo_id=f\"meta-llama/Llama-3.2-{LLAMA_SIZE_STR}-Instruct\",\n",
     "            filename=f\"model-0000{i}-of-00002.safetensors\",\n",
-    "            local_dir=\"llama32-files\"\n",
+    "            local_dir=\"Llama-3.2-1B-Instruct\"\n",
     "        )\n",
     "        current_weights = load_file(weights_file)\n",
     "        combined_weights.update(current_weights)\n",
@@ -1047,7 +1047,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "pt",
    "language": "python",
    "name": "python3"
   },
@@ -1061,7 +1061,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,

From 5ff72c2850345a854d9ca0c750a9e44f75c2b0c9 Mon Sep 17 00:00:00 2001
From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Date: Fri, 25 Oct 2024 01:23:53 +0200
Subject: [PATCH 6/7] fixed typos (#414)

* fixed typos

* fixed formatting

* Update ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb

* del weights after load into model

---------

Co-authored-by: Sebastian Raschka <mail@sebastianraschka.com>
---
 .../mha-implementations.ipynb                 | 20 ++++++++++---------
 .../converting-llama2-to-llama3.ipynb         | 12 ++++++-----
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
index a1d074be..76f7aaf4 100644
--- a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
+++ b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
@@ -83,8 +83,8 @@
    },
    "source": [
     "- To run all the code in this notebook, please ensure you update to at least PyTorch 2.5 (FlexAttention is not included in earlier PyTorch releases)\n",
-    "If the code cell above shows a PyTorch version lower than 2.5, you can upgrade your PyTorch installation by uncommenting and running the following code cell (Please note that PyTorch 2.5 requires Python 3.9 or later)\n",
-    "- For more specific instructions and CUDA versions, please refer to the official installation guide at https://pytorch.org."
+    "- If the code cell above shows a PyTorch version lower than 2.5, you can upgrade your PyTorch installation by uncommenting and running the following code cell (Please note that PyTorch 2.5 requires Python 3.9 or later)\n",
+    "- For more specific instructions and CUDA versions, please refer to the official installation guide at https://pytorch.org"
    ]
   },
   {
@@ -886,12 +886,14 @@
     "id": "d2164859-31a0-4537-b4fb-27d57675ba77"
    },
    "source": [
-    "- Set `need_weights` (default `True`) to need_weights=False so that `MultiheadAttention` uses `scaled_dot_product_attention` [according to the documentation](https://github.com/pytorch/pytorch/blob/71d020262793542974cf13b30f2a9099773f015c/torch/nn/modules/activation.py#L1096)\n",
+    "- Set `need_weights` (default `True`) to `False` so that `MultiheadAttention` uses `scaled_dot_product_attention` [according to the documentation](https://github.com/pytorch/pytorch/blob/71d020262793542974cf13b30f2a9099773f015c/torch/nn/modules/activation.py#L1096)\n",
     "\n",
-    ">  need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.\n",
-    "            Set ``need_weights=False`` to use the optimized ``scaled_dot_product_attention``\n",
-    "            and achieve the best performance for MHA.\n",
-    "            Default: ``True``."
+    "```markdown\n",
+    "need_weights: If specified, returns `attn_output_weights` in addition to `attn_outputs`.\n",
+    "           Set `need_weights=False` to use the optimized `scaled_dot_product_attention`\n",
+    "           and achieve the best performance for MHA.\n",
+    "           Default: `True`\n",
+    "```"
    ]
   },
   {
@@ -1965,7 +1967,7 @@
    "provenance": []
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "pt",
    "language": "python",
    "name": "python3"
   },
@@ -1979,7 +1981,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,
diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
index 1c0dc349..3fb007b8 100644
--- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
+++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
@@ -1843,7 +1843,7 @@
     "id": "VlH7qYVdDKQr"
    },
    "source": [
-    "- Note that the Llama 3 model should ideally used with the correct prompt template that was used during finetuning (as discussed in chapter 7)\n",
+    "- Note that the Llama 3 model should ideally be used with the correct prompt template that was used during finetuning (as discussed in chapter 7)\n",
     "- Below is a wrapper class around the tokenizer based on Meta AI's Llama 3-specific [ChatFormat code](https://github.com/meta-llama/llama3/blob/11817d47e1ba7a4959b025eb1ca308572e0e3963/llama/tokenizer.py#L202) that constructs the prompt template"
    ]
   },
@@ -2099,7 +2099,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "LLAMA32_CONFIG[\"context_length\"] = 8192"
+    "LLAMA31_CONFIG_8B[\"context_length\"] = 8192"
    ]
   },
   {
@@ -2319,7 +2319,8 @@
     "    combined_weights.update(current_weights)\n",
     "\n",
     "load_weights_into_llama(model, LLAMA31_CONFIG_8B, combined_weights)\n",
-    "model.to(device);"
+    "model.to(device);\n",
+    "del combined_weights  # free up memory"
    ]
   },
   {
@@ -2466,7 +2467,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "LLAMA32_CONFIG[\"context_length\"] = 8192"
+    "LLAMA32_CONFIG_1B[\"context_length\"] = 8192"
    ]
   },
   {
@@ -2594,7 +2595,8 @@
     "current_weights = load_file(weights_file)\n",
     "\n",
     "load_weights_into_llama(model, LLAMA32_CONFIG_1B, current_weights)\n",
-    "model.to(device);"
+    "model.to(device);\n",
+    "del current_weights  # free up memory"
    ]
   },
   {

From b34d34e4a514f4e09e3cf4ab0c9fd2dbe7d6afe2 Mon Sep 17 00:00:00 2001
From: hbaghramyan <henrikh.baghramyan@gmail.com>
Date: Fri, 25 Oct 2024 21:53:01 +0200
Subject: [PATCH 7/7] ch05: complete 5.2 (training run, sample generation, loss plot)

---
 ch05/01_main-chapter-code/ch05.py | 55 ++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/ch05/01_main-chapter-code/ch05.py b/ch05/01_main-chapter-code/ch05.py
index 414a7488..edcff2c4 100644
--- a/ch05/01_main-chapter-code/ch05.py
+++ b/ch05/01_main-chapter-code/ch05.py
@@ -2,6 +2,8 @@
 import tiktoken
 import os
 import sys
+import matplotlib.pyplot as plt
+from matplotlib.ticker import MaxNLocator
 
 sys.path.insert(0, os.getcwd())
 
@@ -203,7 +205,7 @@ def calc_loss_batch(input_batch, target_batch, model, device):
     target_batch = target_batch.to(device)
     logits = model(input_batch)
     loss = torch.nn.functional.cross_entropy(
-        input=logits.flatten(0, 1), target=targets.flatten()
+        input=logits.flatten(0, 1), target=target_batch.flatten()
     )
     return loss
 
@@ -283,3 +285,54 @@ def evaluate_model(model, train_loader, val_loader, device, eval_iter):
         val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
     model.train()
     return train_loss, val_loss
+
+
+def generate_and_print_sample(model, tokenizer, device, start_context):  # print a generated continuation of start_context
+    model.eval()  # evaluation mode while generating
+    context_size = model.pos_emb.weight.shape[0]  # first dim of the positional-embedding weight, used as the context size
+    encoded = text_to_token_ids(start_context, tokenizer).to(device)
+    with torch.no_grad():  # no gradient tracking during generation
+        token_ids = generate_text_simple(
+            model=model, idx=encoded, max_new_tokens=50, context_size=context_size
+        )
+    decoded_text = token_ids_to_text(token_ids, tokenizer)
+    print(decoded_text.replace("\n", " "))  # replace newlines so the sample prints on one line
+    model.train()  # switch back to training mode before returning
+
+
+torch.manual_seed(123)  # seed the RNG before the model is constructed
+model = GPTModel(GPT_CONFIG_124M)
+model.to(device)
+optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
+num_epochs = 10  # reused further down to span the epoch axis of the loss plot
+train_losses, val_losses, tokens_seen = train_model_simple(
+    model,
+    train_loader,
+    val_loader,
+    optimizer,
+    device,
+    num_epochs=num_epochs,
+    eval_freq=5,
+    eval_iter=5,
+    start_context="Every effort moves you",  # presumably the prompt for generate_and_print_sample; confirm in train_model_simple
+    tokenizer=tokenizer,
+)
+
+
+def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):  # plot both losses vs. epochs, with a second x-axis in tokens
+    fig, ax1 = plt.subplots(figsize=(5, 3))
+    ax1.plot(epochs_seen, train_losses, label="Training loss")
+    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
+    ax1.set_xlabel("Epochs")
+    ax1.set_ylabel("Loss")
+    ax1.legend(loc="upper right")
+    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer-only ticks on the epoch axis
+    ax2 = ax1.twiny()  # second x-axis that shares the same y-axis
+    ax2.plot(tokens_seen, train_losses, alpha=0)  # fully transparent; drawn only so ax2 gets a token-count scale
+    ax2.set_xlabel("Tokens seen")
+    fig.tight_layout()
+    plt.show()
+
+
+epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))  # one evenly spaced value in [0, num_epochs] per recorded loss
+plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)