From ef4018181e9c2e29d001af0ee8acaeb68de9c13c Mon Sep 17 00:00:00 2001 From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Date: Wed, 23 Oct 2024 03:23:31 +0200 Subject: [PATCH 1/7] updates for PyTorch 2.5 (#408) * updated Dockerfile * updated MHA implementations for PT 2.5 * fixed typo * update installation instruction * Update setup/03_optional-docker-environment/.devcontainer/Dockerfile --------- Co-authored-by: rasbt --- .../mha-implementations.ipynb | 76 +++++++------------ .../.devcontainer/Dockerfile | 7 +- 2 files changed, 32 insertions(+), 51 deletions(-) diff --git a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb index fb490da3..a1d074be 100644 --- a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb +++ b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb @@ -22,50 +22,6 @@ "" ] }, - { - "cell_type": "markdown", - "id": "1HABx0Hr3PDD", - "metadata": { - "id": "1HABx0Hr3PDD" - }, - "source": [ - "Uncomment and execute the following code cell to install the dependencies:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "qPnVNAOxwy5s", - "metadata": { - "id": "qPnVNAOxwy5s" - }, - "outputs": [], - "source": [ - "# pip install -r https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/requirements.txt" - ] - }, - { - "cell_type": "markdown", - "id": "LYLcq3403Yq6", - "metadata": { - "id": "LYLcq3403Yq6" - }, - "source": [ - "Uncomment and execute the following code cell to install the PyTorch nightly dependency if you want to run the FlexAttention benchmarks (this is required because FlexAttention is not yet included in the latest PyTorch release):" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "gAgYvxm_xVct", - "metadata": { - "id": "gAgYvxm_xVct" - }, - "outputs": [], - "source": [ - "# pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 -U" - ] - }, { "cell_type": "markdown", "id": "6f678e62-7bcb-4405-86ae-dce94f494303", @@ -119,6 +75,28 @@ "embeddings = torch.randn((batch_size, context_len, embed_dim), device=device)" ] }, + { + "cell_type": "markdown", + "id": "LYLcq3403Yq6", + "metadata": { + "id": "LYLcq3403Yq6" + }, + "source": [ + "- To run all the code in this notebook, please ensure you update to at least PyTorch 2.5 (FlexAttention is not included in earlier PyTorch releases)\n", + "If the code cell above shows a PyTorch version lower than 2.5, you can upgrade your PyTorch installation by uncommenting and running the following code cell (Please note that PyTorch 2.5 requires Python 3.9 or later)\n", + "- For more specific instructions and CUDA versions, please refer to the official installation guide at https://pytorch.org." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db27f43-86f4-478f-89df-fbc2182a129b", + "metadata": {}, + "outputs": [], + "source": [ + "# pip install --upgrade torch torchvision torchaudio" + ] + }, { "cell_type": "markdown", "id": "2f9bb1b6-a1e5-4e0a-884d-0f31b374a8d6", @@ -964,16 +942,16 @@ "## 9) Using PyTorch's FlexAttention\n", "\n", "- See [FlexAttention: The Flexibility of PyTorch with the Performance of FlashAttention](https://pytorch.org/blog/flexattention/) to learn more about FlexAttention\n", - "- This is currently only supported in PyTorch 2.5 (nightly), which you can install on a CPU machine via\n", + "- This is supported starting from PyTorch 2.5, which you can install on a CPU machine via\n", "\n", " ```bash\n", - " pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu -U\n", + " pip install torch torchvision torchaudio\n", " ```\n", "\n", - "- To install PyTorch nighly on a GPU machine, use the following (for more information, also see the installation menu on [pytorch.org](https://pytorch.org/))\n", + "- To install PyTorch on a GPU machine, use the following (for more information, also see the installation menu on [pytorch.org](https://pytorch.org/))\n", "\n", " ```bash\n", - " pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 -U\n", + " pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124\n", " ```" ] }, @@ -2001,7 +1979,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/setup/03_optional-docker-environment/.devcontainer/Dockerfile b/setup/03_optional-docker-environment/.devcontainer/Dockerfile index 7e0f697d..79194330 100644 --- a/setup/03_optional-docker-environment/.devcontainer/Dockerfile +++ b/setup/03_optional-docker-environment/.devcontainer/Dockerfile @@ -1,5 +1,7 @@ -FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime +# Install PyTorch 2.5 with CUDA 12.4 +FROM pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime +# Install Ubuntu packages RUN apt-get update && \ apt-get upgrade -y && \ apt-get install -y rsync && \ @@ -7,6 +9,7 @@ RUN apt-get update && \ apt-get install -y curl && \ rm -rf /var/lib/apt/lists/* +# Install Python packages COPY requirements.txt requirements.txt - +RUN pip install --upgrade pip RUN pip install --no-cache-dir -r requirements.txt From 4f9c9fb703c66ccda9535bb0acd63f20388604ec Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Wed, 23 Oct 2024 07:48:33 -0500 Subject: [PATCH 2/7] Update tests.py --- ch05/07_gpt_to_llama/tests/tests.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/ch05/07_gpt_to_llama/tests/tests.py b/ch05/07_gpt_to_llama/tests/tests.py index e753ae74..6620b4ea 100644 --- a/ch05/07_gpt_to_llama/tests/tests.py +++ b/ch05/07_gpt_to_llama/tests/tests.py @@ -1,3 +1,10 @@ +# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). +# Source for "Build a Large Language Model From Scratch" +# - https://www.manning.com/books/build-a-large-language-model-from-scratch +# Code: https://github.com/rasbt/LLMs-from-scratch + +# File for internal use (unit tests) + import io import os import sys @@ -8,14 +15,6 @@ from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb -# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). -# Source for "Build a Large Language Model From Scratch" -# - https://www.manning.com/books/build-a-large-language-model-from-scratch -# Code: https://github.com/rasbt/LLMs-from-scratch - -# File for internal use (unit tests) - - @pytest.fixture(scope="module") def notebook(): def import_definitions_from_notebook(notebooks): From 7cd6a670ed16f657d81f03a78e70684cf0b4f7f7 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Wed, 23 Oct 2024 18:07:49 -0500 Subject: [PATCH 3/7] RoPE updates (#412) * RoPE updates * Apply suggestions from code review * updates * updates * updates --- .../converting-gpt-to-llama2.ipynb | 8 +- .../converting-llama2-to-llama3.ipynb | 8 +- ch05/07_gpt_to_llama/standalone-llama32.ipynb | 4 +- ch05/07_gpt_to_llama/tests/Untitled.ipynb | 74 +++++++++++ .../tests/test-requirements-extra.txt | 3 +- ch05/07_gpt_to_llama/tests/tests.py | 118 +++++++++++++++++- 6 files changed, 202 insertions(+), 13 deletions(-) create mode 100644 ch05/07_gpt_to_llama/tests/Untitled.ipynb diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb index e8c5bf68..e7f459ea 100644 --- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb +++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb @@ -426,7 +426,7 @@ " assert head_dim % 2 == 0, \"Embedding dimension must be even\"\n", "\n", " # Compute the inverse frequencies\n", - " inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim // 2) / (head_dim // 2)))\n", + " inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))\n", "\n", " # Generate position indices\n", " positions = torch.arange(context_length)\n", @@ -493,8 +493,8 @@ "\n", "# Dummy query and key tensors\n", "torch.manual_seed(123)\n", - "queries = torch.randn(batch_size, context_len, num_heads, head_dim)\n", - "keys = torch.randn(batch_size, context_len, num_heads, head_dim)\n", + "queries = torch.randn(batch_size, num_heads, context_len, head_dim)\n", + "keys = torch.randn(batch_size, num_heads, context_len, head_dim)\n", "\n", "# Apply rotary position embeddings\n", "queries_rot = compute_rope(queries, cos, sin)\n", @@ -1691,7 +1691,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.6" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb index 4b4459fc..bf62d9fc 100644 --- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb +++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb @@ -278,7 +278,7 @@ " assert head_dim % 2 == 0, \"Embedding dimension must be even\"\n", "\n", " # Compute the inverse frequencies\n", - " inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim // 2) / (head_dim // 2)))\n", + " inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))\n", "\n", " ################################ NEW ###############################################\n", " # Frequency adjustments\n", @@ -383,8 +383,8 @@ "\n", "# Dummy query and key tensors\n", "torch.manual_seed(123)\n", - "queries = torch.randn(batch_size, llama_3_context_len, num_heads, head_dim)\n", - "keys = torch.randn(batch_size, llama_3_context_len, num_heads, head_dim)\n", + "queries = torch.randn(batch_size, num_heads, llama_3_context_len, head_dim)\n", + "keys = torch.randn(batch_size, num_heads, llama_3_context_len, head_dim)\n", "\n", "# Apply rotary position embeddings\n", "queries_rot = compute_rope(queries, cos, sin)\n", @@ -2701,7 +2701,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.6" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb index 4201f959..b3d80c9e 100644 --- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb +++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb @@ -133,7 +133,7 @@ " assert head_dim % 2 == 0, \"Embedding dimension must be even\"\n", "\n", " # Compute the inverse frequencies\n", - " inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim // 2) / (head_dim // 2)))\n", + " inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))\n", "\n", " # Frequency adjustments\n", " if freq_config is not None:\n", @@ -1061,7 +1061,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/ch05/07_gpt_to_llama/tests/Untitled.ipynb b/ch05/07_gpt_to_llama/tests/Untitled.ipynb new file mode 100644 index 00000000..1375a9e9 --- /dev/null +++ b/ch05/07_gpt_to_llama/tests/Untitled.ipynb @@ -0,0 +1,74 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "40d2405d-ee10-44ad-b20e-cf32078f926a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True | head dim: 1, tensor([]), tensor([])\n", + "True | head dim: 2, tensor([1.]), tensor([1.])\n", + "True | head dim: 3, tensor([1.]), tensor([1.])\n", + "True | head dim: 4, tensor([1.0000, 0.0100]), tensor([1.0000, 0.0100])\n", + "False | head dim: 5, tensor([1.0000, 0.0100]), tensor([1.0000, 0.0251])\n", + "True | head dim: 6, tensor([1.0000, 0.0464, 0.0022]), tensor([1.0000, 0.0464, 0.0022])\n", + "False | head dim: 7, tensor([1.0000, 0.0464, 0.0022]), tensor([1.0000, 0.0720, 0.0052])\n", + "True | head dim: 8, tensor([1.0000, 0.1000, 0.0100, 0.0010]), tensor([1.0000, 0.1000, 0.0100, 0.0010])\n", + "False | head dim: 9, tensor([1.0000, 0.1000, 0.0100, 0.0010]), tensor([1.0000, 0.1292, 0.0167, 0.0022])\n", + "True | head dim: 10, tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04]), tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04])\n", + "False | head dim: 11, tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04]), tensor([1.0000, 0.1874, 0.0351, 0.0066, 0.0012])\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "theta_base = 10_000\n", + "\n", + "for head_dim in range(1, 12):\n", + "\n", + " before = 1.0 / (theta_base ** (torch.arange(0, head_dim // 2) / (head_dim // 2)))\n", + " after = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))\n", + " \n", + " s = f\"{torch.equal(before, after)} | head dim: {head_dim}, {before}, {after}\"\n", + " print(s)\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0abfbf38-93a4-4994-8e7e-a543477268a8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt b/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt index 8828ccea..2b9fd336 100644 --- a/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt +++ b/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt @@ -1 +1,2 @@ -transformers>=4.44.2 \ No newline at end of file +transformers>=4.44.2 +litgpt>=0.5.0 \ No newline at end of file diff --git a/ch05/07_gpt_to_llama/tests/tests.py b/ch05/07_gpt_to_llama/tests/tests.py index 6620b4ea..395f9ec3 100644 --- a/ch05/07_gpt_to_llama/tests/tests.py +++ b/ch05/07_gpt_to_llama/tests/tests.py @@ -10,11 +10,82 @@ import sys import types import nbformat +from typing import Optional, Tuple import torch import pytest from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb +# LitGPT code from https://github.com/Lightning-AI/litgpt/blob/main/litgpt/model.py +# LitGPT is licensed under Apache v2: https://github.com/Lightning-AI/litgpt/blob/main/LICENSE +def litgpt_build_rope_cache( + seq_len: int, + n_elem: int, + device: Optional[torch.device] = None, + base: int = 10000, + condense_ratio: int = 1, + extra_config: Optional[dict] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Enhanced Transformer with Rotary Position Embedding. + + Args: + seq_len (int): Sequence length. + n_elem (int): Number of elements (head dimension). + device (torch.device, optional): Device for tensor allocations. + base (int, optional): Base for computing inverse frequencies. + condense_ratio (int, optional): Ratio to condense the position indices. + extra_config (dict, optional): Configuration parameters for frequency adjustments (used by Llama 3.1 and 3.2) + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Cosine and sine caches for RoPE. + """ + + # Compute the inverse frequencies theta + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)) + + if extra_config is not None: + orig_context_len = extra_config["original_max_seq_len"] + factor = extra_config["factor"] + low_freq_factor = extra_config["low_freq_factor"] + high_freq_factor = extra_config["high_freq_factor"] + + wavelen = 2 * torch.pi / theta + ratio = orig_context_len / wavelen + smooth_factor = (ratio - low_freq_factor) / (high_freq_factor - low_freq_factor) + smooth_factor = torch.clamp(smooth_factor, min=0.0, max=1.0) + + # Compute adjusted_theta without masked indexing + adjusted_theta = (1 - smooth_factor) * (theta / factor) + smooth_factor * theta + theta = adjusted_theta + + # Create position indices `[0, 1, ..., seq_len - 1]` + seq_idx = torch.arange(seq_len, device=device) / condense_ratio + + # Calculate the product of position index and $\theta_i$ + idx_theta = torch.outer(seq_idx, theta).repeat(1, 2) + + return torch.cos(idx_theta), torch.sin(idx_theta) + + +# LitGPT code from https://github.com/Lightning-AI/litgpt/blob/main/litgpt/model.py +# LitGPT is licensed under Apache v2: https://github.com/Lightning-AI/litgpt/blob/main/LICENSE +def litgpt_apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + head_size = x.size(-1) + x1 = x[..., : head_size // 2] # (B, nh, T, hs/2) + x2 = x[..., head_size // 2:] # (B, nh, T, hs/2) + rotated = torch.cat((-x2, x1), dim=-1) # (B, nh, T, hs) + if cos.dim() > 1: + # batch dimensions must align + # sin/cos are (B, T, hs) so we unsqeeze -3 for nh + # we count from back because all of apply_rope does + cos = cos.unsqueeze(-3) + sin = sin.unsqueeze(-3) + + roped = (x * cos) + (rotated * sin) + return roped.to(dtype=x.dtype) + + @pytest.fixture(scope="module") def notebook(): def import_definitions_from_notebook(notebooks): @@ -84,21 +155,30 @@ def test_rope_llama2(notebook): queries_rot = this_nb.compute_rope(queries, cos, sin) keys_rot = this_nb.compute_rope(keys, cos, sin) + # Generate reference RoPE via HF rot_emb = LlamaRotaryEmbedding( dim=head_dim, max_position_embeddings=context_len, base=10_000 ) - position_ids = torch.arange(context_len, dtype=torch.long).unsqueeze(0) ref_cos, ref_sin = rot_emb(queries, position_ids) ref_queries_rot, ref_keys_rot = apply_rotary_pos_emb(queries, keys, ref_cos, ref_sin) - torch.testing.assert_close(sin, ref_sin.squeeze(0)) torch.testing.assert_close(cos, ref_cos.squeeze(0)) torch.testing.assert_close(keys_rot, ref_keys_rot) torch.testing.assert_close(queries_rot, ref_queries_rot) + # Generate reference RoPE via LitGPT + litgpt_cos, litgpt_sin = litgpt_build_rope_cache(context_len, n_elem=head_dim, base=10_000) + litgpt_queries_rot = litgpt_apply_rope(queries, litgpt_cos, litgpt_sin) + litgpt_keys_rot = litgpt_apply_rope(keys, litgpt_cos, litgpt_sin) + + torch.testing.assert_close(sin, litgpt_sin) + torch.testing.assert_close(cos, litgpt_cos) + torch.testing.assert_close(keys_rot, litgpt_keys_rot) + torch.testing.assert_close(queries_rot, litgpt_queries_rot) + def test_rope_llama3(notebook): @@ -128,6 +208,7 @@ def test_rope_llama3(notebook): queries_rot = nb1.compute_rope(queries, cos, sin) keys_rot = nb1.compute_rope(keys, cos, sin) + # Generate reference RoPE via HF rot_emb = LlamaRotaryEmbedding( dim=head_dim, max_position_embeddings=context_len, @@ -143,6 +224,16 @@ def test_rope_llama3(notebook): torch.testing.assert_close(keys_rot, ref_keys_rot) torch.testing.assert_close(queries_rot, ref_queries_rot) + # Generate reference RoPE via LitGPT + litgpt_cos, litgpt_sin = litgpt_build_rope_cache(context_len, n_elem=head_dim, base=theta_base) + litgpt_queries_rot = litgpt_apply_rope(queries, litgpt_cos, litgpt_sin) + litgpt_keys_rot = litgpt_apply_rope(keys, litgpt_cos, litgpt_sin) + + torch.testing.assert_close(sin, litgpt_sin) + torch.testing.assert_close(cos, litgpt_cos) + torch.testing.assert_close(keys_rot, litgpt_keys_rot) + torch.testing.assert_close(queries_rot, litgpt_queries_rot) + def test_rope_llama3_12(notebook): @@ -180,6 +271,7 @@ def test_rope_llama3_12(notebook): queries_rot = nb1.compute_rope(queries, cos, sin) keys_rot = nb1.compute_rope(keys, cos, sin) + # Generate reference RoPE via HF hf_rope_params = { "factor": 8.0, "low_freq_factor": 1.0, @@ -210,6 +302,28 @@ class RoPEConfig: torch.testing.assert_close(keys_rot, ref_keys_rot) torch.testing.assert_close(queries_rot, ref_queries_rot) + # Generate reference RoPE via LitGPT + litgpt_rope_config = { + "factor": 8.0, + "low_freq_factor": 1.0, + "high_freq_factor": 4.0, + "original_max_seq_len": 8192 + } + + litgpt_cos, litgpt_sin = litgpt_build_rope_cache( + context_len, + n_elem=head_dim, + base=rope_theta, + extra_config=litgpt_rope_config + ) + litgpt_queries_rot = litgpt_apply_rope(queries, litgpt_cos, litgpt_sin) + litgpt_keys_rot = litgpt_apply_rope(keys, litgpt_cos, litgpt_sin) + + torch.testing.assert_close(sin, litgpt_sin) + torch.testing.assert_close(cos, litgpt_cos) + torch.testing.assert_close(keys_rot, litgpt_keys_rot) + torch.testing.assert_close(queries_rot, litgpt_queries_rot) + def test_silu(notebook): example_batch = torch.randn(2, 3, 4) From e1dfd2cb7a43e987233dcc22049d4d2947824dc4 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Wed, 23 Oct 2024 19:19:58 -0500 Subject: [PATCH 4/7] Update test-requirements-extra.txt --- ch05/07_gpt_to_llama/tests/test-requirements-extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt b/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt index 2b9fd336..4f423290 100644 --- a/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt +++ b/ch05/07_gpt_to_llama/tests/test-requirements-extra.txt @@ -1,2 +1,2 @@ +pytest>=8.1.1 transformers>=4.44.2 -litgpt>=0.5.0 \ No newline at end of file From d38083c40199fe7a8e3d9a5b4b20dc209371bd3d Mon Sep 17 00:00:00 2001 From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Date: Thu, 24 Oct 2024 14:40:08 +0200 Subject: [PATCH 5/7] Updated Llama 2 to 3 paths (#413) * llama 2 and 3 path fixes * updated llama 3, 3.1 and 3.2 paths * updated .gitignore * Typo fix --------- Co-authored-by: Sebastian Raschka --- .gitignore | 11 ++++++---- .../converting-gpt-to-llama2.ipynb | 2 +- .../converting-llama2-to-llama3.ipynb | 20 +++++++++---------- ch05/07_gpt_to_llama/standalone-llama32.ipynb | 10 +++++----- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index f91cc5ce..4385a0ba 100644 --- a/.gitignore +++ b/.gitignore @@ -35,12 +35,15 @@ ch05/01_main-chapter-code/model.pth ch05/01_main-chapter-code/model_and_optimizer.pth ch05/03_bonus_pretraining_on_gutenberg/model_checkpoints ch05/06_user_interface/gpt2 +ch05/07_gpt_to_llama/.cache ch05/07_gpt_to_llama/Llama-2-7b ch05/07_gpt_to_llama/Llama-2-7b-chat -ch05/07_gpt_to_llama/.cache -ch05/07_gpt_to_llama/llama3-files -ch05/07_gpt_to_llama/llama31-files -ch05/07_gpt_to_llama/llama32-files +ch05/07_gpt_to_llama/Llama-3-8B +ch05/07_gpt_to_llama/Llama-3-8B-Instruct +ch05/07_gpt_to_llama/Llama-3.1-8B +ch05/07_gpt_to_llama/Llama-3.1-8B-Instruct +ch05/07_gpt_to_llama/Llama-3.2-1B +ch05/07_gpt_to_llama/Llama-3.2-1B-Instruct ch06/01_main-chapter-code/gpt2 ch06/02_bonus_additional-experiments/gpt2 diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb index e7f459ea..1ff5a42b 100644 --- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb +++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb @@ -1189,7 +1189,7 @@ "tokenizer_file = hf_hub_download(\n", " repo_id=\"meta-llama/Llama-2-7b\",\n", " filename=\"tokenizer.model\",\n", - " local_dir=\"Llama-2-7B\"\n", + " local_dir=\"Llama-2-7b\"\n", ")" ] }, diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb index bf62d9fc..1c0dc349 100644 --- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb +++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb @@ -1252,7 +1252,7 @@ "tokenizer_file_path = hf_hub_download(\n", " repo_id=\"meta-llama/Meta-Llama-3-8B\",\n", " filename=\"original/tokenizer.model\",\n", - " local_dir=\"llama3-files\"\n", + " local_dir=\"Llama-3-8B\"\n", ")" ] }, @@ -1458,7 +1458,7 @@ " weights_file = hf_hub_download(\n", " repo_id=\"meta-llama/Meta-Llama-3-8B\",\n", " filename=f\"model-0000{i}-of-00004.safetensors\",\n", - " local_dir=\"llama3-files\"\n", + " local_dir=\"Llama-3-8B\"\n", " )\n", " current_weights = load_file(weights_file)\n", " combined_weights.update(current_weights)" @@ -1677,7 +1677,7 @@ "id": "akyo7WNyF_YL" }, "source": [ - "- Above, we used the pretrained base model; if you want to use a model capable of following instructions, use the `\"meta-llama/Llama-3-8b-Instruct\"` model instead, as shown below" + "- Above, we used the pretrained base model; if you want to use a model capable of following instructions, use the `\"meta-llama/Llama-3-8B-Instruct\"` model instead, as shown below" ] }, { @@ -1824,7 +1824,7 @@ " weights_file = hf_hub_download(\n", " repo_id=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n", " filename=f\"model-0000{i}-of-00004.safetensors\",\n", - " local_dir=\"llama3-files\"\n", + " local_dir=\"Llama-3-8B-Instruct\"\n", " )\n", " current_weights = load_file(weights_file)\n", " combined_weights.update(current_weights)\n", @@ -2157,7 +2157,7 @@ "tokenizer_file_path = hf_hub_download(\n", " repo_id=\"meta-llama/Llama-3.1-8B\",\n", " filename=\"original/tokenizer.model\",\n", - " local_dir=\"llama31-files\"\n", + " local_dir=\"Llama-3.1-8B\"\n", ")\n", "\n", "tokenizer = Tokenizer(tokenizer_file_path)" @@ -2313,7 +2313,7 @@ " weights_file = hf_hub_download(\n", " repo_id=\"meta-llama/Llama-3.1-8B\",\n", " filename=f\"model-0000{i}-of-00004.safetensors\",\n", - " local_dir=\"llama31-files\"\n", + " local_dir=\"Llama-3.1-8B\"\n", " )\n", " current_weights = load_file(weights_file)\n", " combined_weights.update(current_weights)\n", @@ -2512,7 +2512,7 @@ "tokenizer_file_path = hf_hub_download(\n", " repo_id=\"meta-llama/Llama-3.2-1B\",\n", " filename=\"original/tokenizer.model\",\n", - " local_dir=\"llama32-files\"\n", + " local_dir=\"Llama-3.2-1B\"\n", ")\n", "\n", "tokenizer = Tokenizer(tokenizer_file_path)" @@ -2589,7 +2589,7 @@ "weights_file = hf_hub_download(\n", " repo_id=\"meta-llama/Llama-3.2-1B\",\n", " filename=f\"model.safetensors\",\n", - " local_dir=\"llama32-files\"\n", + " local_dir=\"Llama-3.2-1B\"\n", ")\n", "current_weights = load_file(weights_file)\n", "\n", @@ -2687,7 +2687,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pt", "language": "python", "name": "python3" }, @@ -2701,7 +2701,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.11.9" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb index b3d80c9e..dd8fdf5f 100644 --- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb +++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb @@ -733,7 +733,7 @@ "tokenizer_file_path = hf_hub_download(\n", " repo_id=f\"meta-llama/Llama-3.2-{LLAMA_SIZE_STR}-Instruct\",\n", " filename=\"original/tokenizer.model\",\n", - " local_dir=\"llama32-files\"\n", + " local_dir=\"Llama-3.2-1B-Instruct\"\n", ")" ] }, @@ -860,7 +860,7 @@ " weights_file = hf_hub_download(\n", " repo_id=f\"meta-llama/Llama-3.2-{LLAMA_SIZE_STR}-Instruct\",\n", " filename=f\"model.safetensors\",\n", - " local_dir=\"llama32-files\"\n", + " local_dir=\"Llama-3.2-1B-Instruct\"\n", " )\n", " combined_weights = load_file(weights_file)\n", "\n", @@ -871,7 +871,7 @@ " weights_file = hf_hub_download(\n", " repo_id=f\"meta-llama/Llama-3.2-{LLAMA_SIZE_STR}-Instruct\",\n", " filename=f\"model-0000{i}-of-00002.safetensors\",\n", - " local_dir=\"llama32-files\"\n", + " local_dir=\"Llama-3.2-1B-Instruct\"\n", " )\n", " current_weights = load_file(weights_file)\n", " combined_weights.update(current_weights)\n", @@ -1047,7 +1047,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pt", "language": "python", "name": "python3" }, @@ -1061,7 +1061,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.11.9" } }, "nbformat": 4, From 5ff72c2850345a854d9ca0c750a9e44f75c2b0c9 Mon Sep 17 00:00:00 2001 From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Date: Fri, 25 Oct 2024 01:23:53 +0200 Subject: [PATCH 6/7] fixed typos (#414) * fixed typos * fixed formatting * Update ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb * del weights after load into model --------- Co-authored-by: Sebastian Raschka --- .../mha-implementations.ipynb | 20 ++++++++++--------- .../converting-llama2-to-llama3.ipynb | 12 ++++++----- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb index a1d074be..76f7aaf4 100644 --- a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb +++ b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb @@ -83,8 +83,8 @@ }, "source": [ "- To run all the code in this notebook, please ensure you update to at least PyTorch 2.5 (FlexAttention is not included in earlier PyTorch releases)\n", - "If the code cell above shows a PyTorch version lower than 2.5, you can upgrade your PyTorch installation by uncommenting and running the following code cell (Please note that PyTorch 2.5 requires Python 3.9 or later)\n", - "- For more specific instructions and CUDA versions, please refer to the official installation guide at https://pytorch.org." + "- If the code cell above shows a PyTorch version lower than 2.5, you can upgrade your PyTorch installation by uncommenting and running the following code cell (Please note that PyTorch 2.5 requires Python 3.9 or later)\n", + "- For more specific instructions and CUDA versions, please refer to the official installation guide at https://pytorch.org" ] }, { @@ -886,12 +886,14 @@ "id": "d2164859-31a0-4537-b4fb-27d57675ba77" }, "source": [ - "- Set `need_weights` (default `True`) to need_weights=False so that `MultiheadAttention` uses `scaled_dot_product_attention` [according to the documentation](https://github.com/pytorch/pytorch/blob/71d020262793542974cf13b30f2a9099773f015c/torch/nn/modules/activation.py#L1096)\n", + "- Set `need_weights` (default `True`) to `False` so that `MultiheadAttention` uses `scaled_dot_product_attention` [according to the documentation](https://github.com/pytorch/pytorch/blob/71d020262793542974cf13b30f2a9099773f015c/torch/nn/modules/activation.py#L1096)\n", "\n", - "> need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.\n", - " Set ``need_weights=False`` to use the optimized ``scaled_dot_product_attention``\n", - " and achieve the best performance for MHA.\n", - " Default: ``True``." + "```markdown\n", + "need_weights: If specified, returns `attn_output_weights` in addition to `attn_outputs`.\n", + " Set `need_weights=False` to use the optimized `scaled_dot_product_attention`\n", + " and achieve the best performance for MHA.\n", + " Default: `True`\n", + "```" ] }, { @@ -1965,7 +1967,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pt", "language": "python", "name": "python3" }, @@ -1979,7 +1981,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb index 1c0dc349..3fb007b8 100644 --- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb +++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb @@ -1843,7 +1843,7 @@ "id": "VlH7qYVdDKQr" }, "source": [ - "- Note that the Llama 3 model should ideally used with the correct prompt template that was used during finetuning (as discussed in chapter 7)\n", + "- Note that the Llama 3 model should ideally be used with the correct prompt template that was used during finetuning (as discussed in chapter 7)\n", "- Below is a wrapper class around the tokenizer based on Meta AI's Llama 3-specific [ChatFormat code](https://github.com/meta-llama/llama3/blob/11817d47e1ba7a4959b025eb1ca308572e0e3963/llama/tokenizer.py#L202) that constructs the prompt template" ] }, @@ -2099,7 +2099,7 @@ "metadata": {}, "outputs": [], "source": [ - "LLAMA32_CONFIG[\"context_length\"] = 8192" + "LLAMA31_CONFIG_8B[\"context_length\"] = 8192" ] }, { @@ -2319,7 +2319,8 @@ " combined_weights.update(current_weights)\n", "\n", "load_weights_into_llama(model, LLAMA31_CONFIG_8B, combined_weights)\n", - "model.to(device);" + "model.to(device);\n", + "del combined_weights # free up memory" ] }, { @@ -2466,7 +2467,7 @@ "metadata": {}, "outputs": [], "source": [ - "LLAMA32_CONFIG[\"context_length\"] = 8192" + "LLAMA32_CONFIG_1B[\"context_length\"] = 8192" ] }, { @@ -2594,7 +2595,8 @@ "current_weights = load_file(weights_file)\n", "\n", "load_weights_into_llama(model, LLAMA32_CONFIG_1B, current_weights)\n", - "model.to(device);" + "model.to(device);\n", + "del current_weights # free up memory" ] }, { From b34d34e4a514f4e09e3cf4ab0c9fd2dbe7d6afe2 Mon Sep 17 00:00:00 2001 From: hbaghramyan Date: Fri, 25 Oct 2024 21:53:01 +0200 Subject: [PATCH 7/7] done 5.2 --- ch05/01_main-chapter-code/ch05.py | 55 ++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/ch05/01_main-chapter-code/ch05.py b/ch05/01_main-chapter-code/ch05.py index 414a7488..edcff2c4 100644 --- a/ch05/01_main-chapter-code/ch05.py +++ b/ch05/01_main-chapter-code/ch05.py @@ -2,6 +2,8 @@ import tiktoken import os import sys +import matplotlib.pyplot as plt +from matplotlib.ticker import MaxNLocator sys.path.insert(0, os.getcwd()) @@ -203,7 +205,7 @@ def calc_loss_batch(input_batch, target_batch, model, device): target_batch = target_batch.to(device) logits = model(input_batch) loss = torch.nn.functional.cross_entropy( - input=logits.flatten(0, 1), target=targets.flatten() + input=logits.flatten(0, 1), target=target_batch.flatten() ) return loss @@ -283,3 +285,54 @@ def evaluate_model(model, train_loader, val_loader, device, eval_iter): val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter) model.train() return train_loss, val_loss + + +def generate_and_print_sample(model, tokenizer, device, start_context): + model.eval() + context_size = model.pos_emb.weight.shape[0] + encoded = text_to_token_ids(start_context, tokenizer).to(device) + with torch.no_grad(): + token_ids = generate_text_simple( + model=model, idx=encoded, max_new_tokens=50, context_size=context_size + ) + decoded_text = token_ids_to_text(token_ids, tokenizer) + print(decoded_text.replace("\n", " ")) + model.train() + + +torch.manual_seed(123) +model = GPTModel(GPT_CONFIG_124M) +model.to(device) +optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1) +num_epochs = 10 +train_losses, val_losses, tokens_seen = train_model_simple( + model, + train_loader, + val_loader, + optimizer, + device, + num_epochs=num_epochs, + eval_freq=5, + eval_iter=5, + start_context="Every effort moves you", + tokenizer=tokenizer, +) + + +def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses): + fig, ax1 = plt.subplots(figsize=(5, 3)) + ax1.plot(epochs_seen, train_losses, label="Training loss") + ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss") + ax1.set_xlabel("Epochs") + ax1.set_ylabel("Loss") + ax1.legend(loc="upper right") + ax1.xaxis.set_major_locator(MaxNLocator(integer=True)) + ax2 = ax1.twiny() + ax2.plot(tokens_seen, train_losses, alpha=0) + ax2.set_xlabel("Tokens seen") + fig.tight_layout() + plt.show() + + +epochs_tensor = torch.linspace(0, num_epochs, len(train_losses)) +plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)