diff --git a/config.yml b/config.yml index 8054715..a93f09c 100644 --- a/config.yml +++ b/config.yml @@ -21,4 +21,7 @@ menu: - name: "About" url: "/about" - weight: 3 \ No newline at end of file + weight: 3 + +disableKinds: + - "404" \ No newline at end of file diff --git a/public/404.html b/public/404.html index d06def7..b77b272 100644 --- a/public/404.html +++ b/public/404.html @@ -1,209 +1,211 @@ - - - - -404 Page not found | Jonah's ML Notes - - - - - - - - - - - - - - - - - - - - - - + + + + + - - - - + + + + + - - - + + + - - -
- -
-
-
404
+ + + +
+
404
- - - - - - - - - + + - - + - + \ No newline at end of file diff --git a/public/about/index.html b/public/about/index.html new file mode 100644 index 0000000..4e996b4 --- /dev/null +++ b/public/about/index.html @@ -0,0 +1,115 @@ + + + + - Jonah's ML Notes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+

+
Posted on Jan 1, 0001
+
+ + +
+

This is the about page.

+ +
+ + +
+
+ + + +
+ + diff --git a/public/categories/index.html b/public/categories/index.html index c515807..1a865b1 100644 --- a/public/categories/index.html +++ b/public/categories/index.html @@ -11,19 +11,57 @@ - + + + + + + + + + + + + + + + + @@ -34,6 +72,12 @@ diff --git a/public/css/dark.c95c5dcf5f32f8b67bd36f7dab66680e068fce2b303087294114aabf7a7c080b.css b/public/css/dark.c95c5dcf5f32f8b67bd36f7dab66680e068fce2b303087294114aabf7a7c080b.css new file mode 100644 index 0000000..f93adf1 --- /dev/null +++ b/public/css/dark.c95c5dcf5f32f8b67bd36f7dab66680e068fce2b303087294114aabf7a7c080b.css @@ -0,0 +1,159 @@ +body { + color: white; + background-color: #202124; +} + +::-moz-selection { + background: blue; + color: #fff; + text-shadow: none; +} + +::selection { + background: red; + color: #fff; + text-shadow: none; +} + +hr { + border-top: 3px dotted blue; +} +code { + background-color: lightblue; + color: black; + text-decoration: bold; + padding: 0.1em 0.2em; +} +pre { + background-color: #272822; + line-height: 1.4; + overflow-x: auto; + padding: 1em; +} +blockquote { + border-color: blue; +} + +h1, +h2, +h3, +h4, +h5, +h6 { + color: #ddd; +} +h1::before { + color: var(--darkMaincolor); + content: "# "; +} +h2::before { + color: var(--darkMaincolor); + content: "## "; +} +h3::before { + color: var(--darkMaincolor); + content: "### "; +} +h4::before { + color: var(--darkMaincolor); + content: "#### "; +} +h5::before { + color: var(--darkMaincolor); + content: "##### "; +} +h6::before { + color: var(--darkMaincolor); + content: "###### "; +} + +a { + border-bottom: 3px solid var(--darkMaincolor); + color: inherit; +} +a:hover { + background-color: var(--darkMaincolor); + color: black; +} + +.site-description a { + color: #ddd; +} +.site-description a:hover { + color: black; +} + +.tags a { + border-bottom: 3px solid var(--darkMaincolor); +} +.tags a:hover { + background-color: var(--darkMaincolor); + color: black; +} + +.site-title a { + color: white; + text-decoration: none !important; +} + +.header nav, +.footer { + border-color: #333; +} + +.highlight { + background-color: #333; +} +.soc:hover { + color: black; +} +.draft-label { + color: var(--darkMaincolor); + background-color: blue; +} +.highlight pre code[class=language-javaScript]::before, +.highlight pre code[class="language-js"]::before { + content: "js"; + background: #f7df1e; + color: black; +} +.highlight pre code[class*='language-yml']::before, +.highlight pre code[class*='language-yaml']::before { + content: 'yaml'; + background: #f71e6a; + color: white; +} +.highlight pre code[class*='language-shell']::before, +.highlight pre code[class*='language-bash']::before, +.highlight pre code[class*='language-sh']::before { + content: 'shell'; + background: green; + color:white +} +.highlight pre code[class*='language-json']::before{ + content: 'json'; + background: dodgerblue; + color: #000000 +} +.highlight pre code[class*='language-python']::before, +.highlight pre code[class*='language-py']::before { + content: 'py'; + background: blue; + color: yellow ; +} +.highlight pre code[class*='language-css']::before{ + content: 'css'; + background: cyan; + color: black ; +} +.highlight pre code[class*='language-go']::before{ + content: 'Go'; + background: cyan; + color: royalblue ; +} +.highlight pre code[class*='language-md']::before, +.highlight pre code[class*='language-md']::before{ + content: 'Markdown'; + background: royalblue; + color: whitesmoke ; +} \ No newline at end of file diff --git 
a/public/css/main.d902908ac6e0fab67957de5db5aea1b6455b19ae2ca98eac4c95a4a0fdc02238.css b/public/css/main.d902908ac6e0fab67957de5db5aea1b6455b19ae2ca98eac4c95a4a0fdc02238.css index 4b237d8..3a7d1cd 100644 --- a/public/css/main.d902908ac6e0fab67957de5db5aea1b6455b19ae2ca98eac4c95a4a0fdc02238.css +++ b/public/css/main.d902908ac6e0fab67957de5db5aea1b6455b19ae2ca98eac4c95a4a0fdc02238.css @@ -1,24 +1,21 @@ /* Markdown */ -:root { - --maincolor: #e24329; - --bordercl: rebeccapurple; - --callouctcolor: dodgerblue; - --hovercolor: navy; - --darkMaincolor: #50fa7b; +:root{ +--maincolor: red; +--bordercl:rebeccapurple; +--callouctcolor:dodgerblue; +--hovercolor:navy; +--darkMaincolor: #50fa7b; } - html { color: #232333; font-family: 'Roboto Mono', monospace; font-size: 15px; line-height: 1.6em; } - -body { +body{ display: block; margin: 8px; } - * { -webkit-tap-highlight-color: rgba(0, 0, 0, 0); } @@ -51,22 +48,19 @@ a { color: inherit; text-decoration: none; } - a:hover { - background-color: var(--hovercolor); - color: #fff; + background-color: var(--hovercolor); + color: #fff; } ul { list-style: none; padding-left: 2ch; } - ul li { text-indent: -2ch; } - -ul>li::before { +ul > li::before { content: '* '; font-weight: bold; } @@ -99,7 +93,6 @@ figure h4 { margin: 0; margin-bottom: 1em; } - figure h4::before { content: '↳ '; } @@ -151,46 +144,17 @@ header { header .main { font-size: 1.5rem; } - -h1, -h2, -h3, -h4, -h5, -h6 { +h1, h2, h3, h4, h5, h6 { font-size: 1.2rem; margin-top: 2em; } -h1::before { - color: var(--maincolor); - content: '# '; -} - -h2::before { - color: var(--maincolor); - content: '## '; -} - -h3::before { - color: var(--maincolor); - content: '### '; -} - -h4::before { - color: var(--maincolor); - content: '#### '; -} - -h5::before { - color: var(--maincolor); - content: '##### '; -} - -h6::before { - color: var(--maincolor); - content: '###### '; -} +h1::before { color: var(--maincolor); content: '# '; } +h2::before { color: var(--maincolor); content: '## '; } +h3::before { color: var(--maincolor); content: '### '; } +h4::before { color: var(--maincolor); content: '#### '; } +h5::before { color: var(--maincolor); content: '##### '; } +h6::before { color: var(--maincolor); content: '###### '; } .meta { color: #999; @@ -205,19 +169,16 @@ footer { padding: 2rem 0rem; margin-top: 2rem; } - .soc { display: flex; align-items: center; border-bottom: none; } - .border { margin-left: 0.5rem; margin-right: 0.5rem; border: 1px solid; } - .footer-info { padding: var(--footer-padding); } @@ -259,49 +220,40 @@ article .title { } .site-description { - display: flex; - justify-content: space-between; +display: flex; +justify-content: space-between; } - -.tags li::before { +.tags li::before{ content: "🏷 "; } - -.tags a { - border-bottom: 3px solid var(--maincolor); +.tags a{ + border-bottom: 3px solid var(--maincolor); } - -.tags a:hover { - color: white; - background-color: var(--hovercolor); +.tags a:hover{ + color:white; + background-color: var(--hovercolor); } - -svg { +svg{ max-height: 15px; } - -.soc:hover { +.soc:hover{ color: white; } - -.draft-label { - color: var(--bordercl); - text-decoration: none; - padding: 2px 4px; - border-radius: 4px; - margin-left: 6px; - background-color: #f9f2f4; +.draft-label{ + color: var(--bordercl); + text-decoration: none; + padding: 2px 4px; + border-radius: 4px; + margin-left: 6px; + background-color: #f9f2f4; } - .highlight { position: relative; -webkit-overflow-scrolling: touch; } - .highlight pre code[class*="language-"] { 
-webkit-overflow-scrolling: touch; } - .highlight pre code[class*="language-"]::before { background: black; border-radius: 0 0 0.25rem 0.25rem; @@ -318,56 +270,49 @@ svg { .highlight pre code[class=language-javaScript]::before, .highlight pre code[class="language-js"]::before { - content: "js"; - background: #f7df1e; - color: black; +content: "js"; +background: #f7df1e; +color: black; } - .highlight pre code[class*='language-yml']::before, .highlight pre code[class*='language-yaml']::before { - content: 'yaml'; - background: #f71e6a; - color: white; +content: 'yaml'; +background: #f71e6a; +color: white; } - .highlight pre code[class*='language-shell']::before, .highlight pre code[class*='language-bash']::before, .highlight pre code[class*='language-sh']::before { - content: 'shell'; - background: green; - color: white +content: 'shell'; +background: green; +color:white } - -.highlight pre code[class*='language-json']::before { - content: 'json'; - background: dodgerblue; - color: #000000 +.highlight pre code[class*='language-json']::before{ +content: 'json'; +background: dodgerblue; + color: #000000 } - .highlight pre code[class*='language-python']::before, .highlight pre code[class*='language-py']::before { - content: 'py'; - background: blue; - color: yellow; +content: 'py'; +background: blue; +color: yellow ; } - -.highlight pre code[class*='language-css']::before { - content: 'css'; - background: cyan; - color: black; +.highlight pre code[class*='language-css']::before{ +content: 'css'; +background: cyan; +color: black ; } - -.highlight pre code[class*='language-go']::before { - content: 'Go'; - background: cyan; - color: royalblue; +.highlight pre code[class*='language-go']::before{ +content: 'Go'; +background: cyan; +color: royalblue ; } - .highlight pre code[class*='language-md']::before, -.highlight pre code[class*='language-md']::before { - content: 'Markdown'; - background: royalblue; - color: whitesmoke; +.highlight pre code[class*='language-md']::before{ +content: 'Markdown'; +background: royalblue; +color: whitesmoke ; } /* table */ @@ -376,13 +321,13 @@ table { border-collapse: collapse; } -table th { +table th{ padding: 6px 13px; border: 1px solid #dfe2e5; font-size: large; } -table td { +table td{ padding: 6px 13px; border: 1px solid #dfe2e5; -} \ No newline at end of file +} diff --git a/public/img/dilated_sliding_window.png b/public/img/dilated_sliding_window.png new file mode 100644 index 0000000..b25e679 Binary files /dev/null and b/public/img/dilated_sliding_window.png differ diff --git a/public/img/first_pred_kv.png b/public/img/first_pred_kv.png new file mode 100644 index 0000000..f400bcf Binary files /dev/null and b/public/img/first_pred_kv.png differ diff --git a/public/img/longformer.png b/public/img/longformer.png new file mode 100644 index 0000000..798c5d9 Binary files /dev/null and b/public/img/longformer.png differ diff --git a/public/img/second_pred_kv.png b/public/img/second_pred_kv.png new file mode 100644 index 0000000..986981d Binary files /dev/null and b/public/img/second_pred_kv.png differ diff --git a/public/img/sliding_window.png b/public/img/sliding_window.png new file mode 100644 index 0000000..61fd48a Binary files /dev/null and b/public/img/sliding_window.png differ diff --git a/public/img/sparse_attention.png b/public/img/sparse_attention.png new file mode 100644 index 0000000..4128ca7 Binary files /dev/null and b/public/img/sparse_attention.png differ diff --git a/public/index.html b/public/index.html index 7814d95..14c793d 100644 --- 
a/public/index.html +++ b/public/index.html @@ -12,19 +12,57 @@ - + + + + + + + + + + + + + + + + @@ -37,6 +75,12 @@ @@ -48,28 +92,50 @@
-

Post 2

+

Intro to Attention


- Here’s my second content… + A brief introduction to attention in the transformer architecture.
- Read more ⟶ + Read more ⟶
-

Test

- +

Flash Attention

+
- Here’s my content! What do you think ? $1^2$… + Reduce the memory used to compute exact attention.
- Read more ⟶ + Read more ⟶
+
+

Multi & Grouped Query Attention

+ +
+ + Use fewer K and V matrices to use less memory. +
+ Read more ⟶ +
+ + +
diff --git a/public/index.xml b/public/index.xml index 2a9b304..9e8f3f5 100644 --- a/public/index.xml +++ b/public/index.xml @@ -6,21 +6,63 @@ Recent content on Jonah's ML Notes Hugo -- gohugo.io en-us - Sat, 30 Mar 2024 11:49:13 +0000 + Sat, 30 Mar 2024 00:00:00 +0000 - Post 2 - https://www.jonahramponi.com/posts/test-copy/ - Sat, 30 Mar 2024 11:49:13 +0000 - https://www.jonahramponi.com/posts/test-copy/ - Here’s my second content + Intro to Attention + https://www.jonahramponi.com/posts/intro_to_attention/ + Sat, 30 Mar 2024 00:00:00 +0000 + https://www.jonahramponi.com/posts/intro_to_attention/ + Suppose you give an LLM the input What is the capital of France? The first thing the LLM will do is split this input into tokens. A token is just some combinations of characters. You can see an example of the tokenization outputs for the question below. $\colorbox{red}{What}\colorbox{magenta}{ is}\colorbox{green}{ the}\colorbox{orange}{ capital}\colorbox{purple}{ of}\colorbox{brown}{ France}\colorbox{cyan}?$ (This tokenization was produced using cl100k_base, the tokenizer used in GPT-3.5-turbo and GPT-4.) In this example we have $(n = 7)$ tokens. - Test - https://www.jonahramponi.com/posts/test/ - Sat, 30 Mar 2024 11:49:13 +0000 - https://www.jonahramponi.com/posts/test/ - Here’s my content! What do you think ? $1^2$ + Flash Attention + https://www.jonahramponi.com/posts/flash_attention/ + Tue, 26 Mar 2024 00:00:00 +0000 + https://www.jonahramponi.com/posts/flash_attention/ + The goal of Flash Attention is to compute the attention value with fewer high bandwidth memory read / writes. The approach has since been refined in Flash Attention 2. We will split the attention inputs $Q,K,V$ into blocks. Each block will be handled separately, and attention will therefore be computed with respect to each block. With the correct scaling, adding the outputs from each block we will give us the same attention value as we would get by computing everything all together. + + + Multi & Grouped Query Attention + https://www.jonahramponi.com/posts/mqa_gqa/ + Fri, 22 Mar 2024 00:00:00 +0000 + https://www.jonahramponi.com/posts/mqa_gqa/ + Multi Query Attention Multi Query Attention (MQA) using the same $K$ and $V$ matrices for each head in our multi head self attention mechanism. For a given head, $h$, $1 \leq h \leq H$, the attention mechanism is calculated as \begin{equation} h_i = \text{attention}(M\cdot W_h^Q, M \cdot W^K,M \cdot W^V). \end{equation} For each of our $H$ heads, the only difference in the weight matrices is in $W_h^Q$. Each of these $W_h$ has dimension $(n \times d_q)$. + + + Sliding Window Attention + https://www.jonahramponi.com/posts/sliding_window_attention/ + Fri, 22 Mar 2024 00:00:00 +0000 + https://www.jonahramponi.com/posts/sliding_window_attention/ + Sliding Window Attention reduces the number of calculations we are doing when computing self attention. Previously, to compute attention we took our input matrix of positional encodings $M$, and made copies named $Q, K$ and $V$. We used these copies to compute \begin{equation} \text{attention}(Q,K,V) = \text{softmax}\Big(\frac{Q K^T}{\sqrt{d_k}}\Big) V. \end{equation} For now, let’s ignore the re-scaling by $\sqrt{d_k}$ and just look at the computation of $QK^T$. 
This computation looks like \begin{equation} Q \times K^T = \begin{pmatrix} Q_{11} & Q_{12} & \cdots & Q_{1d} \\ \vdots & \ddots & \cdots & \vdots \\ Q_{n1} & Q_{n2} & \cdots & Q_{nd} \end{pmatrix} \times \begin{pmatrix} K_{11} & K_{21} & \cdots & K_{n1} \\ \vdots & \ddots & \cdots & \vdots \\ K_{1d} & K_{2d} & \cdots & K_{nd} \end{pmatrix} \end{equation} + + + Sparse Attention + https://www.jonahramponi.com/posts/sparse_attention/ + Fri, 22 Mar 2024 00:00:00 +0000 + https://www.jonahramponi.com/posts/sparse_attention/ + Sparse Attention introduces sparse factorizations on the attention matrix. To implement this we introduce a connectivity pattern $S = {S_1,\dots,S_n}$. Here, $S_i$ denotes the set of indices of the input vectors to which the $i$th output vector attends. For instance, in regular $n^2$ attention every input vector attends to every output vector before it in the sequence. Remember that $d_k$ is the inner dimension of our queries and keys. Sparse Attention is given as follows + + + The KV Cache + https://www.jonahramponi.com/posts/kv_cache/ + Fri, 22 Mar 2024 00:00:00 +0000 + https://www.jonahramponi.com/posts/kv_cache/ + The computation of attention is costly. Remember that our decoder works in an auto-regressive fashion. For our given input $$\colorbox{red}{What}\colorbox{magenta}{ is}\colorbox{green}{ the}\colorbox{orange}{ capital}\colorbox{purple}{ of}\colorbox{brown}{ France}\colorbox{cyan}{?}"$$ \begin{align} \text{Prediction 1} &= \colorbox{orange}{The} \\ \text{Prediction 2} &= \colorbox{orange}{The}\colorbox{pink}{ capital} \\ &\vdots \\ \text{Prediction $p$} &= \colorbox{orange}{The}\colorbox{pink}{ capital} (\dots) \colorbox{red}{ Paris.} \end{align} To produce prediction $2$, we will take the output from prediction $1$. At each step, the model will also see our input sequence. + + + PDFs and Resources + https://www.jonahramponi.com/posts/resources/ + Wed, 28 Feb 2024 11:49:13 +0000 + https://www.jonahramponi.com/posts/resources/ + The contents of this website can be found as a pdf here. + + + + https://www.jonahramponi.com/about/ + Mon, 01 Jan 0001 00:00:00 +0000 + https://www.jonahramponi.com/about/ + This is the about page. diff --git a/public/page/2/index.html b/public/page/2/index.html new file mode 100644 index 0000000..60ffe39 --- /dev/null +++ b/public/page/2/index.html @@ -0,0 +1,157 @@ + + + + + Jonah's ML Notes | Home + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ + +
+
+ + + +
+

Sliding Window Attention

+ +
+ + Altering the tokens to which a token in the input sequence attends. + +
+ Read more ⟶ +
+ +
+

Sparse Attention

+ +
+ + Reducing the number of calculations to compute attention. + +
+ Read more ⟶ +
+ +
+

The KV Cache

+ +
+ + Computing the attention more efficiently at inference. + +
+ Read more ⟶ +
+ + + + + + +
+ + + + +
+ + + diff --git a/public/page/3/index.html b/public/page/3/index.html new file mode 100644 index 0000000..1473a1b --- /dev/null +++ b/public/page/3/index.html @@ -0,0 +1,133 @@ + + + + + Jonah's ML Notes | Home + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ + +
+
+ + + +
+

PDFs and Resources

+ +
+ + The contents of this website can be found as a pdf here.… + +
+ Read more ⟶ +
+ + + + + + +
+ + + + +
+ + + diff --git a/public/posts/file/Attention_Mechanisms.pdf b/public/posts/file/Attention_Mechanisms.pdf new file mode 100644 index 0000000..a40bc0a Binary files /dev/null and b/public/posts/file/Attention_Mechanisms.pdf differ diff --git a/public/posts/flash_attention/index.html b/public/posts/flash_attention/index.html new file mode 100644 index 0000000..7c58f74 --- /dev/null +++ b/public/posts/flash_attention/index.html @@ -0,0 +1,164 @@ + + + + Flash Attention - Jonah's ML Notes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+

Flash Attention

+
Posted on Mar 26, 2024
+
+ +
+ tl;dr: + Reduce the memory used to compute exact attention.
+ +
+

The goal of Flash Attention is to compute the attention value with fewer high-bandwidth memory reads and writes. The approach has since been refined in Flash Attention 2.

+

We will split the attention inputs $Q,K,V$ into blocks. Each block will be handled separately, and attention will therefore be computed with respect to each block. With the correct scaling, adding the outputs from each block will give us the same attention value we would get by computing everything in one go.

+

Tiling. To compute attention, we multiply $Q$ by $K^T$, divide by $\sqrt{d_k}$, and then take the softmax. Keeping track of the scaling values in the softmax is the key to making this technique work. The softmax for a vector $\vec{x} \in \mathbb{R}^{2n}$ is given by

+

$$ +m(x):= \max_i x_i, \hspace{3mm} f(x):= [e^{x_1-m(x)}, \dots, e^{x_{2n} -m(x)}], \hspace{3mm} \ell(x) := \sum_i f(x)_i, \hspace{3mm} \text{softmax}(x) := \frac{f(x)}{\ell(x)}. +$$

+

This looks unfriendly, but it is really just the notation for a more numerically stable softmax. What does that mean? Notice we are just applying the regular softmax after shifting each element of $\vec{x}$ down by $m(x) = \max_i x_i$. We can do this because $\text{softmax}(\vec{x}) = \text{softmax}(\vec{x}-c)$ for any scalar $c$.

+

Proof +\begin{align*} +\text{softmax}(\vec{x} - c) &= \frac{e^{\vec{x} - c}}{\sum_{j} e^{x_j - c}} \\ +&= \frac{e^{\vec{x}} \cdot e^{-c}}{\sum_{j} e^{x_j} \cdot e^{-c}} \\ +&= \frac{e^{\vec{x}}}{\sum_{j} e^{x_j}} \\ +&= \text{softmax}(\vec{x}) +\end{align*}

+

In this case, we improve numerical stability by ensuring we never take the exponential of a very large number. Doing so can cause overflow, which simply means the number gets too big to store in the given datatype. By subtracting the largest element, we ensure the shifted vector only has non-positive entries. For example, in 64-bit floating point the maximum value we can represent is very large (on the order of $10^{308}$). However

+

$$ +e^x > 10^{308} \implies x > \ln(10^{308}) \implies x > 308 \times \ln(10) \implies x > 709. +$$

+

Therefore, any $x$ larger than roughly $709$ will cause overflow. For instance, in numpy $\exp(709) \approx 8.22 \times 10^{307}$, but $\exp(710)$ returns $inf$.

+
import numpy as np

np.exp(709)
+# 8.218407461554972e+307
+
np.exp(710)
+# <stdin>:1: RuntimeWarning: overflow encountered in exp
+# inf
+

We certainly do not want our model to hit any overflow errors. It is therefore preferable to use this numerically stable version of softmax.
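To see this concretely, here is a small numpy sketch comparing the two versions (the helper names are mine, for illustration only):

import numpy as np

def softmax_naive(x):
    z = np.exp(x)              # overflows once any entry exceeds ~709
    return z / z.sum()

def softmax_stable(x):
    z = np.exp(x - np.max(x))  # exponents are all <= 0, so no overflow
    return z / z.sum()

x = np.array([1.0, 1000.0])
softmax_naive(x)   # overflow warning, returns array([ 0., nan])
softmax_stable(x)  # array([0., 1.])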

+

To compute softmax in blocks, let’s look at the simple case of decomposing our vector $\vec{x} \in \mathbb{R}^{2n}$ into two smaller vectors $\vec{x}_1,\vec{x}_2$, each in $\mathbb{R}^n$. Our softmax calculation becomes

+

\begin{aligned} +m(x) &= m([x_1\hspace{3mm} x_2]) = \max (m(x_1),m(x_2)), \\ +f(x) &= [e^{m(x_1) - m(x)}f(x_1) \hspace{3mm} e^{m(x_2) - m(x)}f(x_2)], \\ +\ell(x) &= \ell([x_1\hspace{3mm} x_2]) = e^{m(x_1) - m(x)}\ell(x_1) + e^{m(x_2) - m(x)}\ell(x_2), \\ +\text{softmax}(x) &= \frac{f(x)}{\ell(x)}. +\end{aligned}

+

Notice that we use $m(x_i) - m(x)$ as the rescaling factor, as we do not know in advance which block will contain the maximum value of $\vec{x}$. By keeping track of both $m(x)$ and $\ell(x)$, we can accurately recombine the softmax outputs from each block, since we know exactly how to rescale them.
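As a sanity check, here is a numpy sketch of exactly this two-block recombination (again, the names are mine, for illustration):

import numpy as np

def softmax_two_blocks(x1, x2):
    # per-block statistics
    m1, m2 = np.max(x1), np.max(x2)
    f1, f2 = np.exp(x1 - m1), np.exp(x2 - m2)
    l1, l2 = f1.sum(), f2.sum()
    # recombine with the global max m(x)
    m = max(m1, m2)
    f = np.concatenate([np.exp(m1 - m) * f1, np.exp(m2 - m) * f2])
    l = np.exp(m1 - m) * l1 + np.exp(m2 - m) * l2
    return f / l

x = np.random.randn(8)
reference = np.exp(x - x.max()) / np.exp(x - x.max()).sum()
assert np.allclose(softmax_two_blocks(x[:4], x[4:]), reference)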

+

Recomputation. We also do not wish to store all the intermediate values from the forward pass for use in the backward pass. Typically the backward pass requires the attention matrix $QK^T$ and its softmax, $\text{softmax}(QK^T)$. However, by working with blocks of $Q,K,V$ (and keeping the statistics $m$ and $\ell$), these can be recomputed block by block, so the whole attention matrix never needs to be loaded in during the backward pass.
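Putting the pieces together, here is a minimal numpy sketch of the tiled forward pass: it streams over blocks of $K$ and $V$, carrying the running statistics $m$ and $\ell$, and matches the usual attention computation. This is an illustration under my own naming, not the actual kernel, which also tiles over $Q$ and manages GPU memory explicitly.

import numpy as np

def attention_reference(Q, K, V):
    S = Q @ K.T / np.sqrt(Q.shape[1])
    P = np.exp(S - S.max(axis=1, keepdims=True))
    return (P / P.sum(axis=1, keepdims=True)) @ V

def attention_tiled(Q, K, V, block=2):
    n, d = Q.shape
    O = np.zeros_like(V, dtype=float)  # unnormalized output accumulator
    m = np.full(n, -np.inf)            # running row-wise max
    l = np.zeros(n)                    # running row-wise sum of exponentials
    for j in range(0, n, block):       # stream over K/V blocks
        S = Q @ K[j:j + block].T / np.sqrt(d)
        m_new = np.maximum(m, S.max(axis=1))
        alpha = np.exp(m - m_new)      # rescales the old statistics
        P = np.exp(S - m_new[:, None])
        l = alpha * l + P.sum(axis=1)
        O = alpha[:, None] * O + P @ V[j:j + block]
        m = m_new
    return O / l[:, None]

rng = np.random.default_rng(0)
Q, K, V = (rng.standard_normal((6, 4)) for _ in range(3))
assert np.allclose(attention_reference(Q, K, V), attention_tiled(Q, K, V))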

+ +
+ + +
+
+ + + +
+ + diff --git a/public/posts/index.html b/public/posts/index.html index 9195a91..5ca1d29 100644 --- a/public/posts/index.html +++ b/public/posts/index.html @@ -11,19 +11,57 @@ - + + + + + + + + + + + + + + + + @@ -34,6 +72,12 @@ @@ -43,9 +87,19 @@

All articles