\documentclass{article}
\usepackage[cmyk]{xcolor}
% start
\usepackage{hyperref}
\setcounter{tocdepth}{4}
\makeatletter
\DeclareRobustCommand*{\Myhref}[1][]{% definition copied from hyperref.sty
\begingroup
\setkeys{Hyp}{#1}% Changed href to Hyp
\@ifnextchar\bgroup\Hy@href{\hyper@normalise\href@}%
}
\makeatother
\makeatletter
\let\Hy@setpdfborderOrig\Hy@setpdfborder
\def\Hy@setpdfborder{\ocgbase@insert@oc\Hy@setpdfborderOrig}%
\makeatother
\usepackage{xparse,ocgbase,calc,tikzpagenodes,linegoal}
\usetikzlibrary{calc}
\ExplSyntaxOn
\let\tpPdfLink\pbs_pdflink:nn
\let\tpPdfAnnot\pbs_pdfannot:nnnn\let\tpPdfLastAnn\pbs_pdflastann:
\let\tpAppendToFields\pbs_appendtofields:n
\def\tpPdfXform{\pbs_pdfxform:nnnnn{1}{1}{}{}}
\let\tpPdfLastXform\pbs_pdflastxform:
\let\cListSet\clist_set:Nn\let\cListItem\clist_item:Nn
\ExplSyntaxOff
\makeatletter
\NewDocumentCommand{\tooltip}{%
ssssO{\ifdefined\@linkcolor\@linkcolor\else blue\fi}mO{gray!20}mO{0pt,0pt}%
}{{%
\leavevmode%
\IfBooleanT{#2}{%
%for variants with two and more stars, put tip box on a PDF Layer (OCG)
\ocgbase@new@ocg{tipOCG.\thetcnt}{%
/Print<</PrintState/OFF>>/Export<</ExportState/OFF>>%
}{false}%
\xdef\tpTipOcg{\ocgbase@last@ocg}%
%prevent simultaneous visibility of multiple non-draggable tooltips
\ocgbase@add@ocg@to@radiobtn@grp{tool@tips}{\ocgbase@last@ocg}%
}%
\tpPdfLink{%
\IfBooleanTF{#4}{%
/Subtype/Link/Border[0 0 0]/A <</S/SetOCGState/State [/Toggle \tpTipOcg]>>
}{%
/Subtype/Screen%
/AA<<%
\IfBooleanTF{#3}{%
/E<</S/SetOCGState/State [/Toggle \tpTipOcg]>>%
}{%
\IfBooleanTF{#2}{%
/E<</S/SetOCGState/State [/ON \tpTipOcg]>>%
/X<</S/SetOCGState/State [/OFF \tpTipOcg]>>%
}{
\IfBooleanTF{#1}{%
/E<</S/JavaScript/JS(%
var fd=this.getField('tip.\thetcnt');%
if(typeof(click\thetcnt)=='undefined'){%
var click\thetcnt=false;%
var fdor\thetcnt=fd.rect;var dragging\thetcnt=false;%
}%
if(fd.display==display.hidden){%
fd.delay=true;fd.display=display.visible;fd.delay=false;%
}else{%
if(!click\thetcnt&&!dragging\thetcnt){fd.display=display.hidden;}%
if(!dragging\thetcnt){click\thetcnt=false;}%
}%
this.dirty=false;%
)>>%
}{%
/E<</S/JavaScript/JS(%
var fd=this.getField('tip.\thetcnt');%
if(typeof(click\thetcnt)=='undefined'){%
var click\thetcnt=false;%
var fdor\thetcnt=fd.rect;var dragging\thetcnt=false;%
}%
if(fd.display==display.hidden){%
fd.delay=true;fd.display=display.visible;fd.delay=false;%
}%
this.dirty=false;%
)>>%
/X<</S/JavaScript/JS(%
if(!click\thetcnt&&!dragging\thetcnt){fd.display=display.hidden;}%
if(!dragging\thetcnt){click\thetcnt=false;}%
this.dirty=false;%
)>>%
}%
/U<</S/JavaScript/JS(click\thetcnt=true;this.dirty=false;)>>%
/PC<</S/JavaScript/JS (%
var fd=this.getField('tip.\thetcnt');%
try{fd.rect=fdor\thetcnt;}catch(e){}%
fd.display=display.hidden;this.dirty=false;%
)>>%
/PO<</S/JavaScript/JS(this.dirty=false;)>>%
}%
}%
>>%
}%
}{{\color{#5}#6}}%
\sbox\tiptext{%
\IfBooleanT{#2}{%
\ocgbase@oc@bdc{\tpTipOcg}\ocgbase@open@stack@push{\tpTipOcg}}%
\fcolorbox{black}{#7}{#8}%
\IfBooleanT{#2}{\ocgbase@oc@emc\ocgbase@open@stack@pop\tpNull}%
}%
\cListSet\tpOffsets{#9}%
\edef\twd{\the\wd\tiptext}%
\edef\tht{\the\ht\tiptext}%
\edef\tdp{\the\dp\tiptext}%
\tipshift=0pt%
\IfBooleanTF{#2}{%
%OCG-based (that is, all non-draggable) boxes should not extend beyond the
%current column as they may get overlaid by text in the neighbouring column
\setlength\whatsleft{\linegoal}%
}{%
\measureremainder{\whatsleft}%
}%
\ifdim\whatsleft<\dimexpr\twd+\cListItem\tpOffsets{1}\relax%
\setlength\tipshift{\whatsleft-\twd-\cListItem\tpOffsets{1}}\fi%
\IfBooleanF{#2}{\tpPdfXform{\tiptext}}%
\raisebox{\heightof{#6}+\tdp+\cListItem\tpOffsets{2}}[0pt][0pt]{%
\makebox[0pt][l]{\hspace{\dimexpr\tipshift+\cListItem\tpOffsets{1}\relax}%
\IfBooleanTF{#2}{\usebox{\tiptext}}{%
\tpPdfAnnot{\twd}{\tht}{\tdp}{%
/Subtype/Widget/FT/Btn/T (tip.\thetcnt)%
/AP<</N \tpPdfLastXform>>%
/MK<</TP 1/I \tpPdfLastXform/IF<</S/A/FB true/A [0.0 0.0]>>>>%
/Ff 65536/F 3%
/AA <<%
/U <<%
/S/JavaScript/JS(%
var fd=event.target;%
var mX=this.mouseX;var mY=this.mouseY;%
var drag=function(){%
var nX=this.mouseX;var nY=this.mouseY;%
var dX=nX-mX;var dY=nY-mY;%
var fdr=fd.rect;%
fdr[0]+=dX;fdr[1]+=dY;fdr[2]+=dX;fdr[3]+=dY;%
fd.rect=fdr;mX=nX;mY=nY;%
};%
if(!dragging\thetcnt){%
dragging\thetcnt=true;Int=app.setInterval("drag()",1);%
}%
else{app.clearInterval(Int);dragging\thetcnt=false;}%
this.dirty=false;%
)%
>>%
>>%
}%
\tpAppendToFields{\tpPdfLastAnn}%
}%
}}%
\stepcounter{tcnt}%
}}
\makeatother
\newsavebox\tiptext\newcounter{tcnt}
\newlength{\whatsleft}\newlength{\tipshift}
\newcommand{\measureremainder}[1]{%
\begin{tikzpicture}[overlay,remember picture]
\path let \p0 = (0,0), \p1 = (current page.east) in
[/utils/exec={\pgfmathsetlength#1{\x1-\x0}\global#1=#1}];
\end{tikzpicture}%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newsavebox{\switchbox}% measure width
\savebox{\switchbox}{\fcolorbox{black}{yellow}{\bfseries +}}
\newcommand{\centerwithin}[2]{% #1=small symbol, #2=wide symbol
{\mathmakebox[\widthof{\ensuremath{{}#2{}}}][c]{{#1}}}}
% end
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{siunitx}
\usepackage{amssymb}
\usepackage{dsfont}
\usepackage{authblk}
\usepackage[lite]{mtpro2}
\usepackage{mathrsfs}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{booktabs}
\usepackage{empheq}
\usepackage{lscape}
\usepackage{animate}
\usepackage{bigdelim}
\usepackage{color, colortbl}
\usepackage{soul}
\usepackage{caption}
\usepackage{lipsum}
\usepackage{arydshln}
\usepackage[compat=1.1.0]{tikz-feynman}
\usepackage{tikz}
\usepackage{multirow}
\usetikzlibrary{backgrounds}
\usepackage[a4paper, total={6.2in, 9.5in}]{geometry}
\usepackage[title]{appendix}
\usepackage{float}
\usepackage{pgfplots}
\usepackage{subcaption}
\usepackage{mdframed}
\newcommand*\widefbox[1]{\fbox{\hspace{2em}#1\hspace{2em}}}
\newcommand{\sech}{\operatorname{sech}}
\newcommand{\mathcolorbox}[2]{\colorbox{#1}{$\displaystyle #2$}}
\tikzset{Lstyle/.style={fill=cyan!20}}
\tikzset{Mstyle/.style={fill=orange!30}}
\tikzset{Nstyle/.style={fill=yellow!20}}
\tikzset{Ostyle/.style={fill=green!30}}
\tikzset{nodeStyle/.style={fill=gray!35}}
\tikzstyle{residual} = [dataNode, text width=11em, fill=red!20, minimum height=5em, text centered, rounded corners, drop shadow]
\tikzstyle{dataNode}=[draw, fill=orange]
\tikzstyle{errNode}=[draw, fill=light-blue]
\tikzstyle{opt} = [text width=10em, fill=red!20, minimum height=5em, text centered, rounded corners]
\tikzstyle{opt2} = [text width=11em, fill=yellow!70, minimum height=5em, text centered, rounded corners]
\usetikzlibrary{shapes,arrows,shadows}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
\DeclareMathAlphabet\mathbfcal{OMS}{cmsy}{b}{n}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\definecolor{shadecolor}{cmyk}{0,0,0.41,0}
\definecolor{light-blue}{cmyk}{0.25,0,0,0}
\definecolor{green-yellow}{rgb}{0.68, 1.0, 0.18}
\definecolor{Gray}{gray}{0.9}
\newsavebox{\mysaveboxM}
\newsavebox{\mysaveboxT}
\newcolumntype{z}{>{\columncolor{shadecolor}}l}
\newcolumntype{b}{>{\columncolor{shadecolor}}r}
\newcolumntype{y}{>{\columncolor{green-yellow}}l}
\newcolumntype{x}{>{\columncolor{Gray}}c}
\newcolumntype{g}{>{\columncolor{Gray}}l}
\newcommand*\backPropBox[2][Example]{%
\sbox{\mysaveboxM}{#2}%
\sbox{\mysaveboxT}{\fcolorbox{black}{light-blue}{#1}}%
\sbox{\mysaveboxM}{%
\parbox[b][\ht\mysaveboxM+.5\ht\mysaveboxT+.5\dp\mysaveboxT][b]{%
\wd\mysaveboxM}{#2}%
}%
\sbox{\mysaveboxM}{%
\fcolorbox{black}{shadecolor}{%
\makebox[\linewidth-5em]{\usebox{\mysaveboxM}}%
}%
}%
\usebox{\mysaveboxM}%
\makebox[0pt][r]{%
\makebox[\wd\mysaveboxM][c]{%
\raisebox{\ht\mysaveboxM-0.5\ht\mysaveboxT
+0.5\dp\mysaveboxT-0.5\fboxrule}{\usebox{\mysaveboxT}}%
}%
}%
}
\newcommand*\LongbackPropBox[2][Example]{%
\sbox{\mysaveboxM}{#2}%
\sbox{\mysaveboxT}{\fcolorbox{black}{light-blue}{#1}}%
\sbox{\mysaveboxM}{%
\parbox[b][\ht\mysaveboxM+.5\ht\mysaveboxT+.5\dp\mysaveboxT][b]{%
\wd\mysaveboxM}{#2}%
}%
\sbox{\mysaveboxM}{%
\fcolorbox{black}{shadecolor}{%
\makebox[\linewidth+5em]{\usebox{\mysaveboxM}}%
}%
}%
\usebox{\mysaveboxM}%
\makebox[0pt][r]{%
\makebox[\wd\mysaveboxM][c]{%
\raisebox{\ht\mysaveboxM-0.5\ht\mysaveboxT
+0.5\dp\mysaveboxT-0.5\fboxrule}{\usebox{\mysaveboxT}}%
}%
}%
}
\newcommand*\forwardBox[2][Example]{%
\sbox{\mysaveboxM}{#2}%
\sbox{\mysaveboxT}{\fcolorbox{black}{orange}{#1}}%
\sbox{\mysaveboxM}{%
\parbox[b][\ht\mysaveboxM+.5\ht\mysaveboxT+.5\dp\mysaveboxT][b]{%
\wd\mysaveboxM}{#2}%
}%
\sbox{\mysaveboxM}{%
\fcolorbox{black}{green-yellow}{%
\makebox[\linewidth-5em]{\usebox{\mysaveboxM}}%
}%
}%
\usebox{\mysaveboxM}%
\makebox[0pt][r]{%
\makebox[\wd\mysaveboxM][c]{%
\raisebox{\ht\mysaveboxM-0.5\ht\mysaveboxT
+0.5\dp\mysaveboxT-0.5\fboxrule}{\usebox{\mysaveboxT}}%
}%
}%
}
\begin{document}
\title{Deep learning for pedestrians: backpropagation in CNNs}
\author{Laurent Bou\'e \\ \Myhref[hidelinks]{mailto:[email protected]}{\protect\includegraphics[width=1cm,height=0.6cm]{pptx/logos/gmail.png}} \,\,\,\, \Myhref[hidelinks]{https://www.linkedin.com/in/laurent-bou\%C3\%A9-b7923853/}{\protect\includegraphics[width=0.8cm,height=0.8cm]{pptx/logos/linkedin.png}} \,\,\,\, \Myhref[hidelinks]{https://github.com/Ranlot}{\protect\includegraphics[width=1cm,height=1cm]{pptx/logos/github.png}} \,\,\,\, \Myhref[hidelinks]{https://twitter.com/ranlot75}{\protect\includegraphics[width=0.8cm,height=0.8cm]{pptx/logos/twitter.png}} }
\affil{SAP Labs}
\date{}
\maketitle
\begin{abstract}
The goal of this document is to provide a pedagogical introduction to the main concepts underpinning the training of deep neural networks using gradient descent; a process known as backpropagation. Although we focus on a very influential class of architectures called ``convolutional neural networks'' (CNNs) the approach is generic and useful to the machine learning community as a whole. Motivated by the observation that derivations of backpropagation are often obscured by clumsy index-heavy narratives that appear somewhat mathemagical, we aim to offer a conceptually clear, vectorized description that articulates well the higher level logic. Following the principle of ``writing is nature's way of letting you know how sloppy your thinking is'', we try to make the calculations meticulous, self-contained and yet as intuitive as possible. Taking nothing for granted, ample illustrations serve as visual guides and an extensive bibliography is provided for further explorations.
\end{abstract}
\noindent \vspace{0.3cm}
\newcommand\tempboxTT{%
\begin{minipage}{0.643\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{tikzpicture}
\node (opt) [opt] {\bf Click to open optional content \\ \vspace{0.6cm}};
\node[right = 0.8cm of opt, opt2] (opt2) {\bf Click again to close optional content \\ \vspace{0.6cm}};
\end{tikzpicture}
\end{minipage}}
\fcolorbox{black}{lightgray}{%
\minipage[t]{\dimexpr0.9\linewidth-2\fboxsep-2\fboxrule\relax}
\noindent For the sake of clarity, long mathematical derivations and visualizations have been broken up into a short ``summarized view'' and a longer ``detailed view''. Detailed views are encoded into the~PDF as optional content groups that become visible by clicking on the yellow symbol~$\hspace{0.53cm} \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxTT}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}}$. In addition, some figures contain animations designed to illustrate important concepts in a more engaging style. For these reasons, we advise downloading the document locally and opening it with~{\bf Adobe~Acrobat Reader}. Other viewers were not tested and may not render the detailed views or animations correctly. For completeness, the overall structure of the paper is summarized in a \hyperlink{contents}{table of contents}.
\endminipage}
\noindent \vspace{0.25cm}
\section{Supervised machine learning from 20,000 feet...}
\label{sec:intro}
The general workflow of training supervised machine learning models follows a well-oiled iterative procedure which we illustrate in the context of {\bf convolutional neural networks} (CNNs) for image classification. This field has undergone such rapid development in the last few years that it is sometimes used as the official ``success story'' of deep learning. Indeed, supervised image classification (and related topics such as object detection, instance segmentation...) is nowadays considered a kind of commodity software that has made its way into countless industrial applications and, consequently, it is worthwhile to become familiar with how CNNs work under the hood. Broadly speaking, deep learning models have a highly modular structure where so-called ``layers'' play the role of elementary building blocks. Although different kinds of layers serve different purposes, it turns out that CNNs are constructed from very generic layers that also appear in a wide range of very diverse neural network architectures. This means that the results presented here are effortlessly portable and remain useful in vast and far-reaching applications of deep learning that go beyond CNNs and even supervised techniques. \\
\noindent As an introduction, let us go over the high-level description of the iterative loop of training a machine learning model. This procedure is illustrated graphically in~Fig.\ref{fig:iterativeProcedure} and the rest of this section is dedicated to a brief review of machine learning basics.
\newpage
\begin{enumerate}
\item The starting point consists in gathering a large set of ``training'' data along with their accompanying ``ground-truth'' labels:
\begin{itemize}
\item Each sample~$s$ of \colorbox{pink}{data can be expressed as an~$f$-dimensional feature vector}~${\bf a}^s \sim \mathbb{R}^f$. What those~$f$ features represent in the physical world depends on the modality (time-series, image, text...) of the data considered. Let's assume that we are dealing with a minibatch of~$n$ such samples simultaneously. In this case it is convenient to stack together all~$n$ training samples vertically into:
\begin{equation*}
{\bf A} = \left(
\begin{matrix}
{\bf a}^1 \sim \mathbb{R}^f \\
\vdots \\
{\bf a}^n \sim \mathbb{R}^f
\end{matrix}
\right) = \left(
\begin{matrix}
a^{1}_1 & \dots & a^1_f \\
\vdots & \vdots & \vdots \\
a^n_1 & \dots & a^n_f
\end{matrix}
\right) \sim \mathbb{R}^{n \times f}
\label{stackedMatrix}
\end{equation*}
As mentioned above, we will concern ourselves with the task of image classification. This means that each training sample~${\bf a}^s$ is actually a color image of height~$h$, width~$w$ and depth~$d$ (number of channels, $d=3$ for RGB for example). As such, feature vectors can be represented as a~3d~structure~$f \equiv \mathbb{R}^{d \times h \times w}$. For the sake of simplicity, we will only consider square images and denote by~$r \equiv h \equiv w$ the spatial resolution of the image. Note that the dimensionality of~$f$ grows quadratically with the spatial resolution of the images meaning that, even for modest sizes~$\sim \num[group-separator={,}]{1000}$ pixels, we quickly arrive at very high dimensional input data~$f \sim 10^6$ features (pixel values). Stacking together all the images present in a minibatch, the raw input data~${\bf A}_0 \sim \mathbb{R}^{n\times d\times r\times r}$ therefore starts as a (1+3)d~=~4d array whose shape will evolve (in depth as well as in spatial resolution) as it flows deeper into the network as shown in table~\ref{table:networkExample} and discussed more in detail in point~2 below.
\item In addition, each sample~$s$ is also associated with its \colorbox{pink}{ground-truth categorical label}~${\bf y}^s_\text{gt}$. Denoting by~$n_c$ the number of possible classes,~${\bf y}^s_\text{gt}$ is generally represented as a ``One Hot Encoded'' (OHE) vector~${\bf y}^s_\text{gt} \sim \mathbb{R}^{n_c}$. Stacking the~$n$ ground-truth vectors all together, we represent the labels via the following structure:
\begin{equation*}
{\bf Y}_\text{gt} = \left(
\begin{matrix}
{\bf y}^1_\text{gt} \sim \mathbb{R}^{n_c} \\
\vdots \\
{\bf y}^n_\text{gt} \sim \mathbb{R}^{n_c}
\end{matrix}
\right) = \left(
\begin{matrix}
y^1_1 & \dots & y^1_{n_c} \\
\vdots & \vdots & \vdots \\
y^n_1 & \dots & y^n_{n_c}
\end{matrix}
\right)_\text{gt}
\sim \mathbb{R}^{n \times n_c}
\label{stackedMatrix}
\end{equation*}
\noindent If sample~$s$ actually belongs to the ground-truth class~$c^s_\text{gt}$,~OHE representation means that there is only a single element~${\bf y}^s_\text{gt} (c^s_\text{gt})=1$ which is non-zero and all others are identically null~${\bf y}^s_\text{gt} (c \neq c^s_\text{gt}) = 0$ as illustrated in the top left corner of~Fig.\ref{fig:crossEntropy}.
\end{itemize}
The minibatch size~$n$, i.e. number of training samples that we stack together for simultaneous processing, should be considered as a hyper-parameter. Selecting the optimal~$n$ remains an active area of research and we come back to it in point~4 when we discuss the learning algorithm. In the following, training data for a specific minibatch~b refers to the pair~$\mathcal{D}_\text{b} = ({\bf A}_0 , {\bf Y}_\text{gt})_\text{b}$. Assuming that the entire training dataset is divided into~$N$ minibatches, we can represent it as a list~$\mathcal{D}_\text{training} = \lbrack \mathcal{D}_1 , \cdots , \mathcal{D}_N \rbrack $. As we will see shortly, it is implicitly assumed that the training points are independent and identically distributed (i.i.d.) according to an unknown distribution.
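
To make these data structures concrete, here is a minimal NumPy sketch (purely illustrative and not part of the network itself) that builds a random minibatch~${\bf A}_0$ together with its one-hot encoded labels~${\bf Y}_\text{gt}$; the sizes $d=1$, $r=28$ and $n_c=10$ anticipate the MNIST setting used later, while the minibatch size $n=32$ is an arbitrary choice:
\begin{verbatim}
import numpy as np

n, d, r, n_c = 32, 1, 28, 10           # minibatch size, depth, resolution, classes

A0 = np.random.rand(n, d, r, r)        # raw input data A_0 of shape (n, d, r, r)

labels = np.random.randint(0, n_c, size=n)   # ground-truth class c_gt^s of each sample
Y_gt = np.zeros((n, n_c))                    # one-hot encoded labels Y_gt of shape (n, n_c)
Y_gt[np.arange(n), labels] = 1.0             # a single 1 per row, zeros everywhere else

assert (Y_gt.sum(axis=1) == 1).all()         # each row is a valid OHE vector
\end{verbatim}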
\begin{figure}
\centering
\begin{tikzpicture}
\begin{feynman}
\vertex (a1) {\( {\bf A}_0 \)};
\vertex[below=0.5cm of a1] (param) {\( \mathbfcal{P} \phantom{\hspace{0.11cm}} \) };
\vertex at ($(a1)!0.5!(param) - (-0.2cm, 0cm)$) (fStart);
\vertex[right=0.3cm of fStart] (ff);
\vertex[right=4.5cm of fStart] (a2) { \( {\bf Y}_\text{pred} \equiv \mathcal{N}_{ {\mathbfcal{P} }}({\bf A}_0) \) } ;
\vertex[below=1.5cm of a1] (b1) {\( {\bf Y}_\text{gt} \) };
\vertex at ($(a1)!0.5!(b1) - (-10cm, 0cm)$) (j) { \( \mathcal{L}_\text{batch} ( \mathbfcal{P}) \) };
\vertex[below=3cm of j] (backStart) { \( \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \) };
\vertex at ($(a1)!0.9!(backStart) - (-1cm, 0cm)$) (fMid);
\vertex[left=6.2cm of backStart] (bJoin) { \( \mathbfcal{P} \leftarrow \mathbfcal{P} - \lambda \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \) } ;
\vertex[above=0.5cm of bJoin] (bGhost) { } ;
\vertex[left=4.2cm of backStart] (bs) { } ;
\vertex[below=3.35cm of a1] (backghost);
\vertex[right=6.2cm of backghost] (backPass) {\( \mathcolorbox{light-blue}{\text{backward pass}} \)};
\diagram* {
(ff) -- [charged boson, edge label=\( \mathcolorbox{orange}{\text{forward pass}} \)] (a2) -- [fermion] (j),
(b1) -- [fermion] (j),
(j) -- [gluon] (backStart),
(backStart) -- [fermion] (bs),
(ff) -- [ghost] (fMid),
(bGhost) -- [charged scalar] (param)
};
\draw [decoration={brace}, decorate] (bJoin.north west) -- (bJoin.north east) ;
\draw [decoration={brace}, decorate] (a1.north east) -- (param.south east) ;
\begin{scope}[on background layer]
\filldraw[draw=black, fill=green-yellow] (current bounding box.north west) rectangle
($(current bounding box.north east)!0.6!(current bounding box.south east)$);
\filldraw[draw=black, fill=shadecolor] (current bounding box.south west) rectangle
($(current bounding box.north east)!0.6!(current bounding box.south east)$);
\fill[fill=gray] ([yshift=1mm]$(current bounding box.north west)!0.6!(current bounding box.south
west)$) rectangle
([yshift=-1mm]$(current bounding box.north east)!0.6!(current bounding box.south east)$);
\end{scope}
\end{feynman}
\end{tikzpicture}
\caption{High level cartoon of the iterative training loop. The model~$\mathcal{N}_\mathbfcal{P}$ is a probabilistic function parametrized by~$\mathbfcal{P}$ whose purpose is to take in the raw data~${\bf A}_0$ and return a probability distribution~${\bf Y}_\text{pred}$ over a set of~$n_c$ classes during the ``forward pass''. Combining~${\bf Y}_\text{pred}$ with the ground-truth~${\bf Y}_\text{gt}$ leads to a scalar~$\mathcal{L}_\text{batch} (\mathbfcal{P}) > 0 $, known as the loss function, that quantifies the level of mismatch between prediction and ground-truth. The objective of training is to find a better set of parameters~$\mathbfcal{P}$ to minimize the value of the loss over the training data. As discussed in the text, this can be achieved by calculating the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ of the loss during the ``backward pass''. Calculating this gradient in the context of convolutional neural networks is the focus of this article. Once~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ is known, the parameters are updated proportionately to the learning rate~$\lambda$. Cycles of forward/backward passes are run iteratively over minibatches of labeled data~$\mathcal{D}_\text{training}$ until one is satisfied with the overall performance of the model.}
\label{fig:iterativeProcedure}
\end{figure}
\item Define a \colorbox{pink}{model architecture} for the neural network~$\mathcal{N}_{\mathbfcal{P}}$ by arranging together a number~$n_\ell$ of layers. Formally, the model should be thought of as a \colorbox{pink}{parametrized function~$\mathcal{N}_{\mathbfcal{P}}$ that takes the} \colorbox{pink}{input data~${\bf A}_0$ and returns a probability distribution~${\bf Y}_\text{pred}$ over~$n_c$ classes:}
\begin{equation}
{\bf Y}_\text{pred} = \mathcal{N}_{\mathbfcal{P}} \left( {\bf A}_0 \right) \sim \mathbb{R}^{n\times n_c}
\label{eq:SignatureOfNetwork}
\end{equation}
Evaluating~$\mathcal{N}_{\mathbfcal{P}}$ given some input data~${\bf A}_0$ is referred to as the ``forward pass'' and the resulting~${\bf Y}_\text{pred}$ defines the probabilistic prediction of the model. Each training sample~$s$ gets a prediction vector~${\bf y}^s_\text{pred} \sim \mathbb{R}^{n_c}$ indicating how ``confident'' the network is that this sample belongs to any one of the~$n_c$ possible classes as illustrated in the top right corner of~Fig.\ref{fig:crossEntropy}. The final layer of the network ensures proper normalization of the probability distributions so that~$\sum_{c=1}^{n_c} (y^s_c)_\text{pred} = 1$ independently for all~$n$ samples (see section~\ref{sec:softmaxLayer}). Denoting by~$n_p$ the collective number of parameters contained in the trainable layers of the network, we have~${\mathbfcal{P}} \sim \mathbb{R}^{n_p}$.
\vspace{1cm}
In this article, we will consider the following layers:
\begin{itemize}
\item {\bf non-trainable}: non-linear activation~(\ref{sec:activation}), max-pool~(\ref{sec:maxpool}) \& flatten~(\ref{sec:flatten})
\item {\bf trainable}: fully connected~(\ref{sec:fc}), convolution~(\ref{sec:conv}) \& batch normalization~(\ref{sec:batchNorm})
\end{itemize}
Inspired by a simple and historically important CNN, we consider a modified version of the famous LeNet-5 model that incorporates a few more modern components (ReLU activation, batch normalization, skip connections...). The architecture of our example network is fully specified in table~\ref{table:networkExample}. Its backbone is made up of~$n_\ell = 16$ layers comprising~$n_p =$~\num[group-separator={,}]{44878} parameters listed in table~\ref{table:networkParams}. Because the architecture does not have loops, it falls under the category of feedforward neural networks which are usually implemented as directed acyclic graphs (DAGs) by deep learning frameworks. Keeping with historical references, we use the MNIST (Modified National Institute of Standards and Technology) dataset as the labeled data~$\mathcal{D}_\text{training}$. This dataset consists of~$n_c=10$ classes of handwritten digits~(0-9) in the form of~\num[group-separator={,}]{70000} grayscale images (depth~$d=1$ and spatial resolution~$r = 28$). A selection of~\num[group-separator={,}]{60000} samples is assigned to the training set while the remaining~\num[group-separator={,}]{10000} constitute the test set. With a minibatch of size~$n$, the input data is therefore represented as~${\bf A}_0 \sim \mathbb{R}^{n \times 1 \times 28 \times 28}$. The ``shape'' column of table~\ref{table:networkExample} shows how the data is sequentially transformed from pixel space~${\bf A}_0$ layer by layer all the way down to ``embedding'' space~${\bf A} \equiv {\bf A}_{n_\ell = 16}$. The network starts with a series of alternating ``convolution~\textemdash~activation~\textemdash~batch normalization~\textemdash~maxpool'' layer blocks whose effect is to reduce the spatial resolution of the data while, at the same time, increasing its depth. At some point the~3d structure of the samples (independent space and depth dimensions of images) is flattened into~1d feature vectors, transforming~${\bf A}_8 \sim \mathbb{R}^{n\times 16\times 4\times 4}$ into a 2d array~${\bf A}_9 \sim \mathbb{R}^{n\times 256}$ which is fed into another series of alternating ``fully connected~\textemdash~activation~\textemdash~batch normalization'' layer blocks. Note that space and depth information are no longer relevant as interpretable features as soon as data is processed by fully connected layers because of the global connectivity patterns they introduce. The final representation, the so-called ``embedding'' denoted by~${\bf A}$, is eventually fed into a softmax layer (section~\ref{sec:softmaxLayer}) in order to produce a normalized probability distribution~${\bf Y}_\text{pred} \sim \mathbb{R}^{n\times n_c}$. \\
\noindent The ``backward pass'' corresponds to an equivalent propagation of error terms~$\Delta$'s back up through the layers of~$\mathcal{N}_{\mathbfcal{P}}$ (see section~\ref{backPropSection}). As can be gleaned from table~\ref{table:networkExample}, data and error arrays always share the same dimensionality~${\bf A}_i \sim \Delta_i$ for all layers.
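
As a minimal illustration of this shape bookkeeping, the NumPy sketch below reproduces only the flatten step described above (all surrounding layers are omitted); the array sizes match the shapes quoted in the text, with an arbitrary minibatch size $n=32$:
\begin{verbatim}
import numpy as np

n = 32                              # arbitrary minibatch size
A8 = np.random.rand(n, 16, 4, 4)    # output of the last conv/activation/BN/maxpool block
A9 = A8.reshape(n, -1)              # flatten: 3d samples become 1d feature vectors
print(A9.shape)                     # (32, 256), ready for the fully connected layers
\end{verbatim}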
\begin{figure}
\centering
\includegraphics[width=0.9\linewidth]{pptx/crossEntropy/Slide1.png}
\caption{Illustration of the cross-entropy loss function~${\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt}, {\bf Y}_\text{pred} \right)$ defining the amount of ``mismatch'' between the one-hot encoded ground-truth~${\bf Y}_\text{gt}$ and the output probability distribution~${\bf Y}_\text{pred}$ as defined in~eq.(\ref{eq:crossEntropy}). For clarity, we show only the values of the~${\bf y}_\text{pred}^s(c_\text{gt}^s)$ components for all samples~$s$ of~${\bf Y}_\text{pred}$ since they are the only relevant ones as far as the cross-entropy calculation is concerned. (Numerical values are shared with~Fig.~\ref{fig:softMax}).}
\label{fig:crossEntropy}
\end{figure}
\vspace{2cm}
\item Define a loss function that measures the \mathcolorbox{pink}{\text{amount of disagreement between the predicted}~{\bf Y}_\text{pred}} \mathcolorbox{pink}{\text{and the ground-truth}~{\bf Y}_\text{gt}}. For classification tasks, it is usual to use the cross-entropy between the predicted probability distribution and the ground-truth distribution:
\begin{equation*}
\mathcal{L}_\text{batch} ({\mathbfcal{P}}) = - {\bf Y}_\text{gt} \cdot \log {\bf Y}_\text{pred} \sim \mathbb{R}
\end{equation*}
\noindent where the explicit~$\mathbfcal{P}$-dependence of the loss comes from its dependence on~${\bf Y}_\text{pred} = \mathcal{N}_{\mathbfcal{P}} \left( {\bf A}_0 \right)$ and,~recursively, on all the preceding layers of the neural network~\footnote{Obviously~$\mathcal{L}_\text{batch}$ also depends on the network architecture~$\mathcal{N}$ in addition to~$\mathcal{P}$ and the training data~$({\bf A}_0, {\bf Y}_\text{gt})$ (see also \hyperlink{programmerNote}{side note}). However, as this dependence is usually non-differentiable, we restrict ourselves to static architectures and consider the loss as a function of the parameters and the training data only. We refer the reader to~\cite{optimizeNet} for recent work that formulates architecture search as a gradient-based optimization problem (see point~4) using differentiable losses with respect to~$\mathcal{N}$ as an alternative to conventional approaches that use evolution techniques or reinforcement learning over a discrete and non-differentiable search space~\cite{googleAIblog}.}.
In order to gain some insight into how the cross-entropy loss emerges as the natural quantity, let us consider a single training sample~$s$ with input data and supervised label pair~$\left( {\bf a}_0^s , \, {\bf y}_\text{gt}^s \right)$. Passing this input as an argument to the neural network function~$\mathcal{N}_{\mathbfcal{P}}$ produces a probability distribution vector~${\bf y}^s_\text{pred} = \mathcal{N}_{\mathbfcal{P}} \left( {\bf a}_0^s \right) \sim \mathbb{R}^{n_c}$. Denoting by~$c^s_\text{gt}$ the ground-truth class to which this sample belongs means that all components of~${\bf y}^s_\text{gt} \sim \mathbb{R}^{n_c}$ are identically~0 except for~${\bf y}^s_\text{gt} (c_\text{gt}^s) \equiv 1$. Because of this~OHE representation of~${\bf y}_\text{gt}^s$, its dot-product with~${\bf y}_\text{pred}^s$ produces a single value
\begin{equation*}
{\bf y}_\text{gt}^s \cdot {\bf y}_\text{pred}^s = {\bf y}_\text{pred}^s (c_\text{gt}^s ) \sim \mathbb{R}
\end{equation*}
which represents the probability/{\bf likelihood} assigned by~$\mathcal{N}_{\mathbfcal{P}}$ to the actual ground-truth class. Accordingly, a good prediction consists in having a likelihood~$0 < {\bf y}^s_\text{pred} (c_\text{gt}^s) \lessapprox 1$ as high as possible in order to mirror~${\bf y}^s_\text{gt}$. Under the assumption that the~$n$ training samples are i.i.d. (as discussed in point~1), the likelihood over the entire minibatch~$\mathtt{L}_\text{batch}$ can be written as a product over the individual likelihoods. The training objective is then formulated as an optimization problem over the parameters~$\mathbfcal{P} \sim \mathbb{R}^{n_p}$ to maximize the minibatch likelihood:
\begin{equation*}
\argmax_\mathbfcal{P} \, \mathtt{L}_\text{batch} \,\, ; \,\, \text{with} \,\,\, \mathtt{L}_\text{batch} = \prod_{s=1}^n {\bf y}_\text{pred}^s (c_\text{gt}^s ) \sim \mathbb{R}
\end{equation*}
Taking the logarithm of the likelihood turns the product into a sum over individual training samples without changing the nature of the optimization objective. Since the~$\log$ function is strictly monotonic, maximizing the likelihood is equivalent to minimizing the negative log-likelihood:
\begin{eqnarray*}
\argmax_\mathbfcal{P} \, \mathtt{L}_\text{batch} &\Longleftrightarrow& \argmin_\mathbfcal{P} \left( - \log \mathtt{L}_\text{batch} \right) \\
&=& \argmin_\mathbfcal{P} \left( - \sum_{s=1}^n \log {\bf y}_\text{pred}^s (c_\text{gt}^s ) \right) \\
&\equiv& \argmin_\mathbfcal{P} \sum_{s=1}^n {\bf \ell}_\mathbfcal{P} \left( {\bf y}^s_\text{gt}, {\bf y}^s_\text{pred} \right)
\end{eqnarray*}
\newcommand\tempboxPic{%
\begin{minipage}{0.5\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{tikzpicture}[scale=1]
\begin{axis}[
xlabel={${\bf y}_\text{pred}^s(c_\text{gt}^s)$},
, samples=500, grid, thick
, domain=-0.1:1
, xtick={0, 0.25, 0.5, 0.75, 1},
xticklabels={$0$,$1/4$,$1/2$,$3/4$,$1$}
, title={${\bf \ell}_\mathbfcal{P} \left( {\bf y}^s_\text{gt}, {\bf y}^s_\text{pred} \right)$ \phantom{zzzzzzzzzzzzzzzzzzzzzz} }
]
\addplot+[no marks, orange, line width=2pt] {-ln(x)};
\end{axis}
\end{tikzpicture}
\end{minipage}}
Thanks to this convenient formulation as a sum over the minibatch, we can identify the amount of mismatch due to an individual training sample~$s$ as:
\begin{align*}
{\bf \ell}_\mathbfcal{P} \left( {\bf y}^s_\text{gt}, {\bf y}^s_\text{pred} \right) &\equiv -\log \, {\bf y}^s_\text{pred} (c_\text{gt}^s ) \sim \mathbb{R} \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{equivalent to cross-entropy between the distributions ${\bf y}_\text{gt}^s$ and ${\bf y}_\text{pred}^s$} \\
&\centerwithin{}{=} \colorbox{light-blue}{(using the OHE representation of ${\bf y}_\text{gt}^s$)} \\
&\equiv - {\bf y}_\text{gt}^s \cdot \log {\bf y}_\text{pred}^s \sim \mathbb{R} \\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxPic}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}} \, \colorbox{yellow}{illustrative plot}
\end{align*}
This shows that maximizing the likelihood is equivalent to minimizing the cross-entropy between the ground-truth distribution and the probability distribution vector predicted by the neural network. As can be seen in the illustrative plot, the cross-entropy metric ensures a monotonically decreasing cost from high values when~${\bf y}^s_\text{pred} (c_\text{gt}^s) \ll 1$ (i.e. small likelihood assigned by~$\mathcal{N}_{\mathbfcal{P}}$ to the ground-truth class: bad prediction) down to small cost values as the prediction for the ground-truth class approaches~1, namely ${\bf y}^s_\text{pred} (c_\text{gt}^s) \lessapprox 1$ (i.e. high likelihood assigned by~$\mathcal{N}_{\mathbfcal{P}}$ to the ground-truth class: good prediction).
\noindent Going back to the general case of minibatches of training samples, we can dispatch~${\bf \ell}_\mathbfcal{P}$ to all~$n$~samples and express the cross-entropy loss as a vectorized operation:
\begin{equation}
{\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt}, {\bf Y}_\text{pred} \right) \equiv \left(
\begin{matrix}
{\bf \ell}_\mathbfcal{P} \left( {\bf y}^1_\text{gt}, {\bf y}^1_\text{pred} \right)\sim \mathbb{R} \\
\vdots \\
{\bf \ell}_\mathbfcal{P} \left( {\bf y}^n_\text{gt}, {\bf y}^n_\text{pred} \right) \sim \mathbb{R}
\end{matrix}
\right) = - {\bf Y}_\text{gt} \ominus \log {\bf Y}_\text{pred} \sim \mathbb{R}^{n}
\label{eq:crossEntropy}
\end{equation}
where each component corresponds to the loss due to individual samples as illustrated in~Fig.\ref{fig:crossEntropy}. Using~eq.(\ref{rowWise::Froebenius}) to sum up this loss vector demonstrates that the total loss is indeed given by the cross-entropy between the predicted probability distribution~${\bf Y}_\text{pred}$ and the ground truth distribution~${\bf Y}_\text{gt}$ as stated at the beginning of this section:
\begin{equation}
\mathcal{L}_\text{batch} ({\mathbfcal{P}}) = \sum_\text{samples} \ell_{\mathbfcal{P}} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) = - {\bf Y}_\text{gt} \cdot \log {\bf Y}_\text{pred} \label{eq:batchLoss} \sim \mathbb{R}
\end{equation}
\noindent In summary, we have shown that the training objective can be formulated as a search for a set of parameters~$\mathbfcal{P} \sim \mathbb{R}^{n_p}$ that minimize the cross-entropy between~${\bf Y}_\text{pred}$ and~${\bf Y}_\text{gt}$:
\begin{equation*}
\argmin_\mathbfcal{P} \mathcal{L}_\text{batch} ({\mathbfcal{P}})
\end{equation*}
\noindent Minimizing the training error has the effect of maximizing the similarity between ground-truth and model prediction distributions, i.e. the likelihood that the model is able to produce the correct labels on the training dataset.
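
The following toy NumPy sketch (with made-up numbers, unrelated to the figures) evaluates eqs.(\ref{eq:crossEntropy}) and (\ref{eq:batchLoss}) on a minibatch of $n=2$ samples and $n_c=3$ classes; thanks to the OHE structure of~${\bf Y}_\text{gt}$, each per-sample loss reduces to~$-\log {\bf y}^s_\text{pred}(c^s_\text{gt})$:
\begin{verbatim}
import numpy as np

Y_gt = np.array([[1., 0., 0.],          # toy minibatch: n = 2 samples, n_c = 3 classes
                 [0., 0., 1.]])
Y_pred = np.array([[0.7, 0.2, 0.1],     # each row is a normalized probability distribution
                   [0.1, 0.3, 0.6]])

ell = -np.sum(Y_gt * np.log(Y_pred), axis=1)   # per-sample cross-entropy losses
L_batch = ell.sum()                            # scalar minibatch loss

print(ell)       # [-log 0.7, -log 0.6] ~ [0.357, 0.511]
print(L_batch)   # ~ 0.868
\end{verbatim}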
\newpage
\item The purpose of the \colorbox{pink}{learning algorithm} is to provide a concrete strategy to solve the optimization objective discussed in the previous point. It is known that the inductive biases introduced by different optimization algorithms~\cite{inductiveBias} and proper initializations~\cite{weightInit,weightInit2} play a crucial role in the quality of learning and in the generalization ability of the learned models. We focus on the celebrated gradient descent algorithm which, despite its age~\cite{cauchy}, remains the ``workhorse'' of deep learning. \\
\colorbox{pink}{Gradient descent} is an intuitive procedure where the parameters~$\mathbfcal{P}$ are iteratively corrected by a small vector~$\delta \mathbfcal{P} \ll \mathbfcal{P}$ carefully chosen so as to reduce the loss~$\mathcal{L}_\text{batch} (\mathbfcal{P} + \delta \mathbfcal{P} ) < \mathcal{L}_\text{batch} (\mathbfcal{P})$. Obviously, a decrease in the loss implies that~${\bf Y}_\text{pred}$ gradually becomes a little more accurate description of~${\bf Y}_\text{gt}$ (see point~3). The minimization is implemented by repeating this update many times over different batches of training data~$\mathcal{D}_\text{training}$. How to find~$\delta \mathbfcal{P}$? Since the parameter update is assumed to be small, we can perform a Taylor expansion of the loss function:
\begin{equation*}
\mathcal{L}_\text{batch} ( \mathbfcal{P} + \delta \mathbfcal{P} ) \approx \mathcal{L}_\text{batch} ( \mathbfcal{P} ) + \delta \mathbfcal{P}^t \cdot \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} + \frac{1}{2} \, \delta \mathbfcal{P}^t \, \nabla^2_\mathbfcal{P} \mathcal{L}_\text{batch} \, \delta \mathbfcal{P} + \cdots
\end{equation*}
Restricting ourselves to~$1^\text{st}$-order effects only, the change in loss values is determined by the dot-product~$\delta \mathbfcal{P}^t \cdot \nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$. Clearly, this term depends on the angle between the parameter update~$\delta \mathbfcal{P} \sim \mathbb{R}^{n_p}$ and the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \sim \mathbb{R}^{n_p}$. It reaches its maximum when both vectors are aligned with each other showing that the gradient~\footnote{\label{noteGrad} The gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ of a function~$\mathcal{L}_\text{batch} (\mathbfcal{P}) : \mathbb{R}^{n_p} \rightarrow \mathbb{R}$ is defined as the vector of partial derivatives with respect to each of its parameters:~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} = \big( \partial \mathcal{L}_\text{batch} / \partial \mathbfcal{P}_1, \cdots, \partial \mathcal{L}_\text{batch} / \partial \mathbfcal{P}_{n_\ell} \big)$. Trainable layers typically have more than a single parameter and, denoting by~$n_p$ the total number of parameters contained in the~$n_\ell$ layers, we have therefore~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \sim \mathbfcal{P} \sim \mathbb{R}^{n_p}$.} can be interpreted as the direction of steepest ascent. In other words,~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ is a vector whose direction in~$\mathbb{R}^{n_p}$ space is the one along which~$\mathcal{L}_\text{batch}$ grows the fastest. Given that our goal is to decrease the loss value, the optimal parameter update consists in choosing a vector~$\delta \mathbfcal{P} \sim \mathbb{R}^{n_p}$ that lies in the opposite direction of the gradient, i.e. the direction of steepest descent. Now that the direction of~$\delta \mathbfcal{P}_\text{steepest descent}$ is known, we can fix its magnitude by introducing a~``learning rate''~$0 < \lambda \ll 1$ such that parameters are updated according to:
\begin{eqnarray}
\mathbfcal{P} &\Longleftarrow& \mathbfcal{P} + \delta \mathbfcal{P}_\text{steepest descent}
\label{eq:steepestUpdateA} \\
\delta \mathbfcal{P}_\text{steepest descent} &\equiv& - \lambda \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} = -\lambda \left(
\begin{matrix}
\partial \mathcal{L}_\text{batch} / \partial \mathbfcal{P}_1 \\
\vdots \\
\partial \mathcal{L}_\text{batch} / \partial \mathbfcal{P}_{n_p}
\end{matrix}
\right) \sim \mathbb{R}^{n_p}
\label{eq:steepestUpdateB}
\end{eqnarray}
Obviously, this approach requires the \mathcolorbox{pink}{\text{explicit calculation of the gradient}~\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}} and, for implementation reasons that will be introduced in section~\ref{backPropSection} and be the focus of the rest of the article, the evaluation of~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ is referred to as the ``backward pass''. \\
In our case, the input data consists of minibatches representing only a subset of the entire training dataset. As a result, the gradient is calculated based on a limited number~$n$ of samples for each update and the learning algorithm is typically referred to as ``stochastic gradient descent'' (SGD) to reflect the noise introduced by this finite-size estimation of the gradient. (This is in contrast with ``batch'' gradient descent that uses the entire available dataset.) Choosing the minibatch size~$n$ remains a delicate issue which is entangled with the learning rate~$\lambda$ and its evolution during training. It is customarily believed~\cite{YannLeCun_tweet,graphCoreAI} that smaller values of~$n$ lead to better generalization performance: an effect attributed to the randomness in minibatch sampling inducing an ``exploratory'' behavior of SGD dynamics. In fact, one can show that the covariance of the minibatch noise is related to the Hessian~$\nabla^2_\mathbfcal{P} \mathcal{L}_\text{batch}$ of the loss function~\cite{limitCycles} hinting at a connection between noise and higher-order effects. Overall, noise appears to play a crucial role by implicitly providing a form of regularization that may help escape saddle points and facilitate training. In contrast, a number of studies advocate larger batch sizes in order to reduce the number of parameter updates and allow the use of distributed training without having to sacrifice performance~\cite{largeBatch1,largeBatch2,largeBatch3}. Obviously, the geometry and training dynamics of loss landscapes remain topics of intense research; fruitful connections have appeared with models and tools coming from statistical physics~\cite{lossSurface}.
Deep learning frameworks offer a whole zoo of gradient-based optimization strategies that decorate the ``canonical'' SGD presented here~\cite{ruderGradient,adaptiveMethods}. Nevertheless, all these methods share one common characteristic which is the necessity to evaluate the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$. Complexity-wise,~$1^\text{st}$-order algorithms are efficient since~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \sim \mathbb{R}^{n_p}$ grows linearly with the total number of parameters in the network. Intuitively, one may think that~$2^\text{nd}$-order methods involving the Hessian would improve the search by utilizing information about the curvature of the loss landscape. Unfortunately, the quadratic growth of the Hessian~$\nabla^2_\mathbfcal{P} \mathcal{L}_\text{batch} \sim \mathbb{R}^{n_p \times n_p}$ coupled with the need for matrix inversion render such approaches prohibitively expensive for large networks. Even higher-order methods suffer from increasingly worse scaling and, additionally, cannot exploit well-established linear algebra tools. In practice, the overwhelming majority of neural networks are trained using simple gradient descent methods based on~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$.
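
As a sketch, the parameter update of eqs.(\ref{eq:steepestUpdateA},\ref{eq:steepestUpdateB}) amounts to a couple of lines of NumPy; here \texttt{grad\_fn} is only a placeholder for the backpropagation function derived in the rest of this article, assumed to return a gradient with the same shape as the parameters:
\begin{verbatim}
import numpy as np

def sgd_step(params, grad_fn, batch, lr=0.01):
    """One plain SGD update: P <- P - lambda * grad_P L_batch."""
    grad = grad_fn(params, batch)     # backward pass: same shape as params
    return params - lr * grad

# toy usage with the quadratic loss L(P) = 0.5*||P||^2, whose gradient is P itself
params = np.array([1.0, -2.0, 3.0])
params = sgd_step(params, lambda p, b: p, batch=None, lr=0.1)
print(params)                         # [ 0.9 -1.8  2.7]
\end{verbatim}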
\item After the parameters have been updated for a minibatch~$\mathcal{D}_\text{b}$, we can advance to the next minibatch~$\mathcal{D}_\text{b+1} \in \mathcal{D}_\text{training}$, make a forward pass through the network to get the loss and do another parameter update via the backward pass. This \mathcolorbox{pink}{\text{loop over minibatches and update of parameters}} may be repeated until the loss function has decreased enough and the training data is well approximated by the neural network function, i.e.~${\bf Y}_\text{gt} \approx {\bf Y}_\text{pred} = \mathcal{N}_\mathbfcal{P} ({\bf A}_0) \, , \, \forall \, \mathcal{D} \in \mathcal{D}_\text{training}$.
One run over the complete~$\mathcal{D}_\text{training}$ is called an ``epoch'' and this iterative procedure is summarized in a concise high-level~\hyperlink{programmerNote}{pseudo-code} as well as in the cartoon of~Fig.~\ref{fig:iterativeProcedure}. Given enough iterations a ``good'' learning algorithm may lead to a vanishingly small loss~$\mathcal{L} (\mathbfcal{P}_\text{overfit}) \approx 0$ meaning that the function~$\mathcal{N}_{\mathbfcal{P}_\text{overfit}}$ is capable of perfectly reproducing all the training samples. This is a signal of overfitting and implies that~$\mathcal{N}_{\mathbfcal{P}_\text{overfit}}$ has little chance of generalizing to new input data. This is why, in addition to the training loss, it is common practice to also monitor the accuracy on an independent held-out testing set in order to be able to stop training as soon as the testing performance no longer increases; a technique known as early stopping. There exists a large number of ``regularization'' strategies that aim to address this trade-off between optimization on the training data and generalization to previously unseen datasets. In addition to traditional penalties based on the norm of the parameters such as sparsity inducing~L1 (Lasso regression) and~L2 regularizations (ridge regression also known as weight decay), other approaches like drop-out~\cite{dropOut} (a prominent candidate), dataset augmentation techniques, label smoothing, bagging... are frequently used by deep learning practitioners. Nevertheless, controlling the accuracy of a model remains a critical topic of research and training networks that yield state-of-the-art performance still involves many tricks (one may look at~\cite{ng,deepLearningBookGoodfellow} for an introduction).
\end{enumerate}
\paragraph{Closing words on supervised learning from~20,000 feet...} Before moving on to the core of this article which is the description of backpropagation, let us finish this introduction by making a few general observations. First of all, although the number of parameters may not be the best way to quantify model complexity~\cite{intrinsicDim,oneParam}, let us note that the example CNN discussed in this article (tables~\ref{table:networkExample} and~\ref{table:networkParams}) is under-parametrized: the number~$n_p=$~\num[group-separator={,}]{44878} of parameters is smaller than the~\num[group-separator={,}]{60000} samples available for training. This situation is somewhat unusual for modern state-of-the-art deep learning architectures that typically gravitate towards a heavy over-parametrization of the models. For example, the famous AlexNet which propelled deep learning into the spotlight in~2012 after winning the ImageNet ILSVRC challenge (by an unprecedented margin) contained about~60~million parameters trained on a dataset of ``only''~$1.2$~million images~\cite{imageNet}. Empirical evidence suggests that models with larger capacity are surprisingly resistant to overfitting and continue to improve their generalization error~(see~\cite{outrageousLarge} for an extreme example) even when trained without explicit regularization~\cite{overParam}. \\
\noindent Putting things in perspective, one may be tempted to think of the task of training the neural network function~$\mathcal{N}_\mathbfcal{P}$ as a simple interpolation problem on the dataset~$\mathcal{D}_\text{training}$. However, the very high-dimensional nature of the input data raises major difficulties. Indeed, it is well known that with increasing dimensionality all data points become equally far away from each other and the notion of nearest neighbors may no longer be relevant~\cite{distanceRatioHighD}; a consequence of the ``curse of dimensionality''. Dense sampling of a unit hypercube of dimension~$d \sim 10^6$ (lower range of real-world data, see point~1 for images) into a grid of (very poor) resolution~$\varepsilon \sim 0.1$ would require an unreachable and absolutely absurd number~$(1/\varepsilon)^d \sim 10^{\num[group-separator={,}]{1000000}}$ of samples. Even if, because of some intrinsic structural constraints, real-world data happens to lie in the vicinity of lower dimensional manifolds~\cite{dimReduction}, training samples are forever condemned to be immensely isolated from each other effectively ruling out na\"ive interpolation.
\fcolorbox{black}{lightgray}{%
\minipage[t]{\dimexpr0.9\linewidth-2\fboxsep-2\fboxrule\relax}
{\bf Iterative training: \hypertarget{programmerNote}{pseudo-code}} \\
Let us mention that the whole training procedure can be summarized in a surprisingly short high-level program mirroring the cartoon of Fig.\ref{fig:iterativeProcedure}. The first step consists in choosing a neural network architecture~$\mathcal{N}_\mathcal{P}$, i.e. a parametrized function that, given some input data, returns a probability distribution~$\bf{Y}_\text{pred}$ over a set of~$n_c$ predefined classes. This prediction is then compared to the ground-truth~${\bf Y}_\text{gt}$ and the quality of the fit is measured via:
\begin{equation*}
\mathcal{L}_\text{batch} \left( {\bf Y}_\text{gt}, {\bf Y}_\text{pred} = \mathcal{N}_\mathcal{P} ({\bf A}_0) \right) \equiv \mathcal{L}_\text{batch} \left( \mathcal{N}, \mathcal{P}, \mathcal{D}_\text{batch} \right) \sim \mathbb{R}
\end{equation*}
Conceptually, the loss~$\mathcal{L}_\text{batch}$ can be expressed as a scalar function of the architecture~$\mathcal{N}$, the current value of the parameters~$\mathcal{P} \sim \mathbb{R}^{n_p}$ and the training data~$\mathcal{D}_\text{batch} = \left( {\bf A}_0, {\bf Y}_\text{gt}\right)_\text{batch}$. \\
\noindent {\it (In our case~$\mathcal{N}$ is defined as the functional composition of the layers specified in table~\ref{table:networkExample}, parametrized by~$\mathcal{P}$ described in table~\ref{table:networkParams} and trained on~$\mathcal{D}_\text{training} = \text{MNIST}$ with the cross-entropy loss defined in~eq.(\ref{eq:batchLoss}))}. \\
As explained in point~4, it is necessary to calculate the gradient of~$\mathcal{L}_\text{batch}$ with respect to~$\mathcal{P}$ in order to perform an SGD update. Restricting ourselves to static network architectures, the gradient can be formally returned by a vector-valued ``backpropagation'' function~$\mathcal{B}$ implicitly parametrized by~$\mathcal{N}$:
\begin{equation*}
\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} = \mathcal{B}_\mathcal{N} \left( \mathbfcal{P} , \mathcal{D}_\text{batch} \right) \sim \mathbb{R}^{n_p}
\end{equation*}
In practice, deep learning frameworks expose rich high-level APIs based on automatic \mbox{differentiation}~\cite{pearlmutter} to efficiently implement~$\mathcal{B}_\mathcal{N}$ and support complex control flow such as branches/loops as part of the emerging concept of~``differentiable programming''~\cite{RAD,differentiableProgramming}. In this article we will calculate the gradient of each layer analytically and define~$\mathcal{B}_\mathcal{N}$ as the composition of all the layer-level gradient functions. Once~$\mathcal{B}_\mathcal{N}$ is known, training proceeds by~i)~evaluating the backpropagation function iteratively over the list of supervised observations~$\mathcal{D}_\text{training} = \lbrack \mathcal{D}_1, \cdots , \mathcal{D}_N \rbrack$ and~ii)~updating the parameters~$\mathbfcal{P}$ each time according to~eqs.(\ref{eq:steepestUpdateA},\ref{eq:steepestUpdateB}). This can be translated into a single line of code by defining a training function:
\begin{equation*}
\mathbfcal{P}_\text{trained} \left( \mathcal{N} \right) = \texttt{foldl} \,\, \Big(\backslash \mathbfcal{P}, \mathcal{D} \rightarrow \mathbfcal{P} - \lambda \mathcal{B}_\mathcal{N} \left( \mathbfcal{P} , \mathcal{D} \right) \Big) \,\, \mathbfcal{P}_\text{initial} \,\, \mathcal{D}_\text{training} \sim \mathbb{R}^{n_p}
\end{equation*}
that returns a set of trained parameters~$\mathbfcal{P}_\text{trained}$ for any given architecture~$\mathcal{N}$. Here,~\texttt{foldl} stands for the left fold function (Haskell syntax; available under different names in other programming languages) that takes~3 arguments: an updating function, an initialized accumulator~$\mathbfcal{P}_\text{initial}$ and a list~$\mathcal{D}_\text{training}$ upon which to evaluate the function.
\endminipage}
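
\noindent For readers less familiar with Haskell, the same left fold can be sketched in Python with \texttt{functools.reduce}; in the snippet below, \texttt{backprop\_fn} is a placeholder for~$\mathcal{B}_\mathcal{N}$ and the parameters are assumed to live in a NumPy array so that the update~$\mathbfcal{P} - \lambda \mathcal{B}_\mathcal{N}(\mathbfcal{P}, \mathcal{D})$ is well defined:
\begin{verbatim}
from functools import reduce

def train(P_initial, D_training, backprop_fn, lr):
    """Left fold of the SGD update over the list of minibatches D_training."""
    return reduce(lambda P, D: P - lr * backprop_fn(P, D), D_training, P_initial)
\end{verbatim}
\noindent Just as with \texttt{foldl}, the accumulator (the parameters) is threaded through the list of minibatches, one SGD update per element.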
\vspace{1cm}
\begin{figure}
\centering
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=\linewidth]{pptx/manifoldLaurent/ReLU_inputData.png}
\end{subfigure}%
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=\linewidth]{pptx/manifoldLaurent/ReLU_decisionBoundaries_Final.png}
\end{subfigure}
\caption{Concrete visualization of the ``manifold hypothesis'' using a synthetic spiral dataset composed of~7 interleaved classes represented by different colors; see~\cite{laurentSpiral} for all details including animations of the training dynamics using different activation functions. {\bf Left)}~Raw input data~$\equiv {\bf A}_0 \sim \mathbb{R}^{n\times 2}$. {\bf Right)}~Data representation at the level of the ``embedding''~$\equiv {\bf A} \sim \mathbb{R}^{n\times 2}$. Thanks to the~2d geometry of both~${\bf A}$/${\bf A}_0$, one can see how~$\mathcal{N}_\mathcal{P}$ has learned to disentangle the training samples so that the classes can be separated by linear boundaries.}
\label{fig::spiralNet}
\end{figure}
\noindent Instead, deep learning is usually presented as a data cascade across multiple levels of abstraction starting with raw unstructured high-dimensional input and finishing with lower dimensional representations called ``embeddings''. In the case of~CNNs, the first few layers are suspected to encode simple features such as edges and colors. Deeper layers pick up the task by trying to detect higher-level motifs that start to resemble more familiar objects. Finally, this hierarchical cascade is believed to produce abstract concepts that are easier to separate into distinct classes~\cite{deconv}. This scenario can be joined with a complementary interpretation going under the name of ``manifold hypothesis''~\cite{manifoldHypothesis}. There, the idea is that the learning process should be seen as a progressive linearization of the topology of the input data that starts as complicated interwoven class manifolds and culminates in embeddings that are separable by simple hyperplanes~\cite{chrisTopo,laurentSpiral} as illustrated in Fig.\ref{fig::spiralNet}. Giving further support to the importance of these learned representations, it is known that algebraic operations on embeddings can combine high-level concepts into semantically meaningful relationships~\cite{DCGAN,mikolov}. Somewhat orthogonally, compression of input data into efficient representations during the training dynamics can also be approached from the point of view of information theory~\cite{tishby}.
\newpage
\noindent This hierarchical and compositional view of deep learning is not specific to CNNs and images but, rather, may be a natural consequence of the generative process which created the data in the first place. Indeed, physical world data is usually modeled in terms of recursive compositions of fundamental building blocks into more and more complex systems. In this case, neural networks can be seen as an attempt to ``reverse-engineer'' this decomposition of the generative process into a hierarchy of simpler steps~\cite{cheapLearning}. As such, it is probable that successful deep learning architectures implicitly provide very strong priors on the types of statistical patterns that can be discovered from data~\cite{explainMethod}. Despite empirical evidence that ``deeper is better'', the actual role played by depth and its impact on the expressiveness of approximating functions remains a topic of active theoretical research~\cite{nadav,poggio}. \\
\noindent Although this narrative is very widespread and influential~\cite{natureDeep}, it is worth emphasizing that there is still no consensus regarding what it is that the current incarnations of neural networks really do learn; a lot of work is going into understanding and visualizing the predictions of deep learning models~\cite{distill,explainLIME}. Among many known drawbacks, let us mention that neural networks can effortlessly fit data with random labels~\cite{generalize}, are alarmingly fooled by minute and imperceptible adversarial perturbations~\cite{intriguing}, lack a sense of geometric context~\cite{capsule}, learn superficial details not related to semantically relevant concepts~\cite{fftSuperficial}, are brittle against benign distributional shift~\cite{cifar10Generalize}... Another obvious limitation stems from the implicit static closed-world environment assumption: models can only ever predict classes that belong to the training set. They are forced into making wrong predictions (with uncontrollable confidence) if presented with new objects~\cite{openSet} or with nonsensical input~\cite{rubbishImages}. The practicality (or lack thereof) of deep neural networks in the wild, the role of prior knowledge~\cite{garyMarcus}, whether modern architectures are anything more than (somewhat fast and loose~\cite{fastLoose}) exceptional curve-fitting machines that bear no resemblance whatsoever to human perception let alone reasoning~\cite{pearl}... and many more issues continue~\footnote{(as evidenced by the famous, time-tested, quote {\it ``The question of whether machines can think is about as relevant as the question of whether submarines can swim''}~\cite{Dijkstra}, these discussions have a long history)} to be nebulous and hotly debated topics. Nonetheless, deep learning has undeniably turned out to be very good at discovering intricate structures in high-dimensional data and continues to establish new state-of-the-art performance in many academic fields beyond computer vision. The last few years have also shown how neural networks can have a meaningful impact in a wide variety of commercial products. Nevertheless, a healthy dose of skepticism suggests that there is still some way to go before they can fully be deployed as trusted components of critical and ever-changing real-world applications. \\
\noindent With this long introduction out of the way, let us now move on to the main purpose of this article which is a pedagogical presentation of backpropagation in the context of CNNs.
\begin{table}
\vspace{-3em}%
\hspace*{-2.66cm}
\captionsetup{singlelinecheck=off}
\begin{tabular}{|| l | y | x | z ||}
\hline
{\bf Layer} & \cellcolor{orange}{\bf Forward pass} & {\bf Shape} & \cellcolor{light-blue}{\bf Backward pass} \\
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
Input data & ${\bf A}_0$ & $\mathbb{R}^{n\times 1\times 28\times 28}$ & \cellcolor{Gray} $n$ grayscale ($d=1$) square images of $28\times28$ resolution \\
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
& ${\bf A}_1 = {\bf w}_0^{\mathfrak{p}_0} \star {\bf A}_0 + \widetilde{{\bf b}}_0$ & & \\
\multirow{-2}*{Convolution} & $\mathfrak{p}_0= \big( k_0=5 , s_0=1 , p_0=0 \big) $ & \multirow{-2}*{$ $} & \multirow{-2}*{$\Delta_1 = \Delta_2 \circ g^\prime \left( {\bf A}_1 \right) $} \\
& & & $\Delta_{2} = f_{4d}^t \bigg[ \bigg( 576 n f_{2d}^t ( \Delta_{3} ) \widetilde{{\bf w}_{2}} - \widetilde{ \sum_s f_{2d}^t(\Delta_{3}) \widetilde{{\bf w}_{2}}}$ \\
\multirow{-2}*{Activation} & \multirow{-2}*{${\bf A}_{2} = g({\bf A}_{1})$} & \multirow{-2}*{$\mathbb{R}^{n \times 6 \times 24 \times 24 }$} & $\hspace{2.2cm} - \overline{f_{2d}^t({\bf A}_{2})} \circ \widetilde{ \sum_s \overline{f_{2d}^t({\bf A}_{2})} \circ f_{2d}^t(\Delta_{3}) \widetilde{{\bf w}_{2}} } \bigg) / 576n\widetilde{\sigma_{2}} \bigg] $ \\[0.2em]
Batch normalization & ${\bf A}_{3} = f_{4d}^t \Big( \overline{f_{2d}^t({\bf A}_{2})} \, \widetilde{{\bf w}_{2}} + \widetilde{{\bf b}_{2}} \Big) $ & $ $ & $\Delta_3 = \widetilde{\Delta_4} \circ g_{\mathfrak{p}}^\prime ({\bf A}_3) $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
& ${\bf A}_4 = \text{maxpool}_\mathfrak{p} \, {\bf A}_3 $ & & $\Delta_4 = \overset{\curvearrowleft \, t_{\text{d}} \, \mathfrak{p}_4^\prime }{{\bf w}_{4\,\,\,\,\,\,\,\,}} \star \Delta_5$ \\
\multirow{-2}*{Maxpool} & $\mathfrak{p} =\big( k=2 \, , s=2 \, , p=0 \big)$ & \multirow{-2}*{$\mathbb{R}^{n \times 6 \times 12 \times 12 }$} & $\mathfrak{p}_4^\prime =\big( k^\prime_4= k_4 = 5 \, , \, s^\prime_4=1/s_4 =1 \, , \, p^\prime_4= k_4 - p_4 - 1 = 4 \, \big)$ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
& ${\bf A}_5 = {\bf A}_4 \star {\bf w}_4^{\mathfrak{p}_4} + \widetilde{{\bf b}}_4 $ & & \\
\multirow{-2}*{Convolution} & $\mathfrak{p}_4= \big( k_4=5 , s_4=1 , p_4=0 \big) $ & \multirow{-2}*{$ $} & \multirow{-2}*{$\Delta_5 = \Delta_6 \circ g^\prime \left( {\bf A}_5 \right)$} \\
& & & $\Delta_{6} = f_{4d}^t \bigg[ \bigg( 64 n f_{2d}^t ( \Delta_{7} ) \widetilde{{\bf w}_{6}} - \widetilde{ \sum_s f_{2d}^t(\Delta_{7}) \widetilde{{\bf w}_{6}}}$ \\
\multirow{-2}*{Activation} & \multirow{-2}*{${\bf A}_{6} = g({\bf A}_{5})$} & \multirow{-2}*{$\mathbb{R}^{n \times 16\times 8\times 8}$} & $\hspace{2.35cm} - \overline{f_{2d}^t({\bf A}_{6})} \circ \widetilde{ \sum_s \overline{f_{2d}^t({\bf A}_{6})} \circ f_{2d}^t(\Delta_{7}) \widetilde{{\bf w}_{6}} } \bigg) / 64n\widetilde{\sigma_{6}} \bigg] $ \\[0.2em]
Batch normalization & ${\bf A}_{7} = f_{4d}^t \Big( \overline{f_{2d}^t({\bf A}_{6})} \, \widetilde{{\bf w}_{6}} + \widetilde{{\bf b}_{6}} \Big) $ & $ $ & $\Delta_7 = \widetilde{\Delta_8} \circ g_{\mathfrak{p}}^\prime ({\bf A}_7) $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
& ${\bf A}_8 = \text{maxpool}_\mathfrak{p} \, {\bf A}_7 $ & & \\
\multirow{-2}*{Maxpool} & $\mathfrak{p}= \big( k=2 , s=2 , p=0 \big) $ & \multirow{-2}*{$ \mathbb{R}^{n \times 16 \times 4 \times 4 } $} & \multirow{-2}*{$\Delta_8 = \text{fold} \, \Delta_9 $} \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Flatten & ${\bf A}_9 = \text{flatten} \, {\bf A}_8 $ & $\mathbb{R}^{n \times 256} $ & $\Delta_9 = \Delta_{10} {\bf w}_9^t $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Fully connected & ${\bf A}_{10} = {\bf A}_9 {\bf w}_9 + \widetilde{{\bf b}}_9 $ & $ $ & $\Delta_{10} = \Delta_{11} \circ g^\prime \left( {\bf A}_{10} \right)$ \\[0.2em]
Activation & ${\bf A}_{11} = g({\bf A}_ {10})$ & $ \mathbb{R}^{n \times 120} $ & $\Delta_{11} = \frac{1}{n\widetilde{\sigma_{11}} \vphantom{3^{3^{3}}} } \Big( n \Delta_{12} \widetilde{{\bf w}_{11}} - \widetilde{ \sum_s \Delta_{12} \widetilde{{\bf w}_{11}} } - \overline{\bf A}_{11} \circ \widetilde{ \sum_s \overline{\bf A}_{11} \circ \Delta_{12} \widetilde{{\bf w}_{11}} } \Big)$ \\[0.4em]
Batch normalization & ${\bf A}_{12} = \overline{\bf A}_{11} \widetilde{{\bf w}_{11}} + \widetilde{{\bf b}_{11}} $ & $ $ & $\Delta_{12} = \Delta_{13} {\bf w}_{12}^t $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Fully connected & ${\bf A}_{13} = {\bf A}_{12} {\bf w}_{12} + \widetilde{{\bf b}}_{12} $ & $ $ & $\Delta_{13} = \Delta_{14} \circ g^\prime \left( {\bf A}_{13} \right) $ \\[0.2em]
Activation & ${\bf A}_{14} = g({\bf A}_{13})$ & $ \mathbb{R}^{n \times 84} $ & $\Delta_{14} = \frac{1}{n\widetilde{\sigma_{14}} \vphantom{3^{3^{3}}} } \Big( n \Delta_{15} \widetilde{{\bf w}_{14}} - \widetilde{ \sum_s \Delta_{15} \widetilde{{\bf w}_{14}} } - \overline{\bf A}_{14} \circ \widetilde{ \sum_s \overline{\bf A}_{14} \circ \Delta_{15} \widetilde{{\bf w}_{14}} } \Big)$ \\[0.4em]
Batch normalization & ${\bf A}_{15} = \overline{\bf A}_{14} \widetilde{{\bf w}_{14}} + \widetilde{{\bf b}_{14}} $ & $ $ & $\Delta_{15} = \Delta_{16} {\bf w}_{15}^t $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Fully connected & ${\bf A} \equiv {\bf A}_{16} = {\bf A}_{15} {\bf w}_{15} + \widetilde{{\bf b}}_{15} $ & $ \mathbb{R}^{n \times 10} $ & $\Delta_{16} = {\bf Y}_\text{pred} - {\bf Y}_\text{gt}$ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Softmax & ${\bf Y}_\text{pred} = \text{softmax} \, {\bf A} $ & $\mathbb{R}^{n \times 10}$ & \cellcolor{Gray} probability distribution over $n_c=10$ classes for all images \\[0.3em]
\hline
\end{tabular}
\caption[capNet]{%
Illustration of a typical~Convolutional Neural Network~(CNN) architecture inspired by the historical LeNet-5. Notice how layers are grouped into repeated ``convolution/fully connected~\textemdash~activation~\textemdash~batch normalization'' patterns. Shortcut connections, another component of modern architectures such as ResNet, are \hyperlink{skipNote}{explained in detail} even though they are absent from this example network. As usual in CNNs, the shape of the data representations during the {\bf forward pass} (to be read from \colorbox{green-yellow}{\bf top to bottom}) starts by becoming ``fatter'' (deeper and spatially smaller) early in the network before being flattened into a ``thin'' feature vector whose dimension is gradually decreased to eventually match the desired number of classes~$n_c=10$ for classification; the final layer before the softmax, sometimes referred to as the ``embedding'', is denoted as~${\bf A} \equiv {\bf A}_{16}$. This is in contrast to the {\bf backward pass} (to be read from \colorbox{shadecolor}{\bf bottom to top}) where the error arrays, corresponding to the gradient of the loss function with respect to the data, follow the exact opposite dimensionality changes. As can be gleaned from the expressions above and \hyperlink{errGeneralFunction}{demonstrated} in the main text, error backpropagation may be seen as a general function~$\mathcolorbox{pink}{\Delta_{i-1} \, \big( \Delta_i , {\bf A}_{i-1} , \mathbfcal{P}_{i-1} \big) }$ of the upstream error~$\Delta_i$, original data array~${\bf A}_{i-1}$ from the forward pass and parameters~$\mathbfcal{P}_{i-1}$; layer-specific implementations of the downstream error terms~$\Delta_{i-1}$ can be found in the relevant sections. \\
{\it (Note that the dimensionality of convolutional kernels and of arrays that are wrapped by geometrical reshape operations, such as~$f_{2d}$ and~$f_{4d}$, designed to handle minibatches of image data are provided explicitly in the caption of table~\ref{table:networkParams}. More details about the architecture and the dataflow are provided in point~2 of section~\ref{sec:intro}.)}}
\label{table:networkExample}
\end{table}
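\noindent As emphasized in the caption of table~\ref{table:networkExample}, every layer contributes a forward map together with a backward map of the generic form~$\Delta_{i-1} \big( \Delta_i , {\bf A}_{i-1} , \mathbfcal{P}_{i-1} \big)$. A minimal Haskell sketch of this per-layer interface could look as follows (the \texttt{Tensor} type and the record field names are hypothetical and only meant to expose the signatures):
\begin{verbatim}
-- Naive stand-in for a dense multi-dimensional array type.
type Tensor = [[Double]]

data Layer = Layer
  { forwardPass  :: Tensor -> Tensor -> Tensor
      -- forwardPass  params aPrev         computes A_i from P_{i-1} and A_{i-1}
  , backwardPass :: Tensor -> Tensor -> Tensor -> (Tensor, Tensor)
      -- backwardPass params aPrev deltaUp returns (Delta_{i-1}, dL/dP_{i-1})
  }
\end{verbatim}
The signature of~\texttt{backwardPass} makes explicit that the backward pass reuses the cached activation~${\bf A}_{i-1}$ produced during the forward pass.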
\begin{table}
\vspace{-4.8em}%
\hspace*{-0.5cm}
\captionsetup{singlelinecheck=off}
\begin{tabular}{|| l | y : y | x : x | z ||}
\hline
\multicolumn{3}{|c|}{\cellcolor{orange}{{\bf Parameters}}} & \multicolumn{2}{c|}{\cellcolor{Gray}{{\bf Dimensionality}}} & \cellcolor{light-blue}{\bf Loss derivative} \\
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Fully connected} & & ${\bf w}_{15}$ & $\mathbb{R}^{84\times 10}$ & 840 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{15} = {\bf A}_{15}^t \Delta_{16}$ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{15}$} & ${\bf b}_{15}$ & $\mathbb{R}^{10} $ & 10 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{15} = \sum_s \Delta_{16}$ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Batch normalization} & & ${\bf w}_{14}$ & $\mathbb{R}^{84}$ & 84 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{14} = \text{diag} \big( \overline{\bf A}_{14}^t \Delta_{15} \big) $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{14}$} & ${\bf b}_{14}$ & $\mathbb{R}^{84} $ & 84 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{14} = \sum_s \Delta_{15} $ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Fully connected} & & ${\bf w}_{12}$ & $\mathbb{R}^{120\times 84} $ & 10,080 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{12} = {\bf A}_{12}^t \Delta_{13}$ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{12}$} & ${\bf b}_{12}$ & $ \mathbb{R}^{84}$ & 84 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{12} = \sum_s \Delta_{13}$ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Batch normalization} & & ${\bf w}_{11}$ & $\mathbb{R}^{120}$ & 120 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{11} = \text{diag} \big( \overline{\bf A}_{11}^t \Delta_{12} \big) $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{11}$} & ${\bf b}_{11}$ & $\mathbb{R}^{120} $ & 120 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{11} = \sum_s \Delta_{12} $ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Fully connected} & & ${\bf w}_9$ & $ \mathbb{R}^{256\times 120} $ & 30,720 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_9 = {\bf A}_9^t \Delta_{10} $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_9$} & ${\bf b}_9$ & $\mathbb{R}^{120}$ & 120 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_9 = \sum_s \Delta_{10}$ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Batch normalization} & & ${\bf w}_{6}$ & $\mathbb{R}^{16}$ & 16 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{6} = \text{diag} \big( \overline{f_{2d}^t({\bf A}_6)}^{\,t} f_{2d}^t(\Delta_{7}) \big) $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{6}$} & ${\bf b}_{6}$ & $\mathbb{R}^{16} $ & 16 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{6} = \sum_{s,r} \Delta_7 $ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Convolution} & & ${\bf w}_4$ & $\mathbb{R}^{16\times 6\times 5\times 5}$ & $16 \times 150 =$ 2,400 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_4 = \text{roll} \left[ f_{2d} \left( \Delta_5 \right) \phi \left( {\bf A}_4 \right)^t \right] $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_4$} & ${\bf b}_4$ & $\mathbb{R}^{16}$ & 16 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_4 = \sum_{s,r} \Delta_{5}$ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Batch normalization} & & ${\bf w}_{2}$ & $\mathbb{R}^{6}$ & 6 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{2} = \text{diag} \big( \overline{f_{2d}^t({\bf A}_2)}^{\,t} f_{2d}^t(\Delta_{3}) \big) $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{2}$} & ${\bf b}_{2}$ & $\mathbb{R}^{6} $ & 6 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{2} = \sum_{s,r} \Delta_3 $ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Convolution} & & ${\bf w}_0$ & $\mathbb{R}^{6\times 1\times5 \times5}$ & $6\times 25 = $ 150 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_0 = \text{roll} \left[ f_{2d} \left( \Delta_1 \right) \phi \left( {\bf A}_0 \right)^t \right] $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_0$} & ${\bf b}_0$ & $\mathbb{R}^{6}$ & 6 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_0 = \sum_{s,r} \Delta_1$ \\[0.3em]
\hline
\hline
\multicolumn{6}{|c|}{\cellcolor{Gray}{Total number of parameters =~\num[group-separator={,}]{44878}}} \\
\hline
\end{tabular}
\caption[capParam]{%
Parameters are presented from top to bottom in the order in which they are updated during the backpropagation algorithm described in section~\ref{backPropSection}. Notice that all gradient components with respect to parameters~$\mathbfcal{P}_{i-1}$ share the same pattern regardless of the type of (linear) layer:
\begin{itemize}
\item[{\bf weights:}] $\mathcolorbox{pink}{\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{i-1} \sim {\bf A}_{i-1}^t \Delta_i}$ Matrix product between the transpose of the data array~${\bf A}_{i-1}$ and the upstream error~$\Delta_i$ (up to geometrical transformations such as~$f_{2d}$,~$f_{4d}$,~$\text{roll}$,~$\text{diag}$...). This shows that intermediate data arrays originating from the forward pass need to be cached in memory to be combined, at a later point, with error arrays during the backward pass; illustration in Fig.\ref{errorFlow}.
\item[{\bf biases:}] $\mathcolorbox{pink}{\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{i-1} \sim \sum \Delta_i}$ Tensor-contraction of the upstream error. In the case of error arrays associated with image data, the sum runs over the spatial dimensions (indicated by the~$r$ subscript) in addition to minibatch samples (indicated by the~$s$ subscript) in the summation symbol~$\sum_{s,r}$.
\end{itemize}
{\it (Details about layer-specific implementations of the \colorbox{shadecolor}{{\bf components of the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$}} are provided in the relevant sections of the main text.)}
\vspace{0.1cm}
\noindent\rule{\linewidth}{1.4pt}
\vspace{0.2cm}
For the sake of completeness, we report here the dimensionality of transformed arrays and of the convolutional kernels relevant both for table~\ref{table:networkExample} as well as for the gradient expressions above:
\begin{center}
\begin{tabular}{|g:g:g|}
\hline
\rule{0pt}{1.1\normalbaselineskip}
$f_{2d}^t({\bf A}_6) \sim \mathbb{R}^{(8\times 8\times n)\times 16}$ & $f_{2d}^t(\Delta_7) \sim \mathbb{R}^{(8\times 8\times n)\times 16} $ & $\text{diag} \left( \mathbb{R}^{16\times 16} \right) \sim \mathbb{R}^{16} $ \\[0.3em]
\hline
\rule{0pt}{1.1\normalbaselineskip}
$f_{2d}(\Delta_5) \sim \mathbb{R}^{16\times (8\times 8\times n)}$ & $\phi \left( {\bf A}_4 \right) \sim \mathbb{R}^{(6\times 5\times 5)\times (8\times 8\times n)}$ & $\text{roll} \left( \mathbb{R}^{16\times (6\times 5\times 5)} \right) \sim \mathbb{R}^{16\times 6\times 5\times 5}$ \\[0.3em]
\hline
\rule{0pt}{1.1\normalbaselineskip}
$f_{2d}^t({\bf A}_2) \sim \mathbb{R}^{(24\times 24\times n)\times 6}$ & $f_{2d}^t(\Delta_3) \sim \mathbb{R}^{(24\times 24\times n)\times 6} $ & $\text{diag} \left( \mathbb{R}^{6\times 6} \right) \sim \mathbb{R}^6 $ \\[0.3em]
\hline
\rule{0pt}{1.1\normalbaselineskip}
$f_{2d}(\Delta_1) \sim \mathbb{R}^{6\times (24\times 24\times n)}$ & $\phi \left( {\bf A}_0 \right) \sim \mathbb{R}^{(1\times 5\times 5)\times (24\times 24\times n)}$ & $\text{roll} \left( \mathbb{R}^{6\times (1\times 5\times 5)} \right) \sim \mathbb{R}^{6\times 1\times 5\times 5}$ \\[0.3em]
\hline \hline
\rule{0pt}{1.1\normalbaselineskip}
${\bf w}_0^{\mathfrak{p}_0} \sim \mathbb{R}^{6\times1\times k_0\times k_0} $ & ${\bf w}_4^{\mathfrak{p}_4} \sim \mathbb{R}^{16\times6\times k_4\times k_4}$ & $\overset{\curvearrowleft \, t_{\text{d}} \, \mathfrak{p}_4^\prime }{{\bf w}_{4\,\,\,\,\,\,\,\,}} \sim \mathbb{R}^{6\times 16\times k_4^\prime \times k_4^\prime} $ \\[0.3em]
\hline
\end{tabular}
\end{center}
The purely geometrical transformation from~${\bf w}_4^{\mathfrak{p}_4}$ to~$\overset{\curvearrowleft \, t_{\text{d}} \, \mathfrak{p}_4^\prime }{{\bf w}_{4\,\,\,\,\,\,\,\,}}$ is explained in a \hyperlink{convTransf}{dedicated paragraph} and illustrated in an animation of Fig.~\ref{fig:convForwardBackward}.}
\label{table:networkParams}
\end{table}
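\noindent Before diving into the derivations of the next section, let us make the shared pattern of table~\ref{table:networkParams} concrete with a minimal sketch of the parameter gradients of a fully connected layer, i.e.~$\partial \mathcal{L}_\text{batch} / \partial {\bf w} = {\bf A}^t \Delta$ and~$\partial \mathcal{L}_\text{batch} / \partial {\bf b} = \sum_s \Delta$, written with naive list-based matrices (the helper names are ours and chosen purely for illustration):
\begin{verbatim}
import Data.List (transpose, foldl')

type Matrix = [[Double]]   -- row-major: one row per minibatch sample

-- dL/dw = A^t Delta : contract the cached activations with the upstream
-- error over the minibatch dimension.
gradWeights :: Matrix -> Matrix -> Matrix
gradWeights a delta =
  [ [ sum (zipWith (*) aCol dCol) | dCol <- transpose delta ] | aCol <- transpose a ]

-- dL/db = sum_s Delta : column-wise sum of the upstream error over the samples.
gradBiases :: Matrix -> [Double]
gradBiases delta = foldl' (zipWith (+)) (map (const 0) (head delta)) delta
\end{verbatim}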
\newpage
\section{Gradient evaluation via backpropagation}
\label{backPropSection}
As discussed in the introduction, the key component behind the iterative training loop displayed in~Fig.~\ref{fig:iterativeProcedure} consists in being able to provide an explicit expression for the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ of the loss function with respect to the parameters~$\mathbfcal{P}$ of the neural network~$\mathcal{N}_\mathcal{P}$. This section offers an in-depth look at the fundamental mechanics of one such gradient-evaluation procedure, generally referred to as ``backpropagation''. For typical machine learning datasets, loss functions tend to have signatures of the following type:
\begin{equation*}
\mathcal{L}_\text{batch} (\mathbfcal{P}) : \,\, \mathbb{R}^{n_p} \rightarrow \mathbb{R} \quad \text{with} \quad n_p \gg 1
\end{equation*}
where a very high-dimensional parameter space~($n_p =$~\num[group-separator={,}]{44878} in our example network) is reduced to a scalar value. In this case, backpropagation is computationally efficient~\footnote{\label{backVsForward} To be compared with a straightforward computation of all partial derivatives of~$\mathcal{L}_\text{batch} (\mathbfcal{P})$ independently from each other, which requires~$\sim \mathcal{O}(n_p)$ evaluations of~$\mathcal{N}_\mathcal{P}$. Such ``forward mode'' implementations of differentiation are efficient only for functions~$\mathbb{R}^n \rightarrow \mathbb{R}^m$ where the dimensionality of the output space is larger than that of the input space, i.e.~$n < m$.} since the gradient can be evaluated by a \colorbox{pink}{single forward/backward cycle} through the neural network, i.e., roughly speaking, with a time complexity on the order of only~2 evaluations of~$\mathcal{N}_\mathcal{P}$ (at the expense of \hyperlink{memBack}{memory consumption}). This section presents the logic of backpropagation in a generic way applicable to any network architecture and layer type, as done in tables~\ref{table:networkExample} and~\ref{table:networkParams} for our example network.
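\noindent Schematically, this single forward/backward cycle amounts to one forward sweep that caches every intermediate activation, followed by one backward sweep over the layers in reverse order. The following self-contained Haskell sketch illustrates the idea (the \texttt{Layer} record and field names are again hypothetical, and parameter gradients are omitted for brevity):
\begin{verbatim}
import Data.List (scanl')

type Tensor = [[Double]]

-- A layer exposes a forward map A_{i-1} -> A_i and a backward map that turns
-- the upstream error Delta_i into Delta_{i-1} using the cached A_{i-1}.
data Layer = Layer { fwd :: Tensor -> Tensor, bwd :: Tensor -> Tensor -> Tensor }

-- Forward sweep: cache the whole list of activations [A_0, A_1, ..., A_{n_l}].
forwardAll :: [Layer] -> Tensor -> [Tensor]
forwardAll layers a0 = scanl' (flip fwd) a0 layers

-- Backward sweep: starting from the topmost error, walk the layers in reverse,
-- combining each upstream error with the cached activation of the layer below.
-- The result lists all error terms [Delta_0, ..., Delta_{n_l}].
backwardAll :: [Layer] -> [Tensor] -> Tensor -> [Tensor]
backwardAll layers activations topError =
  scanr (\(layer, aPrev) delta -> bwd layer aPrev delta) topError
        (zip layers (init activations))
\end{verbatim}
Both sweeps traverse the list of layers exactly once, which is the source of the~$\sim 2$ evaluations quoted above, at the cost of keeping the cached activations in memory.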
\paragraph{How to start? Bootstrap with loss function \& softmax} Let us start by recognizing that, by definition, the total derivative of~$\mathcal{L}_\text{batch}$ is given by:
\begin{eqnarray}
\text{d} \mathcal{L}_\text{batch} &=& \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \cdot \text{d} \mathbfcal{P} \nonumber \\
&\downarrow& \colorbox{light-blue}{gradient as a vector of partial derivatives for the~$n_\ell$ layers; see footnote~\ref{noteGrad}} \nonumber \\
\text{d} \mathcal{L}_\text{batch} &\equiv& \sum_{p=1}^{n_\ell} \frac{\partial \mathcal{L}_{\text{batch}}}{\partial \mathbfcal{P}_p} \cdot \text{d}\mathbfcal{P}_p \label{eq:gradientFormal}
\end{eqnarray}
\newcommand\tempboxBootStrap{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
\text{d} \mathcal{L}_\text{batch} &= \sum_\text{samples} \text{d} {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{using eq.~\eqref{totalVectorToScalar}} \\
&= \sum_\text{samples} \nabla {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \ominus \text{d} {\bf Y}_\text{pred} \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{using eq.~\eqref{rowWise::Froebenius}} \\
& = \nabla {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \cdot \text{d} {\bf Y}_\text{pred}
\end{align*}
\end{minipage}}
\noindent As we will discover in this section, all the components of~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ can be extracted by an intuitive pattern matching process that operates by recursively going backwards layer-by-layer through~$\mathcal{N}_\mathcal{P}$. \\
\noindent Accordingly, let us start with the definition of the cross-entropy loss function in~eq.(\ref{eq:batchLoss}) and begin evaluating~$\text{d} \mathcal{L}_\text{batch}$ at the output level of the neural network:
\begin{align}
\text{d} \mathcal{L}_\text{batch} &= \sum_\text{samples} \text{d} {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \phantom{zzzzzzzzzzzzzzzz} \nonumber \\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxBootStrap}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}}\\
& = \nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \cdot \, \text{d} {\bf Y}_\text{pred} \label{totalLossBatch}
\end{align}
\newcommand\tempboxGradLoss{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
\nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} ( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} ) &= \nabla_\mathbfcal{P} \left( - {\bf Y}_\text{gt} \ominus \log {\bf Y}_\text{pred} \right) \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{since, at this stage, all $\mathbfcal{P}$ dependence is contained in ${\bf Y}_\text{pred}$} \\
&= \nabla_{\bf{Y}_\text{pred}} \left( - {\bf Y}_\text{gt} \ominus \log {\bf Y}_\text{pred} \right) \\
&= \left(
\begin{matrix}
\nabla_{{\bf y}^1_\text{pred}} \sim \mathbb{R}^{n_c} \\
\vdots \\
\nabla_{{\bf y}^n_\text{pred}} \sim \mathbb{R}^{n_c}
\end{matrix} \right)
\left(
\begin{matrix}
- {\bf y}^1_\text{gt} \cdot \log {\bf y}^1_\text{pred} \sim \mathbb{R} \\
\vdots \\
- {\bf y}^n_\text{gt} \cdot \log {\bf y}^n_\text{pred} \sim \mathbb{R}
\end{matrix} \right) \sim \mathbb{R}^{n \times n_c} \\
&= \left(
\begin{matrix}
\left( \frac{\partial}{\partial y^1_1} , \cdots , \frac{\partial}{\partial y^1_{n_c}} \right)_\text{pred} \\
\vdots \\
\left( \frac{\partial}{\partial y^n_1} , \cdots , \frac{\partial}{\partial y^n_{n_c}} \right)_\text{pred}
\end{matrix} \right)
\left(
\begin{matrix}
- \sum_c ( y^1_c )_\text{gt} \log ( y^1_c )_\text{pred} \\
\vdots \\
- \sum_c ( y^n_c )_\text{gt} \log ( y^n_c )_\text{pred}
\end{matrix} \right) \\
&= - \left(
\begin{matrix}
(y^1_1)_\text{gt} / (y^1_1)_\text{pred} & \dots & (y^1_{n_c})_\text{gt} / (y^1_{n_c})_\text{pred} \\
\vdots &\vdots& \vdots \\
(y^n_1)_\text{gt} / (y^n_1)_\text{pred} & \dots & (y^n_{n_c})_\text{gt} / (y^n_{n_c})_\text{pred}
\end{matrix} \right) \\
&= - \left(
\begin{matrix}
y^1_1 & \dots & y^1_{n_c} \\
\vdots &\vdots& \vdots \\
y^n_1 & \dots & y^n_{n_c}
\end{matrix} \right)_\text{gt} \circ \left(
\begin{matrix}
1 / y^1_1 & \dots & 1 / y^1_{n_c} \\
\vdots &\vdots& \vdots \\
1 / y^n_1 & \dots & 1 / y^n_{n_c}
\end{matrix} \right)_\text{pred} \\
& = - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}}
\end{align*}
\end{minipage}}
\noindent The first thing to notice is that~$\text{d} \mathcal{L}_\text{batch}$ is expressed as a {\bf Frobenius product between an ``error''~\footnote{Besides the fact that this term is directly related to the loss function~${\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right)$, the origin of the naming convention as an ``error'' term will become evident \hyperlink{etymology}{later}.} term and the total derivative of a layer}. We will come back to this important observation below but, for now, let us push the calculation one step further. As specified in~table~\ref{table:networkExample} and explained in detail in section~\ref{sec:softmaxLayer}, the predicted probability distribution ${\bf Y}_\text{pred} = \text{softmax}\,{\bf A}$ is determined by applying the softmax function to the ``embedded'' representation~${\bf A}$. In general, this means that its total derivative~$\text{d}{\bf Y}_\text{pred}$ can be formulated in terms of~$\text{d}{\bf A}$ through the chain rule (its exact expression is provided in~eq.(\ref{eq:softMaxTotalDerivative}) as part of the relevant section dedicated to the softmax function). \\
\noindent In order to continue evaluating~$\text{d} \mathcal{L}_\text{batch}$, let us now consider the error term involving the cross-entropy loss function~$\ell_\mathbfcal{P}$ defined in~eq.(\ref{eq:crossEntropy}):
\begin{align*}
\nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} ( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} ) &= \nabla_\mathbfcal{P} \left( - {\bf Y}_\text{gt} \ominus \log {\bf Y}_\text{pred} \right) \phantom{zzzzzzzzzzzzzzzzzzzzzzzz} \\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxGradLoss}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}} \\
&= - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}}
\end{align*}
\newpage
\newcommand\tempboxGrad{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
\text{d} \mathcal{L}_\text{batch} &= \nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} ( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} ) \cdot \text{d}{\bf Y}_\text{pred} \\
&= \left( - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}} \right) \cdot \left( {\bf Y}_\text{pred} \circ \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \right) \\
&= \sum_{ij} \left( - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}} \right)^{ij} \cdot \left( {\bf Y}_\text{pred} \circ \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \right)_{ij} \\
&= - \sum_{ij} Y^{ij}_\text{gt} \, \frac{1}{Y^{ij}_\text{pred}} \, Y^{ij}_\text{pred} \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right)_{ij} \\
&= - \sum_{ij} Y^{ij}_\text{gt} \left( \text{d} {\bf A} - \widetilde{{\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right)_{ij} \\
& = - {\bf Y}_\text{gt} \cdot \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \\
&= {\bf Y}_\text{gt} \cdot \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } - {\bf Y}_\text{gt} \cdot \text{d}{\bf A}
\end{align*}
\end{minipage}}
\newcommand\tempboxIntermediate{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
{\bf Y}_\text{gt} \cdot \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } &= \sum_\text{samples} {\bf Y}_\text{gt} \ominus \left( \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \\
&= \sum_\text{samples} \left(
\begin{matrix}
y^1_1 & \dots & y^1_{n_c} \\
\vdots &\vdots& \vdots \\
y^n_1 & \dots & y^n_{n_c}
\end{matrix} \right)_\text{gt} \ominus \left(
\begin{matrix}
{\bf y}^1_\text{pred} \cdot \text{d} {\bf a}^1 & \dots & {\bf y}^1_\text{pred} \cdot \text{d} {\bf a}^1 \\
\vdots &\vdots& \vdots \\
{\bf y}^n_\text{pred} \cdot \text{d} {\bf a}^n & \dots & {\bf y}^n_\text{pred} \cdot \text{d} {\bf a}^n
\end{matrix} \right) \\
&= \sum_\text{samples} \left(
\begin{matrix}
\left(y^1_1 + \cdots + y^1_{n_c} \right)_\text{gt} {\bf y}^1_\text{pred} \cdot \text{d} {\bf a}^1 \\
\vdots \\
\left(y^n_1 + \cdots + y^n_{n_c} \right)_\text{gt} {\bf y}^n_\text{pred} \cdot \text{d} {\bf a}^n
\end{matrix} \right) \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{${\bf y}^s_\text{gt}$ disappears because of the OHE property $\sum_c (y^s_c)_\text{gt} = 1 $} \\
&= \sum_\text{samples} \left(
\begin{matrix}
{\bf y}^1_\text{pred} \cdot \text{d} {\bf a}^1 \\
\vdots \\
{\bf y}^n_\text{pred} \cdot \text{d} {\bf a}^n
\end{matrix} \right) = \sum_\text{samples} {\bf Y}_\text{pred} \ominus \text{d}{\bf A} \\
& = {\bf Y}_\text{pred} \cdot \text{d} {\bf A}
\end{align*}
\end{minipage}}
\noindent The next step consists in combining the expression above for~$\nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} ( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} )$ along with that of~$\text{d}{\bf Y}_\text{pred}$ derived in~eq.(\ref{eq:softMaxTotalDerivative}) and reproduced here as a reminder:
\begin{equation*}
\text{d}{\bf Y}_\text{pred} = {\bf Y}_\text{pred} \circ \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right)
\end{equation*}
together into the Frobenius product of~eq.(\ref{totalLossBatch}) in order to get:
\begin{align*}
\text{d} \mathcal{L}_\text{batch} &= \left( - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}} \right) \cdot \left( {\bf Y}_\text{pred} \circ \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \right)\\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxGrad}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}} \\
&= {\bf Y}_\text{gt} \cdot \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } - {\bf Y}_\text{gt} \cdot \text{d}{\bf A} \\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxIntermediate}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}} \\
&= \left( {\bf Y}_\text{pred} - {\bf Y}_\text{gt} \right) \cdot \text{d} {\bf A}
\end{align*}
Comparing the expression above for~$\text{d} \mathcal{L}_\text{batch}$ with that of~eq.(\ref{totalLossBatch}), we observe that the structure as a Frobenius product between an error term and the total derivative of a layer is preserved as we go from the level of the predicted probability distribution to that of the embedding layer. Namely:
\begin{eqnarray}
\text{d} \mathcal{L}_\text{batch} &=& \underbrace{\nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right)}_\text{upstream error} \cdot \, \text{d} \,\, \big( \underbrace{{\bf Y}_\text{pred}}_\text{current layer} \big) \nonumber \\
&\downarrow& \colorbox{light-blue}{our $1^\text{st}$ backward step through the network} \nonumber \\
\text{d} \mathcal{L}_\text{batch} &=& \overbrace{ \left( {\bf Y}_\text{pred} - {\bf Y}_\text{gt} \right) }^\text{downstream error} \cdot \, \text{d} \,\, \big( \overbrace{{\bf A}}^\text{previous layer} \big) \label{kickStartBack}
\end{eqnarray}
In other words, this first step in the evaluation of~$\text{d} \mathcal{L}_\text{batch}$ can be seen as going backwards through one layer of the neural network: we went from an expression involving~${\bf Y}_\text{pred}$ to a similar expression that now involves the preceding layer~${\bf A}$. In this process, the ``upstream'' error at the level of~${\bf Y}_\text{pred}$ has been modified into a new ``downstream'' expression at the level of~${\bf A}$. \\
\noindent \hypertarget{etymology}{At this point}, it is useful to make a connection with our example network by pattern matching~eq.(\ref{kickStartBack}) against the downstream error~$\Delta_i \equiv {\bf Y}_\text{pred} - {\bf Y}_\text{gt}$ and the embedding layer~${\bf A} \equiv {\bf A}_i$ with~$i=16$ inferred from table~\ref{table:networkExample}. As the difference between the predicted probability distribution and the ground-truth, the \colorbox{pink}{naming of~$\Delta_i$ as an ``error'' term is self-explanatory}. In summary, the backward pass starting at the level of the loss, through the softmax layer and back up to the embedding layer is given by:
\begin{empheq}[box={\backPropBox[{\bf cross-entropy \& softmax}: backward pass]}]{alignat=2}
\text{d} \mathcal{L}_\text{batch} &= \Delta_i \cdot \text{d}{\bf A}_i \label{bootStrapBackPropEmbed} \\
\Delta_i &= {\bf Y}_\text{pred} - {\bf Y}_\text{gt} \nonumber
\end{empheq}
More generally, $\Delta_i$ corresponds to the gradient of the loss function with respect to the data array~${\bf A}_i$. For consistency, {\bf we will continue to refer to the descendants of~$\Delta_i$ as generalized ``error'' terms}.
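\noindent As a quick numerical sanity check of~eq.(\ref{bootStrapBackPropEmbed}), the analytical error~$\Delta_i = {\bf Y}_\text{pred} - {\bf Y}_\text{gt}$ can be compared against a finite-difference estimate of~$\partial \mathcal{L} / \partial {\bf A}$ for a single sample (a throwaway sketch; the helper names are ours):
\begin{verbatim}
-- Cross-entropy of softmax(a) against a one-hot ground truth, for one sample.
softmax :: [Double] -> [Double]
softmax a = let e = map exp a in map (/ sum e) e

loss :: [Double] -> [Double] -> Double
loss yGt a = negate (sum (zipWith (*) yGt (map log (softmax a))))

-- Analytical error term derived above: Delta = y_pred - y_gt.
analytic :: [Double] -> [Double] -> [Double]
analytic yGt a = zipWith (-) (softmax a) yGt

-- Central finite-difference estimate of dL/da, one component at a time.
numeric :: [Double] -> [Double] -> [Double]
numeric yGt a =
  [ (loss yGt (bump i eps) - loss yGt (bump i (negate eps))) / (2 * eps)
  | i <- [0 .. length a - 1] ]
  where
    eps = 1.0e-6
    bump i h = [ if j == i then x + h else x | (j, x) <- zip [0 ..] a ]

main :: IO ()
main = do
  let a   = [1.2, -0.7, 0.3, 2.1]   -- arbitrary embedding row
      yGt = [0, 0, 1, 0]            -- one-hot ground truth
  print (analytic yGt a)
  print (numeric  yGt a)            -- agrees with the analytical result to ~1e-6
\end{verbatim}
Such a comparison is an easy way to catch sign or indexing mistakes when implementing the backward pass of a new layer.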
\newcommand\tempboxRecursion{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
\text{d} \mathcal{L}_\text{batch} &= \Delta_i \cdot \text{d} {\bf A}_i \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{formal expansion of the total derivative $\text{d} {\bf A}_i$} \\