\documentclass{article}
\usepackage[cmyk]{xcolor}
% start
\usepackage{hyperref}
\setcounter{tocdepth}{4}
\makeatletter
\DeclareRobustCommand*{\Myhref}[1][]{% definition copied from hyperref.sty
\begingroup
\setkeys{Hyp}{#1}% Changed href to Hyp
\@ifnextchar\bgroup\Hy@href{\hyper@normalise\href@}%
}
\makeatother
\makeatletter
\let\Hy@setpdfborderOrig\Hy@setpdfborder
\def\Hy@setpdfborder{\ocgbase@insert@oc\Hy@setpdfborderOrig}%
\makeatother
\usepackage{xparse,ocgbase,calc,tikzpagenodes,linegoal}
\usetikzlibrary{calc}
\ExplSyntaxOn
\let\tpPdfLink\pbs_pdflink:nn
\let\tpPdfAnnot\pbs_pdfannot:nnnn\let\tpPdfLastAnn\pbs_pdflastann:
\let\tpAppendToFields\pbs_appendtofields:n
\def\tpPdfXform{\pbs_pdfxform:nnnnn{1}{1}{}{}}
\let\tpPdfLastXform\pbs_pdflastxform:
\let\cListSet\clist_set:Nn\let\cListItem\clist_item:Nn
\ExplSyntaxOff
\makeatletter
\NewDocumentCommand{\tooltip}{%
ssssO{\ifdefined\@linkcolor\@linkcolor\else blue\fi}mO{gray!20}mO{0pt,0pt}%
}{{%
\leavevmode%
\IfBooleanT{#2}{%
%for variants with two and more stars, put tip box on a PDF Layer (OCG)
\ocgbase@new@ocg{tipOCG.\thetcnt}{%
/Print<</PrintState/OFF>>/Export<</ExportState/OFF>>%
}{false}%
\xdef\tpTipOcg{\ocgbase@last@ocg}%
%prevent simultaneous visibility of multiple non-draggable tooltips
\ocgbase@add@ocg@to@radiobtn@grp{tool@tips}{\ocgbase@last@ocg}%
}%
\tpPdfLink{%
\IfBooleanTF{#4}{%
/Subtype/Link/Border[0 0 0]/A <</S/SetOCGState/State [/Toggle \tpTipOcg]>>
}{%
/Subtype/Screen%
/AA<<%
\IfBooleanTF{#3}{%
/E<</S/SetOCGState/State [/Toggle \tpTipOcg]>>%
}{%
\IfBooleanTF{#2}{%
/E<</S/SetOCGState/State [/ON \tpTipOcg]>>%
/X<</S/SetOCGState/State [/OFF \tpTipOcg]>>%
}{
\IfBooleanTF{#1}{%
/E<</S/JavaScript/JS(%
var fd=this.getField('tip.\thetcnt');%
if(typeof(click\thetcnt)=='undefined'){%
var click\thetcnt=false;%
var fdor\thetcnt=fd.rect;var dragging\thetcnt=false;%
}%
if(fd.display==display.hidden){%
fd.delay=true;fd.display=display.visible;fd.delay=false;%
}else{%
if(!click\thetcnt&&!dragging\thetcnt){fd.display=display.hidden;}%
if(!dragging\thetcnt){click\thetcnt=false;}%
}%
this.dirty=false;%
)>>%
}{%
/E<</S/JavaScript/JS(%
var fd=this.getField('tip.\thetcnt');%
if(typeof(click\thetcnt)=='undefined'){%
var click\thetcnt=false;%
var fdor\thetcnt=fd.rect;var dragging\thetcnt=false;%
}%
if(fd.display==display.hidden){%
fd.delay=true;fd.display=display.visible;fd.delay=false;%
}%
this.dirty=false;%
)>>%
/X<</S/JavaScript/JS(%
if(!click\thetcnt&&!dragging\thetcnt){fd.display=display.hidden;}%
if(!dragging\thetcnt){click\thetcnt=false;}%
this.dirty=false;%
)>>%
}%
/U<</S/JavaScript/JS(click\thetcnt=true;this.dirty=false;)>>%
/PC<</S/JavaScript/JS (%
var fd=this.getField('tip.\thetcnt');%
try{fd.rect=fdor\thetcnt;}catch(e){}%
fd.display=display.hidden;this.dirty=false;%
)>>%
/PO<</S/JavaScript/JS(this.dirty=false;)>>%
}%
}%
>>%
}%
}{{\color{#5}#6}}%
\sbox\tiptext{%
\IfBooleanT{#2}{%
\ocgbase@oc@bdc{\tpTipOcg}\ocgbase@open@stack@push{\tpTipOcg}}%
\fcolorbox{black}{#7}{#8}%
\IfBooleanT{#2}{\ocgbase@oc@emc\ocgbase@open@stack@pop\tpNull}%
}%
\cListSet\tpOffsets{#9}%
\edef\twd{\the\wd\tiptext}%
\edef\tht{\the\ht\tiptext}%
\edef\tdp{\the\dp\tiptext}%
\tipshift=0pt%
\IfBooleanTF{#2}{%
%OCG-based (that is, all non-draggable) boxes should not extend beyond the
%current column as they may get overlaid by text in the neighbouring column
\setlength\whatsleft{\linegoal}%
}{%
\measureremainder{\whatsleft}%
}%
\ifdim\whatsleft<\dimexpr\twd+\cListItem\tpOffsets{1}\relax%
\setlength\tipshift{\whatsleft-\twd-\cListItem\tpOffsets{1}}\fi%
\IfBooleanF{#2}{\tpPdfXform{\tiptext}}%
\raisebox{\heightof{#6}+\tdp+\cListItem\tpOffsets{2}}[0pt][0pt]{%
\makebox[0pt][l]{\hspace{\dimexpr\tipshift+\cListItem\tpOffsets{1}\relax}%
\IfBooleanTF{#2}{\usebox{\tiptext}}{%
\tpPdfAnnot{\twd}{\tht}{\tdp}{%
/Subtype/Widget/FT/Btn/T (tip.\thetcnt)%
/AP<</N \tpPdfLastXform>>%
/MK<</TP 1/I \tpPdfLastXform/IF<</S/A/FB true/A [0.0 0.0]>>>>%
/Ff 65536/F 3%
/AA <<%
/U <<%
/S/JavaScript/JS(%
var fd=event.target;%
var mX=this.mouseX;var mY=this.mouseY;%
var drag=function(){%
var nX=this.mouseX;var nY=this.mouseY;%
var dX=nX-mX;var dY=nY-mY;%
var fdr=fd.rect;%
fdr[0]+=dX;fdr[1]+=dY;fdr[2]+=dX;fdr[3]+=dY;%
fd.rect=fdr;mX=nX;mY=nY;%
};%
if(!dragging\thetcnt){%
dragging\thetcnt=true;Int=app.setInterval("drag()",1);%
}%
else{app.clearInterval(Int);dragging\thetcnt=false;}%
this.dirty=false;%
)%
>>%
>>%
}%
\tpAppendToFields{\tpPdfLastAnn}%
}%
}}%
\stepcounter{tcnt}%
}}
\makeatother
\newsavebox\tiptext\newcounter{tcnt}
\newlength{\whatsleft}\newlength{\tipshift}
\newcommand{\measureremainder}[1]{%
\begin{tikzpicture}[overlay,remember picture]
\path let \p0 = (0,0), \p1 = (current page.east) in
[/utils/exec={\pgfmathsetlength#1{\x1-\x0}\global#1=#1}];
\end{tikzpicture}%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newsavebox{\switchbox}% measure width
\savebox{\switchbox}{\fcolorbox{black}{yellow}{\bfseries +}}
\newcommand{\centerwithin}[2]{% #1=small symbol, #2=wide symbol
{\mathmakebox[\widthof{\ensuremath{{}#2{}}}][c]{{#1}}}}
% end
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{siunitx}
\usepackage{amssymb}
\usepackage{dsfont}
\usepackage{authblk}
\usepackage[lite]{mtpro2}
\usepackage{mathrsfs}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{booktabs}
\usepackage{empheq}
\usepackage{lscape}
\usepackage{animate}
\usepackage{bigdelim}
\usepackage{color, colortbl}
\usepackage{soul}
\usepackage{caption}
\usepackage{lipsum}
\usepackage{arydshln}
\usepackage[compat=1.1.0]{tikz-feynman}
\usepackage{tikz}
\usepackage{multirow}
\usetikzlibrary{backgrounds}
\usepackage[a4paper, total={6.2in, 9.5in}]{geometry}
\usepackage[title]{appendix}
\usepackage{float}
\usepackage{pgfplots}
\usepackage{subcaption}
\usepackage{mdframed}
\newcommand*\widefbox[1]{\fbox{\hspace{2em}#1\hspace{2em}}}
\newcommand{\sech}{\operatorname{sech}}
\newcommand{\mathcolorbox}[2]{\colorbox{#1}{$\displaystyle #2$}}
\tikzset{Lstyle/.style={fill=cyan!20}}
\tikzset{Mstyle/.style={fill=orange!30}}
\tikzset{Nstyle/.style={fill=yellow!20}}
\tikzset{Ostyle/.style={fill=green!30}}
\tikzset{nodeStyle/.style={fill=gray!35}}
\tikzstyle{residual} = [dataNode, text width=11em, fill=red!20, minimum height=5em, text centered, rounded corners, drop shadow]
\tikzstyle{dataNode}=[draw, fill=orange]
\tikzstyle{errNode}=[draw, fill=light-blue]
\tikzstyle{opt} = [text width=10em, fill=red!20, minimum height=5em, text centered, rounded corners]
\tikzstyle{opt2} = [text width=11em, fill=yellow!70, minimum height=5em, text centered, rounded corners]
\usetikzlibrary{shapes,arrows,shadows}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
\DeclareMathAlphabet\mathbfcal{OMS}{cmsy}{b}{n}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\definecolor{shadecolor}{cmyk}{0,0,0.41,0}
\definecolor{light-blue}{cmyk}{0.25,0,0,0}
\definecolor{green-yellow}{rgb}{0.68, 1.0, 0.18}
\definecolor{Gray}{gray}{0.9}
\newsavebox{\mysaveboxM}
\newsavebox{\mysaveboxT}
\newcolumntype{z}{>{\columncolor{shadecolor}}l}
\newcolumntype{b}{>{\columncolor{shadecolor}}r}
\newcolumntype{y}{>{\columncolor{green-yellow}}l}
\newcolumntype{x}{>{\columncolor{Gray}}c}
\newcolumntype{g}{>{\columncolor{Gray}}l}
\newcommand*\backPropBox[2][Example]{%
\sbox{\mysaveboxM}{#2}%
\sbox{\mysaveboxT}{\fcolorbox{black}{light-blue}{#1}}%
\sbox{\mysaveboxM}{%
\parbox[b][\ht\mysaveboxM+.5\ht\mysaveboxT+.5\dp\mysaveboxT][b]{%
\wd\mysaveboxM}{#2}%
}%
\sbox{\mysaveboxM}{%
\fcolorbox{black}{shadecolor}{%
\makebox[\linewidth-5em]{\usebox{\mysaveboxM}}%
}%
}%
\usebox{\mysaveboxM}%
\makebox[0pt][r]{%
\makebox[\wd\mysaveboxM][c]{%
\raisebox{\ht\mysaveboxM-0.5\ht\mysaveboxT
+0.5\dp\mysaveboxT-0.5\fboxrule}{\usebox{\mysaveboxT}}%
}%
}%
}
\newcommand*\LongbackPropBox[2][Example]{%
\sbox{\mysaveboxM}{#2}%
\sbox{\mysaveboxT}{\fcolorbox{black}{light-blue}{#1}}%
\sbox{\mysaveboxM}{%
\parbox[b][\ht\mysaveboxM+.5\ht\mysaveboxT+.5\dp\mysaveboxT][b]{%
\wd\mysaveboxM}{#2}%
}%
\sbox{\mysaveboxM}{%
\fcolorbox{black}{shadecolor}{%
\makebox[\linewidth+5em]{\usebox{\mysaveboxM}}%
}%
}%
\usebox{\mysaveboxM}%
\makebox[0pt][r]{%
\makebox[\wd\mysaveboxM][c]{%
\raisebox{\ht\mysaveboxM-0.5\ht\mysaveboxT
+0.5\dp\mysaveboxT-0.5\fboxrule}{\usebox{\mysaveboxT}}%
}%
}%
}
\newcommand*\forwardBox[2][Example]{%
\sbox{\mysaveboxM}{#2}%
\sbox{\mysaveboxT}{\fcolorbox{black}{orange}{#1}}%
\sbox{\mysaveboxM}{%
\parbox[b][\ht\mysaveboxM+.5\ht\mysaveboxT+.5\dp\mysaveboxT][b]{%
\wd\mysaveboxM}{#2}%
}%
\sbox{\mysaveboxM}{%
\fcolorbox{black}{green-yellow}{%
\makebox[\linewidth-5em]{\usebox{\mysaveboxM}}%
}%
}%
\usebox{\mysaveboxM}%
\makebox[0pt][r]{%
\makebox[\wd\mysaveboxM][c]{%
\raisebox{\ht\mysaveboxM-0.5\ht\mysaveboxT
+0.5\dp\mysaveboxT-0.5\fboxrule}{\usebox{\mysaveboxT}}%
}%
}%
}
\begin{document}
\title{Deep learning for pedestrians: backpropagation in CNNs}
\author{Laurent Bou\'e \\ \Myhref[hidelinks]{mailto:[email protected]}{\protect\includegraphics[width=1cm,height=0.6cm]{pptx/logos/gmail.png}} \,\,\,\, \Myhref[hidelinks]{https://www.linkedin.com/in/laurent-bou\%C3\%A9-b7923853/}{\protect\includegraphics[width=0.8cm,height=0.8cm]{pptx/logos/linkedin.png}} \,\,\,\, \Myhref[hidelinks]{https://github.com/Ranlot}{\protect\includegraphics[width=1cm,height=1cm]{pptx/logos/github.png}} \,\,\,\, \Myhref[hidelinks]{https://twitter.com/ranlot75}{\protect\includegraphics[width=0.8cm,height=0.8cm]{pptx/logos/twitter.png}} }
\affil{SAP Labs}
\date{}
\maketitle
\begin{abstract}
The goal of this document is to provide a pedagogical introduction to the main concepts underpinning the training of deep neural networks using gradient descent; a process known as backpropagation. Although we focus on a very influential class of architectures called ``convolutional neural networks'' (CNNs) the approach is generic and useful to the machine learning community as a whole. Motivated by the observation that derivations of backpropagation are often obscured by clumsy index-heavy narratives that appear somewhat mathemagical, we aim to offer a conceptually clear, vectorized description that articulates well the higher level logic. Following the principle of ``writing is nature's way of letting you know how sloppy your thinking is'', we try to make the calculations meticulous, self-contained and yet as intuitive as possible. Taking nothing for granted, ample illustrations serve as visual guides and an extensive bibliography is provided for further explorations.
\end{abstract}
\noindent \vspace{0.3cm}
\newcommand\tempboxTT{%
\begin{minipage}{0.643\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{tikzpicture}
\node (opt) [opt] {\bf Click to open optional content \\ \vspace{0.6cm}};
\node[right = 0.8cm of opt, opt2] (opt2) {\bf Click again to close optional content \\ \vspace{0.6cm}};
\end{tikzpicture}
\end{minipage}}
\fcolorbox{black}{lightgray}{%
\minipage[t]{\dimexpr0.9\linewidth-2\fboxsep-2\fboxrule\relax}
\noindent For the sake of clarity, long mathematical derivations and visualizations have been broken up into a short ``summarized view'' and a longer ``detailed view''. Detailed views are encoded into the~PDF as optional content groups that become visible by clicking on the yellow symbol~$\hspace{0.53cm} \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxTT}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}}$. In addition, some figures contain animations designed to illustrate important concepts in a more engaging style. For these reasons, we advise downloading the document locally and opening it with~{\bf Adobe~Acrobat Reader}. Other viewers were not tested and may not render the detailed views or animations correctly. For completeness, the overall structure of the paper is summarized in a \hyperlink{contents}{table of contents}.
\endminipage}
\noindent \vspace{0.25cm}
\section{Supervised machine learning from 20,000 feet...}
\label{sec:intro}
The general workflow of training supervised machine learning models follows a well-oiled iterative procedure which we illustrate in the context of {\bf convolutional neural networks} (CNNs) for image classification. This field has undergone such rapid development in the last few years that it is sometimes used as the official ``success story'' of deep learning. Indeed, supervised image classification (and related topics such as object detection, instance segmentation...) is nowadays considered a kind of commodity software that has made its way into countless industrial applications and, consequently, it is worthwhile to become familiar with how CNNs work under the hood. Broadly speaking, deep learning models have a highly modular structure where so-called ``layers'' play the role of elementary building blocks. Although different kinds of layers serve different purposes, it turns out that CNNs are constructed from very generic layers that also appear in a wide range of very diverse neural network architectures. This means that the results presented here are effortlessly portable and remain useful in vast and far-reaching applications of deep learning that go beyond CNNs and even supervised techniques. \\
\noindent As an introduction, let us go over the high-level description of the iterative loop of training a machine learning model. This procedure is illustrated graphically in~Fig.\ref{fig:iterativeProcedure} and the rest of this section is dedicated to a brief review of machine learning basics.
\newpage
\begin{enumerate}
\item The starting point consists in gathering a large set of ``training'' data along with their accompanying ``ground-truth'' labels:
\begin{itemize}
\item Each sample~$s$ of \colorbox{pink}{data can be expressed as an~$f$-dimensional feature vector}~${\bf a}^s \sim \mathbb{R}^f$. What those~$f$ features represent in the physical world depends on the modality (time-series, image, text...) of the data considered. Let's assume that we are dealing with a minibatch of~$n$ such samples simultaneously. In this case it is convenient to stack together all~$n$ training samples vertically into:
\begin{equation*}
{\bf A} = \left(
\begin{matrix}
{\bf a}^1 \sim \mathbb{R}^f \\
\vdots \\
{\bf a}^n \sim \mathbb{R}^f
\end{matrix}
\right) = \left(
\begin{matrix}
a^{1}_1 & \dots & a^1_f \\
\vdots & \vdots & \vdots \\
a^n_1 & \dots & a^n_f
\end{matrix}
\right) \sim \mathbb{R}^{n \times f}
\label{stackedMatrix}
\end{equation*}
As mentioned above, we will concern ourselves with the task of image classification. This means that each training sample~${\bf a}^s$ is actually a color image of height~$h$, width~$w$ and depth~$d$ (number of channels, $d=3$ for RGB for example). As such, feature vectors can be represented as a~3d~structure~$f \equiv \mathbb{R}^{d \times h \times w}$. For the sake of simplicity, we will only consider square images and denote by~$r \equiv h \equiv w$ the spatial resolution of the image. Note that the dimensionality of~$f$ grows quadratically with the spatial resolution of the images meaning that, even for modest sizes~$\sim \num[group-separator={,}]{1000}$ pixels, we quickly arrive at very high dimensional input data~$f \sim 10^6$ features (pixel values). Stacking together all the images present in a minibatch, the raw input data~${\bf A}_0 \sim \mathbb{R}^{n\times d\times r\times r}$ therefore starts as a (1+3)d~=~4d array whose shape will evolve (in depth as well as in spatial resolution) as it flows deeper into the network as shown in table~\ref{table:networkExample} and discussed more in detail in point~2 below.
\item In addition, each sample~$s$ is also associated with its \colorbox{pink}{ground-truth categorical label}~${\bf y}^s_\text{gt}$. Denoting by~$n_c$ the number of possible classes,~${\bf y}^s_\text{gt}$ is generally represented as a ``One Hot Encoded'' (OHE) vector~${\bf y}^s_\text{gt} \sim \mathbb{R}^{n_c}$. Stacking the~$n$ ground-truth vectors all together, we represent the labels via the following structure:
\begin{equation*}
{\bf Y}_\text{gt} = \left(
\begin{matrix}
{\bf y}^1_\text{gt} \sim \mathbb{R}^{n_c} \\
\vdots \\
{\bf y}^n_\text{gt} \sim \mathbb{R}^{n_c}
\end{matrix}
\right) = \left(
\begin{matrix}
y^1_1 & \dots & y^1_{n_c} \\
\vdots & \vdots & \vdots \\
y^n_1 & \dots & y^n_{n_c}
\end{matrix}
\right)_\text{gt}
\sim \mathbb{R}^{n \times n_c}
\label{stackedMatrix}
\end{equation*}
\noindent If sample~$s$ actually belongs to the ground-truth class~$c^s_\text{gt}$,~OHE representation means that there is only a single element~${\bf y}^s_\text{gt} (c^s_\text{gt})=1$ which is non-zero and all others are identically null~${\bf y}^s_\text{gt} (c \neq c^s_\text{gt}) = 0$ as illustrated in the top left corner of~Fig.\ref{fig:crossEntropy}.
\end{itemize}
The minibatch size~$n$, i.e. number of training samples that we stack together for simultaneous processing, should be considered as a hyper-parameter. Selecting the optimal~$n$ remains an active area of research and we come back to it in point~4 when we discuss the learning algorithm. In the following, training data for a specific minibatch~b refers to the pair~$\mathcal{D}_\text{b} = ({\bf A}_0 , {\bf Y}_\text{gt})_\text{b}$. Assuming that the entire training dataset is divided into~$N$ minibatches, we can represent it as a list~$\mathcal{D}_\text{training} = \lbrack \mathcal{D}_1 , \cdots , \mathcal{D}_N \rbrack $. As we will see shortly, it is implicitly assumed that the training points are independent and identically distributed (i.i.d.) according to an unknown distribution.
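
To make these data structures concrete, here is a minimal NumPy sketch (purely illustrative and not part of the network itself) that builds a random minibatch~${\bf A}_0$ together with its one-hot encoded labels~${\bf Y}_\text{gt}$; the sizes $d=1$, $r=28$ and $n_c=10$ anticipate the MNIST setting used later, while the minibatch size $n=32$ is an arbitrary choice:
\begin{verbatim}
import numpy as np

n, d, r, n_c = 32, 1, 28, 10           # minibatch size, depth, resolution, classes

A0 = np.random.rand(n, d, r, r)        # raw input data A_0 of shape (n, d, r, r)

labels = np.random.randint(0, n_c, size=n)   # ground-truth class c_gt^s of each sample
Y_gt = np.zeros((n, n_c))                    # one-hot encoded labels Y_gt of shape (n, n_c)
Y_gt[np.arange(n), labels] = 1.0             # a single 1 per row, zeros everywhere else

assert (Y_gt.sum(axis=1) == 1).all()         # each row is a valid OHE vector
\end{verbatim}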
\begin{figure}
\centering
\begin{tikzpicture}
\begin{feynman}
\vertex (a1) {\( {\bf A}_0 \)};
\vertex[below=0.5cm of a1] (param) {\( \mathbfcal{P} \phantom{\hspace{0.11cm}} \) };
\vertex at ($(a1)!0.5!(param) - (-0.2cm, 0cm)$) (fStart);
\vertex[right=0.3cm of fStart] (ff);
\vertex[right=4.5cm of fStart] (a2) { \( {\bf Y}_\text{pred} \equiv \mathcal{N}_{ {\mathbfcal{P} }}({\bf A}_0) \) } ;
\vertex[below=1.5cm of a1] (b1) {\( {\bf Y}_\text{gt} \) };
\vertex at ($(a1)!0.5!(b1) - (-10cm, 0cm)$) (j) { \( \mathcal{L}_\text{batch} ( \mathbfcal{P}) \) };
\vertex[below=3cm of j] (backStart) { \( \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \) };
\vertex at ($(a1)!0.9!(backStart) - (-1cm, 0cm)$) (fMid);
\vertex[left=6.2cm of backStart] (bJoin) { \( \mathbfcal{P} \leftarrow \mathbfcal{P} - \lambda \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \) } ;
\vertex[above=0.5cm of bJoin] (bGhost) { } ;
\vertex[left=4.2cm of backStart] (bs) { } ;
\vertex[below=3.35cm of a1] (backghost);
\vertex[right=6.2cm of backghost] (backPass) {\( \mathcolorbox{light-blue}{\text{backward pass}} \)};
\diagram* {
(ff) -- [charged boson, edge label=\( \mathcolorbox{orange}{\text{forward pass}} \)] (a2) -- [fermion] (j),
(b1) -- [fermion] (j),
(j) -- [gluon] (backStart),
(backStart) -- [fermion] (bs),
(ff) -- [ghost] (fMid),
(bGhost) -- [charged scalar] (param)
};
\draw [decoration={brace}, decorate] (bJoin.north west) -- (bJoin.north east) ;
\draw [decoration={brace}, decorate] (a1.north east) -- (param.south east) ;
\begin{scope}[on background layer]
\filldraw[draw=black, fill=green-yellow] (current bounding box.north west) rectangle
($(current bounding box.north east)!0.6!(current bounding box.south east)$);
\filldraw[draw=black, fill=shadecolor] (current bounding box.south west) rectangle
($(current bounding box.north east)!0.6!(current bounding box.south east)$);
\fill[fill=gray] ([yshift=1mm]$(current bounding box.north west)!0.6!(current bounding box.south
west)$) rectangle
([yshift=-1mm]$(current bounding box.north east)!0.6!(current bounding box.south east)$);
\end{scope}
\end{feynman}
\end{tikzpicture}
\caption{High level cartoon of the iterative training loop. The model~$\mathcal{N}_\mathbfcal{P}$ is a probabilistic function parametrized by~$\mathbfcal{P}$ whose purpose is to take in the raw data~${\bf A}_0$ and return a probability distribution~${\bf Y}_\text{pred}$ over a set of~$n_c$ classes during the ``forward pass''. Combining~${\bf Y}_\text{pred}$ with the ground-truth~${\bf Y}_\text{gt}$ leads to a scalar~$\mathcal{L}_\text{batch} (\mathbfcal{P}) > 0 $, known as the loss function, that quantifies the level of mismatch between prediction and ground-truth. The objective of training is to find a better set of parameters~$\mathbfcal{P}$ to minimize the value of the loss over the training data. As discussed in the text, this can be achieved by calculating the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ of the loss during the ``backward pass''. Calculating this gradient in the context of convolutional neural networks is the focus of this article. Once~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ is known, the parameters are updated proportionately to the learning rate~$\lambda$. Cycles of forward/backward passes are run iteratively over minibatches of labeled data~$\mathcal{D}_\text{training}$ until one is satisfied with the overall performance of the model.}
\label{fig:iterativeProcedure}
\end{figure}
\item Define a \colorbox{pink}{model architecture} for the neural network~$\mathcal{N}_{\mathbfcal{P}}$ by arranging together a number~$n_\ell$ of layers. Formally, the model should be thought of as a \colorbox{pink}{parametrized function~$\mathcal{N}_{\mathbfcal{P}}$ that takes the} \colorbox{pink}{input data~${\bf A}_0$ and returns a probability distribution~${\bf Y}_\text{pred}$ over~$n_c$ classes:}
\begin{equation}
{\bf Y}_\text{pred} = \mathcal{N}_{\mathbfcal{P}} \left( {\bf A}_0 \right) \sim \mathbb{R}^{n\times n_c}
\label{eq:SignatureOfNetwork}
\end{equation}
Evaluating~$\mathcal{N}_{\mathbfcal{P}}$ given some input data~${\bf A}_0$ is referred to as the ``forward pass'' and the resulting~${\bf Y}_\text{pred}$ defines the probabilistic prediction of the model. Each training sample~$s$ gets a prediction vector~${\bf y}^s_\text{pred} \sim \mathbb{R}^{n_c}$ indicating how ``confident'' the network is that this sample belongs to any one of the~$n_c$ possible classes as illustrated in the top right corner of~Fig.\ref{fig:crossEntropy}. The final layer of the network ensures proper normalization of the probability distributions so that~$\sum_{c=1}^{n_c} (y^s_c)_\text{pred} = 1$ independently for all~$n$ samples (see section~\ref{sec:softmaxLayer}). Denoting by~$n_p$ the collective number of parameters contained in the trainable layers of the network, we have~${\mathbfcal{P}} \sim \mathbb{R}^{n_p}$.
\vspace{1cm}
In this article, we will consider the following layers:
\begin{itemize}
\item {\bf non-trainable}: non-linear activation~(\ref{sec:activation}), max-pool~(\ref{sec:maxpool}) \& flatten~(\ref{sec:flatten})
\item {\bf trainable}: fully connected~(\ref{sec:fc}), convolution~(\ref{sec:conv}) \& batch normalization~(\ref{sec:batchNorm})
\end{itemize}
Inspired by a simple and historically important CNN, we consider a modified version of the famous LeNet-5 model that incorporates a few more modern components (ReLU activation, batch normalization, skip connections...). The architecture of our example network is fully specified in table~\ref{table:networkExample}. Its backbone is made up of~$n_\ell = 16$ layers comprising~$n_p =$~\num[group-separator={,}]{44878} parameters listed in table~\ref{table:networkParams}. Because the architecture does not have loops, it falls under the category of feedforward neural networks which are usually implemented as directed acyclic graphs (DAGs) by deep learning frameworks. Keeping with historical references, we use the MNIST (Modified National Institute of Standards and Technology) dataset as the labeled data~$\mathcal{D}_\text{training}$. This dataset consists of~$n_c=10$ classes of handwritten digits~(0-9) in the form of~\num[group-separator={,}]{70000} grayscale images (depth~$d=1$ and spatial resolution~$r = 28$). A selection of~\num[group-separator={,}]{60000} samples is assigned to the training set while the remaining~\num[group-separator={,}]{10000} constitute the test set. With a minibatch of size~$n$, the input data is therefore represented as~${\bf A}_0 \sim \mathbb{R}^{n \times 1 \times 28 \times 28}$. The ``shape'' column of table~\ref{table:networkExample} shows how the data is sequentially transformed from pixel space~${\bf A}_0$ layer by layer all the way down to ``embedding'' space~${\bf A} \equiv {\bf A}_{n_\ell = 16}$. The network starts with a series of alternating ``convolution~\textemdash~activation~\textemdash~batch normalization~\textemdash~maxpool'' layer blocks whose effect is to reduce the spatial resolution of the data while, at the same time, increasing its depth. At some point the~3d structure of the samples (independent space and depth dimensions of images) is flattened into~1d feature vectors, transforming~${\bf A}_8 \sim \mathbb{R}^{n\times 16\times 4\times 4}$ into a 2d array~${\bf A}_9 \sim \mathbb{R}^{n\times 256}$ which is fed into another series of alternating ``fully connected~\textemdash~activation~\textemdash~batch normalization'' layer blocks. Note that space and depth information are no longer relevant as interpretable features as soon as data is processed by fully connected layers because of the global connectivity patterns they introduce. The final representation, the so-called ``embedding'' denoted by~${\bf A}$, is eventually fed into a softmax layer (section~\ref{sec:softmaxLayer}) in order to produce a normalized probability distribution~${\bf Y}_\text{pred} \sim \mathbb{R}^{n\times n_c}$. \\
\noindent The ``backward pass'' corresponds to an equivalent propagation of error terms~$\Delta$'s back up through the layers of~$\mathcal{N}_{\mathbfcal{P}}$ (see section~\ref{backPropSection}). As can be gleaned from table~\ref{table:networkExample}, data and error arrays always share the same dimensionality~${\bf A}_i \sim \Delta_i$ for all layers.
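
As a minimal illustration of this shape bookkeeping, the NumPy sketch below reproduces only the flatten step described above (all surrounding layers are omitted); the array sizes match the shapes quoted in the text, with an arbitrary minibatch size $n=32$:
\begin{verbatim}
import numpy as np

n = 32                              # arbitrary minibatch size
A8 = np.random.rand(n, 16, 4, 4)    # output of the last conv/activation/BN/maxpool block
A9 = A8.reshape(n, -1)              # flatten: 3d samples become 1d feature vectors
print(A9.shape)                     # (32, 256), ready for the fully connected layers
\end{verbatim}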
\begin{figure}
\centering
\includegraphics[width=0.9\linewidth]{pptx/crossEntropy/Slide1.png}
\caption{Illustration of the cross-entropy loss function~${\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt}, {\bf Y}_\text{pred} \right)$ defining the amount of ``mismatch'' between the one-hot encoded ground-truth~${\bf Y}_\text{gt}$ and the output probability distribution~${\bf Y}_\text{pred}$ as defined in~eq.(\ref{eq:crossEntropy}). For clarity, we show only the values of the~${\bf y}_\text{pred}^s(c_\text{gt}^s)$ components for all samples~$s$ of~${\bf Y}_\text{pred}$ since they are the only relevant ones as far as the cross-entropy calculation is concerned. (Numerical values are shared with~Fig.~\ref{fig:softMax}).}
\label{fig:crossEntropy}
\end{figure}
\vspace{2cm}
\item Define a loss function that measures the \mathcolorbox{pink}{\text{amount of disagreement between the predicted}~{\bf Y}_\text{pred}} \mathcolorbox{pink}{\text{and the ground-truth}~{\bf Y}_\text{gt}}. For classification tasks, it is usual to use the cross-entropy between the predicted probability distribution and the ground-truth distribution:
\begin{equation*}
\mathcal{L}_\text{batch} ({\mathbfcal{P}}) = - {\bf Y}_\text{gt} \cdot \log {\bf Y}_\text{pred} \sim \mathbb{R}
\end{equation*}
\noindent where the explicit~$\mathbfcal{P}$-dependence of the loss comes from its dependence on~${\bf Y}_\text{pred} = \mathcal{N}_{\mathbfcal{P}} \left( {\bf A}_0 \right)$ and,~recursively, on all the preceding layers of the neural network~\footnote{Obviously~$\mathcal{L}_\text{batch}$ also depends on the network architecture~$\mathcal{N}$ in addition to~$\mathcal{P}$ and the training data~$({\bf A}_0, {\bf Y}_\text{gt})$ (see also \hyperlink{programmerNote}{side note}). However, as this dependence is usually non-differentiable, we restrict ourselves to static architectures and consider the loss as a function of the parameters and the training data only. We refer the reader to~\cite{optimizeNet} for recent work that formulates architecture search as a gradient-based optimization problem (see point~4) using differentiable losses with respect to~$\mathcal{N}$ as an alternative to conventional approaches that use evolution techniques or reinforcement learning over a discrete and non-differentiable search space~\cite{googleAIblog}.}.
In order to gain some insight into how the cross-entropy loss emerges as the natural quantity, let us consider a single training sample~$s$ with input data and supervised label pair~$\left( {\bf a}_0^s , \, {\bf y}_\text{gt}^s \right)$. Passing this input as an argument to the neural network function~$\mathcal{N}_{\mathbfcal{P}}$ produces a probability distribution vector~${\bf y}^s_\text{pred} = \mathcal{N}_{\mathbfcal{P}} \left( {\bf a}_0^s \right) \sim \mathbb{R}^{n_c}$. Denoting by~$c^s_\text{gt}$ the ground-truth class to which this sample belongs means that all components of~${\bf y}^s_\text{gt} \sim \mathbb{R}^{n_c}$ are identically~0 except for~${\bf y}^s_\text{gt} (c_\text{gt}^s) \equiv 1$. Because of this~OHE representation of~${\bf y}_\text{gt}^s$, its dot-product with~${\bf y}_\text{pred}^s$ produces a single value
\begin{equation*}
{\bf y}_\text{gt}^s \cdot {\bf y}_\text{pred}^s = {\bf y}_\text{pred}^s (c_\text{gt}^s ) \sim \mathbb{R}
\end{equation*}
which represents the probability/{\bf likelihood} assigned by~$\mathcal{N}_{\mathbfcal{P}}$ to the actual ground-truth class. Accordingly, a good prediction consists in having a likelihood~$0 < {\bf y}^s_\text{pred} (c_\text{gt}^s) \lessapprox 1$ as high as possible in order to mirror~${\bf y}^s_\text{gt}$. Under the assumption that the~$n$ training samples are i.i.d. (as discussed in point~1), the likelihood over the entire minibatch~$\mathtt{L}_\text{batch}$ can be written as a product over the individual likelihoods. The training objective is then formulated as an optimization problem over the parameters~$\mathbfcal{P} \sim \mathbb{R}^{n_p}$ to maximize the minibatch likelihood:
\begin{equation*}
\argmax_\mathbfcal{P} \, \mathtt{L}_\text{batch} \,\, ; \,\, \text{with} \,\,\, \mathtt{L}_\text{batch} = \prod_{s=1}^n {\bf y}_\text{pred}^s (c_\text{gt}^s ) \sim \mathbb{R}
\end{equation*}
Taking the logarithm of the likelihood turns the product into a sum over individual training samples without changing the nature of the optimization objective. Since the~$\log$ function is strictly monotonic, maximizing the likelihood is equivalent to minimizing the negative log-likelihood:
\begin{eqnarray*}
\argmax_\mathbfcal{P} \, \mathtt{L}_\text{batch} &\Longleftrightarrow& \argmin_\mathbfcal{P} \left( - \log \mathtt{L}_\text{batch} \right) \\
&=& \argmin_\mathbfcal{P} \left( - \sum_{s=1}^n \log {\bf y}_\text{pred}^s (c_\text{gt}^s ) \right) \\
&\equiv& \argmin_\mathbfcal{P} \sum_{s=1}^n {\bf \ell}_\mathbfcal{P} \left( {\bf y}^s_\text{gt}, {\bf y}^s_\text{pred} \right)
\end{eqnarray*}
\newcommand\tempboxPic{%
\begin{minipage}{0.5\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{tikzpicture}[scale=1]
\begin{axis}[
xlabel={${\bf y}_\text{pred}^s(c_\text{gt}^s)$},
, samples=500, grid, thick
, domain=-0.1:1
, xtick={0, 0.25, 0.5, 0.75, 1},
xticklabels={$0$,$1/4$,$1/2$,$3/4$,$1$}
, title={${\bf \ell}_\mathbfcal{P} \left( {\bf y}^s_\text{gt}, {\bf y}^s_\text{pred} \right)$ \phantom{zzzzzzzzzzzzzzzzzzzzzz} }
]
\addplot+[no marks, orange, line width=2pt] {-ln(x)};
\end{axis}
\end{tikzpicture}
\end{minipage}}
Thanks to this convenient formulation as a sum over the minibatch, we can identify the amount of mismatch due to an individual training sample~$s$ as:
\begin{align*}
{\bf \ell}_\mathbfcal{P} \left( {\bf y}^s_\text{gt}, {\bf y}^s_\text{pred} \right) &\equiv -\log \, {\bf y}^s_\text{pred} (c_\text{gt}^s ) \sim \mathbb{R} \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{equivalent to cross-entropy between the distributions ${\bf y}_\text{gt}^s$ and ${\bf y}_\text{pred}^s$} \\
&\centerwithin{}{=} \colorbox{light-blue}{(using the OHE representation of ${\bf y}_\text{gt}^s$)} \\
&\equiv - {\bf y}_\text{gt}^s \cdot \log {\bf y}_\text{pred}^s \sim \mathbb{R} \\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxPic}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}} \, \colorbox{yellow}{illustrative plot}
\end{align*}
This shows that maximizing the likelihood is equivalent to minimizing the cross-entropy between the ground-truth distribution and the probability distribution vector predicted by the neural network. As can be seen in the illustrative plot, the cross-entropy metric ensures a monotonically decreasing cost from high values when~${\bf y}^s_\text{pred} (c_\text{gt}^s) \ll 1$ (i.e. small likelihood assigned by~$\mathcal{N}_{\mathbfcal{P}}$ to the ground-truth class: bad prediction) down to small cost values as the prediction for the ground-truth class approaches~1, namely ${\bf y}^s_\text{pred} (c_\text{gt}^s) \lessapprox 1$ (i.e. high likelihood assigned by~$\mathcal{N}_{\mathbfcal{P}}$ to the ground-truth class: good prediction).
\noindent Going back to the general case of minibatches of training samples, we can dispatch~${\bf \ell}_\mathbfcal{P}$ to all~$n$~samples and express the cross-entropy loss as a vectorized operation:
\begin{equation}
{\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt}, {\bf Y}_\text{pred} \right) \equiv \left(
\begin{matrix}
{\bf \ell}_\mathbfcal{P} \left( {\bf y}^1_\text{gt}, {\bf y}^1_\text{pred} \right)\sim \mathbb{R} \\
\vdots \\
{\bf \ell}_\mathbfcal{P} \left( {\bf y}^n_\text{gt}, {\bf y}^n_\text{pred} \right) \sim \mathbb{R}
\end{matrix}
\right) = - {\bf Y}_\text{gt} \ominus \log {\bf Y}_\text{pred} \sim \mathbb{R}^{n}
\label{eq:crossEntropy}
\end{equation}
where each component corresponds to the loss due to individual samples as illustrated in~Fig.\ref{fig:crossEntropy}. Using~eq.(\ref{rowWise::Froebenius}) to sum up this loss vector demonstrates that the total loss is indeed given by the cross-entropy between the predicted probability distribution~${\bf Y}_\text{pred}$ and the ground truth distribution~${\bf Y}_\text{gt}$ as stated at the beginning of this section:
\begin{equation}
\mathcal{L}_\text{batch} ({\mathbfcal{P}}) = \sum_\text{samples} \ell_{\mathbfcal{P}} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) = - {\bf Y}_\text{gt} \cdot \log {\bf Y}_\text{pred} \label{eq:batchLoss} \sim \mathbb{R}
\end{equation}
\noindent In summary, we have shown that the training objective can be formulated as a search for a set of parameters~$\mathbfcal{P} \sim \mathbb{R}^{n_p}$ that minimize the cross-entropy between~${\bf Y}_\text{pred}$ and~${\bf Y}_\text{gt}$:
\begin{equation*}
\argmin_\mathbfcal{P} \mathcal{L}_\text{batch} ({\mathbfcal{P}})
\end{equation*}
\noindent Minimizing the training error has the effect of maximizing the similarity between ground-truth and model prediction distributions, i.e. the likelihood that the model is able to produce the correct labels on the training dataset.
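
The following toy NumPy sketch (with made-up numbers, unrelated to the figures) evaluates eqs.(\ref{eq:crossEntropy}) and (\ref{eq:batchLoss}) on a minibatch of $n=2$ samples and $n_c=3$ classes; thanks to the OHE structure of~${\bf Y}_\text{gt}$, each per-sample loss reduces to~$-\log {\bf y}^s_\text{pred}(c^s_\text{gt})$:
\begin{verbatim}
import numpy as np

Y_gt = np.array([[1., 0., 0.],          # toy minibatch: n = 2 samples, n_c = 3 classes
                 [0., 0., 1.]])
Y_pred = np.array([[0.7, 0.2, 0.1],     # each row is a normalized probability distribution
                   [0.1, 0.3, 0.6]])

ell = -np.sum(Y_gt * np.log(Y_pred), axis=1)   # per-sample cross-entropy losses
L_batch = ell.sum()                            # scalar minibatch loss

print(ell)       # [-log 0.7, -log 0.6] ~ [0.357, 0.511]
print(L_batch)   # ~ 0.868
\end{verbatim}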
\newpage
\item The purpose of the \colorbox{pink}{learning algorithm} is to provide a concrete strategy to solve the optimization objective discussed in the previous point. It is known that the inductive biases introduced by different optimization algorithms~\cite{inductiveBias} and proper initializations~\cite{weightInit,weightInit2} play a crucial role in the quality of learning and in the generalization ability of the learned models. We focus on the celebrated gradient descent algorithm which, despite its age~\cite{cauchy}, remains the ``workhorse'' of deep learning. \\
\colorbox{pink}{Gradient descent} is an intuitive procedure where the parameters~$\mathbfcal{P}$ are iteratively corrected by a small vector~$\delta \mathbfcal{P} \ll \mathbfcal{P}$ carefully chosen so as to reduce the loss~$\mathcal{L}_\text{batch} (\mathbfcal{P} + \delta \mathbfcal{P} ) < \mathcal{L}_\text{batch} (\mathbfcal{P})$. Obviously, a decrease in the loss implies that~${\bf Y}_\text{pred}$ gradually becomes a little more accurate description of~${\bf Y}_\text{gt}$ (see point~3). The minimization is implemented by repeating this update many times over different batches of training data~$\mathcal{D}_\text{training}$. How to find~$\delta \mathbfcal{P}$? Since the parameter update is assumed to be small, we can perform a Taylor expansion of the loss function:
\begin{equation*}
\mathcal{L}_\text{batch} ( \mathbfcal{P} + \delta \mathbfcal{P} ) \approx \mathcal{L}_\text{batch} ( \mathbfcal{P} ) + \delta \mathbfcal{P}^t \cdot \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} + \frac{1}{2} \, \delta \mathbfcal{P}^t \, \nabla^2_\mathbfcal{P} \mathcal{L}_\text{batch} \, \delta \mathbfcal{P} + \cdots
\end{equation*}
Restricting ourselves to~$1^\text{st}$-order effects only, the change in loss values is determined by the dot-product~$\delta \mathbfcal{P}^t \cdot \nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$. Clearly, this term depends on the angle between the parameter update~$\delta \mathbfcal{P} \sim \mathbb{R}^{n_p}$ and the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \sim \mathbb{R}^{n_p}$. It reaches its maximum when both vectors are aligned with each other showing that the gradient~\footnote{\label{noteGrad} The gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ of a function~$\mathcal{L}_\text{batch} (\mathbfcal{P}) : \mathbb{R}^{n_p} \rightarrow \mathbb{R}$ is defined as the vector of partial derivatives with respect to each of its parameters:~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} = \big( \partial \mathcal{L}_\text{batch} / \partial \mathbfcal{P}_1, \cdots, \partial \mathcal{L}_\text{batch} / \partial \mathbfcal{P}_{n_\ell} \big)$. Trainable layers typically have more than a single parameter and, denoting by~$n_p$ the total number of parameters contained in the~$n_\ell$ layers, we have therefore~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \sim \mathbfcal{P} \sim \mathbb{R}^{n_p}$.} can be interpreted as the direction of steepest ascent. In other words,~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ is a vector whose direction in~$\mathbb{R}^{n_p}$ space is the one along which~$\mathcal{L}_\text{batch}$ grows the fastest. Given that our goal is to decrease the loss value, the optimal parameter update consists in choosing a vector~$\delta \mathbfcal{P} \sim \mathbb{R}^{n_p}$ that lies in the opposite direction of the gradient, i.e. the direction of steepest descent. Now that the direction of~$\delta \mathbfcal{P}_\text{steepest descent}$ is known, we can fix its magnitude by introducing a~``learning rate''~$0 < \lambda \ll 1$ such that parameters are updated according to:
\begin{eqnarray}
\mathbfcal{P} &\Longleftarrow& \mathbfcal{P} + \delta \mathbfcal{P}_\text{steepest descent}
\label{eq:steepestUpdateA} \\
\delta \mathbfcal{P}_\text{steepest descent} &\equiv& - \lambda \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} = -\lambda \left(
\begin{matrix}
\partial \mathcal{L}_\text{batch} / \partial \mathbfcal{P}_1 \\
\vdots \\
\partial \mathcal{L}_\text{batch} / \partial \mathbfcal{P}_{n_p}
\end{matrix}
\right) \sim \mathbb{R}^{n_p}
\label{eq:steepestUpdateB}
\end{eqnarray}
Obviously, this approach requires the \mathcolorbox{pink}{\text{explicit calculation of the gradient}~\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}} and, for implementation reasons that will be introduced in section~\ref{backPropSection} and be the focus of the rest of the article, the evaluation of~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ is referred to as the ``backward pass''. \\
In our case, the input data consists of minibatches representing only a subset of the entire training dataset. As a result, the gradient is calculated based on a limited number~$n$ of samples for each update and the learning algorithm is typically referred to as ``stochastic gradient descent'' (SGD) to reflect the noise introduced by this finite-size estimation of the gradient. (This is in contrast with ``batch'' gradient descent that uses the entire available dataset.) Choosing the minibatch size~$n$ remains a delicate issue which is entangled with the learning rate~$\lambda$ and its evolution during training. It is customarily believed~\cite{YannLeCun_tweet,graphCoreAI} that smaller values of~$n$ lead to better generalization performance: an effect attributed to the randomness in minibatch sampling inducing an ``exploratory'' behavior of SGD dynamics. In fact, one can show that the covariance of the minibatch noise is related to the Hessian~$\nabla^2_\mathbfcal{P} \mathcal{L}_\text{batch}$ of the loss function~\cite{limitCycles} hinting at a connection between noise and higher-order effects. Overall, noise appears to play a crucial role by implicitly providing a form of regularization that may help escape saddle points and facilitate training. In contrast, a number of studies advocate larger batch sizes in order to reduce the number of parameter updates and allow the use of distributed training without having to sacrifice performance~\cite{largeBatch1,largeBatch2,largeBatch3}. Obviously, the geometry and training dynamics of loss landscapes remain topics of intense research; fruitful connections have appeared with models and tools coming from statistical physics~\cite{lossSurface}.
Deep learning frameworks offer a whole zoo of gradient-based optimization strategies that decorate the ``canonical'' SGD presented here~\cite{ruderGradient,adaptiveMethods}. Nevertheless, all these methods share one common characteristic which is the necessity to evaluate the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$. Complexity-wise,~$1^\text{st}$-order algorithms are efficient since~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \sim \mathbb{R}^{n_p}$ grows linearly with the total number of parameters in the network. Intuitively, one may think that~$2^\text{nd}$-order methods involving the Hessian would improve the search by utilizing information about the curvature of the loss landscape. Unfortunately, the quadratic growth of the Hessian~$\nabla^2_\mathbfcal{P} \mathcal{L}_\text{batch} \sim \mathbb{R}^{n_p \times n_p}$ coupled with the need for matrix inversion render such approaches prohibitively expensive for large networks. Even higher-order methods suffer from increasingly worse scaling and, additionally, cannot exploit well-established linear algebra tools. In practice, the overwhelming majority of neural networks are trained using simple gradient descent methods based on~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$.
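
As a sketch, the parameter update of eqs.(\ref{eq:steepestUpdateA},\ref{eq:steepestUpdateB}) amounts to a couple of lines of NumPy; here \texttt{grad\_fn} is only a placeholder for the backpropagation function derived in the rest of this article, assumed to return a gradient with the same shape as the parameters:
\begin{verbatim}
import numpy as np

def sgd_step(params, grad_fn, batch, lr=0.01):
    """One plain SGD update: P <- P - lambda * grad_P L_batch."""
    grad = grad_fn(params, batch)     # backward pass: same shape as params
    return params - lr * grad

# toy usage with the quadratic loss L(P) = 0.5*||P||^2, whose gradient is P itself
params = np.array([1.0, -2.0, 3.0])
params = sgd_step(params, lambda p, b: p, batch=None, lr=0.1)
print(params)                         # [ 0.9 -1.8  2.7]
\end{verbatim}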
\item After the parameters have been updated for a minibatch~$\mathcal{D}_\text{b}$, we can advance to the next minibatch~$\mathcal{D}_\text{b+1} \in \mathcal{D}_\text{training}$, make a forward pass through the network to get the loss and do another parameter update via the backward pass. This \mathcolorbox{pink}{\text{loop over minibatches and update of parameters}} may be repeated until the loss function has decreased enough and the training data is well approximated by the neural network function, i.e.~${\bf Y}_\text{gt} \approx {\bf Y}_\text{pred} = \mathcal{N}_\mathbfcal{P} ({\bf A}_0) \, , \, \forall \, \mathcal{D} \in \mathcal{D}_\text{training}$.
One run over the complete~$\mathcal{D}_\text{training}$ is called an ``epoch'' and this iterative procedure is summarized in a concise high-level~\hyperlink{programmerNote}{pseudo-code} as well as in the cartoon of~Fig.~\ref{fig:iterativeProcedure}. Given enough iterations a ``good'' learning algorithm may lead to a vanishingly small loss~$\mathcal{L} (\mathbfcal{P}_\text{overfit}) \approx 0$ meaning that the function~$\mathcal{N}_{\mathbfcal{P}_\text{overfit}}$ is capable of perfectly reproducing all the training samples. This is a signal of overfitting and implies that~$\mathcal{N}_{\mathbfcal{P}_\text{overfit}}$ has little chance of generalizing to new input data. This is why, in addition to the training loss, it is common practice to also monitor the accuracy on an independent held-out testing set in order to be able to stop training as soon as the testing performance no longer increases; a technique known as early stopping. There exists a large number of ``regularization'' strategies that aim to address this trade-off between optimization on the training data and generalization to previously unseen datasets. In addition to traditional penalties based on the norm of the parameters such as sparsity inducing~L1 (Lasso regression) and~L2 regularizations (ridge regression also known as weight decay), other approaches like drop-out~\cite{dropOut} (a prominent candidate), dataset augmentation techniques, label smoothing, bagging... are frequently used by deep learning practitioners. Nevertheless, controlling the accuracy of a model remains a critical topic of research and training networks that yield state-of-the-art performance still involves many tricks (one may look at~\cite{ng,deepLearningBookGoodfellow} for an introduction).
\end{enumerate}
\paragraph{Closing words on supervised learning from~20,000 feet...} Before moving on to the core of this article which is the description of backpropagation, let us finish this introduction by making a few general observations. First of all, although the number of parameters may not be the best way to quantify model complexity~\cite{intrinsicDim,oneParam}, let us note that the example CNN discussed in this article (tables~\ref{table:networkExample} and~\ref{table:networkParams}) is under-parametrized: the number~$n_p=$~\num[group-separator={,}]{44878} of parameters is smaller than the~\num[group-separator={,}]{60000} samples available for training. This situation is somewhat unusual for modern state-of-the-art deep learning architectures that typically gravitate towards a heavy over-parametrization of the models. For example, the famous AlexNet which propelled deep learning into the spotlight in~2012 after winning the ImageNet ILSVRC challenge (by an unprecedented margin) contained about~60~million parameters trained on a dataset of ``only''~$1.2$~million images~\cite{imageNet}. Empirical evidence suggests that models with larger capacity are surprisingly resistant to overfitting and continue to improve their generalization error~(see~\cite{outrageousLarge} for an extreme example) even when trained without explicit regularization~\cite{overParam}. \\
\noindent Putting things in perspective, one may be tempted to think of the task of training the neural network function~$\mathcal{N}_\mathbfcal{P}$ as a simple interpolation problem on the dataset~$\mathcal{D}_\text{training}$. However, the very high-dimensional nature of the input data raises major difficulties. Indeed, it is well known that with increasing dimensionality all data points become equally far away from each other and the notion of nearest neighbors may no longer be relevant~\cite{distanceRatioHighD}; a consequence of the ``curse of dimensionality''. Dense sampling of a unit hypercube of dimension~$d \sim 10^6$ (lower range of real-world data, see point~1 for images) into a grid of (very poor) resolution~$\varepsilon \sim 0.1$ would require an unreachable and absolutely absurd number~$(1/\varepsilon)^d \sim 10^{\num[group-separator={,}]{1000000}}$ of samples. Even if, because of some intrinsic structural constraints, real-world data happens to lie in the vicinity of lower dimensional manifolds~\cite{dimReduction}, training samples are forever condemned to be immensely isolated from each other effectively ruling out na\"ive interpolation.
\fcolorbox{black}{lightgray}{%
\minipage[t]{\dimexpr0.9\linewidth-2\fboxsep-2\fboxrule\relax}
{\bf Iterative training: \hypertarget{programmerNote}{pseudo-code}} \\
Let us mention that the whole training procedure can be summarized in a surprisingly short high-level program mirroring the cartoon of Fig.\ref{fig:iterativeProcedure}. The first step consists in choosing a neural network architecture~$\mathcal{N}_\mathcal{P}$, i.e. a parametrized function that, given some input data, returns a probability distribution~$\bf{Y}_\text{pred}$ over a set of~$n_c$ predefined classes. This prediction is then compared to the ground-truth~${\bf Y}_\text{gt}$ and the quality of the fit is measured via:
\begin{equation*}
\mathcal{L}_\text{batch} \left( {\bf Y}_\text{gt}, {\bf Y}_\text{pred} = \mathcal{N}_\mathcal{P} ({\bf A}_0) \right) \equiv \mathcal{L}_\text{batch} \left( \mathcal{N}, \mathcal{P}, \mathcal{D}_\text{batch} \right) \sim \mathbb{R}
\end{equation*}
Conceptually, the loss~$\mathcal{L}_\text{batch}$ can be expressed as a scalar function of the architecture~$\mathcal{N}$, the current value of the parameters~$\mathcal{P} \sim \mathbb{R}^{n_p}$ and the training data~$\mathcal{D}_\text{batch} = \left( {\bf A}_0, {\bf Y}_\text{gt}\right)_\text{batch}$. \\
\noindent {\it (In our case~$\mathcal{N}$ is defined as the functional composition of the layers specified in table~\ref{table:networkExample}, parametrized by~$\mathcal{P}$ described in table~\ref{table:networkParams} and trained on~$\mathcal{D}_\text{training} = \text{MNIST}$ with the cross-entropy loss defined in~eq.(\ref{eq:batchLoss}))}. \\
As explained in point~4, it is necessary to calculate the gradient of~$\mathcal{L}_\text{batch}$ with respect to~$\mathcal{P}$ in order to perform an SGD update. Restricting ourselves to static network architectures, the gradient can be formally returned by a vector-valued ``backpropagation'' function~$\mathcal{B}$ implicitly parametrized by~$\mathcal{N}$:
\begin{equation*}
\nabla_\mathbfcal{P} \mathcal{L}_\text{batch} = \mathcal{B}_\mathcal{N} \left( \mathbfcal{P} , \mathcal{D}_\text{batch} \right) \sim \mathbb{R}^{n_p}
\end{equation*}
In practice, deep learning frameworks expose rich high-level APIs based on automatic \mbox{differentiation}~\cite{pearlmutter} to efficiently implement~$\mathcal{B}_\mathcal{N}$ and support complex control flow such as branches/loops as part of the emerging concept of~``differentiable programming''~\cite{RAD,differentiableProgramming}. In this article we will calculate the gradient of each layer analytically and define~$\mathcal{B}_\mathcal{N}$ as the composition of all the layer-level gradient functions. Once~$\mathcal{B}_\mathcal{N}$ is known, training proceeds by~i)~evaluating the backpropagation function iteratively over the list of supervised observations~$\mathcal{D}_\text{training} = \lbrack \mathcal{D}_1, \cdots , \mathcal{D}_N \rbrack$ and~ii)~updating the parameters~$\mathbfcal{P}$ each time according to~eqs.(\ref{eq:steepestUpdateA},\ref{eq:steepestUpdateB}). This can be translated into a single line of code by defining a training function:
\begin{equation*}
\mathbfcal{P}_\text{trained} \left( \mathcal{N} \right) = \texttt{foldl} \,\, \Big(\backslash \mathbfcal{P}, \mathcal{D} \rightarrow \mathbfcal{P} - \lambda \mathcal{B}_\mathcal{N} \left( \mathbfcal{P} , \mathcal{D} \right) \Big) \,\, \mathbfcal{P}_\text{initial} \,\, \mathcal{D}_\text{training} \sim \mathbb{R}^{n_p}
\end{equation*}
that returns a set of trained parameters~$\mathbfcal{P}_\text{trained}$ for any given architecture~$\mathcal{N}$. Here,~\texttt{foldl} stands for the left fold function (Haskell syntax; available under different names in other programming languages) that takes~3 arguments: an updating function, an initialized accumulator~$\mathbfcal{P}_\text{initial}$ and a list~$\mathcal{D}_\text{training}$ upon which to evaluate the function.
\endminipage}
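
\noindent For readers less familiar with Haskell, the same left fold can be sketched in Python with \texttt{functools.reduce}; in the snippet below, \texttt{backprop\_fn} is a placeholder for~$\mathcal{B}_\mathcal{N}$ and the parameters are assumed to live in a NumPy array so that the update~$\mathbfcal{P} - \lambda \mathcal{B}_\mathcal{N}(\mathbfcal{P}, \mathcal{D})$ is well defined:
\begin{verbatim}
from functools import reduce

def train(P_initial, D_training, backprop_fn, lr):
    """Left fold of the SGD update over the list of minibatches D_training."""
    return reduce(lambda P, D: P - lr * backprop_fn(P, D), D_training, P_initial)
\end{verbatim}
\noindent Just as with \texttt{foldl}, the accumulator (the parameters) is threaded through the list of minibatches, one SGD update per element.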
\vspace{1cm}
\begin{figure}
\centering
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=\linewidth]{pptx/manifoldLaurent/ReLU_inputData.png}
\end{subfigure}%
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=\linewidth]{pptx/manifoldLaurent/ReLU_decisionBoundaries_Final.png}
\end{subfigure}
\caption{Concrete visualization of the ``manifold hypothesis'' using a synthetic spiral dataset composed of~7 interleaved classes represented by different colors; see~\cite{laurentSpiral} for all details including animations of the training dynamics using different activation functions. {\bf Left)}~Raw input data~$\equiv {\bf A}_0 \sim \mathbb{R}^{n\times 2}$. {\bf Right)}~Data representation at the level of the ``embedding''~$\equiv {\bf A} \sim \mathbb{R}^{n\times 2}$. Thanks to the~2d geometry of both~${\bf A}$/${\bf A}_0$, one can see how~$\mathcal{N}_\mathcal{P}$ has learned to disentangle the training samples so that the classes can be separated by linear boundaries.}
\label{fig::spiralNet}
\end{figure}
\noindent Instead, deep learning is usually presented as a data cascade across multiple levels of abstraction starting with raw unstructured high-dimensional input and finishing with lower dimensional representations called ``embeddings''. In the case of~CNNs, the first few layers are suspected to encode simple features such as edges and colors. Deeper layers pick up the task by trying to detect higher-level motifs that start to resemble more familiar objects. Finally, this hierarchical cascade is believed to produce abstract concepts that are easier to separate into distinct classes~\cite{deconv}. This scenario can be joined with a complementary interpretation going under the name of ``manifold hypothesis''~\cite{manifoldHypothesis}. There, the idea is that the learning process should be seen as a progressive linearization of the topology of the input data that starts as complicated interwoven class manifolds and culminates in embeddings that are separable by simple hyperplanes~\cite{chrisTopo,laurentSpiral} as illustrated in Fig.\ref{fig::spiralNet}. Giving further support to the importance of these learned representations, it is known that algebraic operations on embeddings can combine high-level concepts into semantically meaningful relationships~\cite{DCGAN,mikolov}. Somewhat orthogonally, compression of input data into efficient representations during the training dynamics can also be approached from the point of view of information theory~\cite{tishby}.
\newpage
\noindent This hierarchical and compositional view of deep learning is not specific to CNNs and images but, rather, may be a natural consequence of the generative process which created the data in the first place. Indeed, physical world data is usually modeled in terms of recursive compositions of fundamental building blocks into more and more complex systems. In this case, neural networks can be seen as an attempt to ``reverse-engineer'' this decomposition of the generative process into a hierarchy of simpler steps~\cite{cheapLearning}. As such, it is probable that successful deep learning architectures implicitly provide very strong priors on the types of statistical patterns that can be discovered from data~\cite{explainMethod}. Despite empirical evidence that ``deeper is better'', the actual role played by depth and its impact on the expressiveness of approximating functions remains a topic of active theoretical research~\cite{nadav,poggio}. \\
\noindent Although this narrative is very widespread and influential~\cite{natureDeep}, it is worth emphasizing that there is still no consensus regarding what it is that the current incarnations of neural networks really do learn; a lot of work is going into understanding and visualizing the predictions of deep learning models~\cite{distill,explainLIME}. Among many known drawbacks, let us mention that neural networks can effortlessly fit data with random labels~\cite{generalize}, are alarmingly fooled by minute and imperceptible adversarial perturbations~\cite{intriguing}, lack a sense of geometric context~\cite{capsule}, learn superficial details not related to semantically relevant concepts~\cite{fftSuperficial}, are brittle against benign distributional shift~\cite{cifar10Generalize}... Another obvious limitation stems from the implicit static closed-world environment assumption: models can only ever predict classes that belong to the training set. They are forced into making wrong predictions (with uncontrollable confidence) if presented with new objects~\cite{openSet} or with nonsensical input~\cite{rubbishImages}. The practicality (or lack thereof) of deep neural networks in the wild, the role of prior knowledge~\cite{garyMarcus}, whether modern architectures are anything more than (somewhat fast and loose~\cite{fastLoose}) exceptional curve-fitting machines that bear no resemblance whatsoever to human perception let alone reasoning~\cite{pearl}... and many more issues continue~\footnote{(as evidenced by the famous, time-tested, quote {\it ``The question of whether machines can think is about as relevant as the question of whether submarines can swim''}~\cite{Dijkstra}, these discussions have a long history)} to be nebulous and hotly debated topics. Nonetheless, deep learning has undeniably turned out to be very good at discovering intricate structures in high-dimensional data and continues to establish new state-of-the-art performance in many academic fields beyond computer vision. The last few years have also shown how neural networks can have a meaningful impact in a wide variety of commercial products. Nevertheless, a healthy dose of skepticism suggests that there is still some way to go before they can fully be deployed as trusted components of critical and ever-changing real-world applications. \\
\noindent With this long introduction out of the way, let us now move on to the main purpose of this article which is a pedagogical presentation of backpropagation in the context of CNNs.
\begin{table}
\vspace{-3em}%
\hspace*{-2.66cm}
\captionsetup{singlelinecheck=off}
\begin{tabular}{|| l | y | x | z ||}
\hline
{\bf Layer} & \cellcolor{orange}{\bf Forward pass} & {\bf Shape} & \cellcolor{light-blue}{\bf Backward pass} \\
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
Input data & ${\bf A}_0$ & $\mathbb{R}^{n\times 1\times 28\times 28}$ & \cellcolor{Gray} $n$ grayscale ($d=1$) square images of $28\times28$ resolution \\
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
& ${\bf A}_1 = {\bf w}_0^{\mathfrak{p}_0} \star {\bf A}_0 + \widetilde{{\bf b}}_0$ & & \\
\multirow{-2}*{Convolution} & $\mathfrak{p}_0= \big( k_0=5 , s_0=1 , p_0=0 \big) $ & \multirow{-2}*{$ $} & \multirow{-2}*{$\Delta_1 = \Delta_2 \circ g^\prime \left( {\bf A}_1 \right) $} \\
& & & $\Delta_{2} = f_{4d}^t \bigg[ \bigg( 576 n f_{2d}^t ( \Delta_{3} ) \widetilde{{\bf w}_{2}} - \widetilde{ \sum_s f_{2d}^t(\Delta_{3}) \widetilde{{\bf w}_{2}}}$ \\
\multirow{-2}*{Activation} & \multirow{-2}*{${\bf A}_{2} = g({\bf A}_{1})$} & \multirow{-2}*{$\mathbb{R}^{n \times 6 \times 24 \times 24 }$} & $\hspace{2.2cm} - \overline{f_{2d}^t({\bf A}_{2})} \circ \widetilde{ \sum_s \overline{f_{2d}^t({\bf A}_{2})} \circ f_{2d}^t(\Delta_{3}) \widetilde{{\bf w}_{2}} } \bigg) / 576n\widetilde{\sigma_{2}} \bigg] $ \\[0.2em]
Batch normalization & ${\bf A}_{3} = f_{4d}^t \Big( \overline{f_{2d}^t({\bf A}_{2})} \, \widetilde{{\bf w}_{2}} + \widetilde{{\bf b}_{2}} \Big) $ & $ $ & $\Delta_3 = \widetilde{\Delta_4} \circ g_{\mathfrak{p}}^\prime ({\bf A}_3) $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
& ${\bf A}_4 = \text{maxpool}_\mathfrak{p} \, {\bf A}_3 $ & & $\Delta_4 = \overset{\curvearrowleft \, t_{\text{d}} \, \mathfrak{p}_4^\prime }{{\bf w}_{4\,\,\,\,\,\,\,\,}} \star \Delta_5$ \\
\multirow{-2}*{Maxpool} & $\mathfrak{p} =\big( k=2 \, , s=2 \, , p=0 \big)$ & \multirow{-2}*{$\mathbb{R}^{n \times 6 \times 12 \times 12 }$} & $\mathfrak{p}_4^\prime =\big( k^\prime_4= k_4 = 5 \, , \, s^\prime_4=1/s_4 =1 \, , \, p^\prime_4= k_4 - p_4 - 1 = 4 \, \big)$ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
& ${\bf A}_5 = {\bf A}_4 \star {\bf w}_4^{\mathfrak{p}_4} + \widetilde{{\bf b}}_4 $ & & \\
\multirow{-2}*{Convolution} & $\mathfrak{p}_4= \big( k_4=5 , s_4=1 , p_4=0 \big) $ & \multirow{-2}*{$ $} & \multirow{-2}*{$\Delta_5 = \Delta_6 \circ g^\prime \left( {\bf A}_5 \right)$} \\
& & & $\Delta_{6} = f_{4d}^t \bigg[ \bigg( 64 n f_{2d}^t ( \Delta_{7} ) \widetilde{{\bf w}_{6}} - \widetilde{ \sum_s f_{2d}^t(\Delta_{7}) \widetilde{{\bf w}_{6}}}$ \\
\multirow{-2}*{Activation} & \multirow{-2}*{${\bf A}_{6} = g({\bf A}_{5})$} & \multirow{-2}*{$\mathbb{R}^{n \times 16\times 8\times 8}$} & $\hspace{2.35cm} - \overline{f_{2d}^t({\bf A}_{6})} \circ \widetilde{ \sum_s \overline{f_{2d}^t({\bf A}_{6})} \circ f_{2d}^t(\Delta_{7}) \widetilde{{\bf w}_{6}} } \bigg) / 64n\widetilde{\sigma_{6}} \bigg] $ \\[0.2em]
Batch normalization & ${\bf A}_{7} = f_{4d}^t \Big( \overline{f_{2d}^t({\bf A}_{6})} \, \widetilde{{\bf w}_{6}} + \widetilde{{\bf b}_{6}} \Big) $ & $ $ & $\Delta_7 = \widetilde{\Delta_8} \circ g_{\mathfrak{p}}^\prime ({\bf A}_7) $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
& ${\bf A}_8 = \text{maxpool}_\mathfrak{p} \, {\bf A}_7 $ & & \\
\multirow{-2}*{Maxpool} & $\mathfrak{p}= \big( k=2 , s=2 , p=0 \big) $ & \multirow{-2}*{$ \mathbb{R}^{n \times 16 \times 4 \times 4 } $} & \multirow{-2}*{$\Delta_8 = \text{fold} \, \Delta_9 $} \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Flatten & ${\bf A}_9 = \text{flatten} \, {\bf A}_8 $ & $\mathbb{R}^{n \times 256} $ & $\Delta_9 = \Delta_{10} {\bf w}_9^t $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Fully connected & ${\bf A}_{10} = {\bf A}_9 {\bf w}_9 + \widetilde{{\bf b}}_9 $ & $ $ & $\Delta_{10} = \Delta_{11} \circ g^\prime \left( {\bf A}_{10} \right)$ \\[0.2em]
Activation & ${\bf A}_{11} = g({\bf A}_ {10})$ & $ \mathbb{R}^{n \times 120} $ & $\Delta_{11} = \frac{1}{n\widetilde{\sigma_{11}} \vphantom{3^{3^{3}}} } \Big( n \Delta_{12} \widetilde{{\bf w}_{11}} - \widetilde{ \sum_s \Delta_{12} \widetilde{{\bf w}_{11}} } - \overline{\bf A}_{11} \circ \widetilde{ \sum_s \overline{\bf A}_{11} \circ \Delta_{12} \widetilde{{\bf w}_{11}} } \Big)$ \\[0.4em]
Batch normalization & ${\bf A}_{12} = \overline{\bf A}_{11} \widetilde{{\bf w}_{11}} + \widetilde{{\bf b}_{11}} $ & $ $ & $\Delta_{12} = \Delta_{13} {\bf w}_{12}^t $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Fully connected & ${\bf A}_{13} = {\bf A}_{12} {\bf w}_{12} + \widetilde{{\bf b}}_{12} $ & $ $ & $\Delta_{13} = \Delta_{14} \circ g^\prime \left( {\bf A}_{13} \right) $ \\[0.2em]
Activation & ${\bf A}_{14} = g({\bf A}_{13})$ & $ \mathbb{R}^{n \times 84} $ & $\Delta_{14} = \frac{1}{n\widetilde{\sigma_{14}} \vphantom{3^{3^{3}}} } \Big( n \Delta_{15} \widetilde{{\bf w}_{14}} - \widetilde{ \sum_s \Delta_{15} \widetilde{{\bf w}_{14}} } - \overline{\bf A}_{14} \circ \widetilde{ \sum_s \overline{\bf A}_{14} \circ \Delta_{15} \widetilde{{\bf w}_{14}} } \Big)$ \\[0.4em]
Batch normalization & ${\bf A}_{15} = \overline{\bf A}_{14} \widetilde{{\bf w}_{14}} + \widetilde{{\bf b}_{14}} $ & $ $ & $\Delta_{15} = \Delta_{16} {\bf w}_{15}^t $ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Fully connected & ${\bf A} \equiv {\bf A}_{16} = {\bf A}_{15} {\bf w}_{15} + \widetilde{{\bf b}}_{15} $ & $ \mathbb{R}^{n \times 10} $ & $\Delta_{16} = {\bf Y}_\text{pred} - {\bf Y}_\text{gt}$ \\[0.3em]
\hline \hline \rule{0pt}{1.1\normalbaselineskip}
Softmax & ${\bf Y}_\text{pred} = \text{softmax} \, {\bf A} $ & $\mathbb{R}^{n \times 10}$ & \cellcolor{Gray} probability distribution over $n_c=10$ classes for all images \\[0.3em]
\hline
\end{tabular}
\caption[capNet]{%
Illustration of a typical~Convolutional Neural Network~(CNN) architecture inspired by the historical LeNet-5. Notice how layers are grouped into repeated ``convolution/fully connected~\textemdash~activation~\textemdash~batch normalization'' patterns. Shortcut connections, another component of modern architectures such as ResNet, are \hyperlink{skipNote}{explained in detail} even though they are absent from this example network. As usual in CNNs, the shape of the data representations during the {\bf forward pass} (to be read from \colorbox{green-yellow}{\bf top to bottom}) starts by becoming ``fatter'' (deeper and spatially smaller) early in the network before being flattened into a ``thin'' feature vector whose dimension is gradually decreased to eventually match the desired number of classes~$n_c=10$ for classification; the final layer before the softmax, sometimes referred to as the ``embedding'', is denoted as~${\bf A} \equiv {\bf A}_{16}$. This is in contrast to the {\bf backward pass} (to be read from \colorbox{shadecolor}{\bf bottom to top}) where the error arrays, corresponding to the gradient of the loss function with respect to the data, follow the exact opposite dimensionality changes. As can be gleaned from the expressions above and \hyperlink{errGeneralFunction}{demonstrated} in the main text, error backpropagation may be seen as a general function~$\mathcolorbox{pink}{\Delta_{i-1} \, \big( \Delta_i , {\bf A}_{i-1} , \mathbfcal{P}_{i-1} \big) }$ of the upstream error~$\Delta_i$, original data array~${\bf A}_{i-1}$ from the forward pass and parameters~$\mathbfcal{P}_{i-1}$; layer-specific implementations of the downstream error terms~$\Delta_{i-1}$ can be found in the relevant sections. \\
{\it (Note that the dimensionality of convolutional kernels and of arrays that are wrapped by geometrical reshape operations, such as~$f_{2d}$ and~$f_{4d}$, designed to handle minibatches of image data are provided explicitly in the caption of table~\ref{table:networkParams}. More details about the architecture and the dataflow are provided in point~2 of section~\ref{sec:intro}.)}}
\label{table:networkExample}
\end{table}
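\noindent As emphasized in the caption of table~\ref{table:networkExample}, every layer contributes a forward map together with a backward map of the generic form~$\Delta_{i-1} \big( \Delta_i , {\bf A}_{i-1} , \mathbfcal{P}_{i-1} \big)$. A minimal Haskell sketch of this per-layer interface could look as follows (the \texttt{Tensor} type and the record field names are hypothetical and only meant to expose the signatures):
\begin{verbatim}
-- Naive stand-in for a dense multi-dimensional array type.
type Tensor = [[Double]]

data Layer = Layer
  { forwardPass  :: Tensor -> Tensor -> Tensor
      -- forwardPass  params aPrev         computes A_i from P_{i-1} and A_{i-1}
  , backwardPass :: Tensor -> Tensor -> Tensor -> (Tensor, Tensor)
      -- backwardPass params aPrev deltaUp returns (Delta_{i-1}, dL/dP_{i-1})
  }
\end{verbatim}
The signature of~\texttt{backwardPass} makes explicit that the backward pass reuses the cached activation~${\bf A}_{i-1}$ produced during the forward pass.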
\begin{table}
\vspace{-4.8em}%
\hspace*{-0.5cm}
\captionsetup{singlelinecheck=off}
\begin{tabular}{|| l | y : y | x : x | z ||}
\hline
\multicolumn{3}{|c|}{\cellcolor{orange}{{\bf Parameters}}} & \multicolumn{2}{c|}{\cellcolor{Gray}{{\bf Dimensionality}}} & \cellcolor{light-blue}{\bf Loss derivative} \\
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Fully connected} & & ${\bf w}_{15}$ & $\mathbb{R}^{84\times 10}$ & 840 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{15} = {\bf A}_{15}^t \Delta_{16}$ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{15}$} & ${\bf b}_{15}$ & $\mathbb{R}^{10} $ & 10 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{15} = \sum_s \Delta_{16}$ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Batch normalization} & & ${\bf w}_{14}$ & $\mathbb{R}^{84}$ & 84 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{14} = \text{diag} \big( \overline{\bf A}_{14}^t \Delta_{15} \big) $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{14}$} & ${\bf b}_{14}$ & $\mathbb{R}^{84} $ & 84 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{14} = \sum_s \Delta_{15} $ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Fully connected} & & ${\bf w}_{12}$ & $\mathbb{R}^{120\times 84} $ & 10,080 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{12} = {\bf A}_{12}^t \Delta_{13}$ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{12}$} & ${\bf b}_{12}$ & $ \mathbb{R}^{84}$ & 84 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{12} = \sum_s \Delta_{13}$ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Batch normalization} & & ${\bf w}_{11}$ & $\mathbb{R}^{120}$ & 120 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{11} = \text{diag} \big( \overline{\bf A}_{11}^t \Delta_{12} \big) $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{11}$} & ${\bf b}_{11}$ & $\mathbb{R}^{120} $ & 120 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{11} = \sum_s \Delta_{12} $ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Fully connected} & & ${\bf w}_9$ & $ \mathbb{R}^{256\times 120} $ & 30,720 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_9 = {\bf A}_9^t \Delta_{10} $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_9$} & ${\bf b}_9$ & $\mathbb{R}^{120}$ & 120 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_9 = \sum_s \Delta_{10}$ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Batch normalization} & & ${\bf w}_{6}$ & $\mathbb{R}^{16}$ & 16 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{6} = \text{diag} \big( \overline{f_{2d}^t({\bf A}_6)}^{\,t} f_{2d}^t(\Delta_{7}) \big) $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{6}$} & ${\bf b}_{6}$ & $\mathbb{R}^{16} $ & 16 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{6} = \sum_{s,r} \Delta_7 $ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Convolution} & & ${\bf w}_4$ & $\mathbb{R}^{16\times 6\times 5\times 5}$ & $16 \times 150 =$ 2,400 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_4 = \text{roll} \left[ f_{2d} \left( \Delta_5 \right) \phi \left( {\bf A}_4 \right)^t \right] $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_4$} & ${\bf b}_4$ & $\mathbb{R}^{16}$ & 16 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_4 = \sum_{s,r} \Delta_{5}$ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Batch normalization} & & ${\bf w}_{2}$ & $\mathbb{R}^{6}$ & 6 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{2} = \text{diag} \big( \overline{f_{2d}^t({\bf A}_2)}^{\,t} f_{2d}^t(\Delta_{3}) \big) $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_{2}$} & ${\bf b}_{2}$ & $\mathbb{R}^{6} $ & 6 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{2} = \sum_{s,r} \Delta_3 $ \\[0.3em]
\hline
\hline
\rule{0pt}{1.1\normalbaselineskip}
\multirow{2}*{Convolution} & & ${\bf w}_0$ & $\mathbb{R}^{6\times 1\times5 \times5}$ & $6\times 25 = $ 150 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_0 = \text{roll} \left[ f_{2d} \left( \Delta_1 \right) \phi \left( {\bf A}_0 \right)^t \right] $ \\[0.3em]
& \multirow{-2}*{$\mathbfcal{P}_0$} & ${\bf b}_0$ & $\mathbb{R}^{6}$ & 6 & $\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_0 = \sum_{s,r} \Delta_1$ \\[0.3em]
\hline
\hline
\multicolumn{6}{|c|}{\cellcolor{Gray}{Total number of parameters =~\num[group-separator={,}]{44878}}} \\
\hline
\end{tabular}
\caption[capParam]{%
Parameters are presented from top to bottom in the order in which they are updated during the backpropagation algorithm described in section~\ref{backPropSection}. Notice that all gradient components with respect to parameters~$\mathbfcal{P}_{i-1}$ share the same pattern regardless of the type of (linear) layer:
\begin{itemize}
\item[{\bf weights:}] $\mathcolorbox{pink}{\partial \mathcal{L}_{\text{batch}} / \partial {\bf w}_{i-1} \sim {\bf A}_{i-1}^t \Delta_i}$ Matrix product between the transpose of the data array~${\bf A}_{i-1}$ and the upstream error~$\Delta_i$ (up to geometrical transformations such as~$f_{2d}$,~$f_{4d}$,~$\text{roll}$,~$\text{diag}$...). This shows that intermediate data arrays originating from the forward pass need to be cached in memory to be combined, at a later point, with error arrays during the backward pass; illustration in Fig.\ref{errorFlow}.
\item[{\bf biases:}] $\mathcolorbox{pink}{\partial \mathcal{L}_{\text{batch}} / \partial {\bf b}_{i-1} \sim \sum \Delta_i}$ Tensor-contraction of the upstream error. In the case of error arrays associated with image data, the sum runs over the spatial dimensions (indicated by the~$r$ subscript) in addition to minibatch samples (indicated by the~$s$ subscript) in the summation symbol~$\sum_{s,r}$.
\end{itemize}
{\it (Details about layer-specific implementations of the \colorbox{shadecolor}{{\bf components of the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$}} are provided in the relevant sections of the main text.)}
\vspace{0.1cm}
\noindent\rule{\linewidth}{1.4pt}
\vspace{0.2cm}
For the sake of completeness, we report here the dimensionality of transformed arrays and of the convolutional kernels relevant both for table~\ref{table:networkExample} as well as for the gradient expressions above:
\begin{center}
\begin{tabular}{|g:g:g|}
\hline
\rule{0pt}{1.1\normalbaselineskip}
$f_{2d}^t({\bf A}_6) \sim \mathbb{R}^{(8\times 8\times n)\times 16}$ & $f_{2d}^t(\Delta_7) \sim \mathbb{R}^{(8\times 8\times n)\times 16} $ & $\text{diag} \left( \mathbb{R}^{16\times 16} \right) \sim \mathbb{R}^{16} $ \\[0.3em]
\hline
\rule{0pt}{1.1\normalbaselineskip}
$f_{2d}(\Delta_5) \sim \mathbb{R}^{16\times (8\times 8\times n)}$ & $\phi \left( {\bf A}_4 \right) \sim \mathbb{R}^{(6\times 5\times 5)\times (8\times 8\times n)}$ & $\text{roll} \left( \mathbb{R}^{16\times (6\times 5\times 5)} \right) \sim \mathbb{R}^{16\times 6\times 5\times 5}$ \\[0.3em]
\hline
\rule{0pt}{1.1\normalbaselineskip}
$f_{2d}^t({\bf A}_2) \sim \mathbb{R}^{(24\times 24\times n)\times 6}$ & $f_{2d}^t(\Delta_3) \sim \mathbb{R}^{(24\times 24\times n)\times 6} $ & $\text{diag} \left( \mathbb{R}^{6\times 6} \right) \sim \mathbb{R}^6 $ \\[0.3em]
\hline
\rule{0pt}{1.1\normalbaselineskip}
$f_{2d}(\Delta_1) \sim \mathbb{R}^{6\times (24\times 24\times n)}$ & $\phi \left( {\bf A}_0 \right) \sim \mathbb{R}^{(1\times 5\times 5)\times (24\times 24\times n)}$ & $\text{roll} \left( \mathbb{R}^{6\times (1\times 5\times 5)} \right) \sim \mathbb{R}^{6\times 1\times 5\times 5}$ \\[0.3em]
\hline \hline
\rule{0pt}{1.1\normalbaselineskip}
${\bf w}_0^{\mathfrak{p}_0} \sim \mathbb{R}^{6\times1\times k_0\times k_0} $ & ${\bf w}_4^{\mathfrak{p}_4} \sim \mathbb{R}^{16\times6\times k_4\times k_4}$ & $\overset{\curvearrowleft \, t_{\text{d}} \, \mathfrak{p}_4^\prime }{{\bf w}_{4\,\,\,\,\,\,\,\,}} \sim \mathbb{R}^{6\times 16\times k_4^\prime \times k_4^\prime} $ \\[0.3em]
\hline
\end{tabular}
\end{center}
The purely geometrical transformation from~${\bf w}_4^{\mathfrak{p}_4}$ to~$\overset{\curvearrowleft \, t_{\text{d}} \, \mathfrak{p}_4^\prime }{{\bf w}_{4\,\,\,\,\,\,\,\,}}$ is explained in a \hyperlink{convTransf}{dedicated paragraph} and illustrated in an animation of Fig.~\ref{fig:convForwardBackward}.}
\label{table:networkParams}
\end{table}
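\noindent Before diving into the derivations of the next section, let us make the shared pattern of table~\ref{table:networkParams} concrete with a minimal sketch of the parameter gradients of a fully connected layer, i.e.~$\partial \mathcal{L}_\text{batch} / \partial {\bf w} = {\bf A}^t \Delta$ and~$\partial \mathcal{L}_\text{batch} / \partial {\bf b} = \sum_s \Delta$, written with naive list-based matrices (the helper names are ours and chosen purely for illustration):
\begin{verbatim}
import Data.List (transpose, foldl')

type Matrix = [[Double]]   -- row-major: one row per minibatch sample

-- dL/dw = A^t Delta : contract the cached activations with the upstream
-- error over the minibatch dimension.
gradWeights :: Matrix -> Matrix -> Matrix
gradWeights a delta =
  [ [ sum (zipWith (*) aCol dCol) | dCol <- transpose delta ] | aCol <- transpose a ]

-- dL/db = sum_s Delta : column-wise sum of the upstream error over the samples.
gradBiases :: Matrix -> [Double]
gradBiases delta = foldl' (zipWith (+)) (map (const 0) (head delta)) delta
\end{verbatim}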
\newpage
\section{Gradient evaluation via backpropagation}
\label{backPropSection}
As discussed in the introduction, the key component behind the iterative training loop displayed in~Fig.~\ref{fig:iterativeProcedure} consists in being able to provide an explicit expression for the gradient~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ of the loss function with respect to the parameters~$\mathbfcal{P}$ of the neural network~$\mathcal{N}_\mathcal{P}$. This section offers an in-depth look at the fundamental mechanics of one such gradient-evaluation procedure, generally referred to as ``backpropagation''. For typical machine learning datasets, loss functions tend to have signatures of the following type:
\begin{equation*}
\mathcal{L}_\text{batch} (\mathbfcal{P}) : \,\, \mathbb{R}^{n_p} \rightarrow \mathbb{R} \quad \text{with} \quad n_p \gg 1
\end{equation*}
where a very high-dimensional parameter space~($n_p =$~\num[group-separator={,}]{44878} in our example network) is reduced to a scalar value. In this case, backpropagation is computationally efficient~\footnote{\label{backVsForward} To be compared with a straightforward computation of all partial derivatives of~$\mathcal{L}_\text{batch} (\mathbfcal{P})$ independently from each other, which requires~$\sim \mathcal{O}(n_p)$ evaluations of~$\mathcal{N}_\mathcal{P}$. Such ``forward mode'' implementations of differentiation are efficient only for functions~$\mathbb{R}^n \rightarrow \mathbb{R}^m$ where the dimensionality of the output space is larger than that of the input space, i.e.~$n < m$.} since the gradient can be evaluated by a \colorbox{pink}{single forward/backward cycle} through the neural network, i.e., roughly speaking, with a time complexity on the order of only~2 evaluations of~$\mathcal{N}_\mathcal{P}$ (at the expense of \hyperlink{memBack}{memory consumption}). This section presents the logic of backpropagation in a generic way applicable to any network architecture and layer type, as done in tables~\ref{table:networkExample} and~\ref{table:networkParams} for our example network.
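\noindent Schematically, this single forward/backward cycle amounts to one forward sweep that caches every intermediate activation, followed by one backward sweep over the layers in reverse order. The following self-contained Haskell sketch illustrates the idea (the \texttt{Layer} record and field names are again hypothetical, and parameter gradients are omitted for brevity):
\begin{verbatim}
import Data.List (scanl')

type Tensor = [[Double]]

-- A layer exposes a forward map A_{i-1} -> A_i and a backward map that turns
-- the upstream error Delta_i into Delta_{i-1} using the cached A_{i-1}.
data Layer = Layer { fwd :: Tensor -> Tensor, bwd :: Tensor -> Tensor -> Tensor }

-- Forward sweep: cache the whole list of activations [A_0, A_1, ..., A_{n_l}].
forwardAll :: [Layer] -> Tensor -> [Tensor]
forwardAll layers a0 = scanl' (flip fwd) a0 layers

-- Backward sweep: starting from the topmost error, walk the layers in reverse,
-- combining each upstream error with the cached activation of the layer below.
-- The result lists all error terms [Delta_0, ..., Delta_{n_l}].
backwardAll :: [Layer] -> [Tensor] -> Tensor -> [Tensor]
backwardAll layers activations topError =
  scanr (\(layer, aPrev) delta -> bwd layer aPrev delta) topError
        (zip layers (init activations))
\end{verbatim}
Both sweeps traverse the list of layers exactly once, which is the source of the~$\sim 2$ evaluations quoted above, at the cost of keeping the cached activations in memory.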
\paragraph{How to start? Bootstrap with loss function \& softmax} Let us start by recognizing that, by definition, the total derivative of~$\mathcal{L}_\text{batch}$ is given by:
\begin{eqnarray}
\text{d} \mathcal{L}_\text{batch} &=& \nabla_\mathbfcal{P} \mathcal{L}_\text{batch} \cdot \text{d} \mathbfcal{P} \nonumber \\
&\downarrow& \colorbox{light-blue}{gradient as a vector of partial derivatives for the~$n_\ell$ layers; see footnote~\ref{noteGrad}} \nonumber \\
\text{d} \mathcal{L}_\text{batch} &\equiv& \sum_{p=1}^{n_\ell} \frac{\partial \mathcal{L}_{\text{batch}}}{\partial \mathbfcal{P}_p} \cdot \text{d}\mathbfcal{P}_p \label{eq:gradientFormal}
\end{eqnarray}
\newcommand\tempboxBootStrap{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
\text{d} \mathcal{L}_\text{batch} &= \sum_\text{samples} \text{d} {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{using eq.~\eqref{totalVectorToScalar}} \\
&= \sum_\text{samples} \nabla {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \ominus \text{d} {\bf Y}_\text{pred} \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{using eq.~\eqref{rowWise::Froebenius}} \\
& = \nabla {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \cdot \text{d} {\bf Y}_\text{pred}
\end{align*}
\end{minipage}}
\noindent As we will discover in this section, all the components of~$\nabla_\mathbfcal{P} \mathcal{L}_\text{batch}$ can be extracted by an intuitive pattern matching process that operates by recursively going backwards layer-by-layer through~$\mathcal{N}_\mathcal{P}$. \\
\noindent Accordingly, let us start with the definition of the cross-entropy loss function in~eq.(\ref{eq:batchLoss}) and begin evaluating~$\text{d} \mathcal{L}_\text{batch}$ at the output level of the neural network:
\begin{align}
\text{d} \mathcal{L}_\text{batch} &= \sum_\text{samples} \text{d} {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \phantom{zzzzzzzzzzzzzzzz} \nonumber \\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxBootStrap}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}}\\
& = \nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right) \cdot \, \text{d} {\bf Y}_\text{pred} \label{totalLossBatch}
\end{align}
\newcommand\tempboxGradLoss{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
\nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} ( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} ) &= \nabla_\mathbfcal{P} \left( - {\bf Y}_\text{gt} \ominus \log {\bf Y}_\text{pred} \right) \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{since, at this stage, all $\mathbfcal{P}$ dependence is contained in ${\bf Y}_\text{pred}$} \\
&= \nabla_{\bf{Y}_\text{pred}} \left( - {\bf Y}_\text{gt} \ominus \log {\bf Y}_\text{pred} \right) \\
&= \left(
\begin{matrix}
\nabla_{{\bf y}^1_\text{pred}} \sim \mathbb{R}^{n_c} \\
\vdots \\
\nabla_{{\bf y}^n_\text{pred}} \sim \mathbb{R}^{n_c}
\end{matrix} \right)
\left(
\begin{matrix}
- {\bf y}^1_\text{gt} \cdot \log {\bf y}^1_\text{pred} \sim \mathbb{R} \\
\vdots \\
- {\bf y}^n_\text{gt} \cdot \log {\bf y}^n_\text{pred} \sim \mathbb{R}
\end{matrix} \right) \sim \mathbb{R}^{n \times n_c} \\
&= \left(
\begin{matrix}
\left( \frac{\partial}{\partial y^1_1} , \cdots , \frac{\partial}{\partial y^1_{n_c}} \right)_\text{pred} \\
\vdots \\
\left( \frac{\partial}{\partial y^n_1} , \cdots , \frac{\partial}{\partial y^n_{n_c}} \right)_\text{pred}
\end{matrix} \right)
\left(
\begin{matrix}
- \sum_c ( y^1_c )_\text{gt} \log ( y^1_c )_\text{pred} \\
\vdots \\
- \sum_c ( y^n_c )_\text{gt} \log ( y^n_c )_\text{pred}
\end{matrix} \right) \\
&= - \left(
\begin{matrix}
(y^1_1)_\text{gt} / (y^1_1)_\text{pred} & \dots & (y^1_{n_c})_\text{gt} / (y^1_{n_c})_\text{pred} \\
\vdots &\vdots& \vdots \\
(y^n_1)_\text{gt} / (y^n_1)_\text{pred} & \dots & (y^n_{n_c})_\text{gt} / (y^n_{n_c})_\text{pred}
\end{matrix} \right) \\
&= - \left(
\begin{matrix}
y^1_1 & \dots & y^1_{n_c} \\
\vdots &\vdots& \vdots \\
y^n_1 & \dots & y^n_{n_c}
\end{matrix} \right)_\text{gt} \circ \left(
\begin{matrix}
1 / y^1_1 & \dots & 1 / y^1_{n_c} \\
\vdots &\vdots& \vdots \\
1 / y^n_1 & \dots & 1 / y^n_{n_c}
\end{matrix} \right)_\text{pred} \\
& = - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}}
\end{align*}
\end{minipage}}
\noindent The first thing to notice is that~$\text{d} \mathcal{L}_\text{batch}$ is expressed as a {\bf Frobenius product between an ``error''~\footnote{Besides the fact that this term is directly related to the loss function~${\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right)$, the origin of the naming convention as an ``error'' term will become evident \hyperlink{etymology}{later}.} term and the total derivative of a layer}. We will come back to this important observation below but, for now, let us push the calculation one step further. As specified in~table~\ref{table:networkExample} and explained in detail in section~\ref{sec:softmaxLayer}, the predicted probability distribution ${\bf Y}_\text{pred} = \text{softmax}\,{\bf A}$ is determined by applying the softmax function to the ``embedded'' representation~${\bf A}$. In general, this means that its total derivative~$\text{d}{\bf Y}_\text{pred}$ can be formulated in terms of~$\text{d}{\bf A}$ through the chain rule (its exact expression is provided in~eq.(\ref{eq:softMaxTotalDerivative}) as part of the relevant section dedicated to the softmax function). \\
\noindent In order to continue evaluating~$\text{d} \mathcal{L}_\text{batch}$, let us now consider the error term involving the cross-entropy loss function~$\ell_\mathbfcal{P}$ defined in~eq.(\ref{eq:crossEntropy}):
\begin{align*}
\nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} ( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} ) &= \nabla_\mathbfcal{P} \left( - {\bf Y}_\text{gt} \ominus \log {\bf Y}_\text{pred} \right) \phantom{zzzzzzzzzzzzzzzzzzzzzzzz} \\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxGradLoss}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}} \\
&= - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}}
\end{align*}
\newpage
\newcommand\tempboxGrad{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
\text{d} \mathcal{L}_\text{batch} &= \nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} ( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} ) \cdot \text{d}{\bf Y}_\text{pred} \\
&= \left( - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}} \right) \cdot \left( {\bf Y}_\text{pred} \circ \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \right) \\
&= \sum_{ij} \left( - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}} \right)^{ij} \cdot \left( {\bf Y}_\text{pred} \circ \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \right)_{ij} \\
&= - \sum_{ij} Y^{ij}_\text{gt} \, \frac{1}{Y^{ij}_\text{pred}} \, Y^{ij}_\text{pred} \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right)_{ij} \\
&= - \sum_{ij} Y^{ij}_\text{gt} \left( \text{d} {\bf A} - \widetilde{{\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right)_{ij} \\
& = - {\bf Y}_\text{gt} \cdot \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \\
&= {\bf Y}_\text{gt} \cdot \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } - {\bf Y}_\text{gt} \cdot \text{d}{\bf A}
\end{align*}
\end{minipage}}
\newcommand\tempboxIntermediate{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
{\bf Y}_\text{gt} \cdot \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } &= \sum_\text{samples} {\bf Y}_\text{gt} \ominus \left( \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \\
&= \sum_\text{samples} \left(
\begin{matrix}
y^1_1 & \dots & y^1_{n_c} \\
\vdots &\vdots& \vdots \\
y^n_1 & \dots & y^n_{n_c}
\end{matrix} \right)_\text{gt} \ominus \left(
\begin{matrix}
{\bf y}^1_\text{pred} \cdot \text{d} {\bf a}^1 & \dots & {\bf y}^1_\text{pred} \cdot \text{d} {\bf a}^1 \\
\vdots &\vdots& \vdots \\
{\bf y}^n_\text{pred} \cdot \text{d} {\bf a}^n & \dots & {\bf y}^n_\text{pred} \cdot \text{d} {\bf a}^n
\end{matrix} \right) \\
&= \sum_\text{samples} \left(
\begin{matrix}
\left(y^1_1 + \cdots + y^1_{n_c} \right)_\text{gt} {\bf y}^1_\text{pred} \cdot \text{d} {\bf a}^1 \\
\vdots \\
\left(y^n_1 + \cdots + y^n_{n_c} \right)_\text{gt} {\bf y}^n_\text{pred} \cdot \text{d} {\bf a}^n
\end{matrix} \right) \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{${\bf y}^s_\text{gt}$ disappears because of the OHE property $\sum_c (y^s_c)_\text{gt} = 1 $} \\
&= \sum_\text{samples} \left(
\begin{matrix}
{\bf y}^1_\text{pred} \cdot \text{d} {\bf a}^1 \\
\vdots \\
{\bf y}^n_\text{pred} \cdot \text{d} {\bf a}^n
\end{matrix} \right) = \sum_\text{samples} {\bf Y}_\text{pred} \ominus \text{d}{\bf A} \\
& = {\bf Y}_\text{pred} \cdot \text{d} {\bf A}
\end{align*}
\end{minipage}}
\noindent The next step consists in combining the expression above for~$\nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} ( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} )$ along with that of~$\text{d}{\bf Y}_\text{pred}$ derived in~eq.(\ref{eq:softMaxTotalDerivative}) and reproduced here as a reminder:
\begin{equation*}
\text{d}{\bf Y}_\text{pred} = {\bf Y}_\text{pred} \circ \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right)
\end{equation*}
together into the Frobenius product of~eq.(\ref{totalLossBatch}) in order to get:
\begin{align*}
\text{d} \mathcal{L}_\text{batch} &= \left( - {\bf Y}_\text{gt} \circ \frac{1}{{\bf Y}_\text{pred}} \right) \cdot \left( {\bf Y}_\text{pred} \circ \left( \text{d} {\bf A} - \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } \right) \right)\\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxGrad}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}} \\
&= {\bf Y}_\text{gt} \cdot \widetilde{ {\bf Y}_\text{pred} \ominus \text{d} {\bf A} } - {\bf Y}_\text{gt} \cdot \text{d}{\bf A} \\
& \makebox[0pt][l]{\makebox[0pt][r]{\tooltip****{\usebox{\switchbox}}{\tempboxIntermediate}[-\fboxsep,-\dimexpr\ht\switchbox+\dp\switchbox\relax]}} \\
&= \left( {\bf Y}_\text{pred} - {\bf Y}_\text{gt} \right) \cdot \text{d} {\bf A}
\end{align*}
Comparing the expression above for~$\text{d} \mathcal{L}_\text{batch}$ with that of~eq.(\ref{totalLossBatch}), we observe that the structure as a Frobenius product between an error term and the total derivative of a layer is preserved as we go from the level of the predicted probability distribution to that of the embedding layer. Namely:
\begin{eqnarray}
\text{d} \mathcal{L}_\text{batch} &=& \underbrace{\nabla_\mathbfcal{P} {\bf \ell}_\mathbfcal{P} \left( {\bf Y}_\text{gt} , {\bf Y}_\text{pred} \right)}_\text{upstream error} \cdot \, \text{d} \,\, \big( \underbrace{{\bf Y}_\text{pred}}_\text{current layer} \big) \nonumber \\
&\downarrow& \colorbox{light-blue}{our $1^\text{st}$ backward step through the network} \nonumber \\
\text{d} \mathcal{L}_\text{batch} &=& \overbrace{ \left( {\bf Y}_\text{pred} - {\bf Y}_\text{gt} \right) }^\text{downstream error} \cdot \, \text{d} \,\, \big( \overbrace{{\bf A}}^\text{previous layer} \big) \label{kickStartBack}
\end{eqnarray}
In other words, this first step in the evaluation of~$\text{d} \mathcal{L}_\text{batch}$ can be seen as going backwards through one layer of the neural network: we went from an expression involving~${\bf Y}_\text{pred}$ to a similar expression that now involves the preceding layer~${\bf A}$. In this process, the ``upstream'' error at the level of~${\bf Y}_\text{pred}$ has been modified into a new ``downstream'' expression at the level of~${\bf A}$. \\
\noindent \hypertarget{etymology}{At this point}, it is useful to make a connection with our example network by pattern matching~eq.(\ref{kickStartBack}) against the downstream error~$\Delta_i \equiv {\bf Y}_\text{pred} - {\bf Y}_\text{gt}$ and the embedding layer~${\bf A} \equiv {\bf A}_i$ with~$i=16$ inferred from table~\ref{table:networkExample}. As the difference between the predicted probability distribution and the ground-truth, the \colorbox{pink}{naming of~$\Delta_i$ as an ``error'' term is self-explanatory}. In summary, the backward pass starting at the level of the loss, through the softmax layer and back up to the embedding layer is given by:
\begin{empheq}[box={\backPropBox[{\bf cross-entropy \& softmax}: backward pass]}]{alignat=2}
\text{d} \mathcal{L}_\text{batch} &= \Delta_i \cdot \text{d}{\bf A}_i \label{bootStrapBackPropEmbed} \\
\Delta_i &= {\bf Y}_\text{pred} - {\bf Y}_\text{gt} \nonumber
\end{empheq}
More generally, $\Delta_i$ corresponds to the gradient of the loss function with respect to the data array~${\bf A}_i$. For consistency, {\bf we will continue to refer to the descendants of~$\Delta_i$ as generalized ``error'' terms}.
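\noindent As a quick numerical sanity check of~eq.(\ref{bootStrapBackPropEmbed}), the analytical error~$\Delta_i = {\bf Y}_\text{pred} - {\bf Y}_\text{gt}$ can be compared against a finite-difference estimate of~$\partial \mathcal{L} / \partial {\bf A}$ for a single sample (a throwaway sketch; the helper names are ours):
\begin{verbatim}
-- Cross-entropy of softmax(a) against a one-hot ground truth, for one sample.
softmax :: [Double] -> [Double]
softmax a = let e = map exp a in map (/ sum e) e

loss :: [Double] -> [Double] -> Double
loss yGt a = negate (sum (zipWith (*) yGt (map log (softmax a))))

-- Analytical error term derived above: Delta = y_pred - y_gt.
analytic :: [Double] -> [Double] -> [Double]
analytic yGt a = zipWith (-) (softmax a) yGt

-- Central finite-difference estimate of dL/da, one component at a time.
numeric :: [Double] -> [Double] -> [Double]
numeric yGt a =
  [ (loss yGt (bump i eps) - loss yGt (bump i (negate eps))) / (2 * eps)
  | i <- [0 .. length a - 1] ]
  where
    eps = 1.0e-6
    bump i h = [ if j == i then x + h else x | (j, x) <- zip [0 ..] a ]

main :: IO ()
main = do
  let a   = [1.2, -0.7, 0.3, 2.1]   -- arbitrary embedding row
      yGt = [0, 0, 1, 0]            -- one-hot ground truth
  print (analytic yGt a)
  print (numeric  yGt a)            -- agrees with the analytical result to ~1e-6
\end{verbatim}
Such a comparison is an easy way to catch sign or indexing mistakes when implementing the backward pass of a new layer.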
\newcommand\tempboxRecursion{%
\begin{minipage}{0.2\textwidth}%
\abovedisplayskip=0pt
\belowdisplayskip=0pt
\begin{align*}
\text{d} \mathcal{L}_\text{batch} &= \Delta_i \cdot \text{d} {\bf A}_i \\
&\centerwithin{\downarrow}{=} \colorbox{light-blue}{formal expansion of the total derivative $\text{d} {\bf A}_i$} \\