\documentclass[a4paper,12pt]{book}
\usepackage{amsmath}
\usepackage{index}
\usepackage{gensymb}
\usepackage[toc,page]{appendix}
\usepackage{hyperref}
\usepackage[semicolon,round,sort&compress,sectionbib]{natbib}
\usepackage{graphicx,accents}
\usepackage{chapterbib}
\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{subcaption}
%\usepackage[demo]{graphicx}
\pgfplotsset{width=10cm,compat=1.9}
\usepgfplotslibrary{external}
% Define \figref as abbreviation of "Figure~\ref{...}"
\newcommand\figref{Figure~\ref}
%\usetikzlibrary{positioning}
\usetikzlibrary{positioning, fit, arrows.meta, shapes, chains}
% used to avoid putting the same thing several times...
% Command \empt{var1}{var2}
\newcommand{\empt}[2]{$#1^{\langle #2 \rangle}$}
\makeindex
% for mathematical symbols
%\usepackage{amsfonts}
\usepackage{amssymb}
\begin{document}
\title{AI and Machine Learning mathematics with algorithmic implementation}
\author{Mohab Metwally}
\date{2020}
\maketitle
\tableofcontents
\chapter*{Introduction}
\addcontentsline{toc}{chapter}{Introduction}
\section{notation}
\begin{description}
\item Throughout the book I sometimes use subscripts without braces, superscripts with braces, or even curly braces to indicate vector element access, or row access in the case of a matrix.
\item $x \in R^{n_x}, y \in \{0,1\}$ form a training example $(x, y)$, where x is the input and y is the corresponding output.
\item therefore $M=\{(x^1, y^1), (x^2, y^2), ..., (x^{m}, y^{m})\}$ is the training set, and $M_{test}$ is the test set.
\item X is the input matrix of the training set, $X \in R^{n_x \times m}$, $X=\begin{vmatrix}X^1&X^2&...&X^{m}\end{vmatrix}$, where $X^{(i)}$ is the ith column, of length $n_x$, and can also be written as x, or $x^{(i)}$.
\end{description}
\chapter{Logistic Regression as a neural network}
\section{definitions}
\begin{description}
\item we need an estimate function $\hat{y}$ for the input x, and weight parameters w$\in R^{n_x}, b\in R$.
\item the logistic function is $\hat{y}=p(y=1|x)$, and can be defined as follows: $\hat{y}=\sigma(w^Tx+b)$, where the sigmoid function is defined by $\sigma(z)=\frac{1}{1+e^{-z}}$; notice that as $z \to \infty, \sigma \to 1$, and as $z \to -\infty, \sigma \to 0$.
\end{description}
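The sigmoid above can be written directly in code. The following is a minimal NumPy sketch (the function name and the test values are illustrative choices, not part of the text):
\begin{verbatim}
import numpy as np

def sigmoid(z):
    # element-wise logistic function: 1 / (1 + e^(-z))
    return 1.0 / (1.0 + np.exp(-z))

# sanity check: sigma(0) = 0.5, large |z| saturates toward 0 or 1
print(sigmoid(np.array([-10.0, 0.0, 10.0])))
\end{verbatim}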
\section{cost function}
\begin{description}
\item starting with an estimated linear forward model $\hat{y}$, we calculate the difference between our estimate and the real value $y$, and through optimization we try to minimize that difference, or loss/cost, via gradient descent, then we update our model's parameters.
\item the loss function measures the difference between the estimate $\hat{y}$ and $y$, and can be defined as the least squares $L(\hat{y}, y)=\frac{1}{2}(\hat{y} - y)^2$, but least squares leads to a non-convex cost function (with multiple local minima).
\item there are different loss functions, but the most suitable is one that maximizes the likelihood of the data. we can define $P(y|x^{(i)}, \theta) = h(x^{(i)},\theta)^{y^{(i)}}(1-h(x^{(i)},\theta))^{1-y^{(i)}}$; to increase the sensitivity to the training set we take the likelihood function as the loss, $L(\theta)=\prod_{i=1}^mP(y|x^{(i)}, \theta)$, see $\mathbf{(Appendix A)}$.
\item one final step in our model: as m gets larger, L tends to zero, so instead we define the average of the negative log-likelihood as our Cost function.
\item we multiply by -1 since the sum of the log-likelihood function is negative.
\item the Cost function is $J(\theta)=-\frac{1}{m}\sum_{i=1}^{m}\log\left(h(x^{(i)},\theta)^{y^{(i)}}(1-h(x^{(i)},\theta))^{1-y^{(i)}}\right)$
\item the loss function is defined as $L(\hat{y}, y)=-[y\log(\hat{y}) + (1-y)\log(1-\hat{y})]$, with $L\geq0$.
\item the cost function is defined as the average of the loss function $J(w,b)=\frac{1}{m}\sum_{i=1}^{m}L(\hat{y}^{(i)}, y^{(i)})$
\end{description}
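As a hedged sketch of the cost function above, assuming the predictions and labels are NumPy row vectors of length $m$ (the small constant added inside the logarithms is only a numerical guard, not part of the formula):
\begin{verbatim}
import numpy as np

def cross_entropy_cost(y_hat, y):
    # J = -(1/m) * sum( y*log(y_hat) + (1-y)*log(1-y_hat) )
    m = y.shape[-1]
    eps = 1e-12  # numerical guard against log(0)
    return -np.sum(y * np.log(y_hat + eps)
                   + (1.0 - y) * np.log(1.0 - y_hat + eps)) / m
\end{verbatim}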
\section{Gradient Descent}
\begin{description}
\item gradient descent is a way to tune the weighting parameters; the objective is to lean toward the fittest weights with respect to the least cost.
\item iterate through the cost function $\mathbf{J}$, tuning with respect to the weight parameters $\mathbf{w}$, $\mathbf{b}$.
\item iterate through: $w:=w-\alpha\frac{\partial{J}}{\partial{w}}$, $b:=b-\alpha\frac{\partial{J}}{\partial{b}}$, tuning w, b for the least $\mathbf{J}$ possible, where $\alpha$ is the learning rate of gradient descent.
\item for simplicity $\partial{J}/\partial{w}$ is abbreviated as $\partial{w}$, and similarly $\partial{J}/\partial{b}$ as $\partial{b}$.
\item by the chain rule, $$\partial w = \frac{\partial{J}}{\partial{L}}\frac{\partial{L}}{\partial{\hat{y}}}\frac{\partial{\hat{y}}}{\partial{z}}\frac{\partial{z}}{\partial{w}}$$ and similarly $$\partial b = \frac{\partial{J}}{\partial{L}}\frac{\partial{L}}{\partial{\hat{y}}}\frac{\partial{\hat{y}}}{\partial{z}}\frac{\partial{z}}{\partial{b}}$$
\item $$\partial{L}/\partial{\hat{y}}=\frac{-y}{\hat{y}} + \frac{1-y}{1-\hat{y}}$$, $$\partial{\hat{y}}/\partial{z}=\frac{e^{-z}}{(1+e^{-z})^2} = \hat{y}(1-\hat{y}).$$
\item $\partial{L}/\partial{z}=\hat{y}-y$.
\item then we can deduce that the final gradient descent step, after calculating the sigmoid, loss, and cost functions, is $$w:=w-\frac{\alpha}{m}\sum_{i=1}^m\frac{\partial{L}}{\partial{w}}=w-\frac{\alpha}{m}X(\hat{y}-y)^T$$ and $$b:=b-\frac{\alpha}{m}\sum_{i=1}^{m}(\hat{y}^{(i)}-y^{(i)})$$
\end{description}
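A minimal sketch of one gradient descent step under the shapes used in this book ($X$ of shape $(n_x, m)$, $y$ and $\hat{y}$ as $(1, m)$ row vectors, $w$ of shape $(n_x, 1)$); the function name and argument order are my own choices:
\begin{verbatim}
import numpy as np

def gradient_step(w, b, X, y, y_hat, alpha):
    # dw = (1/m) X (y_hat - y)^T, db = (1/m) sum(y_hat - y)
    m = X.shape[1]
    dw = X @ (y_hat - y).T / m
    db = np.sum(y_hat - y) / m
    return w - alpha * dw, b - alpha * db
\end{verbatim}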
\section {Model training}
\begin {description}
\item to train a logistic regression model given a data set $\{X,y\}$, we divide it into 20\% for testing, and 80\% for training, such that the training set is used to train our model parameters, and the testing set is a separate set to test our model's predictions.
\item we call $X=\{X_{training},X_{testing}\}, y=\{y_{training},y_{testing}\}$ the data set, and
$\{X_{training}, y_{training}\}$ the training set, and
$\{X_{testing}, y_{testing}\}$ the testing set.
\item using $X_{training}, y_{training}$ (for the rest of the chapter, and the book, I will refer to them as $X,y$ for simplicity) we start \textbf{Forward propagation} to estimate $\hat{y}$, then calculate the difference between $y$ and $\hat{y}$ through the \textbf{Cost function} $J(y,\hat{y})$.
\item going backward to W, b we implement \textbf{Backward propagation} through $\frac{\partial{J}}{\partial{\omega}}$, and $\frac{\partial{J}}{\partial{b}}$.
\item finally we \textbf{update the weight parameters}, repeating for sufficient iterations until the cost function is minimized.
\end {description}
\section{Forward Propagation}
\begin{description}
\item we begin by initializing our weight, and bias parameters
$\omega$, b randomly.
\subsection{Activation Functions}
\item an activation function is a non-linear function applied to the linear output $z$ of a neuron; without it, stacked layers would collapse into a single linear map.
\item sigmoid: $\sigma(z)=\frac{1}{1+e^{-z}}$, with output in $]0,1[$.
\item relu: $relu(z)=max(0,z)$.
\item tanh: $tanh(z)=\frac{e^{z}-e^{-z}}{e^{z}+e^{-z}}$, with output in $]-1,1[$.
\item Starting with the input training set \textbf{X} in our model, we estimate
$$z=\omega^TX + b,$$ then $$\hat{y}=\sigma(z).$$
\item calculate the cost function $$J(y,\hat{y}).$$
\end{description}
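A one-function sketch of the forward pass just described, under the same shape assumptions as before:
\begin{verbatim}
import numpy as np

def forward(w, b, X):
    # z = w^T X + b, y_hat = sigma(z); X is (n_x, m), w is (n_x, 1)
    z = w.T @ X + b
    return 1.0 / (1.0 + np.exp(-z))
\end{verbatim}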
\section{Backward Propagation}
\begin{description}
\item after evaluating the cost function $J(y,\hat{y})$ we calculate its derivative with respect to $\{\omega,b\}$.
\item $$\partial{\omega}= \frac{\partial{L}}{\partial{\omega}}=
\frac{\partial{L}}{\partial{\hat{y}}}
\frac{\partial{\hat{y}}}{\partial{z}}
\frac{\partial{z}}{\partial{\omega}}$$
\item $$
\frac{\partial{L}}{\partial{\hat{y}}} = -
\left(\frac{y}{\hat{y}} - \frac{1-y}{1-\hat{y}}\right) =
\frac{\hat{y}-y}{\hat{y}(1-\hat{y})}
$$
\item $$e^{-z}=\frac{1}{\hat{y}} - 1=\frac{1-\hat{y}}{\hat{y}}$$
\item $$
\frac{\partial{\hat{y}}}{\partial{z}} =
\frac{e^{-z}}{(1+e^{-z})^2} = (\hat{y})^2e^{-z}
$$
\item $$ \partial{\omega} = X(\hat{y} - y)^T $$
\item $$ \partial{b} = \hat{y} - y $$
\end{description}
\section{Update parameters}
\begin{description}
\item we implement the following algorithm with a fixed number of iterations that is customized per application and tuned by the engineer; each application requires different tuning parameters, one of which is the iteration number.
\item we iterate the following: update the parameters $\omega$, b using the $\partial{\omega}$, $\partial{b}$ obtained in the back propagation step.
\item $$\omega = \omega - \frac{\alpha}{m}X(\hat{y}-y)^T$$
\item $$b = b - \frac{\alpha}{m}\sum_{i=1}^{m}(\hat{y}^{(i)}-y^{(i)})$$
\end{description}
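The following sketch ties the forward propagation, backward propagation, and update steps together into one plain-NumPy training loop; the iteration count, learning rate, and zero initialization are illustrative defaults, not prescriptions from the text:
\begin{verbatim}
import numpy as np

def train_logistic_regression(X, y, alpha=0.01, num_iterations=1000):
    # X: (n_x, m) inputs, y: (1, m) labels in {0, 1}
    n_x, m = X.shape
    w = np.zeros((n_x, 1))
    b = 0.0
    for _ in range(num_iterations):
        # forward propagation
        y_hat = 1.0 / (1.0 + np.exp(-(w.T @ X + b)))
        # backward propagation
        dw = X @ (y_hat - y).T / m
        db = np.sum(y_hat - y) / m
        # update parameters
        w -= alpha * dw
        b -= alpha * db
    return w, b
\end{verbatim}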
\section{Summary}
\begin{description}
\item
\end{description}
\section{Logistic Regression in Python}
\begin{description}
\item
\end{description}
\section{References}
\begin{description}
\item
\end{description}
\chapter{Neural Networks}
\section {Lingua franca}
\begin{itemize}
\item RELU Activation Function: It turns out that using the Tanh function in hidden layers is far better than sigmoid (because of the zero mean of the function). The Tanh activation function's range is [-1,1] (a shifted, rescaled version of the sigmoid function). A disadvantage of the sigmoid and Tanh functions is that if the input is too small or too large, the slope will be near zero, which slows down gradient descent. RELU stands for rectified linear unit; it is a rectifier activation function and can be defined as $f(x)=x^+=max(0,x)$ or $\begin{cases} 0 & x\leq 0 \\ x & x > 0 \end{cases}$. RELU has shown to be a better replacement for the sigmoid function $\sigma$ because it helps with the vanishing gradient problem.
\item Neuron: a linear regression unit denoted by z, or a, $z= W^TX + b$, such that W is the weight vector of the network.
\item Shallow Layers: also known as Hidden Layers, a set of neurons. For example, for a network composed of input $X$ and output $Y$, with at least a single layer $L1$ and at most 2 layers, forward propagation is as follows: we calculate the logistic function for the first layer $(1)$, $z^{1}_i=w^TX_i+b_i$, $\hat{Y}^{(1)}=\sigma{(z^{1}_i)}$, then we proceed to calculate the final logistic evaluation for the output layer with $\hat{Y}^{(1)}$ as input instead of $X$, and so on, replacing $X$ with $\hat{Y}^{(i)}$ as the new input at each step.
\item Layer: layer $L_{(i)}$ is $\hat{Y}^{(i)}=[\hat{y}^{(i)}_1, \hat{y}^{(i)}_2, ..., \hat{y}^{(i)}_n]$ such that n is the length of the layer $L_{(i)}$. each $\hat{y}^{(i)}_{j}$ is weighted with unique weight vector with previous layer $L_{(i-1)}$.
\item Neural Network(NN): is a set of interconnected layers, $<X, L_1, L_2, ..., L_m, Y>$
\item Deepness: a shallow network, as defined above, consists of 1-2 hidden layers; a Deep Network, on the other hand, consists of more than 2 inner, or hidden, layers.
\end{itemize}
\begin{description}
\item we discussed in the previous chapter that $\frac{\partial{\hat{y}}}{\partial{z}}$ was derived for the logistic activation function $\sigma$ only; we need to calculate the same derivative for tanh, and RELU.
\item for $tanh(z)=\frac{e^{z}-e^{-z}}{e^{z}+e^{-z}}$ is $\frac{\partial{\hat{y}}}{\partial{z}}=1-tanh(z)^2$
\item for relu activation function $\frac{\partial{\hat{y}}}{\partial{z}}=\begin{cases}0 & z<0\\1 & z\ge0\end{cases}$
\item in NN there are plenty of parameters to worry about, for example the weights need to be initialized randomly, with small values, and $b$ can be initialized as zero.
\end {description}
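A small sketch of the activation derivatives listed above (ReLU's derivative at zero is set to 1 here, matching the convention in the text):
\begin{verbatim}
import numpy as np

def tanh_grad(z):
    # d/dz tanh(z) = 1 - tanh(z)^2
    return 1.0 - np.tanh(z) ** 2

def relu(z):
    return np.maximum(0.0, z)

def relu_grad(z):
    # 0 for z < 0, 1 for z >= 0
    return (z >= 0).astype(float)
\end{verbatim}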
\section{Model training}
\begin{description}
\item Training a deep neural network is analogous to training a logistic regression network, as discussed in previous chapter, a logistic regression model can be considered a neural network with zero \textbf{hidden layers}.
\item We follow the same algorithm of parameter initialization, but in this case we initialize the parameters for each layer. Assume a network composed of 2 hidden layers $X \rightarrow L1 \rightarrow L2 \rightarrow Y$; layers $(L^{(i)}, L^{(i-1)})$ are interconnected with weight and bias parameters $\omega^{(i)},b^{(i)}$, such that $\omega^{(i)}$ is a matrix of shape $(length(L^{(i)}), length(L^{(i-1)}))$, and $b^{(i)}$ is a vector of shape $(length(L^{(i)}), 1)$, so each node in the layer $L^{(i)}$ is connected with each node in the previous layer $L^{(i-1)}$.
\item W, b ought to be randomly initialized. In the logistic regression discussed in the previous chapter, W, b could even be initialized with zero values, but in the case of a neural network zero initialization makes all the nodes share the same weights, which violates the purpose of the nodes in the first place, namely to capture as many features from the data set as possible; so random initialization is necessary. To avoid overshooting, initialization is better done in the range $]0,1[$ scaled by a small value around 0.01 (this will be discussed in detail in the NN hyperparameters chapter) to reduce the sensitivity, so that gradient descent does not take forever.
\item forward and back propagation are executed in a similar way to logistic regression, with minor differences.
\item the forward propagation calculation uses inputs from the previous layer instead of the input data set; the activation function can be \textbf{sigmoid}, \textbf{relu}, or \textbf{tanh}. It has been found that the \textbf{relu} activation in the first layers shows better results, with a \textbf{sigmoid} activation in the last layer to fit the output classification y-vector in the range $[0,1]$.
\end{description}
\section{Parameter initialization}
\begin{description}
\item Iterating through each layer, with $L^i$ a layer of $l_i$ nodes and the previous layer $L^{i-1}$ of $l_{i-1}$ nodes, the weight parameter at layer $L^i$ is inter-connected with all nodes in the previous layer, meaning that layer $L^i$ ought to have a weight matrix of shape $(l_i, l_{i-1})$, and a bias of shape $(l_i,1)$.
\item Unlike logistic regression, in a neural network the initialization step is crucial: in logistic regression there is only a single activation node extracting a single feature, such as the price of a house for example, but we intend to employ neural networks to capture as many features as possible inside each node, and therefore initialization ought to use random values, otherwise we end up with identical copies of each node during forward and backward propagation.
\item and since we choose random $W^i$ we can choose $b^i$ to be zero.
\subsection{Xavier initialization}
\begin{description}
\item randomizing the weight parameters from a Gaussian, or normal, distribution, demeaned around zero and appropriately scaled, helps to keep the initial values contained, or in other words mitigates the vanishing/exploding gradient problem; this is shown to reduce variance (we will discuss this in detail later on)
\end{description}
\end{description}
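A hedged sketch of the initialization described above: random weights of shape $(l_i, l_{i-1})$ scaled by a small constant, zero biases of shape $(l_i, 1)$; the dictionary layout and the seed are my own conventions:
\begin{verbatim}
import numpy as np

def initialize_parameters(layer_dims, scale=0.01, seed=0):
    # layer_dims = [n_x, l_1, ..., l_L]
    rng = np.random.default_rng(seed)
    params = {}
    for i in range(1, len(layer_dims)):
        # Xavier-style alternative: scale by sqrt(1 / layer_dims[i - 1])
        params["W" + str(i)] = rng.standard_normal(
            (layer_dims[i], layer_dims[i - 1])) * scale
        params["b" + str(i)] = np.zeros((layer_dims[i], 1))
    return params
\end{verbatim}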
\section{Forward Propagation}
\begin{description}
\item similar to logistic regression forward propagation, done for each layer, with the small difference that instead of using the sigmoid function $\sigma$ we replace it with the \textbf{relu} function, which shows better results, and activate the last layer with the sigmoid function to push the output toward the extremes.
\item the forward propagation step is done on the input $A^{i-1}$ from previous layer, with current layer $L^i$ parameters $\omega^i$, $b^i$.
\item we iterate through layers $L^i$ such that $i \in [0,L]$
\item $$z^i=\omega^i A^{(i-1)}+b^i$$
\item $$
A^i=\begin{cases}
\text{relu}(z^i) & i < L \\
\sigma(z^i) & i = L
\end{cases}
$$
\end{description}
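A sketch of the layer-by-layer forward pass above (relu for hidden layers, sigmoid for the last); caching each layer's $z$ and $A$ for later use in back propagation is an implementation choice of this sketch:
\begin{verbatim}
import numpy as np

def forward_propagation(X, params, L):
    # A^0 = X; A^i = relu(W^i A^{i-1} + b^i) for i < L, sigmoid for i = L
    A = X
    cache = [(None, X)]
    for i in range(1, L + 1):
        Z = params["W" + str(i)] @ A + params["b" + str(i)]
        A = 1.0 / (1.0 + np.exp(-Z)) if i == L else np.maximum(0.0, Z)
        cache.append((Z, A))
    return A, cache
\end{verbatim}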
\section{Backward Propagation}
\begin{description}
\item $$
\frac{\partial{L}}{\partial{\omega^i}}=
\frac{\partial{L}}{\partial{A^{(i)}}}
\frac{\partial{A^{(i)}}}{\partial{z^i}}
\frac{\partial{z^i}}{\partial{\omega^i}}
$$
\item $$\partial{A^{(i-1)}}=
\frac{\partial{L}}{\partial{z^i}}
\frac{\partial{z^i}}{\partial{A^{(i-1)}}}=
\partial{z^i}\,\frac{\partial{z^i}}{\partial{A^{(i-1)}}}
$$
\item since the last activation function is sigmoid, $$\partial{z^L}=\hat{y}-y$$
\item the back propagation algorithm starts with the following initialization step:
\item $$ \partial{A^{(L)}} = \frac{(\hat{y}-y)}{\hat{y}(1-\hat{y})} $$
\item $$\partial{A^{(L-1)}} = (\omega^{(L)})^T(\hat{y}^{(L)}-y^{(L)})$$
\item $$ \frac{\partial{z^i}}{\partial{A^{(i-1)}}} = \omega^i $$
\item $$\partial{A^{(i-1)}}=(\omega^i)^T\partial{z^i}$$
\end{description}
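A sketch of the backward pass that mirrors the equations above, starting from $\partial z^L = \hat{y} - y$ and propagating $\partial A^{(i-1)} = (\omega^i)^T \partial z^i$; it assumes the cache layout of the forward sketch shown earlier:
\begin{verbatim}
import numpy as np

def backward_propagation(y, params, cache, L):
    # y and A^L have shape (1, m); returns dW^i, db^i per layer
    grads = {}
    m = y.shape[1]
    dZ = cache[L][1] - y                         # dz^L = y_hat - y
    for i in range(L, 0, -1):
        A_prev = cache[i - 1][1]
        grads["dW" + str(i)] = dZ @ A_prev.T / m
        grads["db" + str(i)] = np.sum(dZ, axis=1, keepdims=True) / m
        if i > 1:
            dA_prev = params["W" + str(i)].T @ dZ
            dZ = dA_prev * (cache[i - 1][0] >= 0)  # relu derivative
    return grads
\end{verbatim}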
\section{Summary}
\begin{description}
\item
\end{description}
\section{Deep neural networks in Python}
\begin{description}
\item
\end{description}
\chapter {Neural Networks hyperparameters}
\begin{description}
\item machine learning has a wide range of applications, from Computer Vision, Natural Language Processing, and Speech Recognition to structured data in data science, and the parameterization varies greatly from one domain to another; it's domain specific.
\subsection{training}
\item given a data set and a wide range of models, the data is divided into three categories: the \textbf{training}, \textbf{development}, and \textbf{testing} sets.
\item we train different models and evaluate the most fit model on the development set (also called the evaluation set); after choosing our model we test it on the testing set.
\item the partitioning ratio varies with the size of the data set; for example, with a small data set of 1000 entries it's convenient to divide the data into 70\%, 30\% for training and test respectively, or 60\%, 20\%, 20\% for training, evaluating, and testing respectively, but in the era of big data, of millions of entries, the data set is conveniently divided into (98\%, 1\%, 1\%) or (99.5\%, 0.4\%, 0.1\%) for the training, evaluation, and testing sets respectively; the ratio varies from one domain to another.
\item for faster and more accurate training/testing, the data-set ought to come from the same source with the same general range of features; on the other hand, with a mismatched data-set, where the training set comes from a specific source and the development/test set from a different one, the development/testing process can be inefficient, so the partitioned sets need to be drawn randomly from the same source.
\subsection{bias-variance trade-off}
\item In an n-dimensional training model, the bias-variance dimension indicates how well our model captures the data; models can be classified into under-fitting, high-bias models that hardly fit the data, on the other extreme high-variance, over-fitting models, and in between the just-right model.
\item high-variance: in a classification problem where the training set error is 1\% while the dev set error is 11\%, this gap indicates that the model is well trained on a very narrow data-set with a limited range of features quite different from that of the development set; in this case the model is over-fitting, or hardly scaling to other data-sets, and therefore fails in application.
\item high-bias: similarly, in the same experiment, if the training set error is around 15\% while the dev-set error is around 16\%, this indicates that the model generalizes much better than the previous high-variance, over-fitting case, but the high error itself indicates that the model is highly biased, although well generalized.
\item high-bias, high-variance: the worst model is one that neither generalizes well nor fits our data, with a high training error, say 15\%, and a 30\% development error.
\item low-bias, low-variance: the ideal model is one of low bias and low variance, which fits the training set and generalizes well.
\subsection{recipes for high-bias, high-variance}
\item high-bias solution: getting high bias means that our model doesn't capture sufficient features; this can be because our network is too shallow or too narrow, or because gradient descent needs to run for a longer time, and it can be addressed through optimization or by changing the neural network architecture.
\item high-variance solution: high variance indicates a lack of generalization, and is reduced by training on a wider data-set, regularization, or changing the NN architecture.
\subsection {Regularization (weight decay)}
\item in the case of high variance, we need our model to generalize better over the data-set; one way to do so is through regularization, or reducing the sensitivity of the weight parameters; this sensitivity is measured in the back-propagation step through $\partial{J}/\partial{w}$, and we can vary it via a regularization parameter $\lambda$ added to the cost function as follows:
$$ J(w^1,w^2,...,w^L,b^1,b^2,...,b^L) = \frac{1}{m}\sum_{i=1}^{m}{L(\hat{y}^{(i)}, y^{(i)})} + \frac{\lambda}{2m} \sum_{l=1}^{L} {\left\Vert w^{l} \right\Vert^2} + \frac{\lambda}{2m}\sum_{l=1}^{L}(b^{l})^2 $$
\item but usually the last term is quite ineffective in regularization, and is ignored, so it cuts down to the following:
\item $$ J(w^1,w^2,...,w^L,b^1,b^2,...,b^L) = \frac{1}{m}\sum_{i=1}^{m}{L(\hat{y}^{(i)}, y^{(i)})} + \frac{\lambda}{2m} \sum_{l=1}^{L} {\left\Vert w^{l} \right\Vert^2} $$
\item where the norm is the Frobenius norm:
$$ \left\Vert w^{(l)} \right\Vert^2 = \sum_{i=1}^{n^{(l)}} \sum_{j=1}^{n^{(l-1)}} (w_{i,j}^{(l)})^2 $$
\item backward propagation step:
\item $$ \frac{\partial{J}}{\partial{w^{l}}} = \frac{1}{m}\sum_{i=1}^{m}{\frac{\partial{L}}{\partial{w^{l}}}} + \frac{\lambda}{m} w^{l} $$
\item looking at our mechanism now, $\lambda$ is a device for tuning the weight parameter, or rather as a valve to control it, the larger the $\lambda$ is the lower the w as we minimize our cost function.
\subsection {Inverted Dropout Regularization}
\item instead of reducing the weight parameters, we can drop activations out completely by setting their values to zero.
\item implementation algorithm: generate a random mask by comparing uniform random values against a keep probability $P_{keep}$ and apply it to the activation matrix $A^{(i)}$; to keep the expected activation values balanced, the result is re-scaled as $A^{(i)}/P_{keep}$, and this last step distinguishes inverted dropout from normal dropout, making the inverted version more efficient, as in the sketch below.
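\item a minimal sketch of this inverted dropout step, where $A$ is a layer's activation matrix; the keep probability of 0.8 and the returned mask (kept so the same mask can be applied to $\partial A$ in back propagation) are illustrative choices:
\begin{verbatim}
import numpy as np

def inverted_dropout(A, p_keep=0.8, rng=None):
    # zero each activation with probability 1 - p_keep, then rescale by p_keep
    rng = np.random.default_rng(0) if rng is None else rng
    mask = (rng.uniform(size=A.shape) < p_keep).astype(float)
    return (A * mask) / p_keep, mask
\end{verbatim}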
\subsection{Input Normalization}
\item since there are no constraints on the input features, each feature can have a different range, and the cost function can end up elongated in one dimension and narrow in another, steep on one edge and smooth on the other; it can be visualized by taking a contour of the \textbf{J} function, which will look like a very long ellipse.
\item with un-normalized input, gradient descent tends to take a long time and a longer way down to the local minimum, but if the input is normalized (demeaned, and divided by the standard deviation of the input) we end up with circular contours, and a smoother, faster gradient descent.
\subsection{Vanishing/Exploding gradients}
\item let's look inside a deep network of L layers, with L large: $\hat{y}=(W^LW^{(L-1)}W^{(L-2)}...W^1)X$ if we assume $b=0$ for simplicity. If the weight matrices have entries of similar magnitude, and the values of each $W^i$ matrix are below 1, then the overall $(W^LW^{(L-1)}W^{(L-2)}...W^1)$ term will be very small, and gradient descent will take forever (vanishing); on the other hand, if those entries are greater than 1, then the overall term will be huge, and gradient descent will overshoot (exploding).
\item a well chosen initialization parameters can Speed up the convergence of gradient descent, and Increase the odds of gradient descent converging to a lower training (and generalization) error.
%TODO name the formula, and elaborate.
\item to solve the vanishing/exploding gradients problem we take two steps in the weight parameter initialization: first, the values are chosen at random from a normal distribution; secondly, the variance of $w^i$ is set to $\frac{1}{k}$, such that k is the number of neurons in layer $i-1$, by multiplying $w^i$ by $\sqrt{\frac{1}{k}}$; this ratio is found to fit better with relu activation if replaced with $\sqrt{\frac{2}{k}}$.
\item back to the inverted dropout algorithm: we fix a threshold $P_{keep}$ and compare against it each entry of a random mask drawn from the uniform distribution on [0,1] (unlike the weights, which we choose at random from the normal/Gaussian distribution), keeping only the activations whose mask entry falls below $P_{keep}$.
\item in the back propagation step we run the same masking over $\partial{A^{l}}$ and divide it by the same factor.
\item if $A^{l}=\frac{mask(A^{l})}{P_{keep}}=\frac{A^{l}.*A_{mask}}{P_{keep}}$, then $\partial{A^{l}}=\frac{\partial{A^{l}}.*A_{mask}}{P_{keep}}$ note that $.*$ means element-wise multiplication.
\subsection {gradient checking}
\item we verify that our gradient implementation is correct, i.e. that gradient descent is descending along the true gradient.
\item it's a method to verify that the analytic gradient is within a diminishing error $\epsilon$ of the numerical derivative of the cost $J(\theta)$, where $\theta$ is a vector of the unrolled weight and bias parameters.
\item if the following condition on the Euclidean distance ($L_2$ norm), with $\epsilon = 10^{-7}$, holds, then the check passes (our gradients are correct): $$
\frac{
\left\Vert\frac{\partial{J(\theta^i)}}{\partial{\theta^i}} - \frac{\nabla {J(\theta^i)}}{\nabla{\theta^i}}\right\Vert
}
{
\left\Vert\frac{\partial{J(\theta^i)}}{\partial{\theta^i}}\right\Vert +
\left\Vert\frac{\nabla{J(\theta^i)}}{\nabla{\theta^i}}\right\Vert } < \epsilon $$
\end{description}
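A sketch of the gradient check described above, using a two-sided numerical derivative; here \texttt{cost\_fn} is assumed to be a callable returning $J(\theta)$ for an unrolled parameter vector $\theta$:
\begin{verbatim}
import numpy as np

def gradient_check(cost_fn, theta, analytic_grad, eps=1e-7):
    # two-sided numerical gradient, compared with the analytic one
    numeric = np.zeros_like(theta)
    for i in range(theta.size):
        plus, minus = theta.copy(), theta.copy()
        plus[i] += eps
        minus[i] -= eps
        numeric[i] = (cost_fn(plus) - cost_fn(minus)) / (2.0 * eps)
    diff = np.linalg.norm(analytic_grad - numeric) / (
        np.linalg.norm(analytic_grad) + np.linalg.norm(numeric))
    return diff  # the check passes when diff is on the order of 1e-7 or less
\end{verbatim}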
\section {Optimization}
\begin {description}
\item machine learning is a highly iterative process: to choose a low-bias, low-variance model from the set of trained/tested models, the iteration for each model runs through the same m data-set entries and L layers, and this process is time consuming, so optimized algorithms are essential.
\subsection{stochastic, mini-batch, and batch gradient descent}
\item let's consider the input data-set $X$ of an image-classification neural network, using big data of 5 million images, each with a resolution of $800\times400$ and three color channels; then our input matrix X would be of size $5M\times800\times400\times3$ bytes that need to be loaded into the training machine's memory, that is around 4.37 terabytes!
\item to be able to train our model we divide our input set into mini-batches. If we choose the batch size to be 1, meaning we run gradient descent on a single input entry, without taking advantage of vectorization and the machine's GPU, then it's called \textbf{stochastic gradient descent}: a \textbf{mini-batch} of size 1. Even with input from the same distribution, the cost function hardly settles into the local minimum, and tends to take a random walk for each input; it's hard to predict its behavior, but over the whole training set it converges to the neighbourhood of the local minimum. On the bright side it can run on a moderate computer, but it takes longer compared to \textbf{batch gradient descent}, which makes full use of vectorization and GPU power.
\item if we take the benefits of both worlds and choose a \textbf{mini-batch} size that fits in our training machine's GPU/CPU memory, we can run our model in the optimal time possible; so instead of running the whole (X,Y) of size $m$ through our model at once, we divide the training data-set (X,Y) into mini-batches $(X^{\{t\}}, Y^{\{t\}})$, each of size $\frac{m}{\text{number of batches}}$.
\end{description}
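A sketch of splitting a training set into shuffled mini-batches under the column-wise layout used in this book ($X$ of shape $(n_x, m)$, $Y$ of shape $(1, m)$); the batch size of 64 is only an example:
\begin{verbatim}
import numpy as np

def make_mini_batches(X, Y, batch_size=64, rng=None):
    # shuffle the m columns, then slice them into consecutive mini-batches
    rng = np.random.default_rng(0) if rng is None else rng
    m = X.shape[1]
    perm = rng.permutation(m)
    X, Y = X[:, perm], Y[:, perm]
    return [(X[:, t:t + batch_size], Y[:, t:t + batch_size])
            for t in range(0, m, batch_size)]
\end{verbatim}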
\section{Gradient descent with momentum}
\begin{description}
\item In the case of mini-batch gradient descent, or in slow gradient descent, the descent tends to overshoot and deviate from the shortest path; to reduce overshooting we employ exponentially weighted averages of the gradients (see appendix (D) on exponentially weighted averages) as follows: $$ Vdw^i = \beta_1 Vdw^{i-1} + (1-\beta_1) dw^i $$
\item then we use $Vdw^i$ in update parameters step, $w^i = w^i - \alpha Vdw^i$.
\end{description}
\section{RMSprop}
\begin{description}
\item In a gradient descent that oscillates strongly around the axis of convergence, we can employ the `root mean square prop' (RMSprop) algorithm, which also uses weighted averages; here is the formula $$ Sdw^i = \beta_2 Sdw^{i-1} + (1-\beta_2) (dw^i)^2 $$ the only difference here is that the last term is squared. This is useful when the gradient oscillates along an axis perpendicular to the converging axis: squaring boosts a relatively large value and shrinks a relatively small one, and in the update step we divide by the square root of this accumulated value. If $dw$ is relatively small we can compensate by increasing the learning rate $\alpha$ to take larger steps; if $dw$ is relatively large, the division shrinks the step for a more stable gradient descent.
\item the last step is: $$ w^i = w^i - \alpha \frac{dw^i}{\sqrt{Sdw^i + \epsilon}} $$.
\end{description}
\section{Adaptive Momentum estimation (Adam)}
\begin{description}
\item It's not always the case that an optimization algorithm that works on a first application will scale well on a second, so a combination of different optimization algorithms works best.
\item the Adam algorithm combines both RMSprop and momentum gradient descent:
$$ w^i = w^i - \alpha \frac{Vdw_{corrected}^i}{\sqrt{Sdw_{corrected}^i + \epsilon}} $$
$$ b^i = b^i - \alpha \frac{Vdb_{corrected}^i}{\sqrt{Sdb_{corrected}^i + \epsilon}} $$
(look up appendix D for further explanations on bias correction)
\subsection {Hyperparameters}
\begin{description}
\item the overall number of parameters is increasing: \begin{itemize}
\item $\alpha$: need to be tuned.
\item $\beta_1$:set to 0.9 (can be tuned).
\item $\beta_2$: set to 0.999 (can be tuned).
\item $\epsilon$: set to $10^{-8}$.
\end{itemize}
\end{description}
\end{description}
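A sketch of a single Adam update for one parameter array, combining the momentum term $V$ and the RMSprop term $S$ with bias correction; the default hyperparameter values follow the list above, and the function signature is my own convention:
\begin{verbatim}
import numpy as np

def adam_update(w, dw, V, S, t, alpha=0.001,
                beta1=0.9, beta2=0.999, eps=1e-8):
    # t is the 1-based step counter used for bias correction
    V = beta1 * V + (1.0 - beta1) * dw          # momentum average
    S = beta2 * S + (1.0 - beta2) * dw ** 2     # RMSprop average
    V_corr = V / (1.0 - beta1 ** t)
    S_corr = S / (1.0 - beta2 ** t)
    w = w - alpha * V_corr / np.sqrt(S_corr + eps)
    return w, V, S
\end{verbatim}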
\section{Learning rate decay}
\begin{description}
\item at the start of gradient descent, it makes large leaps toward the minimum, but as it converges, it can converge faster if $\alpha$ is tuned to get smaller over time.
\item in gradient descent, passes over the data are called epochs, with index $epoch_{num}$; given a decay rate $decay_{rate}$, we initialize the learning rate to a relatively large value $\alpha_0$, then set our decaying learning rate to: $$\frac{1}{1+decay_{rate}*epoch_{num}}\alpha_0$$
\item there are different variations of learning rate decay of the form scale$\,\cdot\,\alpha_0$, for example:
$$0.95^{epoch_{num}}\alpha_0 $$
\end{description}
\section{Tuning parameters}
\begin{description}
\item so far our highly optimized neural network is composed of many parameters left to the designer to tune (ordered by priority):
\begin{enumerate}
\item learning rate \textbf{$\alpha$}.
\item number of hidden units.
\item momentum averaging constant \textbf{$\beta$}.
\item mini-batch size.
\item number of layers \textbf{L}.
\item learning rate decay initial value \textbf{$\alpha_0$}.
\item ADAM's $\beta_1$, $\beta_2$, $\epsilon$.
\end{enumerate}
\item so we have 7 parameters so far; trying every permutation of those 7 variables, even over a narrow range of discrete values, can take forever. An old practice was to pick permutations on a grid, pairing hyper-parameter $i$ against hyper-parameter $j$ so that each cell maps two different parameter values; but since we are solving a stochastic search problem, it's more efficient to sample the values at random in both dimensions, so that each sampled value of hyper-parameter $i$ is tested against a different value of hyper-parameter $j$ at each point; this opens up more possibilities.
\subsection{coarse to fine search}
\subsection{panda vs caviar training approaches}
\begin{description}
\item depending on the computational capacity available you choose between the panda and caviar approaches: training a single model at a time and tuning its parameters every few epochs, in case you have limited computational resources, is called the panda approach; on the other hand, if you have sufficient power to run different models with different parameters simultaneously, this is called the caviar approach (those terms were coined by Andrew Ng in a Coursera specialization).
\end{description}
\end{description}
\section{Batch Normalization}
\begin{description}
\item in a previous chapter we discussed input normalization; for the same reasons we can apply the same method to each layer's input before the activation step, then re-scale by learned parameters $\lambda,\beta$: $$
z_{norm}^{[l]} = \frac{z^{[l]} - \mu({z^{[l]}})}{\sqrt{\sigma^2+\epsilon}}, \quad l \in [1,L] $$
$$\widetilde{z^{[l]}} = \lambda z_{norm}^{[l]} + \beta $$
\item but note that if $(\lambda,\beta)=(\sqrt{\sigma^2+\epsilon}, \mu)$ then $\widetilde{z^{[l]}} = z^{[l]}$ exactly, undoing the normalization; here $\lambda, \beta$ are learned parameters, updated during training.
\item update $\lambda, \beta$ in the update step:
$$ \beta^{[l]} = \beta^{[l]} -\alpha d\beta^{[l]} $$
$$ \lambda^{[l]} = \lambda^{[l]} -\alpha d\lambda^{[l]} $$
\item with batch normalization in place we end up with the following: $$ z_{norm}^{[l]} = \frac{z^{[l]} - \mu({z^{[l]}})}{\sqrt{\sigma^2+\epsilon}}, \quad l \in [1,L], $$ and $$\widetilde{z^{[l]}} = \lambda z_{norm}^{[l]} + \beta; $$ if we look closely we find that $\beta$ also serves the purpose of the bias, or shifting, term \textbf{b} in the forward step $z^{[l]} = w^TX^{\{i\}} + b$; therefore we can set \textbf{b} to zero, or remove this term.
\subsection{Covariate Shift}
\item if our data-set comes from a specific distribution, and we cut through the neural network at layer l, then all the inputs $a^{[l-1]}$ feeding layer l depend on the tuned parameters, the gradients, and basically our input data; hence if $a^{[l-1]}$ falls into a specific neighbourhood for mini-batch $X^{\{i\}}$, for a different mini-batch $X^{\{j\}}$ it can shift greatly into a different area. Batch normalization helps to muzzle these shifts, by mapping the output of the previous layer $l-1$ into a normalized range, hence keeping the output more controlled across different mini-batches.
\subsection{Batch normalization as a regularization technique}
\item analogous to regularization, each mini-batch is scaled by its own mean and variance, which has the effect of adding noise to the network layers' outputs, similar to the dropout technique; therefore it can be considered a regularization technique.
\subsection{Batch normalization on test sets}
\item at test time you may need to run the model on a single example, so for the calculation of $\mu, \sigma^2$ there are two approaches: those values can be taken from the final mini-batch, or through exponential/running weighted averages computed over each mini-batch's statistics during training.
\end{description}
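A sketch of the batch-normalization forward transform for one layer's pre-activations $Z$ of shape $(n^{[l]}, m)$; the learned scale and shift are named \texttt{gamma} and \texttt{beta} here (the text calls the scale $\lambda$, but \texttt{lambda} is a reserved word in Python):
\begin{verbatim}
import numpy as np

def batch_norm_forward(Z, gamma, beta, eps=1e-8):
    # z_norm = (z - mean) / sqrt(var + eps);  z_tilde = gamma * z_norm + beta
    mu = np.mean(Z, axis=1, keepdims=True)
    var = np.var(Z, axis=1, keepdims=True)
    Z_norm = (Z - mu) / np.sqrt(var + eps)
    return gamma * Z_norm + beta
\end{verbatim}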
\section{Multi-class classification}
\begin{description}
\item so far we discussed a \textbf{sigmoid} activation over \textbf{$\hat{y}$}; what if our model's \textbf{y} is multi-class, and y isn't a single number but a vector of length n?
\item if we run, for example, sigmoid or relu on a vector rather than a real number, the sum of the resulting entries will be an arbitrary number rather than a probability distribution.
\subsection{Softmax activation}
\item softmax is a probabilistic activation function, run over $z^{[L]}$, defined as: $$
a^{[L]}=\frac{e^{z^{[L]}}}{\sum_{i=1}^{n^{[L]}}t_i},$$ where $t=e^{z^{[L]}} $.
\item the resultant $\sum_{i=1}^{n^{[L]}}{a_i^{[L]}}=1$, where $n^{[L]}$ is the number of neurons, or nodes in the final layer, which is the same as the number of classes at output \textbf{y}.
\end{description}
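A sketch of the softmax activation over the final layer's $z^{[L]}$, computed column-wise for a batch; subtracting the column maximum is only a numerical-stability detail, not part of the formula above:
\begin{verbatim}
import numpy as np

def softmax(z):
    # a_i = exp(z_i) / sum_j exp(z_j), per column of z with shape (n_L, m)
    t = np.exp(z - np.max(z, axis=0, keepdims=True))
    return t / np.sum(t, axis=0, keepdims=True)
\end{verbatim}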
\section{Support Vector Machine (SVM)}
\begin{description}
\item so far the cost function with regularization is $$\frac{1}{m}\left[\sum_{i=1}^m y^{(i)}\left(-\log(h_{\theta}(x^{(i)}))\right) + (1-y^{(i)}) \left(-\log(1-h_{\theta}(x^{(i)}))\right)\right] + \frac{\lambda}{2m}\sum_{j=1}^n\theta_j^2 $$
\item a historical and widely used alternative, yet similar, algorithm is the support vector machine, or SVM; but first let's delve deeper into the sigmoid function.
\item \begin{tikzpicture}
\begin{axis}
%\addplot[color=red]{log(x)};
\addplot[color=red]{1/(1+exp(-x))};
\legend{$\sigma(x)$}
\end{axis}
\end{tikzpicture}
%Here ends the furst plot
\hskip 5pt
%Here begins the 3d plot
%\begin{tikzpicture}
% \begin{axis}
% \addplot3[surf,] {1/(1+exp(-x))};
% \legend{$\sigma(x)$}
%\end{axis}
%\end{tikzpicture}
\item to get $y=1$ we want $z\gg0$; on the other end, for $y=0$ we want $z\ll0$.
\item if we look more generally at the logistic cost function, when $y=1$ the equation reduces to $$\frac{1}{m}\left[\sum_{i=1}^m -\log(h_{\theta}(x^{(i)}))\right] + \frac{\lambda}{2m}\sum_{j=1}^n\theta_j^2,$$ and we can define $cost_1=-\log(h_{\theta}(x^{(i)}))=-\log(\frac{1}{1+e^{-\theta^Tx^{(i)}}})$
%%
\item \begin{tikzpicture}
\begin{axis}
%TODO fix this graph!!
\addplot[red,smooth] {-ln(1/(1+exp(-x)))};
\addlegendentry{$-log(\frac{1}{(1+e(-\theta^TX)))}$}
\addplot[blue, smooth] {(\x<=1) * (\x*-1) };
\addlegendentry{$cost_1$}
\end{axis}
\end{tikzpicture}
\begin{tikzpicture}
\begin{axis}
%TODO fix this graph!!
\addplot[red,smooth] {-ln(1-1/(1+exp(-x)))};
\addlegendentry{$-log(1- \frac{1}{(1+e(-\theta^TX)))}$}
\addplot[blue] {(\x>=-1) * (\x*1) };
\addlegendentry{$cost_0$}
\end{axis}
\end{tikzpicture}
%%
\item $cost_1$: the piecewise-linear approximation of the $-\log(h_{\theta}(x^{(i)}))$ term; it is zero when y=1 and $\theta^Tx^{(i)}\geq1$
\item $cost_0$: the piecewise-linear approximation of the $-\log(1-h_{\theta}(x^{(i)}))$ term; it is zero when y=0 and $\theta^Tx^{(i)}\leq-1$
\item the penalization term: instead of $\lambda$, a constant $C \approx \frac{1}{\lambda}$ is placed on the first term; the averaging by m can be ignored since we multiply by the large value C, and it can also be dropped from the regularization term, since $\theta$ is adjusted through the first term; note that the regularization term is just the squared Euclidean norm $\lVert \theta \rVert_2^2$
\item the final SVM objective: $$C\sum_{i=1}^m\left[y^{(i)}cost_1(\theta^Tx^{(i)}) + (1-y^{(i)}) cost_0(\theta^Tx^{(i)})\right] + \frac{1}{2} \lVert \theta \rVert_2^2$$
\item and the intuition for this is that the first term is trained such that when $y=1$ then $cost_1=0$, and when $y=0$ then $cost_0=0$; thus the first term is driven to zero, and the SVM shrinks down to minimizing $$\frac{1}{2} \lVert \theta \rVert_2^2.$$
\subsection{Large Margins in linear Decision Boundaries}
\begin{description}
\item it turns out that SVMs do a pretty good job at drawing a decision boundary for a given data-set. Assume we classify class A against class B in the given data-set; in linear classification the SVM draws a line between the two blobs of data, with nearly equal margins on both sides. The reason lies in the hypothesis constraints on $z=\theta^Tx$: for $y=1$ we require $z\geq1$, and for $y=0$ we require $z\leq-1$, so the model is trained to classify classes A, B outside the large margin $[-1,1]$. Also note that $\theta^Tx^{(i)}$ is the dot product (degree of alignment) between the weight parameter and the training example $x^{(i)}$; since z is required to lie outside the defined range, z needs to be a large value, meaning that $\vec{x}$ and $\vec{\theta}$ need to be aligned, otherwise z shrinks toward zero at a perpendicular angle of $90^{\degree}$. If, for example, $\vec{\theta}$ is inclined with respect to $\vec{x^{(i)}}$ by some angle, then $\vec{\theta}$ needs to be pretty large in order for z to lie outside $]-1,1[$; but since we use the penalizing factor $C$, $\vec{\theta}$ is kept at a reasonable value, and thus the SVM model converges to the optimal decision boundary with large margins.
\end{description}
\subsection{Non-linear Decision Boundaries}
\begin{description}
\item the previous hypothesis $\theta^Tx$ is linear; in the case of non-linear features, for example extracting faces from pictures, the problem is no longer separable by a linear boundary but rather requires non-linear features from an n-dimensional example of n features. For example $ f(x) = \theta_1x_1 + \theta_2x_2+ \theta_3x_1^2 + \theta_4x_2^2 + \theta_5x_1x_2 + \theta_6x_1^2x_2^2 + \dots $ can be a good representation for capturing the non-linear boundary of training examples $x^{(i)}$ for all i in m; for a problem with a large number of features this equation can be very expensive to train, so instead choosing an appropriate polynomial of degree k that suits our problem's decision boundary can be easier to train.
\subsection{Gaussian kernel}
\begin{description}
\item for non-linear boundaries, where class A is surrounded by the scattered points of class B, the most appropriate function to use is the normal, or Gaussian, distribution function $$f(x)=\frac{1}{\sqrt{2\pi}\sigma}e^{-\frac{(x-\mu)^2}{2\sigma^2}},$$ yet instead of $\mu$ we use landmark (locality) points (we will discuss this in the convolutional neural network chapter), and since we deal with vectors, the squared term in the exponent becomes the squared Euclidean distance $\lVert x^{(i)} - l^{(j)} \rVert^2$.
\item so for each training example $x^{(i)}$: if it lies in the neighbourhood of the landmark point $l^{(j)}$, for i,j $\in$ m (with computational cost $O(m^2)$), then $f\approx1$; otherwise, if the distance is too far, $f\approx0$. Thus the SVM with Gaussian kernel hypothesis is: $$C\sum_{i=1}^m\left[y^{(i)}cost_1(\theta^Tf^{(i)}) + (1-y^{(i)}) cost_0(\theta^Tf^{(i)})\right] + \frac{1}{2} \lVert \theta \rVert_2^2$$
\end{description}
\end{description}
\end{description}
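A sketch of computing the Gaussian-kernel features $f^{(i)}$ of one example against a set of landmark points, as described above; the value of $\sigma$ and the row-wise layout of the landmarks are illustrative assumptions:
\begin{verbatim}
import numpy as np

def gaussian_kernel_features(x, landmarks, sigma=1.0):
    # f_j = exp(-||x - l^(j)||^2 / (2 sigma^2)), one landmark per row
    d2 = np.sum((landmarks - x) ** 2, axis=1)
    return np.exp(-d2 / (2.0 * sigma ** 2))
\end{verbatim}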
\section{Unsupervised Learning (Introduction)}
\begin{description}
\item in \textbf{supervised learning}, given a structured data-set of input X and output Y, the model has to be trained to map X $\rightarrow$ Y; in \textbf{unsupervised learning}, given only the input X, the model has to be trained to extract similarities between input examples, thus classifying them into different coherent classes. For example in NLP, the k-means clustering algorithm can be used to group word embeddings by similarity; another, more interesting, example is the identification of galaxies and stars from astronomical pictures; yet another example is market segmentation. Even when the data doesn't fit into separate coherent clusters, but extends over a long range, it can be divided into K classes/clusters.
\subsection{k-mean algorithm}
\begin{description}
\item given an input $X=\{x^1,x^2,\dots,x^m\}$, first randomly sample the K centroids (cluster centroids) from X; second is the assignment step, in which for each example $x^i$, for all i $\in$ m, $c^{(i)}$ is assigned to the centroid with the shortest distance, thus taking a value in $[1,K]$, so if $x^i$ is closest to the $k^{\textbf{th}}$ centroid then $c^{(i)}=k$; third, and final, is the centroid movement step, in which each centroid $\mu_k$, for k $\in$ K, is relocated to the average of all points assigned to it, as in the sketch below.
\item the cost function $$ J=\frac{1}{m}\sum_{i=1}^m\lVert x^{(i)} - \mu_{c^{(i)}} \rVert^2 $$
\item to avoid falling into a local optimum, multiple runs of the algorithm with different random initializations are advised; at the end the run with the lowest cost is chosen.
\item finally, if the number of clusters is unknown and set arbitrarily, it needs to satisfy the constraint $K \leq m$; whenever a centroid isn't assigned any example, K can be reduced by the number of unassigned centroids, or the number of clusters can be plotted against the cost function and we choose the pivotal k where the slope of the cost changes significantly, if one exists; this is called the \textbf{Elbow method}.
\end{description}
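\item as a hedged sketch of the three steps above (random initialization, assignment, centroid movement), assuming one example per row of $X$; the iteration count is an arbitrary default:
\begin{verbatim}
import numpy as np

def k_means(X, K, num_iters=100, rng=None):
    rng = np.random.default_rng(0) if rng is None else rng
    # initialization: K centroids sampled from the examples
    centroids = X[rng.choice(X.shape[0], K, replace=False)].astype(float)
    for _ in range(num_iters):
        # assignment step: index of the closest centroid for each example
        d = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        c = np.argmin(d, axis=1)
        # centroid movement step: mean of the points assigned to each centroid
        for k in range(K):
            if np.any(c == k):
                centroids[k] = X[c == k].mean(axis=0)
    return centroids, c
\end{verbatim}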
\subsection{Principal Component Analysis (PCA)}
\begin{description}
\item another technique that will come in useful later on in the NLP chapter is dimensionality reduction: with a high-dimensional input data-set with potentially redundant features, reducing the dimensions can be useful both for understanding the data and for getting rid of redundancy. So for a training data set with m training examples of n features, let's assume that the input X $\in$ $R^{n}$ is projected into a smaller subspace $R^{k}$, ignoring the bias feature term for simplicity.
\item a straightforward algorithm is Principal Component Analysis, or PCA: for input features $X_{m\times{n}}$, $\Sigma=X^TX$ is the input to the singular value decomposition (SVD) algorithm, which factors it into $\textbf{U }\textbf{S }\textbf{V}$, with U holding the eigenvectors spanning the n-dimensional space, and S the eigenvalues multiplied by $I_n$; the orthogonal matrices U, V have the property that any subset of U's columns, for example the first k columns denoted $U_k$, satisfies $U_k^TU_k=I_k$; therefore the projection of X onto the k-dimensional subspace spanned by $U_k$ is $XU_k$.
\item the reduction shouldn't lose much of the useful data, so it's wise to pick k such that the loss of variance is less than a negligible value $\epsilon$; this can be estimated by the sum of the retained eigenvalues over the total sum over all n dimensions, that is $$\frac{\sum_{i=1}^kS_{ii}}{\sum_{i=1}^nS_{ii}} \geq 1-\epsilon.$$
\end{description}
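\item a minimal sketch of the PCA projection above, using NumPy's SVD on $\Sigma=X^TX$ and returning both the reduced data and the retained fraction of variance:
\begin{verbatim}
import numpy as np

def pca_reduce(X, k):
    # X is (m, n); project onto the first k columns of U
    Sigma = X.T @ X
    U, S, Vt = np.linalg.svd(Sigma)
    variance_retained = S[:k].sum() / S.sum()
    return X @ U[:, :k], variance_retained
\end{verbatim}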
\end{description}
\chapter{Structuring machine learning}
\section{Machine learning strategy}
\begin{description}
\item we discussed in the previous chapter how and when to use optimization and regularization techniques; in practice, such decision making is crucial, and can save months of development time if we know how to choose the right direction and make the right decisions to help our model work more effectively. Doing regularization when the system needs optimization, as in the case of high bias, is a time-consuming mistake, and spending months getting more data for a highly biased model can be a total waste of time.
\section{orthogonality}
\item the tuning process isn't done arbitrarily; we must know which hyper-parameters to tune to get the desired effect.
\item when modeling a process or a problem with many features, we need to be able to fit our model to capture all the data-set's features. Within the set of hyper-parameters, it's easier if we can tune one parameter and affect only its corresponding purpose, without an avalanche effect; for example, in a car we can adjust the speed separately from the steering: although steering and speed are related in a single function, we can change one without affecting the other, hence the name \textbf{orthogonality}.
\item do well on the training set, usually to a level comparable to human performance depending on the application; otherwise change the network to a different architecture, use a more efficient optimization algorithm, i.e. ADAM, etc.
\item do well on the dev-set, in that order: if the model is not fitting the dev set, we have high variance, and we can use regularization or get a larger data-set.
\item fit the test-set well on the cost function: if we can't get our model to fit the test-set either, then we probably over-tuned the model on the dev-set, and we need to re-tune with a bigger dev-set.
\item perform well in the application, finally and in this order: if we can't capture the real application features in our model, then the dev/test sets aren't quite representative, or we need to change the cost function.
\section{Set up your Goal}
\begin{description}
\subsection{Evaluation metric}
\subsection{Precision vs accuracy}
\item accuracy/recall: numerically, the fraction of the real positive examples that our predictions catch, i.e. how close our predictions are, on average, to reality.
\item on the other hand, precision is how many of our network's positive predictions are actually true, or in other words how far our predictions are what they claim to be.
\item precision and accuracy/recall are orthogonal metrics, but there is a trade-off between them in application; in the training process it's advised to train different models simultaneously (if possible), plot their precision/accuracy metrics, and choose the best of all; getting high precision in one model and high accuracy in another can't by itself indicate which model is better.
\subsection {F1-score}
\item this metric is the harmonic mean of precision \textbf{P} and accuracy/recall \textbf{R}: $$\frac{2}{\frac{1}{P} + \frac{1}{R}}$$
\item a single metric that combines how accurate and how precise our model is can be more efficient to work with.
\subsection {Satisfying-Optimizing metric}
\item we might have a precise and accurate model, as measured by the F1-score, yet it may not satisfy our practical application; in reality we are limited by time and computational resources, and we need a high \textbf{F1-score} relative to the available resources. A more general metric is then a weighted average of the F1-score and our resource metrics; for example, if we are developing a recognition network for a phone, we need the system to be both accurate/precise and also under a specific time threshold $T$, so our evaluation metric can be $F1\text{-}score*0.8+T*0.2$.
\end{description}
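\item as a small sketch, the F1-score can be computed from the confusion counts (true positives, false positives, false negatives); the argument names are my own:
\begin{verbatim}
def f1_score(tp, fp, fn):
    # harmonic mean of precision P = tp/(tp+fp) and recall R = tp/(tp+fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2.0 / (1.0 / precision + 1.0 / recall)
\end{verbatim}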
\section{train/dev/test sets}
\begin{description}
\item as discussed in the previous chapter, the ratio of the data-set partition into train, dev, and test sets depends on the data-set size and the application, in accordance with the bias/variance scale.
\item having a large enough training set can reduce variance, a sufficiently large dev set gives us a deep perspective into the bias-variance scale, and finally the test set is essential to evaluate how far our low-bias, low-variance model fits on the same distribution of data.
\subsection {dev/test metric}
\item in the case of a very domain-specific training algorithm, for example a classification network targeting a very specific class $c_i$ out of the set of different classes C in our data-set of size n, we need to train our model to be aware of the different classes and to distinguish between objects, rather than do cat/not-cat binary classification, but also to classify our target object efficiently enough; we can add a weighting parameter to the cost function evaluation: $$ J(w,b)=\frac{1}{\sum_{k=1}^{C_k}{w_k}} \sum_{i=1}^{m}w_k L(\hat{y},y) $$ where $w_k$ is the weight corresponding to each class $\begin{cases}w_1 \leftarrow \hat{y}=c_1\\w_2 \leftarrow \hat{y}=c_2\\...\\w_n \leftarrow \hat{y}=c_n\end{cases}$
\end{description}
\item it's important to note that if the distribution of the practical input-set has different features than our train/dev/test distribution, our model cannot be guaranteed to do well in practice.
\end{description}
\section{Human-level performance}
\begin{description}
\item in the past few years machine learning has surpassed human cognitive levels in different areas, and the performance of machines today overshadows that of humans; over the past decades the accuracy of machines has been increasing steadily, up to a level slightly higher than humans'.
\item as long as the machine's level is lower than that of humans, we can boost it with human-labeled data, gain better insight from error analysis, and analyse bias/variance better, so as to increase its cognitive abilities.
\subsection{human(bayes) error}
\item as the machine's level rises close to the humans', we can add another level to the train/dev/test cycle, namely the human-level error; the gap from the human/Bayes level to the training error is called the \textbf{avoidable bias}, and the difference in error between training and development is the \textbf{variance}.
\item we can use avoidable bias and variance as metrics for how well our model performs, and optimize our model and network to reach human-level performance and minimize that error.
\item comparing those two metrics tells us where to optimize; for example if the avoidable bias is 4\% and the variance error is 10\%, then it's wiser to resolve the over-fitting of the model before pushing toward human-level performance.
\item human-level performance itself is really loose, comparable to how well our model is trained to achieve a low variance error; humans differ in their levels of cognitive ability, naturally fitting a normal distribution, but if an average human is compared to a guru in the specific domain, then a guru of sufficient skill and training is able to achieve a lower Bayes error, and the latter varies greatly; for example in radiology, an average human is expected to have a higher error of diagnosis and recognition compared to a skilled doctor, and the latter compared to a team of doctors, so choosing a specific human level from the domain of human cognition is another problem.
\end{description}
\section{Error analysis}
\begin{description}
\item taking the right decisions on what to invest your time in while optimizing a model can be a great leverage, and can save months, if not years, of engineering time! Error analysis is just one of those decisions.
\item statistical analysis of the mismatched predictions $\hat{y}$ at test time is coined the name \textbf{Error analysis}.
\item Assume for example that error analysis over a sample of mismatched predictions shows that false negatives are 50\% of them; then it would be wise to spend time enhancing our model's predictions, and cut that 50\% down to a reasonable ratio, perhaps 2\%.
\item for example, in a classifier network, out of N mismatched predictions some are actually what we expect (of being a specific class) but the model classified otherwise; from the N mismatched predictions we take, say, 100 entries and analyse the false negatives. If the ratio turns out to be 50\%, for example, then we are on the right track to regard this as a bias problem and invest time in it; on the other hand, if out of the 100 entries only 3\% were false negatives, then it's not our priority to invest time down this road; achieving such high accuracy is actually an indication that we might not be able to do much better, and it's wiser to move on and spend time optimizing our model to get low false negatives on other classes from our error analysis, or to train our model on certain features that might be mis-classified.
\subsection{Mislabeled data}
\item another perspective in error analysis is to record the mislabeled input data. it turns out that deep learning is quite robust against a small amount of randomly mislabeled data, so with big data of millions of entries, if the mislabeled examples are randomly distributed over the data-set classes and stay below an acceptable error, it's really unwise to spend months fixing mislabeled data, since deep learning proves to be robust against such errors.
\item so far there are three metrics in error analysis to consider: first the overall dev-set error, second the ratio of each error category, and thirdly the ratio of mislabeled data. comparing the three metrics can give us insight to help enhance our model and reduce the overall error. if the overall error is already acceptable compared to the human level, we still have the other two metrics to choose from; and if the prediction error (overall error minus mislabeled error) on the dev-set is what decides between models, and the improvement carries over to the test error, then it can be wise to spend time on the mislabeled input data.
\end{description}
\section{Mismatched training dev/test sets}
\begin{description}
\item it turns out that in practice the model's input usually tends to be noisier than the training/dev/test input, which is filtered; for example in computer vision the data-set can come from a common distribution of high-resolution, close-proximity, centered objects, while user input can be blurry and of low resolution, or in speech recognition we train our model on clear audio, yet in practice there will be stutter, background noise, etc.
\item there is a trade-off between mixing input data from different distributions and accuracy; data from a different distribution can really help, but under the following criterion: the dev/test sets need to aim at our target, meaning they are chosen entirely from the target data, while the training data-set can be a shuffle of different distributions.
\item in this scenario a new kind of error is introduced. consider that the training error is 1\% and the dev error is 10\%: is it possible to tell whether our model is over-fitting or whether the gap is the result of mismatched data from different distributions? to help make that decision, the data-set is divided into training/train-dev/dev/test sets, where the train-dev and training sets come from the same shuffled distributions, while the dev/test sets come from our target data. addressing the same problem, if the train error is 1\%, the train-dev error is 9.5\%, and the dev error is 10\%, then it's a variance problem; but if the train-dev error is 1.5\% and the development error is 10\%, then our model is suffering from mismatched data.
\item lastly, and as previously discussed, if the test error is, say, 15\% while the dev error is 10\%, then our model (tuned against a dev-set of the same target distribution as the test set) is over-fitting the dev-set, and to fix this we ought to increase the dev-set size.
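\item the four-way split diagnosis above can be written as a minimal sketch (the numbers are the illustrative ones from the text):
\begin{verbatim}
# compare the gaps: same-distribution gap (variance) vs. cross-distribution gap
def mismatch_diagnosis(train_err, train_dev_err, dev_err):
    variance = train_dev_err - train_err   # same distribution, unseen data
    mismatch = dev_err - train_dev_err     # target distribution vs. training one
    return "variance problem" if variance > mismatch else "data mismatch problem"

print(mismatch_diagnosis(0.01, 0.095, 0.10))  # -> "variance problem"
print(mismatch_diagnosis(0.01, 0.015, 0.10))  # -> "data mismatch problem"
\end{verbatim}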
\subsection{Addressing data mismatch}
\item in case of data mismatch when training on a shuffle of distributions, in other words when there is a wide error difference between the train-dev set and the dev set, this is an indication of the gap between the data-sets drawn from different distributions.
\item a close examination of that difference through error analysis helps in taking the right decision to reduce it, such as choosing training data from a distribution closer to the target data.
\subsection{Artificial data synthesis}
\item if the target data-set is limited, as is usual in practice, then given a target data-set of size m it's possible through artificial data synthesis techniques to increase this size to 100m, or even as large as 1000m; but in this case it's possible to run into an over-fitting problem if the original material used for synthesis isn't representative enough of the real distribution (for example, re-using the same few background-noise clips over and over).
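\item a minimal sketch of such synthesis for the speech example, assuming arrays of clean utterances and background-noise clips of equal length (the mixing factor and shapes are made up):
\begin{verbatim}
import numpy as np

def synthesize(clean, noise, mix=0.3, copies=10):
    """Mix each clean clip with randomly chosen background-noise clips."""
    out = []
    for x in clean:                                   # clean: (m, T) array
        for _ in range(copies):
            n = noise[np.random.randint(len(noise))]  # noise: (k, T) array
            out.append(x + mix * n)
    return np.stack(out)                              # ~copies * m synthetic examples
\end{verbatim}
if the noise set is tiny, the model can over-fit those few clips even though the synthesized set looks large.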
\end{description}
\section{Multiple tasks learning}
\begin{description}
\subsection{Transfer learning}
\item when we train a model for a very domain-specific application with a very limited data-set, we can employ weight transfer, also known as transfer learning: pick a network pre-trained on the closest general domain to our domain-specific application, randomly re-initialize the last layer's weights $w^{[L]}$, and leave the rest of the weights as they are; then, using the new domain-specific data-set $(X,Y)$, the old $w^{[l]}$ for all layers $l<L$, and the randomly initialized $w^{[L]}$, we can adapt the old model to the new task very quickly.
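\item a minimal Keras sketch of this recipe, assuming an ImageNet-pre-trained ResNet50 as the ``closest general domain'' and a small labeled set (X, Y) with 5 classes; the names and sizes are illustrative:
\begin{verbatim}
import tensorflow as tf

n_classes = 5                               # size of the new, domain-specific label set

# pre-trained general-domain network, with its original last layer dropped
base = tf.keras.applications.ResNet50(weights="imagenet", include_top=False,
                                       pooling="avg", input_shape=(224, 224, 3))
base.trainable = False                      # keep the transferred weights frozen

# only the new last layer's weights are randomly initialized and trained
model = tf.keras.Sequential([
    base,
    tf.keras.layers.Dense(n_classes, activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# model.fit(X, Y, epochs=...)               # train on the small domain-specific set
\end{verbatim}
with more domain data available, earlier layers can be unfrozen and fine-tuned as well.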
\subsection{Multitask learning}
\item in transfer learning we make use of previously trained weights to train our model on a new, different task, usually when there isn't sufficient data for the second task; in other cases we need to extract several outputs from the same example with the same model. previously we discussed the softmax activation function for training a multi-class classifier, where we are satisfied with a single prediction $\hat{y}$ as a sparse one-hot vector; in a multi-task model, for example object detection in a video where we need to extract boundaries around a set of n objects $\{o_1,o_2,...,o_n\}$ with a bounding box surrounding each object, the label $Y$ is a vector with more than a single entry, one per task. in this case we employ what is called Multitask learning; the following chapter deals with such a case.
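\item a minimal sketch of a multi-task output layer, in contrast to a softmax one (the sizes are illustrative): each task gets its own sigmoid output, so several tasks can be ``on'' for the same example:
\begin{verbatim}
import tensorflow as tf

n_tasks = 4                     # e.g. pedestrian / car / stop sign / traffic light
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu", input_shape=(128,)),
    tf.keras.layers.Dense(n_tasks, activation="sigmoid"),  # one binary label per task
])
model.compile(optimizer="adam", loss="binary_crossentropy")  # summed over the tasks
\end{verbatim}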
\end{description}
\section{End-to-End Machine Learning}
\begin{description}
\item .
\end{description}
\chapter{Computer Vision}
\section {Object Detection}
\begin{description}
\item In previous chapters we discussed classification neural networks: for a given input $x_i$, being an image, the prediction (estimate) $y$ is the classification, whether as a binary 0/1 representation of size 1 or as a one-hot vector for multi-class classification; the case where the input contains more than one class is known as multi-tasking.
\item classification is one of the building blocks of computer vision, but feeding an input $x_i$ to the NN and getting an estimate of the classes inside the picture isn't sufficient: when we (humans!) look at a picture we first locate the objects, then classify each of them, then derive a meaning. in computer vision we focus on the first two steps.
\item object detection is about how to locate all the objects in the picture simultaneously. but before that: in a previous chapter we discussed that it's advised to use mini-batches when the NN is large compared to the available computational power and memory; with a moderate-resolution picture as input, i.e.\ $1024 \times 1024$ with 3 channels, if the first layer consists of just 1000 fully-connected neurons we end up with billions of parameters to train ($1024 \times 1024 \times 3 \times 1000 \approx 3\times{10^9}$)! with a huge number of parameters and features to capture, if we don't have a sufficient data-set our model will over-fit it, so how do we solve this?
\item convolution solves this problem. it makes use of the \textbf{sparsity of connections} (in each layer, each output value depends only on a small number of inputs) and \textbf{parameter sharing} (a single feature detector, or filter, is shared across all positions of the input); we will discuss both in the following sections.
\item thanks to parameter sharing, convolutional NNs are also good at capturing translation invariance, meaning the network is capable of producing much the same output for a slightly shifted image; we will see why.
\subsection{Edge Detection}
\begin{description}
\item for input image $x_i$ how to detect the boundaries? how to interpret it mathematically?
\item first, edges spanning a few pixels (e.g.\ $2\times2$ or $4\times4$ regions) can be vertical, horizontal, or inclined by some angle $n^{\circ}$; extracting such features from an image \textbf{g} with a filter \textbf{f} is a process of weighting the image \textbf{g} by the filter \textbf{f}, or mathematically, the integral over all shifts of f multiplied by the shifted, reversed g $$ \int_{-\infty}^{\infty}{f(\tau)\,g(x-\tau)\,d\tau} $$ (see the appendix on convolutions)
\item to detect vertical edges, a vertical edge detector can be as simple as $\begin{bmatrix}1&0&-1\\1&0&-1\\1&0&-1\end{bmatrix}$, or with more emphasis on the center.
\item for example, to detect horizontal edges, we employ
$f(\tau)=\begin{bmatrix}
1&1&1\\
0&0&0\\
-1&-1&-1\end{bmatrix}$, for image $g(x)=\begin{bmatrix}
10&10&10&0&0&0\\
10&10&10&0&0&0\\
10&10&10&0&0&0\\
0&0&0&10&10&10\\
0&0&0&10&10&10\\
0&0&0&10&10&10\end{bmatrix}$ then the convolution
$\int_{-\infty}^{\infty}{f(\tau)\,g(x-\tau)\,d\tau}$ evaluates to $\begin{bmatrix}
0&0&0&0\\
30&10&-10&-30\\
30&10&-10&-30\\
0&0&0&0 \end{bmatrix}$
\item more filter examples: \begin{itemize}
\item \textbf{sobel's filter} is $\begin{bmatrix} 1&0&-1\\ 2&0&-2\\ 1&0&-1\end{bmatrix}$.
\item identity filter function $\begin{bmatrix}
\ \ 0 &\ \ 0 &\ \ 0 \\
\ \ 0 &\ \ 1 &\ \ 0 \\
\ \ 0 &\ \ 0 &\ \ 0 \end{bmatrix}$
\item another sharper edge detection filter $\begin{bmatrix}
-1 & -1 & -1 \\
-1 & \ \ 8 & -1 \\
-1 & -1 & -1 \end{bmatrix}$
\item sharpen filter $\begin{bmatrix}
\ \ 0 & -1 & \ \ 0 \\
-1 & \ \ 5 & -1 \\
\ \ 0 & -1 & \ \ 0
\end{bmatrix} $
\item blur box filter $\frac{1}{9}
\begin{bmatrix}
\ \ 1 &\ \ 1 &\ \ 1 \\
\ \ 1 &\ \ 1 &\ \ 1 \\
\ \ 1 &\ \ 1 &\ \ 1
\end{bmatrix}$
\item Gaussian blur filter $\frac{1}{16}
\begin{bmatrix}
\ \ 1 &\ \ 2 &\ \ 1 \\
\ \ 2 &\ \ 4 &\ \ 2 \\
\ \ 1 &\ \ 2 &\ \ 1
\end{bmatrix}$
\item larger $5\times5$ Gaussian blur filter $\frac{1}{256} \begin{bmatrix}
1 & 4 & 6 & 4 & 1 \\
4 & 16 & 24 & 16 & 4 \\
6 & 24 & 36 & 24 & 6 \\
4 & 16 & 24 & 16 & 4 \\
1 & 4 & 6 & 4 & 1
\end{bmatrix}$.
\end{itemize}
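\item the worked horizontal-edge example above can be reproduced in a few lines; note that deep-learning ``convolution'' skips the kernel flip of the mathematical definition, so it corresponds to scipy's correlate2d (a minimal sketch):
\begin{verbatim}
import numpy as np
from scipy.signal import correlate2d

f = np.array([[ 1,  1,  1],        # horizontal edge filter from the text
              [ 0,  0,  0],
              [-1, -1, -1]])

g = np.array([[10, 10, 10,  0,  0,  0],
              [10, 10, 10,  0,  0,  0],
              [10, 10, 10,  0,  0,  0],
              [ 0,  0,  0, 10, 10, 10],
              [ 0,  0,  0, 10, 10, 10],
              [ 0,  0,  0, 10, 10, 10]])

print(correlate2d(g, f, mode="valid"))   # 6-3+1 = 4 -> the 4x4 result shown above
\end{verbatim}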
\subsection{Padding}
\item from the previous example you can notice that the $6\times6$ input image was shrunk into a $4\times4$ matrix (known as a \textbf{valid convolution}), and that the boundary pixels contribute less; to keep the output the same size as the input (known as a \textbf{same convolution}) we use padding:
\item for an input image of size $n\times n$ and a filter of size $f\times f$, the output is $(n-f+1)\times (n-f+1)$; for a same convolution we pad the edges with p cells, such that $p=\frac{f-1}{2}$, and the final output image size is $(n+2p-f+1)\times(n+2p-f+1)$. for this equation to hold, the filter size has to be \textbf{odd}, and that's usually the case in computer vision; look at the \textbf{identity filter}, where the weighting parameters are set to zero while the \textbf{center pixel} is set to 1.
\item padding our previous example with $p=1$ gives the following image $g(x)=\begin{bmatrix}
0&0&0&0&0&0&0&0 \\
0&10&10&10&0&0&0&0\\
0&10&10&10&0&0&0&0\\
0&10&10&10&0&0&0&0\\
0&0&0&0&10&10&10&0\\
0&0&0&0&10&10&10&0\\
0&0&0&0&10&10&10&0\\
0&0&0&0&0&0&0&0\end{bmatrix}$
\subsection{striding}
\item with a stride s the filter is moved s cells at a time instead of one, so for an $n\times{n}$ input, filter size f, and padding p, the output size becomes $\left \lfloor{\frac{n+2p-f}{s}}\right \rfloor + 1$ in each dimension; the floor accounts for positions where the filter would overhang the edge (this is the formula used in the ConvNet example below).
\end{description}
\subsection{Convolution on RGB channels}
\begin{description}
\item with 3 RGB channels, $n_c=3$, on an $n\times{n}\times{n_c}$ image the filter also carries a channel dimension, $f\times{f}\times{n_c}$; the products over all channels are summed per position, so the output image is $(n-f+1) \times{(n-f+1)} \times{1}$.
\end{description}
\subsection{Multiple filters Convolution}
\begin{description}
\item for a layer of size $n\times{n}\times{n_c}$ there is usually more than a single feature to detect, and therefore a filter is required for each feature, so the filter bank ends up being $f\times{f}\times{n_c}\times{n^*}$, such that $n^*$ is the number of filters in the layer, and the output is $(n-f+1)\times{(n-f+1)}\times{n^*}$.
\end{description}
%\subsections{Notations}
\subsection{Example ConvNet}
\begin{description}
\item in case of an input image $n_H^{[0]}=n_W^{[0]}=39, n_c^{[0]}=3$ of size $39 \times{39} \times{3}$, using 10 filters of size $f^{[1]}=3$, stride $s^{[1]}=1$, and padding $p^{[1]}=0$, layer 1 will deterministically be of size $\left \lfloor{\frac{n^{[0]}+2p^{[1]}-f^{[1]}}{s^{[1]}}}\right \rfloor + 1=37$, so $n_H^{[1]}=n_W^{[1]}=37,n_c^{[1]}=10$.
\item in layer 2, using 20 filters of size $f^{[2]}=5$, stride $s^{[2]}=2$, and padding $p^{[2]}=0$, layer 2 will deterministically be of size $\left \lfloor{\frac{n^{[1]}+2p^{[2]}-f^{[2]}}{s^{[2]}}}\right \rfloor + 1=17$, so $n_H^{[2]}=n_W^{[2]}=17,n_c^{[2]}=20$.
\item in layer 3, using 40 filters of size $f^{[3]}=5$, stride $s^{[3]}=2$, and padding $p^{[3]}=0$, layer 3 will deterministically be of size $\left \lfloor{\frac{n^{[2]}+2p^{[3]}-f^{[3]}}{s^{[3]}}}\right \rfloor + 1=7$, so $n_H^{[3]}=n_W^{[3]}=7,n_c^{[3]}=40$.
\item flatten layer 3, $7 \times{7} \times{40}$, into a $1960\times{1}$ vector and activate it through a softmax/logistic function, so the last layer is $1\times{1}$.
\end{description}
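\item the size bookkeeping above is mechanical; a minimal sketch of the same formula applied to this example:
\begin{verbatim}
# floor((n + 2p - f) / s) + 1, applied layer by layer
def conv_output_size(n, f, p=0, s=1):
    return (n + 2 * p - f) // s + 1

n = 39
for f, s, p in [(3, 1, 0), (5, 2, 0), (5, 2, 0)]:   # the three layers above
    n = conv_output_size(n, f, p, s)
    print(n)                                        # -> 37, 17, 7
\end{verbatim}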
\subsection{Pooling Convolutions}
\begin{description}
\item alongside the convolution layers, pooling layers are used to make the network more robust and to cut down the size of the representation (and with it the number of parameters in later layers).
\item most used parameters are $f=2,s=2$, or $f=3,s=2$, and padding is rarely used.
\item note that there are no learned parameters in pooling, so a max-pooling layer doesn't count as a stand-alone layer (there is nothing to train) and is rather seen as part of the layer it operates on.
\subsection{Max Pooling}
\begin{description}
\item pooling with an $f\times{f}$ window divides the input layer into smaller $f\times{f}$ regions and keeps only the maximum value of each; it is effectively an \textbf{argmax}-style convolution, where the convolution takes the max instead of the weighted sum (integral).
\item for example, the input layer $\begin{bmatrix}1&3&2&1\\2&9&1&1\\1&3&2&3\\5&6&1&2\end{bmatrix}$, with $stride=2$, is max-pooled by a $2\times{2}$ window into $\begin{bmatrix}9&2\\6&3\end{bmatrix}$
\end{description}
\subsection{Average Pooling}
\begin{description}
\item similar to max pooling, except that the max is replaced with the mean function.
\item for example, the input layer $\begin{bmatrix}1&3&2&1\\2&9&1&1\\1&3&2&3\\5&6&1&2\end{bmatrix}$, with $stride=2$, is average-pooled by a $2\times{2}$ window into $\begin{bmatrix}3.75&1.25\\3.75&2\end{bmatrix}$.
\end{description}
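\item both pooling examples above can be checked with a short numpy sketch (the reshape splits the $4\times{4}$ input into its $2\times{2}$ blocks):
\begin{verbatim}
import numpy as np

x = np.array([[1, 3, 2, 1],
              [2, 9, 1, 1],
              [1, 3, 2, 3],
              [5, 6, 1, 2]])

f = 2                                  # pool size = stride = 2
blocks = x.reshape(2, f, 2, f)
print(blocks.max(axis=(1, 3)))         # max pooling     -> [[9 2] [6 3]]
print(blocks.mean(axis=(1, 3)))        # average pooling -> [[3.75 1.25] [3.75 2.]]
\end{verbatim}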
\end{description}
\section{Examples}
\subsection{LeNet-5 Network}
\begin{enumerate}
\item Layer 0: with input image $n_H^{[0]}=n_W^{[0]}=32$, $n_c^{[0]}=3$.
\item Layer 1: using 8 filters, $f^{[1]}=5$, $s^{[1]}=1$, we get $28 \times{28} \times{8}$, adding max-pooling with $f=2, s=2$, pours down into $14\times{14}\times{8}$ layer.
\item Layer 2: using 16 filters $f^{[2]}=5$, $s^{[2]}=1$, we get $10 \times{10} \times{16}$, adding max-pooling with $f=2, s=2$, pours down into $5\times{5}\times{16}$ layer, flattened into a $400\times{1}$ vector
\item Layer 3: Fully-Connected (FC3) layer of 120 neurons, with weight, bias parameters $W^{[3]}, b^{[3]}$.
\item Layer 4: FC4 layer of 84 neurons, and $W^{[4]},b^{[4]}$, feed to softmax activation with 10 neurons output.
\end{enumerate}
\item notice the number of parameters in the convolution layers compared to that in the \textbf{FC} layers: in \textbf{CONV1} and \textbf{CONV2} the parameter counts are 208 and 416 respectively, compared to 48,001 and 10,081 in \textbf{FC3} and \textbf{FC4} respectively. meanwhile the \textbf{Activation size} falls down gradually, from 3,072 (the number of pixels in the image) to the 10 neurons of the softmax classification, through 6,272, 1,568, 1,600, 400, 120, and 84 in CONV1, POOL1, CONV2, POOL2, FC3, FC4 respectively.
\end{description}
\subsection{AlexNet Network}
\begin{enumerate}
\item Layer 0: input is $227\times{227} \times{3}$,
\item Layer 1: applying 96 filters of size 11 with stride 4, the output is $55 \times{55} \times{96}$; max-pooled by $3\times{3}$ with stride 2, the output is $27\times 27 \times 96$.
\item Layer 2: applying 256 filters of size 5 with \textbf{SAME} padding outputs $27 \times27 \times 256$; max-pooled by size 3 with stride 2, the output is $13 \times 13 \times 256$.
\item Layer 3: 384 filters with SAME padding convolutions, outputs $13 \times {13} \times {384}$.
\item Layer 4: 256 filters with SAME padding, outputs $13\times{13}\times{256}$ max-pooled by size 3, and strides of size 2, outputs $6\times{6}\times{256}$ flattened into $9216\times{1}$.
\item Layer 5: two fully-connected layers of $4096\times{1}$ each, then a softmax with 1000 neurons/predictions.
\end{enumerate}
\begin{description}
\item comparing the two networks: LeNet-5 from the nineties has around 60,000 parameters, while the 2012 AlexNet has about 60 million; despite this, they are similar in structure.
\end{description}
\subsection{VGG-16 Network}
\begin{description}
\item VGG is similar to AlexNet, yet deeper, and its uniform, narrow convolutions proved to be more powerful.
\item it uses [CONV] layers of size $3\times{3}$ with stride 1 and SAME padding, and [MAX-POOL] of size $2\times{2}$ with stride 2.
\item it consists of 16 weight layers, therefore the name VGG-16, with about 138 million parameters, and its network is: $[CONV64\times{2}] \rightarrow [CONV128\times{2}] \rightarrow [CONV256\times{3}] \rightarrow [CONV512\times{3}] \rightarrow [CONV512\times{3}] \rightarrow [FC4096] \rightarrow [FC4096] \rightarrow [SOFTMAX1000]$
\end{description}
\begin{enumerate}
\item $224\times{224}\times{3}$ input image.
\item applying $[CONV64] \times{2}$ (convolutions of 64 filters, twice) outputs $224\times{224}\times{64}$, [MAX-POOL]ed into $112\times{112}\times{64}$
\item $[CONV128]\times{2}$ [MAX-POOL]ed into $56\times{56}\times{128}$.
\item $[CONV256]\times{3}$ [MAX-POOL]ed into $28\times{28}\times{256}$
\item $[CONV512]\times{3}$ [MAX-POOL]ed into $14\times{14}\times{512}$
\item $[CONV512]\times{3}$ [MAX-POOL]ed into $7\times{7}\times{512}$
\item $[FC4096]$ fully connected layer with 4096 nodes
\item $[FC4096]$ fully connected layer with 4096 nodes
\item $[SOFTMAX1000]$ softmax activation with 1000 entries.
\end{enumerate}
\subsection{ResNet (Residual Block)}
\begin{description}
\item very deep networks are much more difficult to train because of vanishing and exploding gradients; to make them feasible, \textbf{Residual blocks} were introduced
\item even with the use of an optimizer to lessen the vanishing/exploding gradients, in a very deep network with hundreds or even thousands of layers (which are rarely used) the numbers add up and either diminish to zero or explode into large values. a residual block adds the activation from a previous layer (i.e.\ $l-2$) into the current activation function at $l$, $a^{[l]}=g(z^{[l]}+a^{[l-2]})$, also known as a \textbf{shortcut}, or \textbf{skip connection}.
\item assume for example that we are using regularization and $w^{[l]},b^{[l]}\rightarrow{0}$; then $a^{[l]}=g(a^{[l-2]})$, so the residual connection prevents the signal (and gradient) from vanishing.
\item note that a skip connection between $a^{[i+n]}$ and $a^{[i]}$, for $i,n \in N$, can join layers of two different sizes; in practice same padding is used between the shortcut layers, or a matrix $W_s$ is applied such that the shape of $W_s\,a^{[i]}$ matches $a^{[i+n]}$, which can for example zero-pad the ith layer.
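\item a minimal Keras sketch of one such block; the layer sizes are illustrative, and the two stacked convolutions stand in for the main path between $a^{[l-2]}$ and $z^{[l]}$:
\begin{verbatim}
import tensorflow as tf
from tensorflow.keras import layers

def residual_block(x, filters):
    shortcut = x                                                  # a[l-2]
    y = layers.Conv2D(filters, 3, padding="same", activation="relu")(x)
    y = layers.Conv2D(filters, 3, padding="same")(y)              # z[l]
    y = layers.Add()([y, shortcut])                               # z[l] + a[l-2]
    return layers.Activation("relu")(y)                           # a[l] = g(...)

inputs = tf.keras.Input(shape=(56, 56, 64))
outputs = residual_block(inputs, filters=64)   # channel counts must match for the Add
model = tf.keras.Model(inputs, outputs)
\end{verbatim}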
\item .
\end{description}
\section{Inception}
\begin{description}
\item another hyper-parameter to worry about is the size of the convolution kernel; instead of testing out different sizes, it's possible to combine all of them at once with SAME padding and let the model train and figure out the most convenient one, which is called an \textbf{Inception network}, at a computational cost.
\item for example, on an input volume of size $28\times{28}\times{192}$ we can combine $1\times{1}$ convolutions, $3\times{3}$ SAME padding, $5\times{5}$ SAME padding, and SAME-padded MAX-POOLing to get a $28\times{28}\times{256}$ volume; in this example note that the $5\times{5}$ branch alone costs on the order of 120 million multiplications.
\subsection{$1\times{1}$ Convolutions}
\begin{description}
\item in such a very expensive layer, what we can do to reduce the computational cost is insert a $1\times{1}$ convolution that shrinks the number of channels by the desired reduction factor; in the previous example we first apply [CONV1] with 16 filters, then a second [CONV5] with 32 filters, and we end up with the same $28\times{28}\times{32}$ output while the computational cost drops from about 120 million multiplications to just 12.4 million.
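\item the two cost figures can be checked by counting multiplications, roughly (output height $\times$ width $\times$ filters) $\times$ (filter volume); a sketch:
\begin{verbatim}
# multiplications of a conv layer: (n_out * n_out * c_out) * (f * f * c_in)
def conv_cost(n_out, c_out, f, c_in):
    return n_out * n_out * c_out * f * f * c_in

direct     = conv_cost(28, 32, 5, 192)                             # ~120 million
bottleneck = conv_cost(28, 16, 1, 192) + conv_cost(28, 32, 5, 16)  # ~12.4 million
print(direct, bottleneck)
\end{verbatim}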
\end{description}
\subsection{Inception Block}
\begin{description}
\item the inception network is a series of block of the following building block: (insert the szegedy paper picture here)
\end{description}
\subsection{Inception Branches}
\item (insert the szegedy paper picture here)
\section{Object Detection}
\subsection{Localization and Detection}
\begin{description}
\item in an object-detection network the output layer is usually a vector describing localized objects rather than a one-hot vector representing a specific class, and the process is divided into three steps: first \textbf{Detection}, second \textbf{Localization}, and lastly \textbf{classification}.
\subsection{classification with localization}
\item a one-hot SOFTMAX output can be combined with localization to locate a specific target class in an image, boxing the target object with width and height $b_w\times {b_h}$ and the Cartesian coordinates of the box center $(b_x,b_y)$; there are now 4 more parameters, $b_x, b_y, b_w, b_h$, for localization.
\item labeled output vector with localization: $$\begin{vmatrix}p_c & \text{a Boolean for whether there is an object or not}\\
b_x\\b_y\\b_h\\b_w\\c_1 & \text{class 1}\\c_2 & \text{class 2}\\ \vdots \\c_n & \text{class n}\end{vmatrix}$$ if $p_c=0$ then the rest of the vector's elements are ``don't cares'' and are ignored by the loss.
\item the loss function $L(\hat{y},y)$ is an element-wise operation, but practically different loss functions can be used on different ranges of $y$: for example log-likelihood on the class range [$c_1$-$c_n$], mean squared error on the bounding-box coordinates, and logistic regression loss on $p_c$.
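\item a minimal numpy sketch of such a composite loss for a single label vector (an illustration of the idea above, not the exact loss of any published detector):
\begin{verbatim}
import numpy as np

def localization_loss(y_hat, y):
    pc, box, cls = y[0], y[1:5], y[5:]
    pc_hat, box_hat, cls_hat = y_hat[0], y_hat[1:5], y_hat[5:]
    loss = -(pc * np.log(pc_hat) + (1 - pc) * np.log(1 - pc_hat))  # logistic loss on p_c
    if pc == 1:                                # other terms only matter if an object exists
        loss += np.sum((box - box_hat) ** 2)   # squared error on b_x, b_y, b_h, b_w
        loss -= np.sum(cls * np.log(cls_hat))  # log-likelihood on c_1..c_n
    return loss
\end{verbatim}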
\end{description}
\subsection{Landmark detection}
\begin{description}
\item it's possible to go deeper than the bounding box and train the model on a data-set labeled with desired landmarks on the target object; for example, alongside localization of the human face with a bounding box, it's possible to add landmark features that locate the mouth, the eyes, and the cheek boundaries, so as to extract facial expressions. another example could be tracking the skeleton and joints for pose estimation of an athlete during exercise, which is also used in augmented-reality applications.
\item placing 50 landmarks on a single object, each given by a two-coordinate Cartesian point, means an extra 100 entries in each labeled data-set entry.
\end{description}
\subsection{Object Detection}
\begin{description}
\item the goal is to detect and locate all objects in an input image of size $N\times{N}$.
\item run a sliding-window cropping function over the input image, cropping windows starting with some length l, moving by a certain stride.
\item sequentially repeat the previous step, increasing the length l for larger crops; this is a procedure of $O(N^2)$ computational complexity.
\item the final step is to run each cropped image through the classifier.
\subsection{Convolutional sliding windows}
\begin{description}
\item sliding-window cropping is computationally expensive: as you may have noticed, for every crop the model is expected to forward-propagate the cropped image through the classifier, so the cost grows with the number of crops.
\item the shared-parameters characteristic of convolutions can be a great leverage to solve this problem and cut the complexity significantly by running all the window positions simultaneously; it can be illustrated by an example.
\subsection{Example:}
\begin{description}
\item consider the following network classifying an (expected) input image of size $14\times{14}\times{3}$: $[CONV5]\rightarrow [MAX-POOL2-5] \rightarrow [FC5-400] \rightarrow [FC1-400] \rightarrow [SOFT-MAX-4]$, with the corresponding sizes $10\times{10}\times{16}\rightarrow 5\times{5}\times{16} \rightarrow 1\times{1}\times{400} \rightarrow 1\times{1}\times{400} \rightarrow 1\times{1}\times{4}$, so we end up with 4 neurons (softmax output) as our classification model. now take an input image of size $28\times{28}\times{3}$ with a sliding window of $14\times{14}$ to fit our model: running the same convolutional network on this larger input gives the corresponding sizes $24\times{24}\times{16}\rightarrow 12\times{12}\times{16} \rightarrow 8\times{8}\times{400} \rightarrow 8\times{8}\times{400} \rightarrow 8\times{8}\times{4}$, such that the top-left $1\times{1}\times{4}$ cell at coordinates (0,0) corresponds exactly to the simplified model's output on the top-left window, the (0,1) output is one slide to the right, and so on; all window positions are computed in a single forward pass.
\end{description}
\end{description}
\end{description}
\section{Bounding Box prediction}
\item the aspect ratio of the bounding box isn't really fixed, so to wrap the detected object correctly there are two variables (window width and height) to sweep in the sliding box instead of one, in sequential terms.
\subsection{YOLO}
\begin{description}
\item YOLO is a convolutional network, but we can explain it from a sequential, iterative perspective: the input image is divided up into $k\times{k}$ grid cells and the classification-with-localization algorithm is run in every cell simultaneously, so instead of getting a single $8\times{1}$ output we get a $k\times{k}\times{8}$ output volume.
\item even if an object stretches over multiple grid cells, it's assigned only to the single cell that contains its center point.
\item to specify the bounding box inside a grid cell, the center of the detected object is given in coordinates relative to that cell, with the origin (0,0) at the top left and (1,1) at the bottom right; therefore the center $(b_x,b_y)$ must lie between 0 and 1, whereas $b_w,b_h$ can be greater than 1 if the box wraps multiple cells, since they are measured as ratios of the cell width and height respectively.
\end{description}
\subsection{Jaccard distance, or Intersection over Union IOU function}
\begin{description}
\item it's an accuracy metric for how well the model localizes, defined as the area of intersection between the predicted and the labeled bounding boxes divided by the area of their union, indicating how well the predicted bounding box fits the ground truth; conventionally an IoU above a threshold such as 0.5 is counted as a correct detection.
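\item a minimal sketch, with boxes given as (x1, y1, x2, y2) corner coordinates:
\begin{verbatim}
def iou(a, b):
    # intersection rectangle (width/height clipped at 0 when the boxes don't overlap)
    w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = w * h
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union

print(iou((0, 0, 2, 2), (1, 1, 3, 3)))   # -> 1/7 for two overlapping 2x2 boxes
\end{verbatim}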
\end{description}
\subsection{Non-Max Suppression}
\begin{description}
\item in practice the input image is divided up into very fine cells, and a single object can overlap multiple cells; Non-Max Suppression is the algorithm that keeps the cell (bounding box) with the maximum confidence, assigns the detection to it, and suppresses the other overlapping ones, hence the name.
\item the metric for confidence is the $p_c$ in the output layer; previously we assigned $p_c$ to either 0 or 1, but in this case it is the probability that an object is present, and only the bounding box with the highest $p_c$ among the overlapping ones remains, suppressing the rest.
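\item a sketch of the procedure, given each box's score $p_c$ and a precomputed pairwise IoU matrix (which could be filled in with an IoU helper like the one sketched earlier); the 0.5 threshold is just a common choice:
\begin{verbatim}
import numpy as np

def non_max_suppression(scores, iou_matrix, threshold=0.5):
    order = list(np.argsort(scores)[::-1])   # most confident boxes first
    keep = []
    while order:
        best = order.pop(0)                  # keep the highest-p_c remaining box
        keep.append(best)
        # suppress every remaining box that overlaps the kept one too much
        order = [i for i in order if iou_matrix[best, i] < threshold]
    return keep
\end{verbatim}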
\end{description}
\subsection{Anchor Boxes}
\begin{description}
\item in a single cell more than one object can coexist; if we expect multiple objects, each with a specific aspect ratio, then we consider more than one sliding-window shape, also known as an \textbf{Anchor Box}, each with a different aspect ratio, and the output $y,\hat{y}$ will look like the following for m anchor boxes, but flattened into a single vector $$ \begin{vmatrix}p_c^1&p_c^{2}&...&p_c^{m}\\p_x^{1}&p_x^{2}&...&p_x^{m}\\p_y^{1}&p_y^{2}&...&p_y^{m}\\p_h^{1}&p_h^{2}&...&p_h^{m}\\p_w^{1}&p_w^2&...&p_w^{m}\\c_1^{1}&c_1^{2}&...&c_1^{m}\\...\\c_n^{1}&c_n^{2}&...&c_n^{m}\end{vmatrix}$$
\end{description}
\subsection{Region Proposal R-CNN, Fast R-CNN}
\begin{description}
\item instead of running the sliding window everywhere, over all cells with the same stride, it can be more appropriate to narrow this down to specific regions: first the input image is run through a segmentation function, dividing the input up into blobs (region proposals), then the classifier is run only on top of them; but this process turns out to be very slow.
\item Fast R-CNN is a convolutional implementation of it, and speeds the process up.
\end{description}
\item .
\end{description}
\section {Face Recognition }
\subsection{Recognition vs. Verification}
\begin{description}
\item in verification, given an input image and a name/ID, we verify whether the input image is that of the claimed person; on the other hand, in recognition, given an image and a database of K persons, we recognize the image as one of the persons in the database.
\item having 99\% accuracy in the verification process is a good enough threshold; meanwhile, in recognition over a database of 1000 persons, a verifier with 99\% accuracy applied across the database is expected to make on the order of 10 errors, i.e.\ the model may confuse the image with roughly 10 of the 1000 identities, so the accuracy used for recognition needs to approach 100 percent.
\end{description}
\subsection{One-Shot learning}
\begin{description}
\item addressing the following criterion: given a single input image per person, as a single entry in the training-set, our model needs to have accuracy approaching 100\%, and it must handle new inputs to the already-trained model; how do we scale?
\item also, the shortcoming of using plain classification is that it's trained on a specific number of classes, or identities; for a new person the classifier is still expected to match the input to one of the softmax outputs, thus reducing the accuracy, and to maintain accuracy the model would need to be retrained for each new identity. instead, a siamese network is used to calculate the similarity of a new input against the database records.
\subsection{Similarity learning function}
\begin{description}
\item given two images $I_1,I_2$, the similarity function $d(I_1,I_2)$ calculates their distance; the smaller the distance, the more identical the two images are, such that $d=0$ if $I_1=I_2$, and verification is accepted under a negligible distance threshold $\tau$: $$
\begin{cases}
same \leftarrow d(I_1,I_2) \leq \tau \\
different \leftarrow d(I_1,I_2) > \tau
\end{cases}
$$
\item a convolutional example is the Siamese network:
\end{description}
\subsection{Siamese Network}
\item how to calculate the similarity distance?
\begin{description}
\item .
\item for a convolutional network ending with a fully-connected layer of N nodes [FCN], for each data-set entry $x^{(i)}$ we define $f(x^{(i)})$ to be the output of [FCN] on the entry $x^{(i)}$ (an encoding of the image).
\item for two input images $x^{(i)}, x^{(j)}$ from the data-set, the similarity function is: $$ \|f(x^{(i)}) - f(x^{(j)})\|_2^2 $$
\item so, how to define the loss function? using what is called a \textbf{Triplet Loss}
\subsection{Triplet Loss}
\begin{description}
\item the loss function is defined on top of the similarity function d, in terms of distances between the current image and other images from the data-set: the model is trained with a \textbf{Negative} image $x_i^{n}$ at a large distance and a \textbf{Positive} image $x_i^{p}$ at a small distance, both measured against the current image, called the \textbf{Anchor image} $x_i^{a}$, and the loss function for the triplet can be defined as $L(d(x_i^a,x_i^p), d(x_i^a,x_i^n))$.
\item but to prevent the trivial solution in which every image gets the same encoding (so all distances collapse to zero), a margin $\alpha$ is used and the loss is clipped at zero, so the final loss equation for N triplets is: $$
\sum_i^N \max\left( \|f(x_i^a) - f(x_i^p)\|_2^2 - \|f(x_i^a) - f(x_i^n)\|_2^2 + \alpha ,\; 0\right)
$$
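\item a minimal numpy sketch of the resulting per-triplet loss, operating on already-computed encodings:
\begin{verbatim}
import numpy as np

def triplet_loss(f_a, f_p, f_n, alpha=0.2):
    d_pos = np.sum((f_a - f_p) ** 2)         # ||f(anchor) - f(positive)||^2
    d_neg = np.sum((f_a - f_n) ** 2)         # ||f(anchor) - f(negative)||^2
    return max(d_pos - d_neg + alpha, 0.0)   # hinged at zero: easy triplets cost nothing
\end{verbatim}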
\end{description}
%TODO (res) \item what to do in case of new input images?!
\subsection{Binary Classification}
\begin{description}
\item another variation is binary classification into ``match'' or ``no match'': it can be performed on a pair of images $x^{(i)}, x^{(j)}$ with label 0 (for different persons) or 1 (for the same person), with a sigmoid activation defined on top of the [FC] encoding of size F: $$
\hat{y}=\sigma \left(\sum_k^{F} w_k \left| f(x^{(i)})_k - f(x^{(j)})_k \right| + b\right)
$$ where $w_k, b$ are trainable parameters.
\item or using the chi-square similarity $$
\frac{(f(x^{(i)})_k - f(x^{(j)})_k)^2}{f(x^{(i)})_k + f(x^{(j)})_k}
$$
\end{description}
%TODO (res) the parallel architecture of the network, and shed light on siamese in NLP
\item note that in NLP the siamese network is used to match two different sentences, for example two search queries: the input is the embedding of each sentence run through an LSTM layer, and the distance is the cosine similarity between the two outputs, while the triplet function is defined with an anchor vector of the corresponding sentence and positive and negative vectors, such that $diff = d(A,P) - d(A,N)$. since the difference can be negative while a cost function should be non-negative, a rectifier is applied over the diff, shifted along the loss axis by a small constant $\alpha$ (similar to the triplet loss in CNNs), so the final loss is $relu(diff+\alpha)$, and the cost is the loss summed over all the training examples $\sum_{i=1}^m L(A^{(i)},P^{(i)},N^{(i)})$.
\item training on hard triplets (those for which $d(A,P) - d(A,N)$ is within a small $\epsilon$) is better for adjusting the parameters to detect small differences, and the loss function is replaced by $L_{full}=L_1+L_2$, with $L_1 = relu(\text{mean negative} - d(A,P) + \alpha)$ and $L_2 = relu(\text{closest negative} - d(A,P) + \alpha)$, where the mean negative is the mean over the negative terms in the batch, and the closest negative is the negative term closest to $d(A,P)$.
\end{description}
\section{Neural Style Transfer}
\begin{description}
\item first of all, how do we define style? style is the variations, or shades, of a specific content; inside the neural network, the style can be measured as the correlation between activations across different channels.
\item artistic style transfer is the transfer of the style of a style image \textbf{S} onto a content image \textbf{C}, producing the generated image \textbf{G}.
\item the cost function can be defined on the generated image G as follows: $$J(G) = \alpha J_{content}(C,G) + \beta J_{style}(S,G) $$ using weighting parameters $\alpha, \beta$ to control how close the generated image stays to the content versus the style (note that a single weight would be sufficient, for example $\alpha$ and $1-\alpha$); minimizing the cost between the style and the generated image, and between the content and the generated image, merges the content and the style images.
\subsection{Generated image descent}
\begin{description}
\item gradient descent here minimizes J(G) with respect to the pixels of the generated image rather than the weight parameters (a learning-rate factor is left implicit): $$ G=G-\frac{\partial{J(G)}}{\partial{G}} $$
\item the final generated image descent is as simple as: \begin{enumerate}
\item initialize G randomly
\item use gradient descent to minimize J(G)
\end{enumerate}
\end{description}
\subsection{Content cost}
\begin{description}
\item use a middle layer l, not too shallow, not too deep,
\item using a pre-trained convolutional network, let $a_C^{[l]}, a_G^{[l]}$ be the activations of layer l on the content and the generated image respectively; then we define $$J_{content}(C, G) = \frac{1}{2}\|a_C^{[l]} - a_G^{[l]}\|^2$$
\end{description}
\subsection{Style cost}
\begin{description}
\subsection{Gram Matrix(style)}
\item let $a_{i,j,k}^{[l]}$ be the activation at $(i,j,k)$, and $G^{[l]}$ the (un-normalized) covariance across the $n_c^{[l]}$ channels; the style gram matrix can be formulated as: $$
G_{kk'}^{[l](S)} = \sum_{i=1}^{n_H^{[l]}} \sum_{j=1}^{n_W^{[l]}} a_{i,j,k}^{[l](S)} a_{i,j,k'}^{[l](S)}$$
\item generated gram matrix: $$
G_{kk'}^{[l](G)} = \sum_{i=1}^{n_H^{[l]}} \sum_{j=1}^{n_W^{[l]}} a_{i,j,k}^{[l](G)} a_{i,j,k'}^{[l](G)} $$
\item the final style cost function for layer l: $$
J_{style}^{[l]}(S,G) = \frac{1}{(2n_H^{[l]}n_W^{[l]}n_C^{[l]})^2} \| G^{[l](S)} - G^{[l](G)}\|_F^2
$$ such that $\frac{1}{(2n_H^{[l]}n_W^{[l]}n_C^{[l]})^2}$ is a normalization term.
\item $$J_{style}(S,G) = \sum_l^L \lambda^{[l]} J_{style}^{[l]}(S,G) $$ with weight parameter $\lambda$.
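\item a minimal numpy sketch of the per-layer style cost, taking activations of shape $(n_H, n_W, n_C)$:
\begin{verbatim}
import numpy as np

def gram_matrix(a):
    n_h, n_w, n_c = a.shape
    flat = a.reshape(n_h * n_w, n_c)         # one row per spatial position
    return flat.T @ flat                     # G[k, k'] = sum_{i,j} a[i,j,k] * a[i,j,k']

def layer_style_cost(a_s, a_g):
    n_h, n_w, n_c = a_s.shape
    norm = 1.0 / (2.0 * n_h * n_w * n_c) ** 2
    return norm * np.sum((gram_matrix(a_s) - gram_matrix(a_g)) ** 2)
\end{verbatim}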
\end{description}
\end{description}
\end{description}
\chapter{Recurrent Neural Networks RNN}
\section{Introduction}
\begin{description}
\item RNNs have been making advances in the area of sequence modeling, such as speech recognition, bio-informatics, music generation, natural language processing, and activity and name recognition; given an input sequence, a sequence model maps it to another sequence, for example voice to words, musical notes to a generated song, DNA sequence analysis, etc.
\end{description}
\section{Notation}
\begin{description}
\item an element of a sequence $x^{(i)}$ from the training data-set X is dereferenced by $x^{(i)<t>}$, such that t is the t-th (temporal) element in the sequence, represented by a one-hot vector over a known Vocabulary V.
\item a sequence $x^{(i)}$ has the corresponding length $T_x^{(i)}$.
\item equivalently $y^{(i)<t>}$ is the element t of the $y^{(i)}$ output, with output sequence length $T_y^{(i)}$.
\item finally, the weight parameter $W_{ab}$ means the weight used in computing a, multiplying a quantity of type b.
\end{description}
\section{RNN}
\begin{description}
\item first of all, why isn't a plain NN used in sequence modeling? a plain NN fits scenarios where the input and output have fixed sizes and features don't need to be shared across positions; in sequences the input and output lengths differ between examples, and features learned at one position should transfer to other positions, so a plain NN leads to a less accurate model that is hard to train. instead, in an RNN the layers are connected through time, and each input $X^{<t>}$ maps to an output $Y^{<t>}$.
\subsection{RNN model}
\begin{description}
%%
\item \begin{tikzpicture}[item/.style={circle,draw,thick,align=center},
itemc/.style={item,on chain,join}]
\begin{scope}
[start chain=going right,nodes=itemc,every
join/.style={-latex,very thick},local bounding box=chain]
\path node (A0) {$A$} node (A1) {$A$} node (A2) {$A$} node[xshift=2em] (At) {$A$};
\end{scope}
\node[left=1em of chain,scale=2] (eq) {$=$};
\node[left=2em of eq,item] (AL) {$A$};
\path (AL.west) ++ (-1em,2em) coordinate (aux);
\draw[very thick,-latex,rounded corners] (AL.east) -| ++ (1em,2em) -- (aux) |- (AL.west);
\foreach \X in {0,1,2,t} {
\draw[very thick,-latex] (A\X.north) -- ++ (0,2em) node[above,item,fill=gray!10] (y\X) {$y^{<\X>}$};
\draw[very thick,latex-] (A\X.south) -- ++ (0,-2em) node[below,item,fill=gray!10] (x\X) {$x^{<\X>}$};
}
\draw[white,line width=0.8ex] (AL.north) -- ++ (0,1.9em);
\draw[very thick,-latex] (AL.north) -- ++ (0,2em)
node[above,item,fill=gray!10] {$y^{<t>}$};
\draw[very thick,latex-] (AL.south) -- ++ (0,-2em)
node[below,item,fill=gray!10] {$x^{<t>}$};
\path (x2) -- (xt) node[midway,scale=2,font=\bfseries] {\dots};
\end{tikzpicture}
\item and the network is often initialized with $a^{<0>}$ set to a vector of zeros (or random values) at the start of the sequence
\item this type of model is called a unidirectional RNN, compared to a bidirectional RNN where the connections go both ways.
\item the cost function is defined as the average of all the per-step losses $$ J = -\frac{1}{T_y}\sum_{t=1}^{T_y}\sum_{j}y_j^{<t>}\log(\hat{y}_j^{<t>}) $$
\section{uni-directional RNN Forward propagation}
\begin{description}
\item $$a^{<t>} = g(W_{aa}a^{<t-1>} + W_{ax}X^{<t>} + b_a) $$
\item $$\hat{y}^{<t>} = g(W_{ya}a^{<t>} + b_y) $$ where \textbf{g} function is usually \textbf{tanh}, or \textbf{relu}.
\item $a^{<t>}$ is usually represented in vectorized notation into $$ a^{<t>} = g(W_{a}[A^{<t-1>}, X^{<t>}] + b_a) $$ where $W_a$ is $\begin{vmatrix}W_{aa} & {W_{ax}}\end{vmatrix}$, and $[a^{<t-1>}, X^{<t>}]=\begin{vmatrix} a^{<t-1>} \\ X^{<t>} \end{vmatrix}$
\item $$ \hat{y}^{<t>} = softmax(W_{ya}a^{<t>} + b_y) $$
\item $$L=\sum {L^{<t>}(\hat{y^{<t>}},y^{<t>})}$$
\item notice, from the Bayesian chain rule (see next chapter), that $\hat{y}^{<t>}$ estimates $P(y^{<t>}\mid y^{<1>}\dots{y^{<t-1>}})$
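\item a minimal numpy sketch of one forward step using the equations above (weight shapes are left implicit):
\begin{verbatim}
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def rnn_step(a_prev, x_t, W_aa, W_ax, W_ya, b_a, b_y):
    a_t = np.tanh(W_aa @ a_prev + W_ax @ x_t + b_a)   # a<t>
    y_t = softmax(W_ya @ a_t + b_y)                   # y_hat<t>
    return a_t, y_t
\end{verbatim}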
\section{uni-directional RNN backward propagation through time}
\begin{description}
\item (derive)
\end{description}
\section {Variations of RNN models}
\begin{description}
\item in the previous figure we discussed the many-to-many unidirectional recurrent neural network architecture; a more general architecture is the bidirectional RNN, where the arrows go both ways.
\item another variation is many-to-one, and it looks like this
\subsection{many-to-one RNN}
\begin{description}
\item \begin{tikzpicture}[item/.style={circle,draw,thick,align=center},
itemc/.style={item,on chain,join}]
\begin{scope}
[start chain=going right,nodes=itemc,every
join/.style={-latex,very thick},local bounding box=chain]
\path node (A0) {$A$} node (A1) {$A$} node (A2) {$A$} node[xshift=2em] (At) {$A$};
\end{scope}
\node[left=1em of chain,scale=2] (eq) {$=$};
\node[left=2em of eq,item] (AL) {$A$};
\path (AL.west) ++ (-1em,2em) coordinate (aux);
%\draw[very thick,-latex,rounded corners] (AL.east) -| ++ (1em,2em) -- (aux) |- (AL.west);
\draw[very thick,-latex] (At.north) -- ++ (0,2em) node[above,item,fill=gray!10] (yt) {$y^{<t>}$};
\foreach \X in {0,1,2,t} {
\draw[very thick,latex-] (A\X.south) -- ++ (0,-2em) node[below,item,fill=gray!10] (x\X) {$x^{<\X>}$};
}
\draw[white,line width=0.8ex] (AL.north) -- ++ (0,1.9em);
\draw[very thick,-latex] (AL.north) -- ++ (0,2em)
node[above,item,fill=gray!10] {$y^{<t>}$};
\draw[very thick,latex-] (AL.south) -- ++ (0,-2em)
node[below,item,fill=gray!10] {$x^{<t>}$};
\path (x2) -- (xt) node[midway,scale=2,font=\bfseries] {\dots};
\end{tikzpicture}
\item notice that there is a single output that maps the whole sequence to one value, similar to a standard neural network.
\end{description}
\subsection{One-to-Many RNN}
\begin{description}
\item and finally one-to-many, where the input to the next time step is sampled from the previous output:
\item \begin{tikzpicture}[item/.style={circle,draw,thick,align=center},