forked from miriamamendola/the-final-master
-
Notifications
You must be signed in to change notification settings - Fork 0
/
chapter_1.tex
120 lines (100 loc) · 4.74 KB
/
chapter_1.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
\chapter{Fundamentals}
\section{Linear Algebra}
\paragraph{Matrix dot product}
Assume we have $A \in \mathbb{R}^{m \times n}$, $B \in \mathbb{R}^{n \times p}$, then $C = A B \in \mathbb{R}^{m \times p}$ is defined as:
\[
C_{ij} = \sum_{k = 1}^{n} A_{ik} B_{kj}
\]
\subparagraph{Special cases}
Assume that $A \in \mathbb{R}^{1 \times n}$ and $B \in \mathbb{R}^{n \times 1}$.
Then $C = A B \in \mathbb{R}^{1 \times 1}$ so $C$ will be a scalar. On the other side, $D = B A \in \mathbb{R}^{n \times n}$ so $D$ will be a square matrix.
\paragraph{Orthogonality}
We say that the matrix $U$ is \textbf{orthogonal} if $U^T U = I$.
Furthermore, if $U$ is also \textbf{square}, then $U^T = U^{-1}$.
\paragraph{Similarity}
We say that two matrices $B$ and $C$ are \textbf{similar} if there exists a matrix $P$ such that $B = P^{-1} C P$. When two matrices are similar they have the same eigenvalues.
\paragraph{Eigen-decomposition}
From the \textbf{spectral theorem} we know that if $A$ is a \textit{real square symmetric} matrix then it can be decomposed as $A = U \Lambda U^T$ where $U$ is a matrix obtained by stacking the eigenvectors of $A$ as columns, while $\Lambda$ is a diagonal matrix with the eigenvalues of $A$. In other words, if $A$ is a real square symmetric matrix then it is \textbf{similar} to the diagonal matrix $\Lambda$.
In general, the relationship between the eigenvectors and the eigenvalues is the following: $A u^{(k)} = \lambda_k u^{(k)}$ where $u$ is an eigenvector and $\lambda$ is the corresponding eigenvalue.
\paragraph{Gradient rules}
\begin{enumerate}
\item $\nabla_x a^t x = \nabla_x x^T a = a$
\item $\nabla_x x^T A x = A^t x + A x$
\end{enumerate}
\section{Law of Large Numbers}
Let $X_1,X_2,...,X_n,...$ be a subsequence of independent and identically distributed random variables, and let $\mathbb E(X_i)=\mu$. Let, then, $\overline{X}_n$ be the subsequence defined by
\[
\overline{X}_n=\frac{1}{n}\sum_{i=1}^n X_i
\]
\begin{theorem}
The strong version of the theorem states that:
\[
\Pr{\lim_{n \to \infty} \overline{X}_n = \mu} = 1
\]
that is, the subsequence $\overline{X}_n$ tends to $\mu$ with probability 1.
\end{theorem}
\subsection{Practical interpretation}
Suppose to have a single discrete random variable $Y\in\left\{-1,1\right\}$. If we do a large number of experiments and we compute the average of the results we get:
\begin{gather*}
\overline{Y}_n=\frac{1}{n}\sum_{i=1}^n Y_i = \\
= (-1) \frac{\text{\# of -1s}}{n} + (+1)\frac{\text{\# of 1s}}{n}\approx (-1)\Pr{Y=-1} + (+1)\Pr{Y=1} = \E{}{Y}
\end{gather*}
\section{Bayes's Rule}
\begin{theorem}
Bayes's rule states that:
\[
p(x|y) = \frac{p(y|x) p(x)}{p(y)}
\]
\end{theorem}
\begin{proof}
By definition of conditional probability we have:
\[
p(x|y) = \frac{p(x,y)}{p(y)} \to p(x,y) = p(x|y) p(y)
\]
and
\[
p(y|x) = \frac{p(x,y)}{p(x)} \to p(x,y) = p(y|x) p(x)
\]
\[
p(x|y) p(y) = p(y|x) p(x) \to p(x|y) = \frac{p(y|x) p(x)}{p(y)}
\]
\end{proof}
\section{Fundamental theorem of expectation}
Assume we have a discrete random variable $Y$ and a function $h(Y)$ and we want to compute the expected value of $h(Y)$. By definition of expected value, by fixing $Z = h(Y)$, we have:
\[
\E{}{Z} = \sum_{z \in Z} z p(z) = \sum_{y \in Y} h(y) p(z)
\]
The fundamental theorem of expectation states that we can compute the expected value of $h(Y)$:
\[
\E{}{h(Y)} = \sum_{y \in Y} h(y) p(y)
\]
\section{Tower Property}
Suppose we have two discrete random variables $X$ and $Y$ and $h$ that is a function of $X$ and $Y$.
\begin{theorem}
The \textbf{tower property} states that we can compute the expected value of $h(X,Y)$ by first computing the conditional expected value of $h(X,Y)$ given $X$ and then computing the expected value of the result over $X$.
\[
\E{}{h(X,Y)} = \E{X}{\E{Y}{h(X,Y) \mid X}}
\]
\end{theorem}
\begin{proof}
By definition of mean of discrete random variables and applying the fundamental theorem of expectation:
\[
\E{}{h(X,Y)} = \sum_{x \in X} \sum_{y \in Y} h(x,y) p(x,y)
\]
By apply Bayes's rule, we can rewrite $p(x,y)$:
\[
\sum_{x \in X} \sum_{y \in Y} h(x,y) p(x,y) = \sum_{x \in X} \sum_{y \in Y} h(x,y) p(y|x) p(x)
\]
Since $p(x)$ does not depend on the second summation term, we can rewrite it outside the summation term:
\[
\sum_{x \in X} p(x) \sum_{y \in Y} h(x,y) p(y|x)
\]
We can observe that now the second summation term is by definition the conditional mean of $h(X,Y)$:
\[
\sum_{x \in X} p(x) \E{Y}{h(X,Y) \mid X = x}
\]
Finally, we can observe that this term is the expected value over $X$ of the conditional mean.
\[
\E{X}{\E{Y}{h(X,Y) \mid X = x} }
\]
\end{proof}