From f45e5b1d6df4570f9508b93aaf0b44f8ea43b25c Mon Sep 17 00:00:00 2001
From: mrava87
Date: Wed, 20 Mar 2024 20:43:56 +0300
Subject: [PATCH] Deployed 7e35731 with MkDocs version: 1.3.1
---
lectures/12_seqmod/index.html | 4 +--
search/search_index.json | 2 +-
sitemap.xml | 48 +++++++++++++++++-----------------
sitemap.xml.gz | Bin 214 -> 213 bytes
4 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/lectures/12_seqmod/index.html b/lectures/12_seqmod/index.html
index a7cf924..22592ed 100644
--- a/lectures/12_seqmod/index.html
+++ b/lectures/12_seqmod/index.html
@@ -887,9 +887,9 @@ Motivation
we consider a sequence of \(N_\tau\) samples and \(N_f\) features:
\[
\mathbf{X} = \begin{bmatrix}
- x_1^{<1>} & x_1^{<2>} & x_1^{N_\tau} \\
+ x_1^{<1>} & x_1^{<2>} & x_1^{<N_\tau>} \\
... & ... & ... \\
- x_{N_f}^{1} & x_1^{<2>} & x_{N_f}^{N_\tau}
+ x_{N_f}^{<1>} & x_1^{<2>} & x_{N_f}^{<N_\tau>}
\end{bmatrix} =
\begin{bmatrix}
\mathbf{x}^{<1>} & \mathbf{x}^{<2>} & \mathbf{x}^{<N_\tau>}
diff --git a/search/search_index.json b/search/search_index.json
index 27ea9b3..aee4316 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Homepage This course covers the fundamentals of machine learning, its applications to geoscientific problems, and it provides basic best practices for the rigorous development and evaluation of machine learning models. The main focus of the course is on describing the fundamental theory of linear regression , logistic regression , neural networks , convolutional neural networks , sequence modelling , dimensionality reduction , generative modelling , and physics-inspired neural networks . Students will also be introduced to practical applications in geoscience for each of the presented methods; lab sessions will be held using the PyTorch computational framework in the Python programming language. Lectures Sunday, and Wednesday, 1:00pm - 2:30pm Teaching Staff Instructor: Matteo Ravasi - Office Hours: Monday 4pm to 5pm (by Appointment: BI-1432) Textbook Deep Learning by Ian Goodfellow and Yoshua Bengio and Aaron Courville \u2013 MIT Press. Pre-requisites Knowledge of calculus, linear algebra ad statistics is required. Basic Python knowledge is preferred. Course Requirements ErSE 213 - Inverse problems","title":"Homepage"},{"location":"#homepage","text":"This course covers the fundamentals of machine learning, its applications to geoscientific problems, and it provides basic best practices for the rigorous development and evaluation of machine learning models. The main focus of the course is on describing the fundamental theory of linear regression , logistic regression , neural networks , convolutional neural networks , sequence modelling , dimensionality reduction , generative modelling , and physics-inspired neural networks . Students will also be introduced to practical applications in geoscience for each of the presented methods; lab sessions will be held using the PyTorch computational framework in the Python programming language.","title":"Homepage"},{"location":"#lectures","text":"Sunday, and Wednesday, 1:00pm - 2:30pm","title":"Lectures"},{"location":"#teaching-staff","text":"Instructor: Matteo Ravasi - Office Hours: Monday 4pm to 5pm (by Appointment: BI-1432)","title":"Teaching Staff"},{"location":"#textbook","text":"Deep Learning by Ian Goodfellow and Yoshua Bengio and Aaron Courville \u2013 MIT Press.","title":"Textbook"},{"location":"#pre-requisites","text":"Knowledge of calculus, linear algebra ad statistics is required. 
Basic Python knowledge is preferred.","title":"Pre-requisites"},{"location":"#course-requirements","text":"ErSE 213 - Inverse problems","title":"Course Requirements"},{"location":"READMEcurvenotelocal/","text":"Curvenote version of notes Set up environment To get started install Curvenote: npm install -g curvenote Get token online and add it curvenote token set API_TOKEN Run locally Type the following command in the terminal curvenote start and access at http://127.0.0.1:3000 Publish curvenote deploy","title":"READMEcurvenotelocal"},{"location":"READMEcurvenotelocal/#set-up-environment","text":"To get started install Curvenote: npm install -g curvenote Get token online and add it curvenote token set API_TOKEN","title":"Set up environment"},{"location":"READMEcurvenotelocal/#run-locally","text":"Type the following command in the terminal curvenote start and access at http://127.0.0.1:3000","title":"Run locally"},{"location":"READMEcurvenotelocal/#publish","text":"curvenote deploy","title":"Publish"},{"location":"gradind/","text":"Grading system The final grade will be obtained as the combination of the following: 50.00% - Course Project 30.00% - Midterm exam 20.00% - Homeworks Homeworks Homeworks will be assigned at the end of each topic. They consist of both pen and paper questions and programming exercises. The submitted codes must be properly commented and implementation choices must be justified (this is as important as the code itself and counts towards the final mark). Project The project should cover one of the topics learned in this course. It could be focused on implementing a novel machine learning algorithm to a geoscientific problem or on performing a systematic comparison of different machine learning algorithms to a geoscientific dataset. Students are encouraged to start the project early. The best way is to define a problem statement at the beginning of the term and learn how to use machine learning to solve such a problem during the course. Collaboration Most homeworks involve programming assignments. Students are encouraged to collaborate and consult with each other, but an individual assignments (and code) must be handed in. Acknowledge explicitly in your submitted assignment if you have collaborated with someone else while working on the assignment. Late submissions Each student has access to one late submission wildcard of no more than 2 days from the submission deadline. Apart from using this wildcard, late submissions will be penalized with a loss of 40% of the achieved score.","title":"Grading system"},{"location":"gradind/#grading-system","text":"The final grade will be obtained as the combination of the following: 50.00% - Course Project 30.00% - Midterm exam 20.00% - Homeworks","title":"Grading system"},{"location":"gradind/#homeworks","text":"Homeworks will be assigned at the end of each topic. They consist of both pen and paper questions and programming exercises. The submitted codes must be properly commented and implementation choices must be justified (this is as important as the code itself and counts towards the final mark).","title":"Homeworks"},{"location":"gradind/#project","text":"The project should cover one of the topics learned in this course. It could be focused on implementing a novel machine learning algorithm to a geoscientific problem or on performing a systematic comparison of different machine learning algorithms to a geoscientific dataset. Students are encouraged to start the project early. 
The best way is to define a problem statement at the beginning of the term and learn how to use machine learning to solve such a problem during the course.","title":"Project"},{"location":"gradind/#collaboration","text":"Most homeworks involve programming assignments. Students are encouraged to collaborate and consult with each other, but an individual assignments (and code) must be handed in. Acknowledge explicitly in your submitted assignment if you have collaborated with someone else while working on the assignment.","title":"Collaboration"},{"location":"gradind/#late-submissions","text":"Each student has access to one late submission wildcard of no more than 2 days from the submission deadline. Apart from using this wildcard, late submissions will be penalized with a loss of 40% of the achieved score.","title":"Late submissions"},{"location":"schedule/","text":"Schedule Lecture Topic Exercise 1 Course overview and introduction to Machine Learning - 2 Linear algebra refresher - 3 Probability refresher - 4 Gradient-based optimization link 5 Linear and Logistic regression link 6 Neural Networks: perceptron, activation functions link1 7 Neural Networks: backpropagation, initialization, and loss functions - 8 Best practices in training of Machine Learning models - 9 Advanced solvers: momentum, RMSProp, Adam, greedy training - 10 UQ in Neural Networks and Mixture Density Networks link1 link2 11 Introduction to CNNs - 12 CNNs Popular Architectutues link 13 Sequence modelling: basic principles 14 Sequence modelling: architectures link 15 Dimensionality reduction 16 Generative modelling and VAEs reduction 17 GANs 18 Scientific ML and PINNs link 19 Deep learning for Inverse Problems 20 Invertible Neural Networks 21 Implicit Neural Networks","title":"Schedule"},{"location":"schedule/#schedule","text":"Lecture Topic Exercise 1 Course overview and introduction to Machine Learning - 2 Linear algebra refresher - 3 Probability refresher - 4 Gradient-based optimization link 5 Linear and Logistic regression link 6 Neural Networks: perceptron, activation functions link1 7 Neural Networks: backpropagation, initialization, and loss functions - 8 Best practices in training of Machine Learning models - 9 Advanced solvers: momentum, RMSProp, Adam, greedy training - 10 UQ in Neural Networks and Mixture Density Networks link1 link2 11 Introduction to CNNs - 12 CNNs Popular Architectutues link 13 Sequence modelling: basic principles 14 Sequence modelling: architectures link 15 Dimensionality reduction 16 Generative modelling and VAEs reduction 17 GANs 18 Scientific ML and PINNs link 19 Deep learning for Inverse Problems 20 Invertible Neural Networks 21 Implicit Neural Networks","title":"Schedule"},{"location":"lectures/01_intro/","text":"Introduction to Machine Learning Humans have long dreamed of creating machines that can think and act independently . For many years this has been the aim of Artificial Intelligence (AI) . In the early days of AI, many problems that are difficult to solve by humans (e.g., large summations or multiplications, solution of systems of equations) turn out to be easier for computers as long as humans could define a list of tasks that machines could perform at faster speed and higher precisions than humans can do themselves. On the other hand, tasks that are very easily solved by adult humans and even kids (e.g., recognizing animals in pictures or singing a song) turned out to be very difficult for computers. 
The main reason of such difficulties lies in the fact that humans cannot explain in words (and with a simple set of instructions) how they have learned to accomplish these tasks. This is where instead the second era of AI solutions, belonging to the field of Machine Learning (ML) , have shown astonishing results in the last decade. Instead of relying on hard-coded rules, these algorithms operate in a similar fashion to human beings as they learn from experience . In other words, given enough training data in the form of inputs (e.g., photos) and outputs (e.g., label of the animal present in the photo), ML algorithms can learn a complex nonlinear mapping between them such that they can infer the output from the input when provided with unseen inputs. A large variety of ML algorithms have been developed by the scientific community, ranging from the basic linear and logistic regression that we will see in our fourth lecture , decision tree-based statistical methods such as random forrest or gradient boosting , all the way to deep neural networks , which have recently shown to outperform previously developed algorithms in many fields (e.g., computer science, text analysis and speech recognition, seismic interpretation). This subfield has grown exponentially in the last few years and it is now referred to as Deep Learning and will be subject of most of our course. In short, Deep learning is a particular kind of machine learning that represent the world as a nested hierarchy of increasingly complicated concepts the more we move away from the input and towards the output of the associated computational graph. Whilst sharing the same underlying principle of learning from experience in the form of a training data , different algorithms presents their own strengths and limitations and a machine learning practitioner must make a careful judgment at any time depending on the problem to be solved. Terminology Machine Learning is divided into 3 main categories: Supervised Learning : learn a function that maps an input to an output ( \\(X \\rightarrow Y\\) ). Inputs are also referred to as features and outputs are called targets. In practice we have access to a number of training pairs \\(\\{ \\textbf{x}_i, \\textbf{y}_i \\} \\; i=1,..,N\\) and we learn \\(\\textbf{y}_i=f_\\theta(\\textbf{x}_i)\\) where \\(f_\\theta\\) is for example parametrized via a neural network. Two main applications of supervised learning are Classification : the target is discrete Regression : the target is continuous Unsupervised Learning : learn patterns from unlabelled data. These methods have been shown to be able to find compact internal representation of the manifold the input data belongs to. Such compact representations can become valuable input features for subsequent tasks of supervised learning. In the context of deep learning, unsupervised models may even attempt to estimate the entire probability distribution of the dataset or how to generate new, independent samples from such distribution. We will get into the mathematical details of these families of models in the second part of our course. Semi-supervised Learning : it lies in between the other learning paradigms as it learns from some examples that include a target and some that do not. Input data can also come in 2 different types: Structured data : tables (e.g., databases) Unstructured data : images, audio, text, ... Examples of applications in geoscience are displayed in the figure below. 
A number of available data types in various geoscientific contexts is also displayed. History Finally, we take a brief look at the history of Deep Learning. This field has so far experienced three main waves of major development (and periods of success) interspersed by winters (or periods of disbelief): '40 - '50 : first learning algorithms heavily influenced by our understanding of the inner working of the human brain. Mostly linear models such as the McCulloch-Pitts neuron, the perceptron by Rosenblatt, and the adaptive linear element (ADALINE). The latter was trained on an algorithm very similar to Stochastic Gradient Descent (SGD). These models showed poor performance in learning complex functions (e.g., XOR) and led to a drop in popularity of the field. '80 - '90 : these years so the creation of the Multi Layer Perceptron (MLP), the neocognitron (the ancestor of the convolutional layer), the first deep neural networks (e.g., LeNet for MNIST classification), the first sequence-to-sequence networks and the LSTM layer. from 2010 till now : a major moment for the history of this field can be traced back to 2012, when a deep convolution neural network developed by Krizhevsky and co-authors won the ImageNet competition lowering the top-5 error rate from 26.1 percent (previous winning solution not based on a neural network) to 15.3 percent. Since then the field has exploded with advances both in terms of model architectures (AlexNet, VGG, ResNet, GoogleLeNet, ...) optimization algorithms (AdaGrad, RMSProp, Adam, ...), applications (computer vision, text analysis, speech recognition, ...). Moreover, recent developments in the area of unsupervised learning have led to the creation of dimensionality reduction and generative algorithms that can now outperform any state-of-the-art method that is not based on neural networks. If you want to dig deeper into the history of this field, an interesting read can be found here .","title":"Introduction to Machine Learning"},{"location":"lectures/01_intro/#introduction-to-machine-learning","text":"Humans have long dreamed of creating machines that can think and act independently . For many years this has been the aim of Artificial Intelligence (AI) . In the early days of AI, many problems that are difficult to solve by humans (e.g., large summations or multiplications, solution of systems of equations) turn out to be easier for computers as long as humans could define a list of tasks that machines could perform at faster speed and higher precisions than humans can do themselves. On the other hand, tasks that are very easily solved by adult humans and even kids (e.g., recognizing animals in pictures or singing a song) turned out to be very difficult for computers. The main reason of such difficulties lies in the fact that humans cannot explain in words (and with a simple set of instructions) how they have learned to accomplish these tasks. This is where instead the second era of AI solutions, belonging to the field of Machine Learning (ML) , have shown astonishing results in the last decade. Instead of relying on hard-coded rules, these algorithms operate in a similar fashion to human beings as they learn from experience . In other words, given enough training data in the form of inputs (e.g., photos) and outputs (e.g., label of the animal present in the photo), ML algorithms can learn a complex nonlinear mapping between them such that they can infer the output from the input when provided with unseen inputs. 
A large variety of ML algorithms have been developed by the scientific community, ranging from the basic linear and logistic regression that we will see in our fourth lecture , decision tree-based statistical methods such as random forrest or gradient boosting , all the way to deep neural networks , which have recently shown to outperform previously developed algorithms in many fields (e.g., computer science, text analysis and speech recognition, seismic interpretation). This subfield has grown exponentially in the last few years and it is now referred to as Deep Learning and will be subject of most of our course. In short, Deep learning is a particular kind of machine learning that represent the world as a nested hierarchy of increasingly complicated concepts the more we move away from the input and towards the output of the associated computational graph. Whilst sharing the same underlying principle of learning from experience in the form of a training data , different algorithms presents their own strengths and limitations and a machine learning practitioner must make a careful judgment at any time depending on the problem to be solved.","title":"Introduction to Machine Learning"},{"location":"lectures/01_intro/#terminology","text":"Machine Learning is divided into 3 main categories: Supervised Learning : learn a function that maps an input to an output ( \\(X \\rightarrow Y\\) ). Inputs are also referred to as features and outputs are called targets. In practice we have access to a number of training pairs \\(\\{ \\textbf{x}_i, \\textbf{y}_i \\} \\; i=1,..,N\\) and we learn \\(\\textbf{y}_i=f_\\theta(\\textbf{x}_i)\\) where \\(f_\\theta\\) is for example parametrized via a neural network. Two main applications of supervised learning are Classification : the target is discrete Regression : the target is continuous Unsupervised Learning : learn patterns from unlabelled data. These methods have been shown to be able to find compact internal representation of the manifold the input data belongs to. Such compact representations can become valuable input features for subsequent tasks of supervised learning. In the context of deep learning, unsupervised models may even attempt to estimate the entire probability distribution of the dataset or how to generate new, independent samples from such distribution. We will get into the mathematical details of these families of models in the second part of our course. Semi-supervised Learning : it lies in between the other learning paradigms as it learns from some examples that include a target and some that do not. Input data can also come in 2 different types: Structured data : tables (e.g., databases) Unstructured data : images, audio, text, ... Examples of applications in geoscience are displayed in the figure below. A number of available data types in various geoscientific contexts is also displayed.","title":"Terminology"},{"location":"lectures/01_intro/#history","text":"Finally, we take a brief look at the history of Deep Learning. This field has so far experienced three main waves of major development (and periods of success) interspersed by winters (or periods of disbelief): '40 - '50 : first learning algorithms heavily influenced by our understanding of the inner working of the human brain. Mostly linear models such as the McCulloch-Pitts neuron, the perceptron by Rosenblatt, and the adaptive linear element (ADALINE). The latter was trained on an algorithm very similar to Stochastic Gradient Descent (SGD). 
These models showed poor performance in learning complex functions (e.g., XOR) and led to a drop in popularity of the field. '80 - '90 : these years so the creation of the Multi Layer Perceptron (MLP), the neocognitron (the ancestor of the convolutional layer), the first deep neural networks (e.g., LeNet for MNIST classification), the first sequence-to-sequence networks and the LSTM layer. from 2010 till now : a major moment for the history of this field can be traced back to 2012, when a deep convolution neural network developed by Krizhevsky and co-authors won the ImageNet competition lowering the top-5 error rate from 26.1 percent (previous winning solution not based on a neural network) to 15.3 percent. Since then the field has exploded with advances both in terms of model architectures (AlexNet, VGG, ResNet, GoogleLeNet, ...) optimization algorithms (AdaGrad, RMSProp, Adam, ...), applications (computer vision, text analysis, speech recognition, ...). Moreover, recent developments in the area of unsupervised learning have led to the creation of dimensionality reduction and generative algorithms that can now outperform any state-of-the-art method that is not based on neural networks. If you want to dig deeper into the history of this field, an interesting read can be found here .","title":"History"},{"location":"lectures/02_linalg/","text":"Linear Algebra refresher In this lecture we will go through some of the key concepts of linear algebra and inverse problem theory that are required to develop the theories of the different machine learning algorithm presented in this course. This is not meant to be an exhaustive treatise and students are strongly advised to take the ErSE 213 - Inverse Problems prior to this course. Three key mathematical objects arise in the study of linear algebra: Scalars : \\(a \\in \\mathbb{R}\\) , a single number represented by a lower case italic letter; Vectors : \\(\\mathbf{x} = [x_1, x_2, ..., x_N]^T \\in \\mathbb{R}^N\\) , ordered collection of \\(N\\) numbers represented by a lower case bold letter; it is sometimes useful to extract a subset of elements by defining a set \\(\\mathbb{S}\\) and add it to as a superscript, \\(\\mathbf{x}_\\mathbb{S}\\) . As an example, given \\(\\mathbf{x} = [x_1, x_2, x_3, x_4, x_5, x_6]^T \\in \\mathbb{R}^6\\) and \\(\\mathbb{S} = {1, 3, 5}\\) we can define the vector \\(\\mathbf{x}_\\mathbb{S} = [x_1, x_3, x_5]\\) and its complementary vector \\(\\mathbf{x}_{-\\mathbb{S}} = [x_2, x_4, x_6]\\) Matrices : \\(\\mathbf{X} \\in \\mathbb{R}^{[N \\times M]}\\) , two dimensional collection of numbers represented by an upper case bold letter where \\(N\\) and \\(M\\) are referred to as the height and width of the matrix. More specifically a matrix can be written as \\[\\mathbf{X} = \\begin{bmatrix} x_{1,1} & x_{1,2} & x_{1,M} \\\\ ... & ... & ... \\\\ x_{N,1} & x_{N,2} & x_{N,M} \\end{bmatrix} \\] A matrix can be indexed by rows \\(\\mathbf{X}_{i, :}\\) (i-th row), by columns \\(\\mathbf{X}_{:, j}\\) (j-th column), and by element \\(\\mathbf{X}_{i, j}\\) (i-th row, j-th column). 
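As a minimal illustration of this notation (a sketch assuming NumPy; the array and variable names are purely illustrative):

import numpy as np

X = np.arange(12.0).reshape(3, 4)   # matrix X in R^{3 x 4}
row_i = X[1, :]                     # i-th row, X_{i,:}
col_j = X[:, 2]                     # j-th column, X_{:,j}
x_ij = X[1, 2]                      # single element X_{i,j}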
A number of useful operations that are commonly applied on vectors and matrices are now described: Transpose: \\(\\mathbf{Y} = \\mathbf{X}^T\\) , where \\(Y_{i, j} = X_{j, i}\\) Matrix plus vector: \\(\\mathbf{Y}_{[N \\times M]} = \\mathbf{X}_{[N \\times M]} + \\mathbf{z}_{[1 \\times M]}\\) , where \\(Y_{i, j} = X_{i, j} + z_{j}\\) ( \\(\\mathbf{z}\\) is added to each row of the matrix \\(\\mathbf{X}\\) ) Matrix-vector product: \\(\\mathbf{y}_{[N \\times 1]} = \\mathbf{A}_{[N \\times M]} \\mathbf{x}_{[M \\times 1]}\\) , where \\(y_i = \\sum_{j=1}^M A_{i, j} x_j\\) Matrix-matrix product: \\(\\mathbf{C}_{[N \\times K]} = \\mathbf{A}_{[N \\times M]} \\mathbf{B}_{[M \\times K]}\\) , where \\(C_{i,k} = \\sum_{j=1}^M A_{i, j} B_{j, k}\\) Hadamard product (i.e., element-wise product): \\(\\mathbf{C}_{[N \\times M]} = \\mathbf{A}_{[N \\times M]} \\odot \\mathbf{B}_{[N \\times M]}\\) , where \\(C_{i,j} = A_{i, j} B_{i, j}\\) Dot product: \\(a = \\mathbf{x}_{[N \\times 1]}^T \\mathbf{y}_{[N \\times 1]} = \\sum_{i=1}^N x_i y_i\\) Identity matrix: \\(\\mathbf{I}_N = diag\\{\\mathbf{1}_N\\}\\) . Based on its definition, we have that \\(\\mathbf{I}_N \\mathbf{x} = \\mathbf{x}\\) and \\(\\mathbf{I}_N \\mathbf{X} = \\mathbf{X}\\) Inverse matrix: given \\(\\mathbf{y} = \\mathbf{A} \\mathbf{x}\\) , the inverse matrix of \\(\\mathbf{A}\\) is a matrix that satisfies the following equality \\(\\mathbf{A}^{-1} \\mathbf{A} = \\mathbf{I}_N\\) . We can finally write \\(\\mathbf{x} = \\mathbf{A}^{-1} \\mathbf{y}\\) Orthogonal vectors and matrices: given two vectors \\(\\mathbf{x}\\) and \\(\\mathbf{y}\\) , they are said to be orthogonal if \\(\\mathbf{y}^T \\mathbf{x} = 0\\) . Given two matrices \\(\\mathbf{X}\\) and \\(\\mathbf{Y}\\) , they are said to be orthogonal if \\(\\mathbf{Y}^T \\mathbf{X} = \\mathbf{I}_N\\) . Orthogonal matrices are especially interesting because their inverse is simply \\(\\mathbf{X}^{-1} = \\mathbf{X}^T\\) Matrix decomposition: like any scalar number can be decomposed into a product of prime numbers, a matrix \\(\\mathbf{A}\\) can also be decomposed into a combination of vectors (i.e., eigenvectors) and scalars (i.e., eigenvalues). Eigendecomposition: real-valued, square, symmetric matrices can be written as \\(\\mathbf{A} = \\mathbf{V} \\Lambda \\mathbf{V}^T = \\sum_i \\lambda_i \\mathbf{v}_i \\mathbf{v}_i^T\\) where \\(\\lambda_i\\) and \\(\\mathbf{v}_i\\) are the eigenvalues and eigenvectors of the matrix \\(\\mathbf{A}\\) , respectively. Eigenvectors are placed along the columns of the matrix \\(\\mathbf{V}\\) , which is an orthogonal matrix (i.e., \\(\\mathbf{V}^T=\\mathbf{V}^{-1}\\) ). Eigenvalues are placed along the diagonal of the matrix \\(\\Lambda=diag\\{\\lambda\\}\\) and tell us about the rank of the matrix, \\(rank(\\mathbf{A}) = \\# \\lambda \\neq 0\\) . A full rank matrix is a matrix whose eigenvalues are all non-zero and can be inverted. In this case the inverse of \\(\\mathbf{A}\\) is \\(\\mathbf{A}^{-1}=\\mathbf{V}\\Lambda^{-1}\\mathbf{V}^T\\) Singular value decomposition (SVD): this is a more general decomposition which can be applied to real-valued, non-square, non-symmetric matrices. Singular vectors \\(\\mathbf{u}\\) and \\(\\mathbf{v}\\) and singular values \\(\\lambda\\) generalize the concept of eigenvectors and eigenvalues.
The matrix \\(\\mathbf{A}\\) can be decomposed as \\(\\mathbf{A} = \\mathbf{U} \\mathbf{D} \\mathbf{V}^T\\) where \\(\\mathbf{D} = \\Lambda\\) for square matrices, \\(\\mathbf{D} = [\\Lambda \\; \\mathbf{0}]^T\\) for \\(N>M\\) and \\(\\mathbf{D} = [\\Lambda \\; \\mathbf{0}]\\) for \\(M>N\\) . Similar to the eigendecomposition, in this case the inverse of \\(\\mathbf{A}\\) is \\(\\mathbf{A}^{-1}=\\mathbf{V}\\mathbf{D}^{-1}\\mathbf{U}^T\\) Conditioning: in general, it refers to how fast a function \\(f(x)\\) changes given a small change in its input \\(x\\) . Similarly for a matrix, conditioning is linked to the curvature of its associated quadratic form \\(f(\\mathbf{A}) = \\mathbf{x}^T \\mathbf{A} \\mathbf{x}\\) and it generally indicates how rapidly this function changes as function of \\(\\mathbf{x}\\) . It is defined as \\(cond(\\mathbf{A})=\\frac{|\\lambda_{max}|}{|\\lambda_{min}|}\\) . Norms : another important object that we will be using when defining cost functions for ML models are norms. A norm is a function that maps a vector \\(\\mathbf{x} \\in \\mathbb{R}^N\\) to a scalar \\(d \\in \\mathbb{R}\\) and it can be loosely seen as measure of the length of the vector (i.e., distance from the origin). In general, the \\(L^p\\) norm is defined as: \\[ ||\\mathbf{x}||_p = \\left( \\sum_i |x_i|^p \\right) ^{1/p} \\; p \\ge 0 \\] Popular norms are: Euclidean norm ( \\(L_2\\) ): \\(||\\mathbf{x}||_2 = \\sqrt{\\sum_i x_i^2}\\) , is a real distance of a vector from the origin of the N-d Euclidean space. Note that \\(||\\mathbf{x}||_2^2 = \\mathbf{x}^T \\mathbf{x}\\) and that \\(||\\mathbf{x}||_2=1\\) for a unit vector; \\(L_1\\) norm: \\(||\\mathbf{x}||_1 = \\sum_i |x_i|\\) \\(L_0\\) norm: number of non-zero elements in the vector \\(\\mathbf{x}\\) \\(L_\\infty\\) norm: \\(||\\mathbf{x}||_2 = max |x_i|\\) Frobenious norm (for matrices): \\(||\\mathbf{A}||_F = \\sqrt{\\sum_{i,j} A_{i,j}^2}\\) ,","title":"Linear Algebra refresher"},{"location":"lectures/02_linalg/#linear-algebra-refresher","text":"In this lecture we will go through some of the key concepts of linear algebra and inverse problem theory that are required to develop the theories of the different machine learning algorithm presented in this course. This is not meant to be an exhaustive treatise and students are strongly advised to take the ErSE 213 - Inverse Problems prior to this course. Three key mathematical objects arise in the study of linear algebra: Scalars : \\(a \\in \\mathbb{R}\\) , a single number represented by a lower case italic letter; Vectors : \\(\\mathbf{x} = [x_1, x_2, ..., x_N]^T \\in \\mathbb{R}^N\\) , ordered collection of \\(N\\) numbers represented by a lower case bold letter; it is sometimes useful to extract a subset of elements by defining a set \\(\\mathbb{S}\\) and add it to as a superscript, \\(\\mathbf{x}_\\mathbb{S}\\) . As an example, given \\(\\mathbf{x} = [x_1, x_2, x_3, x_4, x_5, x_6]^T \\in \\mathbb{R}^6\\) and \\(\\mathbb{S} = {1, 3, 5}\\) we can define the vector \\(\\mathbf{x}_\\mathbb{S} = [x_1, x_3, x_5]\\) and its complementary vector \\(\\mathbf{x}_{-\\mathbb{S}} = [x_2, x_4, x_6]\\) Matrices : \\(\\mathbf{X} \\in \\mathbb{R}^{[N \\times M]}\\) , two dimensional collection of numbers represented by an upper case bold letter where \\(N\\) and \\(M\\) are referred to as the height and width of the matrix. More specifically a matrix can be written as \\[\\mathbf{X} = \\begin{bmatrix} x_{1,1} & x_{1,2} & x_{1,M} \\\\ ... & ... & ... 
\\\\ x_{N,1} & x_{N,2} & x_{N,M} \\end{bmatrix} \\] A matrix can be indexed by rows \\(\\mathbf{X}_{i, :}\\) (i-th row), by columns \\(\\mathbf{X}_{:, j}\\) (j-th column), and by element \\(\\mathbf{X}_{i, j}\\) (i-th row, j-th column). A number of useful operations that are commonly applied on vectors and matrices are now described: Transpose: \\(\\mathbf{Y} = \\mathbf{X}^T\\) , where \\(Y_{i, j} = X_{j, i}\\) Matrix plus vector: \\(\\mathbf{Y}_{[N \\times M]} = \\mathbf{X}_{[N \\times M]} + \\mathbf{z}_{[1 \\times M]}\\) , where \\(Y_{i, j} = X_{i, j} + z_{j}\\) ( \\(\\mathbf{z}\\) is added to each row of the matrix \\(\\mathbf{X}\\) ) Matrix-vector product: \\(\\mathbf{y}_{[N \\times 1]} = \\mathbf{A}_{[N \\times M]} \\mathbf{x}_{[M \\times 1]}\\) , where \\(y_i = \\sum_{j=1}^M A_{i, j} x_j\\) Matrix-vector product: \\(\\mathbf{y}_{[N \\times 1]} = \\mathbf{A}_{[N \\times M]} \\mathbf{x}_{[M \\times 1]}\\) , where \\(y_i = \\sum_{j=1}^M A_{i, j} x_j\\) Matrix-matrix product: \\(\\mathbf{C}_{[N \\times K]} = \\mathbf{A}_{[N \\times M]} \\mathbf{B}_{[M \\times K]}\\) , where \\(C_{i,k} = \\sum_{j=1}^M A_{i, j} B_{j, k}\\) Hadamart product (i.e., element-wise product): \\(\\mathbf{C}_{[N \\times M]} = \\mathbf{A}_{[N \\times M]} \\odot \\mathbf{B}_{[N \\times M]}\\) , where \\(C_{i,j} = A_{i, j} B_{i, j}\\) Dot product: \\(a = \\mathbf{x}_{[N \\times 1]}^T \\mathbf{y}_{[N \\times 1]} = \\sum_{i=1}^N x_i y_i\\) Identity matrix: \\(\\mathbf{I}_N = diag\\{\\mathbf{1}_N\\}\\) . Based on its definition, we have that \\(\\mathbf{I}_N \\mathbf{x} = \\mathbf{x}\\) and \\(\\mathbf{I}_N \\mathbf{X} = \\mathbf{X}\\) Inverse matrix: given \\(\\mathbf{y} = \\mathbf{A} \\mathbf{x}\\) , the inverse matrix of \\(\\mathbf{A}\\) is a matrix that satisfies the following equality \\(\\mathbf{A}^{-1} \\mathbf{A} = \\mathbf{I}_N\\) . We can finally write \\(\\mathbf{x} = \\mathbf{A}^{-1} \\mathbf{y}\\) Orthogonal vectors and matrices: given two vectors \\(\\mathbf{x}\\) and \\(\\mathbf{y}\\) , they are said to be orthogonal if \\(\\mathbf{y}^T \\mathbf{x} = 0\\) . Given two matrices \\(\\mathbf{X}\\) and \\(\\mathbf{Y}\\) , they are said to be orthogonal if \\(\\mathbf{Y}^T \\mathbf{X} = \\mathbf{I}_N\\) . Orthogonal matrices are especially interesting because their inverse is simply \\(\\mathbf{X}^{-1} = \\mathbf{X}^T\\) Matrix decomposition: like any scalar number can be decomposed into a product of prime numbers, a matrix \\(\\mathbf{A}\\) can also be decomposed into a combination of vectors (i.e., eigenvectors) and scalars (i.e., eigenvalues). Eigendecomposition: real-valued, square, symmetric matrices can be written as \\(\\mathbf{A} = \\mathbf{V} \\Lambda \\mathbf{V}^T = \\sum_i \\lambda_i \\mathbf{v}_i \\mathbf{v}_i^T\\) where \\(\\lambda_i\\) and \\(\\mathbf{v}_i\\) are the eigenvalues and eigenvectors of the matrix \\(\\mathbf{A}\\) , respectively. Eigenvectors are placed along the columns of the matrix \\(\\mathbf{V}\\) , which is an orthogonal matrix (i.e., \\(\\mathbf{V}^T=\\mathbf{V}^{-1}\\) ). Eigenvalues are placed along the diagonal of the matrix \\(\\Lambda=diag\\{\\lambda\\}\\) and tell us about the rank of the matrix, \\(rank(\\mathbf{A}) = \\# \\lambda \\neq 0\\) . A full rank matrix is matrix whose eigenvalues are all non-zero and can be inverted. 
In this case the inverse of \\(\\mathbf{A}\\) is \\(\\mathbf{A}^{-1}=\\mathbf{V}\\Lambda^{-1}\\mathbf{V}^T\\) Singular value decomposition (SVD): this is a more general decomposition which can be applied to real-valued, non-square, non-symmetric matrices. Singular vectors \\(\\mathbf{u}\\) and \\(\\mathbf{v}\\) and singular values \\(\\lambda\\) generalize the concept of eigenvectors and eigenvalues. The matrix \\(\\mathbf{A}\\) can be decomposed as \\(\\mathbf{A} = \\mathbf{U} \\mathbf{D} \\mathbf{V}^T\\) where \\(\\mathbf{D} = \\Lambda\\) for square matrices, \\(\\mathbf{D} = [\\Lambda \\; \\mathbf{0}]^T\\) for \\(N>M\\) and \\(\\mathbf{D} = [\\Lambda \\; \\mathbf{0}]\\) for \\(M>N\\) . Similar to the eigendecomposition, in this case the inverse of \\(\\mathbf{A}\\) is \\(\\mathbf{A}^{-1}=\\mathbf{V}\\mathbf{D}^{-1}\\mathbf{U}^T\\) Conditioning: in general, it refers to how fast a function \\(f(x)\\) changes given a small change in its input \\(x\\) . Similarly for a matrix, conditioning is linked to the curvature of its associated quadratic form \\(f(\\mathbf{A}) = \\mathbf{x}^T \\mathbf{A} \\mathbf{x}\\) and it generally indicates how rapidly this function changes as a function of \\(\\mathbf{x}\\) . It is defined as \\(cond(\\mathbf{A})=\\frac{|\\lambda_{max}|}{|\\lambda_{min}|}\\) . Norms : another important object that we will be using when defining cost functions for ML models is the norm. A norm is a function that maps a vector \\(\\mathbf{x} \\in \\mathbb{R}^N\\) to a scalar \\(d \\in \\mathbb{R}\\) and it can be loosely seen as a measure of the length of the vector (i.e., distance from the origin). In general, the \\(L^p\\) norm is defined as: \\[ ||\\mathbf{x}||_p = \\left( \\sum_i |x_i|^p \\right) ^{1/p} \\; p \\ge 0 \\] Popular norms are: Euclidean norm ( \\(L_2\\) ): \\(||\\mathbf{x}||_2 = \\sqrt{\\sum_i x_i^2}\\) , is a real distance of a vector from the origin of the N-d Euclidean space. Note that \\(||\\mathbf{x}||_2^2 = \\mathbf{x}^T \\mathbf{x}\\) and that \\(||\\mathbf{x}||_2=1\\) for a unit vector; \\(L_1\\) norm: \\(||\\mathbf{x}||_1 = \\sum_i |x_i|\\) \\(L_0\\) norm: number of non-zero elements in the vector \\(\\mathbf{x}\\) \\(L_\\infty\\) norm: \\(||\\mathbf{x}||_\\infty = max |x_i|\\) Frobenius norm (for matrices): \\(||\\mathbf{A}||_F = \\sqrt{\\sum_{i,j} A_{i,j}^2}\\) ,","title":"Linear Algebra refresher"},{"location":"lectures/02_prob/","text":"Probability refresher In order to develop various machine learning algorithms (especially towards the end of the course when we will focus on generative modelling) we need to be familiar with some basic mathematical tools from: Probability : mathematical framework to handle uncertain statements; Information Theory : scientific field focused on the quantification of the amount of uncertainty in a probability distribution. Probability Random Variable : a variable whose value is unknown, all we know is that it can take on different values with a given probability. It is generally defined by an uppercase letter \\(X\\) , whilst the values it can take are in lowercase letter \\(x\\) . (Note: strictly speaking, a random variable is not really a variable but a function that maps outcomes from the sample space to values.)
Probability distribution : description of how likely a variable \\(x\\) is, \\(P(x)\\) (or \\(p(x)\\) ). Depending on the type of variable we have: Discrete distributions : \\(P(X)\\) called Probability Mass Function (PMF) and \\(X\\) can take on a discrete number of states N. A classical example is represented by a coin where N=2 and \\(X={0,1}\\) . For a fair coin, \\(P(X=0)=0.5\\) and \\(P(X=1)=0.5\\) . Continuous distributions : \\(p(X)\\) called Probability Density Function (PDF) and \\(X\\) can take on any value from a continuous space (e.g., \\(\\mathbb{R}\\) ). A classical example is represented by the gaussian distribution where \\(x \\in (-\\infty, \\infty)\\) . A probability distribution must satisfy the following conditions: each of the possible states must have probability bounded between 0 (no occurrance) and 1 (certainty of occurcence): \\(\\forall x \\in X, \\; 0 \\leq P(x) \\leq 1\\) (or \\(p(x) \\geq 0\\) , where the upper bound is removed because of the fact that the integration step \\(\\delta x\\) in the second condition can be smaller than 1: \\(p(X=x) \\delta x <=1\\) ); the sum of the probabilities of all possible states must equal to 1: \\(\\sum_x P(X=x)=1\\) (or \\(\\int p(X=x)dx=1\\) ). Joint and Marginal Probabilities : assuming we have a probability distribution acting over a set of variables (e.g., \\(X\\) and \\(Y\\) ) we can define Joint distribution : \\(P(X=x, Y=y)\\) (or \\(p(X=x, Y=y)\\) ); Marginal distribution : \\(P(X=x) = \\sum_{y \\in Y} P(X=x, Y=y)\\) (or \\(p(X=x) = \\int P(X=x, Y=y) dy\\) ), which is the probability spanning one or a subset of the original variables; Conditional Probability : provides us with the probability of an event given the knowledge that another event has already occurred \\[ P(Y=y | X=x) = \\frac{P(X=x, Y=y)}{P(X=x)} \\] This formula can be used recursively to define the joint probability of N variables as product of conditional probabilities (so-called Chain Rule of Probability ) \\[ P(x_1, x_2, ..., x_N) = P(x_1) \\prod_{i=2}^N P(x_i | x_1, x_2, x_{i-1}) \\] Independence and Conditional Independence : Two variables X and Y are said to be independent if \\[ P(X=x, Y=y) = P(X=x) P(Y=y) \\] If both variables are conditioned on a third variable Z (i.e., P(X=x, Y=y | Z=z)), they are said to be conditionally independent if \\[ P(X=x, Y=y | Z=z) = P(X=x | Z=z) P(Y=y| Z=z) \\] Bayes Rule : probabilistic way to update our knowledge of a certain phenomenon (called prior) based on a new piece of evidence (called likelihood): \\[ P(x | y) = \\frac{P(y|x) P(x)}{P(y)} \\] where \\(P(y) = \\sum_x P(x, y) = \\sum_x P(y |x) P(x)\\) is called the evidence. In practice, it is infeasible to compute this quantity as it would require evaluating \\(y\\) for all possible combination of \\(x\\) (we will see later how it is possible to devise methods for which \\(P(y)\\) can be ignored). Mean (or Expectation) : Given a function \\(f(x)\\) where \\(x\\) is a random variable with probability \\(P(x)\\) , its average or mean value is defined as follows for the discrete case: \\[ \\mu = E_{x \\sim P} [f(x)] = \\sum_x P(x) f(x) \\] and for the continuous case \\[ \\mu = E_{x \\sim p} [f(x)] = \\int p(x) f(x) dx \\] In most Machine Learning applications, we do not have knowledge of the full distribution to evaluate the mean, rather we have access to N equi-probable samples that we assume are drawn from the underlying distribution. 
We can approximate the mean via the Sample Mean : \\[ \\mu \\approx \\sum_i \\frac{1}{N} f(x_i) \\] Variance (and Covariance) : Given a function \\(f(x)\\) where \\(x\\) is a random variable with probability \\(P(x)\\) , it represents a measure of how much the values of the function vary from the mean: \\[ \\sigma^2 = E_{x \\sim p} [(f(x)-\\mu)^2] \\] Covariance is the extension of the variance to two or more variables, and it tells how much these variables are related to each other: \\[ Cov(f(x), g(y)) = E_{x,y \\sim p} [(f(x)-\\mu_x)(f(y)-\\mu_y)] \\] Here, \\(Cov \\rightarrow 0\\) indicates no correlation between the variables, \\(Cov > 0\\) denotes positive correlation and \\(Cov < 0\\) denotes negative correlation. It is worth remembering that covariance is linked to correlation via: \\[ Corr_{x,y} = \\frac{Cov_{x,y}}{\\sigma_x \\sigma_y} \\] Finally, the covariance of a multidimensional vector \\(\\textbf{x} \\in \\mathbb{R}^n\\) is defined as: \\[ Cov_{i,j} = Cov(x_i, x_j), \\qquad Cov_{i,i} = \\sigma^2_i \\] Distributions : some of the most used probability distributions in Machine Learning are listed in the following. 1. Bernoulli : single binary variable \\(x \\in \\{0,1\\}\\) (commonly used to describe the toss of a coin). It is defined as \\[ P(x=1)=\\phi, \\; P(x=0)=1-\\phi, \\; \\phi \\in [0,1] \\] with probability: \\[ P(x)=\\phi^x(1-\\phi)^{1-x} = \\phi x + (1-\\phi)(1-x) \\] and moments equal to: \\[ E[x] = \\phi, \\; \\sigma^2 = \\phi (1-\\phi) \\] 2. Multinoulli (or categorical) : extension of Bernoulli distribution to K different states \\[ \\textbf{P} \\in [0,1]^{K-1}; \\; P_k = 1- \\textbf{1}^T\\textbf{P}, \\; \\textbf{1}^T\\textbf{P} \\leq 1 \\] 3. Gaussian : most popular choice for continuous random variables (most distributions are close to a normal distribution and the central limit theorem states that any sum of independent variables is approximately normal) \\[ x \\sim \\mathcal{N}(\\mu, \\sigma^2) \\rightarrow p(x) = \\frac{1}{\\sqrt{2 \\pi} \\sigma} e^{-\\frac{(x-\\mu)^2}{2\\sigma^2}} = \\sqrt{\\frac{\\beta}{2 \\pi}} e^{-\\frac{\\beta(x-\\mu)^2}{2}} \\] where the second definition uses the precision \\(\\beta=\\frac{1}{\\sigma^2} \\in (0, \\infty)\\) to avoid possible division by zero. A third way to parametrize the gaussian probability uses \\(2 \\delta = log \\sigma^2 \\in (-\\infty, \\infty)\\) , which has the further benefit of being unbounded (compared to the variance, which must be positive) and can therefore be easily optimized for during training. 4. Multivariate Gaussian : extension of Gaussian distribution to a multidimensional vector \\(\\textbf{x} \\in \\mathbb{R}^n\\) \\[ \\textbf{x} \\sim \\mathcal{N}(\\boldsymbol\\mu, \\boldsymbol\\Sigma) \\rightarrow p(\\textbf{x}) = \\sqrt{\\frac{1}{(2 \\pi)^n det \\boldsymbol\\Sigma}} e^{-\\frac{1}{2}(\\textbf{x}- \\boldsymbol\\mu)^T\\boldsymbol\\Sigma^{-1}(\\textbf{x}- \\boldsymbol\\mu)}= \\sqrt{\\frac{det \\boldsymbol\\beta}{(2 \\pi)^n}} e^{-\\frac{1}{2}(\\textbf{x}- \\boldsymbol\\mu)^T\\boldsymbol\\beta(\\textbf{x}- \\boldsymbol\\mu)} \\] where again \\(\\boldsymbol\\beta =\\boldsymbol\\Sigma^{-1}\\) . In ML applications, \\(\\boldsymbol\\beta\\) is generally assumed diagonal (mean-field approximation) or even isotropic ( \\(\\boldsymbol\\beta = \\beta \\textbf{I}_n\\) ) 5.
Mixture of distributions : any smooth probability density function can be expressed as a weighted sum of simpler distributions \\[ P(x) = \\sum_i P(c=i) P(x | c=i) \\] where \\(c\\) is a categorical variable with Multinoulli distribution and plays the role of a latent variable , a variable that cannot be directly observed but is related to \\(x\\) via the joint distribution: \\[ P(x,c) = P(x | c) P(c), \\; P(x) = \\sum_c P(x|c)P(c) \\] A special case is the so-called Gaussian Mixture where each probability \\(P(x|c=i) \\sim \\mathcal{N}(\\mu_i, \\sigma_i^2)\\) . Information theory In Machine Learning, we are sometimes interested in quantifying how much information is contained in a signal or how much two signals (or probability distributions) differ from each other. A large body of literature exists in the context of telecommunications, where it is necessary to study how to transmit signals for a discrete alphabet over a noisy channel. More specifically, a code must be designed so as to allow sending the least amount of bits for the most amount of useful information. Extension of such theory to continuous variables is also available and more commonly used in the context of ML systems. Self-information : a measure of information in such a way that likely events have low information content, less likely events have higher information content and independent events have additive information: \\[ I(x) = - log_eP(x) \\] such that for \\(P(x) \\rightarrow 0\\) (unlikely event), \\(I \\rightarrow \\infty\\) and for \\(P(x) \\rightarrow 1\\) (likely event), \\(I \\rightarrow 0\\) . Shannon entropy : extension of self-information to continuous variables, representing the expected amount of information in an event \\(x\\) drawn from a probability \\(P\\) : \\[ H(x) = E_{x \\sim P} [I(x)] = - E_{x \\sim P} [log_eP(x)] \\] Kullback-Leibler divergence : extension of entropy to 2 variables with probability \\(P\\) and \\(Q\\) , respectively. It is used to measure their distance \\[ D_{KL}(P||Q) = E_{x \\sim P} [log\\frac{P(x)}{Q(x)}] = E_{x \\sim P} [logP(x)-logQ(x)] = E_{x \\sim P} [logP(x)] -E_{x \\sim P}[logQ(x)] \\] which is \\(D_{KL}(P||Q)=0\\) only when \\(P=Q\\) and grows the further away the two probabilities are. Finally, note that this is not a real distance in that \\(D_{KL}(P||Q) \\neq D_{KL}(Q|| P)\\) (non-symmetric), therefore the direction matters and must be chosen wisely when devising optimization schemes with KL divergence in the loss function as we will discuss in more detail later.","title":"Probability refresher"},{"location":"lectures/02_prob/#probability-refresher","text":"In order to develop various machine learning algorithms (especially towards the end of the course when we will focus on generative modelling) we need to be familiar with some basic mathematical tools from: Probability : mathematical framework to handle uncertain statements; Information Theory : scientific field focused on the quantification of the amount of uncertainty in a probability distribution.","title":"Probability refresher"},{"location":"lectures/02_prob/#probability","text":"Random Variable : a variable whose value is unknown, all we know is that it can take on different values with a given probability.
It is generally defined by an uppercase letter \\(X\\) , whilst the values it can take are in lowercase letter \\(x\\) . (Note: Actually, random variable is not really a variable. To be exact, random variable is actually a function that maps from sample space to the probability space.) Probability distribution : description of how likely a variable \\(x\\) is, \\(P(x)\\) (or \\(p(x)\\) ). Depending on the type of variable we have: Discrete distributions : \\(P(X)\\) called Probability Mass Function (PMF) and \\(X\\) can take on a discrete number of states N. A classical example is represented by a coin where N=2 and \\(X={0,1}\\) . For a fair coin, \\(P(X=0)=0.5\\) and \\(P(X=1)=0.5\\) . Continuous distributions : \\(p(X)\\) called Probability Density Function (PDF) and \\(X\\) can take on any value from a continuous space (e.g., \\(\\mathbb{R}\\) ). A classical example is represented by the gaussian distribution where \\(x \\in (-\\infty, \\infty)\\) . A probability distribution must satisfy the following conditions: each of the possible states must have probability bounded between 0 (no occurrance) and 1 (certainty of occurcence): \\(\\forall x \\in X, \\; 0 \\leq P(x) \\leq 1\\) (or \\(p(x) \\geq 0\\) , where the upper bound is removed because of the fact that the integration step \\(\\delta x\\) in the second condition can be smaller than 1: \\(p(X=x) \\delta x <=1\\) ); the sum of the probabilities of all possible states must equal to 1: \\(\\sum_x P(X=x)=1\\) (or \\(\\int p(X=x)dx=1\\) ). Joint and Marginal Probabilities : assuming we have a probability distribution acting over a set of variables (e.g., \\(X\\) and \\(Y\\) ) we can define Joint distribution : \\(P(X=x, Y=y)\\) (or \\(p(X=x, Y=y)\\) ); Marginal distribution : \\(P(X=x) = \\sum_{y \\in Y} P(X=x, Y=y)\\) (or \\(p(X=x) = \\int P(X=x, Y=y) dy\\) ), which is the probability spanning one or a subset of the original variables; Conditional Probability : provides us with the probability of an event given the knowledge that another event has already occurred \\[ P(Y=y | X=x) = \\frac{P(X=x, Y=y)}{P(X=x)} \\] This formula can be used recursively to define the joint probability of N variables as product of conditional probabilities (so-called Chain Rule of Probability ) \\[ P(x_1, x_2, ..., x_N) = P(x_1) \\prod_{i=2}^N P(x_i | x_1, x_2, x_{i-1}) \\] Independence and Conditional Independence : Two variables X and Y are said to be independent if \\[ P(X=x, Y=y) = P(X=x) P(Y=y) \\] If both variables are conditioned on a third variable Z (i.e., P(X=x, Y=y | Z=z)), they are said to be conditionally independent if \\[ P(X=x, Y=y | Z=z) = P(X=x | Z=z) P(Y=y| Z=z) \\] Bayes Rule : probabilistic way to update our knowledge of a certain phenomenon (called prior) based on a new piece of evidence (called likelihood): \\[ P(x | y) = \\frac{P(y|x) P(x)}{P(y)} \\] where \\(P(y) = \\sum_x P(x, y) = \\sum_x P(y |x) P(x)\\) is called the evidence. In practice, it is infeasible to compute this quantity as it would require evaluating \\(y\\) for all possible combination of \\(x\\) (we will see later how it is possible to devise methods for which \\(P(y)\\) can be ignored). 
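As a small numerical sketch of Bayes rule for a discrete binary variable (the prior and likelihood values below are made up purely for illustration):

p_x = {0: 0.7, 1: 0.3}            # prior P(x) (illustrative values)
p_y1_given_x = {0: 0.1, 1: 0.8}   # likelihood P(y=1 | x) (illustrative values)

p_y1 = sum(p_y1_given_x[x] * p_x[x] for x in (0, 1))               # evidence P(y=1)
posterior = {x: p_y1_given_x[x] * p_x[x] / p_y1 for x in (0, 1)}   # P(x | y=1)
print(posterior)                  # the posterior probabilities sum to 1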
Mean (or Expectation) : Given a function \\(f(x)\\) where \\(x\\) is a random variable with probability \\(P(x)\\) , its average or mean value is defined as follows for the discrete case: \\[ \\mu = E_{x \\sim P} [f(x)] = \\sum_x P(x) f(x) \\] and for the continuous case \\[ \\mu = E_{x \\sim p} [f(x)] = \\int p(x) f(x) dx \\] In most Machine Learning applications, we do not have knowledge of the full distribution to evaluate the mean, rather we have access to N equi-probable samples that we assume are drawn from the underlying distribution. We can approximate the mean via the Sample Mean : \\[ \\mu \\approx \\sum_i \\frac{1}{N} f(x_i) \\] Variance (and Covariance) : Given a function \\(f(x)\\) where \\(x\\) is a random variable with probability \\(P(x)\\) , it represents a measure of how much the values of the function vary from the mean: \\[ \\sigma^2 = E_{x \\sim p} [(f(x)-\\mu)^2] \\] Covariance is the extension of the variance to two or more variables, and it tells how much these variables are related to each other: \\[ Cov(f(x), g(y)) = E_{x,y \\sim p} [(f(x)-\\mu_x)(f(y)-\\mu_y)] \\] Here, \\(Cov \\rightarrow 0\\) indicates no correlation between the variables, \\(Cov > 0\\) denotes positive correlation and \\(Cov < 0\\) denotes negative correlation. It is worth remembering that covariance is linked to correlation via: \\[ Corr_{x,y} = \\frac{Cov_{x,y}}{\\sigma_x \\sigma_y} \\] Finally, the covariance of a multidimensional vector \\(\\textbf{x} \\in \\mathbb{R}^n\\) is defined as: \\[ Cov_{i,j} = Cov(x_i, x_j), \\qquad Cov_{i,i} = \\sigma^2_i \\] Distributions : some of the most used probability distributions in Machine Learning are listed in the following. 1. Bernoulli : single binary variable \\(x \\in \\{0,1\\}\\) (commonly used to describe the toss of a coin). It is defined as \\[ P(x=1)=\\phi, \\; P(x=0)=1-\\phi, \\; \\phi \\in [0,1] \\] with probability: \\[ P(x)=\\phi^x(1-\\phi)^{1-x} = \\phi x + (1-\\phi)(1-x) \\] and momentum equal to: \\[ E[x] = 1, \\; \\sigma^2 = \\phi (1-\\phi) \\] 2. Multinoulli (or categorical) : extension of Bernoulli distribution to K different states \\[ \\textbf{P} \\in [0,1]^{K-1}; \\; P_k = 1- \\textbf{1}^T\\textbf{P}, \\; \\textbf{1}^T\\textbf{P} \\leq 1 \\] 3. Gaussian : most popular choice for continuous random variables (most distributions are close to a normal distribution and the central limit theorem states that any sum of independent variables is approximately normal) \\[ x \\sim \\mathcal{N}(\\mu, \\sigma^2) \\rightarrow p(x) = \\frac{1}{\\sqrt{2 \\pi} \\sigma} e^{-\\frac{(x-\\mu)^2}{2\\sigma^2}} = \\sqrt{\\frac{\\beta}{2 \\pi}} e^{-\\frac{\\beta(x-\\mu)^2}{2}} \\] where the second definition uses the precision \\(\\beta=\\frac{1}{\\sigma^2} \\in (0, \\infty)\\) to avoid possible division by zero. A third way to parametrize the gaussian probability uses \\(2 \\delta = log \\sigma^2 \\in (-\\infty, \\infty)\\) which has the further benefit to be unbounded and can be easily optimized for during training. which is unbounded (compared to the variance that must be positive) 4. 
Multivariate Gaussian : extension of Gaussian distribution to a multidimensional vector \\(\\textbf{x} \\in \\mathbb{R}^n\\) \\[ \\textbf{x} \\sim \\mathcal{N}(\\boldsymbol\\mu, \\boldsymbol\\Sigma) \\rightarrow p(\\textbf{x}) = \\sqrt{\\frac{1}{(2 \\pi)^n det \\boldsymbol\\Sigma}} e^{-\\frac{1}{2}(\\textbf{x}- \\boldsymbol\\mu)^T\\boldsymbol\\Sigma^{-1}(\\textbf{x}- \\boldsymbol\\mu)}= \\sqrt{\\frac{det \\boldsymbol\\beta}{(2 \\pi)^n}} e^{-\\frac{1}{2}(\\textbf{x}- \\boldsymbol\\mu)^T\\boldsymbol\\beta(\\textbf{x}- \\boldsymbol\\mu)} \\] where again \\(\\boldsymbol\\beta =\\boldsymbol\\Sigma^{-1}\\) . In ML applications, \\(\\boldsymbol\\beta\\) is generally assumed diagonal (mean-field approximation) or even isotropic ($\\boldsymbol\\beta = \\beta \\textbf{I}_n) 5. Mixture of distributions : any smooth probability density function can be expressed as a weighted sum of simpler distributions \\[ P(x) = \\sum_i P(c=i) P(x | c=i) \\] where \\(c\\) is a categorical variable with Multinoulli distribution and plays the role of a latent variable , a variable that cannot be directly observed but is related to \\(x\\) via the joint distribution: \\[ P(x,c) = P(x | c) P(c), \\; P(x) = \\sum_c P(x|c)P(c) \\] A special case is the so-called Gaussian Mixture where each probability \\(P(x|c=i) \\sim \\mathcal{N}(\\mu_i, \\sigma_i^2)\\) .","title":"Probability"},{"location":"lectures/02_prob/#information-theory","text":"In Machine Learning, we are sometimes interested to quantify how much information is contained in a signal or how much two signals (or probability distributions) differ from each other. A large body of literature exists in the context of telecommunications, where it is necessary to study how to transmit signals for a discrete alphabet over a noisy channel. More specifically, a code must be designed so to allow sending the least amount of bits for the most amount of useful information. Extension of such theory to continuous variables is also available and more commonly used in the context of ML systems. Self-information : a measure of information in such a way that likely events have low information content, less likely events have higher information content and independent events have additive information: \\[ I(x) = - log_eP(x) \\] such that for \\(P(x) \\rightarrow 0\\) (unlikely event), \\(I \\rightarrow \\infty\\) and for \\(P(x) \\rightarrow 1\\) (likely event), \\(I \\rightarrow 0\\) . Shannon entropy : extension of self-information to continuous variables, representing the expected amount of information in an event \\(x\\) drawn from a probability $P: \\[ H(x) = E_{x \\sim P} [I(x)] = - E_{x \\sim P} [log_eP(x)] \\] Kullback-Leibler divergence : extension of entropy to 2 variables with probability \\(P\\) and \\(Q\\) , respectively. It is used to measure their distance \\[ D_{KL}(P||Q) = E_{x \\sim P} [log\\frac{P(x)}{Q(x)}] = E_{x \\sim P} [logP(x)-logQ(x)] = E_{x \\sim P} [logP(x)] -E_{x \\sim P}[logQ(x)] \\] which is \\(D_{KL}(P||Q)=0\\) only when \\(P=Q\\) and grows the further away the two probabilities are. 
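A quick numerical sketch of the KL divergence between two discrete distributions (values chosen purely for illustration), assuming NumPy; it also shows that swapping the arguments changes the result:

import numpy as np

P = np.array([0.6, 0.3, 0.1])
Q = np.array([0.4, 0.4, 0.2])

def kl(p, q):
    # D_KL(p||q) = E_{x~p}[log p(x) - log q(x)] for discrete distributions
    return np.sum(p * (np.log(p) - np.log(q)))

print(kl(P, Q), kl(Q, P))   # the two values differ (KL is not symmetric)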
Finally, note that this is not a real distance in that \\(D_{KL}(P||Q) \\neq D_{KL}(Q|| P)\\) (non-symmetric), therefore the direction matter and it must be chosen wisely when devising optimization schemes with KL divergence in the loss function as we will discuss in more details later.","title":"Information theory"},{"location":"lectures/03_gradopt/","text":"Gradient-based optimization After reviewing some of the basic concepts of linear algebra and probability that we will be using during this course, we are now in a position to start our journey in the field of learning algorithms . Any learning algorithm, no matter its level of complexity, is composed of 4 key elements: Dataset : a collection of many examples (sometimes referred to as samples of data points) that represents the experience we wish our machine learning algorithm to learn from. More specifically, the dataset is defined as: $$ \\mathbf{x} = [x_1, x_2, ..., x_{N_f}]^T \\quad \\mathbf{X} = [\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, ..., \\mathbf{x}^{(N_s)}] $$ and $$ \\mathbf{y} = [y_1, y_2, ..., y_{N_t}]^T \\quad \\mathbf{Y} = [\\mathbf{y}^{(1)}, \\mathbf{y}^{(2)}, ..., \\mathbf{y}^{(N_s)}] $$ where \\(N_f\\) and \\(N_t\\) are the number of features and targets for each sample in the dataset, respectively, and \\(N_s\\) is the number of samples. Model : a mathematical relation between the input (or features) and output (or target) of our dataset. It is generally parametrized as function \\(f\\) of a number of free parameters \\(\\theta\\) which we want the learning algorithm to estimate given a task and a measure of performance, and we write it as $$ \\mathbf{y} = f_\\theta(\\mathbf{x}) $$ Loss (and cost) function : quantitative measure of the performance of the learning algorithm, which we wish to minimize (or maximize) in order to make accurate predictions on the unseen data. It is written as $$ J_\\theta = \\frac{1}{N_s} \\sum_{j=1}^{N_s} \\mathscr{L} (\\mathbf{y}^{(j)}, f_\\theta(\\mathbf{x}^{(j)})) $$ where \\(\\mathscr{L}\\) is the loss function for each input-output pair and \\(J\\) is the overall cost function. Optimization algorithm : mathematical method that aims to drive down (up) the cost function by modifying its free-parameters \\(\\theta\\) : $$ \\hat{\\theta} = \\underset{\\theta} {\\mathrm{argmin}} \\; J_\\theta $$ Optimization algorithms are generally divided into two main families: gradient-based (or local) and gradient-free (or global). Gradient-based optimization is by far the most popular way to train NNs and will be discussed in more details below. Gradient-descent algorithms The simplest of gradient-based methods is the so-called Gradient-descent algorithms (e.g., steepest descent algorithm). As the name implies, this algorithm uses local gradient information of the functional to minimize/maximize to move towards its global mimimum/maximum as depicted in the figure below. More formally, given a functional \\(J_\\theta\\) and its gradient \\(\\nabla J = \\frac{\\delta J}{\\delta \\theta}\\) , the (minimization) algorithm can be written as: Initialization: choose \\(\\theta \\in \\mathbb{R}\\) For \\(i=0,...N-1\\) ; Compute update direction \\(d_i = -\\nabla J |_{\\theta_i}\\) Estimate step-lenght \\(\\alpha_i\\) Update \\(\\theta_{i+1} = \\theta_{i} + \\alpha_i d_i\\) Note that the maximization version of this algorithm simply swaps the sign in the update direction (first equation of the algorithm). 
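As a simple illustration, the sketch below (a minimal Python implementation, assuming a hypothetical one-dimensional functional J(theta) = (theta - 3)^2 and a constant step-length) applies exactly this update rule.

```python
# hypothetical 1D functional J(theta) = (theta - 3)^2 and its analytical gradient
J = lambda theta: (theta - 3.0) ** 2
gradJ = lambda theta: 2.0 * (theta - 3.0)

theta = 0.0   # initialization
alpha = 0.1   # constant step-length
for i in range(50):
    d = -gradJ(theta)          # update direction (steepest descent)
    theta = theta + alpha * d  # parameter update

print(theta, J(theta))  # theta approaches the minimizer theta = 3
```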
Moreover, the proposed algorithm can be easily extended to N-dimensional model vectors \\(\\theta=[\\theta_1, \\theta_2, ..., \\theta_N]\\) by defining the following gradient vector \\(\\nabla J=[\\delta J / \\delta\\theta_1, \\delta J / \\delta\\theta_2, ..., \\delta J/ \\delta\\theta_N]\\) . Step length selection The choice of the step-length has tremendous impact on the performance of the algorithm and its ability to converge fast (i.e., in a small number of iterations) to the optimal solution. The most used selection rules are: Constant: the step size is fixed to a constant value \\(\\alpha_i=\\hat{\\alpha}\\) . This is the most common situation that we will encounter when training neural networks. In practice, some adaptive schemes based on the evolution of the train (or validation) norm are generally adopted, but we will still refer to this case as constant step size; Exact line search: at each iteration, \\(\\alpha_i\\) is chosen such that it minimizes \\(J(\\theta_{i} + \\alpha_i d_i)\\) . This is the most commonly used approach when dealing with linear systems of equations. Backtracking \"Armijo\" line search: at each iteration, given a parameter \\(\\mu \\in (0,1)\\) , start with \\(\\alpha_i=1\\) and reduce it by a factor of 2 until the following condition is satisfied: \\(J(\\theta_i) - J(\\theta_{i} + \\alpha_i d_i) \\ge -\\mu \\alpha_i \\nabla J^T d_i\\) Second-order optimization Up until now we have discussed first-order optimization techniques that rely on the ability to evaluate the function \\(J\\) and its gradient \\(\\nabla J\\) . Second-order optimization method go one step beyond in that they use information from both the local slope and curvature of the function \\(J\\) . When a function has small curvature, the function and its tangent line are very similar: the gradient alone is therefore able to provide a good local approximation of the function (i.e., \\(J(\\theta+\\delta \\theta)\\approx J(\\theta) + \\nabla J \\delta \\theta\\) ). On the other hand, if the curvature of the function of large, the function and its tangent line start to differ very quickly away from the linearization point. The gradient alone is not able anymore to provide a good local approximation of the function (i.e., \\(J(\\theta+\\delta \\theta)\\approx J(\\theta) + \\nabla J \\delta \\theta + \\nabla^2 J \\delta \\theta^2\\) ). Let's start again from the one-dimensional case and the well-known Newton's method . This method is generally employed to find the zeros of a function: \\(\\theta: J(\\theta)=0\\) and can be written as: \\[ \\theta_{i+1} = \\theta_i - \\frac{J(\\theta)|_{\\theta_i}}{J'(\\theta)|_{\\theta_i}} \\] which can be easily derived from the Taylor expansion of \\(J(\\theta)\\) around \\(\\theta_{i+1}\\) . If we remember that finding the minimum (or maximum) of a function is equivalent to find the zeros of its first derivative ( \\(\\theta: min_\\theta J(\\theta) \\leftrightarrow \\theta: J'(\\theta)=0\\) ), the Newton's method can be written as: \\[ \\theta_{i+1} = \\theta_i - \\frac{J'(\\theta)|_{\\theta_i}}{J''(\\theta)|_{\\theta_i}} \\] In order to be able to discuss second-order optimization algorithms for the multi-dimensional case, let's first introduce the notion of Jacobian : \\[\\mathbf{y} = J(\\boldsymbol\\theta) \\rightarrow \\mathbf{J} = \\begin{bmatrix} \\frac{\\partial J_1}{\\partial \\theta_1} & \\frac{\\partial J_1}{\\partial \\theta_2} & ... & \\frac{\\partial J_1}{\\partial \\theta_M} \\\\ ... & ... & ... & ... 
\\\\ \\frac{\\partial J_N}{\\partial \\theta_1} & \\frac{\\partial J_N}{\\partial \\theta_2} & ... & \\frac{\\partial J_N}{\\partial \\theta_M} \\\\ \\end{bmatrix} \\in \\mathbb{R}^{[N \\times M]} \\] Through the notion of Jacobian, we can define the Hessian as the Jacobian of the gradient vector \\[\\mathbf{H} = \\nabla (\\nabla J) = \\begin{bmatrix} \\frac{\\partial^2 J}{\\partial \\theta_1^2} & \\frac{\\partial^2 J}{\\partial \\theta_1 \\partial \\theta_2} & ... & \\frac{\\partial^2 J}{\\partial \\theta_1\\partial \\theta_M} \\\\ ... & ... & ... & ... \\\\ \\frac{\\partial^2 J}{\\partial \\theta_M \\partial \\theta_1} & \\frac{\\partial^2 J}{\\partial \\theta_M \\partial \\theta_2} & ... & \\frac{\\partial^2 J}{\\partial \\theta_M^2} \\\\ \\end{bmatrix} \\in \\mathbb{R}^{[M \\times M]} \\] where we note that when the second derivatives of \\(J\\) are continuous, \\(\\partial^2 J / \\partial \\theta_i \\partial \\theta_j = \\partial^2 J / \\partial \\theta_j \\partial \\theta_i\\) , and \\(\\mathbf{H}\\) is symmetric. The Newton method for the multi-dimensional case becomes: \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_i - \\mathbf{H}^{-1}\\nabla J \\] Approximate versions of the Newton method have been developed over the years, mostly based on the idea that inverting \\(\\mathbf{H}\\) is sometimes a prohibitive task. Such methods, generally referred to as Quasi-Newton methods, attempt to approximate the Hessian (or its inverse) using the collection of gradient information from previous iterations. BFGS and its limited-memory version L-BFGS are examples of such a kind. Due to their computational cost (as well as the lack of solid theories for their use in conjunction with approximate gradients), these methods are not yet commonly used by the machine learning community to optimize the parameters of NNs in deep learning. Stochastic-gradient descent (SGD) To conclude, we look again at gradient-based iterative solvers and more specifically in the context of finite-sum functionals of the kind that we will be encountering when training neural networks: \\[ J_\\theta = \\frac{1}{N_s} \\sum_{i=1}^{N_s} \\mathscr{L} (\\mathbf{y}^{(i)}, f_\\theta(\\mathbf{x}^{(i)})) \\] where the summation here is performed over training data. Batched gradient descent The solvers that we have considered so far generally update the model parameters \\(\\boldsymbol\\theta\\) using the full gradient (i.e., over the entire batch of samples): \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} - \\alpha_i \\nabla J = \\boldsymbol\\theta_{i} - \\frac{\\alpha_i}{N_s} \\sum_{j=1}^{N_s} \\nabla \\mathscr{L}_j \\] A limitation of such an approach is that, if we have a very large number of training samples, the computational cost of computing the full gradient is very high and, when some of the samples are similar, their gradient contribution is somewhat redundant. Stochastic gradient descent In this case, we take a completely opposite approach to computing the gradient. More specifically, a single training sample is considered at each iteration: \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} - \\alpha_i \\nabla \\mathscr{L}_j \\] The choice of the training sample \\(j\\) at each iteration is generally completely random and this is repeated once all training data have been used at least once (a full pass generally referred to as an epoch ).
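A minimal sketch of this procedure is given below, assuming a hypothetical scalar linear model with a squared per-sample loss and a small synthetic dataset; at each step a single, randomly chosen sample is used to update the parameters.

```python
import numpy as np

rng = np.random.default_rng(0)

# hypothetical synthetic dataset: y = 2*x + 1 + noise
x = rng.uniform(-1, 1, size=100)
y = 2.0 * x + 1.0 + 0.1 * rng.standard_normal(100)

w, b = 0.0, 0.0   # model parameters
alpha = 0.05      # step-length

for epoch in range(20):
    for j in rng.permutation(len(x)):    # visit single samples in random order
        err = (w * x[j] + b) - y[j]      # prediction error on sample j
        w -= alpha * 2.0 * err * x[j]    # gradient of the per-sample squared loss
        b -= alpha * 2.0 * err

print(w, b)  # approximately 2 and 1
```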
In this case, the gradient may be noisy because the gradient of a single sample is a very rough approximation of the total cost function \\(J\\) : such a high variance of gradients requires lowering the step-size \\(\\alpha\\) leading to slow convergence. Mini-batched gradient descent A more commonly used strategy lies in between the batched and stochastic gradient descent algorithms uses batches of training samples to compute the gradient at each iteration. More specifically given a batch of \\(N_b\\) samples, the update formula can be written as: \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} - \\frac{\\alpha_i}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j \\] and similarly to the stochastic gradient descent, the batches of data are chosen at random and this is repeated as soon as all data are used once in the training loop. Whilst the choice of the size of the batch depends on many factors (e.g., overall size of the dataset, variety of training samples), common batch sizes in training of NNs are from around 50 to 256 (unless memory requirements kick in leading to even small batch sizes). Additional readings the following blog post for a more detailed overview of the optimization algorithms discussed here. Note that in one of our future lectures we will also look again at the optimization algorithms and more specifically discuss strategies that allow overcoming some of the limitations of standard SGD in this lecture .","title":"Gradient-based optimization"},{"location":"lectures/03_gradopt/#gradient-based-optimization","text":"After reviewing some of the basic concepts of linear algebra and probability that we will be using during this course, we are now in a position to start our journey in the field of learning algorithms . Any learning algorithm, no matter its level of complexity, is composed of 4 key elements: Dataset : a collection of many examples (sometimes referred to as samples of data points) that represents the experience we wish our machine learning algorithm to learn from. More specifically, the dataset is defined as: $$ \\mathbf{x} = [x_1, x_2, ..., x_{N_f}]^T \\quad \\mathbf{X} = [\\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, ..., \\mathbf{x}^{(N_s)}] $$ and $$ \\mathbf{y} = [y_1, y_2, ..., y_{N_t}]^T \\quad \\mathbf{Y} = [\\mathbf{y}^{(1)}, \\mathbf{y}^{(2)}, ..., \\mathbf{y}^{(N_s)}] $$ where \\(N_f\\) and \\(N_t\\) are the number of features and targets for each sample in the dataset, respectively, and \\(N_s\\) is the number of samples. Model : a mathematical relation between the input (or features) and output (or target) of our dataset. It is generally parametrized as function \\(f\\) of a number of free parameters \\(\\theta\\) which we want the learning algorithm to estimate given a task and a measure of performance, and we write it as $$ \\mathbf{y} = f_\\theta(\\mathbf{x}) $$ Loss (and cost) function : quantitative measure of the performance of the learning algorithm, which we wish to minimize (or maximize) in order to make accurate predictions on the unseen data. It is written as $$ J_\\theta = \\frac{1}{N_s} \\sum_{j=1}^{N_s} \\mathscr{L} (\\mathbf{y}^{(j)}, f_\\theta(\\mathbf{x}^{(j)})) $$ where \\(\\mathscr{L}\\) is the loss function for each input-output pair and \\(J\\) is the overall cost function. 
Optimization algorithm : mathematical method that aims to drive down (up) the cost function by modifying its free-parameters \\(\\theta\\) : $$ \\hat{\\theta} = \\underset{\\theta} {\\mathrm{argmin}} \\; J_\\theta $$ Optimization algorithms are generally divided into two main families: gradient-based (or local) and gradient-free (or global). Gradient-based optimization is by far the most popular way to train NNs and will be discussed in more details below.","title":"Gradient-based optimization"},{"location":"lectures/03_gradopt/#gradient-descent-algorithms","text":"The simplest of gradient-based methods is the so-called Gradient-descent algorithms (e.g., steepest descent algorithm). As the name implies, this algorithm uses local gradient information of the functional to minimize/maximize to move towards its global mimimum/maximum as depicted in the figure below. More formally, given a functional \\(J_\\theta\\) and its gradient \\(\\nabla J = \\frac{\\delta J}{\\delta \\theta}\\) , the (minimization) algorithm can be written as: Initialization: choose \\(\\theta \\in \\mathbb{R}\\) For \\(i=0,...N-1\\) ; Compute update direction \\(d_i = -\\nabla J |_{\\theta_i}\\) Estimate step-lenght \\(\\alpha_i\\) Update \\(\\theta_{i+1} = \\theta_{i} + \\alpha_i d_i\\) Note that the maximization version of this algorithm simply swaps the sign in the update direction (first equation of the algorithm). Moreover, the proposed algorithm can be easily extended to N-dimensional model vectors \\(\\theta=[\\theta_1, \\theta_2, ..., \\theta_N]\\) by defining the following gradient vector \\(\\nabla J=[\\delta J / \\delta\\theta_1, \\delta J / \\delta\\theta_2, ..., \\delta J/ \\delta\\theta_N]\\) .","title":"Gradient-descent algorithms"},{"location":"lectures/03_gradopt/#step-length-selection","text":"The choice of the step-length has tremendous impact on the performance of the algorithm and its ability to converge fast (i.e., in a small number of iterations) to the optimal solution. The most used selection rules are: Constant: the step size is fixed to a constant value \\(\\alpha_i=\\hat{\\alpha}\\) . This is the most common situation that we will encounter when training neural networks. In practice, some adaptive schemes based on the evolution of the train (or validation) norm are generally adopted, but we will still refer to this case as constant step size; Exact line search: at each iteration, \\(\\alpha_i\\) is chosen such that it minimizes \\(J(\\theta_{i} + \\alpha_i d_i)\\) . This is the most commonly used approach when dealing with linear systems of equations. Backtracking \"Armijo\" line search: at each iteration, given a parameter \\(\\mu \\in (0,1)\\) , start with \\(\\alpha_i=1\\) and reduce it by a factor of 2 until the following condition is satisfied: \\(J(\\theta_i) - J(\\theta_{i} + \\alpha_i d_i) \\ge -\\mu \\alpha_i \\nabla J^T d_i\\)","title":"Step length selection"},{"location":"lectures/03_gradopt/#second-order-optimization","text":"Up until now we have discussed first-order optimization techniques that rely on the ability to evaluate the function \\(J\\) and its gradient \\(\\nabla J\\) . Second-order optimization method go one step beyond in that they use information from both the local slope and curvature of the function \\(J\\) . 
When a function has small curvature, the function and its tangent line are very similar: the gradient alone is therefore able to provide a good local approximation of the function (i.e., \\(J(\\theta+\\delta \\theta)\\approx J(\\theta) + \\nabla J \\delta \\theta\\) ). On the other hand, if the curvature of the function of large, the function and its tangent line start to differ very quickly away from the linearization point. The gradient alone is not able anymore to provide a good local approximation of the function (i.e., \\(J(\\theta+\\delta \\theta)\\approx J(\\theta) + \\nabla J \\delta \\theta + \\nabla^2 J \\delta \\theta^2\\) ). Let's start again from the one-dimensional case and the well-known Newton's method . This method is generally employed to find the zeros of a function: \\(\\theta: J(\\theta)=0\\) and can be written as: \\[ \\theta_{i+1} = \\theta_i - \\frac{J(\\theta)|_{\\theta_i}}{J'(\\theta)|_{\\theta_i}} \\] which can be easily derived from the Taylor expansion of \\(J(\\theta)\\) around \\(\\theta_{i+1}\\) . If we remember that finding the minimum (or maximum) of a function is equivalent to find the zeros of its first derivative ( \\(\\theta: min_\\theta J(\\theta) \\leftrightarrow \\theta: J'(\\theta)=0\\) ), the Newton's method can be written as: \\[ \\theta_{i+1} = \\theta_i - \\frac{J'(\\theta)|_{\\theta_i}}{J''(\\theta)|_{\\theta_i}} \\] In order to be able to discuss second-order optimization algorithms for the multi-dimensional case, let's first introduce the notion of Jacobian : \\[\\mathbf{y} = J(\\boldsymbol\\theta) \\rightarrow \\mathbf{J} = \\begin{bmatrix} \\frac{\\partial J_1}{\\partial \\theta_1} & \\frac{\\partial J_1}{\\partial \\theta_2} & ... & \\frac{\\partial J_1}{\\partial \\theta_M} \\\\ ... & ... & ... & ... \\\\ \\frac{\\partial J_N}{\\partial \\theta_1} & \\frac{\\partial J_N}{\\partial \\theta_2} & ... & \\frac{\\partial J_N}{\\partial \\theta_M} \\\\ \\end{bmatrix} \\in \\mathbb{R}^{[N \\times M]} \\] Through the notion of Jacobian, we can define the Hessian as the Jacobian of the gradient vector \\[\\mathbf{H} = \\nabla (\\nabla J) = \\begin{bmatrix} \\frac{\\partial J^2}{\\partial \\theta_1^2} & \\frac{\\partial J^2}{\\partial x_1 \\partial \\theta_2} & ... & \\frac{\\partial J^2}{\\partial \\theta_1\\partial \\theta_M} \\\\ ... & ... & ... & ... \\\\ \\frac{\\partial J^2}{\\partial \\theta_M \\partial \\theta_1} & \\frac{\\partial J^2}{\\partial \\theta_M \\partial \\theta_2} & ... & \\frac{\\partial J^2}{\\partial \\theta_M^2} \\\\ \\end{bmatrix} \\in \\mathbb{R}^{[M \\times M]} \\] where we note that when \\(J\\) is continuous, \\(\\partial / \\partial \\theta_i \\partial \\theta_j = \\partial / \\partial \\theta_j \\partial \\theta_i\\) , and \\(\\mathbf{H}\\) is symmetric. The Newton method for the multi-dimensional case becomes: \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_i - \\mathbf{H}^{-1}\\nabla J \\] Approximated version of the Newton method have been developed over the years, mostly based on the idea that inverting \\(\\mathbf{H}\\) is sometimes a prohibitive task. Such methods, generally referred to as Quasi-Netwon methods attempt to approximate the Hessian (or its inverse) using the collections of gradient information from the previous iterations. BFGS or its limited memory version L-BFGS are examples of such a kind. 
Due to their computational cost (as well as the lack of solid theories for their use in conjunction with approximate gradients), these methods are not yet commonly used by the machine learning community to optimize the parameters of NNs in deep learning.","title":"Second-order optimization"},{"location":"lectures/03_gradopt/#stochastic-gradient-descent-sgd","text":"To conclude, we look again at gradient-based iterative solvers and more specifically in the context of finite-sum functionals of the kind that we will encountering when training neural networks: \\[ J_\\theta = \\frac{1}{N_s} \\sum_{i=1}^{N_s} \\mathscr{L} (\\mathbf{y}^{(i)}, f_\\theta(\\mathbf{x}^{(i)})) \\] where the summation here is performed over training data.","title":"Stochastic-gradient descent (SGD)"},{"location":"lectures/03_gradopt/#batched-gradient-descent","text":"The solvers that we have considered so far are generally update the model parameters \\(\\boldsymbol\\theta\\) using the full gradient (i.e., over the entire batch of samples): \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} - \\alpha_i \\nabla J = \\boldsymbol\\theta_{i} - \\frac{\\alpha_i}{N_s} \\sum_{j=1}^{N_s} \\nabla \\mathscr{L}_j \\] A limitation of such an approach is that, if we have a very large number of training samples, the computational cost of computing the full gradient is very high and when some of the samples are similar, their gradient contribution is somehow redundant.","title":"Batched gradient descent"},{"location":"lectures/03_gradopt/#stochastic-gradient-descent","text":"In this case we take a completely opposite approach to computing the gradient. More specifically, a single training sample is considered at each iteration: \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} - \\alpha_i \\nabla \\mathscr{L}_j \\] The choice of the training sample \\(j\\) at each iteration is generally completely random and this is repeated once all training data have been used at least once (generally referred to as epoch ). In this case, the gradient may be noisy because the gradient of a single sample is a very rough approximation of the total cost function \\(J\\) : such a high variance of gradients requires lowering the step-size \\(\\alpha\\) leading to slow convergence.","title":"Stochastic gradient descent"},{"location":"lectures/03_gradopt/#mini-batched-gradient-descent","text":"A more commonly used strategy lies in between the batched and stochastic gradient descent algorithms uses batches of training samples to compute the gradient at each iteration. More specifically given a batch of \\(N_b\\) samples, the update formula can be written as: \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} - \\frac{\\alpha_i}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j \\] and similarly to the stochastic gradient descent, the batches of data are chosen at random and this is repeated as soon as all data are used once in the training loop. Whilst the choice of the size of the batch depends on many factors (e.g., overall size of the dataset, variety of training samples), common batch sizes in training of NNs are from around 50 to 256 (unless memory requirements kick in leading to even small batch sizes).","title":"Mini-batched gradient descent"},{"location":"lectures/03_gradopt/#additional-readings","text":"the following blog post for a more detailed overview of the optimization algorithms discussed here. 
Note that in one of our future lectures we will also look again at the optimization algorithms and more specifically discuss strategies that allow overcoming some of the limitations of standard SGD in this lecture .","title":"Additional readings"},{"location":"lectures/04_linreg/","text":"Linear and Logistic Regression In the previous lecture we have learned how to optimize a generic loss function \\(J_\\theta\\) by modifying its free parameters \\(\\theta\\) . Whilst this is a very generic framework that can be used for various applications in different scientific field, from now on we will learn how to take advtange of similar algorithms in the context of Machine Learning. Linear regression In preparation to our lecture on Neural Networks, here we consider the simplest machine learning model for regression, linear regression . Its simplicity lies in the fact that we will only consider a linear relationship between our inputs and targets: where \\(\\textbf{x}\\) is a training sample with \\(N_f\\) features, \\(\\textbf{w}\\) is a vector of \\(N_f\\) weights and \\(b=w_0\\) is the so-called bias term. The set of trainable parameters is therefore the combination of the weights and bias \\(\\boldsymbol\\theta=[\\textbf{w}, b] \\in \\mathbb{R}^{N_f+1}\\) . Similarly, the combination of the training sample and a 1-scalar is defined as \\(\\tilde{\\textbf{x}}=[\\textbf{x}, 1] \\in \\mathbb{R}^{N_f+1}\\) The prediction \\(\\hat{y}\\) is simply obtained by linearly combining the different features of the input vector and adding the bias. Despite its simplicity, linear regression (and more commonly multi-variate linear regression) has been successfully used in a variety of geoscientific tasks, examples of such a kind are: rock-physics models, where a number of petrophysical parameters (e.g., porosity, shale content, depth) can be linearly regressed in order to predict an elastic parameter of interest (e.g., dry bulk modulus); time-to-depth conversion, where a velocity (or depth) prediction is generally made as a linear combination of two-way traveltime and other parameters such as seismic amplitudes and various derived attributes; filling gaps in petrophysical well logs, where various petrophysical measurements (e.g., GR, NEU, DEN) are regressed to estimate another quantity of interest (e.g., S-wave velocity of DTS) that is not directly available within a certain depth interval. Assuming availability of \\(N_s\\) training samples, the input training matrix and output training vector of a linear regression model is written as: \\[ \\mathbf{X}_{train} = [\\tilde{\\mathbf{x}}^{(1)}, \\tilde{\\mathbf{x}}^{(2)}, ..., \\tilde{\\mathbf{x}}^{(N_s)}] \\in \\mathbb{R}^{N_f+1 \\times N_s}, \\quad \\mathbf{y}_{train} = [y^{(1)}, y^{(2)}, y^{(N_s)}] \\in \\mathbb{R}^{N_s \\times 1} \\] Finally, the model can be compactly written as: \\[ \\hat{\\textbf{y}}_{train} = \\textbf{X}_{train}^T \\boldsymbol\\theta \\] Next, we need to define a metric (i.e., cost function) which we can use to optimize for the free parameters \\(\\boldsymbol\\theta\\) . 
For regression problems, a common metric of goodness is the L2 norm or MSE (Mean Square Error): \\[ J_\\theta = MSE(\\textbf{y}_{train}, \\hat{\\textbf{y}}_{train}) = \\frac{1}{N_s} || \\textbf{y}_{train} - \\hat{\\textbf{y}}_{train}||_2^2 = \\frac{1}{N_s} \\sum_i^{N_s} (y_{train}^{(i)}-\\hat{y}_{train}^{(i)})^2 \\] Based on our previous lecture on optimization, we need to find the best set of coefficients \\(\\theta\\) that minimizes the MSE: \\[ \\hat{\\theta} = min_\\theta J_\\theta \\rightarrow \\theta_{i+1} = \\theta_i - \\alpha \\nabla J_\\theta \\] However, since this is a linear inverse problem we can write the analytical solution of the minimization problem as: \\[ \\hat{\\theta} = (\\textbf{X}_{train}^T \\textbf{X}_{train})^{-1} \\textbf{X}_{train}^T \\textbf{y}_{train} \\] which can be obtained by inverting a \\(N_s \\times N_s\\) matrix. An important observation, which lies at the core of most Machine Learning algorithms, is that once the model is trained on the \\(N_s\\) available input-target pairs, the estimated \\(\\hat{\\theta}\\) coefficients can be used to make inference on any new unseen data: \\[ y_{test} = \\tilde{\\textbf{x}}^T_{test} \\hat{\\theta} \\] To conclude, once a linear regression model has been trained, a variety of measures exist to assess the goodness of the model. Whilst the same metric used for training, the mean-square error, can be used to assess the model performance, other metrics are represented by the Pearson coefficient ( \\(R^2\\) ) and the mean-absolute error (MAE). Logistic regression Simply put, logistic regression is an extension of linear regression to the problem of binary classification. Whilst the model used by logistic regression is the same linear model described above, this will be coupled with a nonlinear 'activation' function that enforces the outcome of the entire model to be bounded between 0 and 1 (i.e., a probability). In other words, whilst the input training matrix is the same as that of linear regression, the output training vector becomes: \\[ y_{train} = \\{0, 1\\} \\] A variety of applications of such a simple model can be found in geoscience, one common example is represent by net pay prediction from petrophysical logs. Given a single pair of training samples \\(\\textbf{x}, y\\) , a mathematical model for logistic regression can be compactly written as: \\[ \\hat{y} = f_\\theta(\\textbf{x}) = P(y=1 | \\textbf{x}) \\in (0,1) \\] or in other words, the input vector \\(\\textbf{x}\\) is fed through a nonlinear model \\(f_\\theta\\) whose output is a scalar number between 0 and 1 that represents the probability of the target output to be 1. Considering now a set of \\(N_s\\) training pairs, the model can be explicitly written as: \\[ \\hat{\\textbf{y}}_{train} = f_\\theta(\\textbf{X}_{train}) = \\sigma(\\textbf{X}_{train}^T \\boldsymbol\\theta) \\] where \\(\\sigma\\) is a sigmoid function as shown in figure below: Once again, let's define a cost function that we can use to optimize the model parameters. For binary classification, a common metric of goodness is represented by the so-called binary cross-entropy : \\[ \\mathscr{L}(y_{train}^{(i)}, \\hat{y}_{train}^{(i)}) = -(y_{train}^{(i)} log(\\hat{y}_{train}^{(i)}) + (1-y_{train}^{(i)}) log(1- \\hat{y}_{train}^{(i)})) \\] and \\[ J_\\theta = \\frac{1}{N_s} \\sum_i^{N_s} \\mathscr{L}(y_{train}^{(i)}, \\hat{y}_{train}^{(i)}) \\] Let's gain some intuition onto why this is a good cost function. More specifically, we consider with a drawing the two cases separately. 
First the case of positive target, \\(y_{train}^{(i)}=1\\) and then the case of negative target, \\(y_{train}^{(i)}=0\\) : Our drawings clearly show the validity of such a cost function in both cases. The further away is the prediction from the true label the higher the resulting cost function. Similar to the case of linear regression, we can now update the model parameters by minimizing the cost function: \\[ \\hat{\\theta} = min_\\theta J_\\theta \\rightarrow \\theta_{i+1} = \\theta_i - \\alpha \\nabla J_\\theta \\] However a major difference arises here. Whilst it is easy to compute the derivative of the MSE with respect to the model parameters \\(\\theta\\) , and even more since the model is linear an analytical solution can be found (as shown above), this is not the case of the cost function of the logistic regression model. The good news here is that there exist a systematic approach to computing the derivative of a composite function (i.e., \\(f(x)=f_N(...f_2(f_1(x)))\\) ), which simply relies on the well-known chain rule of functional analysis. This method is referred to in the mathematical community as Automatic Differentiation (AD), and more likely so as Back-propagation in the ML community. As this lies as the foundation of the training process for neural networks, we will get into details later in the text. At this point, it suffices to say that if we have a composite function like the one above, its derivative with respect to \\(x\\) can be written as: \\[ \\frac{\\partial f}{\\partial x} = \\frac{\\partial f_N}{\\partial f_{N-1}} ... \\frac{\\partial f_2}{\\partial f_1} \\frac{\\partial f_1}{\\partial x} \\] where the derivative is simply the product of all derivatives over the chain of operations of the composite function. Note that in practice it is more common to compute this chain rule in reverse order, from left to right in the equation above. We generally rely on the built-in functionalities of deep learning libraries such as Tensorflow or PyTorch to compute such derivaties, we will perform here a full derivation for the simple case of logistic regression. In order to do so, we introduce a very useful mathamatical tool that we use to keep track of a chain of operations and later, we know how to evaluate the associated gradient. This tool is usually known as computational graph . 
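As a point of reference, the following minimal sketch (assuming a single hypothetical training pair and using PyTorch's automatic differentiation) builds such a computational graph for the logistic regression loss and evaluates its gradients; the manual derivation below arrives at the same quantities.

```python
import torch

# hypothetical single training pair (3 features, positive label)
x = torch.tensor([0.5, -1.0, 2.0])
y = torch.tensor(1.0)

w = torch.zeros(3, requires_grad=True)   # weights
b = torch.zeros(1, requires_grad=True)   # bias

z = torch.dot(w, x) + b                  # weighted summation
a = torch.sigmoid(z)                     # nonlinear activation
loss = -(y * torch.log(a) + (1 - y) * torch.log(1 - a))  # binary cross-entropy

loss.backward()          # backward pass through the computational graph
print(w.grad, b.grad)    # matches dw = (a - y) * x and db = (a - y)
```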
More specifically, instead of writing the entire logistic regression model compactly in a single equation, we divide it here into its atomic components: \\[ z = \\textbf{x}^T \\boldsymbol\\theta, \\quad a = \\sigma(z), \\quad \\mathscr{L} = -(y log(a) + (1-y)log(1-a)) \\] such that the derivative of the loss function with respect to the model parameters becomes: \\[ \\frac{\\partial \\mathscr{L} }{\\partial \\boldsymbol\\theta} = \\frac{\\partial \\mathscr{L} }{\\partial a} \\frac{\\partial a }{\\partial z} \\frac{\\partial z}{\\partial \\boldsymbol\\theta} \\] The forward and backward passes (as described in software frameworks like PyTorch) can be visually displayed as follows: Let's start from \\(\\partial \\mathscr{L} / \\partial a\\) : \\[ \\frac{\\partial \\mathscr{L}}{\\partial a} = -\\frac{y}{a} + \\frac{1-y}{1-a} = \\frac{-y(1-a) + (1-y)a}{a (1-a)} \\] and \\(\\partial a / \\partial z\\) : \\[ \\frac{\\partial a}{\\partial z} = a(1-a) \\] which we can combine together to obtain a simplified formula for the derivative of the loss function of the output of the weighted summation ( \\(z\\) ) \\[ \\frac{\\partial \\mathscr{L}}{\\partial z} = \\frac{\\partial \\mathscr{L}}{\\partial a} \\frac{\\partial a}{\\partial z} = -y(1-a) + (1-y)a = a - y = dz \\] Finally we differentiate between the weights and the bias to obtain: \\[ \\frac{\\partial z}{\\partial w_i} = x_i, \\quad \\frac{\\partial z}{\\partial b} = 1 \\] such that: \\[ \\frac{\\partial \\mathscr{L}}{\\partial w_i} = dz \\cdot x_i = dw_i, \\quad \\frac{\\partial \\mathscr{L}}{\\partial b} = dz = db \\] Having found the gradients, we can now update the parameters as discussed above: \\[ w_i \\leftarrow w_i - \\alpha \\frac{\\partial \\mathscr{L}}{\\partial w_i} = w_i - \\alpha dw_i, \\quad b \\leftarrow b - \\alpha \\frac{\\partial \\mathscr{L}}{\\partial b} = b - \\alpha db \\] which can be easily modified in the case of multiple training samples: \\[ w_i \\leftarrow w_i - \\alpha \\sum_{j=1}^{N_s} dw_i^{(j)}, \\quad b \\leftarrow b - \\alpha \\sum_{j=1}^{N_s} db^{(j)} \\] We can now summarize a single step of training for \\(N_s\\) training samples for the logistic regression model: \\(\\textbf{z}=\\textbf{X}_{train}^T \\boldsymbol \\theta\\) \\(\\textbf{a} = \\sigma(\\textbf{z})\\) \\(\\textbf{dz} = \\textbf{a} - \\textbf{y}\\) \\(\\textbf{dw} = \\frac{1}{N_s} \\textbf{X}_{train} \\textbf{dz}\\) \\(db = \\frac{1}{N_s} \\textbf{1}^T \\textbf{dz}\\) \\(\\textbf{w} \\leftarrow \\textbf{w} - \\alpha \\textbf{dw}\\) \\(b \\leftarrow b - \\alpha db\\) To conclude, let's turn our attention into some of the evaluation metrics that are commonly used to assess the performance of a classification model (or classifier). Note that these metrics can be used for the logistic regression model discussed here as well as for other more advanced models discussed later in the course. In general for binary classification we have two possible outcomes (positive/negative or true/false) for both the true labels \\(y\\) and the predicted labels \\(\\hat{y}\\) . We can therefore define 4 scenarios: and a number of complementary metrics (all bounded between 0 and 1) can be defined. Note that no metric is better than the others, the importance of one metric over another is context dependant. Precision : \\(Pr=\\frac{TP}{TP+FP}\\) , percentage of correct positive predictions over the overall positive predictions. This measure is appropriate when minimizing false positives is the focus. 
In the geoscientific context, this may represent a meaningful metric for applications where the main interest is that of predicting the smallest possible number of false positives, whilst at the same time accepting to miss out on some of positives (false negatives). This could be the case when we want to predict hydrocarbon bearing reservoirs from seismic data, where we know already that we will not be able to drill wells into many of them. It is therefore important that even if we make very few positive predictions these must be accurate, whilst the cost of missing other opportunities is not so high. On the other hand, this measure is blind to the predictions of real positive cases to be chosen to be part of the negative class (false negative); Recall : \\(Rc=\\frac{TP}{TP+FN} = \\frac{TP}{P}\\) , percentage of correct positive predictions over the overall positive occurrences. This measure is appropriate when minimizing false negatives is the focus. An opposite scenario to the one presented above is represented by the case of a classifier trained to predict pressure kicks whilst drilling a well. In this case, we are not really concerned with making a few mistakes where we predict a kick when this is not likely to happen (False Positive); of course, this may slow down the drilling process but it is nowhere near as dramatic as the case in which we do not predict a kick which is going to happen (False Negative); a high recall is therefore what we want, as this is an indication of the fact that the model does not miss out on many positive cases. Of course, a model that always provides a positive prediction will have a recall of 1 (FN=0), indication of the fact that a high recall is not always an indication of a good model; Accuracy : \\(Ac=\\frac{TP+TN}{TP+TN+FP+FN}=\\frac{TP+TN}{P+N}\\) , percentage of correct predictions over the total number of cases. This measure combines both error types (in the denominator), it is therefore a more global measure of the quality of the model. F1-Score : \\(2 \\frac{Pr \\cdot Rc}{Pr+Rc}\\) , represents a way to combine precision and recall into a single measure that captures both properties. Finally, a more complete description of the performance of a model is given by the so-called confusion matrix , which for the case of binary classification is just the \\(2 \\times 2\\) table in the figure above. This table can be both unnormalized, where each cell simply contains the number of samples which satisfy the specific combination of real and predicted labels, or normalized over either rows or columns.","title":"Linear and Logistic Regression"},{"location":"lectures/04_linreg/#linear-and-logistic-regression","text":"In the previous lecture we have learned how to optimize a generic loss function \\(J_\\theta\\) by modifying its free parameters \\(\\theta\\) . Whilst this is a very generic framework that can be used for various applications in different scientific field, from now on we will learn how to take advtange of similar algorithms in the context of Machine Learning.","title":"Linear and Logistic Regression"},{"location":"lectures/04_linreg/#linear-regression","text":"In preparation to our lecture on Neural Networks, here we consider the simplest machine learning model for regression, linear regression . 
Its simplicity lies in the fact that we will only consider a linear relationship between our inputs and targets: where \\(\\textbf{x}\\) is a training sample with \\(N_f\\) features, \\(\\textbf{w}\\) is a vector of \\(N_f\\) weights and \\(b=w_0\\) is the so-called bias term. The set of trainable parameters is therefore the combination of the weights and bias \\(\\boldsymbol\\theta=[\\textbf{w}, b] \\in \\mathbb{R}^{N_f+1}\\) . Similarly, the combination of the training sample and a 1-scalar is defined as \\(\\tilde{\\textbf{x}}=[\\textbf{x}, 1] \\in \\mathbb{R}^{N_f+1}\\) The prediction \\(\\hat{y}\\) is simply obtained by linearly combining the different features of the input vector and adding the bias. Despite its simplicity, linear regression (and more commonly multi-variate linear regression) has been successfully used in a variety of geoscientific tasks, examples of such a kind are: rock-physics models, where a number of petrophysical parameters (e.g., porosity, shale content, depth) can be linearly regressed in order to predict an elastic parameter of interest (e.g., dry bulk modulus); time-to-depth conversion, where a velocity (or depth) prediction is generally made as a linear combination of two-way traveltime and other parameters such as seismic amplitudes and various derived attributes; filling gaps in petrophysical well logs, where various petrophysical measurements (e.g., GR, NEU, DEN) are regressed to estimate another quantity of interest (e.g., S-wave velocity of DTS) that is not directly available within a certain depth interval. Assuming availability of \\(N_s\\) training samples, the input training matrix and output training vector of a linear regression model is written as: \\[ \\mathbf{X}_{train} = [\\tilde{\\mathbf{x}}^{(1)}, \\tilde{\\mathbf{x}}^{(2)}, ..., \\tilde{\\mathbf{x}}^{(N_s)}] \\in \\mathbb{R}^{N_f+1 \\times N_s}, \\quad \\mathbf{y}_{train} = [y^{(1)}, y^{(2)}, y^{(N_s)}] \\in \\mathbb{R}^{N_s \\times 1} \\] Finally, the model can be compactly written as: \\[ \\hat{\\textbf{y}}_{train} = \\textbf{X}_{train}^T \\boldsymbol\\theta \\] Next, we need to define a metric (i.e., cost function) which we can use to optimize for the free parameters \\(\\boldsymbol\\theta\\) . For regression problems, a common metric of goodness is the L2 norm or MSE (Mean Square Error): \\[ J_\\theta = MSE(\\textbf{y}_{train}, \\hat{\\textbf{y}}_{train}) = \\frac{1}{N_s} || \\textbf{y}_{train} - \\hat{\\textbf{y}}_{train}||_2^2 = \\frac{1}{N_s} \\sum_i^{N_s} (y_{train}^{(i)}-\\hat{y}_{train}^{(i)})^2 \\] Based on our previous lecture on optimization, we need to find the best set of coefficients \\(\\theta\\) that minimizes the MSE: \\[ \\hat{\\theta} = min_\\theta J_\\theta \\rightarrow \\theta_{i+1} = \\theta_i - \\alpha \\nabla J_\\theta \\] However, since this is a linear inverse problem we can write the analytical solution of the minimization problem as: \\[ \\hat{\\theta} = (\\textbf{X}_{train}^T \\textbf{X}_{train})^{-1} \\textbf{X}_{train}^T \\textbf{y}_{train} \\] which can be obtained by inverting a \\(N_s \\times N_s\\) matrix. 
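A minimal sketch of this workflow is given below, assuming a small hypothetical synthetic dataset and using NumPy's least-squares solver for the closed-form fit (note that, for convenience, samples are stored here as rows, i.e. the transpose of the convention used above).

```python
import numpy as np

rng = np.random.default_rng(0)

# hypothetical synthetic dataset: 2 features, 200 samples, noisy linear target
Ns, Nf = 200, 2
X = rng.standard_normal((Ns, Nf))
y = X @ np.array([1.5, -0.7]) + 0.3 + 0.05 * rng.standard_normal(Ns)

# augment each sample with a 1 so that the bias is absorbed into theta
Xtilde = np.hstack([X, np.ones((Ns, 1))])

# closed-form least-squares solution of the MSE minimization problem
theta, *_ = np.linalg.lstsq(Xtilde, y, rcond=None)
print(theta)  # approximately [1.5, -0.7, 0.3]
```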
An important observation, which lies at the core of most Machine Learning algorithms, is that once the model is trained on the \\(N_s\\) available input-target pairs, the estimated \\(\\hat{\\theta}\\) coefficients can be used to make inference on any new unseen data: \\[ y_{test} = \\tilde{\\textbf{x}}^T_{test} \\hat{\\theta} \\] To conclude, once a linear regression model has been trained, a variety of measures exist to assess the goodness of the model. Whilst the same metric used for training, the mean-square error, can be used to assess the model performance, other metrics are represented by the Pearson coefficient ( \\(R^2\\) ) and the mean-absolute error (MAE).","title":"Linear regression"},{"location":"lectures/04_linreg/#logistic-regression","text":"Simply put, logistic regression is an extension of linear regression to the problem of binary classification. Whilst the model used by logistic regression is the same linear model described above, this will be coupled with a nonlinear 'activation' function that enforces the outcome of the entire model to be bounded between 0 and 1 (i.e., a probability). In other words, whilst the input training matrix is the same as that of linear regression, the output training vector becomes: \\[ y_{train} = \\{0, 1\\} \\] A variety of applications of such a simple model can be found in geoscience, one common example is represent by net pay prediction from petrophysical logs. Given a single pair of training samples \\(\\textbf{x}, y\\) , a mathematical model for logistic regression can be compactly written as: \\[ \\hat{y} = f_\\theta(\\textbf{x}) = P(y=1 | \\textbf{x}) \\in (0,1) \\] or in other words, the input vector \\(\\textbf{x}\\) is fed through a nonlinear model \\(f_\\theta\\) whose output is a scalar number between 0 and 1 that represents the probability of the target output to be 1. Considering now a set of \\(N_s\\) training pairs, the model can be explicitly written as: \\[ \\hat{\\textbf{y}}_{train} = f_\\theta(\\textbf{X}_{train}) = \\sigma(\\textbf{X}_{train}^T \\boldsymbol\\theta) \\] where \\(\\sigma\\) is a sigmoid function as shown in figure below: Once again, let's define a cost function that we can use to optimize the model parameters. For binary classification, a common metric of goodness is represented by the so-called binary cross-entropy : \\[ \\mathscr{L}(y_{train}^{(i)}, \\hat{y}_{train}^{(i)}) = -(y_{train}^{(i)} log(\\hat{y}_{train}^{(i)}) + (1-y_{train}^{(i)}) log(1- \\hat{y}_{train}^{(i)})) \\] and \\[ J_\\theta = \\frac{1}{N_s} \\sum_i^{N_s} \\mathscr{L}(y_{train}^{(i)}, \\hat{y}_{train}^{(i)}) \\] Let's gain some intuition onto why this is a good cost function. More specifically, we consider with a drawing the two cases separately. First the case of positive target, \\(y_{train}^{(i)}=1\\) and then the case of negative target, \\(y_{train}^{(i)}=0\\) : Our drawings clearly show the validity of such a cost function in both cases. The further away is the prediction from the true label the higher the resulting cost function. Similar to the case of linear regression, we can now update the model parameters by minimizing the cost function: \\[ \\hat{\\theta} = min_\\theta J_\\theta \\rightarrow \\theta_{i+1} = \\theta_i - \\alpha \\nabla J_\\theta \\] However a major difference arises here. 
Whilst it is easy to compute the derivative of the MSE with respect to the model parameters \\(\\theta\\) , and even more since the model is linear an analytical solution can be found (as shown above), this is not the case of the cost function of the logistic regression model. The good news here is that there exist a systematic approach to computing the derivative of a composite function (i.e., \\(f(x)=f_N(...f_2(f_1(x)))\\) ), which simply relies on the well-known chain rule of functional analysis. This method is referred to in the mathematical community as Automatic Differentiation (AD), and more likely so as Back-propagation in the ML community. As this lies as the foundation of the training process for neural networks, we will get into details later in the text. At this point, it suffices to say that if we have a composite function like the one above, its derivative with respect to \\(x\\) can be written as: \\[ \\frac{\\partial f}{\\partial x} = \\frac{\\partial f_N}{\\partial f_{N-1}} ... \\frac{\\partial f_2}{\\partial f_1} \\frac{\\partial f_1}{\\partial x} \\] where the derivative is simply the product of all derivatives over the chain of operations of the composite function. Note that in practice it is more common to compute this chain rule in reverse order, from left to right in the equation above. We generally rely on the built-in functionalities of deep learning libraries such as Tensorflow or PyTorch to compute such derivaties, we will perform here a full derivation for the simple case of logistic regression. In order to do so, we introduce a very useful mathamatical tool that we use to keep track of a chain of operations and later, we know how to evaluate the associated gradient. This tool is usually known as computational graph . More specifically, instead of writing the entire logistic regression model compactly in a single equation, we divide it here into its atomic components: \\[ z = \\textbf{x}^T \\boldsymbol\\theta, \\quad a = \\sigma(z), \\quad \\mathscr{L} = -(y log(a) + (1-y)log(1-a)) \\] such that the derivative of the loss function with respect to the model parameters becomes: \\[ \\frac{\\partial \\mathscr{L} }{\\partial \\boldsymbol\\theta} = \\frac{\\partial \\mathscr{L} }{\\partial a} \\frac{\\partial a }{\\partial z} \\frac{\\partial z}{\\partial \\boldsymbol\\theta} \\] The forward and backward passes (as described in software frameworks like PyTorch) can be visually displayed as follows: Let's start from \\(\\partial \\mathscr{L} / \\partial a\\) : \\[ \\frac{\\partial \\mathscr{L}}{\\partial a} = -\\frac{y}{a} + \\frac{1-y}{1-a} = \\frac{-y(1-a) + (1-y)a}{a (1-a)} \\] and \\(\\partial a / \\partial z\\) : \\[ \\frac{\\partial a}{\\partial z} = a(1-a) \\] which we can combine together to obtain a simplified formula for the derivative of the loss function of the output of the weighted summation ( \\(z\\) ) \\[ \\frac{\\partial \\mathscr{L}}{\\partial z} = \\frac{\\partial \\mathscr{L}}{\\partial a} \\frac{\\partial a}{\\partial z} = -y(1-a) + (1-y)a = a - y = dz \\] Finally we differentiate between the weights and the bias to obtain: \\[ \\frac{\\partial z}{\\partial w_i} = x_i, \\quad \\frac{\\partial z}{\\partial b} = 1 \\] such that: \\[ \\frac{\\partial \\mathscr{L}}{\\partial w_i} = dz \\cdot x_i = dw_i, \\quad \\frac{\\partial \\mathscr{L}}{\\partial b} = dz = db \\] Having found the gradients, we can now update the parameters as discussed above: \\[ w_i \\leftarrow w_i - \\alpha \\frac{\\partial \\mathscr{L}}{\\partial w_i} = w_i - \\alpha dw_i, 
\\quad b \\leftarrow b - \\alpha \\frac{\\partial \\mathscr{L}}{\\partial b} = b - \\alpha db \\] which can be easily modified in the case of multiple training samples: \\[ w_i \\leftarrow w_i - \\alpha \\sum_{j=1}^{N_s} dw_i^{(j)}, \\quad b \\leftarrow b - \\alpha \\sum_{j=1}^{N_s} db^{(j)} \\] We can now summarize a single step of training for \\(N_s\\) training samples for the logistic regression model: \\(\\textbf{z}=\\textbf{X}_{train}^T \\boldsymbol \\theta\\) \\(\\textbf{a} = \\sigma(\\textbf{z})\\) \\(\\textbf{dz} = \\textbf{a} - \\textbf{y}\\) \\(\\textbf{dw} = \\frac{1}{N_s} \\textbf{X}_{train} \\textbf{dz}\\) \\(db = \\frac{1}{N_s} \\textbf{1}^T \\textbf{dz}\\) \\(\\textbf{w} \\leftarrow \\textbf{w} - \\alpha \\textbf{dw}\\) \\(b \\leftarrow b - \\alpha db\\) To conclude, let's turn our attention into some of the evaluation metrics that are commonly used to assess the performance of a classification model (or classifier). Note that these metrics can be used for the logistic regression model discussed here as well as for other more advanced models discussed later in the course. In general for binary classification we have two possible outcomes (positive/negative or true/false) for both the true labels \\(y\\) and the predicted labels \\(\\hat{y}\\) . We can therefore define 4 scenarios: and a number of complementary metrics (all bounded between 0 and 1) can be defined. Note that no metric is better than the others, the importance of one metric over another is context dependant. Precision : \\(Pr=\\frac{TP}{TP+FP}\\) , percentage of correct positive predictions over the overall positive predictions. This measure is appropriate when minimizing false positives is the focus. In the geoscientific context, this may represent a meaningful metric for applications where the main interest is that of predicting the smallest possible number of false positives, whilst at the same time accepting to miss out on some of positives (false negatives). This could be the case when we want to predict hydrocarbon bearing reservoirs from seismic data, where we know already that we will not be able to drill wells into many of them. It is therefore important that even if we make very few positive predictions these must be accurate, whilst the cost of missing other opportunities is not so high. On the other hand, this measure is blind to the predictions of real positive cases to be chosen to be part of the negative class (false negative); Recall : \\(Rc=\\frac{TP}{TP+FN} = \\frac{TP}{P}\\) , percentage of correct positive predictions over the overall positive occurrences. This measure is appropriate when minimizing false negatives is the focus. An opposite scenario to the one presented above is represented by the case of a classifier trained to predict pressure kicks whilst drilling a well. In this case, we are not really concerned with making a few mistakes where we predict a kick when this is not likely to happen (False Positive); of course, this may slow down the drilling process but it is nowhere near as dramatic as the case in which we do not predict a kick which is going to happen (False Negative); a high recall is therefore what we want, as this is an indication of the fact that the model does not miss out on many positive cases. 
Of course, a model that always provides a positive prediction will have a recall of 1 (FN=0), indication of the fact that a high recall is not always an indication of a good model; Accuracy : \\(Ac=\\frac{TP+TN}{TP+TN+FP+FN}=\\frac{TP+TN}{P+N}\\) , percentage of correct predictions over the total number of cases. This measure combines both error types (in the denominator), it is therefore a more global measure of the quality of the model. F1-Score : \\(2 \\frac{Pr \\cdot Rc}{Pr+Rc}\\) , represents a way to combine precision and recall into a single measure that captures both properties. Finally, a more complete description of the performance of a model is given by the so-called confusion matrix , which for the case of binary classification is just the \\(2 \\times 2\\) table in the figure above. This table can be both unnormalized, where each cell simply contains the number of samples which satisfy the specific combination of real and predicted labels, or normalized over either rows or columns.","title":"Logistic regression"},{"location":"lectures/05_nn/","text":"Basics of Neural Networks In this lecture, we start our journey in the field of Deep Learning. In order to do so, we must first introduce the most commonly used kind of Neural Networks, the so-called Multi-Layer Perceptron (MLP) (also commonly referred to as fully connected (FC) layer). A MLP is a class of feedforward artificial neural networks (ANNs), where the term feedforward refers to the fact the the flow of information moves from left to right. On the other hand, a change in the direction of the flow is introduced as part of the forward pass gives rise to a different family of NNs, so-called Recurrent Neural Networks (they will be subject of future lectures): Perceptron To begin with, we focus on the core building block of a MLP, so-called Perceptron or Unit. This is nothing really new to us, as it is exactly the same structure that we used to create the logistic regression model, a linear weighting of the element of the input vector followed by a nonlinear activation function. We prefer however to schematic represent it in a slightly different way as this will make it easier later on to drawn MLPs. Mathematically, the action of a percepton can be written compactly as dot-product followed by an element-wise nonlinear activation: \\[ y = \\sigma(\\sum_i w_i x_i + b) = \\sigma(\\sum_i \\textbf{w}^T \\textbf{x} + b) \\] where \\(\\textbf{w} \\in \\mathbb{R}^{N_i}\\) is the vector of weights, \\(b\\) is the bias, and \\(\\sigma\\) is a nonlinear activation function. Note that whilst we used a sigmoid function in the logistic regression model, this can be any differentiable function as later we will discuss in more details. Multi-layer Perceptron The perceptron model shown above takes as input a vector \\(\\textbf{x} \\in \\mathbb{R}^{N_i}\\) and returns a scalar \\(y\\) , we are now ready to make a step forward where we simply combine multiple perceptrons together to return a vector \\(\\textbf{y} \\in \\mathbb{R}^{N_o}\\) The MLP in the figure above presents \\(N_i=3\\) inputs and \\(N_o=2\\) outputs. 
By highlighting the original perceptron in green, we can easily observed that a MLP is simply a composition of \\(N_o\\) perceptrons, which again we can compactly write as a matrix-vector multiplication followed by an element-wise nonlinear activation: \\[ y_j = \\sigma(\\sum_i w_{ji} x_i + b), \\quad \\textbf{y} = \\sigma(\\textbf{W} \\textbf{x} + \\textbf{b}) \\] where \\(\\textbf{W} \\in \\mathbb{R}^{N_o \\times N_i}\\) is the matrix of weights, \\(\\textbf{b} \\in \\mathbb{R}^{N_o}\\) is a vector of biases. Finally, if we stack multiple MLPs together we obtained what is generally referred to as N-layer NN, where the count of the number of layers does not include the input layer. For example, a 3-layer NN has the following structure where we omit for simplicity the bias terms in the schematic drawing. This figure gives us the opportunity to introduce some terminology commonly used in the DL community: Input layer : first layer taking the input vector \\(\\textbf{x}\\) as input and returning an intermediate representation \\(\\textbf{z}^{[1]}\\) ; Hidden layers : second to penultimate layers taking as input the previous representation \\(\\textbf{z}^{[i-1]}\\) and returning a new representation \\(\\textbf{z}^{[i]}\\) ; Ouput layer : last layer producing the output of the network \\(\\textbf{y}\\) ; Depth : number of hidden layers (plus output layer); Width : number of units in each hidden layer. Note that we will always use the following notation \\(\\cdot^{(i)[j]}\\) where round brackets are used to refer to a specific training sample and square brackets are used to refer to a specific layer. Activation functions We have just started to appreciate the simplicity of NNs. A Neural Network is nothing more than a stack of linear transformations and nonlinear element-wise activation functions. If such activation functions where omitted, we could combine the various linear transformations together in a single matrix, as the product of N matrices. Assuming that sigma acts as an identity matrix \\(\\sigma(\\textbf{x})=\\textbf{Ix}=\\textbf{x}\\) , (and omitting biases for simplicity) we get: $$ \\textbf{y} = \\sigma(\\textbf{W}^{[3]}\\sigma(\\textbf{W}^{[2]}\\sigma(\\textbf{W}^{[1]} \\textbf{x}))) = \\textbf{W}^{[3]}\\textbf{W}^{[2]}\\textbf{W}^{[1]}\\textbf{x} = \\textbf{W} \\textbf{x} $$ so no matter how deep the network is, we can always reconduct it to a linear model. Depending on the final activation and loss function, therefore we will have a linear regression or a logistic regression model. We consider here a very simple example to show the importance of nonlinear activations before delving into the details. Let's assume that we wish the learn the XOR (eXclusive OR) boolean logic operator from the following four training samples: \\[ \\textbf{x}^{(1)} = [0, 0] \\rightarrow y^{(1)}=0 \\] \\[ \\textbf{x}^{(2)} = [0, 1] \\rightarrow y^{(2)}=1 \\] \\[ \\textbf{x}^{(3)} = [1, 0] \\rightarrow y^{(3)}=1 \\] \\[ \\textbf{x}^{(4)} = [1, 1] \\rightarrow y^{(4)}=0 \\] Starting from the linear regression model, we can define a matrix \\(\\textbf{X}_{train} = [\\textbf{x}^{(1)}, \\textbf{x}^{(2)}, \\textbf{x}^{(3)}, \\textbf{x}^{(4)}]\\) and a vector \\(\\textbf{y}_{train} = [y^{(1)}, y^{(2)}, y^{(3)}, y^{(4)}]\\) . The linear model becomes: \\[ \\textbf{y}_{train} = \\textbf{X}_{train}^T \\boldsymbol \\theta \\] where the weights \\(\\boldsymbol \\theta\\) are obtained as detailed in the previous section. 
It can be easily proven that the solution is \\(\\boldsymbol \\theta=[0,0,0.5]\\) , where \\(\\textbf{w}=[0,0]\\) and \\(b=0.5\\) . This means that, no matter the input, the output of the linear model will always be equal to \\(0.5\\) ; in other words, the model is unable to distinguish between the true and false outcomes. If instead we introduce a nonlinearity between two weight matrices (i.e., a 2-layer NN), the following combination of weights and biases (taken from the Goodfellow book) will lead to a correct prediction: \\[ \\textbf{W}^{[1]} = \\begin{bmatrix} 1 & 1 \\\\ 1 & 1\\end{bmatrix}, \\textbf{W}^{[2]} = \\begin{bmatrix} 1 \\\\ -2 \\end{bmatrix}^T, \\textbf{b}^{[1]} = \\begin{bmatrix} 0 \\\\ -1 \\end{bmatrix}, b^{[2]} = 0 \\] Note that in this case the \\(\\sigma=ReLU\\) activation function, which we will introduce in the next section, must be used. Of course, there may be many more combinations of weights and biases that lead to a satisfactory prediction. You can prove this to yourself by initializing the weights and biases randomly and optimizing them by means of a stochastic gradient-descent algorithm. Having introduced nonlinearities after every application of the weight matrices to the vector flowing through the computational graph, the overall set of operations cannot be reduced to a single matrix-vector multiplication, which allows us to learn highly complex nonlinear mappings between input features and targets. The role of activation functions is however not always straightforward and easy to grasp. Whilst we can say that they help in the learning process, not every function is suitable for this task and in fact, some functions may prevent the network from learning at all. In the following we look at the most commonly used activation functions and discuss their origin and why they became popular and useful in Deep Learning: Sigmoid and Tanh : historically these were the most popular activation functions as they are differentiable across the entire domain. In the past, there was in fact a strong belief that gradient-descent cannot operate on functions that have singularities; although this is correct from a theoretical point of view it was later proved to be wrong in practice. They are mathematically defined as: $$ \\sigma_s(x) = \\frac{1}{1+e^{-x}} $$ and $$ \\sigma_t(x) = 2 \\sigma_s(2x) - 1 $$ Whilst still used in various contexts, these activation functions saturate very quickly (i.e., large values are clipped to 1 and small values are clipped to -1 for tanh or 0 for sigmoid). This leads to the so-called vanishing gradient problem that we will discuss in more detail in the following lectures; simply put, if we look at the gradient of both of these functions, it is non-zero only when x is near zero and becomes zero away from it, meaning that if the output of a linear layer is large the gradient of the activation function will be zero and therefore the gradient will stop flowing through backpropagation. This is particularly problematic for deep networks as the training of the early layers becomes very slow. ReLU (Rectified Linear Unit): this activation function became very popular at the start of the 21st century and has since become the most commonly used activation function for NN training. It is much closer to a linear activation than the previous two, but introduces a nonlinearity by setting negative inputs to zero. By doing so, the ReLU activation function is a piecewise linear function.
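Returning to the XOR example above, a quick NumPy check (a sketch; the least-squares fit appends a column of ones to the inputs to account for the bias) confirms that the linear model predicts 0.5 for every input, whilst the 2-layer ReLU network with the weights quoted from the Goodfellow book reproduces the XOR truth table exactly:

```python
import numpy as np

X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])   # four training samples
y = np.array([0., 1., 1., 0.])                            # XOR targets

# Linear model: append a column of ones for the bias and solve least squares
Xb = np.hstack([X, np.ones((4, 1))])
theta, *_ = np.linalg.lstsq(Xb, y, rcond=None)
print(theta)         # ~[0, 0, 0.5]
print(Xb @ theta)    # all predictions ~0.5: the linear model cannot learn XOR

# 2-layer NN with ReLU (weights and biases from the example above)
W1 = np.array([[1., 1.], [1., 1.]])
b1 = np.array([0., -1.])
w2 = np.array([1., -2.])
b2 = 0.

h = np.maximum(0., X @ W1.T + b1)   # hidden layer with ReLU activation
print(h @ w2 + b2)                  # [0, 1, 1, 0]: XOR is reproduced exactly
```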
This shows that non-differentiable functions can be used in gradient-based optimization, mostly because numerically we will hardly (if ever) have an output of a NN layer that is exactly zero when fed as input to the activation. Mathematically speaking, we can write it as: $$ \\sigma_r(x) = max ( 0,x ) = \\begin{cases} x & x\\ge 0 \\\\ 0 & x<0 \\end{cases} $$ whilst its derivative is: $$ \\sigma_r'(x) = \\begin{cases} 1 & x\\ge 0 \\\\ 0 & x<0 \\end{cases} $$ We can observe that this activation function never saturates in the positive axis: for every positive value the derivative is always 1. Such a property makes ReLU suitable for large networks as the risk of vanishing gradients is greatly reduced. A downside of ReLU is that the entire negative axis acts as an annihilator preventing information from flowing. A strategy to prevent or reduce the occurrences of negative inputs is represented by the initialization of biases to a value slightly greater than zero (e.g., b=0.1). Leaky ReLU (Leaky Rectified Linear Unit): a modified version of the ReLU activation function aimed once again at avoiding zeroing of inputs in the negative axis. This function is identical to the ReLU in the positive axis, whilst another straight line with smaller slope is used in the negative axis: $$ \\sigma_{l-relu}(x) = max ( 0,x ) + \\alpha min ( 0,x ) = \\begin{cases} x & x\\ge 0 \\\\ \\alpha x & x<0 \\end{cases} $$ By doing so, negative inputs can also flow through the computational graph. A variant of L-ReLU, called P-ReLU, allows for the \\(\\alpha\\) parameter to be learned instead of being fixed. Absolute ReLU (Absolute Rectified Linear Unit): a modified version of the ReLU activation function that is symmetric with respect to the \\(x=0\\) axis: $$ \\sigma_{abs-relu}(x) = |x| = \\begin{cases} x & x\\ge 0 \\\\ -x & x<0 \\end{cases} $$ Whilst this is not a popular choice in the DL literature, it has been successfully used in object detection tasks where the features that we wish the NN to extract from the training process are polarity invariant. Cosine, Sine, ... : the use of periodic functions has recently started to appear in the literature, especially in the context of scientific DL (e.g., Physics-informed neural networks). Softmax : this activation function is commonly used at the end of the last layer in the context of multi-label classification. However, as it takes an input vector of N numbers and converts it into an output vector of probabilities (i.e., N numbers summing to 1), it may also be used as a sort of switch in the internal layers. The following two figures show the different activation functions discussed above and their gradients. Network architecture Up until now we have discussed the key components of a Feedforward Neural Network, the Multi-layer Perceptron. It was mentioned a few times that a NN can be composed of multiple MLPs connected with each other, giving rise to a so-called Deep Neural Network (DNN). The depth and width of the network have also been defined, and we have introduced the convention that an N-layer NN is a network with N-1 hidden layers. A crucial point in the design of a neural network architecture is represented by the choice of such parameters. Whilst no hard rules exist and the creation of a NN architecture is to this day still closer to an art than a systematic science, in the following we provide a number of guidelines that should be followed when approaching the problem of designing a network.
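As a concrete illustration of how depth and width enter the design, here is a minimal PyTorch sketch of a fully connected network whose number of hidden layers and units per layer can be varied; the sizes used below are purely illustrative:

```python
import torch
import torch.nn as nn

def make_mlp(n_in, n_out, hidden=(32, 32), activation=nn.ReLU):
    """Build an MLP with len(hidden) hidden layers; a nonlinearity is placed
    after every hidden Linear layer so that the stack does not collapse into
    a single linear transformation."""
    layers, sizes = [], (n_in, *hidden)
    for i in range(len(hidden)):
        layers += [nn.Linear(sizes[i], sizes[i + 1]), activation()]
    layers += [nn.Linear(sizes[-1], n_out)]   # output layer (no activation here)
    return nn.Sequential(*layers)

# A 3-layer NN (2 hidden layers + output layer) of width 32
net = make_mlp(n_in=3, n_out=2, hidden=(32, 32))
y = net(torch.randn(16, 3))   # batch of 16 samples, 3 features each
print(net)
print(y.shape)                # torch.Size([16, 2])
```

Growing the network is then just a matter of passing a longer `hidden` tuple (more depth) or larger values (more width), in line with the guidelines discussed here.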
For example, as previously discussed, connecting two or more layers without adding a nonlinear activation function in between should be avoided as this part of the network simply behaves as a single linear layer. An important theorem that provides insights into the design of neural networks is the so-called Universal Approximation theorem . This theorem states that: \"...regardless of the function that we are trying to learn, we know that a single MLP with an infinite number of units can represent this function. We are however not guaranteed that we can train such a network...\" More specifically, learning can fail for two different reasons: i) the optimization algorithm used for training may not be able to find the value of the parameters that correspond to the desired function; ii) the training algorithm might choose the wrong function as a result of overfitting. In practice, experience has shown that deeper networks with fewer units per layer are better both in terms of generalization and robustness in training . This leaves us with a trade-off between shallow networks with many units in each layer and deep networks with fewer units in each layer. An empirical trend has been observed between the depth of a network and its accuracy on test data: To summarize, whilst theoretically 1-layer shallow networks can learn any function, it is advisable these days to trade network width for network depth as training deep networks is nowadays feasible both from a theoretical and computational point of view. It is however always best to start small and grow the network in width and depth as the problem requires. We will see in the following lectures that a large network requires a large amount of training data to avoid overfitting; therefore, when working with small to medium size training data it is always best to avoid using very large networks in the first place. Additional readings The following blog post contains an extensive treatment of activation functions used in NN training beyond the most popular ones that we covered in this lecture.","title":"Basics of Neural Networks"},{"location":"lectures/05_nn/#basics-of-neural-networks","text":"In this lecture, we start our journey in the field of Deep Learning. In order to do so, we must first introduce the most commonly used kind of Neural Networks, the so-called Multi-Layer Perceptron (MLP) (also commonly referred to as fully connected (FC) layer). A MLP is a class of feedforward artificial neural networks (ANNs), where the term feedforward refers to the fact that the flow of information moves from left to right. On the other hand, introducing a change in the direction of the flow as part of the forward pass gives rise to a different family of NNs, so-called Recurrent Neural Networks (they will be the subject of future lectures):","title":"Basics of Neural Networks"},{"location":"lectures/05_nn/#perceptron","text":"To begin with, we focus on the core building block of a MLP, so-called Perceptron or Unit. This is nothing really new to us, as it is exactly the same structure that we used to create the logistic regression model, a linear weighting of the elements of the input vector followed by a nonlinear activation function. We prefer however to schematically represent it in a slightly different way as this will make it easier later on to draw MLPs.
Mathematically, the action of a percepton can be written compactly as dot-product followed by an element-wise nonlinear activation: \\[ y = \\sigma(\\sum_i w_i x_i + b) = \\sigma(\\sum_i \\textbf{w}^T \\textbf{x} + b) \\] where \\(\\textbf{w} \\in \\mathbb{R}^{N_i}\\) is the vector of weights, \\(b\\) is the bias, and \\(\\sigma\\) is a nonlinear activation function. Note that whilst we used a sigmoid function in the logistic regression model, this can be any differentiable function as later we will discuss in more details.","title":"Perceptron"},{"location":"lectures/05_nn/#multi-layer-perceptron","text":"The perceptron model shown above takes as input a vector \\(\\textbf{x} \\in \\mathbb{R}^{N_i}\\) and returns a scalar \\(y\\) , we are now ready to make a step forward where we simply combine multiple perceptrons together to return a vector \\(\\textbf{y} \\in \\mathbb{R}^{N_o}\\) The MLP in the figure above presents \\(N_i=3\\) inputs and \\(N_o=2\\) outputs. By highlighting the original perceptron in green, we can easily observed that a MLP is simply a composition of \\(N_o\\) perceptrons, which again we can compactly write as a matrix-vector multiplication followed by an element-wise nonlinear activation: \\[ y_j = \\sigma(\\sum_i w_{ji} x_i + b), \\quad \\textbf{y} = \\sigma(\\textbf{W} \\textbf{x} + \\textbf{b}) \\] where \\(\\textbf{W} \\in \\mathbb{R}^{N_o \\times N_i}\\) is the matrix of weights, \\(\\textbf{b} \\in \\mathbb{R}^{N_o}\\) is a vector of biases. Finally, if we stack multiple MLPs together we obtained what is generally referred to as N-layer NN, where the count of the number of layers does not include the input layer. For example, a 3-layer NN has the following structure where we omit for simplicity the bias terms in the schematic drawing. This figure gives us the opportunity to introduce some terminology commonly used in the DL community: Input layer : first layer taking the input vector \\(\\textbf{x}\\) as input and returning an intermediate representation \\(\\textbf{z}^{[1]}\\) ; Hidden layers : second to penultimate layers taking as input the previous representation \\(\\textbf{z}^{[i-1]}\\) and returning a new representation \\(\\textbf{z}^{[i]}\\) ; Ouput layer : last layer producing the output of the network \\(\\textbf{y}\\) ; Depth : number of hidden layers (plus output layer); Width : number of units in each hidden layer. Note that we will always use the following notation \\(\\cdot^{(i)[j]}\\) where round brackets are used to refer to a specific training sample and square brackets are used to refer to a specific layer.","title":"Multi-layer Perceptron"},{"location":"lectures/05_nn/#activation-functions","text":"We have just started to appreciate the simplicity of NNs. A Neural Network is nothing more than a stack of linear transformations and nonlinear element-wise activation functions. If such activation functions where omitted, we could combine the various linear transformations together in a single matrix, as the product of N matrices. Assuming that sigma acts as an identity matrix \\(\\sigma(\\textbf{x})=\\textbf{Ix}=\\textbf{x}\\) , (and omitting biases for simplicity) we get: $$ \\textbf{y} = \\sigma(\\textbf{W}^{[3]}\\sigma(\\textbf{W}^{[2]}\\sigma(\\textbf{W}^{[1]} \\textbf{x}))) = \\textbf{W}^{[3]}\\textbf{W}^{[2]}\\textbf{W}^{[1]}\\textbf{x} = \\textbf{W} \\textbf{x} $$ so no matter how deep the network is, we can always reconduct it to a linear model. 
Depending on the final activation and loss function, therefore we will have a linear regression or a logistic regression model. We consider here a very simple example to show the importance of nonlinear activations before delving into the details. Let's assume that we wish the learn the XOR (eXclusive OR) boolean logic operator from the following four training samples: \\[ \\textbf{x}^{(1)} = [0, 0] \\rightarrow y^{(1)}=0 \\] \\[ \\textbf{x}^{(2)} = [0, 1] \\rightarrow y^{(2)}=1 \\] \\[ \\textbf{x}^{(3)} = [1, 0] \\rightarrow y^{(3)}=1 \\] \\[ \\textbf{x}^{(4)} = [1, 1] \\rightarrow y^{(4)}=0 \\] Starting from the linear regression model, we can define a matrix \\(\\textbf{X}_{train} = [\\textbf{x}^{(1)}, \\textbf{x}^{(2)}, \\textbf{x}^{(3)}, \\textbf{x}^{(4)}]\\) and a vector \\(\\textbf{y}_{train} = [y^{(1)}, y^{(2)}, y^{(3)}, y^{(4)}]\\) . The linear model becomes: \\[ \\textbf{y}_{train} = \\textbf{X}_{train}^T \\boldsymbol \\theta \\] where the weights \\(\\boldsymbol \\theta\\) are obtained as detailed in the previous section. It can be easily proven that the solution is \\(\\boldsymbol \\theta=[0,0,0.5]\\) , where \\(\\textbf{w}=[0,0]\\) and \\(b=0.5\\) . This means that, no matter the input the output of the linear model will always be equal to \\(0.5\\) ; in other words, the model is unable to distinguish between the true or false outcomes. If instead we introduce a nonlinearity between two weight matrices (i.e., a 2-layer NN), the following combination of weights and biases (taken from the Goodfellow book) will lead to a correct prediction: \\[ \\textbf{W}^{[1]} = \\begin{bmatrix} 1 & 1 \\\\ 1 & 1\\end{bmatrix}, \\textbf{W}^{[2]} = \\begin{bmatrix} 1 \\\\ -2 \\end{bmatrix}^T, \\textbf{b}^{[1]} = \\begin{bmatrix} 0 \\\\ -1 \\end{bmatrix}, b^{[2]} = 0 \\] Note that in this case the \\(\\sigma=ReLU\\) activation function, which we will introduce in the next section, must be used. Of course, there may be many more combinations of weights and biases that lead to a satisfactory prediction. You can prove this to yourself by initializing the weights and biases randomly and optimizing them by means of a stochastic gradient-descent algorithm. Having introduced nonlinearites every time after we apply the weight matrices to the vector flowing through the computational graph, the overall set of operations cannot be simply reconducted to a matrix-vector multiplication and allows us to learn highly complex nonlinear mappings between input features and targets. The role of activation functions is however not always straightforward and easy to grasp. Whilst we can say that they help in the learning process, not every function is suitable for this task and in fact, some functions may prevent the network from learning at all. In the following we look at the most commonly used activation functions and discuss their origin and why they became popular and useful in Deep Learning: Sigmoid and Tanh : historically these were the most popular activation functions as they are differentiable across the entire domain. In the past, there was in fact a strong belief that gradient-descent cannot operate on functions that have singularities; although this is correct from a theoretical point of view it was later proved to be wrong in practice. 
They are mathematically defined as: $$ \\sigma_s(x) = \\frac{1}{1-e^{-x}} $$ and $$ \\sigma_t(x) = 2 \\sigma_s(2x) - 1 $$ Whilst still used in various contexts, these activation functions saturate very quickly (i.e., large values are clipped to 1 and small values are clipped to -1 for tanh or 0 for sigmoid). This leads to the so-called vanishing gradient problem that we will discuss in more details in following lectures; simply put, if we look at the the gradient of both of these functions, it is non-zero only when x is near zero and becomes zero away from it, meaning that if the output of a linear layer is large the gradient of the activation function will be zero and therefore the gradient will stop flowing through backpropagation. This is particularly problematic for deep network as the training of the early layers becomes very slow. ReLU (Rectified Linear Unit): this activation function became very popular in the start of the 21st century and since then it is the most commonly used activation function for NN training. It is much closer to a linear activation than the previous two, but introduces a nonlinearity by putting negative inputs to zero. By doing so, the ReLU activation function is a piecewise linear function. This shows that non-differentiable functions can be used in gradient based optimization, mostly because numerically we will hardly (if not never) have an output of a NN layer that is exactly zero when fed as input to the activation. Mathematically speaking, we can write it as: $$ \\sigma_r(x) = max ( 0,x ) = \\begin{cases} x & x\\ge 0, \\quad 0 & x<0 \\end{cases} $$ whilst its derivative is: $$ \\sigma'_{relu}(x) = \\begin{cases} 1 & x\\ge 0, \\quad 0 & x<0 \\end{cases} $$ We can observe that this activation function never saturates, for every value in the positive axis the derivative is always 1. Such a property makes ReLU suitable for large networks as the risk of vanishing gradients is greatly reduced. A downside of ReLU is that the entire negative axis acts as an annihilator preventing information to flow. A strategy to prevent or reduce the occurrences of negative inputs is represented by the initialization of biases to a value slightly greater than zero (e.g., b=0.1). Leaky ReLU (Leaky Rectified Linear Unit): a modified version of the ReLU activation function aimed once again at avoiding zeroing of inputs in the negative axis. This function is identical to the ReLU in the positive axis, whilst another straight line with smaller slope is used in the negative axis: $$ \\sigma'_{l-relu}(x) = max ( 0,x ) + \\alpha min ( 0,x ) = \\begin{cases} x & x\\ge 0, \\quad \\alpha x & x<0 \\end{cases} $$ By doing so, also negative inputs can flow through the computational graph. A variant of L-ReLU, called P-ReLU, allows for the \\(\\alpha\\) parameter to be learned instead of being fixed. Absolute ReLU (Absolute Rectified Linear Unit): a modified version of the ReLU activation function that is symmetric with respect to the \\(x=0\\) axis: $$ \\sigma'_{l-relu}(x) = |x| = \\begin{cases} x & x\\ge 0, \\quad -x & x<0 \\end{cases} $$ Whilst this is not a popular choice in the DL literature, it has been successfully used in object detection tasks where the features that we wish the NN to extract from the training process are polarity invariant. Cosine, Sine, ... : the use of periodic functions have recently started to appear in the literature especially in the context of scientific DL (e.g., Physics-informed neural networks). 
Softmax : this activation function is commonly used at the end of the last layer in the context of multi-label classification. However as it takes an input vector of N numbers and converts it into an output vector of probabilities (i.e., N numbers summing to 1), it may also be used as a sort of switch in the internal layers. The following two figures show the different activation functions discussed above and their gradients.","title":"Activation functions"},{"location":"lectures/05_nn/#network-architecture","text":"Up until now we have discussed the key components of a Feedforward Neural Network, the Multi-layer Perceptron. It was mentioned a few times that a NN can be composed of multiple MLPs connected with each other, giving rise to a so-called Deep Neural Network (DNN). The depth and width of the network has been also defined, and we have introduced the convention that a N-layer NN is a network with N-1 hidden layers. A crucial point in the design of a neural network architecture is represented by the choice of such parameters. Whilst no hard rules exist and the creation of a NN architecture is to these days still closer to an art than a systematic science, in the following we provide a number of guidelines that should be followed when approaching the problem of designing a network. For example, as previously discussed, connecting two or more layers without adding a nonlinear activation function in between should be avoided as this part of the network simply behaves as a single linear layer. An important theorem that provide insights into the design of neural networks is the so-called Universal Approximation theorem . This theorem states that: \"...regardless of the function that we are trying to learn, we know that a single MLP with infinite number of units can represent this function. We are however not guaranteed that we can train such a network...\" More specifically, learning can fail for two different reasons: i) the optimization algorithm used for training may not be able to find the value of the parameters that correspond to the desired function; ii) the training algorithm might choose the wrong function as a result of over\ufb01tting. In practice, experience has shown that deeper networks with fewer units per layer are better both in terms of generalization and robustness to training . This leads us with a trade-off between shallow networks with many units in each layer and deep networks with fewer units in each layer. An empirical trend has been observed between the depth of a network and its accuracy on test data: To summarize, whilst theoretically 1-layer shallow networks can learn any function, it is advisable these days to trade network width with network depth as training deep networks is nowadays feasible both from a theoretical and computational point of view. It is however always best to start small and grow the network in width and depth as the problem requires. 
We will see in the following lectures that a large network requires a large training data to avoid overfitting; therefore, when working with small to medium size training data it is always best to avoid using very large networks in the first place.","title":"Network architecture"},{"location":"lectures/05_nn/#additional-readings","text":"the following blog post contains an extensive treatment of activation functions used in NN training beyond the most popular ones that we covered in this lecture.","title":"Additional readings"},{"location":"lectures/06_nn/","text":"More on Neural Networks In this lecture, we will delve into some more advanced topics associated to the creation and training of deep neural networks. Backpropagation First of all, once a neural network architecture has been defined for the problem at hand, we need a method that can learn the best set of free parameters of such nonlinear function represented as \\(f_\\theta\\) . More specifically, we want to initialize the network with some random weights and biases (we will soon discuss how such initialization can be performed) and use the training data at hand to improve our weights and biases in order to minimize a certain loss function. Whilst this can be easily done by means of gradient based optimizers like those presented in Lecture 3, a key ingredient that we need to provide to such algorithms is represented by the gradient of the loss function with respect to each and every weight and bias parameters. We have already alluded at a technique that can do so whilst discussing a simple logistic regression model. This is generally referred to by the ML community as back-propagation and more broadly by the mathematical community as Reverse Automatic Differentiation . Let's start by taking the same schematic diagram used for the logistic regression example and generalize it to a N-layer NN: The main difference here, which we will need to discuss in details, is the fact that in the forward pass we feed the input into a stack of linear layers prior to computing the loss function. The backpropagation does need to be able to keep track of the chain of operations (i.e., computational graph) and traverse it back. However, as already done for the logistic regression model, all we need to do is to write the entire chain of operations as a chain of atomic ones that we can then easily traverse back. Let's do this for the network above and a single training sample \\(\\textbf{x}\\) : \\[ \\textbf{z}^{[1]} = \\textbf{W}^{[1]}\\textbf{x} + \\textbf{b}^{[1]}, \\quad \\textbf{a}^{[1]} = \\sigma(\\textbf{z}^{[1]}), \\] \\[ \\textbf{z}^{[2]} = \\textbf{W}^{[2]}\\textbf{a}^{[1]} + \\textbf{b}^{[2]}, \\quad \\textbf{a}^{[2]} = \\sigma(\\textbf{z}^{[2]}), \\] \\[ z^{[3]} = \\textbf{w}^{[3]T}\\textbf{a}^{[2]} + b^{[3]}, \\quad a^{[3]} = \\sigma(z^{[3]}), \\] \\[ l = \\mathscr{L}(y,a^{[3]}). \\] Given such a chain of operations, we are now able to find the derivatives of the loss function with respect to any of the weights or biases. 
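In practice this traversal of the computational graph is exactly what automatic differentiation frameworks do for us. Below is a minimal PyTorch sketch of the same 3-layer chain (sigmoid activations and binary cross-entropy, with purely illustrative sizes); the gradients produced by autograd can be used to check the hand derivation that follows:

```python
import torch

torch.manual_seed(0)

# Illustrative layer sizes for the 3-layer network above
N0, N1, N2 = 4, 5, 3
x = torch.randn(N0)
y = torch.tensor(1.0)

W1 = torch.randn(N1, N0, requires_grad=True); b1 = torch.zeros(N1, requires_grad=True)
W2 = torch.randn(N2, N1, requires_grad=True); b2 = torch.zeros(N2, requires_grad=True)
w3 = torch.randn(N2, requires_grad=True);     b3 = torch.tensor(0.0, requires_grad=True)

# Forward pass (sigmoid activations, binary cross-entropy loss)
a1 = torch.sigmoid(W1 @ x + b1)
a2 = torch.sigmoid(W2 @ a1 + b2)
a3 = torch.sigmoid(w3 @ a2 + b3)
loss = -(y * torch.log(a3) + (1 - y) * torch.log(1 - a3))

# Backward pass: autograd traverses the chain of atomic operations for us
loss.backward()
print(W2.grad.shape)   # (N2, N1), i.e. dl/dW2, to be compared with the derivation below
```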
As an example we consider here \\(\\partial l / \\partial \\textbf{W}^{[2]}\\) : \\[ \\frac{\\partial l}{\\partial \\textbf{W}^{[2]}} = \\frac{\\partial l}{\\partial a^{[3]}} \\frac{\\partial a^{[3]}}{\\partial z^{[3]}} \\frac{\\partial z^{[3]}}{\\partial \\textbf{a}^{[2]}} \\frac{\\partial \\textbf{a}^{[2]}}{\\partial \\textbf{z}^{[2]}} \\frac{\\partial \\textbf{z}^{[2]}}{\\partial \\textbf{W}^{[2]}} \\] Note that since this will be effectively evaluated from left to right, it is perhaps easier to rewrite the chain of derivatives as follows: \\[ \\frac{\\partial l}{\\partial \\textbf{W}^{[2]}} = \\frac{\\partial \\textbf{z}^{[2]}}{\\partial \\textbf{W}^{[2]}} \\frac{\\partial \\textbf{a}^{[2]}}{\\partial \\textbf{z}^{[2]}} \\frac{\\partial z^{[3]}}{\\partial \\textbf{a}^{[2]}} \\frac{\\partial a^{[3]}}{\\partial z^{[3]}} \\frac{\\partial l}{\\partial a^{[3]}} \\] Assuming for simplicity that the binary cross-entropy and sigmoid functions are used here as the loss and activation functions, respectively, we get: \\[ \\frac{\\partial a^{[3]}}{\\partial z^{[3]}} \\frac{\\partial l}{\\partial a^{[3]}} = a^{[3]} - y \\] (which has already been derived in the logistic regression lecture). The subsequent derivatives are: \\[ \\frac{\\partial z^{[3]}}{\\partial \\textbf{a}^{[2]}} = \\textbf{w}^{[3]}_{N_{a^{[2]}} \\times 1} \\] \\[ \\frac{\\partial \\textbf{a}^{[2]}}{\\partial \\textbf{z}^{[2]}} = diag\\{\\textbf{a}^{[2]}(1-\\textbf{a}^{[2]})\\}_{N_{z^{[2]}} \\times N_{a^{[2]}}} \\] \\[ \\frac{\\partial \\textbf{z}^{[2]}}{\\partial \\textbf{W}^{[2]}} = \\begin{bmatrix} \\mathbf{a}^{[1]} & \\mathbf{0} & \\ldots & \\mathbf{0} \\\\ \\mathbf{0} & \\mathbf{a}^{[1]} & \\ldots & \\mathbf{0} \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ \\mathbf{0} & \\mathbf{0} & \\ldots & \\mathbf{a}^{[1]} \\end{bmatrix}_{N_{a^{[1]}}N_{z^{[2]}} \\times N_{z^{[2]}}} \\] where the last two expressions correspond to the transposed Jacobian. Putting all together: \\[ \\frac{\\partial l}{\\partial \\textbf{W}^{[2]}} = \\begin{bmatrix} \\mathbf{a}^{[1]} & \\mathbf{0} & \\ldots & \\mathbf{0} \\\\ \\mathbf{0} & \\mathbf{a}^{[1]} & \\ldots & \\mathbf{0} \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ \\mathbf{0} & \\mathbf{0} & \\ldots & \\mathbf{a}^{[1]} \\end{bmatrix} diag\\{\\textbf{a}^{[2]}(1-\\textbf{a}^{[2]})\\} \\textbf{w}^{[3]} (a^{[3]} - y) \\] which can be later reshaped into a matrix of size \\(N_{z^{[2]}} \\times N_{a^{[1]}}\\) . This derivative can also written in a more compact form as \\[ \\frac{\\partial l}{\\partial \\textbf{W}^{[2]}} = \\textbf{a}^{[1]}[(\\textbf{a}^{[2]}(1-\\textbf{a}^{[2]})) \\cdot \\textbf{w}^{[3]}(a^{[3]} - y)]^T \\] where \\(\\cdot\\) is used to refer to element-wise products. Similar results can be obtained for the bias vector and for both weights and biases in the other layers as depicted in the figure below for a 2-layer NN: To conclude, the backpropagation equations in the diagram above are now generalized for the case of \\(N_s\\) training samples \\(\\textbf{X} \\in \\mathbb{R}^{N \\times N_s}\\) and a generic activation function \\(\\sigma\\) whose derivative is denoted as \\(\\sigma'\\) . 
Here we still assume an output of dimensionality one -- \\(\\textbf{Y} \\in \\mathbb{R}^{1 \\times N_s}\\) : \\[ \\textbf{dZ}^{[2]}=\\textbf{A}^{[2]}-\\textbf{Y} \\qquad (\\textbf{A}^{[2]},\\textbf{dZ}^{[2]} \\in \\mathbb{R}^{1 \\times N_s}) \\] \\[ \\textbf{dW}^{[2]}= \\frac{1}{N_s} \\textbf{dZ}^{[2]}\\textbf{A}^{[1]T} \\qquad (\\textbf{A}^{[1]} \\in \\mathbb{R}^{N^{[1]} \\times N_s}) \\] \\[ db^{[2]}= \\frac{1}{N_s} \\sum_i \\textbf{dZ}_{:,i}^{[2]} \\] \\[ \\textbf{dZ}^{[1]}=\\textbf{W}^{[2]^T}\\textbf{dZ}^{[2]} \\cdot \\sigma'(\\textbf{Z}^{[1]}) \\qquad (\\textbf{dZ}^{[1]} \\in \\mathbb{R}^{N^{[1]} \\times N_s}) \\] \\[ \\textbf{dW}^{[1]}= \\frac{1}{N_s} \\textbf{dZ}^{[1]}\\textbf{X}^T \\] \\[ \\textbf{db}^{[1]}= \\frac{1}{N_s} \\sum_i \\textbf{dZ}_{:,i}^{[1]} \\] Initialization Neural networks are highly nonlinear functions. The associated cost function used in the training process in order to optimize the network weights and biases is therefore non-convex and contains several local minima and saddle points. A key component in non-convex optimization is represented by the starting guess of the parameters to optimize, which in the context of deep learning is identified by initialization of weights and biases. Whilst a proper initialization has been shown to be key to a successful training of deep train NNs, this is a very active area of research as initialization strategies are so far mostly based on heuristic arguments and experience. Zero initialization First of all, let's highlight a bad choice of initialization that can compromise the training no matter the architecture of the network and other hyperparameters. A common choice in standard optimization in the absence of any strong prior information is to initialize all the parameters to zero: if we decide to follow such a strategy when training a NN, we will soon realize that training is stagnant due to the so called symmetry problem (also referred to as symmetric gradients ). Note that a similar situation arises also if we choose a constant values for weights and biases (e.g., \\(c^{[1]}\\) for all the weights and biases in the first layer and \\(c^{[2]}\\) for all the weights and biases in the second layer): Let's take a look at this with an example: Since the activations are constant vectors, back-propagation produces constant updates for the weights (and biases), leading to weights and biases to never lose the initial symmetry. Random initialization A more appropriate way to initialize the weights of a neural network is to sample their values from random distributions, for example: $$ w_{ij}^{[.]} \\sim \\mathcal{N}(0, 0.01) $$ where the choice of the variance is based on the following trade-off: too small variance leads to the vanishing gradient problem (i.e., slow training), whilst too high variance leads to the exploding gradient problem (i.e., unstable training). On the other hand, for the biases we can use zero or a constant value. If you remember, we have already mentioned this when discussing the ReLU activation function: a good strategy to limit the amount of negative values as input to this activation function is to choose a small constant bias (e.g., \\(b=0.1\\) ). 
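A minimal PyTorch sketch of this simple random initialization (Gaussian weights with small variance and a small constant bias; the layer sizes are purely illustrative):

```python
import torch
import torch.nn as nn

layer = nn.Linear(64, 32)   # illustrative FC layer: 64 inputs, 32 units

with torch.no_grad():
    layer.weight.normal_(mean=0.0, std=0.1)   # w ~ N(0, 0.01), i.e. standard deviation 0.1
    layer.bias.fill_(0.1)                     # small positive bias, helpful with ReLU layers
```

For the more advanced schemes discussed next, `torch.nn.init` provides ready-made helpers such as `xavier_normal_` and `kaiming_normal_`.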
Whilst this approach provides a good starting point for stable training of neural networks, more advanced initialization strategies have been proposed in the literature: Uniform : the weights are initialized with uniform distributions whose variance depend on the number of units in the layer: $$ w_{ij}^{[k]} \\sim \\mathcal{U}(-1/\\sqrt{N^{[k]}}, 1/\\sqrt{N^{[k]}}) $$ or $$ w_{ij}^{[k]} \\sim \\mathcal{U}(-\\sqrt{6/(N^{[k-1]}+N^{[k]})}, \\sqrt{6/(N^{[k-1]}+N^{[k]})}) $$ This strategy is commonly used with FC layers. Xavier : the weights are initialized with normal distributions whose variance depend on the number of units in the layer: $$ w_{ij}^{[k]} \\sim \\mathcal{N}(0, 1/N^{[k]}) $$ This strategy ensures that the variance remains the same across the layers. Xavier initialization is very popular especially in layers using Tanh activations. He : the weights are initialized with normal distributions whose variance depend on the number of units in the layer: $$ w_{ij}^{[k]} \\sim \\mathcal{N}(0, 2/N^{[k]}) $$ This strategy ensures that the variance remains the same across the layers. He initialization is very popular especially in layers using ReLU activations. Why Deep Learning took off in the last century Before moving onto the last topic of this lecture, a unified statistical view of loss functions in deep learning, let's try to answer a question that many of you may ask: what makes NNs so popular these days and why deep learning took off in the last decade? By now, we have made ourself familiar with the concept of neural networks, learned about its basic building block (the so-called perceptron) and how by simply horizontally and vertically stacking multiple perceptrons we can create universal function approximators that can be trained to learn very complex nonlinear relationships between inputs and targets (provided availability of a large enough amount of training data). The process of creating and training NNs relies on the following four key ingredients: linear algebra operations : matrix-vector and matrix-matrix multiplications (at least within the context of FC networks); activations : nonlinear functions that enable the learning of complex nonlinear mappings; loss functions : functions that can be used to evaluate the goodness of the model in terms of predicting targets from inputs; learning algorithms : optimization methods that can produce the best weights and biases using gradient information; Eventually, most of the underlying theory of NNs was already fairly mature in 70s and 80s; nevertheless, until the early 2000, research in the field of artificial neural networks was still considered a niche domain mostly theoretical and with little practical implications. So, what did lead to the renaissance of Deep Learning? Two key factors in the popularity and success of Neural Networks growth are undoubtedly: larger datasets : with the growth of the internet and social media, a digital revolution has started since the beginning of the new century, where datasets of ever increasing size can be easily sourced. This applies both to images and text as well as audio and video content. larger networks : with the emergence of new hardware technology such as GPUs, training large deep networks is nowadays possible, not only for large corporations like Google or Microsoft but also in Academia or for small- and medium-size enterprises that want to leverage their data to make data-driven business decisions. 
Alongside the data and hardware revolution, a number of important algorithmic discoveries have also led to faster, more robust training of NNs, making this process easier and more accessible to domain scientists in a variety of scientific fields. Some of them have already been discussed, but we wish here to put more emphasis on them: MSE --> Cross-entropy : whilst in the past the mean square error (MSE) loss was used for pretty much every task, nowadays classification or semantic segmentation tasks are more commonly solved by means of Cross-entropy loss functions. This shift in training strategy is mostly due to the fact that the ML community and the statistical community got closer to each other in the last two decades, which led to the development of strong statistical foundations in the theory of deep learning; Sigmoid --> ReLU : whilst continuous, differentiable activation functions used to be a must in the past mostly due to the belief that gradient descent algorithms (and back-propagation) need these kinds of functions to behave correctly, it is now clear that this constraint can be greatly relaxed. Piece-wise linear activation functions like ReLU are nowadays not only widely used but are pretty much the de-facto standard for hidden layers in deep neural networks. Jarrett et al. (2009) observed that \"using a rectifying nonlinearity is the single most important factor in improving the performance of a recognition system\" . Maximum likelihood estimators To conclude, we would like to revisit the loss functions already introduced in the context of linear and logistic regression models and introduce some other loss functions that are commonly employed to train neural networks. However, whilst so far we have chosen different loss functions for each task (regression vs. classification) without really providing a statistical motivation of such choices, in this section we will instead try to define a common framework based on the concept of Maximum Likelihood Estimation (MLE). Let's start by considering a set of samples drawn from the true (but unknown) distribution: \\[ \\mathbf{X} = \\{ \\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, ..., \\mathbf{x}^{(N_s)} \\} \\sim p_{data}(\\mathbf{X}) \\] Second, a parametric family of probability distributions is defined: \\[ p_{model}(\\mathbf{X}; \\theta) \\] This distribution maps any vector \\(\\mathbf{x}\\) to a real number and is generally referred to as the likelihood function. Its free parameters \\(\\theta\\) must be chosen such that this probability distribution is as close as possible to the true one.
As an example, if we consider a multi-variate gaussian distribution with uncorrelated members, the free parameters become \\(\\theta = \\{ \\boldsymbol \\mu, \\sigma\\}\\) and the probability density function becomes: \\[ p_{model}(\\mathbf{x}; \\{ \\boldsymbol \\mu, \\sigma\\}) = \\frac{1}{\\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{||\\mathbf{x} - \\boldsymbol \\mu||_2^2}{2 \\sigma^2}} \\] We can now define the MLE as follows: \\[ \\theta_{ML} = \\underset{\\theta} {\\mathrm{argmax}} \\; p_{model}(\\mathbf{X}; \\theta) \\] Assuming now statistical independence between the samples \\(\\mathbf{x}^{(i)}\\) , the equation above can be rewritten as: \\[ \\begin{aligned} \\theta_{ML} &= \\underset{\\theta} {\\mathrm{argmax}} \\; \\prod_{i=1}^{N_s} p_{model}(\\mathbf{x}^{(i)}; \\theta) \\\\ &= \\underset{\\theta} {\\mathrm{argmax}} \\; \\sum_{i=1}^{N_s} log(p_{model}(\\mathbf{x}^{(i)}; \\theta)) \\\\ &\\approx \\underset{\\theta} {\\mathrm{argmax}} \\; E_{\\mathbf{x} \\sim p_{data}} [ log(p_{model}(\\mathbf{x}; \\theta))] \\\\ &= \\underset{\\theta} {\\mathrm{argmin}} \\; - E_{\\mathbf{x} \\sim p_{data}} [ log(p_{model}(\\mathbf{x}; \\theta))] \\end{aligned} \\] Simply put, maximizing the parametric probability density function is shown to be equivalent to minimizing the negative log likelihood of the same distribution. An optimization problem must be therefore solved to find the most suitable free parameters. Going back to the multi-variate gaussian example, let's assume we are interested to estimate the mean (whilst we keep the variance fixed): \\[ \\begin{aligned} \\boldsymbol \\mu_{ML} &= \\underset{\\boldsymbol \\mu} {\\mathrm{argmin}} \\; - \\sum_{i=1}^{N_s} log \\Big( \\frac{1}{\\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{||\\mathbf{x}^{(i)} - \\boldsymbol \\mu||_2^2}{2 \\sigma^2}} \\Big) \\\\ &= \\underset{\\boldsymbol \\mu} {\\mathrm{argmin}} \\; \\sum_{i=1}^{N_s} \\frac{||\\mathbf{x}^{(i)} - \\boldsymbol \\mu||_2^2}{2 \\sigma^2} \\end{aligned} \\] Computing the gradient and imposing it to be zero gives us the point estimate of \\(\\boldsymbol \\mu_{ML}\\) : \\[ \\frac{\\partial -\\sum_i log p}{\\partial \\boldsymbol \\mu} = 0 \\rightarrow \\sum_{i=1}^{N_s} (\\mathbf{x}^{(i)} - \\boldsymbol \\mu) = 0 \\rightarrow \\boldsymbol \\mu_{ML} = \\frac{1}{N_s} \\sum_{i=1}^{N_s} \\mathbf{x}^{(i)} \\] which is nothing more than the well-known sample mean of the distribution. In order to apply the same framework to learning problems like those arising in DL, the ML estimation is now extended to the case of conditional probabilities where we are given a set of training pairs \\((\\mathbf{x}, y)^{(i)}\\) : \\[ \\begin{aligned} \\theta_{ML} &= \\underset{\\theta} {\\mathrm{argmax}} \\; p_{model}(Y | \\mathbf{X}; \\theta) \\\\ &= ... \\\\ &= \\underset{\\theta} {\\mathrm{argmin}} \\; - E_{\\mathbf{x},y \\sim p_{data}} [ log(p_{model}(y|\\mathbf{x}; \\theta))] \\end{aligned} \\] Regression Linear regression Let's first apply this framework to a simple linear regression problem. 
Here, under the assumption of gaussian noise, the likelihood can be written as a multi-variate gaussian distribution: \\[ y = \\tilde{\\mathbf{x}}^T \\boldsymbol \\theta + \\mathbf{n} \\sim \\mathcal{N}(\\hat{y} = \\tilde{\\mathbf{x}}^T \\boldsymbol \\theta, \\sigma) \\] Plugging this distribution into the negative log likelihood expression, we obtain: \\[ \\boldsymbol \\theta_{ML} = \\underset{\\boldsymbol \\theta} {\\mathrm{argmin}} \\; \\sum_{i=1}^{N_s} \\frac{||\\hat{y}^{(i)} - y^{(i)}||_2^2}{2\\sigma^2} = \\frac{N_s}{2\\sigma^2} MSE(\\textbf{y}_{train}, \\hat{\\textbf{y}}_{train})\\\\ \\] This cost function can be seen to be a rescaled version of the MSE function previously introduced as the loss function for the linear regression model. Note however, that this model is not only more rigorous from a statistical point of view but provides also a natural way to handle training samples with different confidence. By using sample-dependant scaling factors ( \\(\\sigma^{(i)}\\) ), different samples can be chosen to contribute more or less to the training process. Multi-layer perceptron regression A very similar derivation can be performed for a neural network composed by one or more MLPs. Eventually we simply need to swap the previously linearly predicted output \\(\\hat{y}=\\tilde{\\mathbf{x}}^T \\boldsymbol \\theta\\) with a new output produced by the chosen nonlinear functional \\(\\hat{y}=f_\\theta(\\mathbf{x})\\) . In conclusion, we must remember that the MSE loss function, commonly used for regression tasks in ML and DL, is a MLE in disguise. Classification Binary classification In statistical learning, a Bernoulli distribution is commonly used for the task of binary (i.e., 2 label) classification: \\[ P(y)= \\phi y + (1-\\phi)(1-y) \\] where \\(y\\) is the outcome and \\(\\phi\\) is its probability that we wish to learn by means of a model (i.e., logistic regression or MLP). Moreover, as we wish to learn a probability this value must be bound between 0 and 1; this can be easily achieved by feeding the output of the model into a sigmoid function \\(\\sigma\\) : \\[ \\hat{y} = \\sigma (f_\\theta(\\mathbf{x})) \\] Put together: \\[ \\begin{aligned} \\boldsymbol \\theta_{ML} &= \\underset{\\boldsymbol \\theta} {\\mathrm{argmin}} \\; -\\sum_{i=1}^{N_s} log(p_{model}(y^{(i)}|\\mathbf{x}^{(i)}; \\theta) \\\\ &= -\\sum_{i=1}^{N_s} y^{(i)} log \\hat{y}^{(i)} + (1-y^{(i)}) log (1-\\hat{y}^{(i)}) \\end{aligned} \\] which is the same loss function that we have introduced and discussed in details in the context of logistic regression. Once again, we note how we have here simply defined a MLE for a classification task and obtained the well-know binary cross-entropy loss function. Multi-label classification An extension of binary classification, multi-label classification aims at producing an estimate of the most likely class within a set of \\(N_c\\) classes. The combination of a Bernoulli distribution and sigmoid activation used for the binary classifier is here replaced by a Multinoulli distribution and softmax activation, where the latter is defined as follows: \\[ \\hat{\\mathbf{y}} = \\sigma(\\mathbf{x}) =\\frac{e^\\mathbf{x}}{\\sum_{i=1}^{N_c} e^{x_i}} \\] A property of such activation function is that it takes as input a vector of numbers (sometimes called logits )) and produces as output a vector of probabilities (i.e., \\(y_i>0\\) and \\(\\sum_{i=1}^{N_c} y_i=1\\) ). 
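A quick numerical check of this property of the softmax (a sketch with an arbitrary vector of logits):

```python
import numpy as np

def softmax(x):
    """Numerically stable softmax: subtract the max before exponentiating."""
    e = np.exp(x - x.max())
    return e / e.sum()

logits = np.array([2.0, -1.0, 0.5])   # arbitrary logits for N_c = 3 classes
p = softmax(logits)
print(p, p.sum())                      # all entries positive, summing to 1
```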
Put together: \\[ \\begin{aligned} \\boldsymbol \\theta_{ML} &= \\underset{\\boldsymbol \\theta} {\\mathrm{argmin}} \\; -\\sum_{i=1}^{N_s} log(p_{model}(y^{(i)}|\\mathbf{x}^{(i)}; \\theta)) \\\\ &= -\\sum_{i=1}^{N_s} \\sum_{j=1}^{N_c} y_j^{(i)} log \\hat{y}_j^{(i)} \\end{aligned} \\] where the true labels \\(\\mathbf{y}^{(i)}\\) are one-hot encoded vectors (i.e., \\(y_{j=j_{true}}^{(i)}=1\\) and \\(y_{j \\neq j_{true}}^{(i)}=0\\) ). To conclude, let's try to get more insights into why ML estimators work so successfully. In order to do so, we start by defining a measure of similarity between the two distributions of interest: empirical distribution of the data: \\(p_{data}(\\mathbf{X})\\) parametric model distribution: \\(p_{model}(\\mathbf{X}; \\theta)\\) This can be achieved by means of the previously introduced Kullback-Leibler divergence, which we can write as follows: \\[ D_{KL}(p_{data}||p_{model}) = E_{\\mathbf{x} \\sim p_{data}} [log p_{data}(\\mathbf{x}) - log p_{model}(\\mathbf{x})] \\] Since we are interested in estimating the free parameters \\(\\theta\\) such that the model distribution matches that of the data, an equivalent optimization problem can be written with the help of the KL divergence: \\[ \\begin{aligned} \\theta_{KL} &= \\underset{\\theta} {\\mathrm{argmin}} \\; D_{KL}(p_{data}||p_{model}) \\\\ &= \\underset{\\theta} {\\mathrm{argmin}} \\; - E_{\\mathbf{x} \\sim p_{data}} [ log(p_{model}(\\mathbf{x}; \\theta))] \\end{aligned} \\] where the data probability has been removed in the second term since it is independent of \\(\\theta\\) . We can conclude that \\(\\theta_{KL}=\\theta_{ML}\\) and therefore minimizing the KL divergence between the model and data distributions is the same as minimizing their cross-entropy (as done by the ML estimator). Additional readings If you are interested in learning more about network initialization, I recommend reading (and reproducing) the following blog posts: 1 and 2 .","title":"More on Neural Networks"},{"location":"lectures/06_nn/#more-on-neural-networks","text":"In this lecture, we will delve into some more advanced topics associated with the creation and training of deep neural networks.","title":"More on Neural Networks"},{"location":"lectures/06_nn/#backpropagation","text":"First of all, once a neural network architecture has been defined for the problem at hand, we need a method that can learn the best set of free parameters of such a nonlinear function, represented as \\(f_\\theta\\) . More specifically, we want to initialize the network with some random weights and biases (we will soon discuss how such initialization can be performed) and use the training data at hand to improve our weights and biases in order to minimize a certain loss function. Whilst this can be easily done by means of gradient-based optimizers like those presented in Lecture 3, a key ingredient that we need to provide to such algorithms is represented by the gradient of the loss function with respect to each and every weight and bias parameter. We have already alluded to a technique that can do so whilst discussing a simple logistic regression model. This is generally referred to by the ML community as back-propagation and more broadly by the mathematical community as Reverse Automatic Differentiation .
Let's start by taking the same schematic diagram used for the logistic regression example and generalize it to a N-layer NN: The main difference here, which we will need to discuss in details, is the fact that in the forward pass we feed the input into a stack of linear layers prior to computing the loss function. The backpropagation does need to be able to keep track of the chain of operations (i.e., computational graph) and traverse it back. However, as already done for the logistic regression model, all we need to do is to write the entire chain of operations as a chain of atomic ones that we can then easily traverse back. Let's do this for the network above and a single training sample \\(\\textbf{x}\\) : \\[ \\textbf{z}^{[1]} = \\textbf{W}^{[1]}\\textbf{x} + \\textbf{b}^{[1]}, \\quad \\textbf{a}^{[1]} = \\sigma(\\textbf{z}^{[1]}), \\] \\[ \\textbf{z}^{[2]} = \\textbf{W}^{[2]}\\textbf{a}^{[1]} + \\textbf{b}^{[2]}, \\quad \\textbf{a}^{[2]} = \\sigma(\\textbf{z}^{[2]}), \\] \\[ z^{[3]} = \\textbf{w}^{[3]T}\\textbf{a}^{[2]} + b^{[3]}, \\quad a^{[3]} = \\sigma(z^{[3]}), \\] \\[ l = \\mathscr{L}(y,a^{[3]}). \\] Given such a chain of operations, we are now able to find the derivatives of the loss function with respect to any of the weights or biases. As an example we consider here \\(\\partial l / \\partial \\textbf{W}^{[2]}\\) : \\[ \\frac{\\partial l}{\\partial \\textbf{W}^{[2]}} = \\frac{\\partial l}{\\partial a^{[3]}} \\frac{\\partial a^{[3]}}{\\partial z^{[3]}} \\frac{\\partial z^{[3]}}{\\partial \\textbf{a}^{[2]}} \\frac{\\partial \\textbf{a}^{[2]}}{\\partial \\textbf{z}^{[2]}} \\frac{\\partial \\textbf{z}^{[2]}}{\\partial \\textbf{W}^{[2]}} \\] Note that since this will be effectively evaluated from left to right, it is perhaps easier to rewrite the chain of derivatives as follows: \\[ \\frac{\\partial l}{\\partial \\textbf{W}^{[2]}} = \\frac{\\partial \\textbf{z}^{[2]}}{\\partial \\textbf{W}^{[2]}} \\frac{\\partial \\textbf{a}^{[2]}}{\\partial \\textbf{z}^{[2]}} \\frac{\\partial z^{[3]}}{\\partial \\textbf{a}^{[2]}} \\frac{\\partial a^{[3]}}{\\partial z^{[3]}} \\frac{\\partial l}{\\partial a^{[3]}} \\] Assuming for simplicity that the binary cross-entropy and sigmoid functions are used here as the loss and activation functions, respectively, we get: \\[ \\frac{\\partial a^{[3]}}{\\partial z^{[3]}} \\frac{\\partial l}{\\partial a^{[3]}} = a^{[3]} - y \\] (which has already been derived in the logistic regression lecture). The subsequent derivatives are: \\[ \\frac{\\partial z^{[3]}}{\\partial \\textbf{a}^{[2]}} = \\textbf{w}^{[3]}_{N_{a^{[2]}} \\times 1} \\] \\[ \\frac{\\partial \\textbf{a}^{[2]}}{\\partial \\textbf{z}^{[2]}} = diag\\{\\textbf{a}^{[2]}(1-\\textbf{a}^{[2]})\\}_{N_{z^{[2]}} \\times N_{a^{[2]}}} \\] \\[ \\frac{\\partial \\textbf{z}^{[2]}}{\\partial \\textbf{W}^{[2]}} = \\begin{bmatrix} \\mathbf{a}^{[1]} & \\mathbf{0} & \\ldots & \\mathbf{0} \\\\ \\mathbf{0} & \\mathbf{a}^{[1]} & \\ldots & \\mathbf{0} \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ \\mathbf{0} & \\mathbf{0} & \\ldots & \\mathbf{a}^{[1]} \\end{bmatrix}_{N_{a^{[1]}}N_{z^{[2]}} \\times N_{z^{[2]}}} \\] where the last two expressions correspond to the transposed Jacobian. 
Putting all together: \\[ \\frac{\\partial l}{\\partial \\textbf{W}^{[2]}} = \\begin{bmatrix} \\mathbf{a}^{[1]} & \\mathbf{0} & \\ldots & \\mathbf{0} \\\\ \\mathbf{0} & \\mathbf{a}^{[1]} & \\ldots & \\mathbf{0} \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ \\mathbf{0} & \\mathbf{0} & \\ldots & \\mathbf{a}^{[1]} \\end{bmatrix} diag\\{\\textbf{a}^{[2]}(1-\\textbf{a}^{[2]})\\} \\textbf{w}^{[3]} (a^{[3]} - y) \\] which can be later reshaped into a matrix of size \\(N_{z^{[2]}} \\times N_{a^{[1]}}\\) . This derivative can also written in a more compact form as \\[ \\frac{\\partial l}{\\partial \\textbf{W}^{[2]}} = \\textbf{a}^{[1]}[(\\textbf{a}^{[2]}(1-\\textbf{a}^{[2]})) \\cdot \\textbf{w}^{[3]}(a^{[3]} - y)]^T \\] where \\(\\cdot\\) is used to refer to element-wise products. Similar results can be obtained for the bias vector and for both weights and biases in the other layers as depicted in the figure below for a 2-layer NN: To conclude, the backpropagation equations in the diagram above are now generalized for the case of \\(N_s\\) training samples \\(\\textbf{X} \\in \\mathbb{R}^{N \\times N_s}\\) and a generic activation function \\(\\sigma\\) whose derivative is denoted as \\(\\sigma'\\) . Here we still assume an output of dimensionality one -- \\(\\textbf{Y} \\in \\mathbb{R}^{1 \\times N_s}\\) : \\[ \\textbf{dZ}^{[2]}=\\textbf{A}^{[2]}-\\textbf{Y} \\qquad (\\textbf{A}^{[2]},\\textbf{dZ}^{[2]} \\in \\mathbb{R}^{1 \\times N_s}) \\] \\[ \\textbf{dW}^{[2]}= \\frac{1}{N_s} \\textbf{dZ}^{[2]}\\textbf{A}^{[1]T} \\qquad (\\textbf{A}^{[1]} \\in \\mathbb{R}^{N^{[1]} \\times N_s}) \\] \\[ db^{[2]}= \\frac{1}{N_s} \\sum_i \\textbf{dZ}_{:,i}^{[2]} \\] \\[ \\textbf{dZ}^{[1]}=\\textbf{W}^{[2]^T}\\textbf{dZ}^{[2]} \\cdot \\sigma'(\\textbf{Z}^{[1]}) \\qquad (\\textbf{dZ}^{[1]} \\in \\mathbb{R}^{N^{[1]} \\times N_s}) \\] \\[ \\textbf{dW}^{[1]}= \\frac{1}{N_s} \\textbf{dZ}^{[1]}\\textbf{X}^T \\] \\[ \\textbf{db}^{[1]}= \\frac{1}{N_s} \\sum_i \\textbf{dZ}_{:,i}^{[1]} \\]","title":"Backpropagation"},{"location":"lectures/06_nn/#initialization","text":"Neural networks are highly nonlinear functions. The associated cost function used in the training process in order to optimize the network weights and biases is therefore non-convex and contains several local minima and saddle points. A key component in non-convex optimization is represented by the starting guess of the parameters to optimize, which in the context of deep learning is identified by initialization of weights and biases. Whilst a proper initialization has been shown to be key to a successful training of deep train NNs, this is a very active area of research as initialization strategies are so far mostly based on heuristic arguments and experience.","title":"Initialization"},{"location":"lectures/06_nn/#zero-initialization","text":"First of all, let's highlight a bad choice of initialization that can compromise the training no matter the architecture of the network and other hyperparameters. A common choice in standard optimization in the absence of any strong prior information is to initialize all the parameters to zero: if we decide to follow such a strategy when training a NN, we will soon realize that training is stagnant due to the so called symmetry problem (also referred to as symmetric gradients ). 
Note that a similar situation arises also if we choose a constant values for weights and biases (e.g., \\(c^{[1]}\\) for all the weights and biases in the first layer and \\(c^{[2]}\\) for all the weights and biases in the second layer): Let's take a look at this with an example: Since the activations are constant vectors, back-propagation produces constant updates for the weights (and biases), leading to weights and biases to never lose the initial symmetry.","title":"Zero initialization"},{"location":"lectures/06_nn/#random-initialization","text":"A more appropriate way to initialize the weights of a neural network is to sample their values from random distributions, for example: $$ w_{ij}^{[.]} \\sim \\mathcal{N}(0, 0.01) $$ where the choice of the variance is based on the following trade-off: too small variance leads to the vanishing gradient problem (i.e., slow training), whilst too high variance leads to the exploding gradient problem (i.e., unstable training). On the other hand, for the biases we can use zero or a constant value. If you remember, we have already mentioned this when discussing the ReLU activation function: a good strategy to limit the amount of negative values as input to this activation function is to choose a small constant bias (e.g., \\(b=0.1\\) ). Whilst this approach provides a good starting point for stable training of neural networks, more advanced initialization strategies have been proposed in the literature: Uniform : the weights are initialized with uniform distributions whose variance depend on the number of units in the layer: $$ w_{ij}^{[k]} \\sim \\mathcal{U}(-1/\\sqrt{N^{[k]}}, 1/\\sqrt{N^{[k]}}) $$ or $$ w_{ij}^{[k]} \\sim \\mathcal{U}(-\\sqrt{6/(N^{[k-1]}+N^{[k]})}, \\sqrt{6/(N^{[k-1]}+N^{[k]})}) $$ This strategy is commonly used with FC layers. Xavier : the weights are initialized with normal distributions whose variance depend on the number of units in the layer: $$ w_{ij}^{[k]} \\sim \\mathcal{N}(0, 1/N^{[k]}) $$ This strategy ensures that the variance remains the same across the layers. Xavier initialization is very popular especially in layers using Tanh activations. He : the weights are initialized with normal distributions whose variance depend on the number of units in the layer: $$ w_{ij}^{[k]} \\sim \\mathcal{N}(0, 2/N^{[k]}) $$ This strategy ensures that the variance remains the same across the layers. He initialization is very popular especially in layers using ReLU activations.","title":"Random initialization"},{"location":"lectures/06_nn/#why-deep-learning-took-off-in-the-last-century","text":"Before moving onto the last topic of this lecture, a unified statistical view of loss functions in deep learning, let's try to answer a question that many of you may ask: what makes NNs so popular these days and why deep learning took off in the last decade? By now, we have made ourself familiar with the concept of neural networks, learned about its basic building block (the so-called perceptron) and how by simply horizontally and vertically stacking multiple perceptrons we can create universal function approximators that can be trained to learn very complex nonlinear relationships between inputs and targets (provided availability of a large enough amount of training data). 
The process of creating and training NNs relies on the following four key ingredients: linear algebra operations : matrix-vector and matrix-matrix multiplications (at least within the context of FC networks); activations : nonlinear functions that enable the learning of complex nonlinear mappings; loss functions : functions that can be used to evaluate the goodness of the model in terms of predicting targets from inputs; learning algorithms : optimization methods that can produce the best weights and biases using gradient information; Eventually, most of the underlying theory of NNs was already fairly mature in 70s and 80s; nevertheless, until the early 2000, research in the field of artificial neural networks was still considered a niche domain mostly theoretical and with little practical implications. So, what did lead to the renaissance of Deep Learning? Two key factors in the popularity and success of Neural Networks growth are undoubtedly: larger datasets : with the growth of the internet and social media, a digital revolution has started since the beginning of the new century, where datasets of ever increasing size can be easily sourced. This applies both to images and text as well as audio and video content. larger networks : with the emergence of new hardware technology such as GPUs, training large deep networks is nowadays possible, not only for large corporations like Google or Microsoft but also in Academia or for small- and medium-size enterprises that want to leverage their data to make data-driven business decisions. Alongside the data and hardware revolution, a number of important algorithmic discoveries have also led to faster, more robust training of NNs making this process easier and more accessible to domain scientists in a variety of scientific fields. Some of them have been already discussed, but we wish here to put more emphasis on them: MSE --> Cross-entropy : whilst in the past the mean square error (MSE) loss was used for pretty much every task, nowadays classification or semantic segmentation tasks are more commonly solved by means of Cross-entropy loss functions. This shift in training strategy is mostly due to the fact that the ML community and the statistical community got closer to each other in the last two decades, which lead to the development of strong statistical foundations in the theory of deep learning; Sigmoid --> ReLU : whilst continuous, differentiable activation functions used to be a must in the past mostly due to the belief that gradient descent algorithms (and back-propagation) needs these kind of functions to behave correctly, it is now clear that this constraint can be greatly related. Piece-wise linear activation functions like ReLU are nowadays not only used but pretty much the de-facto standard for hidden layers in deep neural networks. Jarrett et al. (2009) observed that \"using a rectifying nonlinearity is the single most important factor in improving the performance of a recognition system\" .","title":"Why Deep Learning took off in the last century"},{"location":"lectures/06_nn/#maximum-likelihood-estimators","text":"To conclude, we would like to revisit the loss functions already introduced in the context of linear and logistic regression models and introduce some other loss functions that are commonly employed to train neural networks. However, whilst so far we have chosen different loss functions for each task (regression vs. 
classification) without really providing a statistical motivation of such choices, in this section we will instead try to define a common framework based on the concept of Maximum Likelihood Estimations (MLE). Let's start by considering a set of samples drawn from the true (but unknown) distribution: \\[ \\mathbf{X} = \\{ \\mathbf{x}^{(1)}, \\mathbf{x}^{(2)}, ..., \\mathbf{x}^{(N_s)} \\} \\sim p_{data}(\\mathbf{X}) \\] Second, a parametric family of probability distribution is defined: \\[ p_{model}(\\mathbf{X}; \\theta) \\] This distribution maps any vector \\(\\mathbf{x}\\) to a real number and is generally referred to as the likelihood function. Its free parameters \\(\\theta\\) must be chosen such that this probability distribution is as close as possible to the true one. As an example, if we consider a multi-variate gaussian distribution with uncorrelated members, the free parameters become \\(\\theta = \\{ \\boldsymbol \\mu, \\sigma\\}\\) and the probability density function becomes: \\[ p_{model}(\\mathbf{x}; \\{ \\boldsymbol \\mu, \\sigma\\}) = \\frac{1}{\\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{||\\mathbf{x} - \\boldsymbol \\mu||_2^2}{2 \\sigma^2}} \\] We can now define the MLE as follows: \\[ \\theta_{ML} = \\underset{\\theta} {\\mathrm{argmax}} \\; p_{model}(\\mathbf{X}; \\theta) \\] Assuming now statistical independence between the samples \\(\\mathbf{x}^{(i)}\\) , the equation above can be rewritten as: \\[ \\begin{aligned} \\theta_{ML} &= \\underset{\\theta} {\\mathrm{argmax}} \\; \\prod_{i=1}^{N_s} p_{model}(\\mathbf{x}^{(i)}; \\theta) \\\\ &= \\underset{\\theta} {\\mathrm{argmax}} \\; \\sum_{i=1}^{N_s} log(p_{model}(\\mathbf{x}^{(i)}; \\theta)) \\\\ &\\approx \\underset{\\theta} {\\mathrm{argmax}} \\; E_{\\mathbf{x} \\sim p_{data}} [ log(p_{model}(\\mathbf{x}; \\theta))] \\\\ &= \\underset{\\theta} {\\mathrm{argmin}} \\; - E_{\\mathbf{x} \\sim p_{data}} [ log(p_{model}(\\mathbf{x}; \\theta))] \\end{aligned} \\] Simply put, maximizing the parametric probability density function is shown to be equivalent to minimizing the negative log likelihood of the same distribution. An optimization problem must be therefore solved to find the most suitable free parameters. Going back to the multi-variate gaussian example, let's assume we are interested to estimate the mean (whilst we keep the variance fixed): \\[ \\begin{aligned} \\boldsymbol \\mu_{ML} &= \\underset{\\boldsymbol \\mu} {\\mathrm{argmin}} \\; - \\sum_{i=1}^{N_s} log \\Big( \\frac{1}{\\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{||\\mathbf{x}^{(i)} - \\boldsymbol \\mu||_2^2}{2 \\sigma^2}} \\Big) \\\\ &= \\underset{\\boldsymbol \\mu} {\\mathrm{argmin}} \\; \\sum_{i=1}^{N_s} \\frac{||\\mathbf{x}^{(i)} - \\boldsymbol \\mu||_2^2}{2 \\sigma^2} \\end{aligned} \\] Computing the gradient and imposing it to be zero gives us the point estimate of \\(\\boldsymbol \\mu_{ML}\\) : \\[ \\frac{\\partial -\\sum_i log p}{\\partial \\boldsymbol \\mu} = 0 \\rightarrow \\sum_{i=1}^{N_s} (\\mathbf{x}^{(i)} - \\boldsymbol \\mu) = 0 \\rightarrow \\boldsymbol \\mu_{ML} = \\frac{1}{N_s} \\sum_{i=1}^{N_s} \\mathbf{x}^{(i)} \\] which is nothing more than the well-known sample mean of the distribution. In order to apply the same framework to learning problems like those arising in DL, the ML estimation is now extended to the case of conditional probabilities where we are given a set of training pairs \\((\\mathbf{x}, y)^{(i)}\\) : \\[ \\begin{aligned} \\theta_{ML} &= \\underset{\\theta} {\\mathrm{argmax}} \\; p_{model}(Y | \\mathbf{X}; \\theta) \\\\ &= ... 
\\\\ &= \\underset{\\theta} {\\mathrm{argmin}} \\; - E_{\\mathbf{x},y \\sim p_{data}} [ log(p_{model}(y|\\mathbf{x}; \\theta))] \\end{aligned} \\]","title":"Maximum likelihood estimators"},{"location":"lectures/06_nn/#regression","text":"","title":"Regression"},{"location":"lectures/06_nn/#linear-regression","text":"Let's first apply this framework to a simple linear regression problem. Here, under the assumption of gaussian noise, the likelihood can be written as a multi-variate gaussian distribution: \\[ y = \\tilde{\\mathbf{x}}^T \\boldsymbol \\theta + \\mathbf{n} \\sim \\mathcal{N}(\\hat{y} = \\tilde{\\mathbf{x}}^T \\boldsymbol \\theta, \\sigma) \\] Plugging this distribution into the negative log likelihood expression, we obtain: \\[ \\boldsymbol \\theta_{ML} = \\underset{\\boldsymbol \\theta} {\\mathrm{argmin}} \\; \\sum_{i=1}^{N_s} \\frac{||\\hat{y}^{(i)} - y^{(i)}||_2^2}{2\\sigma^2} = \\frac{N_s}{2\\sigma^2} MSE(\\textbf{y}_{train}, \\hat{\\textbf{y}}_{train})\\\\ \\] This cost function can be seen to be a rescaled version of the MSE function previously introduced as the loss function for the linear regression model. Note however, that this model is not only more rigorous from a statistical point of view but provides also a natural way to handle training samples with different confidence. By using sample-dependant scaling factors ( \\(\\sigma^{(i)}\\) ), different samples can be chosen to contribute more or less to the training process.","title":"Linear regression"},{"location":"lectures/06_nn/#multi-layer-perceptron-regression","text":"A very similar derivation can be performed for a neural network composed by one or more MLPs. Eventually we simply need to swap the previously linearly predicted output \\(\\hat{y}=\\tilde{\\mathbf{x}}^T \\boldsymbol \\theta\\) with a new output produced by the chosen nonlinear functional \\(\\hat{y}=f_\\theta(\\mathbf{x})\\) . In conclusion, we must remember that the MSE loss function, commonly used for regression tasks in ML and DL, is a MLE in disguise.","title":"Multi-layer perceptron regression"},{"location":"lectures/06_nn/#classification","text":"","title":"Classification"},{"location":"lectures/06_nn/#binary-classification","text":"In statistical learning, a Bernoulli distribution is commonly used for the task of binary (i.e., 2 label) classification: \\[ P(y)= \\phi y + (1-\\phi)(1-y) \\] where \\(y\\) is the outcome and \\(\\phi\\) is its probability that we wish to learn by means of a model (i.e., logistic regression or MLP). Moreover, as we wish to learn a probability this value must be bound between 0 and 1; this can be easily achieved by feeding the output of the model into a sigmoid function \\(\\sigma\\) : \\[ \\hat{y} = \\sigma (f_\\theta(\\mathbf{x})) \\] Put together: \\[ \\begin{aligned} \\boldsymbol \\theta_{ML} &= \\underset{\\boldsymbol \\theta} {\\mathrm{argmin}} \\; -\\sum_{i=1}^{N_s} log(p_{model}(y^{(i)}|\\mathbf{x}^{(i)}; \\theta) \\\\ &= -\\sum_{i=1}^{N_s} y^{(i)} log \\hat{y}^{(i)} + (1-y^{(i)}) log (1-\\hat{y}^{(i)}) \\end{aligned} \\] which is the same loss function that we have introduced and discussed in details in the context of logistic regression. 
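As a quick numerical sanity check (a hedged sketch with made-up tensors), the negative log-likelihood derived above is exactly what deep learning libraries compute as the binary cross-entropy loss:

```python
import torch

torch.manual_seed(0)
y = torch.randint(0, 2, (5,)).float()   # true labels
logits = torch.randn(5)                 # raw model outputs f_theta(x)
y_hat = torch.sigmoid(logits)           # predicted probabilities

# Negative log-likelihood of a Bernoulli model, averaged over samples
nll = -(y * torch.log(y_hat) + (1 - y) * torch.log(1 - y_hat)).mean()

# PyTorch's built-in binary cross-entropy gives the same number
bce = torch.nn.functional.binary_cross_entropy(y_hat, y)
print(torch.allclose(nll, bce))  # True
```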
Once again, we note how we have here simply defined a MLE for a classification task and obtained the well-know binary cross-entropy loss function.","title":"Binary classification"},{"location":"lectures/06_nn/#multi-label-classification","text":"An extension of binary classification, multi-label classification aims at producing an estimate of the most likely class within a set of \\(N_c\\) classes. The combination of a Bernoulli distribution and sigmoid activation used for the binary classifier is here replaced by a Multinoulli distribution and softmax activation, where the latter is defined as follows: \\[ \\hat{\\mathbf{y}} = \\sigma(\\mathbf{x}) =\\frac{e^\\mathbf{x}}{\\sum_{i=1}^{N_c} e^{x_i}} \\] A property of such activation function is that it takes as input a vector of numbers (sometimes called logits )) and produces as output a vector of probabilities (i.e., \\(y_i>0\\) and \\(\\sum_{i=1}^{N_c} y_i=1\\) ). Put together: \\[ \\begin{aligned} \\boldsymbol \\theta_{ML} &= \\underset{\\boldsymbol \\theta} {\\mathrm{argmin}} \\; -\\sum_{i=1}^{N_s} log(p_{model}(y^{(i)}|\\mathbf{x}^{(i)}; \\theta)) \\\\ &= -\\sum_{i=1}^{N_s} \\sum_{j=1}^{N_c} y_j^{(i)} log \\hat{y}_j^{(i)} \\end{aligned} \\] where the true labels \\(\\mathbf{y}^{(i)}\\) are one-hot encoded vectors (i.e., \\(y_{j=j_{true}}^{(i)}=1\\) and \\(y_{j \\neq j_{true}}^{(i)}=0\\) ). To conclude, let's try to get more insights into why ML estimators work so succesfully. In order to do so, we start by defining a measure of similarity between the two distributions of interest: empirical distribution of the data: \\(p_{data}(\\mathbf{X})\\) parametric model distribution: \\(p_{model}(\\mathbf{X}; \\theta)\\) This can be achieved by means of the previously introduced Kullback-Leibler divergence, which we can write as follows: \\[ D_{KL}(p_{data}||p_{model}) = E_{x \\sim p_{data}} [log p_{data}(\\mathbf{x}) - p_{model}(\\mathbf{x})] \\] Since we are interested to estimate the free-parameters \\(\\theta\\) such that the model distribution matches that of the data, an equivalent optimization problem can be written with the help of the KL divergence: \\[ \\begin{aligned} \\theta_{KL} &= \\underset{\\theta} {\\mathrm{argmin}} \\; D_{KL}(p_{data}||p_{model}) \\\\ &= \\underset{\\theta} {\\mathrm{argmin}} \\; - E_{\\mathbf{x} \\sim p_{data}} [ log(p_{model}(\\mathbf{x}; \\theta))] \\end{aligned} \\] where the data probability has been removed in the second term since it is independent of \\(\\theta\\) . We can conclude that \\(\\theta_{KL}=\\theta_{ML}\\) and therefore minimizing the KL divergence between the model and data distributions is the same as maximizing their cross-entropy (as done by the ML estimator).","title":"Multi-label classification"},{"location":"lectures/06_nn/#additional-readings","text":"If you are interested to learn more about network initialization, I recommend reading (and reproducing) the following blog posts: 1 and 2 .","title":"Additional readings"},{"location":"lectures/07_bestpractice/","text":"Best practices in the training of Machine Learning models This lecture is devoted to the training of Machine Learning models in general, with Neural Networks representing a subclass of the entire set of models commonly used to learn mappings between some features and targets. As we will see in the following, a number of best practices are in fact independent on the model used. Let's begin by re-stating here the overall aim of a ML model: a model is useful if it can perform well on new, previously unseen data. 
This property of a model is also generally referred to as generalization . In order to be able to assess the generalization capabilities of a model, the dataset available for training must be divided into 3 distinct sets: Training dataset: \\(\\{ \\mathbf{X}_{train}, \\mathbf{Y}_{train} \\}\\) , used to train the model (e.g., learn the free-parameters \\(\\boldsymbol \\theta\\) of a NN); Validation dataset: \\(\\{ \\mathbf{X}_{valid}, \\mathbf{Y}_{valid} \\}\\) , used to select the hyperparameters of the model; Testing dataset: \\(\\{ \\mathbf{X}_{test}, \\mathbf{Y}_{test} \\}\\) , used only once a model is finalized (trained and optimized) to produce an unbiased estimate of model performance. Note that a number of assumptions are usually made on the training samples, namely each sample is independent from the others, samples must come from the same distributions. The first assumption is however seldom satisfied as a different training samples are related to each other to some degree (this is for example the case when samples are taken from well logs at consecutive depth levels or from 2D slices of 3D seismic cubes). On the other hand, the second assumption must be satisfied for a successful training. For example, if well logs from the North Sea are used in the training data and well logs from Onshore US are used in the testing data, any estimate of model performance will be biased as the two sets are likely to belong to different distributions. Historically, the overall dataset is split into training/validation/testing data with the following proportions: 60%/20%/20%. This is the case for small datasets in the order of hundreds or thousands samples to be able to retain a large enough set to produce reliable statistics. In recent years, when training neural networks with large datasets (in the order of millions or more samples), the split is more commonly chosen as 98%/1%/1%. As the size of datasets in geoscientific applications is usually in between what we referred to as small and large datasets, choosing validation and training sets that are 5%/10% of the overall dataset is usually a good choice. A measure must be then chosen to evaluate the performance of such a model in terms of the estimation error after training. This can be for example the MSE for regression tasks, or cross-entropy for classification tasks. Three quantities are generally computed: Training error (or performance): overall error (or performance) computed over the training dataset; Validation error (or performance): overall error (or performance) computed over the validation dataset. Test/Generalization error (or performance): overall error (or performance) computed over the testing dataset. The first two are usually computed during training, whilst the latter once the model is trained. The goodness of machine learning model is usually evaluated based on the following two criteria: Bias : ability to produce a small training error. When the error is small, we say that we have a model with low bias. Vice versa, when the error is large, the model have high bias. In this case, the model is said to be underfitting the data; Variance : ability to produce comparable training and testing error. In other words, if we define gap to be the difference between the testing and training errors, this is also the ability to produce a small gap. When the gap is large, the model is said to be overfitting the data. 
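Going back to the three-way split discussed above, a minimal sketch using scikit-learn (the dataset and the roughly 80%/10%/10% proportions are made up for illustration); for classification tasks, passing stratify=y yields the class-aware division mentioned later in this lecture.

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Made-up dataset: 10000 samples, 20 features
X = np.random.randn(10000, 20)
y = np.random.randn(10000)

# First carve out 10% for testing, then 10% of the remainder for validation
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, test_size=0.1, random_state=0)

print(len(X_train), len(X_valid), len(X_test))  # 8100 900 1000
```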
Finally, no matter how large the training dataset and the model capacity are, the bias and variance are likely to be always present. So, an important question to ask ourselves when working on a ML project is 'how far can the performance of the model be pushed further?'. Or, in other words, can we expect further hyperparameter tuning to be successfully improving the model significantly or the model has reached a plateau in terms of learning capabilities. To answer the above question, we need to first consider two distinct scenarios: the network is designed to solve a task that a human can also solve (e.g., interpreting faults and horizons in seismic volumes) the network is performing a task that a human is not able to perform (e.g., denoising a seismic volume). In the former case, it is possible to define the so-called human-level performance (i.e., error that a human is likely to make on the task at hand). Experience in the field of deep learning has shown that the performance of a model (e.g., accuracy in classification tasks) depends on the development time of a ML model in such a way that a well-designed model will very quickly reach human-level performance, whilst a much more significant effort is required to obtain further improvements and approach the theoretical limit of the model. Such a limit, called the Bayesian optimal error, is the error of an oracle making predictions. In practical applications this may however be unknown, unless the training data has been generated in such a way that true exact labels are available. In a range of geoscientific applications, human-level performance may be replaced by the state-of-the-art algorithm that is commonly used to perform a certain task. For example, going back to the denoising example, this may be represented by FX prediction filters or SVD-based denoising. Such an algorithm can represent the human performance in the above figure and represent a baseline that we would like our model to outperform. At the same time, as no human labels are available, the use of synthetic data with true labels usually represent the only viable solution to creating a training dataset in this scenario. This time, the theoretical limit represented by the true labels can again guide us into how much effort we should put to improve the hyperparameters of the model. To conclude, let's consider two examples of ML classifiers. In the first, after some development time, our ML model performs as follows: human-level percentage error: 2% training percentage error: 10% validation percentage error: 12% In this scenario, we clearly observe a large bias and a much smaller variance. This bias, usually referred to as avoidable bias is what we should focus next in our ML development time. On the other hand, if we observe the following: human-level percentage error: 2% training percentage error: 4% validation percentage error: 20% This model shows a small bias and a large variance. Our ML development time should be therefore devoted to reduce the generalization gap. On the other hand, since the difference between human-level and training error is minimal, we refer to this error as unavoidable bias that may be very difficult to further reduce. In the following we discuss a number of strategies that are commonly used in the process of improving the model and reducing both bias and variance. 
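The reasoning in the two scenarios above boils down to two numbers, which can be worth computing explicitly when triaging a model (a trivial sketch using the error values quoted above):

```python
def diagnose(human_err, train_err, valid_err):
    """Split the total error into avoidable bias and variance (generalization gap)."""
    avoidable_bias = train_err - human_err
    variance = valid_err - train_err
    focus = "reduce bias" if avoidable_bias > variance else "reduce variance"
    return avoidable_bias, variance, focus

print(diagnose(0.02, 0.10, 0.12))  # avoidable bias ~0.08, variance ~0.02 -> 'reduce bias'
print(diagnose(0.02, 0.04, 0.20))  # avoidable bias ~0.02, variance ~0.16 -> 'reduce variance'
```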
Capacity A key aspect in the process of designing a ML model that can both fit the available data and generalize to new, unseen data is represented by the size, or more formally speaking the capacity of the model. Simply put, this is the number of free parameters \\(\\theta\\) that the model is allowed to optimize in the training process. Let's begin with an example of linear regression, where the model is defined as follows: \\[ y_i = f_{\\boldsymbol \\theta}(x_i) = \\sum_{j=0}^{N_f} x_i^j \\theta_j \\qquad \\forall i=1,2,...,N_s \\] where \\(N_f+1\\) is the capacity of the model. The simplest model that we can fit to the available data is straight line parametrized by \\(\\boldsymbol \\theta = {\\theta_1, \\theta_2}\\) . More complex model fit a polynomial function of order \\(N_f+1\\) . As shown in the figure below, a too simple model does lead to underfitting, whilst a too complex model leads to overfitting. The key is therefore to make a model that is as simple as possible but not too simple, something usually referred to as the Occam's razor principle in inverse problem theory. Similar principles apply to the design of a neural network. In this case however, the risk of overfitting is much higher due to ability of NNs to learn very complex nonlinear functions. The size of the training dataset plays a key role in choosing the capacity of the network: large networks with thousands (or millions) of free-parameters require large training dataset to avoid the arising of overfitting. Moreover, as shown in the figure below, training and testing errors are usually similar for small networks in the underfitting regimes and tend to reduce together as the network size increases. However, when the capacity of the network grows into the overfitting regime, the training error keeps decreasing whilst the testing error starts to increase. Note that the training error should always reduce when increasing the size of the network, therefore it cannot be used to choose the ideal network capacity. This shows the importance of holding some of the available data for validation purposes. i.e., hyperparameter optimization. Other hyperparameters Whilst the size of the network largely drives the ability of a model to learn as well as its tendency to overfit, unfortunately (or fortunately, depending of the point of view of the reader), when designing a ML model you will likely need to make decisions on a number of additional hyperparameters. Here we report some of the most important ones: activation function; optimizer, learning rate additional optimizer hyperparameters, batch size... As already mentioned, we need to devote a portion of the overall dataset that we will not be using to evaluate the final performance of the model for the task of hyperparameter tuning . This is indeed the role of the validation dataset (note that using this dataset at testing stage will result in underestimating the generalization error of the model because the model is partially optimized on this dataset). Finally, whilst not very commonly used in the context of deep learning because of the extremely high cost of training neural networks, a more powerful approach in optimizing hyper-parameter is the so-called cross-validations strategy. Similar to validation, a portion of the dataset is hold out from the training process and used for hyperparameter tuning. 
However, the portion selected for this task is not fixed, rather the entire training dataset is split into K groups, where each group is used once as the validation set and the remaining number of times as part of the training dataset. This approach, usually referred to a K-fold cross-validation . It is a great help when the training dataset is of limited size as it helps averaging out fluctuations on the validation error over multiple realizations. The obvious downside of this strategy is of course that the training process must be repeated N times. Note that other strategies can be used to split the dataset into training and validation. For example, in the context of classification, a class-aware division is usually recommended where inter-class proportions are the same between the validation and training datasets. Regularization Whilst both under- and overfitting commonly affect the development of a successful ML model, reducing variance without affecting bias is notoriously difficult. A strategy that is usually successful in achieving such a task is called Regularization. Regularization acts directly on the loss function by adding prior knowledge to the training process. By informing the network about something that we know (or wish the model to know), the network is less prone to just learn from the training data and improve its generalization capabilities. In the context of inverse problem theory, where our aim is to fit some observations \\(\\mathbf{y}\\) given knowledge of the underlying physical process ( \\(g\\) ), \\[ J = ||\\mathbf{y} - g(\\mathbf{x})||_2^2 + \\lambda R(\\mathbf{x}) \\] regularization can come in different flavours, more specifically: Tikhonov (or L2) regularization : \\(||\\mathbf{x}||_2^2\\) . Ensures that a model with small norm is favoured over other models equally fitting the data. This promotes simplicity (or parsimony) in the solution Generalized Tikhonov regularization : e.g., \\(||\\nabla \\mathbf{x}||_2^2\\) , where \\(\\nabla\\) is the laplacian operator. Ensures that a smooth model is favoured over a rough one by second derivative of the model (i.e., curvature). L1 regularization or sparsity : \\(||\\mathbf{x}||_p \\; p \\le 1\\) . Promotes solutions that are sparse (i.e., few non-zero elements) A similar approach can be adopted in the context of machine learning by augmenting the data-fitting loss function with a regularization term: \\[ J_\\theta = \\frac{1}{N_s} \\sum_{i=1}^{N_s} ||y^{(i)} - f_\\theta(\\mathbf{x}^{(i)})||_2^2 + \\lambda R(\\theta) \\] where: L2 regularization or weight decay : \\(||\\boldsymbol \\theta||_2^2= \\boldsymbol \\theta^T \\boldsymbol \\theta\\) . Ensures small Euclidean norm for the free-parameters; L1 regularization : \\(||\\boldsymbol \\theta||_1\\) . Ensures small L1 norm for the free-parameters. By favouring sparsity in the parameters of the model, this regularization can allow compressing the trained model by storing only the non-negative parameters and their indices. Note that in the statistical community, regression models with one of the two regularizations discussed above is called Ridge and Lasso regression models, respectively. 
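In a deep learning framework these penalties can either be added explicitly to the loss or, for the L2 case, delegated to the optimizer; the snippet below is a minimal PyTorch sketch (model, data and the value of lambda are made up):

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
x, y = torch.randn(64, 10), torch.randn(64, 1)
lam = 1e-3

mse = nn.functional.mse_loss(model(x), y)

# L2 (weight decay) and L1 penalties computed over the weights only (biases excluded)
l2 = sum((p ** 2).sum() for name, p in model.named_parameters() if "weight" in name)
l1 = sum(p.abs().sum() for name, p in model.named_parameters() if "weight" in name)

loss = mse + lam * l2   # Ridge-like penalty; swap l2 for l1 for a Lasso-like penalty
loss.backward()

# A common alternative for L2: the optimizer's weight_decay argument
# (note that it also decays the bias terms, unlike the explicit penalty above)
opt = torch.optim.SGD(model.parameters(), lr=1e-2, weight_decay=lam)
```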
Finally, in the special case of deep learning and training by back-propagation the regularization terms and their gradients can be simply expressed as: L2 regularization or weight decay : $$ \\frac{1}{2 N_s} \\sum_{l=1}^{L} ||\\mathbf{W}^{[l]}||_2^2 \\rightarrow \\mathbf{W}^{[l]} = \\mathbf{W}^{[l]} \\big[ 1 - \\alpha \\frac{\\lambda}{N_s} \\big] - \\alpha \\mathbf{dW}^{[l]} $$ where the term weight decay comes from the fact that the strength of the current weights is reduced by a factor of \\(\\alpha \\lambda / N_s\\) at every gradient step of the regularized loss function L1 regularization : $$ \\sum_{l=1}^{L} ||\\mathbf{W}^{[l]}||_2^2 \\rightarrow \\mathbf{W}^{[l]} = \\mathbf{W}^{[l]} - \\alpha \\mathbf{dW}^{[l]} - \\mathbf{W}^{[l]} $$ In both cases \\(L\\) represents the number of layers in the network. Note that it is common to apply regularization to the weights only (no bias terms). Dropout Dropout represents a special kind of regularization strategy to prevent overfitting, which is specific to Neural Network architectures. Contrarily to other regularization techniques that act directly on the objective function to optimize, dropout modifies the network in such a way that a portion of the neurons are randomly inactivated (i.e., set to zero) during the training process. A strategy to mitigate overfitting is in fact to reduce the size of the network (i.e., its degrees of freedom) used to match the training data; however, instead of applying such a drastic approach, dropout allows the overall network size to remain unchanged whilst making the network effectively smaller during training. By doing so, the network learns not to rely on any neuron (or set of neurons) in particular, which leads to better generalization in validation and testing phases. Considering a simple 3-layers NN, the nodes at each layer are inactivated with probability \\(p_{drop}\\) . In the figure below, \\(p_{drop}=0.3\\) for the input and first hidden layer, \\(p_{drop}=0.5\\) for the second hidden layer (and \\(p_{drop}=0\\) for the output layer): Mathematically speaking, if we consider a single node in the figure above, its output can be written as: \\[ z_2^{[1]} = \\frac{W_{21}^{[1]} x_1 + \\cancel{W_{22}^{[1]} x_2} + W_{32}^{[1]} x_3 + b_{2}^{[1]}}{1-p_{drop}} \\] where the second term is removed because \\(W_{22}^{[1]}\\) is set to zero. Moreover note that a denominator is added to compensate for the fact that the output is smaller than the one that we would obtain without dropout. In practice, a more convenient way to implement dropout is to act directly on the input vector at the i-th layer \\(\\mathbf{a}^{[i-1]}\\) instead of deactivating the weights. This approach is called inverted dropout and simply achieved by using a mask \\(\\mathbf{m}\\) where each element is randomly set to 0 or 1 based on \\(p_{drop}\\) : \\[ \\mathbf{a}^{[i-1]} \\rightarrow \\tilde{\\mathbf{a}}^{[i-1]} = \\mathbf{a}^{[i-1]} \\cdot \\mathbf{m} \\rightarrow \\mathbf{z}^{[i]} = (\\mathbf{W}^{[i]} \\tilde{\\mathbf{a}}^{[i-1]})/(1-p_{drop}) \\] Finally, at testing time, dropout is usually inactivated. Recent research in the area of uncertainty quantification (UQ) for deep learning has however suggested that by using dropout also at testing time, multiple equi-probable realizations of the output can be produced and statistics (e.g., mean, variance, marginal probability...) can be computed. This strategy will be implemented and compared to other strategies for UQ in in this lab . 
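A minimal NumPy sketch of the inverted dropout expression above (the activation vector and layer sizes are made up); in a framework like PyTorch one would simply insert torch.nn.Dropout(p_drop) layers, which are active in training mode, disabled by model.eval() , and can be switched back to training mode at inference time to generate the Monte Carlo dropout realizations mentioned above.

```python
import numpy as np

rng = np.random.default_rng(0)
p_drop = 0.3

a_prev = rng.normal(size=(8, 1))   # a^[i-1], activations entering layer i (made-up values)
W = rng.normal(size=(4, 8))        # W^[i]

# Inverted dropout: random binary mask, then rescale so the expected output is unchanged
m = (rng.random(size=a_prev.shape) > p_drop).astype(float)
a_tilde = a_prev * m
z = (W @ a_tilde) / (1 - p_drop)   # bias omitted, as in the expression above
```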
Data augumentation One of the key elements to a successful generalization (i.e., low variance) is represented by the availability of a large training dataset. When this is not the case and it is not feasible to acquire additional data, an alternative solution is represented by so-called data augmentation . Data augmentation simply refers to the set of procedures that can be employed to create new artificial data samples by manipulating or combining some of the original data samples. Whilst this is very data and problem dependant, in the following we will mention a number of strategies that are commonly used in computer vision when working with natural images. More specifically: cropping, flipping, rotating, shearing, averaging, color shifting are all strategies that can be used or adapted to geoscientific multi-dimensional datasets. Nevertheless, special attention may be required when implementing some of the above mentioned strategies. Just to give an example, stretching or squeezing time series data such as seismic traces does introduce a shift in the frequency content that may not be desirable. Similarly, applying color shifting to seismic data will lead to a signal whose average is not zero anymore. Alternatively, polarity reversal represents a better alternative that can be seen as a special type of color shifting when dealing with seismic data. Finally, we observe that although this does not, strictly speaking, fall within the realm of data augmentation, using basic physics principles to create synthetic datasets for training is another commonly employed strategy in geoscience when accessing high-quality labelled datasets is feasible from either a technical or intellectual property point of view. We will see example of a ML workflow that relies on synthetic data when dealing with microseismic event detection . Transfer learning An edge case of data augmentation is represented by transfer learning. Transfer learning is a procedure employed to circumvent the issue or scarce labelled data when similar datasets are available and have been previously used to train a neural network for a similar (or sometimes different task). Under these conditions, one may think to use the pre-trained network and use the available training data to fine-tune such a network for the task at hand. Once again, based on the dataset and problem, the entire pre-trained network may be used as starting point or just a portion of it (generally chosen to be the initial portion of the network where some of the final layers are removed, and referred to as backbone). To make things a bit more concrete, let's consider here an example. A NN model has been created to interpret faults in seismic data. Training has been performed using data from an area of the world where both seismic data and fault interpretations are abundant. When a new dataset from the same area becomes available, the pre-trained model can be used as-is or fine-tuned using a much smaller training dataset (i.e., requiring limited manual labelling of faults). A similar strategy could also be used if a new dataset from another area of the world is ready for fault interpretation. In this second case, however, a user needs to be aware that the model may not generalize well if key features in the data (e.g., frequency content) or interpretation (e.g., presence of reverse faults) are different from those in the original dataset used for training. 
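A hedged sketch of the first scenario (the architecture, checkpoint file name and learning rate are all hypothetical): the pre-trained weights are loaded and the entire network is fine-tuned on the new, smaller dataset with a small learning rate.

```python
import torch
import torch.nn as nn

# Hypothetical fault-interpretation network (architecture made up for the example)
model = nn.Sequential(
    nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
    nn.Conv2d(16, 16, 3, padding=1), nn.ReLU(),
    nn.Conv2d(16, 1, 1),
)

# Start from weights trained on the data-rich area (hypothetical checkpoint file)
model.load_state_dict(torch.load("faults_pretrained.pt"))

# Fine-tune all parameters on the new (small) dataset with a small learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
```

If only the last layers are to be adapted (the layer-freezing case described next), one would additionally set requires_grad = False on the backbone parameters before building the optimizer.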
A different case where transfer learning can be also used is when the output that we are interested is slightly different from the one the network was trained on but the input is the same (and therefore the low- and medium-level features learned by the network). In this case, one may freeze the first few layers (i.e., make those parameters non-learnable) and fine-tune only the last few layers of the network on the new task. As an example, let's consider again a network trained for fault interpretation. Imagine now that we are interest to estimate a seismic attribute such as the relative geological time and we have very limited access to labelled data. In this case the backbone of the network is likely to already contain useful features, and the problem arising from a lack of large training dataset is mitigated by limiting the number of free-parameters to learn to those of the last few layers. To conclude, let's visually summarize the strategies that we should keep in mind when interested to reduce bias or variance.","title":"Best practices in the training of Machine Learning models"},{"location":"lectures/07_bestpractice/#best-practices-in-the-training-of-machine-learning-models","text":"This lecture is devoted to the training of Machine Learning models in general, with Neural Networks representing a subclass of the entire set of models commonly used to learn mappings between some features and targets. As we will see in the following, a number of best practices are in fact independent on the model used. Let's begin by re-stating here the overall aim of a ML model: a model is useful if it can perform well on new, previously unseen data. This property of a model is also generally referred to as generalization . In order to be able to assess the generalization capabilities of a model, the dataset available for training must be divided into 3 distinct sets: Training dataset: \\(\\{ \\mathbf{X}_{train}, \\mathbf{Y}_{train} \\}\\) , used to train the model (e.g., learn the free-parameters \\(\\boldsymbol \\theta\\) of a NN); Validation dataset: \\(\\{ \\mathbf{X}_{valid}, \\mathbf{Y}_{valid} \\}\\) , used to select the hyperparameters of the model; Testing dataset: \\(\\{ \\mathbf{X}_{test}, \\mathbf{Y}_{test} \\}\\) , used only once a model is finalized (trained and optimized) to produce an unbiased estimate of model performance. Note that a number of assumptions are usually made on the training samples, namely each sample is independent from the others, samples must come from the same distributions. The first assumption is however seldom satisfied as a different training samples are related to each other to some degree (this is for example the case when samples are taken from well logs at consecutive depth levels or from 2D slices of 3D seismic cubes). On the other hand, the second assumption must be satisfied for a successful training. For example, if well logs from the North Sea are used in the training data and well logs from Onshore US are used in the testing data, any estimate of model performance will be biased as the two sets are likely to belong to different distributions. Historically, the overall dataset is split into training/validation/testing data with the following proportions: 60%/20%/20%. This is the case for small datasets in the order of hundreds or thousands samples to be able to retain a large enough set to produce reliable statistics. 
In recent years, when training neural networks with large datasets (in the order of millions or more samples), the split is more commonly chosen as 98%/1%/1%. As the size of datasets in geoscientific applications is usually in between what we referred to as small and large datasets, choosing validation and training sets that are 5%/10% of the overall dataset is usually a good choice. A measure must be then chosen to evaluate the performance of such a model in terms of the estimation error after training. This can be for example the MSE for regression tasks, or cross-entropy for classification tasks. Three quantities are generally computed: Training error (or performance): overall error (or performance) computed over the training dataset; Validation error (or performance): overall error (or performance) computed over the validation dataset. Test/Generalization error (or performance): overall error (or performance) computed over the testing dataset. The first two are usually computed during training, whilst the latter once the model is trained. The goodness of machine learning model is usually evaluated based on the following two criteria: Bias : ability to produce a small training error. When the error is small, we say that we have a model with low bias. Vice versa, when the error is large, the model have high bias. In this case, the model is said to be underfitting the data; Variance : ability to produce comparable training and testing error. In other words, if we define gap to be the difference between the testing and training errors, this is also the ability to produce a small gap. When the gap is large, the model is said to be overfitting the data. Finally, no matter how large the training dataset and the model capacity are, the bias and variance are likely to be always present. So, an important question to ask ourselves when working on a ML project is 'how far can the performance of the model be pushed further?'. Or, in other words, can we expect further hyperparameter tuning to be successfully improving the model significantly or the model has reached a plateau in terms of learning capabilities. To answer the above question, we need to first consider two distinct scenarios: the network is designed to solve a task that a human can also solve (e.g., interpreting faults and horizons in seismic volumes) the network is performing a task that a human is not able to perform (e.g., denoising a seismic volume). In the former case, it is possible to define the so-called human-level performance (i.e., error that a human is likely to make on the task at hand). Experience in the field of deep learning has shown that the performance of a model (e.g., accuracy in classification tasks) depends on the development time of a ML model in such a way that a well-designed model will very quickly reach human-level performance, whilst a much more significant effort is required to obtain further improvements and approach the theoretical limit of the model. Such a limit, called the Bayesian optimal error, is the error of an oracle making predictions. In practical applications this may however be unknown, unless the training data has been generated in such a way that true exact labels are available. In a range of geoscientific applications, human-level performance may be replaced by the state-of-the-art algorithm that is commonly used to perform a certain task. For example, going back to the denoising example, this may be represented by FX prediction filters or SVD-based denoising. 
Such an algorithm can represent the human performance in the above figure and represent a baseline that we would like our model to outperform. At the same time, as no human labels are available, the use of synthetic data with true labels usually represent the only viable solution to creating a training dataset in this scenario. This time, the theoretical limit represented by the true labels can again guide us into how much effort we should put to improve the hyperparameters of the model. To conclude, let's consider two examples of ML classifiers. In the first, after some development time, our ML model performs as follows: human-level percentage error: 2% training percentage error: 10% validation percentage error: 12% In this scenario, we clearly observe a large bias and a much smaller variance. This bias, usually referred to as avoidable bias is what we should focus next in our ML development time. On the other hand, if we observe the following: human-level percentage error: 2% training percentage error: 4% validation percentage error: 20% This model shows a small bias and a large variance. Our ML development time should be therefore devoted to reduce the generalization gap. On the other hand, since the difference between human-level and training error is minimal, we refer to this error as unavoidable bias that may be very difficult to further reduce. In the following we discuss a number of strategies that are commonly used in the process of improving the model and reducing both bias and variance.","title":"Best practices in the training of Machine Learning models"},{"location":"lectures/07_bestpractice/#capacity","text":"A key aspect in the process of designing a ML model that can both fit the available data and generalize to new, unseen data is represented by the size, or more formally speaking the capacity of the model. Simply put, this is the number of free parameters \\(\\theta\\) that the model is allowed to optimize in the training process. Let's begin with an example of linear regression, where the model is defined as follows: \\[ y_i = f_{\\boldsymbol \\theta}(x_i) = \\sum_{j=0}^{N_f} x_i^j \\theta_j \\qquad \\forall i=1,2,...,N_s \\] where \\(N_f+1\\) is the capacity of the model. The simplest model that we can fit to the available data is straight line parametrized by \\(\\boldsymbol \\theta = {\\theta_1, \\theta_2}\\) . More complex model fit a polynomial function of order \\(N_f+1\\) . As shown in the figure below, a too simple model does lead to underfitting, whilst a too complex model leads to overfitting. The key is therefore to make a model that is as simple as possible but not too simple, something usually referred to as the Occam's razor principle in inverse problem theory. Similar principles apply to the design of a neural network. In this case however, the risk of overfitting is much higher due to ability of NNs to learn very complex nonlinear functions. The size of the training dataset plays a key role in choosing the capacity of the network: large networks with thousands (or millions) of free-parameters require large training dataset to avoid the arising of overfitting. Moreover, as shown in the figure below, training and testing errors are usually similar for small networks in the underfitting regimes and tend to reduce together as the network size increases. However, when the capacity of the network grows into the overfitting regime, the training error keeps decreasing whilst the testing error starts to increase. 
Note that the training error should always reduce when increasing the size of the network, therefore it cannot be used to choose the ideal network capacity. This shows the importance of holding some of the available data for validation purposes. i.e., hyperparameter optimization.","title":"Capacity"},{"location":"lectures/07_bestpractice/#other-hyperparameters","text":"Whilst the size of the network largely drives the ability of a model to learn as well as its tendency to overfit, unfortunately (or fortunately, depending of the point of view of the reader), when designing a ML model you will likely need to make decisions on a number of additional hyperparameters. Here we report some of the most important ones: activation function; optimizer, learning rate additional optimizer hyperparameters, batch size... As already mentioned, we need to devote a portion of the overall dataset that we will not be using to evaluate the final performance of the model for the task of hyperparameter tuning . This is indeed the role of the validation dataset (note that using this dataset at testing stage will result in underestimating the generalization error of the model because the model is partially optimized on this dataset). Finally, whilst not very commonly used in the context of deep learning because of the extremely high cost of training neural networks, a more powerful approach in optimizing hyper-parameter is the so-called cross-validations strategy. Similar to validation, a portion of the dataset is hold out from the training process and used for hyperparameter tuning. However, the portion selected for this task is not fixed, rather the entire training dataset is split into K groups, where each group is used once as the validation set and the remaining number of times as part of the training dataset. This approach, usually referred to a K-fold cross-validation . It is a great help when the training dataset is of limited size as it helps averaging out fluctuations on the validation error over multiple realizations. The obvious downside of this strategy is of course that the training process must be repeated N times. Note that other strategies can be used to split the dataset into training and validation. For example, in the context of classification, a class-aware division is usually recommended where inter-class proportions are the same between the validation and training datasets.","title":"Other hyperparameters"},{"location":"lectures/07_bestpractice/#regularization","text":"Whilst both under- and overfitting commonly affect the development of a successful ML model, reducing variance without affecting bias is notoriously difficult. A strategy that is usually successful in achieving such a task is called Regularization. Regularization acts directly on the loss function by adding prior knowledge to the training process. By informing the network about something that we know (or wish the model to know), the network is less prone to just learn from the training data and improve its generalization capabilities. In the context of inverse problem theory, where our aim is to fit some observations \\(\\mathbf{y}\\) given knowledge of the underlying physical process ( \\(g\\) ), \\[ J = ||\\mathbf{y} - g(\\mathbf{x})||_2^2 + \\lambda R(\\mathbf{x}) \\] regularization can come in different flavours, more specifically: Tikhonov (or L2) regularization : \\(||\\mathbf{x}||_2^2\\) . Ensures that a model with small norm is favoured over other models equally fitting the data. 
This promotes simplicity (or parsimony) in the solution Generalized Tikhonov regularization : e.g., \\(||\\nabla \\mathbf{x}||_2^2\\) , where \\(\\nabla\\) is the laplacian operator. Ensures that a smooth model is favoured over a rough one by second derivative of the model (i.e., curvature). L1 regularization or sparsity : \\(||\\mathbf{x}||_p \\; p \\le 1\\) . Promotes solutions that are sparse (i.e., few non-zero elements) A similar approach can be adopted in the context of machine learning by augmenting the data-fitting loss function with a regularization term: \\[ J_\\theta = \\frac{1}{N_s} \\sum_{i=1}^{N_s} ||y^{(i)} - f_\\theta(\\mathbf{x}^{(i)})||_2^2 + \\lambda R(\\theta) \\] where: L2 regularization or weight decay : \\(||\\boldsymbol \\theta||_2^2= \\boldsymbol \\theta^T \\boldsymbol \\theta\\) . Ensures small Euclidean norm for the free-parameters; L1 regularization : \\(||\\boldsymbol \\theta||_1\\) . Ensures small L1 norm for the free-parameters. By favouring sparsity in the parameters of the model, this regularization can allow compressing the trained model by storing only the non-negative parameters and their indices. Note that in the statistical community, regression models with one of the two regularizations discussed above is called Ridge and Lasso regression models, respectively. Finally, in the special case of deep learning and training by back-propagation the regularization terms and their gradients can be simply expressed as: L2 regularization or weight decay : $$ \\frac{1}{2 N_s} \\sum_{l=1}^{L} ||\\mathbf{W}^{[l]}||_2^2 \\rightarrow \\mathbf{W}^{[l]} = \\mathbf{W}^{[l]} \\big[ 1 - \\alpha \\frac{\\lambda}{N_s} \\big] - \\alpha \\mathbf{dW}^{[l]} $$ where the term weight decay comes from the fact that the strength of the current weights is reduced by a factor of \\(\\alpha \\lambda / N_s\\) at every gradient step of the regularized loss function L1 regularization : $$ \\sum_{l=1}^{L} ||\\mathbf{W}^{[l]}||_2^2 \\rightarrow \\mathbf{W}^{[l]} = \\mathbf{W}^{[l]} - \\alpha \\mathbf{dW}^{[l]} - \\mathbf{W}^{[l]} $$ In both cases \\(L\\) represents the number of layers in the network. Note that it is common to apply regularization to the weights only (no bias terms).","title":"Regularization"},{"location":"lectures/07_bestpractice/#dropout","text":"Dropout represents a special kind of regularization strategy to prevent overfitting, which is specific to Neural Network architectures. Contrarily to other regularization techniques that act directly on the objective function to optimize, dropout modifies the network in such a way that a portion of the neurons are randomly inactivated (i.e., set to zero) during the training process. A strategy to mitigate overfitting is in fact to reduce the size of the network (i.e., its degrees of freedom) used to match the training data; however, instead of applying such a drastic approach, dropout allows the overall network size to remain unchanged whilst making the network effectively smaller during training. By doing so, the network learns not to rely on any neuron (or set of neurons) in particular, which leads to better generalization in validation and testing phases. Considering a simple 3-layers NN, the nodes at each layer are inactivated with probability \\(p_{drop}\\) . 
In the figure below, \\(p_{drop}=0.3\\) for the input and first hidden layer, \\(p_{drop}=0.5\\) for the second hidden layer (and \\(p_{drop}=0\\) for the output layer): Mathematically speaking, if we consider a single node in the figure above, its output can be written as: \\[ z_2^{[1]} = \\frac{W_{21}^{[1]} x_1 + \\cancel{W_{22}^{[1]} x_2} + W_{32}^{[1]} x_3 + b_{2}^{[1]}}{1-p_{drop}} \\] where the second term is removed because \\(W_{22}^{[1]}\\) is set to zero. Moreover note that a denominator is added to compensate for the fact that the output is smaller than the one that we would obtain without dropout. In practice, a more convenient way to implement dropout is to act directly on the input vector at the i-th layer \\(\\mathbf{a}^{[i-1]}\\) instead of deactivating the weights. This approach is called inverted dropout and simply achieved by using a mask \\(\\mathbf{m}\\) where each element is randomly set to 0 or 1 based on \\(p_{drop}\\) : \\[ \\mathbf{a}^{[i-1]} \\rightarrow \\tilde{\\mathbf{a}}^{[i-1]} = \\mathbf{a}^{[i-1]} \\cdot \\mathbf{m} \\rightarrow \\mathbf{z}^{[i]} = (\\mathbf{W}^{[i]} \\tilde{\\mathbf{a}}^{[i-1]})/(1-p_{drop}) \\] Finally, at testing time, dropout is usually inactivated. Recent research in the area of uncertainty quantification (UQ) for deep learning has however suggested that by using dropout also at testing time, multiple equi-probable realizations of the output can be produced and statistics (e.g., mean, variance, marginal probability...) can be computed. This strategy will be implemented and compared to other strategies for UQ in in this lab .","title":"Dropout"},{"location":"lectures/07_bestpractice/#data-augumentation","text":"One of the key elements to a successful generalization (i.e., low variance) is represented by the availability of a large training dataset. When this is not the case and it is not feasible to acquire additional data, an alternative solution is represented by so-called data augmentation . Data augmentation simply refers to the set of procedures that can be employed to create new artificial data samples by manipulating or combining some of the original data samples. Whilst this is very data and problem dependant, in the following we will mention a number of strategies that are commonly used in computer vision when working with natural images. More specifically: cropping, flipping, rotating, shearing, averaging, color shifting are all strategies that can be used or adapted to geoscientific multi-dimensional datasets. Nevertheless, special attention may be required when implementing some of the above mentioned strategies. Just to give an example, stretching or squeezing time series data such as seismic traces does introduce a shift in the frequency content that may not be desirable. Similarly, applying color shifting to seismic data will lead to a signal whose average is not zero anymore. Alternatively, polarity reversal represents a better alternative that can be seen as a special type of color shifting when dealing with seismic data. Finally, we observe that although this does not, strictly speaking, fall within the realm of data augmentation, using basic physics principles to create synthetic datasets for training is another commonly employed strategy in geoscience when accessing high-quality labelled datasets is feasible from either a technical or intellectual property point of view. 
We will see example of a ML workflow that relies on synthetic data when dealing with microseismic event detection .","title":"Data augumentation"},{"location":"lectures/07_bestpractice/#transfer-learning","text":"An edge case of data augmentation is represented by transfer learning. Transfer learning is a procedure employed to circumvent the issue or scarce labelled data when similar datasets are available and have been previously used to train a neural network for a similar (or sometimes different task). Under these conditions, one may think to use the pre-trained network and use the available training data to fine-tune such a network for the task at hand. Once again, based on the dataset and problem, the entire pre-trained network may be used as starting point or just a portion of it (generally chosen to be the initial portion of the network where some of the final layers are removed, and referred to as backbone). To make things a bit more concrete, let's consider here an example. A NN model has been created to interpret faults in seismic data. Training has been performed using data from an area of the world where both seismic data and fault interpretations are abundant. When a new dataset from the same area becomes available, the pre-trained model can be used as-is or fine-tuned using a much smaller training dataset (i.e., requiring limited manual labelling of faults). A similar strategy could also be used if a new dataset from another area of the world is ready for fault interpretation. In this second case, however, a user needs to be aware that the model may not generalize well if key features in the data (e.g., frequency content) or interpretation (e.g., presence of reverse faults) are different from those in the original dataset used for training. A different case where transfer learning can be also used is when the output that we are interested is slightly different from the one the network was trained on but the input is the same (and therefore the low- and medium-level features learned by the network). In this case, one may freeze the first few layers (i.e., make those parameters non-learnable) and fine-tune only the last few layers of the network on the new task. As an example, let's consider again a network trained for fault interpretation. Imagine now that we are interest to estimate a seismic attribute such as the relative geological time and we have very limited access to labelled data. In this case the backbone of the network is likely to already contain useful features, and the problem arising from a lack of large training dataset is mitigated by limiting the number of free-parameters to learn to those of the last few layers. To conclude, let's visually summarize the strategies that we should keep in mind when interested to reduce bias or variance.","title":"Transfer learning"},{"location":"lectures/08_gradopt1/","text":"More on gradient-based optimization Whilst stochastic gradient descent is easy to understand, and simple to implement algorithm (as discussed in this lecture ), it presents a number of shortcomings that prevent learning to be as fast and effective as we would like it to be. In this lecture, we will discuss some of the limitations of SGD and look at alternative optimization algorithms that have been developed in the last decade and are nowadays preferred to SGD in the process of training NNs. 
Limitations of SGD Ill-conditioning The shape, and more specifically the curvature, of the functional that we wish to minimize affects our ability to quickly and efficiently converge to one of its minima (ideally the global, likely one of the local). For nonlinear optimization problems, like those encountered in deep learning, this is mathematically represented by the Hessian matrix ( \\(\\mathbf{H}=\\frac{\\partial^2 f}{\\partial \\boldsymbol \\theta^2}\\) ). An Hessian matrix with large conditioning number (i.e., ratio of the largest and smallest eigenvalues) tends to affect convergence speed of first-order (gradient-based) methods. In classical optimization theory, second order methods such as the Gauss-Newton method are commonly employed to counteract this problem. However, as already mentioned in one of our previous lectures, such methods are not yet suitable for deep learning in that no mathematical foundations have been developed in conjunction with approximate gradients (i.e., mini-batch learning strategy). Another factor that is worth knowing about is related to the norm of the gradient \\(\\mathbf{g}^T\\mathbf{g}\\) through iterations. In theory, this norm should shrink through iterations to guarantee convergence. Nevertheless, successful training may still be obtained even if the norm does not shrink as long as the learning rate is kept small. Let's write the second-order Taylor expansion of the functional around the current parameter estimate \\(\\boldsymbol \\theta_0\\) : \\[ J(\\boldsymbol \\theta) \\approx J(\\boldsymbol \\theta_0) + (\\boldsymbol \\theta - \\boldsymbol \\theta_0)^T \\mathbf{g} + \\frac{1}{2} (\\boldsymbol \\theta - \\boldsymbol \\theta_0)^T \\mathbf{H} (\\boldsymbol \\theta - \\boldsymbol \\theta_0) \\] and evaluate it at the next gradient step \\(\\boldsymbol \\theta = \\boldsymbol \\theta_0 - \\alpha \\mathbf{g}\\) : \\[ J(\\boldsymbol \\theta_0 - \\alpha \\mathbf{g}) \\approx J(\\boldsymbol \\theta_0) - \\alpha \\mathbf{g}^T \\mathbf{g} + \\frac{1}{2} \\alpha^2 \\mathbf{g}^T \\mathbf{H} \\mathbf{g} \\] We can interpret this expression as follows: a gradient step of \\(- \\alpha \\mathbf{g}\\) adds the following contribution to the cost function, \\(-\\alpha \\mathbf{g}^T \\mathbf{g} + \\frac{1}{2} \\alpha^2 \\mathbf{g}^T \\mathbf{H} \\mathbf{g}\\) . When this contribution is positive (i.e., \\(\\frac{1}{2} \\alpha^2 \\mathbf{g}^T \\mathbf{H} \\mathbf{g} > \\alpha\\mathbf{g}^T \\mathbf{g}\\) ), the cost function grows instead of being reduced. Under the assumption that \\(\\mathbf{H}\\) is known, we could easily choose a step-size \\(\\alpha\\) that prevents this from happening. However, when the Hessian cannot be estimated, a conservative selection of the step-size is the only remedy to prevent the cost function from growing. A downside of such an approach is that the smaller the learning rate the slower the training process. Local minima Whilst the focus of the previous section has been in the neighbour of \\(\\boldsymbol \\theta_0\\) where the functional \\(J_{\\boldsymbol \\theta}\\) can be approximated by a convex function, the landscape of NN functionals is generally non-convex and populated with a multitude of local minima. The problem of converging to the global minimum without getting stuck in one of the local minima is a well-known problem for any non-convex optimization. 
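The step-size condition above can be made concrete with a small numerical sketch; the quadratic below, with a poorly conditioned Hessian, is purely illustrative and not taken from the lecture.

```python
import numpy as np

def predicted_change(g, H, alpha):
    """Second-order Taylor prediction of the change in J after a step -alpha*g."""
    return -alpha * g @ g + 0.5 * alpha**2 * g @ H @ g

H = np.diag([1.0, 100.0])   # poorly conditioned Hessian (condition number 100)
g = np.array([1.0, 1.0])

for alpha in [0.001, 0.01, 0.02, 0.1]:
    dJ = predicted_change(g, H, alpha)
    print(f"alpha={alpha:<6} predicted change in J = {dJ:+.4f}")
# only the largest step (alpha=0.1) makes the predicted change positive,
# i.e., the cost function is expected to grow instead of decrease
```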
An example in geophysics is represented by waveform inversion, and a large body of work has been carried out by the geophysical research community to identify objective functions that are more well-behaved (i.e., show a large basin of attraction around the global minimum). Nevertheless, getting stuck in local minima is much less of a problem when training neural networks. This can be justified by the fact that multiple models may perform equally well on both the training and testing data. To be more precise, this relates to the concept of model identifiability , where a model is defined identifiable if there exists a single set of parameters ( \\(\\boldsymbol \\theta_{gm}\\) ) that leads to optimal model performance. On the other hand, when multiple models \\(\\{ \\boldsymbol \\theta_{gm}, \\boldsymbol \\theta_{lm,1}, ..., \\boldsymbol \\theta_{lm,N}\\}\\) perform similarly, those models are said to be non-identifiable. Moreover, even when a single model performs best, a distinction must be made between training and testing performance. As far as training performance is concerned, this model must be that of the global minimum of the functional \\(\\boldsymbol \\theta_{gm}\\) . Nevertheless, the model that performs best on the testing data may be the one obtained from any of the local minima \\(\\boldsymbol \\theta_{lm,i}\\) as such a model may have better generalization capabilities than the one from the global minimum. Saddle points and other flat regions Recent research in the field of deep learning has however revealed that multi-dimensional landscapes associated with the training of deep neural networks may actually have much fewer local minima than we tend to believe, and the main cause of slow training is actually represented by saddle points (and flat regions in general). More specifically, it can be shown empirically that the ratio between saddle points and local minima is in the order of \\(e^n\\) where \\(n\\) is the number of dimensions of the model vector \\(\\boldsymbol \\theta\\) . The main problem associated with saddle points is similar to that of local minima: the associated gradient is \\(\\nabla J(\\boldsymbol \\theta) \\rightarrow 0\\) ; as a consequence, during training, when the trajectory of the model parameter vector approaches a saddle point, the learning process may experience a slow down. Cliffs Another potentially dangerous feature of NN landscapes is represented by steep regions where \\(\\nabla J(\\boldsymbol \\theta) \\rightarrow \\infty\\) . This may in fact lead to unstable behaviours during training as large jumps will arise in the trajectory of the model parameter vector. Heuristic approaches to mitigate this problem exist; one of them is the so-called gradient clipping strategy where: $$ \\nabla J(\\theta_i) = min(\\nabla J(\\theta_i), th) $$ where \\(th\\) is a user-defined threshold. This approach allows element-wise gradient clipping for those directions with an extremely large gradient whilst not forcing us to lower the overall learning rate. Exploding and vanishing gradients Two problems that we commonly encounter whilst training Neural Networks are the so-called exploding and vanishing gradient phenomena. Whilst we already mentioned two scenarios where either of these situations can occur, i.e., cliffs and saddle points, the shape of the functional that we wish to optimize is not the only reason for gradients to grow uncontrolled or stagnate. It is in fact the NN architecture itself that sometimes may give rise to such phenomena.
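In practice, cliffs and exploding gradients are often handled with the gradient-clipping strategy mentioned above; a minimal PyTorch sketch is given below (the model, data, and threshold are illustrative):

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
loss_fn = nn.MSELoss()
x, y = torch.randn(16, 10), torch.randn(16, 1)

optimizer.zero_grad()
loss_fn(model(x), y).backward()
# element-wise clipping: every gradient component is limited to [-th, th]
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)
optimizer.step()
```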
To provide some intuition, let's consider a matrix of weights \\(\\mathbf{W}\\) and apply it N times recursively to a certain input (where for simplicity we ignore the nonlinear activation functions): \\[ \\mathbf{y}=\\mathbf{W}^N\\mathbf{x} \\] If we assume \\(\\mathbf{W}\\) to be symmetric for simplicity and express it using its eigendecomposition \\[ \\mathbf{W}=\\mathbf{V} \\boldsymbol \\Sigma \\mathbf{V}^{-1} \\] the resulting output vector \\(\\mathbf{y}\\) can be equivalently written as: \\[ \\begin{aligned} \\mathbf{y} &= \\mathbf{V} \\boldsymbol \\Sigma \\mathbf{V}^{-1} \\mathbf{V} \\boldsymbol \\Sigma \\mathbf{V}^{-1} ... \\mathbf{V} \\boldsymbol \\Sigma \\mathbf{V}^{-1} \\mathbf{x} \\\\ &= \\mathbf{V} \\boldsymbol \\Sigma^N \\mathbf{V}^{-1} \\mathbf{x} \\end{aligned} \\] where we have used here the property of eigendecomposition, \\(\\mathbf{V}^{-1} \\mathbf{V} = \\mathbf{I}\\) . Note that since the matrix of eigenvalues is raised to the power of N, when N is large we will experience the following phenomena: \\(\\lambda_i > 1 \\rightarrow\\) exploding gradient; \\(\\lambda_i < 1 \\rightarrow\\) vanishing gradient. Note that the scenario discussed here does not manifest itself when training feed forward networks, whilst it is much more relevant in the context of recurrent neural networks as the same weights are repeatedly applied to the input as it flows through the computational graph. We defer a more extensive discussion of this phenomenon to this lecture . Strategies to improve SGD After looking at some of the problems that we should be aware of when training NNs (note that some of them can be easily overcome as we will see in the following, whilst others are outstanding and do not have a simple solution), let's look back at the SGD algorithm and consider a number of improvements that can lead to both faster and more stable training. We remember from our previous lecture that the optimization step of SGD is simply composed of two steps: compute the gradient of the cost function with respect to the free-parameters, obtained via back-propagation; apply a scaled step, dictated by the learning rate \\(\\alpha\\) . Cooling strategy The most basic version of SGD uses a constant learning rate. However, a learning rate that may be optimal at the start of training and lead to fast convergence towards one of the minima of the cost function, may lead to unstable behaviour at later iterations. A question arises: given a gradient telling us where to move in the NN functional landscape, can we do something smart with the learning rate to reach the minimum faster? A common approach, usually referred to as cooling strategy or learning rate scheduling , consists of not keeping the learning rate fixed through epochs. Instead, the learning rate is slowly reduced as epochs progress, allowing the trajectory of the free-parameters to not fluctuate too much as it progresses towards a valley. Many alternative approaches to LR scheduling exist. However, to be effective, they must respect the following conditions: \\[ \\sum_i \\alpha_i = \\infty, \\; \\sum_i \\alpha_i^2 < \\infty \\] or, in words, the learning rate should reduce slowly as iterations progress. One common approach uses a linearly decaying LR for the first \\(\\tau\\) iterations, followed by a constant LR: \\[ \\begin{aligned} &\\alpha_i = (1-\\beta) \\alpha_0 + \\beta \\alpha_\\tau \\qquad i<\\tau\\\\ &\\alpha_i = \\alpha_\\tau \\qquad i\\ge\\tau \\end{aligned} \\] where \\(\\beta=i/\\tau\\) .
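A minimal sketch of the linear-then-constant schedule above, wrapped as a PyTorch LambdaLR; the values of alpha0, alpha_tau, and tau are illustrative:

```python
import torch

alpha0, alpha_tau, tau = 1e-2, 1e-4, 1000  # illustrative values

def lr_factor(i):
    """Multiplicative factor with respect to alpha0 implementing the schedule above."""
    if i < tau:
        beta = i / tau
        return ((1 - beta) * alpha0 + beta * alpha_tau) / alpha0
    return alpha_tau / alpha0

params = [torch.nn.Parameter(torch.randn(3))]
optimizer = torch.optim.SGD(params, lr=alpha0)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_factor)

# in the training loop, call optimizer.step() followed by scheduler.step()
# once per iteration (or per epoch, depending on how tau is counted)
```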
As a rule of thumb, \\(\\tau \\approx 100 N_{epochs}, \\alpha_\\tau = \\alpha_0/100\\) , whilst the choice of \\(\\alpha_0\\) is problem dependent and chosen by monitoring the first few iterations. Alternative approaches can either apply a fixed decay (i.e., exponential) or choose to reduce the LR when the training (or validation) metric has not decreased for a number of epochs. Momentum Another commonly used strategy aimed at improving the convergence of SGD is called Momentum and dates back to the 60s and the seminal works of Polyak and Nesterov in the area of mathematical optimization. The idea of momentum is rather simple, yet very effective. It is based on the idea of using information not only from the current gradient but also from past gradients when making a step. More specifically, the step is based on an exponentially decaying moving average of the past gradients created during iterations. The motivation behind using multiple gradients is to use the knowledge about the landscape shape accumulated through time in the proximity of the current parameters to make a more informed decision on where to move. This can generally help dealing with poorly conditioned modelling matrices in linear optimization and poorly conditioned Hessian matrices in nonlinear optimization. Intuitively, momentum can be understood as some sort of medium resistance or inertia when moving down a valley which slows down the trajectory and keeps it close to the axes of the ellipses of the functional (or its linearization around the current position). This physical interpretation is actually used when defining SGD with momentum as a vector \\(\\mathbf{v}\\) (where v stands for velocity) is introduced: \\[ \\mathbf{v}_{i+1} = \\gamma \\mathbf{v}_i - \\mathbf{g}_{i+1} = \\gamma \\mathbf{v}_i - \\frac{\\alpha}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j \\] and the update becomes: \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} + \\mathbf{v}_{i+1} \\] where \\(\\gamma \\in [0, 1)\\) is the momentum term. If we write explicitly the first three iterates of the velocity vector: \\[ \\begin{aligned} &\\mathbf{v}_0 = - \\alpha \\mathbf{g}_0\\\\ &\\mathbf{v}_1 = \\gamma \\mathbf{v}_0 - \\alpha \\mathbf{g}_1 = - \\gamma \\alpha \\mathbf{g}_0 - \\alpha \\mathbf{g}_1 \\\\ &\\mathbf{v}_2 = \\gamma \\mathbf{v}_1 - \\alpha \\mathbf{g}_2 = - \\gamma^2 \\alpha \\mathbf{g}_0 - \\gamma \\alpha \\mathbf{g}_1 - \\alpha \\mathbf{g}_2 \\end{aligned} \\] we notice that the momentum tells us how quickly the contribution of the previous gradients should decay. With \\(\\gamma=0\\) we are back to the standard SGD algorithm, whilst with \\(\\gamma \\rightarrow 1\\) we take into account the entire history of gradients. More commonly used values of momentum are \\(\\gamma=0.5/0.9/0.99\\) which can also be combined with a warming strategy (i.e., start from 0.5 and increase through iterations all the way to 0.99). This is a similar strategy (even though in opposite direction) to the one we previously discussed for the learning rate, even though it is known to impact the learning process to a lesser extent. 
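The update above can be written explicitly in a few lines; the sketch below operates on plain tensors, and the learning rate and momentum values are illustrative (PyTorch's torch.optim.SGD with momentum=0.9, nesterov=True implements a closely related formulation):

```python
import torch

def sgd_momentum_step(params, grads, velocities, alpha=1e-2, gamma=0.9):
    """One SGD-with-momentum update: v <- gamma*v - alpha*g, theta <- theta + v."""
    for p, g, v in zip(params, grads, velocities):
        v.mul_(gamma).add_(g, alpha=-alpha)  # exponentially decaying sum of past gradients
        p.add_(v)                            # move along the velocity vector

# illustrative usage with a single parameter tensor
params = [torch.randn(5)]
grads = [torch.randn(5)]
velocities = [torch.zeros(5)]
sgd_momentum_step(params, grads, velocities)
```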
Based on what we wrote above for the first three iterates, we can easily conclude that: if \\(\\mathbf{g}_i \\approx \\mathbf{g}_{i-1} \\approx \\mathbf{g}_{i-2}\\) (where the sign \\(\\approx\\) is used here to indicate a vector with approximately the same direction), the gradients' sum constructively leading to higher momentum and therefore a faster trajectory if \\(\\mathbf{g}_i \\ne \\mathbf{g}_{i-1} \\ne \\mathbf{g}_{i-2}\\) (where the sign \\(\\ne\\) is used here to indicate a vector with different directions), the gradients' sum destructively leading to lower momentum and therefore a slower trajectory Finally, an even smarter approach would require us not only to accumulate past gradients but also to look ahead of time so that we could slow down the trajectory if the landscape is about to change curvature (i.e., slow up). This requires a slight modification of the momentum term, referred to as Nesterov momentum : \\[ \\mathbf{v}_{i+1} = \\gamma \\mathbf{v}_i - \\frac{\\alpha}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j(f_{\\theta+\\gamma \\mathbf{v}_i}(\\mathbf{x}_i), y_i) \\] where the main change here is represented by the fact that the loss function ( \\(\\mathscr{L}\\) ), and therefore, the gradient is evaluated at location \\(\\theta+\\gamma \\mathbf{v}_i\\) rather than at the current one. Here, \\(\\gamma \\mathbf{v}_i\\) represents a correction factor to the standard method of momentum. In classical optimization (i.e., for batched gradient descent), this small change provides an improvement in the rate of convergence from \\(\\mathcal{O}(1/i)\\) to \\(\\mathcal{O}(1/i^2)\\) . Note that this is however not always the case when using stochastic gradient descent. Adaptive learning rates Up until now, we have introduced some modifications to the standard SGD algorithm that globally change the scaling of the gradient (also referred to as learning rate). However, if we believe that directions of sensitivity of the functional should be axis aligned, different learning rates should be used for the different parameters we wish to optimize for. More specifically a small LR should be preferred for those directions associated with large eigenvalues of the local Hessian whilst a large LR should be used for the other directions that associated with small eigenvalues. The delta-bar-delta algorithm of Jacobs (1988) represents an early heuristic approach to automatically adapting learning rates of individual parameters. It is based on this simple rule: if \\(sign\\{g_{i+1}^j\\} = sign\\{g_{i}^j\\}\\) , increase LR if \\(sign\\{g_{i+1}^j\\} \\ne sign\\{g_{i}^j\\}\\) , decrease LR where \\(j\\) refers here to the j-th component of the gradient vector. However, in the last decade a large variety of optimizers have appeared in the literature mostly focusing on this particular aspect of training, i.e. parameter-dependent learning rate. We will go through some of the most popular ones that have revolutionized the way we train NNs nowadays. AdaGrad This optimizer scales the gradient vector by the inverse of the square root of the sum of all historical squared values of the gradient. 
\\[ \\begin{aligned} &\\mathbf{g}_{i+1} = \\frac{1}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j\\\\ &\\mathbf{r}_{i+1} = \\mathbf{r}_i + \\mathbf{g}_{i+1} \\cdot \\mathbf{g}_{i+1} \\\\ &\\Delta \\boldsymbol\\theta_{i+1} = -\\frac{\\alpha}{\\delta + \\sqrt{\\mathbf{r}_{i+1}}} \\cdot \\mathbf{g}_{i+1} \\\\ &\\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} + \\Delta \\boldsymbol\\theta_{i+1} \\end{aligned} \\] where the vector \\(\\mathbf{r}\\) contains a running sum of the element-wise square gradients (with \\(\\mathbf{r}_0=0\\) ), \\(\\cdot\\) and \\(\\sqrt{\\;}\\) represent the element-wise multiplication of two vectors and square root, respectively. Finally, \\(\\delta=10^{-6}\\) is used as stabilizer to avoid division by zero. If we look at the learning rate of AdaGrad, it is clear that this is parameter dependent and more importantly, it is a function of the norm of the past gradients. Therefore, parameters associated with large gradients will experience a rapid decrease in their associated LR, whilst parameters with small gradients will have an increase of the LR through iterations. The effect of such adaptive LR, is that the trajectory of the parameters will show greater progress over gently sloped directions of the landscape. Nevertheless, it has been reported in the literature that a main drawback of AdaGrad is that this effect is too strong, leading to a premature decrease of the LR in those directions with large gradients and therefore an overall slow learning process. RMSProp A modified version of AdaGrad particularly suited for nonconvex optimization where the gradient accumulation (i.e., \\(\\mathbf{r}\\) vector) is exponentially weighted on a moving window. The idea behind is that for NN training it may take a large number of gradient steps to converge to a satisfactory solution, and therefore it is important for the LR not to decrease too fast in the first few hundred steps. In mathematical terms, a single change is needed to the AdaGrad equations, namely: \\[ \\mathbf{r}_{i+1} = \\rho \\mathbf{r}_i + (1-\\rho)\\mathbf{g}_{i+1} \\cdot \\mathbf{g}_{i+1} \\\\ \\] where \\(\\rho\\) represents the decay rate in the accumulation of past gradients. RMSProp, which was proposed by Geoffrey Hinton during a Coursera class, is shown to be one of the best-in-class optimizers for NN training and it is widely adopted by the DL community. ADAM ADAM stands for Adaptive Moments and it is a variant of RMSProp that further includes Momentum. Nowadays, ADAM is by far the most popular optimizer in the training of deep NNs. Two key changes have been introduced in the ADAM algorithm when compared to RMSProp: Momentum is applied via an estimate of the first-order momentum plus an exponential decay and used in spite of pure gradients in the parameter update step; A bias correction is included to take into account initialization. 
The algorithm can be written as follows: \\[ \\begin{aligned} &\\mathbf{g}_{i+1} = \\frac{1}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j\\\\ &\\mathbf{v}_{i+1} = \\rho_1 \\mathbf{v}_i + (1-\\rho_1)\\mathbf{g}_{i+1} \\leftarrow velocity \\; term \\\\ &\\mathbf{r}_{i+1} = \\rho_2 \\mathbf{r}_i + (1-\\rho_2)\\mathbf{g}_{i+1} \\cdot \\mathbf{g}_{i+1} \\leftarrow scaling \\; term \\\\ &\\hat{\\mathbf{v}}_{i+1} = \\frac{\\mathbf{v}_{i+1}}{1-\\rho_1^{i+1}} \\leftarrow bias \\; correction \\\\ &\\hat{\\mathbf{r}}_{i+1} = \\frac{\\mathbf{r}_{i+1}}{1-\\rho_2^{i+1}} \\leftarrow bias \\; correction \\\\ &\\Delta \\boldsymbol\\theta_{i+1} = -\\frac{\\alpha}{\\delta + \\sqrt{\\hat{\\mathbf{r}}_{i+1}}} \\cdot \\hat{\\mathbf{v}}_{i+1}\\\\ &\\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} + \\Delta \\boldsymbol\\theta_{i+1} \\end{aligned} \\] where, once again, a number of hyperparameters are introduced. These are the stabilizer, \\(\\delta=10^{-6}\\) , and two decay rates ( \\(\\rho_1\\) and \\(\\rho_2\\) ). To conclude, whilst we have first introduced simpler optimizers and subsequently built complexity in terms of both momentum and parameter-dependent learning rates, there is no universal winner. Although both momentum and adaptive LR do clearly seem to be beneficial to the training of NNs, it is not always the case that ADAM provides the best results both in terms of robustness and convergence speed. It is therefore important to be aware of the different optimizers that are available in the DL arsenal and identify the best based on the task at hand. In other words, the choice of the optimizer can usually represent one of those hyperparameters that ML practitioners need to evaluate and select when developing a new ML pipeline. Other tricks In the following, we report a few other practical tricks that can be used when training NNs to further improve the learning capabilities of our optimizer (no matter what optimizer has been selected). Polyak Averaging When training a NN, the most common approach is to select the last iterate ( \\(\\boldsymbol\\theta_{N_{it}}\\) ), where \\(N_{it}\\) is the overall number of iterations, and use it at the inference stage. Nevertheless, given the highly nonconvex optimization problem that we are required to solve, it is logical to expect that perhaps the last estimate of model parameters is not the best. Let's for example imagine that towards the end of the training process we are approaching a (local or global) minimum. However, our trajectory is bouncing all around the valley: A simple approach to mitigate this effect is to average over the last \\(N\\) iterations: \\[ \\boldsymbol\\theta = \\frac{1}{N} \\sum_{i=0}^{N-1} \\boldsymbol\\theta_{N_{it}-i} \\] This averaging acts as a denoising process that takes away some of the fluctuations and makes the optimization process less sensitive to the last step. Batch Normalization This is a very recent advancement in the field of DL, from the seminal work of Ioffe and Szegedy (2015). It has been shown to be particularly beneficial to the training of very deep neural networks. Let's first take a look at what happens during the training process if we do not include batch normalization. As previously discussed, given the gradient \\(\\partial J / \\partial \\boldsymbol \\theta\\) , at every step of the optimization process all the parameters (weights and biases) in the different layers of a NN are simultaneously updated.
This goes against the \"theoretical assumption\" that the optimization process should update one parameter at a time (which is however too expensive and therefore unfeasible). A consequence of the fact that all free-parameters are updated together is that second-order effects are introduced or, in other words, the statistical distribution of the various parameters across the layers of the NN is modified. This is commonly referred to as internal covariate shift . Batch normalization provides a general way to reparametrize a NN, which reduces the need for coordination across many layers during an update (making the process of updating all parameters at the same time more stable). It is simply implemented by modifying the output of a layer (or all the layers) at training time as follows: \\[ \\mathbf{A}' = \\frac{\\mathbf{A} - \\boldsymbol \\mu}{\\boldsymbol \\sigma} \\] where a re-normalization process is applied to every row of the output matrix \\(\\mathbf{A}\\) and it is directly based on the local statistics (mean and standard deviation) of the output of the layer. The overall forward and backward passes remain unchanged with the simple difference that the network is now operating on the re-normalized output \\(\\mathbf{A}'\\) instead of the original one \\(\\mathbf{A}\\) . The implications of such an additional step of re-normalization are that now the activations are distributed as \\(\\mathcal{N}(0, 1)\\) throughout the entire training process. By doing so, the optimization algorithm is discouraged from proposing an update that simply acts constantly over the mean or the standard deviation of \\(\\mathbf{A}\\) . At testing time, the mean and standard deviation ( \\(\\boldsymbol \\mu\\) and \\(\\boldsymbol \\sigma\\) ) are usually fixed and taken from a running mean computed during training time. In practice, however, batch normalization includes an extra step where instead of forcing the mean and standard deviation of each layer to be fixed, these parameters are learned to make the units of the network more expressive. This is simply accomplished by defining the output \\(\\mathbf{A}''\\) as: \\[ \\mathbf{A}'' = \\gamma \\mathbf{A}' + \\beta \\] where \\(\\gamma\\) and \\(\\beta\\) are also learned alongside the weights and biases of the network. Finally, since the bias is now induced by \\(\\beta\\) , a common recommendation when using batch normalization is to avoid adding a learnable bias to the layer preceding the batch normalization. Supervised pre-training So far, we have talked about optimizing the free-parameters of a neural network starting from a random initialization of such parameters and using all the available data to get the best estimate of such parameters. We have also briefly mentioned transfer learning, a technique that takes a network pre-trained on a different set of data (and possibly a different task) and fine-tunes it on the task and data at hand, as a way to speed up the training process as well as get around the fact that sometimes we have access to only a small amount of labelled data. Another interesting technique that can be used to improve the learning capabilities of a NN is called pre-training or greedy training .
Two alternative approaches are generally taken: \\(\\boldsymbol \\theta_0\\) (selected at random) \\(\\rightarrow\\) Simple task: \\(\\tilde{\\boldsymbol \\theta}\\) \\(\\rightarrow\\) Hard task: \\(\\tilde{\\boldsymbol \\theta'}\\) \\(\\boldsymbol \\theta^1_0\\) (selected at random) \\(\\rightarrow\\) Simple network: \\(\\tilde{\\boldsymbol \\theta^1}, \\boldsymbol \\theta^2_0\\) \\(\\rightarrow\\) Complex network: \\(\\tilde{\\boldsymbol \\theta^1}, \\tilde{\\boldsymbol \\theta^2}\\) where in the latter case a common approach is to fix the hidden layers and discard the output layer after the first training process, add a number of extra layers to make the network deeper and continue training those layers alone. However, since N independent optimizations generally do not provide the overall optimal solution, a final fine-tuning step may be required. Additional readings A great resource containing references (and Pytorch implementations) of more than 20 optimizers. This may be a good starting point if interest to experiment with different optimizers in both classical optimization and training of NNs. Another great resource with step-by-step implementations of some popular optimizers and networks.","title":"More on gradient-based optimization"},{"location":"lectures/08_gradopt1/#more-on-gradient-based-optimization","text":"Whilst stochastic gradient descent is easy to understand, and simple to implement algorithm (as discussed in this lecture ), it presents a number of shortcomings that prevent learning to be as fast and effective as we would like it to be. In this lecture, we will discuss some of the limitations of SGD and look at alternative optimization algorithms that have been developed in the last decade and are nowadays preferred to SGD in the process of training NNs.","title":"More on gradient-based optimization"},{"location":"lectures/08_gradopt1/#limitations-of-sgd","text":"","title":"Limitations of SGD"},{"location":"lectures/08_gradopt1/#ill-conditioning","text":"The shape, and more specifically the curvature, of the functional that we wish to minimize affects our ability to quickly and efficiently converge to one of its minima (ideally the global, likely one of the local). For nonlinear optimization problems, like those encountered in deep learning, this is mathematically represented by the Hessian matrix ( \\(\\mathbf{H}=\\frac{\\partial^2 f}{\\partial \\boldsymbol \\theta^2}\\) ). An Hessian matrix with large conditioning number (i.e., ratio of the largest and smallest eigenvalues) tends to affect convergence speed of first-order (gradient-based) methods. In classical optimization theory, second order methods such as the Gauss-Newton method are commonly employed to counteract this problem. However, as already mentioned in one of our previous lectures, such methods are not yet suitable for deep learning in that no mathematical foundations have been developed in conjunction with approximate gradients (i.e., mini-batch learning strategy). Another factor that is worth knowing about is related to the norm of the gradient \\(\\mathbf{g}^T\\mathbf{g}\\) through iterations. In theory, this norm should shrink through iterations to guarantee convergence. Nevertheless, successful training may still be obtained even if the norm does not shrink as long as the learning rate is kept small. 
Let's write the second-order Taylor expansion of the functional around the current parameter estimate \\(\\boldsymbol \\theta_0\\) : \\[ J(\\boldsymbol \\theta) \\approx J(\\boldsymbol \\theta_0) + (\\boldsymbol \\theta - \\boldsymbol \\theta_0)^T \\mathbf{g} + \\frac{1}{2} (\\boldsymbol \\theta - \\boldsymbol \\theta_0)^T \\mathbf{H} (\\boldsymbol \\theta - \\boldsymbol \\theta_0) \\] and evaluate it at the next gradient step \\(\\boldsymbol \\theta = \\boldsymbol \\theta_0 - \\alpha \\mathbf{g}\\) : \\[ J(\\boldsymbol \\theta_0 - \\alpha \\mathbf{g}) \\approx J(\\boldsymbol \\theta_0) - \\alpha \\mathbf{g}^T \\mathbf{g} + \\frac{1}{2} \\alpha^2 \\mathbf{g}^T \\mathbf{H} \\mathbf{g} \\] We can interpret this expression as follows: a gradient step of \\(- \\alpha \\mathbf{g}\\) adds the following contribution to the cost function, \\(-\\alpha \\mathbf{g}^T \\mathbf{g} + \\frac{1}{2} \\alpha^2 \\mathbf{g}^T \\mathbf{H} \\mathbf{g}\\) . When this contribution is positive (i.e., \\(\\frac{1}{2} \\alpha^2 \\mathbf{g}^T \\mathbf{H} \\mathbf{g} > \\alpha\\mathbf{g}^T \\mathbf{g}\\) ), the cost function grows instead of being reduced. Under the assumption that \\(\\mathbf{H}\\) is known, we could easily choose a step-size \\(\\alpha\\) that prevents this from happening. However, when the Hessian cannot be estimated, a conservative selection of the step-size is the only remedy to prevent the cost function from growing. A downside of such an approach is that the smaller the learning rate the slower the training process.","title":"Ill-conditioning"},{"location":"lectures/08_gradopt1/#local-minima","text":"Whilst the focus of the previous section has been in the neighbour of \\(\\boldsymbol \\theta_0\\) where the functional \\(J_{\\boldsymbol \\theta}\\) can be approximated by a convex function, the landscape of NN functionals is generally non-convex and populated with a multitude of local minima. The problem of converging to the global minimum without getting stuck in one of the local minima is a well-known problem for any non-convex optimization. An example in geophysics is represented by waveform inversion and a large body of work has been carried out by the geophysical research community to identify objective functions that are more well-behaved (i.e., show a large basin of attraction around the global minimum). Nevertheless, getting stuck into local minima is much less of a problem when training neural networks. This can be justified by the fact that multiple models may perform equally well on both the training and testing data. To be more precise this relates to the concept of model identifiability , where a model is defined identifiable if there exist a single set of parameters ( \\(\\boldsymbol \\theta_{gm}\\) ) that lead to optimal model performance. On the other hand, when multiple models \\(\\{ \\boldsymbol \\theta_{gm}, \\boldsymbol \\theta_{lm,1}, ..., \\boldsymbol \\theta_{lm1,N}\\) perform similarly those models are said to be non-identifiable. Moreover, even when a single model performs best, a distinction must be made between training and testing performance. As far as training performance is concerned, this model must be that of the global minimum of the functional \\(\\boldsymbol \\theta_{gm}\\) . 
Nevertheless, the model that performs best on the testing data may be the one obtained from any of the local minima \\(\\boldsymbol \\theta_{lm,i}\\) as such a model be have better generalization capabilities than the one from the global minimum.","title":"Local minima"},{"location":"lectures/08_gradopt1/#saddle-points-and-other-flat-regions","text":"Recent research in the field of deep learning has however revealed that multi-dimensional landscapes associated to the training of deep neural networks may actually have much fewer local minima than we tend to believe, and the main hinder to slow training is actually represented by saddle points (and flat regions in general). More specifically, empirically it can be shown that the ratio between saddle points and local minima is in the order of \\(e^n\\) where \\(n\\) is the number of dimensions of the model vector \\(\\boldsymbol \\theta\\) . The main problem associated with saddle points is similar to that of local minima: the associated gradient is \\(J(\\boldsymbol \\theta) \\rightarrow 0\\) ; as a consequence, during training, when the trajectory of the model parameter vector approaches a saddle point, the learning process may experience a slow down.","title":"Saddle points and other flat regions"},{"location":"lectures/08_gradopt1/#cliffs","text":"Another potentially dangerous feature of NN landscapes is represented by steep regions where \\(J(\\boldsymbol \\theta) \\rightarrow \\infty\\) . This may in fact lead to unstable behaviours during training as large jumps will arise in the trajectory of the model parameter vector. Heuristic approaches to mitigate this problem exist, one of them is the so-called gradient clipping strategy where: $$ \\nabla J(\\theta_i) = min(\\nabla J(\\theta_i), th) $$ where \\(th\\) is a user-defined threshold. This approach allows element-wise gradient clipping for those directions with an extremely large gradient whilst not forcing us to lower the overall learning rate.","title":"Cliffs"},{"location":"lectures/08_gradopt1/#exploding-and-vanishing-gradients","text":"Two problems that we commonly encounter whilst training Neural Networks are the so-called exploding and vanishing gradient phenomena. Whilst we already mentioned two scenarios where either of these situations can occur, i.e., cliffs and saddle points, the shape of the functional that we wish to optimize is not the only reason for gradients to grow uncontrolled or stagnate. It is in fact the NN architecture itself that sometimes may give rise to such phenomena. To provide some intuition, let's consider a matrix of weights \\(\\mathbf{W}\\) and apply it N times recursively to a certain input (where for simplicity we ignore the nonlinear activation functions): \\[ \\mathbf{y}=\\mathbf{W}^N\\mathbf{x} \\] If we assume \\(\\mathbf{W}\\) to be symmetric for simplicity and express it using its eigendecomposition \\[ \\mathbf{W}=\\mathbf{V} \\boldsymbol \\Sigma \\mathbf{V}^{-1} \\] the resulting output vector \\(\\mathbf{y}\\) can be equivalently written as: \\[ \\begin{aligned} \\mathbf{y} &= \\mathbf{V} \\boldsymbol \\Sigma \\mathbf{V}^{-1} \\mathbf{V} \\boldsymbol \\Sigma \\mathbf{V}^{-1} ... \\mathbf{V} \\boldsymbol \\Sigma \\mathbf{V}^{-1} \\mathbf{x} \\\\ &= \\mathbf{V} \\boldsymbol \\Sigma^N \\mathbf{V}^{-1} \\mathbf{x} \\end{aligned} \\] where we have used here the property of eigendecomposition, \\(\\mathbf{V}^{-1} \\mathbf{V} = \\mathbf{I}\\) . 
Note that since the matrix of eigenvalues is raised to the power of N, when N is large we will experience the following phenomena: \\(\\lambda_i > 1 \\rightarrow\\) exploding gradient; \\(\\lambda_i < 1 \\rightarrow\\) vanishing gradient; Note that the scenario discussed here does not manifest itself when training feed forward networks, whilst it is much more relevant in the context of recurrent neural networks as the same weights are repeatedly applied to the input as it flows through the computational graph. We defer a more extensive discussion of this phenomenon to this lecture .","title":"Exploding and vanishing gradients"},{"location":"lectures/08_gradopt1/#stategies-to-improve-sgd","text":"After looking at some of the problems that we should be aware of when training NNs (note that some of them can be easily overcome as we will see in the following, whilst others are outstanding and do not have a simple solution), let's look back at the SGD algorithm and consider a number of improvements that can lead to both faster and more stable training. We remember from our previous lecture , that the optimization step of SGD is simply composed of two steps: compute the gradient of the cost function with respect to the free-parameters, obtained via back-propagation apply a scaled step, dictated by the learning rate \\(\\alpha\\) .","title":"Stategies to improve SGD"},{"location":"lectures/08_gradopt1/#cooling-strategy","text":"The most basic version of SGD uses a constant learning rate. However, a learning rate that may be optimal at the start of training and lead to fast convergence towards one of the minima of the cost function, may lead to unstable behaviour at later iterations. A question arises: given a gradient telling us where to move in the NN functional landscape, can we do something smart with the learning rate to reach the minimum faster. A common approach usually referred to as cooling strategy or learning rate scheduling , where the learning rate is not kept fixed through epochs. Instead, the learning rate is slowly reduced as epochs progress allowing the trajectory of the free-parameters to not fluctuate too much as it progresses towards a valley. Many alternative approaches to LR scheduling exist. However, to be effective, they must respect the following conditions: \\[ \\sum_i \\alpha_i = \\infty, \\; \\sum_i \\alpha_i^2 < \\infty' \\] or, in words, the learning rate should reduce slowly as iterations progress. One common approach uses a linearly decaying LR for the first \\(\\tau\\) iterations, followed by a constant LR: \\[ \\begin{aligned} &\\alpha_i = (1-\\beta) \\alpha_0 + \\beta \\alpha_\\tau \\qquad i<\\tau\\\\ &\\alpha_i = \\alpha_\\tau \\qquad i\\ge\\tau \\end{aligned} \\] where \\(\\beta=i/\\tau\\) . As a rule of thumb, \\(\\tau \\approx 100 N_{epochs}, \\alpha_\\tau = \\alpha_0/100\\) , whilst the choice of \\(\\alpha_0\\) is problem dependent and chosen by monitoring the first few iterations. Alternative approaches can either apply a fixed decay (i.e., exponential) or choose to reduce the LR when the training (or validation) metric has not decreased for a number of epochs.","title":"Cooling strategy"},{"location":"lectures/08_gradopt1/#momentum","text":"Another commonly used strategy aimed at improving the convergence of SGD is called Momentum and dates back to the 60s and the seminal works of Polyak and Nesterov in the area of mathematical optimization. The idea of momentum is rather simple, yet very effective. 
It is based on the idea of using information not only from the current gradient but also from past gradients when making a step. More specifically, the step is based on an exponentially decaying moving average of the past gradients created during iterations. The motivation behind using multiple gradients is to use the knowledge about the landscape shape accumulated through time in the proximity of the current parameters to make a more informed decision on where to move. This can generally help dealing with poorly conditioned modelling matrices in linear optimization and poorly conditioned Hessian matrices in nonlinear optimization. Intuitively, momentum can be understood as some sort of medium resistance or inertia when moving down a valley which slows down the trajectory and keeps it close to the axes of the ellipses of the functional (or its linearization around the current position). This physical interpretation is actually used when defining SGD with momentum as a vector \\(\\mathbf{v}\\) (where v stands for velocity) is introduced: \\[ \\mathbf{v}_{i+1} = \\gamma \\mathbf{v}_i - \\mathbf{g}_{i+1} = \\gamma \\mathbf{v}_i - \\frac{\\alpha}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j \\] and the update becomes: \\[ \\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} + \\mathbf{v}_{i+1} \\] where \\(\\gamma \\in [0, 1)\\) is the momentum term. If we write explicitly the first three iterates of the velocity vector: \\[ \\begin{aligned} &\\mathbf{v}_0 = - \\alpha \\mathbf{g}_0\\\\ &\\mathbf{v}_1 = \\gamma \\mathbf{v}_0 - \\alpha \\mathbf{g}_1 = - \\gamma \\alpha \\mathbf{g}_0 - \\alpha \\mathbf{g}_1 \\\\ &\\mathbf{v}_2 = \\gamma \\mathbf{v}_1 - \\alpha \\mathbf{g}_2 = - \\gamma^2 \\alpha \\mathbf{g}_0 - \\gamma \\alpha \\mathbf{g}_1 - \\alpha \\mathbf{g}_2 \\end{aligned} \\] we notice that the momentum tells us how quickly the contribution of the previous gradients should decay. With \\(\\gamma=0\\) we are back to the standard SGD algorithm, whilst with \\(\\gamma \\rightarrow 1\\) we take into account the entire history of gradients. More commonly used values of momentum are \\(\\gamma=0.5/0.9/0.99\\) which can also be combined with a warming strategy (i.e., start from 0.5 and increase through iterations all the way to 0.99). This is a similar strategy (even though in opposite direction) to the one we previously discussed for the learning rate, even though it is known to impact the learning process to a lesser extent. Based on what we wrote above for the first three iterates, we can easily conclude that: if \\(\\mathbf{g}_i \\approx \\mathbf{g}_{i-1} \\approx \\mathbf{g}_{i-2}\\) (where the sign \\(\\approx\\) is used here to indicate a vector with approximately the same direction), the gradients' sum constructively leading to higher momentum and therefore a faster trajectory if \\(\\mathbf{g}_i \\ne \\mathbf{g}_{i-1} \\ne \\mathbf{g}_{i-2}\\) (where the sign \\(\\ne\\) is used here to indicate a vector with different directions), the gradients' sum destructively leading to lower momentum and therefore a slower trajectory Finally, an even smarter approach would require us not only to accumulate past gradients but also to look ahead of time so that we could slow down the trajectory if the landscape is about to change curvature (i.e., slow up). 
This requires a slight modification of the momentum term, referred to as Nesterov momentum : \\[ \\mathbf{v}_{i+1} = \\gamma \\mathbf{v}_i - \\frac{\\alpha}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j(f_{\\theta+\\gamma \\mathbf{v}_i}(\\mathbf{x}_i), y_i) \\] where the main change here is represented by the fact that the loss function ( \\(\\mathscr{L}\\) ), and therefore, the gradient is evaluated at location \\(\\theta+\\gamma \\mathbf{v}_i\\) rather than at the current one. Here, \\(\\gamma \\mathbf{v}_i\\) represents a correction factor to the standard method of momentum. In classical optimization (i.e., for batched gradient descent), this small change provides an improvement in the rate of convergence from \\(\\mathcal{O}(1/i)\\) to \\(\\mathcal{O}(1/i^2)\\) . Note that this is however not always the case when using stochastic gradient descent.","title":"Momentum"},{"location":"lectures/08_gradopt1/#adaptive-learning-rates","text":"Up until now, we have introduced some modifications to the standard SGD algorithm that globally change the scaling of the gradient (also referred to as learning rate). However, if we believe that directions of sensitivity of the functional should be axis aligned, different learning rates should be used for the different parameters we wish to optimize for. More specifically a small LR should be preferred for those directions associated with large eigenvalues of the local Hessian whilst a large LR should be used for the other directions that associated with small eigenvalues. The delta-bar-delta algorithm of Jacobs (1988) represents an early heuristic approach to automatically adapting learning rates of individual parameters. It is based on this simple rule: if \\(sign\\{g_{i+1}^j\\} = sign\\{g_{i}^j\\}\\) , increase LR if \\(sign\\{g_{i+1}^j\\} \\ne sign\\{g_{i}^j\\}\\) , decrease LR where \\(j\\) refers here to the j-th component of the gradient vector. However, in the last decade a large variety of optimizers have appeared in the literature mostly focusing on this particular aspect of training, i.e. parameter-dependent learning rate. We will go through some of the most popular ones that have revolutionized the way we train NNs nowadays.","title":"Adaptive learning rates"},{"location":"lectures/08_gradopt1/#adagrad","text":"This optimizer scales the gradient vector by the inverse of the square root of the sum of all historical squared values of the gradient. \\[ \\begin{aligned} &\\mathbf{g}_{i+1} = \\frac{1}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j\\\\ &\\mathbf{r}_{i+1} = \\mathbf{r}_i + \\mathbf{g}_{i+1} \\cdot \\mathbf{g}_{i+1} \\\\ &\\Delta \\boldsymbol\\theta_{i+1} = -\\frac{\\alpha}{\\delta + \\sqrt{\\mathbf{r}_{i+1}}} \\cdot \\mathbf{g}_{i+1} \\\\ &\\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} + \\Delta \\boldsymbol\\theta_{i+1} \\end{aligned} \\] where the vector \\(\\mathbf{r}\\) contains a running sum of the element-wise square gradients (with \\(\\mathbf{r}_0=0\\) ), \\(\\cdot\\) and \\(\\sqrt{\\;}\\) represent the element-wise multiplication of two vectors and square root, respectively. Finally, \\(\\delta=10^{-6}\\) is used as stabilizer to avoid division by zero. If we look at the learning rate of AdaGrad, it is clear that this is parameter dependent and more importantly, it is a function of the norm of the past gradients. Therefore, parameters associated with large gradients will experience a rapid decrease in their associated LR, whilst parameters with small gradients will have an increase of the LR through iterations. 
The effect of such adaptive LR, is that the trajectory of the parameters will show greater progress over gently sloped directions of the landscape. Nevertheless, it has been reported in the literature that a main drawback of AdaGrad is that this effect is too strong, leading to a premature decrease of the LR in those directions with large gradients and therefore an overall slow learning process.","title":"AdaGrad"},{"location":"lectures/08_gradopt1/#rmsprop","text":"A modified version of AdaGrad particularly suited for nonconvex optimization where the gradient accumulation (i.e., \\(\\mathbf{r}\\) vector) is exponentially weighted on a moving window. The idea behind is that for NN training it may take a large number of gradient steps to converge to a satisfactory solution, and therefore it is important for the LR not to decrease too fast in the first few hundred steps. In mathematical terms, a single change is needed to the AdaGrad equations, namely: \\[ \\mathbf{r}_{i+1} = \\rho \\mathbf{r}_i + (1-\\rho)\\mathbf{g}_{i+1} \\cdot \\mathbf{g}_{i+1} \\\\ \\] where \\(\\rho\\) represents the decay rate in the accumulation of past gradients. RMSProp, which was proposed by Geoffrey Hinton during a Coursera class, is shown to be one of the best-in-class optimizers for NN training and it is widely adopted by the DL community.","title":"RMSProp"},{"location":"lectures/08_gradopt1/#adam","text":"ADAM stands for Adaptive Moments and it is a variant of RMSProp that further includes Momentum. Nowadays, ADAM is by far the most popular optimizer in the training of deep NNs. Two key changes have been introduced in the ADAM algorithm when compared to RMSProp: Momentum is applied via an estimate of the first-order momentum plus an exponential decay and used in spite of pure gradients in the parameter update step; A bias correction is included to take into account initialization. The algorithm can be written as follows: \\[ \\begin{aligned} &\\mathbf{g}_{i+1} = \\frac{1}{N_b} \\sum_{j=1}^{N_b} \\nabla \\mathscr{L}_j\\\\ &\\mathbf{v}_{i+1} = \\rho_1 \\mathbf{v}_i + (1-\\rho_1)\\mathbf{g}_{i+1} \\leftarrow velocity \\; term \\\\ &\\mathbf{r}_{i+1} = \\rho_2 \\mathbf{r}_i + (1-\\rho_2)\\mathbf{g}_{i+1} \\cdot \\mathbf{g}_{i+1} \\leftarrow scaling \\; term \\\\ &\\hat{\\mathbf{v}}_{i+1} = \\frac{\\mathbf{v}_{i+1}}{1-\\rho_1^{i+1}} \\leftarrow bias \\; correction \\\\ &\\hat{\\mathbf{r}}_{i+1} = \\frac{\\mathbf{r}_{i+1}}{1-\\rho_2^{i+1}} \\leftarrow bias \\; correction \\\\ &\\Delta \\boldsymbol\\theta_{i+1} = -\\frac{\\alpha}{\\delta + \\sqrt{\\hat{\\mathbf{r}}_{i+1}}} \\cdot \\hat{\\mathbf{v}}_{i+1}\\\\ &\\boldsymbol\\theta_{i+1} = \\boldsymbol\\theta_{i} + \\Delta \\boldsymbol\\theta_{i+1} \\end{aligned} \\] where, once again, a number of hyperparameters are introduced. These are the stabilizer, \\(\\delta=10^{-6}\\) , and two decay rates ( \\(\\rho_1\\) and \\(\\rho_2\\) ). To conclude, we have first introduced simpler optimizers and subsequently built complexity in terms of both momentum and parameter-dependent learning, there is no universal winner. Although both momentum and adaptive LR do clearly seem to be beneficial to the training on NNs, it is not always the case that ADAM provides the best results both in terms of robustness and convergence speed. It is therefore important to be aware of the different optimizers that are available in the DL arsenal and identify the best based on the task at end. 
In other words, the choice of the optimizer can usually represent one of those hyperparameters that ML practitioners need to evaluate and select when developing a new ML pipeline.","title":"ADAM"},{"location":"lectures/08_gradopt1/#other-tricks","text":"In the following, we report a few other practical tricks that can be used when training NNs to further improve the learning capabilities of our optimizer (no matter what optimizer has been selected).","title":"Other tricks"},{"location":"lectures/08_gradopt1/#polyak-averaging","text":"When training a NN, the most common approach is to select the last iterate ( \\(\\boldsymbol\\theta_{N_{it}}\\) ) where \\(N_{it}\\) is the overall number of iterations and use it at inference stage. Nevertheless, given the highly nonconvex optimization problem that we are required to solver, it is logical to expect that perhaps the last estimate of model parameters is not the best. Let's for example imagine that towards the end of the training process we are approaching a (local or global) minimum. However, our trajectory is bouncing all around the valley: A simple approach to mitigate this effect is to average over the last \\(N\\) iterations: \\[ \\boldsymbol\\theta = \\frac{1}{N} \\sum_{i=0}^{N-1} \\boldsymbol\\theta_{N_{it}-i} \\] This averaging acts as a denoising process that takes away some of the fluctuations and makes the optimization process less sensitive to the last step.","title":"Polyak Averaging"},{"location":"lectures/08_gradopt1/#batch-normalization","text":"This is a very recent advancement in the field of DL, from the seminal work of Ioffe and Szegedy (2015). It has been shown to be particularly beneficial to the training of very deep neural networks. Let's first take a look at what happens during the training process if we do not include batch normalization. As previously discussed, given the gradient \\(\\partial J / \\partial \\boldsymbol \\theta\\) , at every step of the optimization process all the parameters (weights and biases) in the different layers of a NN are simultaneously updated. This goes against the \"theoretical assumption\" that the optimization process should update one parameter at the time (which is however too expensive and therefore unfeasible). As a consequence of the fact that all free-parameters are updated together is that second order updates are introduced or, in other words, the statistical distribution of various parameters across the layers of the NN are modified. This is commonly referred to as internal covariate shift . Batch normalization use a general way to reparametrize every NN, which reduces the need for coordination across many layers during an update (making the process of updating all parameters at the same time more stable). It is simply implemented by modifying the output of a layer (or all the layers) at training time as follows: where a re-normalization process is applied to every row of the output matrix \\(\\mathbf{A}\\) and it is directly based on the local statistics (mean and standard deviation) of the output of the layer. The overall forward and backward passes remain unchanged with the simple difference that the network is now operating on the re-normalized output \\(\\mathbf{A}'\\) instead of the original one \\(\\mathbf{A}\\) . The implications of such an additional step of re-normalization are that now the activations are distributed as \\(\\mathcal{N}(0, 1)\\) throughout the entire training process. 
By doing so, the optimization algorithm is discouraged to propose an update that simply acts constantly over the mean or the standard deviation of \\(\\mathbf{A}\\) . At testing time, the mean and standard deviation ( \\(\\boldsymbol \\mu\\) and \\(\\boldsymbol \\sigma\\) ) are usually fixed and taken from a running mean computed during training time. In practice, however, batch normalization includes an extra step where instead of forcing the mean and standard deviation of each layer to be fixed, these parameters are learned to make the units of the network more expressive. This is simply accomplished by defining the output \\(\\mathbf{A}''\\) as: \\[ \\mathbf{A}'' = \\gamma \\mathbf{A}' + \\beta \\] where \\(\\gamma\\) and \\(\\beta\\) are also learned alongside the weights and biases of the network. Finally, since the bias is now induced by \\(\\beta\\) a common recommendation when using batch normalization is to avoid adding a learnable bias to the layer of the network.","title":"Batch Normalization"},{"location":"lectures/08_gradopt1/#supervised-pre-training","text":"So far, we have talked about optimizing the free-parameters of a neural network starting from a random initialization of such parameters and using all the available data to get the best estimate of such parameters. We have also briefly mentioned that transfer learning, a technique that uses a pre-trained network on a different set of data and possible different task and fine-tunes it on the task and data at hand, as a way to speed-up the training process as well as get around to the fact that sometimes we have access to a small amount of labelled data. Another interesting technique that can be used to ease the learning capabilities of a NN is called pre-training or greedy training . Two alternative approaches are generally taken: \\(\\boldsymbol \\theta_0\\) (selected at random) \\(\\rightarrow\\) Simple task: \\(\\tilde{\\boldsymbol \\theta}\\) \\(\\rightarrow\\) Hard task: \\(\\tilde{\\boldsymbol \\theta'}\\) \\(\\boldsymbol \\theta^1_0\\) (selected at random) \\(\\rightarrow\\) Simple network: \\(\\tilde{\\boldsymbol \\theta^1}, \\boldsymbol \\theta^2_0\\) \\(\\rightarrow\\) Complex network: \\(\\tilde{\\boldsymbol \\theta^1}, \\tilde{\\boldsymbol \\theta^2}\\) where in the latter case a common approach is to fix the hidden layers and discard the output layer after the first training process, add a number of extra layers to make the network deeper and continue training those layers alone. However, since N independent optimizations generally do not provide the overall optimal solution, a final fine-tuning step may be required.","title":"Supervised pre-training"},{"location":"lectures/08_gradopt1/#additional-readings","text":"A great resource containing references (and Pytorch implementations) of more than 20 optimizers. This may be a good starting point if interest to experiment with different optimizers in both classical optimization and training of NNs. Another great resource with step-by-step implementations of some popular optimizers and networks.","title":"Additional readings"},{"location":"lectures/09_mdn/","text":"Uncertainty Quantification in Neural Networks and Mixture Density Networks Before delving into more advanced NN building blocks (e.g., convolutional, recurrent), let's revisit the training process of feed forward NNs with a probabilistic standpoint. Anything that we present here can be later applied to any of the other NN architectures that will be discussed in this course. 
We have already discussed that every loss function commonly used in the training of NNs, both for regression and classification, can be justified using a statistical formulation, mostly in the context of maximum-likelihood estimators. Despite this intrinsic link with probabilistic modelling, NN predictions are however most of the time point estimates, meaning that we do not get an idea of the uncertainty associated with our prediction. First of all, it is important to remark that even in classification tasks, where the output of a softmax layer has the form of a probability (i.e., each term is bounded between 0 and 1, and their sum is equal to 1), these values should not be treated as an accurate description of the confidence level of our prediction. Second, when assessing the confidence of a NN prediction (or another ML model), two different types of uncertainties are generally identified: Epistemic uncertainty , also called model uncertainty: this uncertainty arises from a lack of training data in certain regions of the input domain. As we can expect our training data not to cover the entire input space, our trained network is likely to produce arbitrary output values for a large portion of the input values that the network has never seen before. We therefore want to be able to quantify the lack of accuracy due to missing training data. Aleatoric uncertainty : this uncertainty is associated with the fact that the input data may contain some intrinsic randomness. This is either represented by the fact that the function we try to approximate is multimodal (i.e., multiple possible outputs exist for a single input) or the recorded data is polluted by noise. As a result, the training data will include samples with very close input values and a large spread of output values. We wish to be able to get such an insight out of the network predictions. A number of more or less simple strategies can however be employed when training NNs with the goal of obtaining a quantitative measurement of how certain our estimate is: Dropout : this commonly used regularization strategy presented in one of our previous lectures can also be leveraged to produce an estimate of the uncertainty of our solution. This can be done by simply using dropout at inference time and feeding the network multiple times with the same input. Multiple realizations of a prediction are computed, where different portions of the neurons of the network are deactivated for the different realizations. An empirical distribution or parameter estimates (e.g., mean and standard deviation) over the outputs can finally be estimated. The reason behind the success of this strategy is that the network can easily learn to always predict the same (or very similar) output when it is well constrained by data, no matter if some of the neurons are deactivated at random. On the other hand, when the network is more unsure because of lack of data or contrasting data, different versions of the network are likely to produce different predictions. Ensembling : another popular strategy, although quite expensive, is to train N neural networks with different initializations and use them to produce multiple predictions. Similar to dropout, when the training data is available and of good quality, the different networks will make similar predictions as they will likely converge to minima of similar quality.
On the other hand, when the data is poor (or lacking), the weight initialization plays a much bigger role in the training and different networks are likely to behave differently. Distributional parameter estimation (DPE) : a different route is to change the parametrization of the output itself. More specifically, considering here for simplicity the case of regression, the network is asked to produce two outputs. The first is tasked to predict the mean of the output distribution whilst the second predicts the standard deviation. Whilst in the more traditional training of NNs the standard deviation is kept fixed for all training samples, here the network will be able to understand which portion of the input data is noisier and which is cleaner (as well as detect where input data is missing). The negative log-likelihood is chosen to be the loss function of the network: \\[ \\boldsymbol \\theta = \\underset{\\boldsymbol \\theta} {\\mathrm{argmin}} \\; \\sum_{i=1}^{N_s} \\frac{log \\hat{\\sigma}^{(i)2}}{2} + \\frac{(\\hat{y}^{(i)} - y^{(i)})^2}{2\\hat{\\sigma}^{(i)2}} \\\\ \\] with the main difference that not only the mean (here denoted as \\(\\hat{y}^{(i)}\\) ) but also the standard deviation ( \\(\\hat{\\sigma}^{(i)}\\) ) are produced by the network and are therefore functions of the free-parameters that we wish to optimize. Intuitively, the numerator of the second term encourages the mean prediction to be close to the observed data, while the denominator weighs this misfit by the predicted variance. The first term prevents the network from letting the variance grow to infinity (which would lead to minimizing the second term no matter the mean value prediction). Mixture density networks : a natural extension of the DPE method is represented by networks that try to predict more complex probability distributions by parametrizing them as a mixture of Gaussians. Mixture density networks (MDNs) Extending to the case above, the network output is now composed of \\(N_m\\) means, \\(N_m\\) standard deviations, and \\(N_m\\) weights (where \\(N_m\\) is the number of Gaussians in the mixture): and the probability of a single outcome \\(y\\) given a single input \\(\\mathbf{x}\\) can be written as follows: \\[ p(y|\\mathbf{x}) = \\sum_{i=1}^{N_m} \\pi_i p_i(y|\\mu_i,\\sigma_i) = \\sum_{i=1}^{N_m} \\pi_i \\frac{1}{\\sqrt{2 \\pi \\sigma_i^2}} e^{-\\frac{(y - \\mu_i)^2}{2 \\sigma_i^2}} \\] A few key points worth highlighting for this model: the last layer produces an output of size \\(3N_m\\) , where the last \\(N_m\\) values must sum to 1 as they represent the weights of the Gaussian mixture. They are therefore passed through a softmax activation function. the variances should always be positive; this can be simply obtained by adding an exponential activation function at the end of the network to the parameters that represent the variance. This turns unbounded values into values bounded between 0 and \\(+\\infty\\) . The loss function used for MDNs is once again the negative log-likelihood, which can be written for a single training sample as follows: \\[ \\begin{aligned} - log(p(y|\\mathbf{x})) &= - log(\\sum_{i=1}^{N_m} \\pi_i p_i(y|\\mu_i,\\sigma_i)) = \\\\ &= -log(\\sum_{i=1}^{N_m} e^{(log \\pi_i + log p_i)}) \\end{aligned} \\] where the second equation is introduced to avoid instability issues arising from applying the logarithm to the sum of exponential functions. Here we observe that a \\(log(\\sum e)\\) has to be computed; this can be stably done by using the LogSumExp (LSE) function.
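As a concrete illustration of the two points above and of the LogSumExp trick, here is a minimal sketch (with illustrative function and variable names) of how the MDN negative log-likelihood could be computed in PyTorch for a network that outputs mixture-weight logits, means, and log-standard-deviations:

```python
import math
import torch
import torch.nn.functional as F

def mdn_nll(pi_logits, mu, log_sigma, y):
    """Negative log-likelihood of a Gaussian mixture, computed stably with logsumexp.

    pi_logits, mu, log_sigma: (batch, N_m) raw outputs of the network
    y: targets of shape (batch, 1), broadcast against the N_m components
    """
    log_pi = F.log_softmax(pi_logits, dim=1)   # log of mixture weights (softmax -> they sum to 1)
    sigma = torch.exp(log_sigma)               # exponential keeps the standard deviations positive
    log_p = -0.5 * ((y - mu) / sigma) ** 2 - log_sigma - 0.5 * math.log(2 * math.pi)
    # -log p(y|x) = -logsumexp_i (log pi_i + log p_i), averaged over the batch
    return -torch.logsumexp(log_pi + log_p, dim=1).mean()
```

Here the softmax constraint on the weights and the exponential used to keep the standard deviations positive mirror exactly the two points listed above.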
In prediction mode, a certain input \\(\\mathbf{x}\\) is fed through the network and a prediction of the means, standard deviations, and weights is produced. These uniquely define the probability density function of the mixture of Gaussians that we have decided to use to parametrize the output of the network. To conclude, let's discuss a practical scenario where MDNs should be preferred to simple DPE networks. Imagine that we are tasked to predict the porosity values in the subsurface given knowledge of elastic parameters (e.g., acoustic and shear impedance). Depending on the rock type, the relation between porosity and acoustic impedance may change. More importantly, there may be a certain overlap in the acoustic impedance values of the different rock types. If this is the case, as shown in the figure below, the output is multi-modal; unless we are certain about the rock type (or jointly predict the rock type alongside the porosity value), we would like the network to inform us when we should be confident about our prediction and where two distinct values of porosity have been observed in the training data for the same (or similar) value of acoustic impedance. Of course, although this is a simple, single-dimensional example, similar conclusions apply when training a NN with multi-dimensional inputs. Additional readings To learn more about uncertainties in deep learning, read this webpage If you are interested in learning more about MDNs, this blog post provides an in-depth introduction of both the underlying theory and implementation details.","title":"Uncertainty Quantification in Neural Networks and Mixture Density Networks"},{"location":"lectures/09_mdn/#uncertainty-quantification-in-neural-networks-and-mixture-density-networks","text":"Before delving into more advanced NN building blocks (e.g., convolutional, recurrent), let's revisit the training process of feed forward NNs from a probabilistic standpoint. Anything that we present here can be later applied to any of the other NN architectures that will be discussed in this course. We have already discussed that every loss function commonly used in the training of NNs, both for regression and classification, can be justified using a statistical formulation, mostly in the context of maximum-likelihood estimators. Despite this intrinsic link with probabilistic modelling, NN predictions are however most of the time point estimates, meaning that we do not get an idea of the uncertainty associated with our prediction. First of all, it is important to remark that even in classification tasks, where the output of a softmax layer has the form of a probability (i.e., each term is bounded between 0 and 1, and their sum is equal to 1), these values should not be treated as an accurate description of the confidence level of our prediction. Second, when assessing the confidence of a NN prediction (or another ML model), two different types of uncertainties are generally identified: Epistemic uncertainty , also called model uncertainty: this uncertainty arises from a lack of training data in certain regions of the input domain. As we can expect our training data not to cover the entire input space, our trained network is likely to produce arbitrary output values for a large portion of the input values that the network has never seen before. We therefore want to be able to quantify the lack of accuracy due to missing training data. Aleatoric uncertainty : this uncertainty is associated with the fact that the input data may contain some intrinsic randomness.
This is either represented by the fact that the function we try to approximate is multimodal (i.e., multiple possible outputs exist for a single input) or the recorded data is polluted by noise. As a result, the training data will include samples with very close input values and a large spread of output values. We wish to be able to get such an insight out of the network predictions. A number of more or less simple strategies can however be employed when training NNs with the goal of obtaining a quantitative measurement of how certain our estimate is: Dropout : this commonly used regularization strategy presented in one of our previous lectures can be also leveraged to produce an estimate of the uncertainty of our solution. This can be done by simply using dropout at the inference time and feeding the network multiple times with the same input. Multiple realizations of a prediction are computed, where different portions of the neurons of the network are deactivated for the different realizations. An empirical distribution or parameter estimates (e.g., mean and standard deviation) over the outputs can be finally estimated. The reason behind the success of this strategy is that the network can easily learn to always predict the same (or very similar) output when it is well constrained by data no matter if some of the neurons are deactivated at random. On the other hand, when the network is more unsure because of lack of data or contrasting data, different versions of the network are likely to produce different predictions. Ensembling : another popular strategy, although quite expensive, is to train N neural networks with different initializations and use them to produce multiple predictions. Similar to dropout, when the training data is available and of good quality, the different networks will make similar predictions as they will likely converge to minima of similar quality. On the other hand, when the data is poor (or lacking), the weight initialization plays a much bigger role in the training and different network are likely to behave differently. Distributional parameter estimation (DPE) : a different route is to change the parametrization of the output itself. More specifically, considering here for simplicity the case of regression, the network is asked to produce two outputs. The first is tasked to predict the mean of the output distribution whilst the second predicts the standard deviation. Whilst in the more traditional training of NNs the standard deviation is kept fixed for all training samples, here the network will be able to understand which portion of the input data is noisier and which is cleaner (as well as detect where input data is missing). The negative log-likelihood is chosen to be the loss function of the network: \\[ \\boldsymbol \\theta = \\underset{\\boldsymbol \\theta} {\\mathrm{argmin}} \\; \\sum_{i=1}^{N_s} \\frac{log \\hat{\\sigma}^{(i)2}}{2} + \\frac{(\\hat{y}^{(i)} - y^{(i)})^2}{2\\hat{\\sigma}^{(i)2}} \\\\ \\] with the main difference that not only the mean (here denoted as \\(\\hat{y}^{(i)}\\) ) but also the standard deviation ( \\(\\hat{\\sigma}^{(i)}\\) ) are produced by the network and therefore function of the free-parameters that we wish to optimize. Intuitively, the numerator of the second term encourages the mean prediction to be close to the observed data, while the denominator makes sure the variance penalizes the poor predictions. 
The first term avoids the network making the variance grow to infinity (which would lead to minimizing the second term no matter the mean value prediction). Mixture density networks : a natural extension of the DPE method is is represented by networks that try to predict more complex probability distributions by parametrizing them as a mixture of gaussians.","title":"Uncertainty Quantification in Neural Networks and Mixture Density Networks"},{"location":"lectures/09_mdn/#mixture-density-networks-mdns","text":"Extending to the case above, the network output is now composed of \\(N_m\\) means, \\(N_m\\) standard deviations, and \\(N_m\\) weights (where \\(N_m\\) is the number of gaussians in the mixture): and the probability of a single outcome \\(y\\) given a single input \\(\\mathbf{x}\\) can be written as follows: \\[ p(y|\\mathbf{x}) = \\sum_{i=1}^{N_m} \\pi_i p_i(y|\\mu_i,\\sigma_i) = \\sum_{i=1}^{N_m} \\pi_i \\frac{1}{\\sqrt{2 \\pi \\sigma_i^2}} e^{-\\frac{(y - \\mu_i)^2}{2 \\sigma_i^2}} \\] A few key points worth highlighting for this model: the last layer produces an output of size \\(3N_m\\) , where the last \\(N_m\\) values must sum to 1 as they represent the weights of the gaussian mixture. They are therefore passed through a softmax activation function. the variances should always be positive, this can be simply obtained by adding an exponential activation function at the end of the network to the parameters that represent the variance. This turns unbounded values into values bounded between 0 and \\(+\\infty\\) . The loss function used for MDNs is once again the negative log-likelihood, which can be written for a single training sample as follows: \\[ \\begin{aligned} - log(p(y|\\mathbf{x})) &= - log(\\sum_{i=1}^{N_m} \\pi_i p_i(y|\\mu_i,\\sigma_i)) = \\\\ &= -log(\\sum_{i=1}^{N_m} e^{(log \\pi_i + log p_i)}) \\end{aligned} \\] where the second equation is introduced to avoid instability issues arising by applying the logarithm to the sum of exponential functions. Here we observe that a \\(log(\\sum e)\\) has to be computed; this can be stably done by using the LogSumExp (LSE) function. In prediction mode, a certain input \\(\\mathbf{x}\\) is feed through the network and a prediction of both the means, standard deviations and weights is produced. This uniquely define the probability function of the mixture of gaussian that we have decided to use to parametrize the output of the network. To conclude, let's discuss a practical scenario where MDNs should be preferred to simple DPE networks. Imagine that we are tasked to predict the porosity values in the subsurface given knowledge of elastic parameters (e.g., acoustic and shear impedance). Depending on the rock type, the relation between porosity and acoustic impedance may change. More importantly, there may be a certain overlap in the acoustic impedance values of the different rock types. If this is the case, as shown in the figure below, the output is multi-modal; unless we are certain about the rock type (or jointly predict the rock type alongside the porosity value), we would like the network to inform us when we should be confident about our prediction and where two distinct values of porosity have been observed in the training data for the same (or similar) value of acoustic impedance. 
Of course, although this is a simple, single-dimensional example, similar conclusions apply when training a NN with multi-dimensional inputs.","title":"Mixture density networks (MDNs)"},{"location":"lectures/09_mdn/#additional-readings","text":"To learn more about uncertainties in deep learning, read this webpage If you are interested in learning more about MDNs, this blog post provides an in-depth introduction of both the underlying theory and implementation details.","title":"Additional readings"},{"location":"lectures/10_cnn/","text":"Convolutional Neural Networks Convolutional Neural Networks are one of the most powerful types of neural network, very popular and successful in image processing (and more broadly computer vision). They are based on a simple mathematical operation that we, geoscientists, know very well and use in a variety of tasks: the convolution operator. This is motivated in most scenarios where local dependencies in the input data are known to be predominant. Imagine for example a geological model, or a core section. If we decide to apply Deep Learning to such data to either classify rock types, estimate rock parameters, or even for generative modelling tasks, the first thing that we would like our NN to know is that nearby geological features are likely to be correlated, whilst the further apart we move the more the features become independent from each other. By looking at the schematic diagrams below, an FCN would not take this prior information into account as each input value is linearly combined to give rise to the output. On the other hand, a convolutional block, which represents the key component of a CNN, will only use values of the input vector in a certain neighbourhood to obtain the output: The example mentioned above is just one of many in geoscience where convolution-based networks have lately been shown to be very successful. Other examples are: Seismic interpretation (faults, horizons, bodies) Seismic processing (denoising, interpolation, deblurring) Satellite imagery (denoising, segmentation) Microseismicity (detection, source mechanism) Laboratory studies (CT, SEM, Microscopy for various processing and interpretation tasks) In general, any data type that is represented regularly on a 1D, 2D, or ND gridded topology is fit for CNNs. Convolution First of all, let's briefly recall what a convolution is. This represents in fact the core operation performed by a convolutional layer. A convolution between two signals can be mathematically written as \\[ y(t) = \\int x(\\tau) h(t-\\tau) d\\tau \\leftrightarrow y = x * h \\] where \\(x(t)\\) and \\(y(t)\\) are the input and output, respectively, and \\(h(t)\\) is the filter (also called kernel in the DL jargon). This equation can be interpreted as follows: take the filter and flip it across the origin, then slide it along the time axis and multiply-and-sum it with the input signal. In practice, when working with digital data in a computer, all signals are discrete and the continuous formula above can be rewritten as follows: \\[ y_i = \\sum_{j=-\\infty}^{\\infty} x_j h_{i-j} \\] where, to be general, we have here extended the summation from \\(-\\infty\\) to \\(\\infty\\) . In most applications, the filter \\(h\\) is however compact (it has a small size of N samples, also called kernel size ) and therefore we can limit the summation to the window of samples where the filter is non-zero. A similar (but still different!)
concept in signal processing is correlation \\[ y(t) = \\int x(\\tau) h(t+\\tau) d\\tau \\leftrightarrow y_i = \\sum_{j=-\\infty}^{\\infty} x_j h_{i+j} \\] where the filter is simply slid across the \\(t\\) axis (without being initially flipped). The main difference between convolution and correlation is therefore that one delays the input signal whilst the other anticipates it when the filter is not symmetric around zero. As we will see later, it is important to immediately emphasize a slight difference in the jargon used in classical signal processing and deep learning: what we usually refer to as convolution in DL is what signal processing refers to as correlation. However, since in DL we do not choose the filter \\(h\\) , rather this is learned from data, if signal processing convolution was used instead of correlation, the learning algorithm would simply learn the flipped version of the filter. In both cases, when we convolve a signal of size \\(N_x\\) with a filter of size \\(N_h\\) , the output signal has size: \\[ N_y = N_x + N_h - 1 \\] However, in the context of CNNs, we generally only want to consider the so-called valid part of the convolution, i.e., where the entire filter contributes to the computation. For this reason the output signal size becomes: \\[ N_y = N_x - N_h + 1 \\] In the next section, we will see how we can actually make the choice of \\(N_y\\) more flexible with the help of additional tools like padding and striding. Extending the concept of convolution to two- and multi-dimensional data is straightforward. This can be done by simply sliding the filter in all dimensions and can be mathematically written (in the discrete form) as follows: \\[ y_{i,j} = \\sum_m \\sum_l x_{m,l} h_{i+m,j+l} \\] Finally, another interesting thing to notice is that convolution is a linear process. Therefore we can express it as a matrix-vector multiplication, where the vector identifies the input data and the filter is re-organized into a Toeplitz matrix as shown in the figure below, which means that the gradient of a convolutional operator that we need for backpropagation is just the adjoint of the matrix, \\(\\mathbf{H}^T\\) . This is a convolution with the flipped kernel (so truly a convolution!). Why Convolution? A first intuitive motivation about locality of interactions, also referred to as sparse interactions (or sparse connectivity or sparse weights ), has already been provided as to why convolution blocks may represent an appealing alternative to fully connected blocks in the context of neural networks. However, this is not the only reason why convolution blocks are so powerful and widely used nowadays when training NNs for image processing tasks. Let's start with an example. Imagine we are given a large image and a small 3x3 kernel. By sliding the kernel across the image we are still able to detect useful local features (e.g., edges). Note that the Machine Learning community has been aware of this for decades, and in fact many early approaches to image detection relied on hand-crafted filters that could highlight one feature of the input data over another. The modern DL approach simply takes this paradigm one step further, where the filters are learned instead of being defined upfront. Experience has further shown that deep CNNs learn initially low level features (e.g., edges), then middle level features (e.g., shapes) and finally high level features (e.g., objects).
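As a toy illustration of the hand-crafted-filter idea mentioned above (not part of the original lecture material), the sketch below slides a fixed Sobel-like kernel over a synthetic image using torch.nn.functional.conv2d; note that, consistent with the jargon discussion earlier, this PyTorch "convolution" does not flip the kernel and is therefore a correlation in strict signal-processing terms:

```python
import torch
import torch.nn.functional as F

# a hand-crafted 3x3 Sobel-like kernel that responds to vertical edges
kernel = torch.tensor([[-1., 0., 1.],
                       [-2., 0., 2.],
                       [-1., 0., 1.]]).view(1, 1, 3, 3)  # (out_ch, in_ch, height, width)

# toy "image": a dark half next to a bright half (a vertical edge in the middle)
image = torch.zeros(1, 1, 8, 8)
image[..., :, 4:] = 1.0

# F.conv2d slides the kernel without flipping it (a correlation in signal-processing terms)
edges = F.conv2d(image, kernel)  # "valid" output of size (1, 1, 6, 6)
print(edges[0, 0])               # non-zero responses only around the edge columns
```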
Compared to flattening the input data and applying a matrix that transforms it into the dimension of the output data (that is what an FCN would do, as shown above), using convolutions with small filters can save both memory and computations. Given for example an image of size \\(N_{w,x} \\times N_{h,x}\\) , a fully connected layer that produces an output of the same size requires a matrix with \\((N_{w,x} N_{h,x})^2\\) parameters, and \\((N_{w,x} N_{h,x})^2\\) computations are required to obtain the output. On the other hand, if we now consider a simple filter of size \\(N_{w,h} \\times N_{h,h}\\) , the number of computations is reduced to \\(N_{w,x} N_{h,x} N_{w,h} N_{h,h}\\) . The second main advantage of convolutional blocks is so-called parameter sharing . The same learned kernels are applied all over the input data, instead of having one filter operating on all (or part of) the input data to produce a single output component. Finally, a third benefit is the equivariance of convolution to translation . This means that if we shift the input by \\(k\\) samples, the output will also be shifted by the same number of samples; however, the shape of the output will not change. Padding and strides We have previously seen how applying convolution to a signal with a kernel of a given size produces an output signal of a different size, whether the full or the valid output size is chosen. It may however be much easier when designing a convolutional neural network to have inputs and outputs of the same size, or more in general to be free to design the size of the output independently of that of the input and filter. Two simple approaches exist: padding : the input signal is padded with zeros on both sides (for 1D signals) or all sides (for ND signals) prior to convolution. This allows producing outputs that can have the same size or even a larger size than the input. Let's first look at this with an example when the output size is computed using the equation above for the valid case. We can devise a padding such that the size of the output stays the same as that of the input. This is actually easy to do once we choose the size of the filter, and more specifically \\(N_{x,pad} = N_x + 2*pad\\) with \\(pad = (N_h-1)/2\\) when \\(N_h\\) is an odd number and \\(N_h/2\\) when \\(N_h\\) is an even number. Moreover, apart from the obvious benefit of not having to handle outputs that keep reducing in size, padding ensures that edge values in the inputs are also used the same number of times as central values in the convolution process. strides : a common approach when building convolutional neural networks, as we will see when discussing popular CNN architectures, is however to gradually reduce the size of the signal (or image in 2D) whilst going deeper and deeper into the network. Two alternative ways to achieve this exist: the simplest is to couple convolutional layers that do not change the size of the input with downsampling (or pooling) layers. Alternatively, one can choose to apply a special type of convolution called strided convolution that simply moves the filter around the input jumping (or striding) by more than a single sample at a time. Again, if we look at an example, we can observe how by doing so the size of the output is reduced by the striding factor. If we stride by a factor of two, the output size will be half of the input size. As a result the output size can be written as \\(N_y = \\lfloor (N_x - N_h) / stride + 1 \\rfloor\\) .
Finally, striding and padding can be used together to get, for example, an output that is exactly half of the size of the input in all directions. An important formula to remember when designing convolutional layers is: \\[ N_y = \\Bigl\\lfloor \\frac{N_x + 2pad - N_h}{stride} + 1 \\Bigr\\rfloor \\] Channels We need to introduce one last key ingredient before we can define a convolutional layer. Let's imagine we have a 3D tensor and a 3D filter; the extension of 2D convolution to 3D (or any extra dimension) is as easy as sliding the filter along the third dimension as well as the first two. However, in deep learning we generally do something different when we are dealing with convolutional networks. We define a special dimension called channel . Imagine having a 1D signal like a seismic trace but recording both the horizontal and vertical components of the particle displacement field. One way to arrange such data is as a 2D array where one of the dimensions is the size of the trace and the other is the number of components (or channels), here two. A similar scenario may arise for 2D signals if we record for example different spectral components or for pre-stack seismic data where we record data at different angles. Here once again we will have two \"classical\" dimensions, say latitude and longitude, or geographical location and depth, and one channel dimension. For the first example this will contain the different spectral components, for the second example it will be represented by the different angles (or offsets). This is the geoscientific equivalent of natural images that are commonly used in deep learning tasks, where the channel contains different colors (e.g., RGB or CMYK). In order to make ourselves already familiar with the ordering used in computational frameworks like PyTorch, a batch of training samples is usually organized as follows: \\[ N_x = (N_s \\times N_{ch,x} \\times N_{w,x} \\times N_{h,x}) \\] where \\(N_{ch,x}\\) is the number of input channels, whilst \\(N_{w,x}\\) and \\(N_{h,x}\\) are the width and the height of the image, respectively. By defining a special dimension, we can now decide to still work with filters that slide only across the width and height axes. Such kernels will have size \\(N_{ch,x} \\times N_{w,h} \\times N_{h,h}\\) . By doing so, for every step of convolution, the input and filter are multiplied and then all the values across all channels are summed together. Convolutional layer A convolutional layer is simply a stack of \\(N_{ch,y}\\) filters. The resulting output has therefore a shape equal to: \\[ N_y = (N_s \\times N_{ch,y} \\times N_{w,y} \\times N_{h,y}) \\] where \\(N_{w,y}\\) and \\(N_{h,y}\\) can be computed upfront using the formulas derived above. Note that a convolutional layer contains trainable parameters both in the form of the coefficients of the various filters and a vector of biases \\(\\mathbf{b}=[b_1, b_2,...,b_{N_{ch,y}}]\\) where every bias is applied to a different output channel. The output can therefore be written in a compact mathematical form as follows: \\[y = \\sigma \\Big( \\begin{bmatrix} h_1 * x + b_1 \\\\ ... \\\\ h_{N_{ch,y}} * x + b_{N_{ch,y}} \\end{bmatrix} \\Big) \\] In summary, a convolutional layer has the following number of trainable parameters: \\[ N=N_{w,h}N_{h,h}N_{ch,x}N_{ch,y} + N_{ch,y} \\] For example, if \\(N_{ch,x}=3\\) , \\(N_{ch,y}=10\\) , and the filters have size \\(3 \\times 3\\) , the overall number of parameters is \\(3\\cdot3\\cdot3\\cdot10 + 10 =280\\) .
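The parameter count and the output-size formula above can be verified directly in PyTorch; the following is a minimal sketch using torch.nn.Conv2d with the same numbers as in the example (the input size of 32 is an arbitrary choice for illustration):

```python
import torch
import torch.nn as nn

# 3 input channels, 10 output channels, 3x3 filters, as in the example above
conv = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3, stride=1, padding=0)

# trainable parameters: 3*3*3*10 filter coefficients + 10 biases = 280
print(sum(p.numel() for p in conv.parameters()))  # 280

# output height/width follow N_y = floor((N_x + 2*pad - N_h)/stride + 1)
x = torch.randn(1, 3, 32, 32)                     # (N_s, N_ch_x, N_h, N_w)
print(conv(x).shape)                              # torch.Size([1, 10, 30, 30])
```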
Moreover, as convolutional layers can be stacked similarly to what we have done with MLP layers, the following nomenclature will be used when referring to a generic layer \\(l\\) : \\[ \\begin{aligned} x:&\\quad N_{ch}^{[l-1]} \\times N_w^{[l-1]} \\times N_h^{[l-1]},\\\\ h:&\\quad N_{ch}^{[l]} \\times N_{ch}^{[l-1]} \\times N_w^{[l-1]} \\times N_h^{[l-1]},\\\\ b:&\\quad N_{ch}^{[l]},\\\\ y:&\\quad N_{ch}^{[l]} \\times N_w^{[l]} \\times N_h^{[l]} \\end{aligned} \\] Convolutional network Similar to a fully connected network, a convolutional network can be easily created by putting together a certain number of convolutional layers. Although we will see that different tasks call for different design choices, most convolutional neural networks share the following design features: the height and width ( \\(N_h\\) and \\(N_w\\) ) tend to reduce the deeper we travel into the network; the number of channels ( \\(N_{ch}\\) ) does instead increase as a function of network depth; after a certain number of convolutional layers, the output of size \\(N_{ch}^{[l]} \\times N_w^{[l]} \\times N_h^{[l]}\\) is flattened and fed into one or more fully connected layers and then sent into a classifier (or regressor) loss function. Pooling As mentioned in the previous section, convolutional neural networks require reducing the size of the height and width of an input image. We have already discussed that by choosing the filter size, stride and padding, the output can be either kept the same size as the input or reduced (or increased) in size. At times, it may however be better to avoid changing the size of the output directly as part of the convolution process, and rather to perform this in a separate step. In this section we introduce the so-called pooling process, which is designed specifically to reduce the size of an input N-dimensional array by an arbitrary factor \\(N_p\\) . Let's start with an example. We are interested in taking a matrix of size \\(N_{h,x} \\times N_{w,x}\\) as input and producing an output of half the size (i.e., \\(N_{h,x}/2 \\times N_{w,x}/2\\) ). A possible way to achieve this without purely discarding some of the values of the matrix is to select the maximum value within a sliding window of size \\(2 \\times 2\\) (stride=2): This approach is commonly referred to in the literature as Max Pooling . This approach can be easily extended to any other subsampling by simply extending the size of the window and stride accordingly (i.e., using the equations defined above for the output sizes of a convolutional layer based on the filter size and stride). Moreover, even though less commonly used, Mean Pooling represents an alternative approach where the mean value inside each patch is taken instead of the maximum. Finally, it is important to observe that Pooling is done for each channel independently and that it does not contain any learnable parameters. 1x1 convolutions At this point we know how to take an input tensor with an arbitrary number of dimensions (two or more) and a given number of channels, feed it through a convolutional layer, and obtain an output tensor with the same (or a slightly different) size and a new chosen number of channels. It is common practice when building convolutional neural networks to start with a small number of channels and increase it gradually as we go deeper into the network. However, when you start stacking many of these layers the number of channels will quickly grow to a point where \\(N_{ch} \\rightarrow \\infty\\) .
As a consequence of this fact, the size of the filters also starts to grow indefinitely. But since having deeper networks has been shown to be an effective way to learn very complex mappings, we need something to be able to reduce the size of these filters whenever we need to. A simple, yet very effective approach was proposed in 2013 by Lin and coauthors, where filters of size \\(1\\times1\\) are used to reduce the number of channels whilst keeping the number of learnable parameters to a minimum (any other filter with bigger depth or width will introduce more learnable parameters). The authors actually refer to this \\(1\\times1\\) convolutional layer as a specific implementation of cross-channel parametric pooling, as, similarly to pooling, it reduces the size of the input tensor over one dimension (the channel dimension in this case). Skip connections As already extensively discussed in one of our previous lectures, one of the problems associated with making neural networks very deep is that of so-called vanishing gradients. However, since deep neural networks are key to high performing models, the DL community has for a long time tried to come up with strategies that can speed up the training process (or at least avoid a slow down) in the presence of long stacks of convolutional blocks. One successful idea was proposed in 2015 by He and coauthors under the name of Residual Block , where a so-called skip connection is introduced in a NN to take the activation of a certain layer and feed it directly to another layer further down in the computational graph. In the figure below, we consider an example where a skip connection of 2 layers is introduced to connect the activations of layer \\(l\\) and \\(l+2\\) (just before applying a nonlinear activation). Here the connection is achieved by summing the two activations. Mathematically we can write: \\[ \\textbf{a}^{[l+2]}= \\sigma(\\textbf{a}^{[l]}+\\textbf{z}^{[l+2]}) \\] and we can clearly see how the information contained in \\(\\textbf{a}^{[l]}\\) flows through the graph along both a longer path (i.e., main path) and a shorter one (i.e., shortcut). Finally note that in the last 5 years or so many variations of the residual block have been introduced. For example, one could have more or less than 2 convolutional layers (or MLPs) inside the main path. Moreover, since the size of \\(\\textbf{a}^{[l]}\\) and \\(\\textbf{z}^{[l+2]}\\) may be different, an additional layer with learnable parameters may be introduced as part of the shortcut to adjust for the size of \\(\\textbf{a}^{[l]}\\) : \\[ \\textbf{a}^{[l+2]}= \\sigma(f_\\theta(\\textbf{a}^{[l]})+\\textbf{z}^{[l+2]}) \\] where \\(f_\\theta\\) here could simply be a convolutional layer.","title":"Convolutional Neural Networks"},{"location":"lectures/10_cnn/#convolutional-neural-networks","text":"Convolutional Neural Networks are one of the most powerful types of neural network, very popular and successful in image processing (and more broadly computer vision). They are based on a simple mathematical operation that we, geoscientists, know very well and use in a variety of tasks: the convolution operator. This is motivated in most scenarios where local dependencies in the input data are known to be predominant. Imagine for example a geological model, or a core section.
If we decide to apply Deep Learning to such data to either classify rock types, estimate rock parameters, or even for generative modelling tasks, the first thing that we would like our NN to know is that nearby geological features are likely to be correlated, whilst the further apart we move the more the features become independent from each other. By looking at the schematic diagrams below, a FCN would not take this prior information into account as each input value is linearly combined to give rise to the output. On the other hand, a convolutional block which represents the key component of a CNN will only use values of the input vector in a certain neighbour to obtain the output: The example mentioned above is just one of many in geoscience where convolution-based networks have been lately shown to be very successfull. Other examples are: Seismic interpretation (faults, horizons, bodies) Seismic processing (denoising, interpolation, deblurring) Satellite imaginery (denoising, segmentation) Microseismicity (detection, source mechanism) Laboratory studies (CT, SEM, Microscopy for various processing and interpretation tasks) In general, any data type that is represented regularly on a 1D, 2D, or ND gridded topology is fit for CNNs.","title":"Convolutional Neural Networks"},{"location":"lectures/10_cnn/#convolution","text":"First of all, let's briefly recall what a convolution is. This represents in fact the core operation performed by a convolutional layer. A convolution between two signals can be mathematically written as \\[ y(t) = \\int x(\\tau) h(t-\\tau) d\\tau \\leftrightarrow y = x * h \\] where \\(x(t)\\) and \\(y(t)\\) are the input and output, respectively, and \\(h(t)\\) is the filter (also called kernel in the DL jargon). This equation can be interpreted as follows: take the filter and flip it across the origin, then slide it along the time axis and multiply-and-sum it to the input signal. In practice, when working with digital data in a computer, all signals are discrete and the continuous formula above can be rewritten as follows: \\[ y_i = \\sum_{j=-\\infty}^{\\infty} x_j h_{i-j} \\] where, to be general, we have here extended the integral from \\(-\\infty\\) to \\(\\infty\\) . In most applications, the filter \\(h\\) is however compact (it has a small size of N samples, also called kernel size ) and therefore we can limit the summation within the window of samples where the filter is non-zero. A similar (but still different!) concept in signal processing is correlation \\[ y(t) = \\int x(\\tau) h(t+\\tau) d\\tau \\leftrightarrow y_i = \\sum_{j=-\\infty}^{\\infty} x_j h_{i+j} \\] where the filter is simply slid across the \\(t\\) axis (without being initially flipped). The main difference between convolution and correlation is therefore that one delays the input signal whilst the other anticipates it when the filter is non-symmetric to zero. As we will see later, it is important to immediately empathize also a slight difference in the jargon used in classical signal processing and deep learning: what usually we refer to as convolution in DL is what signal processing refers to as correlation. However, since in DL we do not choose the filter \\(h\\) , rather this is learned from data, if signal processing convolution was used instead of correlation, the learning algorithm would just learned the flipped version of the filter. 
In both cases, when we convolve two signals of size \\(N_x\\) with a filter of size \\(N_h\\) , the output signal has size: \\[ N_y = N_x + N_h - 1 \\] However, in the context of CNNs, we generally only want to consider the so-called valid part of the convolution, i.e., where the entire filter contributes to the computation. For this reason the output signal size becomes: \\[ N_y = N_x - N_h + 1 \\] In the next section, we will see how we can actually make the choice of \\(N_y\\) more flexible with the help of additional tools like padding and striding. Extending the concept of convolution to two- and multi-dimensional data is straightforward. This can be done by simply sliding the filter in all dimensions and can be mathematically written (in the discrete form) as follows: \\[ y_{i,j} = \\sum_m \\sum_l x_{m,l} h_{i+m,j+l} \\] Finally, another interesting thing to notice is that convolution is a linear process. Therefore we can express it as a matrix-vector multiplication where the vector identifies the input data and the filter is re-organized into a Toeplitz matrix as show in the figure below which means that the gradient of a convolutional operator that we need for backpropagation is just the adjoint of the matrix \\(\\mathbf{H}^T\\) . This is a convolution with the flipped kernel (so truly a convolution!).","title":"Convolution"},{"location":"lectures/10_cnn/#why-convolution","text":"A first intuitive motivation about locality of interactions, also referred to as space interactions (or sparse connectivity or sparse weights ), has been already provided onto why convolution blocks may represent an appealing alternative to fully connected blocks in the context of neural networks. However, this is not the only reason why convolution blocks are so powerful and widely used nowadays when training NNs for image processing tasks. Let's start with an example. Imagine we are given a large image and a small 3x3 kernel. By sliding the kernel across the image we can still be able to detect useful local features (e.g., edges). Note that, the Machine Learning community has been aware of this for decades, and in fact many early approaches to image detection relied on hand-crafted filters that could highlight one feature of the input data over another. The modern DL approach simply takes this paradigm one step further where the filters are learned instead of being defined upfront. Experience has further shown that deep CNNs learn initially low level features (e.g., edges), then middle level features (e.g., shapes) and finally high level features (e.g., objects). Compared to flattening the input data and applying a matrix that transforms it into the dimension of the output data (that is what a FCC would do as shown above), using convolutions with small filters can save both memory and computations. Given for example an image of size \\(N_{w,x} \\times N_{h,x}\\) , a fully connected layer that produces an output of the same size requires a matrix with \\((N_{w,x} N_{h,x})^2\\) parameters and \\((N_{w,x} N_{h,x})^2\\) computations are required to obtain the output. On the other hand, if we now consider a simple filter of size \\(N_{w,h} \\times N_{h,h}\\) , the number of computations is reduced to \\(N_{w,x} N_{h,x} N_{w,h} N_{h,h}\\) . The second main advantage of convolutional blocks is so-called parameter sharing . The same learned kernels are applied all over the input data, instead of having one filter operating on all (or part of) the input data to produce a single output component. 
Finally, a third benefit is the equivariance of convolution to translation . This means that if we shift the input by \\(k\\) samples, the output will also be shifted by the same number of samples; however, the shape of the output will not change.","title":"Why Convolution?"},{"location":"lectures/10_cnn/#padding-and-strides","text":"We have previously seen how applying convolution to a signal with a kernel of a given size produces an output signal of different size, either with the total or valid output size is chosen. It may be however much easier when designing a convolutional neural network to have inputs and outputs of the same size, or more in general to be free to design the size of the output independent on that of the input and filter. Two simple approaches exist: padding : the input signal is padded with zeros on both sides (for 1D signals) or all sides (for ND signals) prior to convolution. This allows producing outputs that can have the same size or even larger size than the input. Let's first look at this with an example when the output size is computed using the equation above for the valid case. We can devise a padding such that the size of the output stays the same as that of the input. This is actually easy to do once we choose the size of the filter and more specifically \\(N_{x,pad} = N_x + 2*pad\\) with \\(pad = (N_h-1)/2\\) when \\(N_h\\) is a odd number and \\(N_h/2\\) when \\(N_h\\) is a even number. Moreover, apart from the obvious benefit of not having to handle outputs that keep reducing in size, padding ensures that edge values in the inputs are also used the same number of times that central values in the convolution process. strides : a common approach when building convolutional neural network, as we will see when discussing popular CNN architecture, is however to gradually reduce the size of the signal (or image in 2D) whilst going deeper and deeper into the network. Two alternative ways to achieve this exist: the simplest is to couple convolutional layers that do not change the size of the input and downsampling (or pooling layers). Alternatively, one can choose to apply a special type of convolution called strided convolution that simply moves the filter around the input jumping (or striding) by more than a single sample at the time. Again, if we look at an example, we can observe how by doing so the size of the output is reduced by the striding factor. If we stride by a factor of two the output size will be half of the input size. As a result the output size can be written as \\(N_y = \\lfloor (N_x - N_h) / stride + 1 \\rfloor\\) . Eventually striding and padding can be used together to get for example an output that is exactly half of the size of the input in all directions. An important formula to remember when designing convolutional layers is: \\[ N_y = \\Bigl\\lfloor \\frac{N_x + 2pad - N_h}{stride} + 1 \\Bigr\\rfloor \\]","title":"Padding and strides"},{"location":"lectures/10_cnn/#channels","text":"We need to introduce one last key ingredient before we can define a convolutional layer. Let's imagine we have a 3D tensor and a 3D filter; the extension of 2D convolution to 3D (or any extra dimension) is as easy as sliding the filter along the third dimension as well as the first two. However, in deep learning we generally do something different when we are dealing with convolutional networks. We define a special dimension called channel . 
Imagine having a 1D signal like a seismic trace but recording both the horizontal and vertical components of the particle displacement field. One way to arrange such data is as a 2D array where one of the dimensions is the size of the trace and the other is the number of components (or channels), here two. A similar scenario may arise for 2D signals if we record for example different spectral components or for pre-stack seismic data where we record data at different angles. Here once again we will have two \"classical\" dimensions, say latitude and longitude or geographical location and depth and one channel dimension. For the first example this will contain the different spectral components, for the second example it will be represented by the different angles (or offsets). This is the geoscientific equivalent to natural images that are commonly used in deep learning tasks where the channel contains different colors (e.g., RGB or CMYK). In order to make ourselves already familiar with the ordering used in computational frameworks like PyTorch, a batch of training samples is usually organized as follows: \\[ N_x = (N_s \\times N_{ch,x} \\times N_{w,x} \\times N_{h,x}) \\] where \\(N_{ch,x}\\) is the number of input channels, whilst \\(N_{w,x}\\) and \\(N_{w,h}\\) are the width and the height of the image, respectively. By defining a special dimension, we can now decide to still work with filters that slide only across the width and height axes. Such kernels will have size \\(N_{ch,x} \\times N_{w,h} \\times N_{h,h}\\) . By doing so, for every step of convolution, the input and filter and multiplied and then all the values across all channels are summed together.","title":"Channels"},{"location":"lectures/10_cnn/#convolutional-layer","text":"A convolutional layer is simply a stack of \\(N_{ch,y}\\) filters. The resulting output has therefore a shape equal to: \\[ N_y = (N_s \\times N_{ch,y} \\times N_{w,y} \\times N_{h,y}) \\] where \\(N_{w,y}\\) and \\(N_{w,y}\\) can be computed upfront using the formulas derived above. Note that a convolutional layer contains trainable parameters both in the form of the coefficients of the various filters and a vector of biases \\(\\mathbf{b}=[b_1, b_2,...,b_{N_{ch,y}}]\\) where every bias is applied to a different output channel. The output can be therefore written in a compact mathematical form as follows: \\[y = \\sigma \\Big( \\begin{bmatrix} h_1 * x + b_1 \\\\ ... \\\\ h_{N_{ch,y}} * x + b_{N_{ch,y}} \\end{bmatrix} \\Big) \\] In summary, a convolutional layer has the following number of trainable parameters: \\[ N=N_{w,h}N_{h,h}N_{ch,x}N_{ch,y} + N_{ch,y} \\] For example, if \\(N_{ch,x}=3\\) , \\(N_{ch,y}=10\\) , and the filters have size \\(3 \\times 3\\) , the overall number of parameters is \\(3\\cdot3\\cdot3\\cdot10 + 10 =280\\) . Moreover, as convolutional layers can be stacked similarly to what we have done with MLP layers, the following nomenclature will be used in the following when referring to a generic layer \\(l\\) : \\[ \\begin{aligned} x:&\\quad N_{ch}^{[l-1]} \\times N_w^{[l-1]} \\times N_h^{[l-1]},\\\\ h:&\\quad N_{ch}^{[l]} \\times N_{ch}^{[l-1]} \\times N_w^{[l-1]} \\times N_h^{[l-1]},\\\\ b:&\\quad N_{ch}^{[l]},\\\\ y:&\\quad N_{ch}^{[l]} \\times N_w^{[l]} \\times N_h^{[l]} \\end{aligned} \\]","title":"Convolutional layer"},{"location":"lectures/10_cnn/#convolutional-network","text":"Similar to a fully connected network, a convolutional network can be easily created by putting together a certain number of convolutional layers. 
Although we will see that different tasks call for different design choices, most convolutional neural networks share the following design features: the height and width ( \\(N_h\\) and \\(N_w\\) ) tends to reduce the deeper we travel into the network; the number of channels ( \\(N_{ch}\\) ) does instead increase as function of network depth; after a certain number of convolutional layers, the output of size \\(N_{ch}^{[l]} \\times N_w^{[l]} \\times N_h^{[l]}\\) is flattened and fed into one or more fully connected layers and then sent into a classifier (or regressor) loss function.","title":"Convolutional network"},{"location":"lectures/10_cnn/#pooling","text":"As we have previously mentioned in the previous section, convolutional neural networks require reducing the size of the height and width of an input image We have already discussed that by choosing the filter size, stride and padding, the output can be either kept of the same size of the input or reduced (or increased) in size. At times, it may however be better to avoid changing the size of the output directly as part of the convolution process, rather perform this in a separate step. In this section we introduce the so-called pooling process, which is designed specifically to reduce the size of an input N-dimensional array by an arbitrary factor \\(N_p\\) . Let's start with an example. We are interested to take a matrix of size \\(N_{h,x} \\times N_{w,x}\\) as input and produce an output of half the size (i.e., \\(N_{h,x}/2 \\times N_{w,x}/2\\) . A possible way to achieve this without purely discarding some of the values of the matrix is to select the maximum value within a sliding window of size \\(2 \\times 2\\) (stride=2): This approach is commonly referred to in the literature as Max Pooling . This approach can be easily extended to any other subsampling by simply extending the size of the window and stride accordingly (i.e., using to the equations defined above used for the output sizes of a convolutional layer based on the filter size and stride). Moreover, even though less commonly used, Mean Pooling represent an alternative approach where the mean value inside each patch is taken instead of the maximum. Finally, it is important to observe that Pooling is done for each channel independently and that it does not contain any learnable parameter.","title":"Pooling"},{"location":"lectures/10_cnn/#1x1-convolutions","text":"At this point we know how to take an input tensor with an arbitrary number of dimensions (two or more) and a given number of channels, feed it through a convolutional layer, and obtain an output sensor with the same (or slightly different size) and a new chosen number of channels. It is common practice when building convolutional neural networks to start with a small number of channels and increase it gradually as we go deeper into the network. However, when you start stacking many of these layers the number of channels will quickly grow to a point where \\(N_{ch} \\rightarrow \\infty\\) . As a consequence of this fact, also the size of the filters start to grow indefinitely. But since having deeper networks has been shown an effective way to learn very complex mappings, we need something to be able to reduce the size of these filters at any time we are in need for it. 
A simple, yet very effective approach was proposed in 2013 by Lin and coauthors where filters of size \\(1\\times1\\) are used to reduce the number of channels whilst keeping the number of learnable parameter to a minimum (any other filter with bigger depth or width will introduce more learnable parameters). The authors actually refer to this \\(1\\times1\\) convolutional layer as a specific implementation of cross-channel parametric pooling, as similar to pooling reduces the size of the input tensor over one dimensions (channel in this case).","title":"1x1 convolutions"},{"location":"lectures/10_cnn/#skip-connections","text":"As already extensively discussed in one of our previous lectures, one of the problem associated with making neural networks very deep is that of so-called vanishing gradients. However, since deep neural networks are key to high performing models, the DL community has for long time tried to come up with strategies that can speed up the training process (or at least avoid a slow down) in the presence of long stacks of convolutional blocks. One successful idea that was proposed in 2015 by He and coauthors under the name of Residual Block , where so-called skip connection is introduced in a NN to take the activation of a certain layer and feed it directly to another layer further down in the computational graph. In the figure below, we consider an example where a skip connection of 2 layers is introduced to connect the activations of layer \\(l\\) and \\(l+2\\) (just before applying a nonlinear activation). Here the connection is achieved by summing the two activations. Mathematically we can write: \\[ \\textbf{a}^{[l+2]}= \\sigma(\\textbf{a}^{[l]}+\\textbf{z}^{[l+2]}) \\] and we can clearly see how the information contained in \\(\\textbf{a}^{[l]}\\) flows through the graph along both a longer path (i.e., main path) and a shorter one (i.e., shortcut). Finally note that in the last 5 years or so many variations of the residual block have been introduced. For example, one could have more or less than 2 convolutional layers (or MPLs) inside the main path. Moreover, since the size of \\(\\textbf{a}^{[l]}\\) and \\(\\textbf{z}^{[l+2]}\\) may be different, an additional layer with learnable parameter may be introduced as part of the shortcut to adjust for the size of \\(\\textbf{a}^{[l]}\\) : \\[ \\textbf{a}^{[l+2]}= \\sigma(f_\\theta(\\textbf{a}^{[l]})+\\textbf{z}^{[l+2]}) \\] where \\(f_\\theta\\) here could simply be a convolutional layer.","title":"Skip connections"},{"location":"lectures/11_cnnarch/","text":"CNNs Popular Architectures This lecture provides an overview of how deep learning, especially in the context of CNNs (and computer vision in general), has evolved over the last decade. This is something that it is good to be familiar with because: whilst most of these advances are given for granted and routinely used today, it is always insightful to learn how ans why these developments were made; we can use architectures that worked well with no (or minimal) adaptation to our problem at hand (we will see that this is very commonly done with high degree of success in geoscience); even better, sometimes we can decide to use pre-trained networks and fine-tune them with limited amount of label data. In this case knowing the network architecture in details allows us to make informed choices, such as remove some of the final layers and introduce new ones that better adapt to the problem at hand (e.g., different number of classes). 
LeNet-5 One of the first successful CNNs was created and trained by the famous Yann LeCun in 1989 with the objective of classifying hand-written digits. As we will see when comparing this to other popular networks, the size of LeNet-5 is very limited, mostly due to the hardware capabilities at that time (and the availability of a fairly small training dataset). As shown in the figure below, this network is composed of: 2 convolutional layers with filter size equal to \(5 \times 5\) , stride equal to 1, and number of channels equal to 6 and 16, respectively; 2 average pooling layers that reduce the height and width of the feature maps by a factor of 2; 3 fully connected layers of size 120, 84, and 10 (the number of digits to classify); softmax activation in the final layer; and the overall number of training parameters is \(\approx 60k\) . Finally, looking at the network architecture, two things stand out that probably today would have been implemented differently: average pooling layers are not so popular today, max pooling layers are more commonly used; activations were used also after pooling, and all activations were sigmoid/tanh. Today, ReLU or one of its variants is more commonly used and no activations are added after pooling layers. AlexNet AlexNet represents a milestone in the field of Deep Learning. Developed by Alex Krizhevsky, Ilya Sutskever and Geoffrey Hinton, this network was the first CNN that won the popular computer vision competition ImageNet. Not only that, but the network outperformed other submissions by far, and brought Deep Learning to the attention of the larger Computer Vision community. As shown in the figure below, this network is not very different from LeNet-5 in its individual components; it is, however, much deeper and contains many more trainable parameters. More specifically, it is composed of: 5 convolutional layers with variable filter size (ranging from \(11 \times 11\) in the first layer all the way to \(3 \times 3\) in some of the deeper layers); 3 max pooling layers that reduce the height and width of the feature maps by a factor of 2; 3 fully connected layers of size 4096, 4096, and 1000 (the number of classes to classify); softmax activation in the final layer; and the overall number of training parameters is \(\approx 60M\) , 3 orders of magnitude more than that of LeNet-5. A number of interesting features of this network: the number of channels in the different layers: initially, this grows from 3 (i.e., RGB) to 384 and it is then reduced to 256 all the way to the FC layers; ReLU is used as the activation function for all hidden layers; Dropout is used to avoid overfitting; VGG-16 In 2015, the Visual Geometry Group at Oxford introduced a new CNN architecture called VGG. The key architectural change here is that the network was much deeper than most state-of-the-art networks at that time (16 layers); this was achieved by trading filter size (now \(3 \times 3\) ) for depth. Moreover, whilst other networks like AlexNet were hand-crafted with very different filter sizes, strides and padding from layer to layer, this network is really very simple to define: 13 \(3 \times 3\) convolutional layers with stride equal to 1, interleaved with 5 max pooling layers with filter size and stride equal to 2, followed by 3 fully connected layers; and the overall number of training parameters is \(\approx 138M\) , roughly twice as many as AlexNet.
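The VGG design above can be sketched as a stack of simple, repeated blocks. The snippet below is illustrative (channel counts and the number of convolutions per block are assumptions, not the exact VGG-16 configuration); note how the channels grow while height and width shrink, and how two stacked \(3 \times 3\) convolutions cover the same receptive field as a single \(5 \times 5\) one, which is the key insight discussed next.

```python
import torch
import torch.nn as nn

# A VGG-style block sketch: two 3x3 convolutions (stride 1, padding 1) followed
# by a 2x2 max pooling; channel counts are illustrative
def vgg_block(ch_in, ch_out):
    return nn.Sequential(
        nn.Conv2d(ch_in, ch_out, kernel_size=3, stride=1, padding=1), nn.ReLU(),
        nn.Conv2d(ch_out, ch_out, kernel_size=3, stride=1, padding=1), nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2))

# Stacking blocks: channels grow while height and width shrink
net = nn.Sequential(vgg_block(3, 64), vgg_block(64, 128), vgg_block(128, 256))
x = torch.randn(1, 3, 224, 224)
print(net(x).shape)  # torch.Size([1, 256, 28, 28])
```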
The key insight of VGG, which we will see is also used in later CNN architectures, is that stacks of convolutional layers with small filters can emulate the receptive field of one layer with a larger filter size. Note that further extensions of VGG-16 have been proposed, for example VGG-19, where the network is composed of 19 layers. GoogLeNet and Inception In 2014, Christian Szegedy from Google was working on reducing the computational burden of deep neural networks. At that time, a new convolutional block was introduced under the name of Inception Layer: instead of choosing the size of the bank of filters to be used upfront, the inception layer uses more than one filter size at the same time (a kind of multi-resolution approach). More specifically, the input is sent into 4 paths in parallel: \(1 \times 1\) convolution block; \(3 \times 3\) convolution block; \(5 \times 5\) convolution block; Max pooling block. Moreover, since sending an input with large width, height, and channel number into a \(3 \times 3\) (or \(5 \times 5\) ) convolutional layer would result in a very large number of trainable parameters and extreme computational cost, the input is first sent into a \(1 \times 1\) convolutional layer that reduces the channel size, and the channel size is then increased again in the next layer. The \(1 \times 1\) layers act as bottleneck layers, keeping the number of trainable parameters low. Similarly, after the max pooling layer the number of channels is controlled via another \(1 \times 1\) convolutional layer. The four outputs are simply concatenated together to form the output of the Inception layer. The GoogLeNet network is a large network where multiple of these Inception layers are stacked together.
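A minimal sketch of an Inception-style block is shown below; the per-branch channel counts are illustrative assumptions (the real GoogLeNet uses different widths at each stage), but the structure, four parallel paths, \(1 \times 1\) bottlenecks, and concatenation along the channel axis, follows the description above.

```python
import torch
import torch.nn as nn

class InceptionBlock(nn.Module):
    """Inception-style block sketch: four parallel paths whose outputs are
    concatenated along the channel axis (channel counts are illustrative)."""
    def __init__(self, ch_in):
        super().__init__()
        self.branch1 = nn.Conv2d(ch_in, 64, kernel_size=1)
        self.branch3 = nn.Sequential(              # 1x1 bottleneck, then 3x3
            nn.Conv2d(ch_in, 32, kernel_size=1),
            nn.Conv2d(32, 64, kernel_size=3, padding=1))
        self.branch5 = nn.Sequential(              # 1x1 bottleneck, then 5x5
            nn.Conv2d(ch_in, 16, kernel_size=1),
            nn.Conv2d(16, 32, kernel_size=5, padding=2))
        self.branchpool = nn.Sequential(           # max pooling, then 1x1
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(ch_in, 32, kernel_size=1))

    def forward(self, x):
        return torch.cat([self.branch1(x), self.branch3(x),
                          self.branch5(x), self.branchpool(x)], dim=1)

x = torch.randn(1, 192, 28, 28)
print(InceptionBlock(192)(x).shape)  # torch.Size([1, 192, 28, 28]) -> 64+64+32+32 channels
```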
So, whilst in theory deeper networks should always reduce the training error, this is not always the case for plain networks. On the other hand, networks with Residual blocks are much more successful in that respect. UNet The UNet architecture was proposed by Ronneberger et al. in 2015 in the context of interpretation of microscopy images. This network architecture presents however a number of innovative design choices which led to its widespread use in a variety of disciplines for both semantic segmentation and regression tasks. More specifically, whilst most of the networks we have discussed so far are specifically designed for classification tasks where inputs are of much larger size of target (i.e., imagine taking images from the MNIST dataset as input as a single vector of 10 elements as output), UNet was originally conceived for a semantic segmentation task. Semantic segmentation is a special case of classification where instead of predicting a class per input samples, we want to predict a class for each element of that sample. This makes the output space very large, equal to that of the input times the number of classes. The UNet architecture presents the following characteristics: it can be seen as composed by two networks, an Encoder or contracting path, and a Decoder or expanding path. This is a common design in dimensionality reduction networks like AutoEncoders (see Lecture X for more details). Each level of the encoder network contains a number of convolutional layers followed by a downsampler (usually achieved by means of max pooling). On the other hand, the decoder is composed of convolutional layers preceded by an upsampler (this can be either an interpolator like a bilinear interpolation or a convtranspose layer); skip connections are introduced at each level of the contracting path, taking those features all the way to the corresponding level of the expanding path (where they are concatenated with the features coming from a deeper level of the contracting path itself). Whilst we have already discussed the importance of skip connections for stable training, here these skip connections are brought to a new level, as a very large portion of the network is skipped and concatenation is used instead of summation. The presence of such connections make the UNet architecture able to create very high resolution segmentation and regression outputs; Finally, restricting ourselves to geoscience applications, UNet has been successfully used for a variety of tasks such as: Salt body / channel / karst extraction from seismic data (semantic segmentation); Fault and horizon tracking (semantic segmentation, where a skeletonized fault or horizon volume is used as the target to predict); Microseismic event detection (semantic segmentation); Seismic data interpolation, denoising, deghosting (regression, or more precisely domain translation ); and more... To conclude a summary of some of the most popular CNN architectures used for various computer vision task is shown in the figure below. Note the size of the circles refer to the number of trainable parameters of the associated network. 
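To tie the UNet description above to code, here is a heavily simplified, illustrative sketch with a single encoder/decoder level; the channel counts, the bilinear upsampler and the number of classes are assumptions, but it shows the contracting path, the expanding path and the skip connection implemented by concatenation.

```python
import torch
import torch.nn as nn

class TinyUNet(nn.Module):
    """A heavily simplified UNet-style sketch: one encoder level, one decoder
    level, and a skip connection implemented by channel concatenation."""
    def __init__(self, ch_in=1, ch_out=2):
        super().__init__()
        self.enc = nn.Sequential(nn.Conv2d(ch_in, 16, 3, padding=1), nn.ReLU())
        self.down = nn.MaxPool2d(2)                       # contracting path
        self.bottom = nn.Sequential(nn.Conv2d(16, 32, 3, padding=1), nn.ReLU())
        self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
        # the decoder sees upsampled features concatenated with the skip connection
        self.dec = nn.Sequential(nn.Conv2d(32 + 16, 16, 3, padding=1), nn.ReLU())
        self.head = nn.Conv2d(16, ch_out, 1)              # per-pixel class scores

    def forward(self, x):
        e = self.enc(x)                                   # features kept for the skip
        b = self.bottom(self.down(e))
        d = self.dec(torch.cat([self.up(b), e], dim=1))   # concatenate skip features
        return self.head(d)

x = torch.randn(1, 1, 64, 64)
print(TinyUNet()(x).shape)  # torch.Size([1, 2, 64, 64]): one score per pixel and class
```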
Additional readings the following blog post provides a good overview of some of the most popular architectures in computer vision, including those discussed in this lecture.","title":"CNNs Popular Architectures"},{"location":"lectures/11_cnnarch/#cnns-popular-architectures","text":"This lecture provides an overview of how deep learning, especially in the context of CNNs (and computer vision in general), has evolved over the last decade. This is something that it is good to be familiar with because: whilst most of these advances are given for granted and routinely used today, it is always insightful to learn how ans why these developments were made; we can use architectures that worked well with no (or minimal) adaptation to our problem at hand (we will see that this is very commonly done with high degree of success in geoscience); even better, sometimes we can decide to use pre-trained networks and fine-tune them with limited amount of label data. In this case knowing the network architecture in details allows us to make informed choices, such as remove some of the final layers and introduce new ones that better adapt to the problem at hand (e.g., different number of classes).","title":"CNNs Popular Architectures"},{"location":"lectures/11_cnnarch/#lenet-5","text":"One of the first successful CNNs was created and trained by the famous Yan Le Cun in 1989 with the objective of classifying hand-written digits. As we will see when comparing this to other popular networks, the size of LeNet-5 is very limited, mostly due to the hardware capabilities at that time (and the availability of a fairly small training dataset). As shown in the figure below, this network is composed of: 2 convolutional layers with filter size equal to \\(5 \\times 5\\) , stride equal to 1, and number of channels equal to 6 and 16, respectively; 2 average pooling layers that reduce the height and width of the feature maps by a factor of 2; 3 fully connected layers of size 120, 84, and 10 (the number of digits to classify); softmax activation in the final layer; and the overall number of training parameters is \\(\\approx 60k\\) . Finally, looking at the network architecture two things stand out that probably today would have been implemented differently: average pool layers are not so popular today, max pool layers are more commonly used; activations were used also after pooling and all activations where sigmoid/tangent. Again, today ReLU or one of its variant is more commonly used and no activations are added after pooling layers.","title":"LeNet-5"},{"location":"lectures/11_cnnarch/#alexnet","text":"AlexNet represents a milestone in the field of DeepLearning. Developed by Alex Krizhevsky, Ilya Sutskever and Geoffrey Hinton, this network was the first CNN that won the popular computer vision competition ImageNet. Not only that, but the network outperformed other submissions by far, and brought Deep Learning to the attention of the larger Computer Vision community. As shown in the figure below, this network is not very different from LeNet-5 in its individual components, it is however much deeper and contains much more trainable parameters. 
More specifically, it is composed of: 5 convolutional layers with variable filter size (ranging from \\(11 \\times 11\\) in the first layer all the way to \\(3 \\times 3\\) in some of the deeper layers); 3 max pooling layers that reduce the height and width of the feature maps by a factor of 2; 3 fully connected layers of size 4096, 4096, and 1000 (the number of digits to classify); softmax activation in the final layer; and the overall number of training parameters is \\(\\approx 60M\\) , 3 order of magnitude more than that of LeNet-5. A number of interesting feature of this network: the number of channels in the different layers: initially, this grows from 3 (i.e., RGB) to 384 and it is then reduced to 256 all the way to the FC layer; ReLU is used as activation function for all hidden layers; Dropout is used to avoid overfitting;","title":"AlexNet"},{"location":"lectures/11_cnnarch/#vgg-16","text":"In 2015, the Visual Geometry Group at Oxford introduce a new CNN architecture called VGG. The key architectural change here is the fact that the network was much deeper than most state-of-the art networks at that time (16 layers); this was achieved by trading filter size (now \\(3 \\times 3\\) ) for depth. Moreover, whilst other networks like AlexNet were hand-crafted with very different filter sizes, strides and padding from layer to layer, this network is really very simple to define: 16 \\(3 \\times 3\\) convolutional layers with stride equal to 1; 16 max pooling laywrs with filter size and stride equal to 2. and the overall number of training parameters is \\(\\approx 138M\\) , roughly twice more than those of AlexNet. The key insight of VGG, which we will see is also used in later CNN architectures, is that stacks of convolutional layers with small filters can emulate the receptive field of one layer with larger filter sizes. Note that further extensions of VGG-16 have been proposed, for example VGG-19 where the network is composed of 19 layers.","title":"VGG-16"},{"location":"lectures/11_cnnarch/#googlelenet-and-inception","text":"In 2014, Christian Szegedy from Google was working on reducing the computational burden of deep neural networks. At that time, a new convolutional block was introduced under the name of Inception Layer: Instead of choosing the size of the bank of filters to be used upfront, the inception layer uses more than once filter size at the same time (a kind of multi-resolution approach). More specifically the input is sent into 4 paths in parallel: \\(1 \\times 1\\) convolution block; \\(3 \\times 3\\) convolution block; \\(5 \\times 5\\) convolution block; Max pooling block. Moreover, since sending an input with large width, height, and channel number into a \\(3 \\times 3\\) (or \\(5 \\times 5\\) ) convolutional layer would result in a very large number of trainable parameters and extreme computational cost, the input is first sent into a \\(1 \\times 1\\) that reduces the channel size and then the channel size is increased again in the next layer. The \\(1 \\times 1\\) layers act as a bottleneck layer keeping the number of trainable parameters low. Similarly, after the max pooling layer the number of channels is controlled via another \\(1 \\times 1\\) convolutional layer. The four outputs are simply concatenated together to form the output of the Inception layer. The GoogleLeNet network is a large networks where multiple of these Inception layers are stacked together. 
This network presents an additional set of new features: two side branches are added at different stages of the network, where intermediate representations from hidden layers are passed through a few more layers and sent to a classifier. These classifiers perform the same task of the main classifier placed at the end of the network and have been shown to act as a natural regularizer, ensuring that the hidden features are as expressive as possible to the point they can be used directly for the classification task at hand.","title":"GoogleLeNet and Inception"},{"location":"lectures/11_cnnarch/#resnet","text":"We can already observe a trend moving from LeNet-5 to VGG-19. From the 80' all the way to the early 2000', networks started to become deeper and deeper. However, despite deeper network can generally achieve better performance, practitioners started to also experience painfully slow training. It was later discovered that this was caused by the vanishing gradient problem. Around the same time of VGG-16, He and coauthors proposed a new network block called the Residual Block. As already discussed in our last lecture, this block introduces the innovative idea of shortcuting some of the activations forward in the computational graph and summing them to the activations of the main path. This gave rise to the so-called ResNet that proved to be much easier (and faster) to train than other CNNs when stacking a large number of layers, even up to 100 (or 1000) of layers! The figure above shows ResNet-18, but it is important to remember that the idea of adding skip-connections every couple of layers has much wider implications than just for the ResNet architecture. One of the key benefits introduced by ResNet is the ability to increase the depth of a network without incurring in the risk of overfitting the training data. So, whilst in theory deeper networks should always reduce the training error, this is not always the case for plain networks. On the other hand, networks with Residual blocks are much more successful in that respect.","title":"ResNet"},{"location":"lectures/11_cnnarch/#unet","text":"The UNet architecture was proposed by Ronneberger et al. in 2015 in the context of interpretation of microscopy images. This network architecture presents however a number of innovative design choices which led to its widespread use in a variety of disciplines for both semantic segmentation and regression tasks. More specifically, whilst most of the networks we have discussed so far are specifically designed for classification tasks where inputs are of much larger size of target (i.e., imagine taking images from the MNIST dataset as input as a single vector of 10 elements as output), UNet was originally conceived for a semantic segmentation task. Semantic segmentation is a special case of classification where instead of predicting a class per input samples, we want to predict a class for each element of that sample. This makes the output space very large, equal to that of the input times the number of classes. The UNet architecture presents the following characteristics: it can be seen as composed by two networks, an Encoder or contracting path, and a Decoder or expanding path. This is a common design in dimensionality reduction networks like AutoEncoders (see Lecture X for more details). Each level of the encoder network contains a number of convolutional layers followed by a downsampler (usually achieved by means of max pooling). 
On the other hand, the decoder is composed of convolutional layers preceded by an upsampler (this can be either an interpolator like a bilinear interpolation or a convtranspose layer); skip connections are introduced at each level of the contracting path, taking those features all the way to the corresponding level of the expanding path (where they are concatenated with the features coming from a deeper level of the contracting path itself). Whilst we have already discussed the importance of skip connections for stable training, here these skip connections are brought to a new level, as a very large portion of the network is skipped and concatenation is used instead of summation. The presence of such connections make the UNet architecture able to create very high resolution segmentation and regression outputs; Finally, restricting ourselves to geoscience applications, UNet has been successfully used for a variety of tasks such as: Salt body / channel / karst extraction from seismic data (semantic segmentation); Fault and horizon tracking (semantic segmentation, where a skeletonized fault or horizon volume is used as the target to predict); Microseismic event detection (semantic segmentation); Seismic data interpolation, denoising, deghosting (regression, or more precisely domain translation ); and more... To conclude a summary of some of the most popular CNN architectures used for various computer vision task is shown in the figure below. Note the size of the circles refer to the number of trainable parameters of the associated network.","title":"UNet"},{"location":"lectures/11_cnnarch/#additional-readings","text":"the following blog post provides a good overview of some of the most popular architectures in computer vision, including those discussed in this lecture.","title":"Additional readings"},{"location":"lectures/12_seqmod/","text":"Sequence modelling In this lecture we will start investigating a family of Neural Network that are particularly suitable for learning tasks that involve sequences as input data. To understand what a sequence is in the context of Deep learning, let's consider a recording over time (e.g., an audio recording): Compared to other dataset types (e.g., tabular or gridded data), the different samples of a sequence present an obvious degree of correlation that tends to diminuish the further away to samples are from each other. Moreover, in the case of multi-feature sequences (e.g., multi-component seismological recordings), the overall sequence contains a number of features at each time step that can be more or less correlated to each other. Sequences appear in every aspect of life. For example, outside of geoscience, the two most commonly used data in sequence modelling are: text audio More specifically, as we will see, the field of Natural Language Processing (NPL) has experienced a revolutionary growth in the last decade thanks to sequence modelling and deep learning. In geoscience, many of the commonly used datasets can also be interpreted as sequences, for example: seismograms well logs production data are all datatypes that present a certain degree of correlation along either the time or depth axis. Finally, similar to FFNs or CNNs, sequence modelling can be used for various applications: Single output classification: given an input sequence of a certain length \\(\\mathbf{x}\\) , a model is trained to decide whether than sequence contains a feature of interest or not. 
For example, given a seismogram we may be interested in detecting the presence of a seismic event, or we may want to find out if a well log is clean or corrupted by some recording error, or what the facies in the middle of the sequence is; Multi output classification (i.e., semantic segmentation): given an input sequence of a certain length \(\mathbf{x}\) , a model is trained to classify each element of the input sequence into a predefined set of classes. Taking once again the example of facies labelling, here the task is extended to predicting labels at each depth level (and not only in the middle of the sequence); Regression: given an input sequence of a certain length \(\mathbf{x}\) , a model is trained to predict a continuous output, which could be a single value \(y\) or a sequence of values \(\mathbf{y}\) that has the same (or a different) length as the input. For example, given a set of well logs we may want to predict another one that was not acquired. Similarly, given a seismic trace recorded by the vertical component of a geophone we may be interested in predicting the horizontal components. Both of these examples fall under the area of domain translation ; Motivation Let's start by considering what we have learned so far and discuss how we could use those tools to handle sequential data. First of all, we consider a sequence of \(N_\tau\) samples and \(N_f\) features: \[ \mathbf{X} = \begin{bmatrix} x_1^{<1>} & x_1^{<2>} & x_1^{<N_\tau>} \\ ... & ... & ... \\ x_{N_f}^{<1>} & x_{N_f}^{<2>} & x_{N_f}^{<N_\tau>} \end{bmatrix} = \begin{bmatrix} \mathbf{x}^{<1>} & \mathbf{x}^{<2>} & \mathbf{x}^{<N_\tau>} \end{bmatrix}_{[N_f \times N_\tau]} \] we could easily deal with this as if it were a 2D array (i.e., an image) and use CNNs. However, the locality argument used for the convolutional filters that constitute a convolutional layer would not make much sense here, especially if we know that elements of the sequence far away from each other may still have a certain degree of correlation. Alternatively, the matrix \(\mathbf{X}\) could simply be vectorized and used as input to an FFN. This approach does however present two main limitations: since the vector \(vec(\mathbf{X})\) is likely to be very long, weight matrices will be very large, leading to a very expensive training process; FFNs cannot easily handle inputs of variable lengths, so all sequences will need to have a fixed length. We will see that being able to handle variable-length sequences is very useful in some situations. Both problems can be overcome by taking advantage of parameter sharing . We have already introduced this concept in the context of CNNs, where the same filters are used in different parts of the input. Similarly, in sequence modelling, the idea of parameter sharing allows using the same parameters at different stages of the sequence and therefore allows the network to easily handle sequences of variable length. By doing so, a new type of neural network is created under the name of Recurrent Neural Network (RNN): where \(\mathbf{x}\) is the input vector (or matrix when multiple features are present), \(\mathbf{y}\) is the output vector, and \(\mathbf{h}\) is the so-called hidden state vector. As clearly shown by unrolling the network into a standard computational graph, the various inputs and hidden states are passed through the same function \(f_\theta\) with a given number of training parameters.
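A minimal sketch of this parameter sharing is given below; torch.nn.RNNCell is used purely as an illustrative stand-in for \(f_\theta\) (any function with learnable parameters would do), and all sizes are arbitrary.

```python
import torch
import torch.nn as nn

N_f, N_h, T_x = 2, 5, 4                  # features, hidden size, sequence length
f_theta = nn.RNNCell(N_f, N_h)           # the same function is reused at every step

x = torch.randn(T_x, 1, N_f)             # one sequence of T_x steps (batch of 1)
h = torch.zeros(1, N_h)                  # initial hidden state h^<0>

# Unrolled recurrence: h^<t> = f_theta(h^<t-1>, x^<t>) for t = 1, ..., T_x
hidden_states = []
for t in range(T_x):
    h = f_theta(x[t], h)
    hidden_states.append(h)

print(len(hidden_states), hidden_states[-1].shape)  # 4 torch.Size([1, 5])
```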
This is very different from a feed-forward network where different functions are used over consecutive layers. The choice of the function \(f_\theta\) leads to the definition of different RNN architectures. Before we begin introducing a number of popular architectures for sequence modelling, let's introduce some useful notation. Inputs and outputs of an RNN will always be defined as follows: \[ \mathbf{X} = \begin{bmatrix} \mathbf{x}^{<1>} & \mathbf{x}^{<2>} & \mathbf{x}^{<T_x>} \end{bmatrix}_{[N_f \times T_x]} \] and \[ \mathbf{Y} = \begin{bmatrix} \mathbf{y}^{<1>} & \mathbf{y}^{<2>} & \mathbf{y}^{<T_y>} \end{bmatrix}_{[N_t \times T_y]} \] where \(T_x\) and \(T_y\) are the lengths of the input and output sequences. First, note that this notation differs from before in that a single training sample is now represented as a matrix; therefore, the entire training data becomes a 3-D tensor of size \([N_s \times N_f \times T_x]\) (and \([N_s \times N_t \times T_y]\) ). Finally, note that in the most general case these parameters may be sample dependent (i.e., when we allow sequences of variable size): the following notation will be used in that case, \(T_x^{(i)}\) and \(T_y^{(i)}\) where \(i\) refers to the i-th training sample. Moreover, given that we recurrently apply the same function \(f_\theta\) , we can very compactly write an RNN as: \[ \mathbf{h}^{<t>}, \mathbf{y}^{<t>}=f_\theta(\mathbf{h}^{<t-1>}, \mathbf{x}^{<t>}) \qquad t=1,2,...,T_x \] that we can unroll into: \[ \mathbf{h}^{<T_x>}, \mathbf{y}^{<T_x>}=f_\theta(f_\theta(...f_\theta(\mathbf{h}^{<0>}, \mathbf{x}^{<1>}), ..., \mathbf{x}^{<T_x-1>}), \mathbf{x}^{<T_x>}) \] As we have already briefly mentioned, RNNs allow some flexibility in the choice of \(T_y\) (i.e., the length of the output sequence). This leads to the creation of different network architectures that are suitable for different tasks: Note that in cases 3 and 4, the predicted output is fed back to the network as input to the next step at inference stage, as shown in the figure above. At training stage, however, the true output is used as input. In summary, what we wish to achieve here is to create a network that can learn both short- and long-term relationships in the data, such that both samples close to each other and samples far away can help in the prediction of the current step. By using parameter sharing in a smart way, we can avoid overparametrizing the network and therefore limit the risk of overfitting on short- and long-term trends in the data. In other words, by assuming stationarity in the data, we let the network understand whether steps \(t\) and \(t+N_t\) are correlated to each other across the entire time sequence, instead of giving the network the freedom to find relationships between any two samples in the sequence. Basic RNN Architecture It is now time to discuss in more detail what an effective function \(f_\theta\) looks like.
The most basic Recurrent Neural Network can be written as follows: \[ \begin{aligned} \mathbf{a}^{<t>} &= \mathbf{W}_h \mathbf{h}^{<t-1>} + \mathbf{W}_x \mathbf{x}^{<t>} + \mathbf{b}_a = \mathbf{W} [\mathbf{h}^{<t-1>}, \mathbf{x}^{<t>}]^T + \mathbf{b}_a \\ \mathbf{h}^{<t>} &= \sigma(\mathbf{a}^{<t>} ) \\ \mathbf{o}^{<t>} &= \mathbf{W}_y \mathbf{h}^{<t>} + \mathbf{b}_y \\ \hat{\mathbf{y}}^{<t>} &= \sigma' (\mathbf{o}^{<t>}) \end{aligned} \] where: \(\sigma\) and \(\sigma'\) are the activation functions for the hidden and output paths (the choice of the activation for the latter depends on the problem we wish to solve, e.g., sigmoid for binary classification or softmax for multi-class classification) \(\mathbf{h}^{<0>}\) is the initial hidden state vector, which is usually initialized as a zero vector. \(\mathbf{W} = [\mathbf{W}_h, \mathbf{W}_x]_{[N_h \times (N_h + N_x)]}\) is the matrix of weights for the hidden path \(\mathbf{W}_{y \; [N_y \times N_h]}\) is the matrix of weights for the output path In conclusion, the learnable parameters for this kind of RNN block are: \(\mathbf{W}_h, \mathbf{W}_x, \mathbf{W}_y, \mathbf{b}_a, \mathbf{b}_y\) whose overall size is \(N_h(N_h+N_x) + N_y N_h + N_h + N_y\) . To give some perspective, this is much smaller than the number of learnable parameters of an 'equivalent' Feed-Forward network where the entire input matrix \(\mathbf{X}\) is flattened into a 1-d array of size \(N_f T_x\) and the entire output matrix \(\mathbf{Y}\) is flattened into a 1-d array of size \(N_t T_y\) . The equivalent weight matrix and bias vectors have size \(N_x N_y T_x T_y\) and \(N_yT_y\) . For example, given a problem of size \(N_x=2\) , \(N_y=3\) , \(N_h=5\) , and \(T_x=T_y=4\) , we obtain \(N_{FFN}=108\) and \(N_{RNN}=58\) . Loss Once the architecture is defined, the next step is to understand how the loss function should be defined for this kind of network. As shown in the figure below, this can be simply accomplished by considering a loss function per time step and summing them together: \[ \mathscr{L} = \sum_{t=1}^{T_x} \mathscr{L}^{<t>}, \qquad \mathscr{L}^{<t>}= f(\hat{\mathbf{y}}^{<t>}, \mathbf{y}^{<t>}) \] where \(f\) can be the MSE, MAE, BCE, etc. This loss function can be easily interpreted in probabilistic terms as: \[ f \rightarrow -log P (\mathbf{y}^{<t>} | \mathbf{x}^{<1>}, \mathbf{x}^{<2>}, ..., \mathbf{x}^{<t>}) \] To conclude, we note that the process of evaluating the various terms of the loss function is sequential, as a previous hidden state is required to evaluate the current output. This can be very expensive and does not allow for parallelization (beyond across training samples), similar to the case of very deep feedforward neural networks. Backprop Given the loss function defined above, the computation of its gradient easily follows the principles that we have already extensively discussed in previous lectures; in simple terms, the backpropagation algorithm is applied on the unrolled computational graph in order to obtain the gradients of the weights and biases of the network block. Backpropagation over an RNN block is usually referred to as back-propagation through time (BPTT). Looking at this in more detail, we can observe how the overall gradient of each of the weights or biases can be written as \[ \frac{\partial \mathscr{L}}{\partial \cdot} = \sum_{t=1}^{T_x} \frac{\partial \mathscr{L}^{<t>}}{\partial \cdot} \] or, in other words, the gradient accumulates over the unrolled graph.
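Putting the pieces above together, the sketch below implements the basic RNN block, unrolls it over a short sequence, sums a per-step loss, and lets autograd perform BPTT over the unrolled graph. Sizes follow the \(N_x=2\) , \(N_y=3\) , \(N_h=5\) example, while the use of nn.Linear layers and an MSE per-step loss is an illustrative assumption.

```python
import torch
import torch.nn as nn

class BasicRNNCell(nn.Module):
    """Sketch of the basic RNN block defined above (one time step)."""
    def __init__(self, N_x, N_h, N_y):
        super().__init__()
        self.Wh = nn.Linear(N_h, N_h, bias=False)            # W_h
        self.Wx = nn.Linear(N_x, N_h, bias=True)              # W_x and b_a
        self.Wy = nn.Linear(N_h, N_y, bias=True)              # W_y and b_y

    def forward(self, x_t, h_prev):
        h_t = torch.tanh(self.Wh(h_prev) + self.Wx(x_t))      # h^<t> = sigma(a^<t>)
        return h_t, self.Wy(h_t)                               # o^<t> (sigma' left to the loss)

N_x, N_h, N_y, T_x = 2, 5, 3, 4
cell = BasicRNNCell(N_x, N_h, N_y)
print(sum(p.numel() for p in cell.parameters()))               # 58 = N_h(N_h+N_x)+N_y N_h+N_h+N_y

x, y = torch.randn(T_x, 1, N_x), torch.randn(T_x, 1, N_y)
h, loss = torch.zeros(1, N_h), 0.0
for t in range(T_x):                                           # unrolled forward pass
    h, o = cell(x[t], h)
    loss = loss + nn.functional.mse_loss(o, y[t])              # per-step losses are summed
loss.backward()                                                # BPTT over the unrolled graph
```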
Note also that, \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathscr{L}^{}} = 1, \\qquad \\frac{\\partial \\mathscr{L}}{\\partial \\cdot} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathscr{L}^{}} \\frac{\\partial \\mathscr{L}^{}}{\\partial \\cdot} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}^{}}{\\partial \\cdot} \\] Let's now look more in details at the equations of backpropagation through time for a specific case of multi-label classification. More specifically we assume that the output of each step of the recurrent network ( \\(\\mathbf{o}^{}\\) ) is passed through a softmax to get \\(\\hat{\\mathbf{y}}^{}= \\sigma' (\\mathbf{o}^{})\\) , and the loss in the negative log-likelihood of a Multinoulli distribution. Moreover, we will use tanh for the internal activation function \\(\\sigma\\) . Starting from the gradients of the internal nodes: \\[ \\left(\\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{}}\\right)_i = \\hat{y}_i^{} - \\mathbf{1}_{i=y^{}} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} = \\frac{\\partial \\mathscr{L}^{}}{\\partial \\mathbf{o}^{}} \\frac{\\partial \\mathbf{o}^{}}{\\partial \\mathbf{h}^{}} = \\mathbf{W}_y^T (\\hat{\\mathbf{y}}^{} - \\mathbf{1}_{i=y^{}}) \\] \\[ \\begin{aligned} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} &= \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{}} \\frac{\\partial \\mathbf{o}^{}}{\\partial \\mathbf{h}^{}} + \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} \\frac{\\partial \\mathbf{h}^{}}{\\partial \\mathbf{h}^{}} \\\\ &= \\mathbf{W}_y^T (\\hat{\\mathbf{y}}^{} - \\mathbf{1}_{i=y^{}}) + \\mathbf{W}_h^T diag(1 - (\\mathbf{h}^{})^2) \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} \\end{aligned} \\] where \\(\\mathbf{1}_{i=y^{}}\\) is a vector of zeros with 1 at location of the true label, i.e. \\(i=y^{}\\) , \\(diag(1 - (\\mathbf{h}^{})^2)\\) is the Jacobian of the tanh activation function, and \\(\\partial \\mathscr{L} / \\partial \\mathbf{h}^{}\\) is computed recursively from \\(t+1=T_x\\) as we know \\(\\partial \\mathscr{L} / \\partial \\mathbf{h}^{}\\) . Moreover, it is worth noting how the gradient of the loss function over any hidden state \\(\\mathbf{h}^{}\\) is composed of two terms, one coming directly from the corresponding output \\(\\mathbf{o}^{}\\) and one from the next hidden state \\(\\mathbf{h}^{}\\) . 
It follows that the gradients of the parameters to update are: \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{b}_y} = \\sum_{t=1}^{T_x} \\left( \\frac{\\partial \\mathbf{o}^{}}{\\partial \\mathbf{b}_y} \\right)^T \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{}} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{}} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{b}_a} = \\sum_{t=1}^{T_x} \\left( \\frac{\\partial \\mathbf{h}^{}}{\\partial \\mathbf{b}_a} \\right)^T \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} = \\sum_{t=1}^{T_x} diag(1 - (\\mathbf{h}^{})^2) \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{W}_y} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{}} \\frac{\\partial \\mathbf{o}^{}}{\\partial \\mathbf{W}_y} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{}} \\mathbf{h}^{T} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{W}_h} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} \\frac{\\partial \\mathbf{h}^{}}{\\partial \\mathbf{W}_h} = \\sum_{t=1}^{T_x} diag(1 - (\\mathbf{h}^{})^2) \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} \\mathbf{h}^{T} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{W}_x} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} \\frac{\\partial \\mathbf{h}^{}}{\\partial \\mathbf{W}_x} = \\sum_{t=1}^{T_x} diag(1 - (\\mathbf{h}^{})^2) \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{}} \\mathbf{x}^{T} \\] Inference At test time, the evaluation of a RNN is straightforward. We usually simply need to pass through the forward pass and get the output \\(\\hat{\\mathbf{y}}^{}\\) . However, this is not always true, especially in the following two cases: \\(T_x=1, T_y>1\\) (generative network) \\(T_x, T_y\\) (encoder-decoder network) as in both cases we will be required to use the output at a given step ( \\(\\hat{\\mathbf{y}}^{}\\) ) as part of the input to produce the output of the next step ( \\(\\hat{\\mathbf{y}}^{}\\) ). These two scenarios are dominant in so-called Language Modelling for tasks where we want to generate sentences given some initial guess (e.g., first word) or perform language-to-language translation. However, similar concepts could also be used to for example generate well logs or seismograms. Let's briefly take a look at some of the required changes in the inference process of these 2 network types. First of all, in conventional cases our loss function can be written as: \\[ \\begin{aligned} \\mathscr{L} &= \\prod_{t=1}^{T_x} P (\\mathbf{y}^{} | \\mathbf{x}^{<1>}, \\mathbf{x}^{<2>}, ..., \\mathbf{x}^{}) \\\\ &= - \\sum_{t=1}^{T_x} log P (\\mathbf{y}^{} | \\mathbf{x}^{<1>}, \\mathbf{x}^{<2>}, ..., \\mathbf{x}^{}) \\end{aligned} \\] where each output is here totally independent from the others. On the other hand, we are now faced with a joint distribution to sample from: \\[ \\begin{aligned} \\mathscr{L} &= P (\\mathbf{y}^{<1>}, \\mathbf{y}^{<2>},..., \\mathbf{y}^{})\\\\ &= - \\prod_{t=1}^{T_x} log P (\\mathbf{y}^{} | \\mathbf{y}^{<1>}, \\mathbf{y}^{<2>},..., \\mathbf{y}^{}) \\end{aligned} \\] Evaluating such a probability is not a big deal during training as we can simply use the true labels as inputs (similarly to the more conventional network architectures where we use \\(\\mathbf{x}^{}\\) ) instead. 
However, at inference stage we do not have access to the exact previous outputs when evaluating the current one. In order to simplify the evaluation of such a probability, we are therefore required to make an assumption: more specifically, we assume that the outputs can be modelled as a Markov Chain. In other words, we assume that the current output depends only on the previous one and not all of the other previous outputs. We can therefore write: \\[ \\mathscr{L} \\approx - \\prod_{t=1}^{T_x} log P (\\mathbf{y}^{} | \\hat{\\mathbf{y}}^{}) \\] which can be easily evaluated by placing the prediction at step \\(t-1\\) as input to step \\(t\\) . However, when we are interested in using our trained RNN for generative tasks, this approach comes with a limitation. It is in fact deterministic, and therefore we can only create a single output sequence. A more sophisticated procedure can be designed such that we can take advantage of our predictions in terms of their probabilities (and not the most probable outcome). Given \\(P (\\mathbf{y}^{} | ...)\\) (from, e.g., before a softmax later), what we can instead do is to sample one value of \\(\\mathbf{y}^{}\\) and feed it to the next step of our recurrent network. If we now repeat the same procedure multiple times, we will produce a bunch of different sequences. Finally, we could go even one step beyond and sample multiple values at step \\(t-1\\) , feed them concurrently to the next step (or the next N steps) and evaluate which one(s) has the highest joint probability, then go back to step \\(t-1\\) and choose that value(s). This procedure, usually referred as Beam Search , is however beyond the scope of this lecture. Bidirectional RNN Up until now, we have tried to construct NNs that can learn from short and long term patterns in the data in a causal fashion: in other words, by feeding our time series from left to right to the network we allow it at every time step \\(t\\) to learn dependencies from the past \\((t-1,t-2,t-i)\\) . This is very useful for streaming data where we record the data sequentially from \\(t=0\\) to \\(t=T_x\\) , and we do not want to wait until the entire data has been collected before we can make some predictions. This is usually referred to as online processing. An example of such a scenario is represented by real-time drilling, when we drill a hole into the subsurface and record some measurements whilst doing so. We would like a machine to process such recordings as they come in and provide us with useful insights on how to best continue drilling: Of course, not every problem lends naturally to the above depicted scenario. In most cases we are able to record data over an entire time window and only after that we are concerned with analyzing such data. This is usually referred to as offline processing. In this case it may be useful to also look at correlations between samples at time \\(t\\) and future samples \\((t+1,t+2,t+i)\\) . Bidirectional RNNs represent a solution to this as they allow learning short and long term dependencies not only from the past but also from the future. Let's start with a schematic diagram: where the network architecture presents a simple modification. Instead of having a single flow of information from left to right as it is the case for basic RNNs, we have now added a second flow of information from right to left. The hidden states of the first have been labelled with the suffix F (for forward), and those of the second with the suffix B (for backward). 
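Going back to the autoregressive inference procedure discussed above (before turning to bidirectional RNNs), here is a minimal sketch of the greedy and sampling strategies; rnn_step is a hypothetical, user-supplied step function returning class probabilities and an updated hidden state, not an actual library call, and the dummy model at the end only serves to exercise the loop.

```python
import torch
import torch.nn.functional as F

def generate(rnn_step, y0, h0, n_steps, greedy=True):
    """Sketch of autoregressive inference: the prediction at step t-1 is fed
    back as input at step t."""
    y, h, outputs = y0, h0, []
    for _ in range(n_steps):
        probs, h = rnn_step(y, h)                        # P(y^<t> | y^<t-1>, h)
        if greedy:                                       # deterministic: one sequence only
            idx = probs.argmax(dim=-1)
        else:                                            # sampling: a different sequence each run
            idx = torch.multinomial(probs, num_samples=1).squeeze(-1)
        y = F.one_hot(idx, probs.shape[-1]).float()      # feed the prediction back as next input
        outputs.append(idx)
    return torch.stack(outputs)

# Dummy stand-in for a trained RNN step, just to exercise the loop
def dummy_step(y, h):
    return torch.softmax(h @ torch.ones(4, 3) + y.sum(), dim=-1), h

print(generate(dummy_step, torch.zeros(1, 3), torch.randn(1, 4), n_steps=5, greedy=False))
```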
The inputs remain unchanged, apart from the fact that they are now fed twice to the network, once for the forward flow and once for the backward flow, whilst the output is now the concatenation of the outputs of the two flows, i.e., \(\hat{\mathbf{y}}^{<t>} = [\hat{\mathbf{y}}_F^{<t>T} \; \hat{\mathbf{y}}_B^{<t>T}]^T\) . Deep RNNs Similarly to any other network architecture that we have investigated so far, the concept of shallow and deep networks also applies to RNNs. Shallow RNNs are recurrent networks that have a single hidden layer connecting the inputs to the outputs. On the other hand, deep RNNs are composed of more hidden layers. This is simply achieved as follows: First layer input: \(\mathbf{x}^{<t>}\) , hidden and output: \(\mathbf{h}_0^{<t>}\) , Second layer input: \(\mathbf{h}_0^{<t>}\) , hidden and output: \(\mathbf{h}_1^{<t>}\) , Last layer input: \(\mathbf{h}_{N-1}^{<t>}\) , hidden: \(\mathbf{h}_N^{<t>}\) , output: \(\hat{\mathbf{y}}^{<t>}\) . that we can visually represent as: Mathematically, a deep RNN can be simply expressed as follows. For \(i=0,1,...,N-1\) (with \(\mathbf{h}_{-1}=\mathbf{x}\) ) \[ \begin{aligned} \mathbf{a}_i^{<t>} &= \mathbf{W}_{h_i} \mathbf{h}_i^{<t-1>} + \mathbf{W}_{x_i} \mathbf{h}_{i-1}^{<t>} + \mathbf{b}_{a_i} \\ \mathbf{h}_i^{<t>} &= \sigma(\mathbf{a}_i^{<t>} ) \\ \end{aligned} \] For \(i=N\) \[ \begin{aligned} \mathbf{a}_N^{<t>} &= \mathbf{W}_{h_N} \mathbf{h}_N^{<t-1>} + \mathbf{W}_{x_N} \mathbf{h}_{N-1}^{<t>} + \mathbf{b}_{a_N} \\ \mathbf{h}_N^{<t>} &= \sigma(\mathbf{a}_N^{<t>} ) \\ \mathbf{o}^{<t>} &= \mathbf{W}_y \mathbf{h}_N^{<t>} + \mathbf{b}_y \\ \hat{\mathbf{y}}^{<t>} &= \sigma' (\mathbf{o}^{<t>}) \\ \end{aligned} \] Long-term dependencies: implications for gradients In this section, we will discuss a long-standing challenge arising when implementing backpropagation through an RNN. A number of solutions to circumvent this problem will be presented in the following sections. Let's start by considering the forward pass of a recurrent network. For the information to flow from left to right, a recurrent network repeatedly applies the matrix \(\mathbf{W}_h\) to the hidden state vectors (interleaved by nonlinear transformations): as already discussed in the Optimization lecture, this leads to raising the eigenvalues of this matrix to the power of \(T_x\) . Eigenvalues smaller than one decay very fast to zero, whilst those bigger than one grow exponentially fast to infinity. As a consequence, only the part of the initial vector \(\mathbf{h}^{<0>}\) aligned with the largest eigenvectors successfully propagates through the network, whilst the other components become insignificant after a few steps. So, no matter how we choose the initial weights of the network and hidden state, long-term dependencies tend to become irrelevant when compared to short-term ones in terms of their contribution to the gradient. In other words, the network will take a long time to train and learn long-term dependencies. In order to avoid that, a number of strategies have been proposed in the literature. In the following, we will look at three of them: the first tries to circumvent this problem as part of the learning process, whilst the latter two tackle the issue from the perspective of the network architecture design. Needless to say, these are the preferred choices nowadays when using RNNs. Gradient clipping We have previously mentioned that one simple strategy to prevent exploding gradients is represented by so-called gradient clipping.
As the name suggests, this is applied only during the backward pass, to gradients that exceed a given threshold. A forward-backward pass with gradient clipping can therefore be written as: Forward pass: \(\hat{\mathbf{y}}^{<t>} = f_\theta(\mathbf{x}^{<t>} , \mathbf{h}^{<0>}) \; \forall t=0,1,...T_x\) Backward pass: \(\partial \mathscr{L} / \partial \theta\) Gradient clipping: if \(|\partial \mathscr{L} / \partial \theta| > th\) , then \(\partial \mathscr{L} / \partial \theta = sign(\partial \mathscr{L} / \partial \theta) \cdot th\) Unfortunately, a similarly simple trick does not exist for the other problem, vanishing gradients. So, whilst adopting this strategy will avoid instabilities in the training of basic RNNs, the training process will still be painfully slow. Gated recurrent networks or GRU unit The most effective family of networks that can tackle both the exploding and vanishing gradient problems is called Gated networks . As the name implies, a gate is introduced in each block of the network to help information flow and be used by later units without vanishing and exploding gradient issues. By doing so, the gate helps the network remember some information from early steps, use it much later down the flow, and eventually forget about it. A GRU unit can be simply seen as a classical RNN unit with a number of small modifications. Let's start by drawing them side-by-side (note that for the moment we are considering a simplified GRU block): Apart from a slight change in name ( \(\mathbf{h}^{<t>}\) has been replaced by \(\mathbf{c}^{<t>}\) , which stands for memory cell), compared to the basic RNN the GRU block contains a number of additional internal states. More specifically: \(\tilde{\mathbf{c}}^{<t>}\) : the candidate replacement for the memory cell. It is a candidate since in some cases it will not be used; rather, the current memory cell will be fast-tracked to allow learning long-term dependencies. \(\Gamma_u\) : update gate, which is responsible for choosing whether to pass the candidate memory cell \(\tilde{\mathbf{c}}^{<t>}\) or the previous memory cell \(\mathbf{c}^{<t-1>}\) to the next layer. The associated update equations for this simplified GRU block are: \[ \begin{aligned} &\boldsymbol \Gamma_{u}=\sigma\left(\mathbf{W}_{u}\left[\begin{array}{c} \mathbf{c}^{<t-1>} \\ \mathbf{x}^{<t>} \end{array}\right]+\mathbf{b}_{u}\right) \\ &\tilde{\mathbf{c}}^{<t>}=\tanh \left(\mathbf{W}_{c}\left[\begin{array}{c} \mathbf{c}^{<t-1>} \\ \mathbf{x}^{<t>} \end{array}\right]+\mathbf{b}_{c}\right) \\ &\mathbf{c}^{<t>}=\boldsymbol \Gamma_{u} \cdot \tilde{\mathbf{c}}^{<t>}+\left(1-\boldsymbol \Gamma_{u}\right) \cdot \mathbf{c}^{<t-1>}\\ &\hat{\mathbf{y}}^{<t>}=\sigma' (\mathbf{W}_y \mathbf{c}^{<t>} + \mathbf{b}_{y}) \end{aligned} \] In the last equation, the new memory cell is computed as the linear interpolation between the old memory cell and the candidate one. However, since a sigmoid is usually chosen for the update gate, \(\boldsymbol \Gamma_{u}\) roughly acts as a binary gate (0-stop, 1-pass). This way, the gate can stop the flow of new information for a number of steps, allowing the old information to be moved further up the flow without being multiplied by the weight matrix, therefore creating long-term dependencies that do not suffer from the vanishing gradient problem.
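Before looking at the full GRU, here is a minimal sketch of the simplified block above; layer sizes are illustrative, and the stacked vector \([\mathbf{c}^{<t-1>}, \mathbf{x}^{<t>}]\) is implemented by concatenation followed by a single nn.Linear per gate.

```python
import torch
import torch.nn as nn

class SimplifiedGRUCell(nn.Module):
    """Sketch of the simplified GRU block described above (one time step)."""
    def __init__(self, N_x, N_c):
        super().__init__()
        self.Wu = nn.Linear(N_c + N_x, N_c)   # update gate weights and bias (W_u, b_u)
        self.Wc = nn.Linear(N_c + N_x, N_c)   # candidate memory weights and bias (W_c, b_c)

    def forward(self, x_t, c_prev):
        cx = torch.cat([c_prev, x_t], dim=-1)
        gamma_u = torch.sigmoid(self.Wu(cx))               # update gate
        c_tilde = torch.tanh(self.Wc(cx))                  # candidate memory cell
        c_t = gamma_u * c_tilde + (1 - gamma_u) * c_prev   # interpolate old and candidate cells
        return c_t

cell = SimplifiedGRUCell(N_x=2, N_c=5)
c = torch.zeros(1, 5)
for t in range(10):                      # unrolled over an illustrative sequence
    c = cell(torch.randn(1, 2), c)
print(c.shape)                           # torch.Size([1, 5])
```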
To conclude, let's look at the real GRU and its equations, which introduces an additional gate called the relevance or reset gate \\(\\boldsymbol \\Gamma_{r}\\) : \\[ \\begin{aligned} &\\boldsymbol \\Gamma_{u}=\\sigma\\left(\\mathbf{W}_{u}\\left[\\begin{array}{c} \\mathbf{c}^{} \\\\ \\mathbf{x}^{} \\end{array}\\right]+\\mathbf{b}_{u}\\right) \\\\ &\\boldsymbol \\Gamma_{r}=\\sigma\\left(\\mathbf{W}_{r}\\left[\\begin{array}{c} \\mathbf{c}^{} \\\\ \\mathbf{x}^{} \\end{array}\\right]+\\mathbf{b}_{r}\\right) \\\\ &\\tilde{\\mathbf{c}}^{}=\\tanh \\left(\\mathbf{W}_{c}\\left[\\begin{array}{c} \\boldsymbol \\Gamma_{r} \\cdot \\mathbf{c}^{} \\\\ \\mathbf{x}^{} \\end{array}\\right]+\\mathbf{b}_{c}\\right) \\\\ &\\mathbf{c}^{}=\\boldsymbol \\Gamma_{u} \\cdot \\tilde{\\mathbf{c}}^{}+\\left(1-\\boldsymbol \\Gamma_{u}\\right) \\cdot \\mathbf{c}^{}\\\\ &\\hat{\\mathbf{y}}^{}=\\sigma' (\\mathbf{W}_y \\mathbf{c}^{} + \\mathbf{b}_{y}) \\end{aligned} \\] Long-short term memory (LSTM) unit Another popular, probably the most popular, RNN block that mitigates the vanishing gradient problem is called LSTM block. It uses similar concepts to those introduced for the GRU block, but at the same time introduces a number of additional hidden states, namely: \\(\\Gamma_f\\) : forget gate, which provides more flexibility when updating the memory cell with the old and candidate memory cells. More specifically, whilst in the GRU block, the new memory cell was a linear combination of those two terms, now we have two independent weights (both of them learned) that can allow passing more or less information from the two inputs instead of having to weight their total contribution to 1. \\(\\Gamma_o\\) : output gate; \\[ \\begin{aligned} &\\boldsymbol{\\Gamma}_{u}=\\sigma\\left(\\boldsymbol{W}_{u}\\left[\\begin{array}{c} \\boldsymbol{h}^{} \\\\ \\boldsymbol{x}^{} \\end{array}\\right]+\\boldsymbol{b}_{u}\\right) \\\\ &\\boldsymbol{\\Gamma}_{\\boldsymbol{f}}=\\sigma\\left(\\boldsymbol{W}_{f}\\left[\\begin{array}{c} h^{} \\\\ x^{} \\end{array}\\right]+\\boldsymbol{b}_{f}\\right) \\\\ &\\boldsymbol{\\Gamma}_{o}=\\sigma\\left(\\boldsymbol{W}_{o}\\left[\\begin{array}{c} \\boldsymbol{h}^{} \\\\ \\boldsymbol{x}^{} \\end{array}\\right]+\\boldsymbol{b}_{o}\\right) \\\\ &\\tilde{\\boldsymbol{c}}^{}=\\tanh \\left(\\boldsymbol{W}_{c}\\left[\\begin{array}{c} \\boldsymbol{h}^{} \\\\ \\boldsymbol{x}^{} \\end{array}\\right]+\\boldsymbol{b}_{c}\\right) \\\\ &\\boldsymbol{c}^{}=\\boldsymbol{\\Gamma}_{u} \\tilde{\\boldsymbol{c}}^{}+\\boldsymbol{\\Gamma}_{f} \\boldsymbol{c}^{} \\\\ &\\boldsymbol{h}^{}=\\boldsymbol{\\Gamma}_{o} \\tanh \\left(\\boldsymbol{c}^{}\\right) \\\\ &\\boldsymbol{y}^{}=\\sigma^{\\prime}\\left(\\boldsymbol{W}_{y} \\boldsymbol{h}^{}+\\boldsymbol{b}_{y}\\right) \\end{aligned} \\] Present and future of sequence modelling Finally, it is worth noting that the field of sequence modelling with deep neural networks has been taken by a storm a couple of years ago with novel architectures that have led to great improvements in the field of Natural Language Processing. The first innovation, which goes under the name of Attention layer has been initially introduced to mitigate one of the main limitations of the encoder-decoder RNN architecture that we have extensively discussed in this lecture. 
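Before leaving gated RNNs for attention-based models, it is worth noting that in practice one rarely writes these blocks by hand: PyTorch ships GRU and LSTM layers that implement (variants of) the equations above. A short usage sketch with purely illustrative sizes, also showing that a bidirectional LSTM doubles the number of output features:

```python
import torch
import torch.nn as nn

# Illustrative shapes: batch of 8 sequences, 100 time steps, 2 features,
# 16 hidden units, 2 stacked (deep RNN) layers
x = torch.randn(8, 100, 2)

gru = nn.GRU(input_size=2, hidden_size=16, num_layers=2, batch_first=True)
lstm = nn.LSTM(input_size=2, hidden_size=16, num_layers=2, batch_first=True,
               bidirectional=True)      # bidirectional doubles the output features

out_gru, h_n = gru(x)                   # out: hidden state at every time step
out_lstm, (h_n, c_n) = lstm(x)          # LSTM also returns the memory cell state
print(out_gru.shape, out_lstm.shape)    # torch.Size([8, 100, 16]) torch.Size([8, 100, 32])
```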
More specifically, the attention layer can find global correlations between the input(s) of the decoder layer and any of the hidden states of the encoder, avoiding the problem of having a bottleneck at the end of the encoder and a single hidden state that is required to encode the information of the various inputs of the encoder. The attention layer has later led to the design of a completely new type of neural network architecture, the so-called Transformer layer. In this case, instead of processing the input sequentially as in RNNs, the transformer layer takes all the inputs at once and find both local and global correlations by means of so-called self-attention blocks. Additional readings If you are interested to learn more about attention and transformer layers, I recommend watching this lecture from the KAUST Summer School on Unstructured Data in Geoscience","title":"Sequence modelling"},{"location":"lectures/12_seqmod/#sequence-modelling","text":"In this lecture we will start investigating a family of Neural Network that are particularly suitable for learning tasks that involve sequences as input data. To understand what a sequence is in the context of Deep learning, let's consider a recording over time (e.g., an audio recording): Compared to other dataset types (e.g., tabular or gridded data), the different samples of a sequence present an obvious degree of correlation that tends to diminuish the further away to samples are from each other. Moreover, in the case of multi-feature sequences (e.g., multi-component seismological recordings), the overall sequence contains a number of features at each time step that can be more or less correlated to each other. Sequences appear in every aspect of life. For example, outside of geoscience, the two most commonly used data in sequence modelling are: text audio More specifically, as we will see, the field of Natural Language Processing (NPL) has experienced a revolutionary growth in the last decade thanks to sequence modelling and deep learning. In geoscience, many of the commonly used datasets can also be interpreted as sequences, for example: seismograms well logs production data are all datatypes that present a certain degree of correlation along either the time or depth axis. Finally, similar to FFNs or CNNs, sequence modelling can be used for various applications: Single output classification: given an input sequence of a certain length \\(\\mathbf{x}\\) , a model is trained to decide whether than sequence contains a feature of interest or not. For example, given a seismogram we may be interest to detect the presence of a seismic event, or we may want to find out if a well log is clean or corrupted by some recording error or what is the facies in the middle of the sequence; Multi output classification (i.e., semantic segmentation): given an input sequence of a certain length \\(\\mathbf{x}\\) , a model is trained to classify each element of the input sequence into a predefined set of classes. Taking once again the example of facies labelling, here the task is extended to predicting labels at each depth level (and not only in the middle of the sequence); Regression: given an input sequence of a certain length \\(\\mathbf{x}\\) , a model is trained to predict a continuous output, which could be a single value \\(y\\) or a sequence of values \\(\\mathbf{y}\\) that has the same (or different length) of the input. For example, given a set of well logs we may want to predict another one that was not acquired. 
Similarly, given a seismic trace recorded by the vertical component of a geophone we may be interested to predict the horizontal components. Both of these example fall under the area of domain translation ;","title":"Sequence modelling"},{"location":"lectures/12_seqmod/#motivation","text":"Let's start by considering what we have learned so far and discuss how we could use those tools to handle sequential data. First of all, we consider a sequence of \\(N_\\tau\\) samples and \\(N_f\\) features: \\[ \\mathbf{X} = \\begin{bmatrix} x_1^{<1>} & x_1^{<2>} & x_1^{N_\\tau} \\\\ ... & ... & ... \\\\ x_{N_f}^{1} & x_1^{<2>} & x_{N_f}^{N_\\tau} \\end{bmatrix} = \\begin{bmatrix} \\mathbf{x}^{<1>} & \\mathbf{x}^{<2>} & \\mathbf{x}^{} \\end{bmatrix}_{[N_f \\times N_\\tau]} \\] we could easily deal with this as if it was a 2D-array (i.e., an image) and use CNNs. However, the locality argument used for the convolutional filters that constitute a convolutional layer would not make much sense here, especially if we know that elements in the sequence away from each other may still have a certain degree of correlation. Alternatively, the matrix \\(\\mathbf{X}\\) could be simply vectorized and used as input to a FFN. This approach does however present two main limitations: since the vector \\(vec(\\mathbf{X})\\) is likely to be very long, weight matrices will be very large leading to a very expensive training process; FFNs cannot easily handle inputs of variable lengths, so all sequences will need to have fixed length. We will see that being able to handle variable-length sequences is very useful in some situations. Both problems can be overcome by taking advantage of parameter sharing . We have already introduced this concept in the context of CNNs, where the same filters are used in different parts of the input. Similarly in sequence modelling, the idea of parameter sharing allows using the same parameters at different stages of the sequence and therefore allows the network to easily handle sequences of variable length. By doing so, a new type of neural network is created under the name of Recurrent Neural Network (RNN): where \\(\\mathbf{x}\\) is the input vector (or matrix when multiple features are present), \\(\\mathbf{y}\\) is the output vector, and \\(\\mathbf{h}\\) is the so called hidden state vector. As clearly shown in the unrolled version of the network into a standard computational graph, various inputs and hidden states are passed through the same function \\(f_\\theta\\) with a given number of training parameters. This is very different from a feed-forward network where different functions is are used over consecutive layers. The choice of the function \\(f_\\theta\\) leads to the definition of different RNN architectures. Before we begin introducing a number of popular architectures for sequence modelling, let's introduce some useful notation. Inputs and outputs of a RNNs will be always defined as follows: \\[ \\mathbf{X} = \\begin{bmatrix} \\mathbf{x}^{<1>} & \\mathbf{x}^{<2>} & \\mathbf{x}^{} \\end{bmatrix}_{[N_f \\times T_x]} \\] and \\[ \\mathbf{Y} = \\begin{bmatrix} \\mathbf{y}^{<1>} & \\mathbf{y}^{<2>} & \\mathbf{y}^{} \\end{bmatrix}_{[N_t \\times T_y]} \\] where \\(T_x\\) and \\(T_y\\) are the length of the input and output sequences. 
First, note that this notation differs from before in that a single training sample is now represented as a matrix; therefore, the entire training data becomes a 3-D tensor of size \\([N_s \\times N_f \\times T_x]\\) (and \\([N_s \\times N_t \\times T_y]\\)). Finally, note that in the most general case these parameters may be sample dependent (i.e., when we allow sequences of variable size): the following notation will be used in that case, \\(T_x^{(i)}\\) and \\(T_y^{(i)}\\), where \\(i\\) refers to the i-th training sample. Moreover, given that we recurrently apply the same function \\(f_\\theta\\), we can very compactly write an RNN as: \\[ \\mathbf{h}^{<t>}, \\mathbf{y}^{<t>}=f_\\theta(\\mathbf{h}^{<t-1>}, \\mathbf{x}^{<t>}) \\qquad t=1,2,...,T_x \\] that we can unroll into: \\[ \\mathbf{h}^{<T_x>}, \\mathbf{y}^{<T_x>}=f_\\theta(f_\\theta(...f_\\theta(\\mathbf{h}^{<0>}, \\mathbf{x}^{<1>})..., \\mathbf{x}^{<T_x-1>}), \\mathbf{x}^{<T_x>}) \\] As we have already briefly mentioned, RNNs allow some flexibility in the choice of \\(T_y\\) (i.e., the length of the output sequence). This leads to the creation of different network architectures that are suitable for different tasks: Note that in cases 3 and 4, the predicted output is fed back to the network as input to the next step at the inference stage, as shown in the figure above. At the training stage, however, the true output is used as input. In summary, what we wish to achieve here is to create a network that can learn both short- and long-term relationships in the data, such that both samples close to each other and samples far away can help in the prediction of the current step. By using parameter sharing in a smart way, we can avoid overparametrizing the network and therefore limit the risk of overfitting on short- and long-term trends in the data. In other words, by assuming stationarity in the data, we let the network understand if steps \\(t\\) and \\(t+N_t\\) are correlated to each other across the entire time sequence, instead of giving the network the freedom to find relationships between any two samples in the sequence.","title":"Motivation"},{"location":"lectures/12_seqmod/#basic-rnn","text":"","title":"Basic RNN"},{"location":"lectures/12_seqmod/#architecture","text":"It is now time to discuss in more detail what an effective function \\(f_\\theta\\) looks like. The most basic Recurrent Neural Network can be written as follows: \\[ \\begin{aligned} \\mathbf{a}^{<t>} &= \\mathbf{W}_h \\mathbf{h}^{<t-1>} + \\mathbf{W}_x \\mathbf{x}^{<t>} + \\mathbf{b}_a = \\mathbf{W} [\\mathbf{h}^{<t-1>}, \\mathbf{x}^{<t>}]^T + \\mathbf{b}_a \\\\ \\mathbf{h}^{<t>} &= \\sigma(\\mathbf{a}^{<t>}) \\\\ \\mathbf{o}^{<t>} &= \\mathbf{W}_y \\mathbf{h}^{<t>} + \\mathbf{b}_y \\\\ \\hat{\\mathbf{y}}^{<t>} &= \\sigma' (\\mathbf{o}^{<t>}) \\end{aligned} \\] where: \\(\\sigma\\) and \\(\\sigma'\\) are the activation functions for the hidden and output paths (the choice of the activation for the latter depends on the problem we wish to solve, e.g., a sigmoid for binary classification or a softmax for multi-class classification) \\(\\mathbf{h}^{<0>}\\) is the initial hidden state vector, which is usually initialized as a zero vector. \\(\\mathbf{W} = [\\mathbf{W}_h, \\mathbf{W}_x]_{[N_h \\times (N_h + N_x)]}\\) is the matrix of weights for the hidden path \\(\\mathbf{W}_{y \\; [N_y \\times N_h]}\\) is the matrix of weights for the output path In conclusion, the learnable parameters for this kind of RNN block are \\(\\mathbf{W}_h, \\mathbf{W}_x, \\mathbf{W}_y, \\mathbf{b}_a, \\mathbf{b}_y\\), whose overall size is \\(N_h(N_h+N_x) + N_y N_h + N_h + N_y\\). A minimal code sketch of such a block is shown below.
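As a concrete illustration of this block (a minimal sketch only: the class name BasicRNNCell is hypothetical, PyTorch is assumed, and tanh/softmax are used for \\(\\sigma\\) and \\(\\sigma'\\)), the following code implements the four equations above and checks the parameter count against the formula:

```python
import torch
import torch.nn as nn

class BasicRNNCell(nn.Module):
    # hypothetical minimal cell implementing a<t>, h<t>, o<t>, yhat<t>
    def __init__(self, N_x, N_h, N_y):
        super().__init__()
        self.Wx = nn.Linear(N_x, N_h, bias=True)    # W_x and b_a
        self.Wh = nn.Linear(N_h, N_h, bias=False)   # W_h
        self.Wy = nn.Linear(N_h, N_y, bias=True)    # W_y and b_y

    def forward(self, x_t, h_prev):
        a_t = self.Wh(h_prev) + self.Wx(x_t)        # a<t> = W_h h<t-1> + W_x x<t> + b_a
        h_t = torch.tanh(a_t)                       # h<t> = sigma(a<t>)
        o_t = self.Wy(h_t)                          # o<t> = W_y h<t> + b_y
        y_t = torch.softmax(o_t, dim=-1)            # yhat<t> = sigma'(o<t>)
        return h_t, y_t

N_x, N_h, N_y = 2, 5, 3
cell = BasicRNNCell(N_x, N_h, N_y)
n_params = sum(p.numel() for p in cell.parameters())
print(n_params)   # N_h*(N_h+N_x) + N_y*N_h + N_h + N_y = 35 + 15 + 5 + 3 = 58
```

Note that during training one would typically return the logits \\(\\mathbf{o}^{<t>}\\) and let the loss function apply the softmax, as done in the loss sketch further below.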
To give some perspective, this parameter count is much smaller than the number of learnable parameters of an 'equivalent' Feed-Forward network where the entire input matrix \\(\\mathbf{X}\\) is flattened into a 1-d array of size \\(N_x T_x\\) and the entire output matrix \\(\\mathbf{Y}\\) is flattened into a 1-d array of size \\(N_y T_y\\). The equivalent weight matrix and bias vector have sizes \\(N_x N_y T_x T_y\\) and \\(N_y T_y\\), respectively. For example, given a problem of size \\(N_x=2\\), \\(N_y=3\\), \\(N_h=5\\), and \\(T_x=T_y=4\\), we obtain \\(N_{FFN}=108\\) and \\(N_{RNN}=58\\).","title":"Architecture"},{"location":"lectures/12_seqmod/#loss","text":"Once the architecture is defined, the next step is to understand how the loss function should be defined for this kind of network. As shown in the figure below, this can be simply accomplished by considering a loss function per time step and summing them together: \\[ \\mathscr{L} = \\sum_{t=1}^{T_x} \\mathscr{L}^{<t>}, \\qquad \\mathscr{L}^{<t>}= f(\\hat{\\mathbf{y}}^{<t>}, \\mathbf{y}^{<t>}) \\] where \\(f\\) can be the MSE, MAE, BCE, etc. This loss function can be easily interpreted in probabilistic terms as: \\[ f \\rightarrow -\\log P (\\mathbf{y}^{<t>} | \\mathbf{x}^{<1>}, \\mathbf{x}^{<2>}, ..., \\mathbf{x}^{<t>}) \\] To conclude, we note that the process of evaluating the various terms of the loss function is sequential, as the previous hidden state is required to evaluate the current output. This can be very expensive and does not allow for parallelization (other than across training samples), similar to the case of very deep feedforward neural networks.","title":"Loss"},{"location":"lectures/12_seqmod/#backprop","text":"Given the loss function defined above, the computation of its gradient easily follows the principles that we have already extensively discussed in previous lectures; in simple terms, the backpropagation algorithm is applied to the unrolled computational graph in order to obtain the gradients of the weights and biases of the network block. Backpropagation over an RNN block is usually referred to as back-propagation through time (BPTT). Looking at this in more detail, we can observe how the overall gradient of each of the weights or biases can be written as \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\cdot} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}^{<t>}}{\\partial \\cdot} \\] or, in other words, the gradient accumulates over the unrolled graph. Note also that \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathscr{L}^{<t>}} = 1, \\qquad \\frac{\\partial \\mathscr{L}}{\\partial \\cdot} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathscr{L}^{<t>}} \\frac{\\partial \\mathscr{L}^{<t>}}{\\partial \\cdot} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}^{<t>}}{\\partial \\cdot} \\] Let's now look in more detail at the equations of backpropagation through time for the specific case of multi-class classification. More specifically, we assume that the output of each step of the recurrent network (\\(\\mathbf{o}^{<t>}\\)) is passed through a softmax to get \\(\\hat{\\mathbf{y}}^{<t>}= \\sigma' (\\mathbf{o}^{<t>})\\), and the loss is the negative log-likelihood of a Multinoulli distribution. Moreover, we will use tanh for the internal activation function \\(\\sigma\\).
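Before deriving these gradients by hand, the following minimal sketch (hypothetical sizes; PyTorch autograd assumed) shows back-propagation through time in practice: the per-step losses are summed over the unrolled loop and a single backward call accumulates, over all time steps, the gradients of the shared parameters:

```python
import torch
import torch.nn as nn

# shared parameters of a basic RNN block (hypothetical sizes)
N_x, N_h, N_y, T_x = 2, 5, 3, 4
Wx = nn.Linear(N_x, N_h)                    # W_x and b_a
Wh = nn.Linear(N_h, N_h, bias=False)        # W_h
Wy = nn.Linear(N_h, N_y)                    # W_y and b_y

x = torch.randn(T_x, N_x)                   # one training sequence
y = torch.randint(0, N_y, (T_x,))           # one true label per time step
nll = nn.CrossEntropyLoss(reduction='sum')  # softmax + Multinoulli negative log-likelihood

h = torch.zeros(N_h)                        # h<0> initialized as a zero vector
loss = 0.0
for t in range(T_x):                        # unrolled computational graph
    h = torch.tanh(Wh(h) + Wx(x[t]))        # h<t>
    o = Wy(h)                               # o<t> (logits)
    loss = loss + nll(o.unsqueeze(0), y[t].unsqueeze(0))  # L = sum_t L<t>

loss.backward()                             # BPTT over the unrolled graph
print(Wh.weight.grad.shape)                 # shared W_h receives the gradient summed over t
```

Because \\(\\mathbf{W}_h\\), \\(\\mathbf{W}_x\\) and \\(\\mathbf{W}_y\\) are reused at every step, their .grad fields contain exactly the sums over \\(t\\) derived next.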
Starting from the gradients of the internal nodes: \\[ \\left(\\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{<t>}}\\right)_i = \\hat{y}_i^{<t>} - \\mathbf{1}_{i=y^{<t>}} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{<T_x>}} = \\frac{\\partial \\mathscr{L}^{<T_x>}}{\\partial \\mathbf{o}^{<T_x>}} \\frac{\\partial \\mathbf{o}^{<T_x>}}{\\partial \\mathbf{h}^{<T_x>}} = \\mathbf{W}_y^T (\\hat{\\mathbf{y}}^{<T_x>} - \\mathbf{1}_{i=y^{<T_x>}}) \\] \\[ \\begin{aligned} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{<t>}} &= \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{<t>}} \\frac{\\partial \\mathbf{o}^{<t>}}{\\partial \\mathbf{h}^{<t>}} + \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{<t+1>}} \\frac{\\partial \\mathbf{h}^{<t+1>}}{\\partial \\mathbf{h}^{<t>}} \\\\ &= \\mathbf{W}_y^T (\\hat{\\mathbf{y}}^{<t>} - \\mathbf{1}_{i=y^{<t>}}) + \\mathbf{W}_h^T diag(1 - (\\mathbf{h}^{<t+1>})^2) \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{<t+1>}} \\end{aligned} \\] where \\(\\mathbf{1}_{i=y^{<t>}}\\) is a vector of zeros with 1 at the location of the true label, i.e. \\(i=y^{<t>}\\), \\(diag(1 - (\\mathbf{h}^{<t+1>})^2)\\) is the Jacobian of the tanh activation function, and \\(\\partial \\mathscr{L} / \\partial \\mathbf{h}^{<t+1>}\\) is computed recursively from \\(t+1=T_x\\) as we know \\(\\partial \\mathscr{L} / \\partial \\mathbf{h}^{<T_x>}\\). Moreover, it is worth noting how the gradient of the loss function over any hidden state \\(\\mathbf{h}^{<t>}\\) is composed of two terms, one coming directly from the corresponding output \\(\\mathbf{o}^{<t>}\\) and one from the next hidden state \\(\\mathbf{h}^{<t+1>}\\). It follows that the gradients of the parameters to update are: \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{b}_y} = \\sum_{t=1}^{T_x} \\left( \\frac{\\partial \\mathbf{o}^{<t>}}{\\partial \\mathbf{b}_y} \\right)^T \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{<t>}} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{<t>}} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{b}_a} = \\sum_{t=1}^{T_x} \\left( \\frac{\\partial \\mathbf{h}^{<t>}}{\\partial \\mathbf{b}_a} \\right)^T \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{<t>}} = \\sum_{t=1}^{T_x} diag(1 - (\\mathbf{h}^{<t>})^2) \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{<t>}} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{W}_y} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{<t>}} \\frac{\\partial \\mathbf{o}^{<t>}}{\\partial \\mathbf{W}_y} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{o}^{<t>}} \\mathbf{h}^{<t>T} \\] \\[ \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{W}_h} = \\sum_{t=1}^{T_x} \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{<t>}} \\frac{\\partial \\mathbf{h}^{<t>}}{\\partial \\mathbf{W}_h} = \\sum_{t=1}^{T_x} diag(1 - (\\mathbf{h}^{<t>})^2) \\frac{\\partial \\mathscr{L}}{\\partial \\mathbf{h}^{<t>}} \\mathbf{h}^{