From ac815651fb9084e6d03debd261cba715cb2ae20d Mon Sep 17 00:00:00 2001
From: jemus42 <1613346+jemus42@users.noreply.github.com>
Date: Wed, 20 Nov 2024 06:37:47 +0000
Subject: [PATCH] Update latex-math

---
 latex-math/Makefile                     |  34 ++
 latex-math/README.md                    |  47 +-
 latex-math/basic-math.tex               |   8 +-
 latex-math/basic-ml.tex                 |   9 +-
 latex-math/create-latex-math-combined.R |  17 +
 latex-math/latex-math-combined.tex      | 656 ++++++++++++++++++++++++
 latex-math/ml-eval.tex                  |   2 +-
 latex-math/ml-regu.tex                  |   6 +-
 latex-math/ml-svm.tex                   |   2 +
 latex-math/preamble.tex                 |   7 +
 10 files changed, 765 insertions(+), 23 deletions(-)
 create mode 100644 latex-math/Makefile
 create mode 100644 latex-math/create-latex-math-combined.R
 create mode 100644 latex-math/latex-math-combined.tex
 create mode 100644 latex-math/preamble.tex

diff --git a/latex-math/Makefile b/latex-math/Makefile
new file mode 100644
index 00000000..1556aa2d
--- /dev/null
+++ b/latex-math/Makefile
@@ -0,0 +1,34 @@
+MATHRMD=latex-math.Rmd
+MATHPDF=${MATHRMD:%.Rmd=%.pdf}
+
+TEXFILES=$(shell find . -iname "basic-*tex" -o -iname "ml-*tex")
+
+MATHCOMBINED=latex-math-combined.tex
+
+
+.PHONY: all pdf combined help
+all: $(MATHPDF) $(MATHCOMBINED)
+pdf: $(MATHPDF)
+combined: $(MATHCOMBINED)
+help:
+	@echo "Usage: make <target>:\n"
+	@echo "  pdf: render $(MATHRMD) to $(MATHPDF)"
+	@echo "  combined: create the combined tex file $(MATHCOMBINED)"
+	@echo "  clean: remove $(MATHPDF) and $(MATHCOMBINED)"
+	@echo "  all: render $(MATHRMD) to $(MATHPDF) and create the combined tex file $(MATHCOMBINED)"
+	@echo "  help: show this message"
+
+$(MATHPDF): $(MATHRMD) $(TEXFILES)
+	@echo rendering $<;
+	Rscript -e "rmarkdown::render('latex-math.Rmd')"
+
+$(MATHCOMBINED): $(TEXFILES)
+	@echo creating $@ from $(TEXFILES);
+	Rscript --quiet create-latex-math-combined.R
+
+.PHONY: clean
+clean:
+	@echo Removing $(MATHPDF) and $(MATHCOMBINED) if they exist;
+	latexmk -C
+	test -f $(MATHPDF) && rm $(MATHPDF)
+	test -f $(MATHCOMBINED) && rm $(MATHCOMBINED)
diff --git a/latex-math/README.md b/latex-math/README.md
index 1730ab45..c1b5f206 100644
--- a/latex-math/README.md
+++ b/latex-math/README.md
@@ -7,14 +7,27 @@
 The notation and shortcuts used in latex-files of lectures, papers, ... of the Chair of Statistical Learning and Data Science is defined and maintained in this repository.
 Notation & shortcuts are split into multiple files depending on subject and can be integrated as needed.
 
-+ `basic-math`: basic mathematical notation such as mathematical spaces, sums & products, linear algebra, basic probability and statistics
-+ `basic-ml`: basic machine learning notation such as notation for data (x, y), prediction functions, likelihood, loss functions, generalization error
-+ `ml-nn`: neural networks
-+ `ml-svm`: support vector machines
-+ `ml-trees`: decision trees
-+ `ml-interpretable`: IML / xAI
-
-:warning: **Important Usage Note**: If you encounter these files within a lecture or project repository, do not make any changes locally. Go to [slds-lmu/latex-math](https://github.com/slds-lmu/latex-math) and make your changes either directly or via pull request.
+- `basic-math`: Basic mathematical notation such as mathematical spaces, sums & products, linear algebra, basic probability and statistics
+- `basic-ml`: Basic machine learning notation such as notation for data (x, y), prediction functions, likelihood, loss functions, generalization error
+- `ml-ensembles`: Ensemble methods
+- `ml-eval`: Evaluation metrics, resampling
+- `ml-feature-sel`: Feature selection
+- `ml-gp`: Gaussian processes
+- `ml-hpo`: Hyperparameter optimization
+- `ml-infotheory`: Information theory
+- `ml-interpretable`: IML / xAI
+- `ml-mbo`: Model-based optimization / Bayesian optimization
+- `ml-multitarget`: Multi-target learning
+- `ml-nn`: Neural networks
+- `ml-online`: Online learning
+- `ml-regu`: Regularization
+- `ml-survival`: Survival analysis
+- `ml-svm`: Support vector machines
+- `ml-trees`: Decision trees
+
+
+:warning: **Important Usage Note**: If you encounter these files within a lecture or project repository, do not make any changes locally.
+Go to [slds-lmu/latex-math](https://github.com/slds-lmu/latex-math) and make your changes either directly or via pull request. Any local changes are assumed to be spurious and *will be overwritten* by upstream `slds-lmu/latex-math`.
 
 ## Using the notation
 
@@ -31,22 +44,34 @@ Note that some of the macros defined here may use additional Latex packages -- a
 \usepackage{mathtools}
 \usepackage{bm}       % basic-ml, ml-gp
 \usepackage{siunitx}  % basic-ml
-\usepackage{dsfont}   % basic-math
+\usepackage{dsfont}   % basic-math; note: the package is called `doublestroke` when installing via tlmgr
 \usepackage{xspace}   % ml-mbo
 \usepackage{xifthen}  % ml-interpretable
 ```
 
 See `latex-math.pdf` for all currently defined commands & definitions.
 
+Note that the file `preamble.tex` lists the packages required to render `latex-math.Rmd`; these are not necessarily all the packages you would need in a fresh LaTeX project, since RMarkdown already includes several required packages by default.
+
 ## Updating / adding files
 
 - A new shortcut / notation that falls into the scope of one of the existing files should be added in the respective file with a short description.
 - Multiple shortcuts / notations belonging to another major subject should be summarized in a new .tex file.
 - **ALWAYS** check if a command is already contained in one of the files - overwriting a command might result in compiling errors.
 - **ALWAYS recompile `latex-math.Rmd` if you add new commands so it is kept up-to-date and to check that you have committed all the changes your notation requires to work.**
+- If you add a new file, make sure it is added as an `include` in the header of `latex-math.Rmd` so that it is included in the rendered preview.
 
-To ensure recompilation is not forgotten, please install the pre-commit hook:
+## Building
+
+Use the included `Makefile` to render `latex-math.pdf` and to create the combined .tex file `latex-math-combined.tex`:
 
 ```sh
-cp service/pre-commit-check-pdf .git/hooks/pre-commit
+Usage: make <target>:
+
+  pdf: render latex-math.Rmd to latex-math.pdf
+  combined: create the combined tex file latex-math-combined.tex
+  clean: remove latex-math.pdf and latex-math-combined.tex
+  all: render latex-math.Rmd to latex-math.pdf and create the combined tex file latex-math-combined.tex
+  help: show this message
 ```
+
diff --git a/latex-math/basic-math.tex b/latex-math/basic-math.tex
index 45ba1a34..31878902 100644
--- a/latex-math/basic-math.tex
+++ b/latex-math/basic-math.tex
@@ -20,10 +20,10 @@
 
 % basic math stuff
 \newcommand{\xt}{\tilde x} % x tilde
-\DeclareMathOperator*{\argmax}{arg\,max} % argmax
-\DeclareMathOperator*{\argmin}{arg\,min} % argmin
-\newcommand{\argminlim}{\mathop{\mathrm{arg\,min}}\limits} % argmax with limits
-\newcommand{\argmaxlim}{\mathop{\mathrm{arg\,max}}\limits} % argmin with limits
+\newcommand{\argmin}{\mathop{\mathrm{arg\,min}}} % argmin
+\newcommand{\argmax}{\mathop{\mathrm{arg\,max}}} % argmax
+\newcommand{\argminlim}{\argmin\limits} % argmin with limits
+\newcommand{\argmaxlim}{\argmax\limits} % argmax with limits
 \newcommand{\sign}{\operatorname{sign}} % sign, signum
 \newcommand{\I}{\mathbb{I}} % I, indicator
 \newcommand{\order}{\mathcal{O}} % O, order
diff --git a/latex-math/basic-ml.tex b/latex-math/basic-ml.tex
index 6fb0f8e0..0f2d541d 100644
--- a/latex-math/basic-ml.tex
+++ b/latex-math/basic-ml.tex
@@ -91,9 +91,6 @@
 \newcommand{\thetahDlam}{\thetavh_{\D, \lamv}} %theta learned on D with hp lambda
 \newcommand{\mint}{\min_{\thetav \in \Theta}} % min problem theta
 \newcommand{\argmint}{\argmin_{\thetav \in \Theta}} % argmin theta
-% LS 29.10.2024 addin thetab back for now because apparently this broke and nobody updated slides to reflect thetab -> thetav changes?
-\newcommand{\thetab}{\bm{\theta}} % theta vector - % densities + probabilities % pdf of x @@ -115,6 +112,8 @@ % prior probabilities \newcommand{\pik}[1][k]{\pi_{#1}} % pi_k, prior +\newcommand{\pih}{\hat{\pi}} % pi hat, estimated prior (binary classification) +\newcommand{\pikh}[1][k]{\hat{\pi}_{#1}} % pi_k hat, estimated prior \newcommand{\lpik}[1][k]{\log \pi_{#1}} % log pi_k, log of the prior \newcommand{\pit}{\pi(\thetav)} % Prior probability of parameter theta @@ -139,7 +138,9 @@ % probababilistic \newcommand{\bayesrulek}[1][k]{\frac{\P(\xv | y= #1) \P(y= #1)}{\P(\xv)}} % Bayes rule -\newcommand{\muk}{\bm{\mu_k}} % mean vector of class-k Gaussian (discr analysis) +\newcommand{\muv}{\bm{\mu}} % expectation vector of Gaussian +\newcommand{\muk}[1][k]{\bm{\mu_{#1}}} % mean vector of class-k Gaussian (discr analysis) +\newcommand{\mukh}[1][k]{\bm{\hat{\mu}_{#1}}} % estimated mean vector of class-k Gaussian (discr analysis) % residual and margin \newcommand{\eps}{\epsilon} % residual, stochastic diff --git a/latex-math/create-latex-math-combined.R b/latex-math/create-latex-math-combined.R new file mode 100644 index 00000000..a694daff --- /dev/null +++ b/latex-math/create-latex-math-combined.R @@ -0,0 +1,17 @@ +# Find all .tex files starting with `basic-` or `ml-` +# !! If new file prefixes are added, they must be added to the regex pattern +texfiles <- list.files(pattern = "(ml|basic)-.*\\.tex") + +combined <- vapply(seq_along(texfiles), \(i) { + lines <- readLines(texfiles[[i]]) + chunk <- paste(lines, collapse = "\n") + chunk <- paste0(chunk, "\n") + + paste( + sprintf("%% ------------- %s -------------", basename(texfiles[[i]])), + chunk, + sep = "\n\n" + ) +}, character(1)) + +writeLines(combined, "latex-math-combined.tex") diff --git a/latex-math/latex-math-combined.tex b/latex-math/latex-math-combined.tex new file mode 100644 index 00000000..100273c5 --- /dev/null +++ b/latex-math/latex-math-combined.tex @@ -0,0 +1,656 @@ +% ------------- basic-math.tex ------------- + +% dependencies: amsmath, amssymb, dsfont +% math spaces +\ifdefined\N +\renewcommand{\N}{\mathds{N}} % N, naturals +\else \newcommand{\N}{\mathds{N}} \fi +\newcommand{\Z}{\mathds{Z}} % Z, integers +\newcommand{\Q}{\mathds{Q}} % Q, rationals +\newcommand{\R}{\mathds{R}} % R, reals +\ifdefined\C +\renewcommand{\C}{\mathds{C}} % C, complex +\else \newcommand{\C}{\mathds{C}} \fi +\newcommand{\continuous}{\mathcal{C}} % C, space of continuous functions +\newcommand{\M}{\mathcal{M}} % machine numbers +\newcommand{\epsm}{\epsilon_m} % maximum error + +% counting / finite sets +\newcommand{\setzo}{\{0, 1\}} % set 0, 1 +\newcommand{\setmp}{\{-1, +1\}} % set -1, 1 +\newcommand{\unitint}{[0, 1]} % unit interval + +% basic math stuff +\newcommand{\xt}{\tilde x} % x tilde +\newcommand{\argmin}{\mathop{\mathrm{arg\,min}}} % argmin +\newcommand{\argmax}{\mathop{\mathrm{arg\,max}}} % argmax +\newcommand{\argminlim}{\argmin\limits} % argmin with limits +\newcommand{\argmaxlim}{\argmax\limits} % argmax with limits +\newcommand{\sign}{\operatorname{sign}} % sign, signum +\newcommand{\I}{\mathbb{I}} % I, indicator +\newcommand{\order}{\mathcal{O}} % O, order +\newcommand{\bigO}{\mathcal{O}} % Big-O Landau +\newcommand{\littleo}{{o}} % Little-o Landau +\newcommand{\pd}[2]{\frac{\partial{#1}}{\partial #2}} % partial derivative +\newcommand{\floorlr}[1]{\left\lfloor #1 \right\rfloor} % floor +\newcommand{\ceillr}[1]{\left\lceil #1 \right\rceil} % ceiling +\newcommand{\indep}{\perp \!\!\! 
\perp} % independence symbol + +% sums and products +\newcommand{\sumin}{\sum\limits_{i=1}^n} % summation from i=1 to n +\newcommand{\sumim}{\sum\limits_{i=1}^m} % summation from i=1 to m +\newcommand{\sumjn}{\sum\limits_{j=1}^n} % summation from j=1 to p +\newcommand{\sumjp}{\sum\limits_{j=1}^p} % summation from j=1 to p +\newcommand{\sumik}{\sum\limits_{i=1}^k} % summation from i=1 to k +\newcommand{\sumkg}{\sum\limits_{k=1}^g} % summation from k=1 to g +\newcommand{\sumjg}{\sum\limits_{j=1}^g} % summation from j=1 to g +\newcommand{\summM}{\sum\limits_{m=1}^M} % summation from m=1 to M +\newcommand{\meanin}{\frac{1}{n} \sum\limits_{i=1}^n} % mean from i=1 to n +\newcommand{\meanim}{\frac{1}{m} \sum\limits_{i=1}^m} % mean from i=1 to n +\newcommand{\meankg}{\frac{1}{g} \sum\limits_{k=1}^g} % mean from k=1 to g +\newcommand{\meanmM}{\frac{1}{M} \sum\limits_{m=1}^M} % mean from m=1 to M +\newcommand{\prodin}{\prod\limits_{i=1}^n} % product from i=1 to n +\newcommand{\prodkg}{\prod\limits_{k=1}^g} % product from k=1 to g +\newcommand{\prodjp}{\prod\limits_{j=1}^p} % product from j=1 to p + +% linear algebra +\newcommand{\one}{\bm{1}} % 1, unitvector +\newcommand{\zero}{\mathbf{0}} % 0-vector +\newcommand{\id}{\bm{I}} % I, identity +\newcommand{\diag}{\operatorname{diag}} % diag, diagonal +\newcommand{\trace}{\operatorname{tr}} % tr, trace +\newcommand{\spn}{\operatorname{span}} % span +\newcommand{\scp}[2]{\left\langle #1, #2 \right\rangle} % <.,.>, scalarproduct +\newcommand{\mat}[1]{\begin{pmatrix} #1 \end{pmatrix}} % short pmatrix command +\newcommand{\Amat}{\mathbf{A}} % matrix A +\newcommand{\Deltab}{\mathbf{\Delta}} % error term for vectors + +% basic probability + stats +\renewcommand{\P}{\mathds{P}} % P, probability +\newcommand{\E}{\mathds{E}} % E, expectation +\newcommand{\var}{\mathsf{Var}} % Var, variance +\newcommand{\cov}{\mathsf{Cov}} % Cov, covariance +\newcommand{\corr}{\mathsf{Corr}} % Corr, correlation +\newcommand{\normal}{\mathcal{N}} % N of the normal distribution +\newcommand{\iid}{\overset{i.i.d}{\sim}} % dist with i.i.d superscript +\newcommand{\distas}[1]{\overset{#1}{\sim}} % ... is distributed as ... 
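+
+% Example usage (illustrative only, not part of the maintained macro set): with
+% the basic-math macros above loaded, one might write, e.g.,
+%   $X_1, \ldots, X_n \iid \normal(\mu, \sigma^2)$, $\hat{\mu} = \meanin X_i$,
+%   $\argminlim_{x \in \R} f(x)$, $\E(X) = \mu$, $\var(X) = \sigma^2$.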
+ +% ------------- basic-ml.tex ------------- + +% machine learning +\newcommand{\Xspace}{\mathcal{X}} % X, input space +\newcommand{\Yspace}{\mathcal{Y}} % Y, output space +\newcommand{\Zspace}{\mathcal{Z}} % Z, space of sampled datapoints +\newcommand{\nset}{\{1, \ldots, n\}} % set from 1 to n +\newcommand{\pset}{\{1, \ldots, p\}} % set from 1 to p +\newcommand{\gset}{\{1, \ldots, g\}} % set from 1 to g +\newcommand{\Pxy}{\mathbb{P}_{xy}} % P_xy +\newcommand{\Exy}{\mathbb{E}_{xy}} % E_xy: Expectation over random variables xy +\newcommand{\xv}{\mathbf{x}} % vector x (bold) +\newcommand{\xtil}{\tilde{\mathbf{x}}} % vector x-tilde (bold) +\newcommand{\yv}{\mathbf{y}} % vector y (bold) +\newcommand{\xy}{(\xv, y)} % observation (x, y) +\newcommand{\xvec}{\left(x_1, \ldots, x_p\right)^\top} % (x1, ..., xp) +\newcommand{\Xmat}{\mathbf{X}} % Design matrix +\newcommand{\allDatasets}{\mathds{D}} % The set of all datasets +\newcommand{\allDatasetsn}{\mathds{D}_n} % The set of all datasets of size n +\newcommand{\D}{\mathcal{D}} % D, data +\newcommand{\Dn}{\D_n} % D_n, data of size n +\newcommand{\Dtrain}{\mathcal{D}_{\text{train}}} % D_train, training set +\newcommand{\Dtest}{\mathcal{D}_{\text{test}}} % D_test, test set +\newcommand{\xyi}[1][i]{\left(\xv^{(#1)}, y^{(#1)}\right)} % (x^i, y^i), i-th observation +\newcommand{\Dset}{\left( \xyi[1], \ldots, \xyi[n]\right)} % {(x1,y1)), ..., (xn,yn)}, data +\newcommand{\defAllDatasetsn}{(\Xspace \times \Yspace)^n} % Def. of the set of all datasets of size n +\newcommand{\defAllDatasets}{\bigcup_{n \in \N}(\Xspace \times \Yspace)^n} % Def. of the set of all datasets +\newcommand{\xdat}{\left\{ \xv^{(1)}, \ldots, \xv^{(n)}\right\}} % {x1, ..., xn}, input data +\newcommand{\ydat}{\left\{ \yv^{(1)}, \ldots, \yv^{(n)}\right\}} % {y1, ..., yn}, input data +\newcommand{\yvec}{\left(y^{(1)}, \hdots, y^{(n)}\right)^\top} % (y1, ..., yn), vector of outcomes +\newcommand{\greekxi}{\xi} % Greek letter xi +\renewcommand{\xi}[1][i]{\xv^{(#1)}} % x^i, i-th observed value of x +\newcommand{\yi}[1][i]{y^{(#1)}} % y^i, i-th observed value of y +\newcommand{\xivec}{\left(x^{(i)}_1, \ldots, x^{(i)}_p\right)^\top} % (x1^i, ..., xp^i), i-th observation vector +\newcommand{\xj}{\xv_j} % x_j, j-th feature +\newcommand{\xjvec}{\left(x^{(1)}_j, \ldots, x^{(n)}_j\right)^\top} % (x^1_j, ..., x^n_j), j-th feature vector +\newcommand{\phiv}{\mathbf{\phi}} % Basis transformation function phi +\newcommand{\phixi}{\mathbf{\phi}^{(i)}} % Basis transformation of xi: phi^i := phi(xi) + +%%%%%% ml - models general +\newcommand{\lamv}{\bm{\lambda}} % lambda vector, hyperconfiguration vector +\newcommand{\Lam}{\bm{\Lambda}} % Lambda, space of all hpos +% Inducer / Inducing algorithm +\newcommand{\preimageInducer}{\left(\defAllDatasets\right)\times\Lam} % Set of all datasets times the hyperparameter space +\newcommand{\preimageInducerShort}{\allDatasets\times\Lam} % Set of all datasets times the hyperparameter space +% Inducer / Inducing algorithm +\newcommand{\ind}{\mathcal{I}} % Inducer, inducing algorithm, learning algorithm + +% continuous prediction function f +\newcommand{\ftrue}{f_{\text{true}}} % True underlying function (if a statistical model is assumed) +\newcommand{\ftruex}{\ftrue(\xv)} % True underlying function (if a statistical model is assumed) +\newcommand{\fx}{f(\xv)} % f(x), continuous prediction function +\newcommand{\fdomains}{f: \Xspace \rightarrow \R^g} % f with domain and co-domain +\newcommand{\Hspace}{\mathcal{H}} % hypothesis space where f is from 
+\newcommand{\fbayes}{f^{\ast}} % Bayes-optimal model +\newcommand{\fxbayes}{f^{\ast}(\xv)} % Bayes-optimal model +\newcommand{\fkx}[1][k]{f_{#1}(\xv)} % f_j(x), discriminant component function +\newcommand{\fh}{\hat{f}} % f hat, estimated prediction function +\newcommand{\fxh}{\fh(\xv)} % fhat(x) +\newcommand{\fxt}{f(\xv ~|~ \thetav)} % f(x | theta) +\newcommand{\fxi}{f\left(\xv^{(i)}\right)} % f(x^(i)) +\newcommand{\fxih}{\hat{f}\left(\xv^{(i)}\right)} % f(x^(i)) +\newcommand{\fxit}{f\left(\xv^{(i)} ~|~ \thetav\right)} % f(x^(i) | theta) +\newcommand{\fhD}{\fh_{\D}} % fhat_D, estimate of f based on D +\newcommand{\fhDtrain}{\fh_{\Dtrain}} % fhat_Dtrain, estimate of f based on D +\newcommand{\fhDnlam}{\fh_{\Dn, \lamv}} %model learned on Dn with hp lambda +\newcommand{\fhDlam}{\fh_{\D, \lamv}} %model learned on D with hp lambda +\newcommand{\fhDnlams}{\fh_{\Dn, \lamv^\ast}} %model learned on Dn with optimal hp lambda +\newcommand{\fhDlams}{\fh_{\D, \lamv^\ast}} %model learned on D with optimal hp lambda + +% discrete prediction function h +\newcommand{\hx}{h(\xv)} % h(x), discrete prediction function +\newcommand{\hh}{\hat{h}} % h hat +\newcommand{\hxh}{\hat{h}(\xv)} % hhat(x) +\newcommand{\hxt}{h(\xv | \thetav)} % h(x | theta) +\newcommand{\hxi}{h\left(\xi\right)} % h(x^(i)) +\newcommand{\hxit}{h\left(\xi ~|~ \thetav\right)} % h(x^(i) | theta) +\newcommand{\hbayes}{h^{\ast}} % Bayes-optimal classification model +\newcommand{\hxbayes}{h^{\ast}(\xv)} % Bayes-optimal classification model + +% yhat +\newcommand{\yh}{\hat{y}} % yhat for prediction of target +\newcommand{\yih}{\hat{y}^{(i)}} % yhat^(i) for prediction of ith targiet +\newcommand{\resi}{\yi- \yih} + +% theta +\newcommand{\thetah}{\hat{\theta}} % theta hat +\newcommand{\thetav}{\bm{\theta}} % theta vector +\newcommand{\thetavh}{\bm{\hat\theta}} % theta vector hat +\newcommand{\thetat}[1][t]{\thetav^{[#1]}} % theta^[t] in optimization +\newcommand{\thetatn}[1][t]{\thetav^{[#1 +1]}} % theta^[t+1] in optimization +\newcommand{\thetahDnlam}{\thetavh_{\Dn, \lamv}} %theta learned on Dn with hp lambda +\newcommand{\thetahDlam}{\thetavh_{\D, \lamv}} %theta learned on D with hp lambda +\newcommand{\mint}{\min_{\thetav \in \Theta}} % min problem theta +\newcommand{\argmint}{\argmin_{\thetav \in \Theta}} % argmin theta + +% densities + probabilities +% pdf of x +\newcommand{\pdf}{p} % p +\newcommand{\pdfx}{p(\xv)} % p(x) +\newcommand{\pixt}{\pi(\xv~|~ \thetav)} % pi(x|theta), pdf of x given theta +\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetav\right)} % pi(x^i|theta), pdf of x given theta +\newcommand{\pixii}[1][i]{\pi\left(\xi[#1]\right)} % pi(x^i), pdf of i-th x + +% pdf of (x, y) +\newcommand{\pdfxy}{p(\xv,y)} % p(x, y) +\newcommand{\pdfxyt}{p(\xv, y ~|~ \thetav)} % p(x, y | theta) +\newcommand{\pdfxyit}{p\left(\xi, \yi ~|~ \thetav\right)} % p(x^(i), y^(i) | theta) + +% pdf of x given y +\newcommand{\pdfxyk}[1][k]{p(\xv | y= #1)} % p(x | y = k) +\newcommand{\lpdfxyk}[1][k]{\log p(\xv | y= #1)} % log p(x | y = k) +\newcommand{\pdfxiyk}[1][k]{p\left(\xi | y= #1 \right)} % p(x^i | y = k) + +% prior probabilities +\newcommand{\pik}[1][k]{\pi_{#1}} % pi_k, prior +\newcommand{\pih}{\hat{\pi}} % pi hat, estimated prior (binary classification) +\newcommand{\pikh}[1][k]{\hat{\pi}_{#1}} % pi_k hat, estimated prior +\newcommand{\lpik}[1][k]{\log \pi_{#1}} % log pi_k, log of the prior +\newcommand{\pit}{\pi(\thetav)} % Prior probability of parameter theta + +% posterior probabilities +\newcommand{\post}{\P(y = 1 ~|~ \xv)} % P(y = 1 | x), 
post. prob for y=1 +\newcommand{\postk}[1][k]{\P(y = #1 ~|~ \xv)} % P(y = k | y), post. prob for y=k +\newcommand{\pidomains}{\pi: \Xspace \rightarrow \unitint} % pi with domain and co-domain +\newcommand{\pibayes}{\pi^{\ast}} % Bayes-optimal classification model +\newcommand{\pixbayes}{\pi^{\ast}(\xv)} % Bayes-optimal classification model +\newcommand{\pix}{\pi(\xv)} % pi(x), P(y = 1 | x) +\newcommand{\piv}{\bm{\pi}} % pi, bold, as vector +\newcommand{\pikx}[1][k]{\pi_{#1}(\xv)} % pi_k(x), P(y = k | x) +\newcommand{\pikxt}[1][k]{\pi_{#1}(\xv ~|~ \thetav)} % pi_k(x | theta), P(y = k | x, theta) +\newcommand{\pixh}{\hat \pi(\xv)} % pi(x) hat, P(y = 1 | x) hat +\newcommand{\pikxh}[1][k]{\hat \pi_{#1}(\xv)} % pi_k(x) hat, P(y = k | x) hat +\newcommand{\pixih}{\hat \pi(\xi)} % pi(x^(i)) with hat +\newcommand{\pikxih}[1][k]{\hat \pi_{#1}(\xi)} % pi_k(x^(i)) with hat +\newcommand{\pdfygxt}{p(y ~|~\xv, \thetav)} % p(y | x, theta) +\newcommand{\pdfyigxit}{p\left(\yi ~|~\xi, \thetav\right)} % p(y^i |x^i, theta) +\newcommand{\lpdfygxt}{\log \pdfygxt } % log p(y | x, theta) +\newcommand{\lpdfyigxit}{\log \pdfyigxit} % log p(y^i |x^i, theta) + +% probababilistic +\newcommand{\bayesrulek}[1][k]{\frac{\P(\xv | y= #1) \P(y= #1)}{\P(\xv)}} % Bayes rule +\newcommand{\muv}{\bm{\mu}} % expectation vector of Gaussian +\newcommand{\muk}[1][k]{\bm{\mu_{#1}}} % mean vector of class-k Gaussian (discr analysis) +\newcommand{\mukh}[1][k]{\bm{\hat{\mu}_{#1}}} % estimated mean vector of class-k Gaussian (discr analysis) + +% residual and margin +\newcommand{\eps}{\epsilon} % residual, stochastic +\newcommand{\epsv}{\bm{\epsilon}} % residual, stochastic, as vector +\newcommand{\epsi}{\epsilon^{(i)}} % epsilon^i, residual, stochastic +\newcommand{\epsh}{\hat{\epsilon}} % residual, estimated +\newcommand{\epsvh}{\hat{\epsv}} % residual, estimated, vector +\newcommand{\yf}{y \fx} % y f(x), margin +\newcommand{\yfi}{\yi \fxi} % y^i f(x^i), margin +\newcommand{\Sigmah}{\hat \Sigma} % estimated covariance matrix +\newcommand{\Sigmahj}{\hat \Sigma_j} % estimated covariance matrix for the j-th class + +% ml - loss, risk, likelihood +\newcommand{\Lyf}{L\left(y, f\right)} % L(y, f), loss function +\newcommand{\Lypi}{L\left(y, \pi\right)} % L(y, pi), loss function +\newcommand{\Lxy}{L\left(y, \fx\right)} % L(y, f(x)), loss function +\newcommand{\Lxyi}{L\left(\yi, \fxi\right)} % loss of observation +\newcommand{\Lxyt}{L\left(y, \fxt\right)} % loss with f parameterized +\newcommand{\Lxyit}{L\left(\yi, \fxit\right)} % loss of observation with f parameterized +\newcommand{\Lxym}{L\left(\yi, f\left(\bm{\tilde{x}}^{(i)} ~|~ \thetav\right)\right)} % loss of observation with f parameterized +\newcommand{\Lpixy}{L\left(y, \pix\right)} % loss in classification +\newcommand{\Lpiy}{L\left(y, \pi\right)} % loss in classification +\newcommand{\Lpiv}{L\left(y, \piv\right)} % loss in classification +\newcommand{\Lpixyi}{L\left(\yi, \pixii\right)} % loss of observation in classification +\newcommand{\Lpixyt}{L\left(y, \pixt\right)} % loss with pi parameterized +\newcommand{\Lpixyit}{L\left(\yi, \pixit\right)} % loss of observation with pi parameterized +\newcommand{\Lhy}{L\left(y, h\right)} % L(y, h), loss function on discrete classes +\newcommand{\Lhxy}{L\left(y, \hx\right)} % L(y, h(x)), loss function on discrete classes +\newcommand{\Lr}{L\left(r\right)} % L(r), loss defined on residual (reg) / margin (classif) +\newcommand{\lone}{|y - \fx|} % L1 loss +\newcommand{\ltwo}{\left(y - \fx\right)^2} % L2 loss +\newcommand{\lbernoullimp}{\ln(1 + 
\exp(-y \cdot \fx))} % Bernoulli loss for -1, +1 encoding +\newcommand{\lbernoullizo}{- y \cdot \fx + \log(1 + \exp(\fx))} % Bernoulli loss for 0, 1 encoding +\newcommand{\lcrossent}{- y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right)} % cross-entropy loss +\newcommand{\lbrier}{\left(\pix - y \right)^2} % Brier score +\newcommand{\risk}{\mathcal{R}} % R, risk +\newcommand{\riskbayes}{\mathcal{R}^\ast} +\newcommand{\riskf}{\risk(f)} % R(f), risk +\newcommand{\riskdef}{\E_{y|\xv}\left(\Lxy \right)} % risk def (expected loss) +\newcommand{\riskt}{\mathcal{R}(\thetav)} % R(theta), risk +\newcommand{\riske}{\mathcal{R}_{\text{emp}}} % R_emp, empirical risk w/o factor 1 / n +\newcommand{\riskeb}{\bar{\mathcal{R}}_{\text{emp}}} % R_emp, empirical risk w/ factor 1 / n +\newcommand{\riskef}{\riske(f)} % R_emp(f) +\newcommand{\risket}{\mathcal{R}_{\text{emp}}(\thetav)} % R_emp(theta) +\newcommand{\riskr}{\mathcal{R}_{\text{reg}}} % R_reg, regularized risk +\newcommand{\riskrt}{\mathcal{R}_{\text{reg}}(\thetav)} % R_reg(theta) +\newcommand{\riskrf}{\riskr(f)} % R_reg(f) +\newcommand{\riskrth}{\hat{\mathcal{R}}_{\text{reg}}(\thetav)} % hat R_reg(theta) +\newcommand{\risketh}{\hat{\mathcal{R}}_{\text{emp}}(\thetav)} % hat R_emp(theta) +\newcommand{\LL}{\mathcal{L}} % L, likelihood +\newcommand{\LLt}{\mathcal{L}(\thetav)} % L(theta), likelihood +\newcommand{\LLtx}{\mathcal{L}(\thetav | \xv)} % L(theta|x), likelihood +\newcommand{\logl}{\ell} % l, log-likelihood +\newcommand{\loglt}{\logl(\thetav)} % l(theta), log-likelihood +\newcommand{\logltx}{\logl(\thetav | \xv)} % l(theta|x), log-likelihood +\newcommand{\errtrain}{\text{err}_{\text{train}}} % training error +\newcommand{\errtest}{\text{err}_{\text{test}}} % test error +\newcommand{\errexp}{\overline{\text{err}_{\text{test}}}} % avg training error + +% lm +\newcommand{\thx}{\thetav^\top \xv} % linear model +\newcommand{\olsest}{(\Xmat^\top \Xmat)^{-1} \Xmat^\top \yv} % OLS estimator in LM + +% ------------- ml-ensembles.tex ------------- + +% ml - bagging, random forest +\newcommand{\bl}[1][m]{b^{[#1]}} % baselearner, default m +\newcommand{\blh}[1][m]{\hat{b}^{[#1]}} % estimated base learner, default m +\newcommand{\blx}[1][m]{b^{[#1]}(\xv)} % baselearner, default m +\newcommand{\blf}[1][m]{f^{[#1]}} % baselearner: scores, default m +\newcommand{\blfh}[1][m]{\hat{f}^{[#1]}} % estimated baselearner: scores, default m +\newcommand{\blfhx}[1][m]{\hat{f}^{[#1]}(\xv)} % estimated baselearner: scores of x, default m +\newcommand{\bll}[1][m]{h^{[#1]}} % baselearner: hard labels, default m +\newcommand{\bllh}[1][m]{\hat{h}^{[#1]}} % estimated baselearner: hard labels, default m +\newcommand{\bllhx}[1][m]{\hat{h}^{[#1]}(\xv)} % estimated baselearner: hard labels of x, default m +\newcommand{\blp}[1][m]{\pi^{[#1]}} % baselearner: probabilities, default m +\newcommand{\blph}[1][m]{\hat{\pi}^{[#1]}} % estimated baselearner: probabilities, default m +\newcommand{\blphxk}[1][m]{\hat{\pi}_{k}^{[#1]}(\xv)} % estimated baselearner: probabilities of x for class k, default m +\newcommand{\fM}{f^{[M]}(\xv)} % ensembled predictor +\newcommand{\fMh}{\hat f^{[M]}(\xv)} % estimated ensembled predictor +\newcommand{\ambifM}{\Delta\left(\fM\right)} % ambiguity/instability of ensemble +\newcommand{\betam}[1][m]{\beta^{[#1]}} % weight of basemodel m +\newcommand{\betamh}[1][m]{\hat{\beta}^{[#1]}} % weight of basemodel m with hat +\newcommand{\betaM}{\beta^{[M]}} % last baselearner +\newcommand{\ib}{\mathrm{IB}} % In-Bag (IB) +\newcommand{\ibm}{\ib^{[m]}} % In-Bag 
(IB) for m-th bootstrap +\newcommand{\oob}{\mathrm{OOB}} % Out-of-Bag (OOB) +\newcommand{\oobm}{\oob^{[m]}} % Out-of-Bag (OOB) for m-th bootstrap + +% ml - boosting +\newcommand{\fm}[1][m]{f^{[#1]}} % prediction in iteration m +\newcommand{\fmh}[1][m]{\hat{f}^{[#1]}} % prediction in iteration m +\newcommand{\fmd}[1][m]{f^{[#1-1]}} % prediction m-1 +\newcommand{\fmdh}[1][m]{\hat{f}^{[#1-1]}} % prediction m-1 +\newcommand{\errm}[1][m]{\text{err}^{[#1]}} % weighted in-sample misclassification rate +\newcommand{\wm}[1][m]{w^{[#1]}} % weight vector of basemodel m +\newcommand{\wmi}[1][m]{w^{[#1](i)}} % weight of obs i of basemodel m +\newcommand{\thetam}[1][m]{\thetav^{[#1]}} % parameters of basemodel m +\newcommand{\thetamh}[1][m]{\hat{\thetav}^{[#1]}} % parameters of basemodel m with hat +\newcommand{\blxt}[1][m]{b(\xv, \thetav^{[#1]})} % baselearner, default m +\newcommand{\ens}{\sum_{m=1}^M \betam \blxt} % ensemble +\newcommand{\rmm}[1][m]{\tilde{r}^{[#1]}} % pseudo residuals +\newcommand{\rmi}[1][m]{\tilde{r}^{[#1](i)}} % pseudo residuals +\newcommand{\Rtm}[1][m]{R_{t}^{[#1]}} % terminal-region +\newcommand{\Tm}[1][m]{T^{[#1]}} % terminal-region +\newcommand{\ctm}[1][m]{c_t^{[#1]}} % mean, terminal-regions +\newcommand{\ctmh}[1][m]{\hat{c}_t^{[#1]}} % mean, terminal-regions with hat +\newcommand{\ctmt}[1][m]{\tilde{c}_t^{[#1]}} % mean, terminal-regions +\newcommand{\Lp}{L^\prime} +\newcommand{\Ldp}{L^{\prime\prime}} +\newcommand{\Lpleft}{\Lp_{\text{left}}} + +% ml - boosting iml lecture +\newcommand{\ts}{\thetav^{\star}} % theta* +\newcommand{\bljt}{\bl[j](\xv, \thetav)} % BL j with theta +\newcommand{\bljts}{\bl[j](\xv, \ts)} % BL j with theta* + +% ------------- ml-eval.tex ------------- + +% resampling +\newcommand{\ntest}{n_{\mathrm{test}}} % size of the test set +\newcommand{\ntrain}{n_{\mathrm{train}}} % size of the train set +\newcommand{\ntesti}[1][i]{n_{\mathrm{test},#1}} % size of the i-th test set +\newcommand{\ntraini}[1][i]{n_{\mathrm{train},#1}} % size of the i-th train set +\newcommand{\Jtrain}{J_\mathrm{train}} % index vector train data +\newcommand{\Jtest}{J_\mathrm{test}} % index vector test data +\newcommand{\Jtraini}[1][i]{J_{\mathrm{train},#1}} % index vector i-th train dataset +\newcommand{\Jtesti}[1][i]{J_{\mathrm{test},#1}} % index vector i-th test dataset +\newcommand{\Dtraini}[1][i]{\mathcal{D}_{\text{train},#1}} % D_train,i, i-th training set +\newcommand{\Dtesti}[1][i]{\mathcal{D}_{\text{test},#1}} % D_test,i, i-th test set + +\newcommand{\JSpace}[1][m]{\nset^{#1}} % space of train indices of size n_train +\newcommand{\JtrainSpace}{\nset^{\ntrain}} % space of train indices of size n_train +\newcommand{\JtestSpace}{\nset^{\ntest}} % space of train indices of size n_test +\newcommand{\yJ}[1][J]{\yv_{#1}} % output vector associated to index J +\newcommand{\yJDef}{\left(y^{(J^{(1)})},\dots,y^{(J^{(m)})}\right)} % def of the output vector associated to index J +\newcommand{\JJ}{\mathcal{J}} % cali-J, set of all splits +\newcommand{\JJset}{\left((\Jtraini[1], \Jtesti[1]),\dots,(\Jtraini[B], \Jtesti[B])\right)} % (Jtrain_1,Jtest_1) ...(Jtrain_B,Jtest_B) +\newcommand{\Itrainlam}{\ind(\Dtrain, \lamv)} +% Generalization error +\newcommand{\GE}{\mathrm{GE}} % GE +\newcommand{\GEh}{\widehat{\GE}} % GE-hat +\newcommand{\GEfull}[1][\ntrain]{\GE(\ind, \lamv, #1, \rho)} % GE full +\newcommand{\GEhholdout}{\GEh_{\Jtrain, \Jtest}(\ind, \lamv, |\Jtrain|, \rho)} % GE hat holdout +\newcommand{\GEhholdouti}[1][i]{\GEh_{\Jtraini[#1], \Jtesti[#1]}(\ind, \lamv, |\Jtraini[#1]|, \rho)} % 
GE hat holdout i-th set +\newcommand{\GEhlam}{\GEh(\lamv)} % GE-hat(lam) +\newcommand{\GEhlamsubIJrho}{\GEh_{\ind, \JJ, \rho}(\lamv)} % GE-hat_I,J,rho(lam) +\newcommand{\GEhresa}{\GEh(\ind, \JJ, \rho, \lamv)} % GE-hat_I,J,rho(lam) +\newcommand{\GErhoDef}{\lim_{\ntest\rightarrow\infty} \E_{\Dtrain,\Dtest \sim \Pxy} \left[ \rho\left(\yv_{\Jtest}, \FJtestftrain\right)\right]} % GE formal def +\newcommand{\agr}{\mathrm{agr}} % aggregate function +\newcommand{\GEf}{\GE\left(\fh\right)} % GE of a fitted model +\newcommand{\GEfh}{\GEh\left(\fh\right)} % GEh of a fitted model +\newcommand{\GEfL}{\GE\left(\fh, L\right)} % GE of a fitted model wrt loss L +\newcommand{\Lyfhx}{L\left(y, \hat{f}(\xv)\right)} % pointwise loss of fitted model +\newcommand{\GEnf}[1]{GE_n\left(\fh_{#1}\right)} % GE of a fitted model +\newcommand{\GEind}{GE_n\left(\ind_{L, O}\right)} % GE of inducer +\newcommand{\GED}{\GE_{\D}} % GE indexed with data +\newcommand{\EGEn}{EGE_n} % expected GE +\newcommand{\EDn}{\E_{|D| = n}} % expectation wrt data of size n + +% performance measure +\newcommand{\rhoL}{\rho_L} % perf. measure derived from pointwise loss +\newcommand{\F}{\bm{F}} % matrix of prediction scores +\newcommand{\Fi}[1][i]{\F^{(#1)}} % i-th row vector of the predscore mat +\newcommand{\FJ}[1][J]{\F_{#1}} % predscore mat idxvec J +\newcommand{\FJf}{\FJ[J,f]} % predscore mat idxvec J and model f +\newcommand{\FJtestfh}{\FJ[\Jtest, \fh]} % predscore mat idxvec Jtest and model f hat +\newcommand{\FJtestftrain}{\F_{\Jtest, \Itrainlam}} % predscore mat idxvec Jtest and model f +\newcommand{\FJtestftraini}[1][i]{\F_{\Jtesti[#1],\ind(\Dtraini[#1], \lamv)}} % predscore mat i-th idxvec Jtest and model f +\newcommand{\FJfDef}{\left(f(\xv^{(J^{(1)})}),\dots, f(\xv^{(J^{(m)})})\right)} % def of predscore mat idxvec J and model f +\newcommand{\preimageRho}{\bigcup_{m\in\N}\left(\Yspace^m\times\R^{m\times g}\right)} % Set of all datasets times HP space + +% ml - ROC +\newcommand{\np}{n_{+}} % no. of positive instances +\newcommand{\nn}{n_{-}} % no. 
of negative instances +\newcommand{\rn}{\pi_{-}} % proportion negative instances +\newcommand{\rp}{\pi_{+}} % proportion negative instances +% true/false pos/neg: +\newcommand{\tp}{\# \text{TP}} % true pos +\newcommand{\fap}{\# \text{FP}} % false pos (fp taken for partial derivs) +\newcommand{\tn}{\# \text{TN}} % true neg +\newcommand{\fan}{\# \text{FN}} % false neg + +% ------------- ml-feature-sel.tex ------------- + +% ml - feature selection + +\newcommand{\xjNull}{x_{j_0}} +\newcommand{\xjEins}{x_{j_1}} +\newcommand{\xl}{\mathbf{x}_l} +\newcommand{\pushcode}[1][1]{\hskip\dimexpr#1\algorithmicindent\relax} % IGNORE_NOTATION + +% ------------- ml-gp.tex ------------- + +% ml - Gaussian Process + +\newcommand{\fvec}{\left[f\left(\xi[1]\right), \dots, f\left(\xi[n]\right) \right]} % function vector +\newcommand{\fv}{\mathbf{f}} % function vector +\newcommand{\kv}{\mathbf{k}} % cov matrix partition +\newcommand{\kxxp}{k\left(\xv, \xv^{\prime} \right)} % cov of x, x' +\newcommand{\kxij}[2]{k\left(\xi, \xi[j] \right)} % cov of x_i, x_j +\newcommand{\mv}{\mathbf{m}} % GP mean vector +\newcommand{\Kmat}{\mathbf{K}} % GP cov matrix +\newcommand{\gaussmk}{\normal(\mv, \Kmat)} % Gaussian w/ mean vec, cov mat +\newcommand{\gp}{\mathcal{GP}\left(m(\xv), \kxxp \right)} % Gaussian Process Definition +\newcommand{\ls}{\ell} % length-scale +\newcommand{\sqexpkernel}{\exp \left(- \frac{\| \xv - \xv^{\prime} \|^2}{2 \ls^2} \right)} % squared exponential kernel + +% GP prediction +\newcommand{\fstarvec}{\left[f\left(\xi[1]_{\ast}\right), \dots, f\left(\xi[m]_{\ast}\right) \right]} % pred function vector +\newcommand{\kstar}{\kv_{\ast}} % cov of new obs with x +\newcommand{\kstarstar}{\kv_{\ast \ast}} % cov of new obs +\newcommand{\Kstar}{\Kmat_{\ast}} % cov mat of new obs with x +\newcommand{\Kstarstar}{\Kmat_{\ast \ast}} % cov mat of new obs +\newcommand{\preddistsingle}{f_{\ast} ~|~ \xv_{\ast}, \Xmat, \fv} % predictive distribution for single pred +\newcommand{\preddistdefsingle}{\normal(\kstar^\top\Kmat^{-1}\fv, \kstarstar - \kstar^\top \Kmat ^{-1}\kstar)} % Gaussian distribution for single pred +\newcommand{\preddist}{f_{\ast} ~|~ \Xmat_{\ast}, \Xmat, \fv} % predictive distribution +\newcommand{\preddistdef}{\normal(\Kstar^\top\Kmat^{-1}\fv, \Kstarstar - \Kstar^\top \Kmat ^{-1}\Kstar)} % Gaussian predictive distribution + +% ------------- ml-hpo.tex ------------- + +%%% HPO Basics +\newcommand{\Ilam}{\ind_{\lamv}} % inducer with HP +\newcommand{\LamS}{\tilde\Lam} % search space +\newcommand{\lami}[1][i]{\lamv^{(#1)}} % lambda i +\newcommand{\clam}{c(\lamv)} % c(lambda) +\newcommand{\clamh}{c(\lamh)} % c(lambda-hat) +\newcommand{\lams}{\lamv^{*}} % theoretical min of c +\newcommand{\lamh}{\hat{\lamv}} % returned lambda of HPO +\newcommand{\lamp}{\lamv^+} % proposed lambda +\newcommand{\clamp}{c(\lamp)} % c of proposed lambda +\newcommand{\archive}{\mathcal{A}} % archive +\newcommand{\archivet}[1][t]{\mathcal{A}^{[#1]}} % archive at time step t + +\newcommand{\tuner}{\mathcal{T}} % tuner +\newcommand{\tunerfull}{\tuner_{\ind,\LamS, \rho,\JJ}} % tuner with inducer, search space, perf measure, resampling strategy + +%%% Bayesian Opt +\newcommand{\chlam}{\hat{c}(\lamv)} % post mean of SM +\newcommand{\shlam}{\hat{\sigma}(\lamv)} % post sd of SM +\newcommand{\vhlam}{\hat{\sigma}^2(\lamv)} % post var of SM +\newcommand{\ulam}{u(\lamv)} % acquisition function +\newcommand{\lambdaopt}{\lambda^{*}} % minimum of the black box function Psi +\newcommand{\metadata}{\left\{\left(\lami, \Psi^{[i]}\right)\right\}} % 
metadata for the Gaussian process +\newcommand{\lamvec}{\left(\lambda^{[1]}, \dots, \lambda^{[\minit]}\right)} % vector of different inputs +\newcommand{\minit}{m_{\text{init}}} % size of the initial design + +%%% Multifidelity / Hyperband +\newcommand{\lambu}{\lambda_{\text{budget}}} % single lambda_budget component HP +\newcommand{\lamfid}{\lambda_{\text{fid}}} % single lambda fidelity +\newcommand{\lamfidl}{\lamfid^{\textrm{low}}} % single lambda fidelity lower +\newcommand{\lamfidu}{\lamfid^{\textrm{upp}}} % single lambda fidelity upper +\newcommand{\etahb}{\eta_{\text{HB}}} % HB multiplier eta + +% ------------- ml-infotheory.tex ------------- + +% basic info theory +\newcommand{\entx}{- \sum_{x \in \Xspace} p(x) \cdot \log p(x)} % entropy of x +\newcommand{\dentx}{- \int_{\Xspace} f(x) \cdot \log f(x) dx} % diff entropy of x +\newcommand{\jentxy}{- \sum_{x \in \Xspace} p(x, y) \cdot \log p(x, y)} % joint entropy of x, y +\newcommand{\jdentxy}{- \int_{\Xspace, \Yspace} f(x, y) \cdot \log f(x, y) dx dy} % joint diff entropy of x, y +\newcommand{\centyx}{- \sum_{x \in \Xspace} p(x) \sum_{y \in \Yspace} p(y|x) \cdot \log p(y|x)} % cond entropy y|x +\newcommand{\cdentyx}{- \int_{\Xspace, \Yspace} f(x, y) \cdot \log f(y | x) dx dy} % cond diff entropy y|x +\newcommand{\xentpq}{- \sum_{x \in \Xspace} p(x) \cdot \log q(x)} % cross-entropy of p, q +\newcommand{\kldpq}{D_{KL}(p \| q)} % KLD between p and q +\newcommand{\kldpqt}{D_{KL}(p \| q_{\thetav})} % KLD divergence between p and parameterized q +\newcommand{\explogpq}{\E_p \left[\log \frac{p(X)}{q(X)} \right]} % expected LLR of p, q (def KLD) +\newcommand{\sumlogpq}{\sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)}} % expected LLR of p, q (def KLD) + +% ------------- ml-interpretable.tex ------------- + +%%%%%% perturbed data +\newcommand{\pert}[3]{\ifthenelse{\equal{#2}{}}{\tilde{#1}}{\ifthenelse{\equal{#3}{}}{\tilde{#1}^{#2}}{\tilde{#1}^{#2|#3}}}} % command to express that for #1 the subset #2 was perturbed given subset #3 + +%%%%%% marginalized functions +\newcommand{\fj}{f_j} % marginal function f_j, depending on feature j +\newcommand{\fnj}{f_{-j}} % marginal function f_{-j}, depending on all features but j +\newcommand{\fS}{f_S} % marginal function f_S depending on feature set S +\newcommand{\fC}{f_C} % marginal function f_C depending on feature set C +\newcommand{\fhj}{\fh_j} % marginal function fh_j, depending on feature j +\newcommand{\fhnj}{\fh_{-j}} % marginal function fh_{-j}, depending on all features but j +\newcommand{\fhS}{\fh_S} % marginal function fh_S depending on feature set S +\newcommand{\fhC}{\fh_C} % marginal function fh_C depending on feature set C +\newcommand{\XSmat}{\Xmat_S} % Design matrix subset +\newcommand{\XCmat}{\Xmat_C} % Design matrix subset +\newcommand{\Xnj}{\Xmat_{-j}} % Design matrix subset -j = {1, .., j-1, j+1, ..., p} + +%%%%% ICE +\newcommand{\fhice}[1]{\fh_{#1,ICE}} % ICE function + +%%%%% Shapley values +\newcommand{\Scupj}{S \cup \{j\}} % coalition S but without player j +\newcommand{\Scupk}{S \cup \{k\}} % coalition S but without player k +\newcommand{\SsubP}{S \subseteq P} % coalition S subset of P +\newcommand{\SsubPnoj}{\SsubP \setminus \{j\}} % coalition S subset of P without player j +\newcommand{\SsubPnojk}{\SsubP \setminus \{j,k\}} % coalition S subset of P without player k +\newcommand{\phiij}{\hat{\phi}_j^{(i)}} % Shapley value for feature j and observation i + +%%%%% LIME +\newcommand{\Gspace}{\mathcal{G}} % Hypothesis space for surrogate model +\newcommand{\neigh}{\phi_{\xv}} 
% Proximity measure +\newcommand{\zv}{\mathbf{z}} % Sampled datapoints for surrogate +\newcommand{\Gower}{d_G} % Gower distance + + +% ------------- ml-mbo.tex ------------- + +\newcommand{\xvsi}[1][i]{\xv^{[#1]}} % x at iteration i +\newcommand{\ysi}[1][i]{y^{[#1]}} % y at iteration i +\newcommand{\Dt}[1][t]{\D^{[#1]}} % archive at iteration t +\newcommand{\Dts}{\Dt = \{(\xvsi, \ysi)\}_{i = 1, \ldots, t}} % archive at iteration t fully +%\newcommand{\fh}{\hat{s}} % surrogate mean +\newcommand{\sh}{\hat{s}} % surrogate se +\newcommand{\fmin}{f_{\min}} % current best + +% ------------- ml-multitarget.tex ------------- + +% multitarget notation +\newcommand{\Tspace}{\mathcal{T}} +\newcommand{\tv}{\mathbf{t}} +\newcommand{\tim}{\mathbf{t}^{(i)}_m} +\newcommand{\yim}{y^{(i)}_m} + +% ------------- ml-nn.tex ------------- + +% ml - NNs +\newcommand{\neurons}{z_1,\dots,z_M} % vector of neurons +\newcommand{\hidz}{\mathbf{z}} % vector of hidden activations +\newcommand{\biasb}{\mathbf{b}} % bias vector +\newcommand{\biasc}{c} % bias in output +\newcommand{\wtw}{\mathbf{w}} % weight vector (general) +\newcommand{\Wmat}{\mathbf{W}} % weight vector (general) +\newcommand{\wtu}{\mathbf{u}} % weight vector of output neuron + +% deeplearning - regularization +\newcommand{\Oreg}{\mathnormal{R}_{reg}(\theta|X,y)} % regularized objective function +\newcommand{\Ounreg}{\mathnormal{R}_{emp}(\theta|X,y)} % unconstrained objective function +\newcommand{\Pen}{\Omega(\theta)} % penalty +\newcommand{\Oregweight}{\mathnormal{R}_{reg}(w|X,y)} % regularized objective function with weight +\newcommand{\Oweight}{\mathnormal{R}_{emp}(w|X,y)} % unconstrained objective function with weight +\newcommand{\Oweighti}{\mathnormal{R}_{emp}(w_i|X,y)} % unconstrained objective function with weight w_i +\newcommand{\Oweightopt}{\mathnormal{J}(w^*|X,y)} % unconstrained objective function withoptimal weight +\newcommand{\Oopt}{\hat{\mathnormal{J}}(\theta|X,y)} % optimal objective function +\newcommand{\Odropout}{\mathnormal{J}(\theta, \mu|X,y)} % dropout objective function + +% deeplearning - optimization +\newcommand{\Loss}{L(y, f(\xv, \thetav))} +\newcommand{\Lmomentumnest}{L(\yi, f(x^{(i)}, \thetav + \varphi \nub))} % momentum risk +\newcommand{\Lmomentumtilde}{L(\yi, f(x^{(i)}, \tilde{\thetav}))} % Nesterov momentum risk +\newcommand{\Lmomentum}{L(\yi, f(x^{(i)}, \thetav))} +\newcommand{\Hess}{\mathbf{H}} +\newcommand{\nub}{\bm{\nu}} + +% deeplearning - autoencoders +\newcommand{\uauto}{L(x,g(f(x)))} % undercomplete autoencoder objective function +\newcommand{\dauto}{L(x,g(f(\tilde{x})))} % denoising autoencoder objective function + +% deeplearning - adversarials +\newcommand{\deltab}{\bm{\delta}} +\newcommand{\Lossdeltai}{L(\yi, f(\xi + \deltab|\thetav))} +\newcommand{\Lossdelta}{L(y, f(\xv + \deltab| \thetav))} + +% ------------- ml-online.tex ------------- + +\newcommand{\Aspace}{\mathcal{A}} +\newcommand{\norm}[1]{\left|\left|#1\right|\right|_2} +\newcommand{\llin}{L^{\texttt{lin}}} +\newcommand{\lzeroone}{L^{0-1}} +\newcommand{\lhinge}{L^{\texttt{hinge}}} +\newcommand{\lexphinge}{\widetilde{L^{\texttt{hinge}}}} +\newcommand{\lconv}{L^{\texttt{conv}}} +\newcommand{\FTL}{\texttt{FTL}} +\newcommand{\FTRL}{\texttt{FTRL}} +\newcommand{\OGD}{{\texttt{OGD}}} +\newcommand{\EWA}{{\texttt{EWA}}} +\newcommand{\REWA}{{\texttt{REWA}}} +\newcommand{\EXPthree}{{\texttt{EXP3}}} +\newcommand{\EXPthreep}{{\texttt{EXP3P}}} +\newcommand{\reg}{\psi} +\newcommand{\Algo}{\texttt{Algo}} + +% ------------- ml-regu.tex ------------- + +% \thetah is 
\hat{\theta}} (theta hat) +% \thetav is \bm{\theta}} (theta vector) +\newcommand{\thetas}{\thetav^*} % theta star +\newcommand{\thetaridge}{\thetav_{\mathrm{ridge}}} % theta (RIDGE) +\newcommand{\thetalasso}{\thetav_{\mathrm{LASSO}}} % theta (LASSO) +\newcommand{\thetaols}{\thetav_{\mathrm{OLS}}} % theta (RIDGE) + +% ------------- ml-survival.tex ------------- + +\newcommand{\Ti}[1][i]{T^{(#1)}} % ?? +\newcommand{\Ci}[1][i]{C^{(#1)}} % ?? +\newcommand{\oi}[1][i]{o^{(#1)}} % ?? +\newcommand{\ti}[1][i]{t^{(#1)}} % ?? +\newcommand{\deltai}[1][i]{\delta^{(#1)}} +\newcommand{\Lxdi}{L\left(\bm{\delta}, \fx\right)} + + +% ------------- ml-svm.tex ------------- + +% linear svm +\newcommand{\sv}{\operatorname{SV}} % supportvectors +\renewcommand{\sl}{\zeta} % slack variable +\newcommand{\slvec}{\left(\zeta^{(1)}, \zeta^{(n)}\right)} % slack variable vector +\newcommand{\sli}[1][i]{\zeta^{(#1)}} % i-th slack variable +\newcommand{\scptxi}{\scp{\thetav}{\xi}} % scalar prodct of theta and xi +\newcommand{\svmhplane}{\yi \left( \scp{\thetav}{\xi} + \theta_0 \right)} % SVM hyperplane (normalized) +\newcommand{\alphah}{\hat{\alpha}} % alpha-hat (basis fun coefficients) +\newcommand{\alphav}{\bm{\alpha}} % vector alpha (bold) (basis fun coefficients) +\newcommand{\alphavh}{\hat{\bm{\alpha}}} % vector alpha-hat (basis fun coefficients) +\newcommand{\dualobj}{\sumin \alpha_i - \frac{1}{2}\sumin \sumjn \alpha_i\alpha_j\yi \yi[j] \scp{\xi}{\xv^{(j)}}} % min objective in lin svm dual + +% nonlinear svm +\newcommand{\HS}{\Phi} % H, hilbertspace +\newcommand{\phix}{\phi(\xv)} % feature map x +\newcommand{\phixt}{\phi(\tilde \xv)} % feature map x tilde +\newcommand{\kxxt}{k(\xv, \tilde \xv)} % kernel fun x, x tilde +\newcommand{\scptxifm}{\scp{\thetav}{\phi(\xi)}} % scalar prodct of theta and xi + +% ------------- ml-trees.tex ------------- + +% ml - trees, extra trees + +\newcommand{\Np}{\mathcal{N}} % (Parent) node N +\newcommand{\Npk}{\Np_k} % node N_k +\newcommand{\Nl}{\Np_1} % Left node N_1 +\newcommand{\Nr}{\Np_2} % Right node N_2 +\newcommand{\pikN}[1][k]{\pi_#1^{(\Np)}} % class probability node N +\newcommand{\pikNh}[1][k]{\hat\pi_#1^{(\Np)}} % estimated class probability node N +\newcommand{\pikNlh}[1][k]{\hat\pi_#1^{(\Nl)}} % estimated class probability left node +\newcommand{\pikNrh}[1][k]{\hat\pi_#1^{(\Nr)}} % estimated class probability right node + diff --git a/latex-math/ml-eval.tex b/latex-math/ml-eval.tex index 01232669..0897eb81 100644 --- a/latex-math/ml-eval.tex +++ b/latex-math/ml-eval.tex @@ -56,7 +56,7 @@ \newcommand{\nn}{n_{-}} % no. 
of negative instances \newcommand{\rn}{\pi_{-}} % proportion negative instances \newcommand{\rp}{\pi_{+}} % proportion negative instances - % true/false pos/neg: +% true/false pos/neg: \newcommand{\tp}{\# \text{TP}} % true pos \newcommand{\fap}{\# \text{FP}} % false pos (fp taken for partial derivs) \newcommand{\tn}{\# \text{TN}} % true neg diff --git a/latex-math/ml-regu.tex b/latex-math/ml-regu.tex index 07559c73..35e2dfff 100644 --- a/latex-math/ml-regu.tex +++ b/latex-math/ml-regu.tex @@ -1,6 +1,6 @@ % \thetah is \hat{\theta}} (theta hat) % \thetav is \bm{\theta}} (theta vector) \newcommand{\thetas}{\thetav^*} % theta star -\newcommand{\thetaridge}{\thetav_{\mathrm{ridge}}}} % theta (RIDGE) -\newcommand{\thetalasso}{\thetav_{\mathrm{LASSO}}}} % theta (LASSO) -\newcommand{\thetaols}{\thetav_{\mathrm{OLS}}}} % theta (RIDGE) +\newcommand{\thetaridge}{\thetav_{\mathrm{ridge}}} % theta (RIDGE) +\newcommand{\thetalasso}{\thetav_{\mathrm{LASSO}}} % theta (LASSO) +\newcommand{\thetaols}{\thetav_{\mathrm{OLS}}} % theta (RIDGE) diff --git a/latex-math/ml-svm.tex b/latex-math/ml-svm.tex index 1f4d937f..b417a3f5 100644 --- a/latex-math/ml-svm.tex +++ b/latex-math/ml-svm.tex @@ -1,6 +1,8 @@ % linear svm \newcommand{\sv}{\operatorname{SV}} % supportvectors +\ifdefined\sl \renewcommand{\sl}{\zeta} % slack variable +\else \newcommand{\sl}{\zeta} \fi \newcommand{\slvec}{\left(\zeta^{(1)}, \zeta^{(n)}\right)} % slack variable vector \newcommand{\sli}[1][i]{\zeta^{(#1)}} % i-th slack variable \newcommand{\scptxi}{\scp{\thetav}{\xi}} % scalar prodct of theta and xi diff --git a/latex-math/preamble.tex b/latex-math/preamble.tex new file mode 100644 index 00000000..e25951e2 --- /dev/null +++ b/latex-math/preamble.tex @@ -0,0 +1,7 @@ +\usepackage{mathtools} +\usepackage{bm} +\usepackage{siunitx} +\usepackage{dsfont} +\usepackage{xspace} +\usepackage{longtable} +\usepackage{xifthen}
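+
+% A minimal sketch (the relative path `latex-math/` is an assumption and depends
+% on where this directory is vendored) of a standalone document that loads the
+% packages above plus the combined macro file instead of the individual topic files:
+%
+%   \documentclass{article}
+%   \usepackage{mathtools}
+%   \usepackage{amssymb}
+%   \usepackage{bm}
+%   \usepackage{siunitx}
+%   \usepackage{dsfont}
+%   \usepackage{xspace}
+%   \usepackage{xifthen}
+%   \input{latex-math/latex-math-combined.tex}
+%   \begin{document}
+%   $\risket = \sumin \Lxyit$
+%   \end{document}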