From ac815651fb9084e6d03debd261cba715cb2ae20d Mon Sep 17 00:00:00 2001
From: jemus42 <1613346+jemus42@users.noreply.github.com>
Date: Wed, 20 Nov 2024 06:37:47 +0000
Subject: [PATCH] Update latex-math

---
 latex-math/Makefile                     |  34 ++
 latex-math/README.md                    |  47 +-
 latex-math/basic-math.tex               |   8 +-
 latex-math/basic-ml.tex                 |   9 +-
 latex-math/create-latex-math-combined.R |  17 +
 latex-math/latex-math-combined.tex      | 656 ++++++++++++++++++++++++
 latex-math/ml-eval.tex                  |   2 +-
 latex-math/ml-regu.tex                  |   6 +-
 latex-math/ml-svm.tex                   |   2 +
 latex-math/preamble.tex                 |   7 +
 10 files changed, 765 insertions(+), 23 deletions(-)
 create mode 100644 latex-math/Makefile
 create mode 100644 latex-math/create-latex-math-combined.R
 create mode 100644 latex-math/latex-math-combined.tex
 create mode 100644 latex-math/preamble.tex

diff --git a/latex-math/Makefile b/latex-math/Makefile
new file mode 100644
index 00000000..1556aa2d
--- /dev/null
+++ b/latex-math/Makefile
@@ -0,0 +1,34 @@
+MATHRMD=latex-math.Rmd
+MATHPDF=${MATHRMD:%.Rmd=%.pdf}
+
+TEXFILES=$(shell find . -iname "basic-*tex" -o -iname "ml-*tex")
+
+MATHCOMBINED=latex-math-combined.tex
+
+
+.PHONY: all pdf combined help
+all: $(MATHPDF) $(MATHCOMBINED)
+pdf: $(MATHPDF)
+combined: $(MATHCOMBINED)
+help:
+	@echo "Usage: make <target>:\n"
+	@echo "  pdf: render $(MATHRMD) to $(MATHPDF)"
+	@echo "  combined: create the combined tex file $(MATHCOMBINED)"
+	@echo "  clean: remove $(MATHPDF) and $(MATHCOMBINED)"
+	@echo "  all: render $(MATHRMD) to $(MATHPDF) and create the combined tex file $(MATHCOMBINED)"
+	@echo "  help: show this message"
+
+$(MATHPDF): $(MATHRMD) $(TEXFILES)
+	@echo rendering $<;
+	Rscript -e "rmarkdown::render('latex-math.Rmd')"
+
+$(MATHCOMBINED): $(TEXFILES)
+	@echo creating $@ from $(TEXFILES);
+	Rscript --quiet create-latex-math-combined.R
+
+.PHONY: clean
+clean:
+	@echo Removing $(MATHPDF) and $(MATHCOMBINED) if they exist;
+	latexmk -C
+	test -f $(MATHPDF) && rm $(MATHPDF)
+	test -f $(MATHCOMBINED) && rm $(MATHCOMBINED)
diff --git a/latex-math/README.md b/latex-math/README.md
index 1730ab45..c1b5f206 100644
--- a/latex-math/README.md
+++ b/latex-math/README.md
@@ -7,14 +7,27 @@
 The notation and shortcuts used in latex-files of lectures, papers, ... of the Chair of Statistical Learning and Data Science is defined and maintained in this repository.
 Notation & shortcuts are split into multiple files depending on subject and can be integrated as needed.
 
-+ `basic-math`: basic mathematical notation such as mathematical spaces, sums & products, linear algebra, basic probability and statistics
-+ `basic-ml`: basic machine learning notation such as notation for data (x, y), prediction functions, likelihood, loss functions, generalization error
-+ `ml-nn`: neural networks
-+ `ml-svm`: support vector machines
-+ `ml-trees`: decision trees
-+ `ml-interpretable`: IML / xAI
-
-:warning: **Important Usage Note**: If you encounter these files within a lecture or project repository, do not make any changes locally. Go to [slds-lmu/latex-math](https://github.com/slds-lmu/latex-math) and make your changes either directly or via pull request.
+- `basic-math`: Basic mathematical notation such as mathematical spaces, sums & products, linear algebra, basic probability and statistics
+- `basic-ml`: Basic machine learning notation such as notation for data (x, y), prediction functions, likelihood, loss functions, generalization error
+- `ml-ensembles`: Ensemble methods
+- `ml-eval`: Evaluation metrics, resampling
+- `ml-feature-sel`: Feature selection
+- `ml-gp`: Gaussian processes
+- `ml-hpo`: Hyperparameter optimization
+- `ml-infotheory`: Information theory
+- `ml-interpretable`: IML / xAI
+- `ml-mbo`: Model-based optimization / Bayesian optimization
+- `ml-multitarget`: Multi-target learning
+- `ml-nn`: Neural networks
+- `ml-online`: Online learning
+- `ml-regu`: Regularization
+- `ml-survival`: Survival analysis
+- `ml-svm`: Support vector machines
+- `ml-trees`: Decision trees
+
+
+:warning: **Important Usage Note**: If you encounter these files within a lecture or project repository, do not make any changes locally.
+Go to [slds-lmu/latex-math](https://github.com/slds-lmu/latex-math) and make your changes either directly or via pull request. Any local changes are assumed to be spurious and *will be overwritten* by upstream `slds-lmu/latex-math`.
 
 ## Using the notation
 
@@ -31,22 +44,34 @@ Note that some of the macros defined here may use additional Latex packages -- a
 \usepackage{mathtools}
 \usepackage{bm}       % basic-ml, ml-gp
 \usepackage{siunitx}  % basic-ml
-\usepackage{dsfont}   % basic-math
+\usepackage{dsfont}   % basic-math; note: the package is called `doublestroke` when installing via tlmgr
 \usepackage{xspace}   % ml-mbo
 \usepackage{xifthen}  % ml-interpretable
 ```
 
 See `latex-math.pdf` for all currently defined commands & definitions.
 
+Note that the file `preamble.tex` lists the packages required to render `latex-math.Rmd`; these are not necessarily all the packages you would need in a fresh LaTeX project, since RMarkdown already includes several required packages by default.
+
 ## Updating / adding files
 
 - A new shortcut / notation that falls into the scope of one of the existing files should be added in the respective file with a short description.
 - Multiple shortcuts / notations belonging to another major subject should be summarized in a new .tex file.
 - **ALWAYS** check if a command is already contained in one of the files - overwriting a command might result in compiling errors.
 - **ALWAYS recompile `latex-math.Rmd` if you add new commands so it is kept up-to-date and to check that you have committed all the changes your notation requires to work.**
+- If you add a new file, make sure it is added as an `include` in the header of `latex-math.Rmd` so that it is included in the rendered preview.
 
-To ensure recompilation is not forgotten, please install the pre-commit hook:
+## Building
+
+Use the included `Makefile` to render `latex-math.pdf` and to create the combined .tex file `latex-math-combined.tex`:
 
 ```sh
-cp service/pre-commit-check-pdf .git/hooks/pre-commit
+Usage: make <target>:
+
+  pdf: render latex-math.Rmd to latex-math.pdf
+  combined: create the combined tex file latex-math-combined.tex
+  clean: remove latex-math.pdf and latex-math-combined.tex
+  all: render latex-math.Rmd to latex-math.pdf and create the combined tex file latex-math-combined.tex
+  help: show this message
 ```
+
diff --git a/latex-math/basic-math.tex b/latex-math/basic-math.tex
index 45ba1a34..31878902 100644
--- a/latex-math/basic-math.tex
+++ b/latex-math/basic-math.tex
@@ -20,10 +20,10 @@
 
 % basic math stuff
 \newcommand{\xt}{\tilde x} % x tilde
-\DeclareMathOperator*{\argmax}{arg\,max} % argmax
-\DeclareMathOperator*{\argmin}{arg\,min} % argmin
-\newcommand{\argminlim}{\mathop{\mathrm{arg\,min}}\limits} % argmax with limits
-\newcommand{\argmaxlim}{\mathop{\mathrm{arg\,max}}\limits} % argmin with limits
+\newcommand{\argmin}{\mathop{\mathrm{arg\,min}}} % argmin
+\newcommand{\argmax}{\mathop{\mathrm{arg\,max}}} % argmax
+\newcommand{\argminlim}{\argmin\limits} % argmin with limits
+\newcommand{\argmaxlim}{\argmax\limits} % argmax with limits
 \newcommand{\sign}{\operatorname{sign}} % sign, signum
 \newcommand{\I}{\mathbb{I}} % I, indicator
 \newcommand{\order}{\mathcal{O}} % O, order
diff --git a/latex-math/basic-ml.tex b/latex-math/basic-ml.tex
index 6fb0f8e0..0f2d541d 100644
--- a/latex-math/basic-ml.tex
+++ b/latex-math/basic-ml.tex
@@ -91,9 +91,6 @@
 \newcommand{\thetahDlam}{\thetavh_{\D, \lamv}} %theta learned on D with hp lambda
 \newcommand{\mint}{\min_{\thetav \in \Theta}} % min problem theta
 \newcommand{\argmint}{\argmin_{\thetav \in \Theta}} % argmin theta
-% LS 29.10.2024 addin thetab back for now because apparently this broke and nobody updated slides to reflect thetab -> thetav changes?
-\newcommand{\thetab}{\bm{\theta}} % theta vector - % densities + probabilities % pdf of x @@ -115,6 +112,8 @@ % prior probabilities \newcommand{\pik}[1][k]{\pi_{#1}} % pi_k, prior +\newcommand{\pih}{\hat{\pi}} % pi hat, estimated prior (binary classification) +\newcommand{\pikh}[1][k]{\hat{\pi}_{#1}} % pi_k hat, estimated prior \newcommand{\lpik}[1][k]{\log \pi_{#1}} % log pi_k, log of the prior \newcommand{\pit}{\pi(\thetav)} % Prior probability of parameter theta @@ -139,7 +138,9 @@ % probababilistic \newcommand{\bayesrulek}[1][k]{\frac{\P(\xv | y= #1) \P(y= #1)}{\P(\xv)}} % Bayes rule -\newcommand{\muk}{\bm{\mu_k}} % mean vector of class-k Gaussian (discr analysis) +\newcommand{\muv}{\bm{\mu}} % expectation vector of Gaussian +\newcommand{\muk}[1][k]{\bm{\mu_{#1}}} % mean vector of class-k Gaussian (discr analysis) +\newcommand{\mukh}[1][k]{\bm{\hat{\mu}_{#1}}} % estimated mean vector of class-k Gaussian (discr analysis) % residual and margin \newcommand{\eps}{\epsilon} % residual, stochastic diff --git a/latex-math/create-latex-math-combined.R b/latex-math/create-latex-math-combined.R new file mode 100644 index 00000000..a694daff --- /dev/null +++ b/latex-math/create-latex-math-combined.R @@ -0,0 +1,17 @@ +# Find all .tex files starting with `basic-` or `ml-` +# !! If new file prefixes are added, they must be added to the regex pattern +texfiles <- list.files(pattern = "(ml|basic)-.*\\.tex") + +combined <- vapply(seq_along(texfiles), \(i) { + lines <- readLines(texfiles[[i]]) + chunk <- paste(lines, collapse = "\n") + chunk <- paste0(chunk, "\n") + + paste( + sprintf("%% ------------- %s -------------", basename(texfiles[[i]])), + chunk, + sep = "\n\n" + ) +}, character(1)) + +writeLines(combined, "latex-math-combined.tex") diff --git a/latex-math/latex-math-combined.tex b/latex-math/latex-math-combined.tex new file mode 100644 index 00000000..100273c5 --- /dev/null +++ b/latex-math/latex-math-combined.tex @@ -0,0 +1,656 @@ +% ------------- basic-math.tex ------------- + +% dependencies: amsmath, amssymb, dsfont +% math spaces +\ifdefined\N +\renewcommand{\N}{\mathds{N}} % N, naturals +\else \newcommand{\N}{\mathds{N}} \fi +\newcommand{\Z}{\mathds{Z}} % Z, integers +\newcommand{\Q}{\mathds{Q}} % Q, rationals +\newcommand{\R}{\mathds{R}} % R, reals +\ifdefined\C +\renewcommand{\C}{\mathds{C}} % C, complex +\else \newcommand{\C}{\mathds{C}} \fi +\newcommand{\continuous}{\mathcal{C}} % C, space of continuous functions +\newcommand{\M}{\mathcal{M}} % machine numbers +\newcommand{\epsm}{\epsilon_m} % maximum error + +% counting / finite sets +\newcommand{\setzo}{\{0, 1\}} % set 0, 1 +\newcommand{\setmp}{\{-1, +1\}} % set -1, 1 +\newcommand{\unitint}{[0, 1]} % unit interval + +% basic math stuff +\newcommand{\xt}{\tilde x} % x tilde +\newcommand{\argmin}{\mathop{\mathrm{arg\,min}}} % argmin +\newcommand{\argmax}{\mathop{\mathrm{arg\,max}}} % argmax +\newcommand{\argminlim}{\argmin\limits} % argmin with limits +\newcommand{\argmaxlim}{\argmax\limits} % argmax with limits +\newcommand{\sign}{\operatorname{sign}} % sign, signum +\newcommand{\I}{\mathbb{I}} % I, indicator +\newcommand{\order}{\mathcal{O}} % O, order +\newcommand{\bigO}{\mathcal{O}} % Big-O Landau +\newcommand{\littleo}{{o}} % Little-o Landau +\newcommand{\pd}[2]{\frac{\partial{#1}}{\partial #2}} % partial derivative +\newcommand{\floorlr}[1]{\left\lfloor #1 \right\rfloor} % floor +\newcommand{\ceillr}[1]{\left\lceil #1 \right\rceil} % ceiling +\newcommand{\indep}{\perp \!\!\! 
\perp} % independence symbol + +% sums and products +\newcommand{\sumin}{\sum\limits_{i=1}^n} % summation from i=1 to n +\newcommand{\sumim}{\sum\limits_{i=1}^m} % summation from i=1 to m +\newcommand{\sumjn}{\sum\limits_{j=1}^n} % summation from j=1 to p +\newcommand{\sumjp}{\sum\limits_{j=1}^p} % summation from j=1 to p +\newcommand{\sumik}{\sum\limits_{i=1}^k} % summation from i=1 to k +\newcommand{\sumkg}{\sum\limits_{k=1}^g} % summation from k=1 to g +\newcommand{\sumjg}{\sum\limits_{j=1}^g} % summation from j=1 to g +\newcommand{\summM}{\sum\limits_{m=1}^M} % summation from m=1 to M +\newcommand{\meanin}{\frac{1}{n} \sum\limits_{i=1}^n} % mean from i=1 to n +\newcommand{\meanim}{\frac{1}{m} \sum\limits_{i=1}^m} % mean from i=1 to n +\newcommand{\meankg}{\frac{1}{g} \sum\limits_{k=1}^g} % mean from k=1 to g +\newcommand{\meanmM}{\frac{1}{M} \sum\limits_{m=1}^M} % mean from m=1 to M +\newcommand{\prodin}{\prod\limits_{i=1}^n} % product from i=1 to n +\newcommand{\prodkg}{\prod\limits_{k=1}^g} % product from k=1 to g +\newcommand{\prodjp}{\prod\limits_{j=1}^p} % product from j=1 to p + +% linear algebra +\newcommand{\one}{\bm{1}} % 1, unitvector +\newcommand{\zero}{\mathbf{0}} % 0-vector +\newcommand{\id}{\bm{I}} % I, identity +\newcommand{\diag}{\operatorname{diag}} % diag, diagonal +\newcommand{\trace}{\operatorname{tr}} % tr, trace +\newcommand{\spn}{\operatorname{span}} % span +\newcommand{\scp}[2]{\left\langle #1, #2 \right\rangle} % <.,.>, scalarproduct +\newcommand{\mat}[1]{\begin{pmatrix} #1 \end{pmatrix}} % short pmatrix command +\newcommand{\Amat}{\mathbf{A}} % matrix A +\newcommand{\Deltab}{\mathbf{\Delta}} % error term for vectors + +% basic probability + stats +\renewcommand{\P}{\mathds{P}} % P, probability +\newcommand{\E}{\mathds{E}} % E, expectation +\newcommand{\var}{\mathsf{Var}} % Var, variance +\newcommand{\cov}{\mathsf{Cov}} % Cov, covariance +\newcommand{\corr}{\mathsf{Corr}} % Corr, correlation +\newcommand{\normal}{\mathcal{N}} % N of the normal distribution +\newcommand{\iid}{\overset{i.i.d}{\sim}} % dist with i.i.d superscript +\newcommand{\distas}[1]{\overset{#1}{\sim}} % ... is distributed as ... 
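+
+% Example usage (illustrative only, not part of the maintained macro set): with
+% the basic-math macros above loaded, one might write, e.g.,
+%   $X_1, \ldots, X_n \iid \normal(\mu, \sigma^2)$, $\hat{\mu} = \meanin X_i$,
+%   $\argminlim_{x \in \R} f(x)$, $\E(X) = \mu$, $\var(X) = \sigma^2$.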
+ +% ------------- basic-ml.tex ------------- + +% machine learning +\newcommand{\Xspace}{\mathcal{X}} % X, input space +\newcommand{\Yspace}{\mathcal{Y}} % Y, output space +\newcommand{\Zspace}{\mathcal{Z}} % Z, space of sampled datapoints +\newcommand{\nset}{\{1, \ldots, n\}} % set from 1 to n +\newcommand{\pset}{\{1, \ldots, p\}} % set from 1 to p +\newcommand{\gset}{\{1, \ldots, g\}} % set from 1 to g +\newcommand{\Pxy}{\mathbb{P}_{xy}} % P_xy +\newcommand{\Exy}{\mathbb{E}_{xy}} % E_xy: Expectation over random variables xy +\newcommand{\xv}{\mathbf{x}} % vector x (bold) +\newcommand{\xtil}{\tilde{\mathbf{x}}} % vector x-tilde (bold) +\newcommand{\yv}{\mathbf{y}} % vector y (bold) +\newcommand{\xy}{(\xv, y)} % observation (x, y) +\newcommand{\xvec}{\left(x_1, \ldots, x_p\right)^\top} % (x1, ..., xp) +\newcommand{\Xmat}{\mathbf{X}} % Design matrix +\newcommand{\allDatasets}{\mathds{D}} % The set of all datasets +\newcommand{\allDatasetsn}{\mathds{D}_n} % The set of all datasets of size n +\newcommand{\D}{\mathcal{D}} % D, data +\newcommand{\Dn}{\D_n} % D_n, data of size n +\newcommand{\Dtrain}{\mathcal{D}_{\text{train}}} % D_train, training set +\newcommand{\Dtest}{\mathcal{D}_{\text{test}}} % D_test, test set +\newcommand{\xyi}[1][i]{\left(\xv^{(#1)}, y^{(#1)}\right)} % (x^i, y^i), i-th observation +\newcommand{\Dset}{\left( \xyi[1], \ldots, \xyi[n]\right)} % {(x1,y1)), ..., (xn,yn)}, data +\newcommand{\defAllDatasetsn}{(\Xspace \times \Yspace)^n} % Def. of the set of all datasets of size n +\newcommand{\defAllDatasets}{\bigcup_{n \in \N}(\Xspace \times \Yspace)^n} % Def. of the set of all datasets +\newcommand{\xdat}{\left\{ \xv^{(1)}, \ldots, \xv^{(n)}\right\}} % {x1, ..., xn}, input data +\newcommand{\ydat}{\left\{ \yv^{(1)}, \ldots, \yv^{(n)}\right\}} % {y1, ..., yn}, input data +\newcommand{\yvec}{\left(y^{(1)}, \hdots, y^{(n)}\right)^\top} % (y1, ..., yn), vector of outcomes +\newcommand{\greekxi}{\xi} % Greek letter xi +\renewcommand{\xi}[1][i]{\xv^{(#1)}} % x^i, i-th observed value of x +\newcommand{\yi}[1][i]{y^{(#1)}} % y^i, i-th observed value of y +\newcommand{\xivec}{\left(x^{(i)}_1, \ldots, x^{(i)}_p\right)^\top} % (x1^i, ..., xp^i), i-th observation vector +\newcommand{\xj}{\xv_j} % x_j, j-th feature +\newcommand{\xjvec}{\left(x^{(1)}_j, \ldots, x^{(n)}_j\right)^\top} % (x^1_j, ..., x^n_j), j-th feature vector +\newcommand{\phiv}{\mathbf{\phi}} % Basis transformation function phi +\newcommand{\phixi}{\mathbf{\phi}^{(i)}} % Basis transformation of xi: phi^i := phi(xi) + +%%%%%% ml - models general +\newcommand{\lamv}{\bm{\lambda}} % lambda vector, hyperconfiguration vector +\newcommand{\Lam}{\bm{\Lambda}} % Lambda, space of all hpos +% Inducer / Inducing algorithm +\newcommand{\preimageInducer}{\left(\defAllDatasets\right)\times\Lam} % Set of all datasets times the hyperparameter space +\newcommand{\preimageInducerShort}{\allDatasets\times\Lam} % Set of all datasets times the hyperparameter space +% Inducer / Inducing algorithm +\newcommand{\ind}{\mathcal{I}} % Inducer, inducing algorithm, learning algorithm + +% continuous prediction function f +\newcommand{\ftrue}{f_{\text{true}}} % True underlying function (if a statistical model is assumed) +\newcommand{\ftruex}{\ftrue(\xv)} % True underlying function (if a statistical model is assumed) +\newcommand{\fx}{f(\xv)} % f(x), continuous prediction function +\newcommand{\fdomains}{f: \Xspace \rightarrow \R^g} % f with domain and co-domain +\newcommand{\Hspace}{\mathcal{H}} % hypothesis space where f is from 
+\newcommand{\fbayes}{f^{\ast}} % Bayes-optimal model +\newcommand{\fxbayes}{f^{\ast}(\xv)} % Bayes-optimal model +\newcommand{\fkx}[1][k]{f_{#1}(\xv)} % f_j(x), discriminant component function +\newcommand{\fh}{\hat{f}} % f hat, estimated prediction function +\newcommand{\fxh}{\fh(\xv)} % fhat(x) +\newcommand{\fxt}{f(\xv ~|~ \thetav)} % f(x | theta) +\newcommand{\fxi}{f\left(\xv^{(i)}\right)} % f(x^(i)) +\newcommand{\fxih}{\hat{f}\left(\xv^{(i)}\right)} % f(x^(i)) +\newcommand{\fxit}{f\left(\xv^{(i)} ~|~ \thetav\right)} % f(x^(i) | theta) +\newcommand{\fhD}{\fh_{\D}} % fhat_D, estimate of f based on D +\newcommand{\fhDtrain}{\fh_{\Dtrain}} % fhat_Dtrain, estimate of f based on D +\newcommand{\fhDnlam}{\fh_{\Dn, \lamv}} %model learned on Dn with hp lambda +\newcommand{\fhDlam}{\fh_{\D, \lamv}} %model learned on D with hp lambda +\newcommand{\fhDnlams}{\fh_{\Dn, \lamv^\ast}} %model learned on Dn with optimal hp lambda +\newcommand{\fhDlams}{\fh_{\D, \lamv^\ast}} %model learned on D with optimal hp lambda + +% discrete prediction function h +\newcommand{\hx}{h(\xv)} % h(x), discrete prediction function +\newcommand{\hh}{\hat{h}} % h hat +\newcommand{\hxh}{\hat{h}(\xv)} % hhat(x) +\newcommand{\hxt}{h(\xv | \thetav)} % h(x | theta) +\newcommand{\hxi}{h\left(\xi\right)} % h(x^(i)) +\newcommand{\hxit}{h\left(\xi ~|~ \thetav\right)} % h(x^(i) | theta) +\newcommand{\hbayes}{h^{\ast}} % Bayes-optimal classification model +\newcommand{\hxbayes}{h^{\ast}(\xv)} % Bayes-optimal classification model + +% yhat +\newcommand{\yh}{\hat{y}} % yhat for prediction of target +\newcommand{\yih}{\hat{y}^{(i)}} % yhat^(i) for prediction of ith targiet +\newcommand{\resi}{\yi- \yih} + +% theta +\newcommand{\thetah}{\hat{\theta}} % theta hat +\newcommand{\thetav}{\bm{\theta}} % theta vector +\newcommand{\thetavh}{\bm{\hat\theta}} % theta vector hat +\newcommand{\thetat}[1][t]{\thetav^{[#1]}} % theta^[t] in optimization +\newcommand{\thetatn}[1][t]{\thetav^{[#1 +1]}} % theta^[t+1] in optimization +\newcommand{\thetahDnlam}{\thetavh_{\Dn, \lamv}} %theta learned on Dn with hp lambda +\newcommand{\thetahDlam}{\thetavh_{\D, \lamv}} %theta learned on D with hp lambda +\newcommand{\mint}{\min_{\thetav \in \Theta}} % min problem theta +\newcommand{\argmint}{\argmin_{\thetav \in \Theta}} % argmin theta + +% densities + probabilities +% pdf of x +\newcommand{\pdf}{p} % p +\newcommand{\pdfx}{p(\xv)} % p(x) +\newcommand{\pixt}{\pi(\xv~|~ \thetav)} % pi(x|theta), pdf of x given theta +\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetav\right)} % pi(x^i|theta), pdf of x given theta +\newcommand{\pixii}[1][i]{\pi\left(\xi[#1]\right)} % pi(x^i), pdf of i-th x + +% pdf of (x, y) +\newcommand{\pdfxy}{p(\xv,y)} % p(x, y) +\newcommand{\pdfxyt}{p(\xv, y ~|~ \thetav)} % p(x, y | theta) +\newcommand{\pdfxyit}{p\left(\xi, \yi ~|~ \thetav\right)} % p(x^(i), y^(i) | theta) + +% pdf of x given y +\newcommand{\pdfxyk}[1][k]{p(\xv | y= #1)} % p(x | y = k) +\newcommand{\lpdfxyk}[1][k]{\log p(\xv | y= #1)} % log p(x | y = k) +\newcommand{\pdfxiyk}[1][k]{p\left(\xi | y= #1 \right)} % p(x^i | y = k) + +% prior probabilities +\newcommand{\pik}[1][k]{\pi_{#1}} % pi_k, prior +\newcommand{\pih}{\hat{\pi}} % pi hat, estimated prior (binary classification) +\newcommand{\pikh}[1][k]{\hat{\pi}_{#1}} % pi_k hat, estimated prior +\newcommand{\lpik}[1][k]{\log \pi_{#1}} % log pi_k, log of the prior +\newcommand{\pit}{\pi(\thetav)} % Prior probability of parameter theta + +% posterior probabilities +\newcommand{\post}{\P(y = 1 ~|~ \xv)} % P(y = 1 | x), 
post. prob for y=1 +\newcommand{\postk}[1][k]{\P(y = #1 ~|~ \xv)} % P(y = k | y), post. prob for y=k +\newcommand{\pidomains}{\pi: \Xspace \rightarrow \unitint} % pi with domain and co-domain +\newcommand{\pibayes}{\pi^{\ast}} % Bayes-optimal classification model +\newcommand{\pixbayes}{\pi^{\ast}(\xv)} % Bayes-optimal classification model +\newcommand{\pix}{\pi(\xv)} % pi(x), P(y = 1 | x) +\newcommand{\piv}{\bm{\pi}} % pi, bold, as vector +\newcommand{\pikx}[1][k]{\pi_{#1}(\xv)} % pi_k(x), P(y = k | x) +\newcommand{\pikxt}[1][k]{\pi_{#1}(\xv ~|~ \thetav)} % pi_k(x | theta), P(y = k | x, theta) +\newcommand{\pixh}{\hat \pi(\xv)} % pi(x) hat, P(y = 1 | x) hat +\newcommand{\pikxh}[1][k]{\hat \pi_{#1}(\xv)} % pi_k(x) hat, P(y = k | x) hat +\newcommand{\pixih}{\hat \pi(\xi)} % pi(x^(i)) with hat +\newcommand{\pikxih}[1][k]{\hat \pi_{#1}(\xi)} % pi_k(x^(i)) with hat +\newcommand{\pdfygxt}{p(y ~|~\xv, \thetav)} % p(y | x, theta) +\newcommand{\pdfyigxit}{p\left(\yi ~|~\xi, \thetav\right)} % p(y^i |x^i, theta) +\newcommand{\lpdfygxt}{\log \pdfygxt } % log p(y | x, theta) +\newcommand{\lpdfyigxit}{\log \pdfyigxit} % log p(y^i |x^i, theta) + +% probababilistic +\newcommand{\bayesrulek}[1][k]{\frac{\P(\xv | y= #1) \P(y= #1)}{\P(\xv)}} % Bayes rule +\newcommand{\muv}{\bm{\mu}} % expectation vector of Gaussian +\newcommand{\muk}[1][k]{\bm{\mu_{#1}}} % mean vector of class-k Gaussian (discr analysis) +\newcommand{\mukh}[1][k]{\bm{\hat{\mu}_{#1}}} % estimated mean vector of class-k Gaussian (discr analysis) + +% residual and margin +\newcommand{\eps}{\epsilon} % residual, stochastic +\newcommand{\epsv}{\bm{\epsilon}} % residual, stochastic, as vector +\newcommand{\epsi}{\epsilon^{(i)}} % epsilon^i, residual, stochastic +\newcommand{\epsh}{\hat{\epsilon}} % residual, estimated +\newcommand{\epsvh}{\hat{\epsv}} % residual, estimated, vector +\newcommand{\yf}{y \fx} % y f(x), margin +\newcommand{\yfi}{\yi \fxi} % y^i f(x^i), margin +\newcommand{\Sigmah}{\hat \Sigma} % estimated covariance matrix +\newcommand{\Sigmahj}{\hat \Sigma_j} % estimated covariance matrix for the j-th class + +% ml - loss, risk, likelihood +\newcommand{\Lyf}{L\left(y, f\right)} % L(y, f), loss function +\newcommand{\Lypi}{L\left(y, \pi\right)} % L(y, pi), loss function +\newcommand{\Lxy}{L\left(y, \fx\right)} % L(y, f(x)), loss function +\newcommand{\Lxyi}{L\left(\yi, \fxi\right)} % loss of observation +\newcommand{\Lxyt}{L\left(y, \fxt\right)} % loss with f parameterized +\newcommand{\Lxyit}{L\left(\yi, \fxit\right)} % loss of observation with f parameterized +\newcommand{\Lxym}{L\left(\yi, f\left(\bm{\tilde{x}}^{(i)} ~|~ \thetav\right)\right)} % loss of observation with f parameterized +\newcommand{\Lpixy}{L\left(y, \pix\right)} % loss in classification +\newcommand{\Lpiy}{L\left(y, \pi\right)} % loss in classification +\newcommand{\Lpiv}{L\left(y, \piv\right)} % loss in classification +\newcommand{\Lpixyi}{L\left(\yi, \pixii\right)} % loss of observation in classification +\newcommand{\Lpixyt}{L\left(y, \pixt\right)} % loss with pi parameterized +\newcommand{\Lpixyit}{L\left(\yi, \pixit\right)} % loss of observation with pi parameterized +\newcommand{\Lhy}{L\left(y, h\right)} % L(y, h), loss function on discrete classes +\newcommand{\Lhxy}{L\left(y, \hx\right)} % L(y, h(x)), loss function on discrete classes +\newcommand{\Lr}{L\left(r\right)} % L(r), loss defined on residual (reg) / margin (classif) +\newcommand{\lone}{|y - \fx|} % L1 loss +\newcommand{\ltwo}{\left(y - \fx\right)^2} % L2 loss +\newcommand{\lbernoullimp}{\ln(1 + 
\exp(-y \cdot \fx))} % Bernoulli loss for -1, +1 encoding +\newcommand{\lbernoullizo}{- y \cdot \fx + \log(1 + \exp(\fx))} % Bernoulli loss for 0, 1 encoding +\newcommand{\lcrossent}{- y \log \left(\pix\right) - (1 - y) \log \left(1 - \pix\right)} % cross-entropy loss +\newcommand{\lbrier}{\left(\pix - y \right)^2} % Brier score +\newcommand{\risk}{\mathcal{R}} % R, risk +\newcommand{\riskbayes}{\mathcal{R}^\ast} +\newcommand{\riskf}{\risk(f)} % R(f), risk +\newcommand{\riskdef}{\E_{y|\xv}\left(\Lxy \right)} % risk def (expected loss) +\newcommand{\riskt}{\mathcal{R}(\thetav)} % R(theta), risk +\newcommand{\riske}{\mathcal{R}_{\text{emp}}} % R_emp, empirical risk w/o factor 1 / n +\newcommand{\riskeb}{\bar{\mathcal{R}}_{\text{emp}}} % R_emp, empirical risk w/ factor 1 / n +\newcommand{\riskef}{\riske(f)} % R_emp(f) +\newcommand{\risket}{\mathcal{R}_{\text{emp}}(\thetav)} % R_emp(theta) +\newcommand{\riskr}{\mathcal{R}_{\text{reg}}} % R_reg, regularized risk +\newcommand{\riskrt}{\mathcal{R}_{\text{reg}}(\thetav)} % R_reg(theta) +\newcommand{\riskrf}{\riskr(f)} % R_reg(f) +\newcommand{\riskrth}{\hat{\mathcal{R}}_{\text{reg}}(\thetav)} % hat R_reg(theta) +\newcommand{\risketh}{\hat{\mathcal{R}}_{\text{emp}}(\thetav)} % hat R_emp(theta) +\newcommand{\LL}{\mathcal{L}} % L, likelihood +\newcommand{\LLt}{\mathcal{L}(\thetav)} % L(theta), likelihood +\newcommand{\LLtx}{\mathcal{L}(\thetav | \xv)} % L(theta|x), likelihood +\newcommand{\logl}{\ell} % l, log-likelihood +\newcommand{\loglt}{\logl(\thetav)} % l(theta), log-likelihood +\newcommand{\logltx}{\logl(\thetav | \xv)} % l(theta|x), log-likelihood +\newcommand{\errtrain}{\text{err}_{\text{train}}} % training error +\newcommand{\errtest}{\text{err}_{\text{test}}} % test error +\newcommand{\errexp}{\overline{\text{err}_{\text{test}}}} % avg training error + +% lm +\newcommand{\thx}{\thetav^\top \xv} % linear model +\newcommand{\olsest}{(\Xmat^\top \Xmat)^{-1} \Xmat^\top \yv} % OLS estimator in LM + +% ------------- ml-ensembles.tex ------------- + +% ml - bagging, random forest +\newcommand{\bl}[1][m]{b^{[#1]}} % baselearner, default m +\newcommand{\blh}[1][m]{\hat{b}^{[#1]}} % estimated base learner, default m +\newcommand{\blx}[1][m]{b^{[#1]}(\xv)} % baselearner, default m +\newcommand{\blf}[1][m]{f^{[#1]}} % baselearner: scores, default m +\newcommand{\blfh}[1][m]{\hat{f}^{[#1]}} % estimated baselearner: scores, default m +\newcommand{\blfhx}[1][m]{\hat{f}^{[#1]}(\xv)} % estimated baselearner: scores of x, default m +\newcommand{\bll}[1][m]{h^{[#1]}} % baselearner: hard labels, default m +\newcommand{\bllh}[1][m]{\hat{h}^{[#1]}} % estimated baselearner: hard labels, default m +\newcommand{\bllhx}[1][m]{\hat{h}^{[#1]}(\xv)} % estimated baselearner: hard labels of x, default m +\newcommand{\blp}[1][m]{\pi^{[#1]}} % baselearner: probabilities, default m +\newcommand{\blph}[1][m]{\hat{\pi}^{[#1]}} % estimated baselearner: probabilities, default m +\newcommand{\blphxk}[1][m]{\hat{\pi}_{k}^{[#1]}(\xv)} % estimated baselearner: probabilities of x for class k, default m +\newcommand{\fM}{f^{[M]}(\xv)} % ensembled predictor +\newcommand{\fMh}{\hat f^{[M]}(\xv)} % estimated ensembled predictor +\newcommand{\ambifM}{\Delta\left(\fM\right)} % ambiguity/instability of ensemble +\newcommand{\betam}[1][m]{\beta^{[#1]}} % weight of basemodel m +\newcommand{\betamh}[1][m]{\hat{\beta}^{[#1]}} % weight of basemodel m with hat +\newcommand{\betaM}{\beta^{[M]}} % last baselearner +\newcommand{\ib}{\mathrm{IB}} % In-Bag (IB) +\newcommand{\ibm}{\ib^{[m]}} % In-Bag 
(IB) for m-th bootstrap +\newcommand{\oob}{\mathrm{OOB}} % Out-of-Bag (OOB) +\newcommand{\oobm}{\oob^{[m]}} % Out-of-Bag (OOB) for m-th bootstrap + +% ml - boosting +\newcommand{\fm}[1][m]{f^{[#1]}} % prediction in iteration m +\newcommand{\fmh}[1][m]{\hat{f}^{[#1]}} % prediction in iteration m +\newcommand{\fmd}[1][m]{f^{[#1-1]}} % prediction m-1 +\newcommand{\fmdh}[1][m]{\hat{f}^{[#1-1]}} % prediction m-1 +\newcommand{\errm}[1][m]{\text{err}^{[#1]}} % weighted in-sample misclassification rate +\newcommand{\wm}[1][m]{w^{[#1]}} % weight vector of basemodel m +\newcommand{\wmi}[1][m]{w^{[#1](i)}} % weight of obs i of basemodel m +\newcommand{\thetam}[1][m]{\thetav^{[#1]}} % parameters of basemodel m +\newcommand{\thetamh}[1][m]{\hat{\thetav}^{[#1]}} % parameters of basemodel m with hat +\newcommand{\blxt}[1][m]{b(\xv, \thetav^{[#1]})} % baselearner, default m +\newcommand{\ens}{\sum_{m=1}^M \betam \blxt} % ensemble +\newcommand{\rmm}[1][m]{\tilde{r}^{[#1]}} % pseudo residuals +\newcommand{\rmi}[1][m]{\tilde{r}^{[#1](i)}} % pseudo residuals +\newcommand{\Rtm}[1][m]{R_{t}^{[#1]}} % terminal-region +\newcommand{\Tm}[1][m]{T^{[#1]}} % terminal-region +\newcommand{\ctm}[1][m]{c_t^{[#1]}} % mean, terminal-regions +\newcommand{\ctmh}[1][m]{\hat{c}_t^{[#1]}} % mean, terminal-regions with hat +\newcommand{\ctmt}[1][m]{\tilde{c}_t^{[#1]}} % mean, terminal-regions +\newcommand{\Lp}{L^\prime} +\newcommand{\Ldp}{L^{\prime\prime}} +\newcommand{\Lpleft}{\Lp_{\text{left}}} + +% ml - boosting iml lecture +\newcommand{\ts}{\thetav^{\star}} % theta* +\newcommand{\bljt}{\bl[j](\xv, \thetav)} % BL j with theta +\newcommand{\bljts}{\bl[j](\xv, \ts)} % BL j with theta* + +% ------------- ml-eval.tex ------------- + +% resampling +\newcommand{\ntest}{n_{\mathrm{test}}} % size of the test set +\newcommand{\ntrain}{n_{\mathrm{train}}} % size of the train set +\newcommand{\ntesti}[1][i]{n_{\mathrm{test},#1}} % size of the i-th test set +\newcommand{\ntraini}[1][i]{n_{\mathrm{train},#1}} % size of the i-th train set +\newcommand{\Jtrain}{J_\mathrm{train}} % index vector train data +\newcommand{\Jtest}{J_\mathrm{test}} % index vector test data +\newcommand{\Jtraini}[1][i]{J_{\mathrm{train},#1}} % index vector i-th train dataset +\newcommand{\Jtesti}[1][i]{J_{\mathrm{test},#1}} % index vector i-th test dataset +\newcommand{\Dtraini}[1][i]{\mathcal{D}_{\text{train},#1}} % D_train,i, i-th training set +\newcommand{\Dtesti}[1][i]{\mathcal{D}_{\text{test},#1}} % D_test,i, i-th test set + +\newcommand{\JSpace}[1][m]{\nset^{#1}} % space of train indices of size n_train +\newcommand{\JtrainSpace}{\nset^{\ntrain}} % space of train indices of size n_train +\newcommand{\JtestSpace}{\nset^{\ntest}} % space of train indices of size n_test +\newcommand{\yJ}[1][J]{\yv_{#1}} % output vector associated to index J +\newcommand{\yJDef}{\left(y^{(J^{(1)})},\dots,y^{(J^{(m)})}\right)} % def of the output vector associated to index J +\newcommand{\JJ}{\mathcal{J}} % cali-J, set of all splits +\newcommand{\JJset}{\left((\Jtraini[1], \Jtesti[1]),\dots,(\Jtraini[B], \Jtesti[B])\right)} % (Jtrain_1,Jtest_1) ...(Jtrain_B,Jtest_B) +\newcommand{\Itrainlam}{\ind(\Dtrain, \lamv)} +% Generalization error +\newcommand{\GE}{\mathrm{GE}} % GE +\newcommand{\GEh}{\widehat{\GE}} % GE-hat +\newcommand{\GEfull}[1][\ntrain]{\GE(\ind, \lamv, #1, \rho)} % GE full +\newcommand{\GEhholdout}{\GEh_{\Jtrain, \Jtest}(\ind, \lamv, |\Jtrain|, \rho)} % GE hat holdout +\newcommand{\GEhholdouti}[1][i]{\GEh_{\Jtraini[#1], \Jtesti[#1]}(\ind, \lamv, |\Jtraini[#1]|, \rho)} % 
GE hat holdout i-th set +\newcommand{\GEhlam}{\GEh(\lamv)} % GE-hat(lam) +\newcommand{\GEhlamsubIJrho}{\GEh_{\ind, \JJ, \rho}(\lamv)} % GE-hat_I,J,rho(lam) +\newcommand{\GEhresa}{\GEh(\ind, \JJ, \rho, \lamv)} % GE-hat_I,J,rho(lam) +\newcommand{\GErhoDef}{\lim_{\ntest\rightarrow\infty} \E_{\Dtrain,\Dtest \sim \Pxy} \left[ \rho\left(\yv_{\Jtest}, \FJtestftrain\right)\right]} % GE formal def +\newcommand{\agr}{\mathrm{agr}} % aggregate function +\newcommand{\GEf}{\GE\left(\fh\right)} % GE of a fitted model +\newcommand{\GEfh}{\GEh\left(\fh\right)} % GEh of a fitted model +\newcommand{\GEfL}{\GE\left(\fh, L\right)} % GE of a fitted model wrt loss L +\newcommand{\Lyfhx}{L\left(y, \hat{f}(\xv)\right)} % pointwise loss of fitted model +\newcommand{\GEnf}[1]{GE_n\left(\fh_{#1}\right)} % GE of a fitted model +\newcommand{\GEind}{GE_n\left(\ind_{L, O}\right)} % GE of inducer +\newcommand{\GED}{\GE_{\D}} % GE indexed with data +\newcommand{\EGEn}{EGE_n} % expected GE +\newcommand{\EDn}{\E_{|D| = n}} % expectation wrt data of size n + +% performance measure +\newcommand{\rhoL}{\rho_L} % perf. measure derived from pointwise loss +\newcommand{\F}{\bm{F}} % matrix of prediction scores +\newcommand{\Fi}[1][i]{\F^{(#1)}} % i-th row vector of the predscore mat +\newcommand{\FJ}[1][J]{\F_{#1}} % predscore mat idxvec J +\newcommand{\FJf}{\FJ[J,f]} % predscore mat idxvec J and model f +\newcommand{\FJtestfh}{\FJ[\Jtest, \fh]} % predscore mat idxvec Jtest and model f hat +\newcommand{\FJtestftrain}{\F_{\Jtest, \Itrainlam}} % predscore mat idxvec Jtest and model f +\newcommand{\FJtestftraini}[1][i]{\F_{\Jtesti[#1],\ind(\Dtraini[#1], \lamv)}} % predscore mat i-th idxvec Jtest and model f +\newcommand{\FJfDef}{\left(f(\xv^{(J^{(1)})}),\dots, f(\xv^{(J^{(m)})})\right)} % def of predscore mat idxvec J and model f +\newcommand{\preimageRho}{\bigcup_{m\in\N}\left(\Yspace^m\times\R^{m\times g}\right)} % Set of all datasets times HP space + +% ml - ROC +\newcommand{\np}{n_{+}} % no. of positive instances +\newcommand{\nn}{n_{-}} % no. 
of negative instances +\newcommand{\rn}{\pi_{-}} % proportion negative instances +\newcommand{\rp}{\pi_{+}} % proportion negative instances +% true/false pos/neg: +\newcommand{\tp}{\# \text{TP}} % true pos +\newcommand{\fap}{\# \text{FP}} % false pos (fp taken for partial derivs) +\newcommand{\tn}{\# \text{TN}} % true neg +\newcommand{\fan}{\# \text{FN}} % false neg + +% ------------- ml-feature-sel.tex ------------- + +% ml - feature selection + +\newcommand{\xjNull}{x_{j_0}} +\newcommand{\xjEins}{x_{j_1}} +\newcommand{\xl}{\mathbf{x}_l} +\newcommand{\pushcode}[1][1]{\hskip\dimexpr#1\algorithmicindent\relax} % IGNORE_NOTATION + +% ------------- ml-gp.tex ------------- + +% ml - Gaussian Process + +\newcommand{\fvec}{\left[f\left(\xi[1]\right), \dots, f\left(\xi[n]\right) \right]} % function vector +\newcommand{\fv}{\mathbf{f}} % function vector +\newcommand{\kv}{\mathbf{k}} % cov matrix partition +\newcommand{\kxxp}{k\left(\xv, \xv^{\prime} \right)} % cov of x, x' +\newcommand{\kxij}[2]{k\left(\xi, \xi[j] \right)} % cov of x_i, x_j +\newcommand{\mv}{\mathbf{m}} % GP mean vector +\newcommand{\Kmat}{\mathbf{K}} % GP cov matrix +\newcommand{\gaussmk}{\normal(\mv, \Kmat)} % Gaussian w/ mean vec, cov mat +\newcommand{\gp}{\mathcal{GP}\left(m(\xv), \kxxp \right)} % Gaussian Process Definition +\newcommand{\ls}{\ell} % length-scale +\newcommand{\sqexpkernel}{\exp \left(- \frac{\| \xv - \xv^{\prime} \|^2}{2 \ls^2} \right)} % squared exponential kernel + +% GP prediction +\newcommand{\fstarvec}{\left[f\left(\xi[1]_{\ast}\right), \dots, f\left(\xi[m]_{\ast}\right) \right]} % pred function vector +\newcommand{\kstar}{\kv_{\ast}} % cov of new obs with x +\newcommand{\kstarstar}{\kv_{\ast \ast}} % cov of new obs +\newcommand{\Kstar}{\Kmat_{\ast}} % cov mat of new obs with x +\newcommand{\Kstarstar}{\Kmat_{\ast \ast}} % cov mat of new obs +\newcommand{\preddistsingle}{f_{\ast} ~|~ \xv_{\ast}, \Xmat, \fv} % predictive distribution for single pred +\newcommand{\preddistdefsingle}{\normal(\kstar^\top\Kmat^{-1}\fv, \kstarstar - \kstar^\top \Kmat ^{-1}\kstar)} % Gaussian distribution for single pred +\newcommand{\preddist}{f_{\ast} ~|~ \Xmat_{\ast}, \Xmat, \fv} % predictive distribution +\newcommand{\preddistdef}{\normal(\Kstar^\top\Kmat^{-1}\fv, \Kstarstar - \Kstar^\top \Kmat ^{-1}\Kstar)} % Gaussian predictive distribution + +% ------------- ml-hpo.tex ------------- + +%%% HPO Basics +\newcommand{\Ilam}{\ind_{\lamv}} % inducer with HP +\newcommand{\LamS}{\tilde\Lam} % search space +\newcommand{\lami}[1][i]{\lamv^{(#1)}} % lambda i +\newcommand{\clam}{c(\lamv)} % c(lambda) +\newcommand{\clamh}{c(\lamh)} % c(lambda-hat) +\newcommand{\lams}{\lamv^{*}} % theoretical min of c +\newcommand{\lamh}{\hat{\lamv}} % returned lambda of HPO +\newcommand{\lamp}{\lamv^+} % proposed lambda +\newcommand{\clamp}{c(\lamp)} % c of proposed lambda +\newcommand{\archive}{\mathcal{A}} % archive +\newcommand{\archivet}[1][t]{\mathcal{A}^{[#1]}} % archive at time step t + +\newcommand{\tuner}{\mathcal{T}} % tuner +\newcommand{\tunerfull}{\tuner_{\ind,\LamS, \rho,\JJ}} % tuner with inducer, search space, perf measure, resampling strategy + +%%% Bayesian Opt +\newcommand{\chlam}{\hat{c}(\lamv)} % post mean of SM +\newcommand{\shlam}{\hat{\sigma}(\lamv)} % post sd of SM +\newcommand{\vhlam}{\hat{\sigma}^2(\lamv)} % post var of SM +\newcommand{\ulam}{u(\lamv)} % acquisition function +\newcommand{\lambdaopt}{\lambda^{*}} % minimum of the black box function Psi +\newcommand{\metadata}{\left\{\left(\lami, \Psi^{[i]}\right)\right\}} % 
metadata for the Gaussian process +\newcommand{\lamvec}{\left(\lambda^{[1]}, \dots, \lambda^{[\minit]}\right)} % vector of different inputs +\newcommand{\minit}{m_{\text{init}}} % size of the initial design + +%%% Multifidelity / Hyperband +\newcommand{\lambu}{\lambda_{\text{budget}}} % single lambda_budget component HP +\newcommand{\lamfid}{\lambda_{\text{fid}}} % single lambda fidelity +\newcommand{\lamfidl}{\lamfid^{\textrm{low}}} % single lambda fidelity lower +\newcommand{\lamfidu}{\lamfid^{\textrm{upp}}} % single lambda fidelity upper +\newcommand{\etahb}{\eta_{\text{HB}}} % HB multiplier eta + +% ------------- ml-infotheory.tex ------------- + +% basic info theory +\newcommand{\entx}{- \sum_{x \in \Xspace} p(x) \cdot \log p(x)} % entropy of x +\newcommand{\dentx}{- \int_{\Xspace} f(x) \cdot \log f(x) dx} % diff entropy of x +\newcommand{\jentxy}{- \sum_{x \in \Xspace} p(x, y) \cdot \log p(x, y)} % joint entropy of x, y +\newcommand{\jdentxy}{- \int_{\Xspace, \Yspace} f(x, y) \cdot \log f(x, y) dx dy} % joint diff entropy of x, y +\newcommand{\centyx}{- \sum_{x \in \Xspace} p(x) \sum_{y \in \Yspace} p(y|x) \cdot \log p(y|x)} % cond entropy y|x +\newcommand{\cdentyx}{- \int_{\Xspace, \Yspace} f(x, y) \cdot \log f(y | x) dx dy} % cond diff entropy y|x +\newcommand{\xentpq}{- \sum_{x \in \Xspace} p(x) \cdot \log q(x)} % cross-entropy of p, q +\newcommand{\kldpq}{D_{KL}(p \| q)} % KLD between p and q +\newcommand{\kldpqt}{D_{KL}(p \| q_{\thetav})} % KLD divergence between p and parameterized q +\newcommand{\explogpq}{\E_p \left[\log \frac{p(X)}{q(X)} \right]} % expected LLR of p, q (def KLD) +\newcommand{\sumlogpq}{\sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)}} % expected LLR of p, q (def KLD) + +% ------------- ml-interpretable.tex ------------- + +%%%%%% perturbed data +\newcommand{\pert}[3]{\ifthenelse{\equal{#2}{}}{\tilde{#1}}{\ifthenelse{\equal{#3}{}}{\tilde{#1}^{#2}}{\tilde{#1}^{#2|#3}}}} % command to express that for #1 the subset #2 was perturbed given subset #3 + +%%%%%% marginalized functions +\newcommand{\fj}{f_j} % marginal function f_j, depending on feature j +\newcommand{\fnj}{f_{-j}} % marginal function f_{-j}, depending on all features but j +\newcommand{\fS}{f_S} % marginal function f_S depending on feature set S +\newcommand{\fC}{f_C} % marginal function f_C depending on feature set C +\newcommand{\fhj}{\fh_j} % marginal function fh_j, depending on feature j +\newcommand{\fhnj}{\fh_{-j}} % marginal function fh_{-j}, depending on all features but j +\newcommand{\fhS}{\fh_S} % marginal function fh_S depending on feature set S +\newcommand{\fhC}{\fh_C} % marginal function fh_C depending on feature set C +\newcommand{\XSmat}{\Xmat_S} % Design matrix subset +\newcommand{\XCmat}{\Xmat_C} % Design matrix subset +\newcommand{\Xnj}{\Xmat_{-j}} % Design matrix subset -j = {1, .., j-1, j+1, ..., p} + +%%%%% ICE +\newcommand{\fhice}[1]{\fh_{#1,ICE}} % ICE function + +%%%%% Shapley values +\newcommand{\Scupj}{S \cup \{j\}} % coalition S but without player j +\newcommand{\Scupk}{S \cup \{k\}} % coalition S but without player k +\newcommand{\SsubP}{S \subseteq P} % coalition S subset of P +\newcommand{\SsubPnoj}{\SsubP \setminus \{j\}} % coalition S subset of P without player j +\newcommand{\SsubPnojk}{\SsubP \setminus \{j,k\}} % coalition S subset of P without player k +\newcommand{\phiij}{\hat{\phi}_j^{(i)}} % Shapley value for feature j and observation i + +%%%%% LIME +\newcommand{\Gspace}{\mathcal{G}} % Hypothesis space for surrogate model +\newcommand{\neigh}{\phi_{\xv}} 
% Proximity measure +\newcommand{\zv}{\mathbf{z}} % Sampled datapoints for surrogate +\newcommand{\Gower}{d_G} % Gower distance + + +% ------------- ml-mbo.tex ------------- + +\newcommand{\xvsi}[1][i]{\xv^{[#1]}} % x at iteration i +\newcommand{\ysi}[1][i]{y^{[#1]}} % y at iteration i +\newcommand{\Dt}[1][t]{\D^{[#1]}} % archive at iteration t +\newcommand{\Dts}{\Dt = \{(\xvsi, \ysi)\}_{i = 1, \ldots, t}} % archive at iteration t fully +%\newcommand{\fh}{\hat{s}} % surrogate mean +\newcommand{\sh}{\hat{s}} % surrogate se +\newcommand{\fmin}{f_{\min}} % current best + +% ------------- ml-multitarget.tex ------------- + +% multitarget notation +\newcommand{\Tspace}{\mathcal{T}} +\newcommand{\tv}{\mathbf{t}} +\newcommand{\tim}{\mathbf{t}^{(i)}_m} +\newcommand{\yim}{y^{(i)}_m} + +% ------------- ml-nn.tex ------------- + +% ml - NNs +\newcommand{\neurons}{z_1,\dots,z_M} % vector of neurons +\newcommand{\hidz}{\mathbf{z}} % vector of hidden activations +\newcommand{\biasb}{\mathbf{b}} % bias vector +\newcommand{\biasc}{c} % bias in output +\newcommand{\wtw}{\mathbf{w}} % weight vector (general) +\newcommand{\Wmat}{\mathbf{W}} % weight vector (general) +\newcommand{\wtu}{\mathbf{u}} % weight vector of output neuron + +% deeplearning - regularization +\newcommand{\Oreg}{\mathnormal{R}_{reg}(\theta|X,y)} % regularized objective function +\newcommand{\Ounreg}{\mathnormal{R}_{emp}(\theta|X,y)} % unconstrained objective function +\newcommand{\Pen}{\Omega(\theta)} % penalty +\newcommand{\Oregweight}{\mathnormal{R}_{reg}(w|X,y)} % regularized objective function with weight +\newcommand{\Oweight}{\mathnormal{R}_{emp}(w|X,y)} % unconstrained objective function with weight +\newcommand{\Oweighti}{\mathnormal{R}_{emp}(w_i|X,y)} % unconstrained objective function with weight w_i +\newcommand{\Oweightopt}{\mathnormal{J}(w^*|X,y)} % unconstrained objective function withoptimal weight +\newcommand{\Oopt}{\hat{\mathnormal{J}}(\theta|X,y)} % optimal objective function +\newcommand{\Odropout}{\mathnormal{J}(\theta, \mu|X,y)} % dropout objective function + +% deeplearning - optimization +\newcommand{\Loss}{L(y, f(\xv, \thetav))} +\newcommand{\Lmomentumnest}{L(\yi, f(x^{(i)}, \thetav + \varphi \nub))} % momentum risk +\newcommand{\Lmomentumtilde}{L(\yi, f(x^{(i)}, \tilde{\thetav}))} % Nesterov momentum risk +\newcommand{\Lmomentum}{L(\yi, f(x^{(i)}, \thetav))} +\newcommand{\Hess}{\mathbf{H}} +\newcommand{\nub}{\bm{\nu}} + +% deeplearning - autoencoders +\newcommand{\uauto}{L(x,g(f(x)))} % undercomplete autoencoder objective function +\newcommand{\dauto}{L(x,g(f(\tilde{x})))} % denoising autoencoder objective function + +% deeplearning - adversarials +\newcommand{\deltab}{\bm{\delta}} +\newcommand{\Lossdeltai}{L(\yi, f(\xi + \deltab|\thetav))} +\newcommand{\Lossdelta}{L(y, f(\xv + \deltab| \thetav))} + +% ------------- ml-online.tex ------------- + +\newcommand{\Aspace}{\mathcal{A}} +\newcommand{\norm}[1]{\left|\left|#1\right|\right|_2} +\newcommand{\llin}{L^{\texttt{lin}}} +\newcommand{\lzeroone}{L^{0-1}} +\newcommand{\lhinge}{L^{\texttt{hinge}}} +\newcommand{\lexphinge}{\widetilde{L^{\texttt{hinge}}}} +\newcommand{\lconv}{L^{\texttt{conv}}} +\newcommand{\FTL}{\texttt{FTL}} +\newcommand{\FTRL}{\texttt{FTRL}} +\newcommand{\OGD}{{\texttt{OGD}}} +\newcommand{\EWA}{{\texttt{EWA}}} +\newcommand{\REWA}{{\texttt{REWA}}} +\newcommand{\EXPthree}{{\texttt{EXP3}}} +\newcommand{\EXPthreep}{{\texttt{EXP3P}}} +\newcommand{\reg}{\psi} +\newcommand{\Algo}{\texttt{Algo}} + +% ------------- ml-regu.tex ------------- + +% \thetah is 
\hat{\theta}} (theta hat) +% \thetav is \bm{\theta}} (theta vector) +\newcommand{\thetas}{\thetav^*} % theta star +\newcommand{\thetaridge}{\thetav_{\mathrm{ridge}}} % theta (RIDGE) +\newcommand{\thetalasso}{\thetav_{\mathrm{LASSO}}} % theta (LASSO) +\newcommand{\thetaols}{\thetav_{\mathrm{OLS}}} % theta (RIDGE) + +% ------------- ml-survival.tex ------------- + +\newcommand{\Ti}[1][i]{T^{(#1)}} % ?? +\newcommand{\Ci}[1][i]{C^{(#1)}} % ?? +\newcommand{\oi}[1][i]{o^{(#1)}} % ?? +\newcommand{\ti}[1][i]{t^{(#1)}} % ?? +\newcommand{\deltai}[1][i]{\delta^{(#1)}} +\newcommand{\Lxdi}{L\left(\bm{\delta}, \fx\right)} + + +% ------------- ml-svm.tex ------------- + +% linear svm +\newcommand{\sv}{\operatorname{SV}} % supportvectors +\renewcommand{\sl}{\zeta} % slack variable +\newcommand{\slvec}{\left(\zeta^{(1)}, \zeta^{(n)}\right)} % slack variable vector +\newcommand{\sli}[1][i]{\zeta^{(#1)}} % i-th slack variable +\newcommand{\scptxi}{\scp{\thetav}{\xi}} % scalar prodct of theta and xi +\newcommand{\svmhplane}{\yi \left( \scp{\thetav}{\xi} + \theta_0 \right)} % SVM hyperplane (normalized) +\newcommand{\alphah}{\hat{\alpha}} % alpha-hat (basis fun coefficients) +\newcommand{\alphav}{\bm{\alpha}} % vector alpha (bold) (basis fun coefficients) +\newcommand{\alphavh}{\hat{\bm{\alpha}}} % vector alpha-hat (basis fun coefficients) +\newcommand{\dualobj}{\sumin \alpha_i - \frac{1}{2}\sumin \sumjn \alpha_i\alpha_j\yi \yi[j] \scp{\xi}{\xv^{(j)}}} % min objective in lin svm dual + +% nonlinear svm +\newcommand{\HS}{\Phi} % H, hilbertspace +\newcommand{\phix}{\phi(\xv)} % feature map x +\newcommand{\phixt}{\phi(\tilde \xv)} % feature map x tilde +\newcommand{\kxxt}{k(\xv, \tilde \xv)} % kernel fun x, x tilde +\newcommand{\scptxifm}{\scp{\thetav}{\phi(\xi)}} % scalar prodct of theta and xi + +% ------------- ml-trees.tex ------------- + +% ml - trees, extra trees + +\newcommand{\Np}{\mathcal{N}} % (Parent) node N +\newcommand{\Npk}{\Np_k} % node N_k +\newcommand{\Nl}{\Np_1} % Left node N_1 +\newcommand{\Nr}{\Np_2} % Right node N_2 +\newcommand{\pikN}[1][k]{\pi_#1^{(\Np)}} % class probability node N +\newcommand{\pikNh}[1][k]{\hat\pi_#1^{(\Np)}} % estimated class probability node N +\newcommand{\pikNlh}[1][k]{\hat\pi_#1^{(\Nl)}} % estimated class probability left node +\newcommand{\pikNrh}[1][k]{\hat\pi_#1^{(\Nr)}} % estimated class probability right node + diff --git a/latex-math/ml-eval.tex b/latex-math/ml-eval.tex index 01232669..0897eb81 100644 --- a/latex-math/ml-eval.tex +++ b/latex-math/ml-eval.tex @@ -56,7 +56,7 @@ \newcommand{\nn}{n_{-}} % no. 
of negative instances \newcommand{\rn}{\pi_{-}} % proportion negative instances \newcommand{\rp}{\pi_{+}} % proportion negative instances - % true/false pos/neg: +% true/false pos/neg: \newcommand{\tp}{\# \text{TP}} % true pos \newcommand{\fap}{\# \text{FP}} % false pos (fp taken for partial derivs) \newcommand{\tn}{\# \text{TN}} % true neg diff --git a/latex-math/ml-regu.tex b/latex-math/ml-regu.tex index 07559c73..35e2dfff 100644 --- a/latex-math/ml-regu.tex +++ b/latex-math/ml-regu.tex @@ -1,6 +1,6 @@ % \thetah is \hat{\theta}} (theta hat) % \thetav is \bm{\theta}} (theta vector) \newcommand{\thetas}{\thetav^*} % theta star -\newcommand{\thetaridge}{\thetav_{\mathrm{ridge}}}} % theta (RIDGE) -\newcommand{\thetalasso}{\thetav_{\mathrm{LASSO}}}} % theta (LASSO) -\newcommand{\thetaols}{\thetav_{\mathrm{OLS}}}} % theta (RIDGE) +\newcommand{\thetaridge}{\thetav_{\mathrm{ridge}}} % theta (RIDGE) +\newcommand{\thetalasso}{\thetav_{\mathrm{LASSO}}} % theta (LASSO) +\newcommand{\thetaols}{\thetav_{\mathrm{OLS}}} % theta (RIDGE) diff --git a/latex-math/ml-svm.tex b/latex-math/ml-svm.tex index 1f4d937f..b417a3f5 100644 --- a/latex-math/ml-svm.tex +++ b/latex-math/ml-svm.tex @@ -1,6 +1,8 @@ % linear svm \newcommand{\sv}{\operatorname{SV}} % supportvectors +\ifdefined\sl \renewcommand{\sl}{\zeta} % slack variable +\else \newcommand{\sl}{\zeta} \fi \newcommand{\slvec}{\left(\zeta^{(1)}, \zeta^{(n)}\right)} % slack variable vector \newcommand{\sli}[1][i]{\zeta^{(#1)}} % i-th slack variable \newcommand{\scptxi}{\scp{\thetav}{\xi}} % scalar prodct of theta and xi diff --git a/latex-math/preamble.tex b/latex-math/preamble.tex new file mode 100644 index 00000000..e25951e2 --- /dev/null +++ b/latex-math/preamble.tex @@ -0,0 +1,7 @@ +\usepackage{mathtools} +\usepackage{bm} +\usepackage{siunitx} +\usepackage{dsfont} +\usepackage{xspace} +\usepackage{longtable} +\usepackage{xifthen}
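+
+% A minimal sketch (the relative path `latex-math/` is an assumption and depends
+% on where this directory is vendored) of a standalone document that loads the
+% packages above plus the combined macro file instead of the individual topic files:
+%
+%   \documentclass{article}
+%   \usepackage{mathtools}
+%   \usepackage{amssymb}
+%   \usepackage{bm}
+%   \usepackage{siunitx}
+%   \usepackage{dsfont}
+%   \usepackage{xspace}
+%   \usepackage{xifthen}
+%   \input{latex-math/latex-math-combined.tex}
+%   \begin{document}
+%   $\risket = \sumin \Lxyit$
+%   \end{document}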