forked from avehtari/BDA_course_Aalto
-
Notifications
You must be signed in to change notification settings - Fork 0
/
slides_ch11.tex
1284 lines (1038 loc) · 37.9 KB
/
slides_ch11.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass[finnish,english,t]{beamer}
% \documentclass[finnish,english,handout]{beamer}
% Uncomment if want to show notes
% \setbeameroption{show notes}
\mode<presentation>
{
\usetheme{Warsaw}
}
\setbeamertemplate{itemize items}[circle]
\setbeamercolor{frametitle}{bg=white,fg=navyblue}
% \usepackage[pdftex]{graphicx}
%\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{times}
\usepackage{microtype}
\usepackage{epic,epsfig}
\usepackage{subfigure,float}
\usepackage{amsmath,amsfonts,amssymb}
% \usepackage{inputenc} % redundant: already loaded with [utf8] option above
\usepackage{babel}
\usepackage{afterpage}
\usepackage{url}
\urlstyle{same}
\usepackage{natbib}
\bibliographystyle{apalike}
% \definecolor{hutblue}{rgb}{0,0.2549,0.6784}
% \definecolor{midnightblue}{rgb}{0.0977,0.0977,0.4375}
% \definecolor{hutsilver}{rgb}{0.4863,0.4784,0.4784}
% \definecolor{lightgray}{rgb}{0.95,0.95,0.95}
% \definecolor{section}{rgb}{0,0.2549,0.6784}
% \definecolor{list1}{rgb}{0,0.2549,0.6784}
\definecolor{navyblue}{rgb}{0,0,0.5}
\renewcommand{\emph}[1]{\textcolor{navyblue}{#1}}
\definecolor{darkgreen}{rgb}{0,0.3922,0}
\graphicspath{{./figs/}}
\pdfinfo{
/Title (Bayesian data analysis, ch 11)
/Author (Aki Vehtari) %
/Keywords (Bayesian probability theory, Bayesian inference, Bayesian data analysis)
}
\parindent=0pt
\parskip=8pt
\tolerance=9000
\abovedisplayshortskip=2pt
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{headline}[default]{}
\setbeamertemplate{headline}[text line]{\insertsection}
\setbeamertemplate{footline}[frame number]
\def\o{{\mathbf o}}
\def\t{{\mathbf \theta}}
\def\w{{\mathbf w}}
\def\x{{\mathbf x}}
\def\y{{\mathbf y}}
\def\z{{\mathbf z}}
\def\eff{\mathrm{eff}}
\def\ESS{\mathrm{ESS}}
\DeclareMathOperator{\E}{E}
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\var}{var}
\DeclareMathOperator{\Sd}{Sd}
\DeclareMathOperator{\sd}{sd}
\DeclareMathOperator{\Gammad}{Gamma}
\DeclareMathOperator{\Invgamma}{Inv-gamma}
\DeclareMathOperator{\Bin}{Bin}
\DeclareMathOperator{\Negbin}{Neg-bin}
\DeclareMathOperator{\Poisson}{Poisson}
\DeclareMathOperator{\Beta}{Beta}
\DeclareMathOperator{\logit}{logit}
\DeclareMathOperator{\N}{N}
\DeclareMathOperator{\U}{U}
\DeclareMathOperator{\BF}{BF}
\DeclareMathOperator{\Invchi2}{Inv-\chi^2}
\DeclareMathOperator{\NInvchi2}{N-Inv-\chi^2}
\DeclareMathOperator{\InvWishart}{Inv-Wishart}
\DeclareMathOperator{\tr}{tr}
% \DeclareMathOperator{\Pr}{Pr}
\def\euro{{\footnotesize \EUR\, }}
\DeclareMathOperator{\rep}{\mathrm{rep}}
\title[]{Bayesian data analysis}
\subtitle{}
\author{Aki Vehtari}
\institute[Aalto]{}
\begin{document}
%http://elevanth.org/blog/2017/11/28/build-a-better-markov-chain/
\begin{frame}
{\Large\color{navyblue} Chapter 11}
\begin{itemize}
\item 11.1 Gibbs sampler
\item 11.2 Metropolis and Metropolis-Hastings
\item 11.3 Using Gibbs and Metropolis as building blocks
\item 11.4 Inference and assessing convergence (important)
\begin{itemize}
\item potential scale reduction $\widehat{R}$
\end{itemize}
\item 11.5 Effective number of simulation draws (important)
\begin{itemize}
\item effective sample size $S_{\mathrm{eff}}$
\end{itemize}
\item 11.6 Example: hierarchical normal model (quick glance)
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Chapter 11 demos}
\begin{itemize}
\item demo11\_1: Gibbs sampling
\item demo11\_2: Metropolis sampling
\item demo11\_3: Convergence of Markov chain
\item demo11\_4: split-$\widehat{R}$ and effective sample size $S_{\mathrm{eff}}$
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} It's all about expectations}
\vspace{-1.5\baselineskip}
\begin{align*}
E_{\color{blue} p(\theta|y)}[f(\theta)] & = \int f(\theta) {\color{blue} p(\theta|y)} d\theta,\\
\text{where} \quad
{\color{blue} p(\theta|y)} & = \frac{\color{darkgreen}p(y|\theta)p(\theta)}{\color{red} \int p(y|\theta)p(\theta) d\theta}
\end{align*}
\uncover<2->{We can easily evaluate ${\color{darkgreen} p(y|\theta)p(\theta)}$ for any $\theta$, but the integral ${\color{red} \int p(y|\theta)p(\theta) d\theta}$ is usually difficult.}
\uncover<3->{We can use the unnormalized posterior ${\color{darkgreen} q(\theta|y)
= p(y|\theta)p(\theta)}$, for example, in}
\begin{itemize}
\vspace{-0.5\baselineskip}
\item<4-> Grid (equal spacing) evaluation with self-normalization
\begin{align*}
E_{\color{blue} p(\theta|y)}[f(\theta)] \approx
\frac{\sum_{s=1}^S \left[f(\theta^{(s)}){\color{darkgreen}q(\theta^{(s)}|y)}\right]}{\sum_{s=1}^S{\color{darkgreen}q(\theta^{(s)}|y)}}
\end{align*}
\item<5-> Monte Carlo methods which can sample from
${\color{blue}p(\theta^{(s)}|y)}$ using only
${\color{darkgreen}q(\theta^{(s)}|y)}$
\vspace{-0.5\baselineskip}
\begin{align*}
E_{\color{blue} p(\theta|y)}[f(\theta)] \approx \frac{1}{S} \sum_{s=1}^S f(\theta^{(s)})
\end{align*}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Monte Carlo}
\begin{itemize}
\item Monte Carlo methods we have discussed so far
\begin{itemize}
\item Inverse CDF works for 1D
\item Analytic transformations work for only certain distributions
\item Factorization works only for certain joint distributions
\item Grid evaluation and sampling works in less than a few dimensions
\item Rejection sampling works mostly in 1D (truncation is a special case)
\item Importance sampling is reliable only in special cases
\end{itemize}
\pause
\item What to do in high dimensions?
\begin{itemize}
\item Markov chain Monte Carlo (Ch 11-12)
\item Laplace, Variational*, EP* (Ch 4,13*)
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Markov chain}
\begin{itemize}
\item<1-> Andrey Markov proved weak law of large numbers and central
limit theorem for certain dependent-random sequences, which were
later named Markov chains
\item<2-> The probability of each event depends only on the state
attained in the previous event (or finite number of previous events)
\item<3-> One of Markov's examples was the sequence of letters in
Pushkin's verse novel ``Yevgeniy Onegin''
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Markov chain}
\begin{itemize}
\item Example of a simple Markov chain on black board
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Markov chain Monte Carlo (MCMC)}
\begin{itemize}
\item Produce draws $\theta^{(t)}$ given $\theta^{(t-1)}$ from a
Markov chain, which has been constructed so that its equilibrium
distribution is $p(\theta|y)$
\begin{itemize}
\item<2->[+] generic
\item<2->[+] combine sequence of easier Monte Carlo draws to form a Markov chain
\item<3->[+] chain goes where most of the posterior mass is
\item<3->[+] asymptotically the chain spends $\alpha$\% of the time where
$\alpha$\% of the posterior mass is
\item<4->[+] central limit theorem holds for expectations
\item<5->[-] draws are dependent
\item<5->[-] construction of efficient Markov chains is not always
easy
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Markov chain}
\begin{itemize}
\item Set of random variables $\theta^1,\theta^2,\ldots$, so that
for all values of $t$, $\theta^t$ depends only on the previous $\theta^{t-1}$
\begin{equation*}
p(\theta^t|\theta^1,\ldots,\theta^{t-1})=p(\theta^t|\theta^{t-1})
\end{equation*}
\item Chain has to be initialized with some starting point $\theta^0$
\item Transition distribution $T_t(\theta^t|\theta^{t-1})$ (may
depend on $t$)
\item By choosing a suitable transition distribution, the
stationary distribution of Markov chain is $p(\theta|y)$
\end{itemize}
\end{frame}
% \note{S-38.143 Jonoteorian luentokalvoissa
% %<http://www.netlab.hut.fi/opetus/s38143/luennot/index.shtml> ja
% erityisesti luento 4 on hyvä tiivistelmä perustermeistä ja niiden
% suomennokset}
\begin{frame}
{\Large\color{navyblue} Gibbs sampling}
\begin{itemize}
\item Alternate sampling from 1D conditional distributions
\begin{itemize}
\item 1D is easy even if no conjugate prior and analytic posterior
\end{itemize}
\item<2-> demo11\_1\\
\vspace{-.5\baselineskip}
\begin{center}
\includegraphics[width=5cm]{Gibbs1.pdf}
\end{center}
\vspace{-.5\baselineskip}
\item<3-> Basic algorithm {
\begin{align*}
\text{sample $\theta_j^t$ from} \quad & p(\theta_j|\theta_{-j}^{t-1}, y),\\
\text{where} \quad
& \theta^{t-1}_{-j}= (\theta^t_1,\dots,\theta^t_{j-1},
\theta^{t-1}_{j+1},\dots,\theta^{t-1}_d)
\end{align*}
}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Gibbs sampling}
\vspace{-0.5\baselineskip}
\begin{itemize}
\item With {\it conditionally} conjugate priors, the sampling from
the conditional distributions is easy for wide range of models
\begin{itemize}
\item BUGS/WinBUGS/OpenBUGS/JAGS
\end{itemize}
\item<2-> No algorithm parameters to tune\\
(cf. proposal distribution in Metropolis algorithm)
\item<3-> For not so easy conditionals, use e.g. inverse-CDF
\item<4-> Several parameters can be updated in blocks ({\em blocking})
\item<5-> Slow if parameters are highly dependent in the posterior
\begin{itemize}
\item<5-> demo11\_1 continues\\
\vspace{-0.5\baselineskip}
\hspace{3.5cm}\includegraphics[width=5cm]{Gibbs2.pdf}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Conditional vs joint}
\begin{itemize}
\item How about sampling $\theta$ jointly?
\begin{itemize}
\item e.g. it is easy to sample from multivariate normal
\end{itemize}
\item<2-> Can we use that to form a Markov chain?\\
{\small \url{http://elevanth.org/blog/2017/11/28/build-a-better-markov-chain/}}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Metropolis algorithm}
\begin{itemize}
\item Algorithm
\begin{itemize}
\item[1.] starting point $\theta^0$
\item[2.] $t=1,2,\ldots$
\begin{itemize}
\item[(a)] pick a proposal $\theta^{*}$ from the proposal distribution
$J_t(\theta^{*}|\theta^{t-1})$. \\
Proposal distribution has to be symmetric, i.e.\\
$J_t(\theta_a|\theta_b)=J_t(\theta_b|\theta_a)$, for all
$\theta_a,\theta_b$
\item<2->[(b)] calculate acceptance ratio
\begin{equation*}
r=\frac{p(\theta^{*}|y)}{p(\theta^{t-1}|y)}
\end{equation*}
\vspace{-6mm}
\item<3->[(c)] set
\begin{equation*}
\theta^t=
\begin{cases}
\theta^{*} & \text{with probability $\min(r,1)$}\\
\theta^{t-1} & \text{otherwise}
\end{cases}
\end{equation*}
\uncover<4>{
i.e., if $p(\theta^{*}|y)>p(\theta^{t-1}|y)$ always accept the proposal \\
\hspace{0.4cm}and otherwise accept the proposal with probability $r$}
\end{itemize}
\vspace{-1.5\baselineskip}
\item<5-> rejection of a proposal also increments the time $t$ by one\\
i.e., the new state is the same as the previous one
\item<6-> step c is executed by generating a random number from
$\U(0,1)$
\item<7-> $p(\theta^*|y)$ and $p(\theta^{t-1}|y)$ have the same
normalization terms, and thus instead of $p(\cdot|y)$,
unnormalized $q(\cdot|y)$ can be used, as the normalization
terms cancel out!
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Metropolis algorithm}
\begin{itemize}
\item Example: one bivariate observation $(y_1,y_2)$
\begin{itemize}
\item bivariate normal distribution with unknown mean and known
covariance
\begin{equation*}
\left.
\begin{pmatrix}
\theta_1\\
\theta_2
\end{pmatrix}
\right| y \sim
\N\left(
\begin{pmatrix}
y_1\\
y_2
\end{pmatrix},
\begin{pmatrix}
1 & \rho\\
\rho & 1
\end{pmatrix}
\right)
\end{equation*}
\item proposal distribution
$J_t(\theta^{*}|\theta^{t-1})=\N(\theta^{*}|\theta^{t-1},\sigma_p^2)$
\end{itemize}
\item Demo {\small \url{http://elevanth.org/blog/2017/11/28/build-a-better-markov-chain/}}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Why Metropolis algorithm works}
\begin{itemize}
\item Intuitively more draws from the higher density areas as
jumps to higher density are always accepted and only some of the
jumps to the lower density are accepted
\vspace{5mm}
\pause
\item Theoretically
\begin{itemize}
\item[1.] Prove that simulated series is a Markov chain
which has unique stationary distribution
\item[2.] Prove that this stationary distribution is the desired target distribution
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Why Metropolis algorithm works}
\begin{itemize}
\item[1.] Prove that simulated series is a Markov chain
which has unique stationary distribution
\begin{itemize}
\item[a)] irreducible
\begin{itemize}
\item<2->[=] positive probability of eventually reaching any
state from any other state
\end{itemize}
\item[b)] aperiodic
\begin{itemize}
\item<3->[=] return times are not periodic
\item<3->[-] holds for a random walk on any proper distribution (except for trivial exceptions)
\end{itemize}
\item[c)] recurrent / not transient
\begin{itemize}
\item<4->[=] probability to return to a state $i$ is 1
\item<4->[-] holds for a random walk on any proper distribution (except for trivial exceptions)
\end{itemize}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Why Metropolis algorithm works}
\begin{itemize}
\item[2.] Prove that this stationary distribution is the desired target distribution $p(\theta|y)$
\begin{itemize}
\item[-] consider starting algorithm at time $t-1$ with a draw
$\theta^{t-1} \sim p(\theta|y)$
\item<2->[-] consider any two such points $\theta_a$ and $\theta_b$ drawn
from $p(\theta|y)$ and labeled so that
$p(\theta_b|y)\geq p(\theta_a|y)$
\item<3->[-] the unconditional probability density of a transition from $\theta_a$ to $\theta_b$ is
\vspace{-0.5\baselineskip}
\begin{equation*}
p(\theta^{t-1}=\theta_a,\theta^{t}=\theta_b)=
p(\theta_a|y)J_t(\theta_b|\theta_a),
\end{equation*}
\vspace{-1\baselineskip}
\item<4->[-] the unconditional probability density of a transition from $\theta_b$ to $\theta_a$ is
\vspace{-0.5\baselineskip}
\begin{eqnarray*}
p(\theta^{t}=\theta_a,\theta^{t-1}=\theta_b) & = &
p(\theta_b|y)J_t(\theta_a|\theta_b)\left(\frac{p(\theta_a|y)}{p(\theta_b|y)}\right)\\
\pause & = & p(\theta_a|y)J_t(\theta_a|\theta_b),
\end{eqnarray*}
\uncover<5->{
which is the same as the probability of transition from $\theta_a$ to $\theta_b$,
since we have required that $J_t(\cdot|\cdot)$ is symmetric}
\pause
\item<6->[-] since their joint distribution is symmetric, $\theta^t$ and
$\theta^{t-1}$ have the same marginal distributions, and so
$p(\theta|y)$ is the stationary distribution of the Markov chain of $\theta$
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Metropolis-Hastings algorithm}
\begin{itemize}
\item Generalization of Metropolis algorithm for non-symmetric proposal distributions
\begin{itemize}
\item acceptance ratio includes ratio of proposal distributions
\begin{equation*}
r =
\frac{p(\theta^{*}|y)/J_t(\theta^{*}|\theta^{t-1})}{p(\theta^{t-1}|y)/J_t(\theta^{t-1}|\theta^{*})} \pause =
\frac{p(\theta^{*}|y)J_t(\theta^{t-1}|\theta^{*})}{p(\theta^{t-1}|y)J_t(\theta^{*}|\theta^{t-1})}
\end{equation*}
\end{itemize}
\end{itemize}
\end{frame}
% \begin{frame}
% {\Large\color{navyblue} Metropolis-Hastings algorithm}
% \begin{itemize}
% \item More efficient proposal distributions
% \item e.g. Langevin-Hastings algorithm which uses gradient information to make proposals
% \begin{itemize}
% \item more likely to propose values with higher density
% \end{itemize}
% \end{itemize}
% \end{frame}
\begin{frame}
{\Large\color{navyblue} Metropolis-Hastings algorithm}
\begin{itemize}
\item Ideal proposal distribution is the distribution itself
\begin{itemize}
\item $J(\theta^{*}|\theta)\equiv p(\theta^{*}|y)$ for all
$\theta$
\item acceptance probability is $1$
\item independent draws
\item not usually feasible
\end{itemize}
\item<2-> Good proposal distribution resembles the target distribution
\begin{itemize}
\item if the shape of the target distribution is unknown, usually
normal or $t$ distribution is used
\end{itemize}
\item<3-> After the shape has been selected, it is important to select the scale
\begin{itemize}
\item small scale \\$\rightarrow$ many steps accepted, but the chain moves slowly due to small steps
\item big scale \\$\rightarrow$ long steps proposed, but many of
those rejected and again chain moves slowly
\end{itemize}
\item<4-> Generic rule for rejection rate is 60-90\% (but depends on
dimensionality and a specific algorithm variation)
\end{itemize}
\end{frame}
% \begin{frame}
% {\Large\color{navyblue} Metropolis-Hastings algorithm}
% \begin{itemize}
% \item Update of parameters
% \begin{itemize}
% \item jointly
% \item blocked
% \item single-component
% \end{itemize}
% \item Order of single or blocked updates is free
% \begin{itemize}
% \item same for all rounds
% \item random
% \item not all parameters updated each round
% \end{itemize}
% \end{itemize}
% \end{frame}
\begin{frame}
{\Large\color{navyblue} Gibbs sampling}
\begin{itemize}
\item Specific case of Metropolis-Hastings algorithm
\begin{itemize}
\item single-component updates (or blocked)
\item proposal distribution is the conditional distribution\\
$\rightarrow$ proposal and target distributions are same\\
$\rightarrow$ acceptance probability is $1$
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Metropolis}
\begin{itemize}
\item Usually doesn't scale well to high dimensions
\begin{itemize}
\item if the shape doesn't match the whole distribution, the efficiency drops
\item demo11\_2\\
\vspace{1\baselineskip}
\hspace{-1cm}\includegraphics[width=5cm]{Metrop2.pdf}\includegraphics[width=5cm]{Metrop3.pdf}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Dynamic Hamiltonian Monte Carlo and NUTS}
\begin{itemize}
\item Chapter 12 presents some more advanced methods
\begin{itemize}
\item Chapter 12 includes Hamiltonian Monte Carlo and NUTS, which
is one of the most efficient methods
\begin{itemize}
\item uses gradient information
\item Hamiltonian dynamic simulation reduces random walk
\item Demo {\small \url{http://elevanth.org/blog/2017/11/28/build-a-better-markov-chain/}}
\end{itemize}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\large\color{navyblue} Example of uncertainty in modeling}
\includegraphics[width=10cm]{fakel_postdraws.pdf}
\end{frame}
\begin{frame}
{\Large\color{navyblue} HMC / NUTS}
\vspace{-.5\baselineskip}
\includegraphics[width=\textwidth,clip]{N250.pdf}\\
Source: Jonah Gabry
\end{frame}
% \begin{frame}{HMC / NUTS}
% \includegraphics[width=\textwidth,clip]{typicalset49.pdf}
% \end{frame}
% \begin{frame}{HMC / NUTS}
% \includegraphics[width=\textwidth,clip]{typicalset50.pdf}
% \end{frame}
% \begin{frame}{HMC / NUTS}
% \includegraphics[width=\textwidth,clip]{typicalset51.pdf}
% \end{frame}
% \begin{frame}{HMC / NUTS}
% \includegraphics[width=\textwidth,clip]{typicalset52.pdf}
% \end{frame}
% \begin{frame}{Hamiltonian Monte Carlo}
% \begin{itemize}
% \item demo12\_1
% \item Uses gradient information for more efficient sampling
% \item Alternating dynamic simulation and sampling of the energy
% level
% \item Parameters
% \begin{itemize}
% \item step size, number of steps in each chain
% \end{itemize}
% \pause
% \item No U-Turn Sampling
% \begin{itemize}
% \item adaptively selects number of steps to improve robustness and
% efficiency
% \end{itemize}
% \pause
% \item Adaptation in Stan
% \begin{itemize}
% \item Step size adjustment (mass matrix) is estimated during initial
% adaptation phase
% \end{itemize}
% \item See more
% \begin{itemize}
% \item \url{https://icerm.brown.edu/video_archive/\#/play/1107}
% \item \url{https://arxiv.org/abs/1701.02434}
% \end{itemize}
% \end{itemize}
% \end{frame}
\begin{frame}
{\Large\color{navyblue} Warm-up and convergence diagnostics}
\begin{itemize}
\item Asymptotically the chain spends $\alpha$\% of the time where
$\alpha$\% of the posterior mass is
\uncover<2->{
\begin{itemize}
\item but in finite time the initial part of the chain may be
non-representative and lower error of the estimate can be
obtained by throwing it away\\
\hspace{2cm}\includegraphics[width=4cm]{Metrop1.pdf}
\end{itemize}
}
\vspace{-.5\baselineskip}
\item<3-> Warm-up = remove draws from the beginning of the chain
\begin{itemize}
\item warm-up may include also phase for adapting algorithm parameters
\end{itemize}
\item<4-> Convergence diagnostics
\begin{itemize}
\item Do we get samples from the target distribution?
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} MCMC draws are dependent}
\begin{itemize}
\item Monte Carlo estimates still valid (central limit theorem holds)
\begin{align*}
E_{\color{blue} p(\theta|y)}[f(\theta)] \approx \frac{1}{S} \sum_{s=1}^S f(\theta^{(s)})
\end{align*}
\item Estimation of Monte Carlo error is more difficult
\begin{itemize}
\item evaluation of {\it effective} sample size
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Several chains}
\vspace{-0.5\baselineskip}
\begin{itemize}
\item Use of several chains makes convergence diagnostics easier
\item Start chains from different starting points -- preferably overdispersed
\begin{center}
\vspace{-0.5\baselineskip}
\includegraphics[width=6cm]{10chains1.pdf}
\end{center}
\vspace{-0.5\baselineskip}
\item<2-> Remove draws from the beginning of the chains and run
chains long enough so that it is not possible to distinguish
where each chain started and the chains are well mixed
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} Several chains}
\begin{center}
\only<1>{\includegraphics[width=10cm]{10chains2.pdf}}
\only<2->{\includegraphics[width=10cm]{10chains3.pdf}}\\
\uncover<3->{Visual convergence check is not sufficient}
\end{center}
\end{frame}
\begin{frame}[fragile]
{\Large\color{navyblue} $\widehat{R}$: comparison of within and between variances of the chains}
\begin{itemize}
\item BDA3: $\widehat{R}$ aka {\em potential scale reduction factor} (PSRF)
\item Compare means and variances of the chains\\
\uncover<2->{W = within chain variance estimate\\
var\_hat\_plus = total variance estimate\\}
\vspace{1\baselineskip}
\only<1>{\hspace{-0.5cm}\phantom{\includegraphics[width=10.5cm]{10chains_rhat1.pdf}}}
\only<2>{\hspace{-0.5cm}\includegraphics[width=10.5cm]{10chains_rhat1.pdf}}
\only<3>{\hspace{-0.5cm}\includegraphics[width=10.5cm]{10chains_rhat2.pdf}}
\only<4>{\hspace{-0.5cm}\includegraphics[width=10.5cm]{10chains_rhat3.pdf}}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} $\widehat{R}$}
\begin{itemize}
\item $M$ chains, each having $N$ draws (with new R-hat notation)
\item<2-> Within chains variance $W$
\begin{equation*}
W=\frac{1}{M}\sum_{m=1}^M s^2_m ,\text{ where }
s^2_m=\frac{1}{N-1}\sum_{n=1}^N (\theta_{nm}-\bar{\theta}_{.m})^2
\end{equation*}
\item<3-> Between chains variance $B$
\begin{align*}
B&=\frac{N}{M-1}\sum_{m=1}^M
(\bar{\theta}_{.m}-\bar{\theta}_{..})^2,\\
&\text{ where } \bar{\theta}_{.m}=\frac{1}{N}\sum_{n=1}^N \theta_{nm}, \,
\bar{\theta}_{..}=\frac{1}{M}\sum_{m=1}^M\bar{\theta}_{.m}
\end{align*}
%\vspace{-6mm}
\begin{itemize}
\item<4-> $B/N$ is variance of the means of the chains
\end{itemize}
\vspace{2mm}
\item<5-> Estimate total variance
$\var(\theta|y)$ as a weighted mean of $W$ and $B$
\begin{equation*}
\widehat{\var}^{+}(\theta|y) = \frac{N-1}{N}W+\frac{1}{N}B
\end{equation*}
\end{itemize}
\end{frame}
% \note{1) $B/n$ on varsinainen ketjujen keskiarvojen varianssi, jossa $n$
% johtuu siitä että ne keskiarvot joiden varianssia lasketaan on
% muodostettu $n$ näytteestä ja siten varianssi pienempi, mutta
% Gelman et kump. ovat halunneet merkitä kuitenkin $B$:llä $n$ kertaa
% tätä varianssia, koska silloin lopullisessa
% marginaaliposteriorivarianssin kaavassa näkyy $n$:n vaikutus
% 2) termi $\frac{n-1}{n}$ tulee siitä, että silloin painojen summa on 1
% }
\begin{frame}
{\Large\color{navyblue} $\widehat{R}$}
\begin{itemize}
\item Estimate total variance
$\var(\theta|y)$ as a weighted mean of $W$ and $B$
\begin{equation*}
\widehat{\var}^{+}(\theta|y) = \frac{N-1}{N}W+\frac{1}{N}B
\end{equation*}
\vspace{-0.5\baselineskip}
\begin{itemize}
\item this \emph{overestimates} marginal posterior variance if the
starting points are overdispersed
\end{itemize}
\vspace{0.5\baselineskip}
\item<2-> Given finite $N$, $W$ \emph{underestimates} marginal posterior variance
\begin{itemize}
\item single chains have not yet visited all points in the distribution
\item when $N\rightarrow\infty, \quad \E(W)\rightarrow \var(\theta|y)$
\end{itemize}
\vspace{0.5\baselineskip}
\item<3-> As $\widehat{\var}^{+}(\theta|y)$ overestimates and $W$ underestimates,
compute
\begin{equation*}
\widehat{R}=\sqrt{\frac{\widehat{\var}^{+}}{W}}
\end{equation*}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
{\Large\color{navyblue} $\widehat{R}$}
\begin{itemize}
\item BDA3: $\widehat{R}$ aka {\em potential scale reduction factor} (PSRF)
\item Compare means and variances of the chains\\
W = within chain variance estimate\\
var\_hat\_plus = total variance estimate\\
\vspace{1\baselineskip}
\only<1>{\hspace{-0.5cm}\includegraphics[width=10.5cm]{10chains_rhat1.pdf}}
\only<2>{\hspace{-0.5cm}\includegraphics[width=10.5cm]{10chains_rhat2.pdf}}
\only<3>{\hspace{-0.5cm}\includegraphics[width=10.5cm]{10chains_rhat3.pdf}}
\end{itemize}
\end{frame}
\begin{frame}
{\Large\color{navyblue} $\widehat{R}$}
\begin{equation*}
\widehat{R}=\sqrt{\frac{\widehat{\var}^{+}}{W}}
\end{equation*}
\begin{itemize}
\item<1-> Estimates how much the scale of $\theta$ could reduce if $N\rightarrow\infty$
\item<1-> $\widehat{R}\rightarrow 1$, when $N\rightarrow\infty$
\item<1-> if $\widehat{R}$ is big (e.g., $\widehat{R}>1.01$), keep sampling
\item<2-> If $\widehat{R}$ close to 1, it is still possible that chains have not converged
\begin{itemize}
\item if starting points were not overdispersed
\item distribution far from normal (especially if infinite variance)
\item just by chance when $N$ is finite
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
{\Large\color{navyblue} Split-$\widehat{R}$}
\begin{itemize}
\item BDA3: split-$\widehat{R}$
\item Examines {\it mixing} and {\it stationarity} of chains
\item To examine stationarity chains are split to two parts
\begin{itemize}
\item after splitting, we have $M$ chains, each having $N$ draws
\item scalar draws $\theta_{nm} \quad (n=1,\ldots,N;m=1,\ldots,M)$
\item compare means and variances of the split chains
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
{\Large\color{navyblue} Rank normalized $\widehat{R}$}
\begin{itemize}
\item<+-> Original $\widehat{R}$ requires that the target distribution
has finite mean and variance
\item<+-> Rank normalization fixes this and is also more robust given
finite but high variance
\item<+-> Folding improves detecting scale differences between chains
\item<+-> Paper proposes also local convergence diagnostics and
practical MCSE estimates for quantiles
\item<+-> Notation updated compared to BDA3
\end{itemize}
Vehtari, Gelman, Simpson, Carpenter, Bürkner
(2019). Rank-normalization, folding, and localization: An improved
R-hat for assessing convergence of MCMC. arXiv preprint
arXiv:1903.08008.
\end{frame}
% \begin{frame}
% {\Large\color{navyblue} Convergence diagnostics}
% \begin{itemize}
% \item After how many iterations can we check for convergence
% \begin{itemize}
% \item it's not possible to know this beforehand
% \item Stan uses NUTS which is an efficient sampler and default is
% to use 4 chains, get 1000 draws from each for adaptation and
% warmup, 1000 more draws from each and estimate $\widehat{R}$ for
% each parameter
% \end{itemize}
% \end{itemize}
% \end{frame}
% \begin{frame}{Divergences}
% \begin{itemize}
% \item Diagnosing difficult to reach corners of the posterior
% \item Case study \url{http://mc-stan.org/users/documentation/case-studies/divergences_and_bias.html}
% \end{itemize}
% \end{frame}
% \begin{frame}{Tree depth}
% \begin{itemize}
% \item Diagnosing long tails
% \item Case studies
% \begin{itemize}
% \item R: \url{http://mc-stan.org/users/documentation/case-studies/rstan_workflow.html}
% \item Python: \url{http://mc-stan.org/users/documentation/case-studies/pystan_workflow.html}
% \end{itemize}
% \end{itemize}
% \end{frame}
\begin{frame}
{\Large\color{navyblue} Time series analysis}
\begin{itemize}
\item Auto correlation function
\begin{itemize}
\item describes the correlation given a certain lag
\item can be used to compare efficiency of MCMC algorithms and parameterizations