diff --git a/15-model_comparison.Rmd b/15-model_comparison.Rmd
index b1bca40..a73d4f2 100644
--- a/15-model_comparison.Rmd
+++ b/15-model_comparison.Rmd
@@ -144,8 +144,11 @@ plot_fit = function(i){
   geom_point(data = df.more_data,
              size = 2,
              color = "red") +
-  geom_smooth(method = "lm", se = F,
-              formula = y ~ poly(x, degree = i, raw = TRUE)) +
+  geom_smooth(method = "lm",
+              se = F,
+              formula = y ~ poly(x,
+                                 degree = i,
+                                 raw = TRUE)) +
   annotate(geom = "text",
            x = Inf,
            y = -Inf,
@@ -189,7 +192,9 @@ sd = 0.5
 # sample
 df.data = tibble(participant = 1:sample_size,
-                 x = runif(sample_size, min = 0, max = 1),
+                 x = runif(sample_size,
+                           min = 0,
+                           max = 1),
                  y = b0 + b1*x + b2*x^2 + rnorm(sample_size, sd = sd))
 ```
@@ -198,7 +203,7 @@ And plot it:
 ```{r}
 ggplot(data = df.data,
        mapping = aes(x = x,
-                     y = y)) + 
+                     y = y)) +
   geom_smooth(method = "lm",
               formula = y ~ x + I(x^2)) +
   geom_point()
@@ -283,8 +288,10 @@ fun.cv_plot = function(data_point){
                      filter(color == 2)) %>%
     clean_names()
-  p = ggplot(df.plot,
-             aes(x, y, color = as.factor(color))) +
+  p = ggplot(data = df.plot,
+             mapping = aes(x = x,
+                           y = y,
+                           color = as.factor(color))) +
     geom_segment(aes(xend = x,
                      yend = fitted),
                  data = df.fit,
@@ -518,10 +525,12 @@ df.normal = tibble(y = seq(-5, 5, 0.1),
 # show the residual plot together with the normal distribution
 ggplot(data = df.plot ,
-       mapping = aes(x = fitted, y = resid)) +
+       mapping = aes(x = fitted,
+                     y = resid)) +
   geom_point() +
   geom_path(data = df.normal,
-            aes(x = x, y = y),
+            aes(x = x,
+                y = y),
             size = 2)
 ```
diff --git a/docs/404.html b/docs/404.html
index fc51834..0eb93b0 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -23,7 +23,7 @@
-
+
@@ -51,8 +51,6 @@
-
-
-
@@ -328,36 +292,36 @@
library("knitr") # for knitting RMarkdown
-library("kableExtra") # for making nice tables
-library("janitor") # for cleaning column names
-library("broom") # for tidying up linear models
-library("patchwork") # for figure panels
-library("modelr") # for cross-validation
-library("tidyverse") # for wrangling, plotting, etc.
theme_set(theme_classic() + #set the theme
- theme(text = element_text(size = 20))) #set the default text size
-
-opts_chunk$set(comment = "",
- fig.show = "hold")
library("knitr") # for knitting RMarkdown
+library("kableExtra") # for making nice tables
+library("janitor") # for cleaning column names
+library("broom") # for tidying up linear models
+library("patchwork") # for figure panels
+library("modelr") # for cross-validation
+library("tidyverse") # for wrangling, plotting, etc.
theme_set(theme_classic() + #set the theme
+ theme(text = element_text(size = 20))) #set the default text size
+
+opts_chunk$set(comment = "",
+ fig.show = "hold")
set.seed(1)
-
-n_plots = 3
-
-# sample size
-n_samples = 20
-
-# number of parameters in the polynomial regression
-n_parameters = c(1:4, seq(7, 19, length.out = 5))
-
-# generate data
-df.data = tibble(x = runif(n_samples, min = 0, max = 10),
- y = 10 + 3 * x + 3 * x^2 + rnorm(n_samples, sd = 20))
-
-# plotting function
-plot_fit = function(i){
- # calculate RMSE
- rmse = lm(formula = y ~ poly(x, degree = i, raw = TRUE),
- data = df.data) %>%
- rmse(data = df.data)
-
- # make a plot
- ggplot(data = df.data,
- mapping = aes(x = x,
- y = y)) +
- geom_point(size = 2) +
- geom_smooth(method = "lm", se = F,
- formula = y ~ poly(x, degree = i, raw = TRUE)) +
- annotate(geom = "text",
- x = Inf,
- y = -Inf,
- label = str_c("RMSE = ", round(rmse, 2)),
- hjust = 1.1,
- vjust = -0.3) +
- theme(axis.ticks = element_blank(),
- axis.title = element_blank(),
- axis.text = element_blank())
-}
-
-# save plots in a list
-l.p = map(.x = n_parameters,
- .f = ~ plot_fit(.))
-
-# make figure panel
-wrap_plots(plotlist = l.p, ncol = 3)
set.seed(1)
+
+n_plots = 3
+
+# sample size
+n_samples = 20
+
+# number of parameters in the polynomial regression
+n_parameters = c(1:4, seq(7, 19, length.out = 5))
+
+# generate data
+df.data = tibble(x = runif(n_samples, min = 0, max = 10),
+ y = 10 + 3 * x + 3 * x^2 + rnorm(n_samples, sd = 20))
+
+# plotting function
+plot_fit = function(i){
+ # calculate RMSE
+ rmse = lm(formula = y ~ poly(x, degree = i, raw = TRUE),
+ data = df.data) %>%
+ rmse(data = df.data)
+
+ # make a plot
+ ggplot(data = df.data,
+ mapping = aes(x = x,
+ y = y)) +
+ geom_point(size = 2) +
+ geom_smooth(method = "lm", se = F,
+ formula = y ~ poly(x, degree = i, raw = TRUE)) +
+ annotate(geom = "text",
+ x = Inf,
+ y = -Inf,
+ label = str_c("RMSE = ", round(rmse, 2)),
+ hjust = 1.1,
+ vjust = -0.3) +
+ theme(axis.ticks = element_blank(),
+ axis.title = element_blank(),
+ axis.text = element_blank())
+}
+
+# save plots in a list
+l.p = map(.x = n_parameters,
+ .f = ~ plot_fit(.))
+
+# make figure panel
+wrap_plots(plotlist = l.p, ncol = 3)
As we can see, the RMSE becomes smaller and smaller the more parameters the model uses to fit the data. But what does the RMSE look like for new data generated from the same underlying ground truth?
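As a reminder of what rmse() computes, here is a minimal by-hand sketch (my own, not from the chapter), using the quadratic fit from the data above as an example; the names fit and rmse_by_hand are just illustrative:

```
# fit the quadratic model to the training data
fit = lm(y ~ poly(x, degree = 2, raw = TRUE),
         data = df.data)

# RMSE is the square root of the mean squared prediction error;
# this should match rmse(fit, df.data) from modelr
rmse_by_hand = sqrt(mean((df.data$y - predict(fit, df.data))^2))
rmse_by_hand
```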
-set.seed(1)
-
-n_plots = 3
-
-# sample size
-n_samples = 20
-
-# number of parameters in the polynomial regression
-n_parameters = c(1:4, seq(7, 19, length.out = 5))
-
-# generate data
-df.data = tibble(
- x = runif(n_samples, min = 0, max = 10),
- y = 10 + 3 * x + 3 * x^2 + rnorm(n_samples, sd = 20)
-)
-
-# generate some more data
-df.more_data = tibble(x = runif(50, min = 0, max = 10),
- y = 10 + 3 * x + 3 * x^2 + rnorm(50, sd = 20))
-
-# list for plots
-l.p = list()
-
-# plotting function
-plot_fit = function(i){
- # calculate RMSE for fitted data
- fit = lm(formula = y ~ poly(x, degree = i, raw = TRUE),
- data = df.data)
-
- # calculate RMSE for training data
- rmse = fit %>%
- rmse(data = df.data)
-
- # calculate RMSE for new data
- rmse_new = fit %>%
- rmse(data = df.more_data)
-
- # make a plot
- ggplot(data = df.data,
- mapping = aes(x = x,
- y = y)) +
- geom_point(size = 2) +
- geom_point(data = df.more_data,
- size = 2,
- color = "red") +
- geom_smooth(method = "lm", se = F,
- formula = y ~ poly(x, degree = i, raw = TRUE)) +
- annotate(geom = "text",
- x = Inf,
- y = -Inf,
- label = str_c("RMSE = ", round(rmse, 2)),
- hjust = 1.1,
- vjust = -0.3) +
- annotate(geom = "text",
- x = Inf,
- y = -Inf,
- label = str_c("RMSE = ", round(rmse_new, 2)),
- hjust = 1.1,
- vjust = -2,
- color = "red") +
- theme(axis.ticks = element_blank(),
- axis.title = element_blank(),
- axis.text = element_blank())
-}
-
-# map over the parameters
-l.p = map(.x = n_parameters,
- .f = ~ plot_fit(.))
-
-# make figure panel
-wrap_plots(plotlist = l.p, ncol = 3)
set.seed(1)
+
+n_plots = 3
+
+# sample size
+n_samples = 20
+
+# number of parameters in the polynomial regression
+n_parameters = c(1:4, seq(7, 19, length.out = 5))
+
+# generate data
+df.data = tibble(
+ x = runif(n_samples, min = 0, max = 10),
+ y = 10 + 3 * x + 3 * x^2 + rnorm(n_samples, sd = 20)
+)
+
+# generate some more data
+df.more_data = tibble(x = runif(50, min = 0, max = 10),
+ y = 10 + 3 * x + 3 * x^2 + rnorm(50, sd = 20))
+
+# list for plots
+l.p = list()
+
+# plotting function
+plot_fit = function(i){
+ # calculate RMSE for fitted data
+ fit = lm(formula = y ~ poly(x, degree = i, raw = TRUE),
+ data = df.data)
+
+ # calculate RMSE for training data
+ rmse = fit %>%
+ rmse(data = df.data)
+
+ # calculate RMSE for new data
+ rmse_new = fit %>%
+ rmse(data = df.more_data)
+
+ # make a plot
+ ggplot(data = df.data,
+ mapping = aes(x = x,
+ y = y)) +
+ geom_point(size = 2) +
+ geom_point(data = df.more_data,
+ size = 2,
+ color = "red") +
+ geom_smooth(method = "lm",
+ se = F,
+ formula = y ~ poly(x,
+ degree = i,
+ raw = TRUE)) +
+ annotate(geom = "text",
+ x = Inf,
+ y = -Inf,
+ label = str_c("RMSE = ", round(rmse, 2)),
+ hjust = 1.1,
+ vjust = -0.3) +
+ annotate(geom = "text",
+ x = Inf,
+ y = -Inf,
+ label = str_c("RMSE = ", round(rmse_new, 2)),
+ hjust = 1.1,
+ vjust = -2,
+ color = "red") +
+ theme(axis.ticks = element_blank(),
+ axis.title = element_blank(),
+ axis.text = element_blank())
+}
+
+# map over the parameters
+l.p = map(.x = n_parameters,
+ .f = ~ plot_fit(.))
+
+# make figure panel
+wrap_plots(plotlist = l.p, ncol = 3)
The RMSE in black shows the root mean squared error for the data that the model was fit on. The RMSE in red shows the RMSE on the new data. As you can see, the complex models do really poorly. They overfit the noise in the original data, which leads them to make poor predictions for new data. The simplest model (with two parameters) doesn’t do particularly well either since it misses out on the quadratic trend in the data. Both the model with the quadratic term (top middle) and the model that includes a cubic term (top right) provide a good balance – their RMSE on the new data is lowest.
Let’s generate another data set:
-# make example reproducible
-set.seed(1)
-
-# parameters
-sample_size = 100
-b0 = 1
-b1 = 2
-b2 = 3
-sd = 0.5
-
-# sample
-df.data = tibble(participant = 1:sample_size,
- x = runif(sample_size, min = 0, max = 1),
- y = b0 + b1*x + b2*x^2 + rnorm(sample_size, sd = sd))
# make example reproducible
+set.seed(1)
+
+# parameters
+sample_size = 100
+b0 = 1
+b1 = 2
+b2 = 3
+sd = 0.5
+
+# sample
+df.data = tibble(participant = 1:sample_size,
+ x = runif(sample_size,
+ min = 0,
+ max = 1),
+ y = b0 + b1*x + b2*x^2 + rnorm(sample_size, sd = sd))
And plot it:
-
-
+
+
# fit models to the data
-fit_simple = lm(y ~ 1 + x, data = df.data)
-fit_correct = lm(y ~ 1 + x + I(x^2), data = df.data)
-fit_complex = lm(y ~ 1 + x + I(x^2) + I(x^3), data = df.data)
-
-# compare the models using an F-test
-anova(fit_simple, fit_correct)
# fit models to the data
+fit_simple = lm(y ~ 1 + x, data = df.data)
+fit_correct = lm(y ~ 1 + x + I(x^2), data = df.data)
+fit_complex = lm(y ~ 1 + x + I(x^2) + I(x^3), data = df.data)
+
+# compare the models using an F-test
+anova(fit_simple, fit_correct)
Analysis of Variance Table
Model 1: y ~ 1 + x
@@ -1074,7 +1051,7 @@ 15.3.2 F-test
anova(fit_correct, fit_complex)
Analysis of Variance Table
Model 1: y ~ 1 + x + I(x^2)
@@ -1103,92 +1080,94 @@ 15.3.3 Cross-validation
15.3.3.1 Leave-one-out cross-validation
I’ve used code similar to this one to illustrate how LOO works in class. Here is a simple data set with 9 data points. We fit 9 models, where each model is trained on all data points except one, and then we look at how well the model captures the held-out data point. We can then characterize the model’s performance by calculating the mean squared error across the 9 runs.
-# make example reproducible
-set.seed(1)
-
-# sample
-df.loo = tibble(x = 1:9,
- y = c(5, 2, 4, 10, 3, 4, 10, 2, 8))
-
-df.loo_cross = df.loo %>%
- crossv_loo() %>%
- mutate(fit = map(.x = train,
- .f = ~ lm(y ~ x, data = .)),
- tidy = map(.x = fit,
- .f = ~ tidy(.))) %>%
- unnest(tidy)
-
-# original plot
-df.plot = df.loo %>%
- mutate(color = 1)
-
-# fit to all data except one
-fun.cv_plot = function(data_point){
-
- # determine which point to leave out
- df.plot = df.plot %>%
- mutate(color = ifelse(row_number() == data_point, 2, color))
-
- # fit
- df.fit = df.plot %>%
- filter(color != 2) %>%
- lm(formula = y ~ x, data = .) %>%
- augment(newdata = df.plot %>%
- filter(color == 2)) %>%
- clean_names()
-
- p = ggplot(df.plot,
- aes(x, y, color = as.factor(color))) +
- geom_segment(aes(xend = x,
- yend = fitted),
- data = df.fit,
- color = "red",
- size = 1) +
- geom_point(size = 2) +
- geom_smooth(method = "lm",
- formula = "y ~ x",
- se = F,
- color = "black",
- fullrange = T,
- data = df.plot %>% filter(color != 2)) +
- scale_color_manual(values = c("black", "red")) +
- theme(legend.position = "none",
- axis.title = element_blank(),
- axis.ticks = element_blank(),
- axis.text = element_blank())
- return(p)
-}
-
-# save plots in list
-l.plots = map(.x = 1:9,
- .f = ~ fun.cv_plot(.))
-
-# make figure panel
-wrap_plots(plotlist = l.plots, ncol = 3)
-
+# make example reproducible
+set.seed(1)
+
+# sample
+df.loo = tibble(x = 1:9,
+ y = c(5, 2, 4, 10, 3, 4, 10, 2, 8))
+
+df.loo_cross = df.loo %>%
+ crossv_loo() %>%
+ mutate(fit = map(.x = train,
+ .f = ~ lm(y ~ x, data = .)),
+ tidy = map(.x = fit,
+ .f = ~ tidy(.))) %>%
+ unnest(tidy)
+
+# original plot
+df.plot = df.loo %>%
+ mutate(color = 1)
+
+# fit to all data except one
+fun.cv_plot = function(data_point){
+
+ # determine which point to leave out
+ df.plot = df.plot %>%
+ mutate(color = ifelse(row_number() == data_point, 2, color))
+
+ # fit
+ df.fit = df.plot %>%
+ filter(color != 2) %>%
+ lm(formula = y ~ x, data = .) %>%
+ augment(newdata = df.plot %>%
+ filter(color == 2)) %>%
+ clean_names()
+
+ p = ggplot(data = df.plot,
+ mapping = aes(x = x,
+ y = y,
+ color = as.factor(color))) +
+ geom_segment(aes(xend = x,
+ yend = fitted),
+ data = df.fit,
+ color = "red",
+ size = 1) +
+ geom_point(size = 2) +
+ geom_smooth(method = "lm",
+ formula = "y ~ x",
+ se = F,
+ color = "black",
+ fullrange = T,
+ data = df.plot %>% filter(color != 2)) +
+ scale_color_manual(values = c("black", "red")) +
+ theme(legend.position = "none",
+ axis.title = element_blank(),
+ axis.ticks = element_blank(),
+ axis.text = element_blank())
+ return(p)
+}
+
+# save plots in list
+l.plots = map(.x = 1:9,
+ .f = ~ fun.cv_plot(.))
+
+# make figure panel
+wrap_plots(plotlist = l.plots, ncol = 3)
+
As you can see, the regression line changes quite a bit depending on which data point is in the test set.
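Before moving on, here is a minimal sketch (my own, not from the chapter) of the summary step described above: computing the held-out mean squared error across the 9 runs. It assumes the df.loo data set and the modelr functions loaded earlier; df.loo_mse is just an illustrative name.

```
# leave-one-out by hand: fit on 8 points, then record the squared
# prediction error for the single held-out point
df.loo_mse = df.loo %>%
  crossv_loo() %>%
  mutate(fit = map(.x = train,
                   .f = ~ lm(y ~ x, data = .)),
         # rmse() on a one-point test set is just the absolute error,
         # so squaring it gives the squared error for that point
         squared_error = map2_dbl(.x = fit,
                                  .y = test,
                                  .f = ~ rmse(.x, .y)^2))

# mean squared error across the 9 runs
mean(df.loo_mse$squared_error)
```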
Now, let’s use LOO to evaluate the models on the data set I’ve created above:
-# fit the models and calculate the RMSE for each model on the test set
-df.cross = df.data %>%
- crossv_loo() %>% # function which generates training and test data sets
- mutate(model_simple = map(.x = train,
- .f = ~ lm(y ~ 1 + x, data = .)),
- model_correct = map(.x = train,
- .f = ~ lm(y ~ 1 + x + I(x^2), data = .)),
- model_complex = map(.x = train,
- .f = ~ lm(y ~ 1 + x + I(x^2) + I(x^3), data = .))) %>%
- pivot_longer(cols = contains("model"),
- names_to = "model",
- values_to = "fit") %>%
- mutate(rmse = map2_dbl(.x = fit,
- .y = test,
- .f = ~ rmse(.x, .y)))
-
-# show the average RMSE for each model
-df.cross %>%
- group_by(model) %>%
- summarize(mean_rmse = mean(rmse) %>%
- round(3))
+# fit the models and calculate the RMSE for each model on the test set
+df.cross = df.data %>%
+ crossv_loo() %>% # function which generates training and test data sets
+ mutate(model_simple = map(.x = train,
+ .f = ~ lm(y ~ 1 + x, data = .)),
+ model_correct = map(.x = train,
+ .f = ~ lm(y ~ 1 + x + I(x^2), data = .)),
+ model_complex = map(.x = train,
+ .f = ~ lm(y ~ 1 + x + I(x^2) + I(x^3), data = .))) %>%
+ pivot_longer(cols = contains("model"),
+ names_to = "model",
+ values_to = "fit") %>%
+ mutate(rmse = map2_dbl(.x = fit,
+ .y = test,
+ .f = ~ rmse(.x, .y)))
+
+# show the average RMSE for each model
+df.cross %>%
+ group_by(model) %>%
+ summarize(mean_rmse = mean(rmse) %>%
+ round(3))
# A tibble: 3 × 2
model mean_rmse
<chr> <dbl>
@@ -1202,25 +1181,25 @@ 15.3.3.1 Leave-one-out cross-validation
15.3.3.2 k-fold cross-validation
For k-fold cross-validation, we split the data set into k folds, and then use k-1 folds as the training set and the remaining fold as the test set.
The code is almost identical to before. Instead of crossv_loo(), we use the crossv_kfold() function and specify how many times we want to “fold” the data.
-# crossvalidation scheme
-df.cross = df.data %>%
- crossv_kfold(k = 10) %>%
- mutate(model_simple = map(.x = train,
- .f = ~ lm(y ~ 1 + x, data = .)),
- model_correct = map(.x = train,
- .f = ~ lm(y ~ 1 + x + I(x^2), data = .)),
- model_complex = map(.x = train,
- .f = ~ lm(y ~ 1 + x + I(x^2) + I(x^3), data = .))) %>%
- pivot_longer(cols = contains("model"),
- names_to = "model",
- values_to = "fit") %>%
- mutate(rsquare = map2_dbl(.x = fit,
- .y = test,
- .f = ~ rsquare(.x, .y)))
-
-df.cross %>%
- group_by(model) %>%
- summarize(median_rsquare = median(rsquare))
+# crossvalidation scheme
+df.cross = df.data %>%
+ crossv_kfold(k = 10) %>%
+ mutate(model_simple = map(.x = train,
+ .f = ~ lm(y ~ 1 + x, data = .)),
+ model_correct = map(.x = train,
+ .f = ~ lm(y ~ 1 + x + I(x^2), data = .)),
+ model_complex = map(.x = train,
+ .f = ~ lm(y ~ 1 + x + I(x^2) + I(x^3), data = .))) %>%
+ pivot_longer(cols = contains("model"),
+ names_to = "model",
+ values_to = "fit") %>%
+ mutate(rsquare = map2_dbl(.x = fit,
+ .y = test,
+ .f = ~ rsquare(.x, .y)))
+
+df.cross %>%
+ group_by(model) %>%
+ summarize(median_rsquare = median(rsquare))
# A tibble: 3 × 2
model median_rsquare
<chr> <dbl>
@@ -1232,25 +1211,25 @@ 15.3.3.2 k-fold cross-validation
15.3.3.3 Monte Carlo cross-validation
Finally, let’s consider another, very flexible version of cross-validation: we determine how many random splits into training and test set we would like to make, and what proportion of the data should go into the test set.
-# crossvalidation scheme
-df.cross = df.data %>%
- crossv_mc(n = 50, test = 0.5) %>% # number of samples, and percentage of test
- mutate(model_simple = map(.x = train,
- .f = ~ lm(y ~ 1 + x, data = .x)),
- model_correct = map(.x = train,
- .f = ~ lm(y ~ 1 + x + I(x^2), data = .x)),
- model_complex = map(.x = train,
- .f = ~ lm(y ~ 1 + x + I(x^2) + I(x^3), data = .))) %>%
- pivot_longer(cols = contains("model"),
- names_to = "model",
- values_to = "fit") %>%
- mutate(rmse = map2_dbl(.x = fit,
- .y = test,
- .f = ~ rmse(.x, .y)))
-
-df.cross %>%
- group_by(model) %>%
- summarize(mean_rmse = mean(rmse))
+# crossvalidation scheme
+df.cross = df.data %>%
+ crossv_mc(n = 50, test = 0.5) %>% # number of samples, and percentage of test
+ mutate(model_simple = map(.x = train,
+ .f = ~ lm(y ~ 1 + x, data = .x)),
+ model_correct = map(.x = train,
+ .f = ~ lm(y ~ 1 + x + I(x^2), data = .x)),
+ model_complex = map(.x = train,
+ .f = ~ lm(y ~ 1 + x + I(x^2) + I(x^3), data = .))) %>%
+ pivot_longer(cols = contains("model"),
+ names_to = "model",
+ values_to = "fit") %>%
+ mutate(rmse = map2_dbl(.x = fit,
+ .y = test,
+ .f = ~ rmse(.x, .y)))
+
+df.cross %>%
+ group_by(model) %>%
+ summarize(mean_rmse = mean(rmse))
# A tibble: 3 × 2
model mean_rmse
<chr> <dbl>
@@ -1264,29 +1243,29 @@ 15.3.3.3 Monte Carlo cross-validation
15.3.4 Bootstrap
We can also use the modelr
package for bootstrapping. The idea is the same as when we did cross-validation. We create a number of data sets from our original data set. Instead of splitting the data set into a training and a test set, for bootstrapping we sample values from the original data set with replacement. Doing so, we can, for example, calculate confidence intervals for different statistics of interest.
Here is an example of how to bootstrap a confidence interval for the mean.
-# make example reproducible
-set.seed(1)
-
-sample_size = 10
-
-# sample
-df.data = tibble(participant = 1:sample_size,
- x = runif(sample_size, min = 0, max = 1))
-
-# mean of the actual sample
-mean(df.data$x)
+# make example reproducible
+set.seed(1)
+
+sample_size = 10
+
+# sample
+df.data = tibble(participant = 1:sample_size,
+ x = runif(sample_size, min = 0, max = 1))
+
+# mean of the actual sample
+mean(df.data$x)
[1] 0.5515139
-# bootstrap to get confidence intervals around the mean
-df.data %>%
- bootstrap(n = 1000) %>% # create 1000 bootstrapped samples
- mutate(estimate = map_dbl(.x = strap,
- .f = ~ .x %>%
- as_tibble() %>%
- pull(x) %>%
- mean())) %>%
- summarize(mean = mean(estimate),
- low = quantile(estimate, 0.025), # calculate the 2.5 / 97.5 percentiles
- high = quantile(estimate, 0.975))
+# bootstrap to get confidence intervals around the mean
+df.data %>%
+ bootstrap(n = 1000) %>% # create 1000 bootstrapped samples
+ mutate(estimate = map_dbl(.x = strap,
+ .f = ~ .x %>%
+ as_tibble() %>%
+ pull(x) %>%
+ mean())) %>%
+ summarize(mean = mean(estimate),
+ low = quantile(estimate, 0.025), # calculate the 2.5 / 97.5 percentiles
+ high = quantile(estimate, 0.975))
# A tibble: 1 × 3
mean low high
<dbl> <dbl> <dbl>
@@ -1303,25 +1282,25 @@ 15.3.5 AIC and BIC
\(k\) is the number of parameters in the model, \(n\) is the number of observations, and \(\hat L\) is the maximized value of the likelihood function of the model. Both AIC and BIC trade off model fit (as measured by the maximum likelihood of the data \(\hat L\)) and the number of parameters in the model.
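For reference, the formulas referred to above are \(\text{AIC} = 2k - 2\ln(\hat L)\) and \(\text{BIC} = \ln(n) \cdot k - 2\ln(\hat L)\); these are the same expressions used in the by-hand calculation further below.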
Calculating AIC and BIC in R is straightforward. We simply need to fit a linear model, and then call the AIC()
or BIC()
functions on the fitted model like so:
-set.seed(0)
-
-# let's generate some data
-df.example = tibble(x = runif(20, min = 0, max = 1),
- y = 1 + 3 * x + rnorm(20, sd = 2))
-
-# fit a linear model
-fit = lm(formula = y ~ 1 + x,
- data = df.example)
-
-# get AIC
-AIC(fit)
+set.seed(0)
+
+# let's generate some data
+df.example = tibble(x = runif(20, min = 0, max = 1),
+ y = 1 + 3 * x + rnorm(20, sd = 2))
+
+# fit a linear model
+fit = lm(formula = y ~ 1 + x,
+ data = df.example)
+
+# get AIC
+AIC(fit)
[1] 75.47296
-
+
[1] 78.46016
We can also just use the broom
package to get that information:
-
+
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
@@ -1329,86 +1308,88 @@ 15.3.5 AIC and BIC
# plot the data with a linear model fit
-ggplot(data = df.example,
- mapping = aes(x = x,
- y = y)) +
- geom_point(size = 2) +
- geom_smooth(method = "lm",
- color = "black")
-
+# plot the data with a linear model fit
+ggplot(data = df.example,
+ mapping = aes(x = x,
+ y = y)) +
+ geom_point(size = 2) +
+ geom_smooth(method = "lm",
+ color = "black")
+
Now, let’s take a look at the residuals by plotting the fitted values on the x axis, and the residuals on the y axis.
-# residual plot
-df.plot = df.example %>%
- lm(formula = y ~ x,
- data = .) %>%
- augment() %>%
- clean_names()
-
-ggplot(data = df.plot,
- mapping = aes(x = fitted,
- y = resid)) +
- geom_point(size = 2)
-
+# residual plot
+df.plot = df.example %>%
+ lm(formula = y ~ x,
+ data = .) %>%
+ augment() %>%
+ clean_names()
+
+ggplot(data = df.plot,
+ mapping = aes(x = fitted,
+ y = resid)) +
+ geom_point(size = 2)
+
Remember that the linear model makes the assumption that the residuals are normally distributed with mean 0 (which is always the case if we fit a linear model) and some fitted standard deviation. In fact, the standard deviation of the normal distribution is fitted such that the overall likelihood of the data is maximized.
Let’s make a plot that shows a normal distribution alongside the residuals:
-# define a normal distribution
-df.normal = tibble(y = seq(-5, 5, 0.1),
- x = dnorm(y, sd = 2) + 3.75)
-
-# show the residual plot together with the normal distribution
-ggplot(data = df.plot ,
- mapping = aes(x = fitted, y = resid)) +
- geom_point() +
- geom_path(data = df.normal,
- aes(x = x, y = y),
- size = 2)
-
+# define a normal distribution
+df.normal = tibble(y = seq(-5, 5, 0.1),
+ x = dnorm(y, sd = 2) + 3.75)
+
+# show the residual plot together with the normal distribution
+ggplot(data = df.plot ,
+ mapping = aes(x = fitted,
+ y = resid)) +
+ geom_point() +
+ geom_path(data = df.normal,
+ aes(x = x,
+ y = y),
+ size = 2)
+
To determine the likelihood of the data given the model \(\hat L\), we now calculate the likelihood of each point (with the dnorm()
function), and then multiply the likelihoods of all data points to get the overall likelihood of the data. We can simply multiply the likelihoods since we assume that the data points are independent.
Instead of multiplying likelihoods, we usually sum the log likelihoods. This is because when we multiply many small values, the result gets so close to 0 that computers run into numerical precision problems (underflow). By taking logs, we avoid these errors.
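Here is a small illustration (not from the chapter) of why working on the log scale helps:

```
# multiplying many small likelihoods underflows to 0 in double precision,
# while summing their logs stays perfectly representable
p = rep(1e-5, 100)
prod(p)     # 0 (the true value, 1e-500, is smaller than the smallest representable double)
sum(log(p)) # -1151.293
```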
To better understand AIC and BIC, let’s calculate them by hand:
-# we first get the estimate of the standard deviation of the residuals
-sigma = fit %>%
- glance() %>%
- pull(sigma)
-
-# then we calculate the log likelihood of the model
-log_likelihood = fit %>%
- augment() %>%
- mutate(likelihood = dnorm(.resid, sd = sigma)) %>%
- summarize(logLik = sum(log(likelihood))) %>%
- as.numeric()
-
-# then we calculate AIC and BIC using the formulas introduced above
-aic = 2*3 - 2 * log_likelihood
-bic = log(nrow(df.example)) * 3 - 2 * log_likelihood
-
-print(aic)
+# we first get the estimate of the standard deviation of the residuals
+sigma = fit %>%
+ glance() %>%
+ pull(sigma)
+
+# then we calculate the log likelihood of the model
+log_likelihood = fit %>%
+ augment() %>%
+ mutate(likelihood = dnorm(.resid, sd = sigma)) %>%
+ summarize(logLik = sum(log(likelihood))) %>%
+ as.numeric()
+
+# then we calculate AIC and BIC using the formulas introduced above
+aic = 2*3 - 2 * log_likelihood
+bic = log(nrow(df.example)) * 3 - 2 * log_likelihood
+
+print(aic)
[1] 75.58017
-
+
[1] 78.56737
Cool! The values are the same as when we use the glance()
function like so (except for a small difference, which comes from using the df-adjusted residual standard error reported by glance() rather than the maximum-likelihood estimate of the standard deviation):
-
+
# A tibble: 1 × 2
AIC BIC
<dbl> <dbl>
1 75.5 78.5
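If you want the hand calculation to line up with AIC() and BIC() exactly, a minimal sketch (my reading of where the small discrepancy comes from, not part of the chapter) is to use the maximum-likelihood estimate of the residual standard deviation, which divides by \(n\) rather than \(n - 2\). It assumes fit and df.example from the code above; sigma_mle and log_likelihood_mle are illustrative names.

```
# ML estimate of sigma: divide the sum of squared residuals by n, not n - 2
sigma_mle = sqrt(sum(residuals(fit)^2) / nrow(df.example))

# log likelihood of the data under the ML estimate of sigma
log_likelihood_mle = sum(dnorm(residuals(fit), sd = sigma_mle, log = TRUE))

2 * 3 - 2 * log_likelihood_mle                      # should match AIC(fit)
log(nrow(df.example)) * 3 - 2 * log_likelihood_mle  # should match BIC(fit)
```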
15.3.5.1 log() is your friend
-ggplot(data = tibble(x = c(0, 1)),
- mapping = aes(x = x)) +
- stat_function(fun = "log",
- size = 1) +
- labs(x = "probability",
- y = "log(probability)") +
- theme(axis.text = element_text(size = 24),
- axis.title = element_text(size = 26))
+ggplot(data = tibble(x = c(0, 1)),
+ mapping = aes(x = x)) +
+ stat_function(fun = "log",
+ size = 1) +
+ labs(x = "probability",
+ y = "log(probability)") +
+ theme(axis.text = element_text(size = 24),
+ axis.title = element_text(size = 26))
Warning: Computation failed in `stat_function()`
Caused by error in `fun()`:
! could not find function "fun"
-
+
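One possible cause of the warning above is passing the function name to stat_function() as a string. A minimal sketch of a variant that should draw the curve, passing the function object itself (a suggested fix, not part of the chapter):

```
# pass the function itself rather than its name as a string
ggplot(data = tibble(x = c(0, 1)),
       mapping = aes(x = x)) +
  stat_function(fun = log,   # note: log(0) = -Inf, so the left-most point may be dropped
                size = 1) +
  labs(x = "probability",
       y = "log(probability)")
```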
@@ -1431,7 +1412,7 @@ 15.4.2 Reading
15.5 Session info
Information about this R session including which version of R was used, and what packages were loaded.
-
+
R version 4.3.2 (2023-10-31)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Sonoma 14.1.2
@@ -1452,7 +1433,7 @@ 15.5 Session info