-
Notifications
You must be signed in to change notification settings - Fork 119
/
Copy pathoj.R
138 lines (102 loc) · 3.93 KB
/
oj.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
###
# Orange juice regression
###
library(tidyverse)
library(rsample)
library(modelr)

## Read in the data (path is relative to this script's directory)
oj <- read.csv("../data/oj.csv")
head(oj)

## Exploratory plots: price and log(units moved) by brand
ggplot(data = oj) +
  geom_boxplot(aes(x = brand, y = price))
ggplot(data = oj) +
  geom_boxplot(aes(x = brand, y = logmove))
ggplot(oj) + geom_point(aes(x = price, y = logmove))
###
# Dummy variables
###
## Regress log units moved on log price, with brand entering as dummies
reg <- lm(logmove ~ log(price) + brand, data = oj)

## Inspect the fitted model
summary(reg)  # coefficients, tests, fit

## Create some data for prediction, using the expand.grid function.
## Note the care in specifying the brand factor (levels must match original data).
## We don't need all variables in oj; just those used as covariates in reg.
newdata <- expand.grid(
  price = c(1.5, 2, 2.5, 3),
  brand = factor(c("tropicana", "minute.maid", "dominicks"))
)
newdata

## Predict
predict(reg, newdata = newdata)       # predicted log units moved
exp(predict(reg, newdata = newdata))  # predicted # of units moved

## Under the hood: `design matrices' and model.matrix
# Here's the underlying model matrix
x <- model.matrix(~ log(price) + brand, data = oj)
mosaic::sample(x, 10)  # show us 10 random rows

# We get the same model if we explicitly pass x to lm.
# The "-1" says not to include an explicit intercept,
# because we already have one in the model matrix.
reg2 <- lm(logmove ~ x - 1, data = oj)
summary(reg2)

# You don't have to do this for vanilla lm,
# but it's good to know how to construct the model matrix explicitly,
# since you often have to for more complex models.
###
# Interactions
###
## Note that `*` also adds the main effects automatically
reg_interact <- lm(logmove ~ log(price) * brand, data = oj)
summary(reg_interact)

# Let's see the explicit model matrix
x_interact <- model.matrix(~ log(price) * brand, data = oj)

# 10 random rows: each row is a vector of numbers whose inner product
# with the coefficient vector gives E(y | x)
mosaic::sample(x_interact, 10) %>% round(2)
###
# Now let's add feat to the model
###
# Option 1: feat as a main effect only
reg_ads <- lm(logmove ~ log(price) * brand + feat, data = oj)
summary(reg_ads)

# Option 2: two-way interactions of feat with log(price)
reg_ads2 <- lm(logmove ~ log(price) * (brand + feat), data = oj)
summary(reg_ads2)

# Option 3: full three-way interaction
reg_ads3 <- lm(logmove ~ log(price) * brand * feat, data = oj)
summary(reg_ads3)

## Fit plots for the two- and three-way interaction models
# Add fitted values to the data frame
oj$reg_ads2_fitted <- fitted(reg_ads2)
oj$reg_ads3_fitted <- fitted(reg_ads3)

# Base layer: raw data, colored by feat, one panel per brand
p_base <- ggplot(data = oj) +
  geom_point(aes(x = log(price), y = logmove, color = factor(feat)), alpha = 0.1) +
  facet_grid(~ brand)
p_base

# shape = 21 gives you points with black circles around them;
# the fill aesthetic maps to whether or not feat = 1
p_base + geom_point(aes(x = log(price), y = reg_ads2_fitted, fill = factor(feat)),
                    shape = 21)
p_base + geom_point(aes(x = log(price), y = reg_ads3_fitted, fill = factor(feat)),
                    shape = 21)
# Let's compare out-of-sample fit for our three models with feat.
# Make a train-test split; seed the RNG so the split (and hence the
# model comparison) is reproducible across runs.
set.seed(123)
oj_split <- initial_split(oj, prop = 0.8)
oj_train <- training(oj_split)
oj_test <- testing(oj_split)

# Refit our models using the training data only
reg_ads <- update(reg_ads, data = oj_train)
reg_ads2 <- update(reg_ads2, data = oj_train)
reg_ads3 <- update(reg_ads3, data = oj_train)

# Out-of-sample RMSEs on the held-out test set
rmse(reg_ads, oj_test)
rmse(reg_ads2, oj_test)
rmse(reg_ads3, oj_test)
# Let's be a little more systematic and use K-fold cross-validation.
# Seed the RNG so the fold assignment is reproducible.
set.seed(123)
oj_folds <- crossv_kfold(oj, k = 10)

# Map the model-fitting function over the training sets
models1 <- map(oj_folds$train, ~ lm(logmove ~ log(price) * brand + feat, data = .))
models2 <- map(oj_folds$train, ~ lm(logmove ~ log(price) * (brand + feat), data = .))
models3 <- map(oj_folds$train, ~ lm(logmove ~ log(price) * brand * feat, data = .))

# Map the RMSE calculation over the trained models and test sets
# simultaneously, then average across the K folds
map2_dbl(models1, oj_folds$test, modelr::rmse) %>% mean()
map2_dbl(models2, oj_folds$test, modelr::rmse) %>% mean()
map2_dbl(models3, oj_folds$test, modelr::rmse) %>% mean()