library(tidyverse)
library(gamlr)
## small beer dataset
beer = read.csv("../data/smallbeer.csv",
                colClasses=c(rep("factor",3),rep("numeric",2)))
nrow(beer)
view(beer)
# very little variation to estimate an elasticity!
beer %>% filter(description == '10 BARREL APOCALYPSE IPA 6PK ')
# more variation = more hope
beer %>% filter(description == 'STELLA ARTOIS 6PK BTL ')
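# added sketch: quantify per-item price variation via the standard deviation
# of log price; items with near-zero variation give no leverage for
# estimating an elasticity
beer %>%
    group_by(item) %>%
    summarize(sd_logprice = sd(log(price)), nobs = n()) %>%
    arrange(sd_logprice)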
# complete pooling: one elasticity for all beers
allforone = lm(log(units) ~ log(price), data=beer)
coef(allforone)
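# added sketch: a rough precision check on the pooled elasticity estimate
confint(allforone, "log(price)")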
# no pooling: independent elasticities for each beer
oneforall = lm(log(units) ~ log(price)*item, data=beer)
# tons of NAs, lots of noisy coefficients
coef(oneforall)
hist(coef(oneforall)) ## super noisy zeros
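# added sketch: how many item-specific terms were dropped as NA
# (collinear or unestimable) out of the total?
sum(is.na(coef(oneforall)))
length(coef(oneforall))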
# extracting the elasticities is a bit annoying, but this code does it
price_main = coef(oneforall)[2]
which_int = grep("log(price):item", names(coef(oneforall)), fixed=TRUE)
price_int = coef(oneforall)[which_int]
# add the main effect to all the relevant interactions
# to get each beer's elasticity
hist(price_main + price_int)
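# added sketch: share of the no-pooling elasticities that come out positive
# (a positive own-price elasticity for beer is not credible)
mean(price_main + price_int > 0, na.rm=TRUE)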
## Clearly this won't work
# build some regression designs
x1 = sparse.model.matrix(~log(price)*item + factor(week)-1, data=beer)
head(x1)
# don't penalize the log price coefficient
ml1 = cv.gamlr(x=x1, y=log(beer$units), free = 1, standardize=FALSE, verb=TRUE)
coef(ml1)
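# added sketch: cv.gamlr objects have a plot method showing the CV error path
plot(ml1)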
# how can I get the elasticities?
price_main = coef(ml1)[2]
which_int = grep("log(price):item", rownames(coef(ml1)), fixed=TRUE)
price_int = coef(ml1)[which_int]
# these look much more reasonable, though not all negative
# and the elephant in the room: of course price is not exogenous here!
# price is changing over time and in response to features that also predict demand
hist(price_main + price_int)
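# added sketch: how many of the item interactions did the lasso zero out,
# and what share of the implied elasticities are still positive?
sum(price_int == 0)
mean(price_main + price_int > 0)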
####
# Orthogonal ML instead
# strategy: isolate "idiosyncratic" variation in price and quantity sold
# by first explicitly adjusting for item and week
####
# OML steps 1-2
xitem = sparse.model.matrix(~item-1, data=beer)
xweek = sparse.model.matrix(~week-1, data=beer)
xx = cbind(xweek, xitem)
# isolate/partial out variation in log(price) predicted by item and week
pfit = gamlr(x=xx, y=log(beer$price), lmr=1e-5, standardize=FALSE)
# isolate/partial out variation in quantity sold predicted by item and week
qfit = gamlr(x=xx, y=log(beer$units), lmr=1e-5, standardize=FALSE)
# Calculate residuals: variation in price and units sold that
# cannot be predicted by item and week
lpr = drop(log(beer$price) - predict(pfit, xx))
lqr = drop(log(beer$units) - predict(qfit, xx))
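# added sketch: the plain OML elasticity before any text interactions,
# i.e. regressing quantity residuals on price residuals
coef(lm(lqr ~ lpr))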
# Run 3rd ML step to get elasticities
# parse the item description text
# each individual word in the description becomes a predictor
# why is this helping here?
library(tm)
descr = Corpus(VectorSource(as.character(beer$description)))
xtext = DocumentTermMatrix(descr)
# convert to Matrix format
xtext = sparseMatrix(i=xtext$i, j=xtext$j, x=as.numeric(xtext$v>0),
                     dims=dim(xtext), dimnames=dimnames(xtext))
colnames(xtext)
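# added sketch: size of the text design (documents x words)
dim(xtext)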
# look at interactions between text and log price residuals
xtreat = cbind(1,xtext)
ofit = gamlr(x=lpr*xtreat, y=lqr, standardize=FALSE, free=1)
coef(ofit)
gams = coef(ofit)[-1,]
# create a testing matrix, matching each level to a row in X
test_ind = match(levels(beer$item),beer$item)
xtest = xtext[test_ind,]
rownames(xtest) = beer$description[test_ind]
# translate into elasticities and plot
el = drop(gams[1] + xtest%*%gams[(1:ncol(xtext))+1])
hist(el, xlab="OML elasticities", xlim=c(-6,1), col="lightblue", main="",breaks=7)
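# added sketch: summary of the OML elasticity distribution
summary(el)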
# high and low sensitivity brands
sort(el) %>% head(20)
sort(el) %>% tail(20)