-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPigrimBankP2.R
335 lines (284 loc) · 14.6 KB
/
PigrimBankP2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# --- Setup ----------------------------------------------------------------
# NOTE(review): rm(list = ls()) clears the caller's entire workspace and
# setwd() hard-codes a user-specific absolute path — both make the script
# non-portable. Prefer project-relative paths; left in place because every
# read.csv() below depends on this working directory.
rm(list = ls())
setwd("/Users/david/Desktop/ownSpace/customerAnalytics")
# 1999 customer snapshot. Columns used below: ID, Profit99, Online99, and
# bucketed Age99 / Inc99 plus Tenure99 (inferred from the model formulas —
# verify against the CSV header).
pilgrim <- read.csv("pilgrim A2 data part1.csv")
library(partykit) # ctree(): conditional inference trees (interaction scouting)
library(ggplot2)  # plots
library(dplyr)    # filter()/select() in Parts 2-3
library(cvTools)  # cvFit(): repeated K-fold cross-validation
### PART 1 ###
### Interaction Modeling ###
# 1. Basic regression — interpretation of fit1 coefficients:
# A customer of age 0 with 0 income and 0 years with the bank and who doesn't go online
# will cost the bank on average $91.73.
# If a customer uses online banking they are $18.24 more profitable.
# For each increase in age buckets, a customer will become $18.29 more profitable.
# For each increase in income buckets, a customer will become $17.85 more profitable.
# For each additional year a customer has been at the bank, they will become $4.03 more profitable.
#
# With a p-value of ~0.0000 the model as a whole is significant, meaning it
# explains some significant amount of variance above 0%.
# The R-squared is .05742, indicating the model explains 5.74% of the variance in profit.
# The residual standard error is 274.6, indicating the model is on average off
# by $274.6 per observation.
# Copy of the data with the bucketed demographics recoded as factors so each
# bucket gets its own dummy instead of a single linear slope.
pilgrimFact <- pilgrim
# NOTE(review): magic column indices — presumably the Age99 and Inc99 bucket
# columns; indexing by name would be safer. Confirm against the CSV header.
pilgrimFact[,4] <- as.factor(pilgrimFact[,4])
pilgrimFact[,5] <- as.factor(pilgrimFact[,5])
# Main-effects model, buckets treated as numeric (linear-in-buckets).
fit1 <- lm(Profit99~.-ID, data = pilgrim)
summary(fit1)
# Same main effects with the bucketed variables as factors.
maineffects.fact <- lm(Profit99~.-ID, data = pilgrimFact)
summary(maineffects.fact)
# 2. Regression with interaction terms for Online
# NOTE(review): no set.seed() before sample(), so the plotted 500-customer
# subsample differs on every run — seed it if the figures must be reproducible.
samp <- sample(nrow(pilgrim), 500, replace = FALSE)
# Scatterplot showing the Online99:Inc99 interaction.
# FIX: Online99 is numeric in `pilgrim` (only pilgrimFact was converted to
# factors above), and ggplot2 cannot map a continuous variable to `shape` —
# the original aes(shape=Online99, color=Online99) errors out. Wrapping it
# in factor() makes the shape/colour mappings valid AND makes geom_smooth()
# fit a separate regression line per Online group, which is the whole point
# of an interaction plot.
ggplot(pilgrim[samp,], aes(x=Inc99, y=Profit99,
                           shape=factor(Online99), color = factor(Online99))) +
geom_point() +
scale_shape_manual(values=c(1,2)) +
geom_smooth(method="lm", # Add one linear regression line per Online group
se=FALSE, # Don't add shaded confidence region
fullrange=TRUE) +
labs(title = "Interaction between Income and Online with Regards to Profit",
x = "Income", y = "Profit")
# Scatterplot showing the Online99:Age99 interaction (same factor() fix).
ggplot(pilgrim[samp,], aes(x=Age99, y=Profit99,
                           shape=factor(Online99), color = factor(Online99))) +
geom_point() +
scale_shape_manual(values=c(1,2)) +
geom_smooth(method="lm", # Add linear regression lines
se=FALSE, # Don't add shaded confidence region
fullrange=TRUE) +
labs(title = "Interaction between Age and Online with Regards to Profit",
x = "Age", y = "Profit")
# Conditional inference trees to scout for interaction structure: splits on
# one variable nested inside splits on another suggest candidate terms.
ct1 <- ctree(Profit99~.-ID, data = pilgrim, control = ctree_control(mincriterion = .95))
plot(ct1)
ct2 <- ctree(Profit99~.-ID, data = pilgrimFact, control = ctree_control(mincriterion = .95))
plot(ct2)
# Viewed tree for candidate interaction terms.
# Took out Age99:Inc99 once triple interaction terms were added because it
# became insignificant.
# Numeric coding: main effects plus the Online/Income interaction.
fit2 <- lm(Profit99~.-ID + Online99:Inc99, data = pilgrim)
summary(fit2)
# Factor coding of the same idea, with both Online interactions.
fit2.fact <- lm(Profit99~.-ID+
Online99:Age99 + Online99:Inc99
, data = pilgrimFact)
summary(fit2.fact)
# Adds Tenure two-way terms and the Tenure:Inc:Age three-way term.
fit3 <- lm(Profit99~.-ID + Online99:Inc99 +Tenure99:Inc99 + Tenure99:Age99 + Tenure99:Inc99:Age99, data = pilgrim)
summary(fit3)
fit3.fact <- lm(Profit99~.-ID+
Online99:Age99 + Online99:Inc99 +
Tenure99:Inc99 + Tenure99:Age99 +
Tenure99:Inc99:Age99, data = pilgrimFact)
summary(fit3.fact)
# While Online99:Age99 is not significant at a .95 confidence level, it shows with some small
# level of confidence that as a customer increases in age buckets, their profitability for using
# online increases by an additional $4.14.
# Online99:Inc99 shows that as a customer increases in income buckets, their profitability for
# using online increases by an additional $8.98.
# This model changed the base Online coefficient from $18.24 to -49.19, indicating that Online
# has significant interaction effects with regards to profitability.
# 3. Best model
# The interaction model had a higher Adjusted R-squared and higher F-statistic.
# Enough observations so that 2 additional parameters is not a concern.
# 4. Online banking significantly improves profitability for old, rich customers.
# However, it decreases profitability for young, poor customers.
# Pilgrim should do targeted advertising/incentives for their old, rich customers to use
# online banking and either charge young, poor customers for online, not allow them to use it,
# or at the very least not advertise the service to them.
### PART 2 ###
### Experiment Analysis ###
# Per-customer 1999/2000 profits for an experiment that pushed customers online.
pilgrim2 <- read.csv("pilgrim A2 data part 2 experiment.csv")
# Coding assumed: condition == 1 is the experimental group, 0 is control —
# TODO confirm against the data dictionary (the original author was unsure too).
# 1. Profit comparison
# Summary statistics show that both mean and median profit increased from 99 to 00
# for the experimental group.
# A paired t-test shows that at a confidence level of .005 the two means
# are significantly different for the experimental group (paired is appropriate
# here since each customer contributes both a 99 and a 00 observation).
# A paired t-test shows that at a confidence level of .005 the two means
# are NOT significantly different for the control group.
summary(pilgrim2$Profit99[pilgrim2$condition==0])
summary(pilgrim2$Profit00[pilgrim2$condition==0])
summary(pilgrim2$Profit99[pilgrim2$condition==1])
summary(pilgrim2$Profit00[pilgrim2$condition==1])
# Within-customer 99 vs 00 comparison, separately by arm.
t.test(pilgrim2$Profit99[pilgrim2$condition==0], pilgrim2$Profit00[pilgrim2$condition==0], paired = TRUE)
t.test(pilgrim2$Profit99[pilgrim2$condition==1], pilgrim2$Profit00[pilgrim2$condition==1], paired = TRUE)
# 2. t-tests show that mean demographics are not significantly different between groups.
# Randomization appears successful.
# NOTE(review): scale=F uses the reassignable shortcut F — spell out FALSE.
ggplot(pilgrim2,aes(x=Profit99,y=Profit00,colour=factor(condition)))+geom_point()+
stat_smooth(method='lm',formula=y~scale(x,scale=F))
# Balance checks on the 1999 covariates (independent two-sample tests).
t.test(pilgrim2$Age99[pilgrim2$condition==0], pilgrim2$Age99[pilgrim2$condition==1], paired = FALSE)
t.test(pilgrim2$Inc99[pilgrim2$condition==0], pilgrim2$Inc99[pilgrim2$condition==1], paired = FALSE)
t.test(pilgrim2$Tenure99[pilgrim2$condition==0], pilgrim2$Tenure99[pilgrim2$condition==1], paired = FALSE)
# 3. A t-test shows that the mean profit in 2000 between the two groups is significant.
# It appears that migrating people to online was successful in increasing profit by on average $51.25
# per customer.
t.test(pilgrim2$Profit00[pilgrim2$condition==0], pilgrim2$Profit00[pilgrim2$condition==1], paired = FALSE)
# 4. The idealized scenario is unrealistic for a recommendation. It depends on how much it costs
# to approach each customer and convince them to use online, in addition to what percent of customers
# approached actually swap.
# In addition, it does not account for retention in the Online space or satisfaction of the customers
# who swapped to Online. If they are more profitable for a year but become disgruntled with the Online
# service and leave the bank, then this may hurt them in the long run.
# More data with customers' feedback on the Online service would help generate a more robust recommendation.
### PART 3 ###
### Out-of-Sample Prediction ###
# Experimental-arm customers only; keep the 1999 predictors plus the Profit00 target.
pilgrim3 <- filter(pilgrim2, condition==1) %>% select(ID, Profit99, Age99, Inc99, Tenure99, Profit00)
# Eyeballing candidate transformations for each predictor:
# Age possibly square root? ln? log10?
ggplot(pilgrim2, aes(x=Age99, y=Profit00)) +
geom_point(shape=1) +
geom_smooth()
# Income possibly exponential?
ggplot(pilgrim2, aes(x=Inc99, y=Profit00)) +
geom_point(shape=1) +
geom_smooth()
# Tenure possibly square root? ln? log10?
ggplot(pilgrim2, aes(x=Tenure99, y=Profit00)) +
geom_point(shape=1) +
geom_smooth()
# base model
# NOTE(review): this silently overwrites the Part 1 `fit2` (and `fit3`/`ct2`
# are reused below as well). Distinct names per part would prevent accidental
# cross-part reuse if sections are run out of order.
fit2 <- lm(Profit00~.-ID, data = pilgrim3)
summary(fit2)
# Transformations tried for each variable (significance of the added term):
# log(Age99) - .19 significance - MAYBE - .06 on full model - KEEP
# I(Inc99^2) - .00 significance - YES
# log(Tenure99) - .56 significance - NO
# I(Tenure99^.5) - .25 significance - MAYBE - no significant increase in Adj-R^2 on full model
# Base predictors plus the two transformations that survived screening.
fit3 <- lm(Profit00~.-ID+log(Age99) + I(Inc99^2), data = pilgrim3)
summary(fit3)
# Additional interaction terms based on the tree below.
# NOTE(review): ct2 here overwrites the Part 1 tree of the same name.
ct2 <- ctree(Profit00~.-ID-Profit99, data = pilgrim3, control = ctree_control(mincriterion = .95))
plot(ct2)
# Transformations plus tree-suggested two-way and three-way interactions.
fit4 <- lm(Profit00~.-ID + log(Age99) + I(Inc99^2) +
Inc99:Tenure99 +
Inc99:Age99 +
Tenure99:Inc99:Age99, data = pilgrim3)
summary(fit4)
# Removing the non-significant nonlinear transformations (keeps interactions).
fit5 <- lm(Profit00~.-ID +
Inc99:Age99 +
Tenure99:Inc99:Age99, data = pilgrim3)
summary(fit5)
# Feature engineering: flag customers in the upper age AND income buckets
# (both bucket codes above 4) as "oldRich" (1 = yes, 0 = no), motivated by
# the Part 1 finding that online banking pays off for old, rich customers.
pilgrim3$oldRich <- as.numeric(pilgrim3$Age99 > 4 & pilgrim3$Inc99 > 4)
# Refit the base model; the `.` on the RHS now picks up the new oldRich column.
fit6 <- lm(Profit00 ~ . - ID, data = pilgrim3)
summary(fit6)
# Model 7: fully interacted Profit99*Age99*Inc99*Tenure99 with three two-way
# terms subtracted out and NO intercept (-1).
# NOTE(review): despite the original label "adding oldRich and
# transformations", this formula contains neither oldRich nor any nonlinear
# transformation — and dropping the intercept inflates R-squared, so its
# summary() is not directly comparable to the other models.
fit7 <- lm(Profit00~Age99+Inc99+Tenure99+Profit99+Profit99*Age99*Inc99*Tenure99-Age99:Inc99-Age99:Tenure99-
Inc99:Tenure99-1, data = pilgrim3)
summary(fit7)
# Model 8: oldRich (via `.`) plus tree-suggested interactions; tree rerun at
# a looser .75 split criterion to surface more candidate splits.
ct3 <- ctree(Profit00~.-ID-Profit99, data = pilgrim3, control = ctree_control(mincriterion = .75))
plot(ct3)
fit8 <- lm(Profit00~.-ID +
Inc99:Age99 +
Tenure99:Inc99:Age99, data = pilgrim3)
summary(fit8)
# Model 9: oldRich plus interactions plus the nonlinear transformations.
fit9 <- lm(Profit00~.-ID + log(Age99) + I(Inc99^2) +
Inc99:Tenure99 +
Inc99:Age99 +
Tenure99:Inc99:Age99, data = pilgrim3)
summary(fit9)
# Model 10: base model with Age/Inc buckets coded as factors.
pilgrim3Fact <- pilgrim3
pilgrim3Fact$Age99 <- as.factor(pilgrim3Fact$Age99)
pilgrim3Fact$Inc99 <- as.factor(pilgrim3Fact$Inc99)
fit10 <- lm(Profit00~.-ID, data = pilgrim3Fact)
summary(fit10)
# Model 11: naive benchmark — last year's profit only.
fit11 <- lm(Profit00~Profit99, data = pilgrim3Fact)
# In-sample fitted values for models 2-10 (the original header said
# "fit3-fit9" but pred2 through pred10 are computed), rounded to cents.
# NOTE(review): rounding BEFORE computing residuals perturbs the SSR values
# slightly; compare unrounded predictions if exact SSRs matter.
pred2 <- round(predict(fit2),2)
pred3 <- round(predict(fit3),2)
pred4 <- round(predict(fit4),2)
pred5 <- round(predict(fit5),2)
pred6 <- round(predict(fit6),2)
pred7 <- round(predict(fit7),2)
pred8 <- round(predict(fit8),2)
pred9 <- round(predict(fit9),2)
pred10 <- round(predict(fit10),2)
# 2. While adjusted R-squared is highest for models 5 and 8, the sum of squared
# residuals is smallest for model 9. For this reason, model 9 appears the best
# in-sample, however there may be overfitting due to the small sample size and
# large number of predictors — hence the cross-validation below.
# Residuals follow the actual-minus-predicted convention.
resid2 <- pilgrim3$Profit00-pred2
resid3 <- pilgrim3$Profit00-pred3
resid4 <- pilgrim3$Profit00-pred4
resid5 <- pilgrim3$Profit00-pred5
resid6 <- pilgrim3$Profit00-pred6
resid7 <- pilgrim3$Profit00-pred7
resid8 <- pilgrim3$Profit00-pred8
resid9 <- pilgrim3$Profit00-pred9
resid10 <- pilgrim3$Profit00-pred10
# In-sample sums of squared residuals, printed for comparison.
sum(resid2^2)
sum(resid3^2)
sum(resid4^2)
sum(resid5^2)
sum(resid6^2)
sum(resid7^2) #10278974
sum(resid8^2)
sum(resid9^2) # best prediction
sum(resid10^2)
# 3. Model 8 appears the best.
# NOTE(review): fit7 actually posts the lowest CV error in every run below —
# presumably it was excluded as a candidate because of its unusual
# no-intercept, fully-interacted specification; confirm the intent.
# Increasing K seems to increase variance between models.
# Increasing R seems to decrease variance between models.
# Repeated K-fold CV; seed re-set before each K/R configuration so each
# configuration's partitions are reproducible. Recorded RMSEs in comments.
set.seed(12345)
cvFit(fit2, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=50) #219.0619
cvFit(fit3, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=50) #219.7486
cvFit(fit4, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=50) #219.4433
cvFit(fit5, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=50) #216.5191 - Close Second
cvFit(fit6, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=50) #217.6473
cvFit(fit7, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=50) #209.5914
cvFit(fit8, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=50) #215.8030 - Best Model
cvFit(fit9, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=50) #219.4305
cvFit(fit10, data=pilgrim3Fact, y=pilgrim3Fact$Profit00, K=3, R=50) #223.1041
set.seed(12345)
cvFit(fit2, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=500) #218.3528
cvFit(fit3, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=500) #219.7863
cvFit(fit4, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=500) #219.2754
cvFit(fit5, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=500) #216.1581 - Best Model
cvFit(fit6, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=500) #218.3660
cvFit(fit7, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=500) #211.7786
cvFit(fit8, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=500) #216.3155 - Close Second
cvFit(fit9, data=pilgrim3, y=pilgrim3$Profit00, K=3, R=500) #219.292
cvFit(fit10, data=pilgrim3Fact, y=pilgrim3Fact$Profit00, K=3, R=500) #222.5971
set.seed(12345)
cvFit(fit2, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=50) #217.6801
cvFit(fit3, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=50) #219.0235
cvFit(fit4, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=50) #217.3494
cvFit(fit5, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=50) #215.0576 - Close Second
cvFit(fit6, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=50) #217.8197
cvFit(fit7, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=50) #207.8711
cvFit(fit8, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=50) #214.7322 - Best Model
cvFit(fit9, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=50) #217.8236
cvFit(fit10, data=pilgrim3Fact, y=pilgrim3Fact$Profit00, K=5, R=50) #220.7548
set.seed(12345)
cvFit(fit2, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=500) #217.5831
cvFit(fit3, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=500) #218.8105
cvFit(fit4, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=500) #218.0610
cvFit(fit5, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=500) #215.3720 - Close Second
cvFit(fit6, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=500) #217.3684
cvFit(fit7, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=500) #207.8587
cvFit(fit8, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=500) #215.1616 - Best Model
cvFit(fit9, data=pilgrim3, y=pilgrim3$Profit00, K=5, R=500) #218.2483
cvFit(fit10, data=pilgrim3Fact, y=pilgrim3Fact$Profit00, K=5, R=500) #220.6839
# NOTE(review): no fresh set.seed() before the K=10 and K=300 runs below, so
# they continue the previous RNG stream and are not independently
# reproducible. With K=300 each fold is tiny — close to leave-one-out if the
# sample is around that size; verify n before interpreting.
cvFit(fit2, data=pilgrim3, y=pilgrim3$Profit00, K=10, R=500) #215.3394
cvFit(fit5, data=pilgrim3, y=pilgrim3$Profit00, K=10, R=500) #214.9193
cvFit(fit7, data=pilgrim3, y=pilgrim3$Profit00, K=10, R=500) #205.038
cvFit(fit2, data=pilgrim3, y=pilgrim3$Profit00, K=300, R=50) #215.3394
cvFit(fit5, data=pilgrim3, y=pilgrim3$Profit00, K=300, R=50) #214.9193
cvFit(fit7, data=pilgrim3, y=pilgrim3$Profit00, K=300, R=50) #205.038
cvFit(fit11, data=pilgrim3, y=pilgrim3$Profit00, K=300, R=50) #214.3859
# Hold-out validation set; cvFit refits fit7's formula on this data.
pilgrimVal <- read.csv("pilgrim A2 data part 2 validation.csv")
cvFit(fit7, data=pilgrimVal, y=pilgrimVal$Profit00, K=5, R=50)
# type = 'response' is accepted by predict.lm (it is the default).
pilgrimVal$pred <- round(predict(fit7, newdata = pilgrimVal, type = 'response'),2)
# NOTE(review): predicted-minus-actual here — the OPPOSITE sign convention of
# resid2..resid10 above; flip one of them for consistency if compared.
pilgrimVal$resid <- (pilgrimVal$pred - pilgrimVal$Profit00)
# 5. Yes, they should move forward with the incentive program.
# The model estimates that the mean profit will increase from $101.50 to $140.40.
# A paired t-test shows that this is a significant difference.
# They should not offer more than $38.90 as an incentive.
# The offer depends on how hard it is to get people to switch, which we do not have information on.
# I would suggest subsetting the group randomly into 5 different groups, offering
# $0, $5, $10, $15, $20 respectively, and examining the results.
targets <- read.csv("pilgrim A2 data part 2 targets.csv")
targets <- select(targets, -Online99)
# Rebuild the oldRich flag for the target customers.
# NOTE(review): fit7's formula does not include oldRich (or Online99), so
# this engineered column is never used by predict() below — harmless but
# misleading; either drop it or score with a model that uses it.
targets$oldRich <- rep(0, times = nrow(targets))
targets$oldRich[targets$Age99>4 & targets$Inc99>4] <- 1
targets$predProf00 <- predict(fit7, targets, type = "response")
# Paired comparison of each target's observed 1999 profit vs predicted 2000
# profit under the online-migration model.
t.test(targets$Profit99, targets$predProf00, paired = TRUE)