forked from OpenSourceAP/CrossSectionDemos
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmclean_pontiff_main.R
162 lines (122 loc) · 3.89 KB
/
mclean_pontiff_main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# 2021 12 Andrew Chen: replicate the main result of McLean-Pontiff
# ENVIRONMENT ====
# NOTE: the workspace is deliberately NOT cleared with rm(list = ls()).
# Wiping the caller's global environment is a destructive side effect and
# does not reset loaded packages or options anyway; restart the R session
# before running if a clean state is required.
library(tidyverse)
library(data.table)
library(googledrive)
library(readxl)
library(RColorBrewer)
library(lubridate)

### USER ENTRY
# Google Drive folder holding the data release. Keep exactly one assignment
# active; the others are kept as commented-out alternatives.
# root of March 2022 release
# pathRelease = 'https://drive.google.com/drive/folders/1O18scg9iBTiBaDiQFhoGxdn4FdsbMqGo'
# root of August 2023
pathRelease = 'https://drive.google.com/drive/u/0/folders/1EP6oEabyZRamveGNyzYU0u6qJ-N43Qfq'

# login to gdrive
# this prompts a login (the listing itself is only printed, not stored)
pathRelease %>% drive_ls()

# create temporary directory for the downloads below; showWarnings = FALSE
# keeps re-runs quiet when the directory already exists
dir.create('temp/', showWarnings = FALSE)
# DOWNLOAD DATA =====
# List the release root once and reuse the listing for both lookups below
# (each drive_ls() call is a round trip to Google Drive).
release_ls = pathRelease %>% drive_ls()

## download signal documentation and show user
target_dribble = release_ls %>%
  filter(name == 'SignalDoc.csv')
if (nrow(target_dribble) == 0) {
  stop('SignalDoc.csv not found in the release folder', call. = FALSE)
}
# [1, ] keeps a single file if several share the name, matching the
# returns download further down
drive_download(
  target_dribble[1, ], path = 'temp/deleteme.csv', overwrite = TRUE
)

# One row per signal: publication date is set to Dec 31 of `Year`, the
# original sample runs from Jan 1 of `SampleStartYear` to Dec 31 of
# `SampleEndYear`.
signaldoc = fread('temp/deleteme.csv') %>%
  mutate(
    signalname = Acronym,
    pubdate = as.Date(paste0(Year, '-12-31')),
    sampend = as.Date(paste0(SampleEndYear, '-12-31')),
    sampstart = as.Date(paste0(SampleStartYear, '-01-01'))
  ) %>%
  arrange(signalname) %>%
  select(signalname, pubdate, sampend, sampstart)

## download all long-short returns (OP) ====
# walk down Portfolios / Full Sets OP to the wide returns file
target_dribble = release_ls %>%
  filter(name == 'Portfolios') %>% drive_ls() %>%
  filter(name == 'Full Sets OP') %>% drive_ls() %>%
  filter(name == 'PredictorLSretWide.csv')
if (nrow(target_dribble) == 0) {
  stop(
    'PredictorLSretWide.csv not found under Portfolios/Full Sets OP',
    call. = FALSE
  )
}
drive_download(
  target_dribble[1, ], path = 'temp/deleteme.csv', overwrite = TRUE
)

# reshape wide (one column per signal) to long (date, signalname, ret)
# and drop the missing returns
ret0 = fread('temp/deleteme.csv') %>%
  pivot_longer(-c(date), names_to = 'signalname', values_to = 'ret') %>%
  filter(!is.na(ret))
# MERGE AND FIND OUT OF SAMPLE RETURNS ====
# Attach each signal's sample/publication dates to its monthly returns and
# label every observation by period. Observations dated before sampstart
# (or with missing dates) get NA and are dropped.
ret1 = ret0 %>%
  left_join(signaldoc, by = 'signalname') %>%
  mutate(
    samptype = case_when(
      (date >= sampstart) & (date <= sampend) ~ 'in-samp',
      (date > sampend) & (date <= pubdate) ~ 'out-of-samp',
      (date > pubdate) ~ 'post-pub',
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(samptype))

# Mean return and t-stat (mean / sd * sqrt(n)) per signal and period.
# .groups = 'drop' returns an ungrouped table so no leftover grouping leaks
# into later steps.
sumsignal = ret1 %>%
  group_by(signalname, samptype) %>%
  summarize(
    rbar = mean(ret),
    tstat = rbar / sd(ret) * sqrt(n()),
    .groups = 'drop'
  )

# remove bad reproductions / bad predictors (if desired)
# The -Inf threshold keeps every signal with a non-missing in-sample
# t-stat; raise it to apply a real screen.
signalok = sumsignal %>%
  filter(samptype == 'in-samp') %>%
  filter(tstat > -Inf) %>%
  select(signalname)

sumsignal = sumsignal %>% inner_join(signalok, by = 'signalname')
# check out of sample decay simple way ====
# average the per-signal mean returns within each sample type and count
# how many signals contribute to each average
sumsamp = summarize(
  group_by(sumsignal, samptype),
  rbar = mean(rbar), nsignal = n()
)

# the in-sample average is the benchmark the other periods are compared to
insamp_row = filter(sumsamp, samptype == 'in-samp')
baseline = as.numeric(select(insamp_row, rbar))

# decay = fraction of the in-sample mean return that is lost
sumsamp = mutate(sumsamp, decay = (baseline - rbar) / baseline)

sumsamp
# BOOTSTRAP MEAN DISTRIBUTIONS ====
# clustered by month
set.seed(6)
nboot = 200

# Bootstrap the pooled mean return for one sample type, resampling whole
# months (rows) with replacement so that all signals in a month move
# together.
#
# Arguments:
#   sampname   value of `samptype` to keep, e.g. 'in-samp'
#   data       long table with columns signalname, date, ret, samptype;
#              defaults to the global `ret1`
#   ndraw      number of bootstrap draws; defaults to the global `nboot`
#   minsignal  a month is kept only if MORE than this many signals have a
#              non-missing return in it
# Returns: numeric vector of length `ndraw` holding the pooled mean return
#   of each resampled month-by-signal matrix.
bootfun = function(sampname, data = ret1, ndraw = nboot, minsignal = 10){

  # make wide dataset (months in rows, signals in columns); signals with no
  # return in a month are NA
  wide_is = data %>%
    filter(samptype == sampname) %>%
    select(signalname, date, ret) %>%
    pivot_wider(names_from = signalname, values_from = ret) %>%
    select(-date) %>%
    as.matrix()

  # keep only months with enough signals; drop = FALSE keeps a matrix even
  # when a single month qualifies
  tgood = rowSums(!is.na(wide_is)) > minsignal
  mat = wide_is[tgood, , drop = FALSE]
  nmonth = nrow(mat)

  if (nmonth == 0) {
    stop(
      "no months with more than ", minsignal,
      " signals for samptype '", sampname, "'",
      call. = FALSE
    )
  }

  # bootstrap pooled mean
  rboot = rep(NA_real_, ndraw)
  for (i in seq_len(ndraw)){
    # sample.int(n, replace = TRUE) draws the same indices as
    # sample(1:n, replace = TRUE), so seeded results are unchanged
    tempt = sample.int(nmonth, replace = TRUE)
    rboot[i] = mean(mat[tempt, , drop = FALSE], na.rm = TRUE)
  }

  rboot
} # end bootfun
# bootstrap for each sample type
# (order matters: both calls draw from the same seeded RNG stream)
rboot1 = bootfun('in-samp')
rboot2 = bootfun('out-of-samp')

# compile and plot
# stack the two bootstrap distributions in long form, tagged by sample type
bootdat = rbind(
  data.frame(pooled_mean_ret = rboot1, samptype = 'in-samp'),
  data.frame(pooled_mean_ret = rboot2, samptype = 'out-of-samp')
)

# Overlaid density histograms of the two distributions. Bins run from 0 to
# 1 in steps of 0.025. after_stat(density) replaces the deprecated
# ..density.. notation.
bootdat %>%
  ggplot(aes(x = pooled_mean_ret, fill = samptype)) +
  geom_histogram(
    aes(y = after_stat(density)),
    alpha = 0.6, position = 'identity', breaks = seq(0, 1, 0.025)
  ) +
  ggtitle('bootstrapped distribution') +
  labs(x = 'pooled mean return (% monthly)') +
  geom_vline(xintercept = 0)