-
Notifications
You must be signed in to change notification settings - Fork 0
/
DM Bootstrap Functions.R
191 lines (136 loc) · 6.26 KB
/
DM Bootstrap Functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#Dummy data
#set.seed(1)
#e1 <- rnorm(10, mean = 0, sd = 1)
#set.seed(2)
#e2 <- rnorm(10, mean = 0, sd = 1)
# FUNCTION 1 ; Do a dm test on bootstrapped sample
dm.bootstrap <- function(e1,e2, h, seed){
# WHAT THE FUNCTION DOES
# This function takes two error vectors which are time indexed, then constructs a bootstrap sample
# (indentical rows, with replacement) from both error vectors . The time index of error vectors is
# matched by setting the same seed for each round of sampling using set.seed(seed). [See code below]
#ARGUMENTS
# - e1 and e2 are error vectors from which we will be resampling from
# - h is a parameter that represents the forecasting time horizon for the error vectors
# - seed is a random integer that will be used as an arg. for set.seed so sampling from error vectors
# is matched.
#VALUE
# Function returns the p.val when Diebold-Mariano test is conducted for bootstrapped sample errors
#Step1: Select random integer to be the seed for sampling
#seed <- round(runif(1, min = 1, max = 10000)) #This isn't changing when I re-run the code. I'll set the see outside the function
#Step2: Sampling from error vectors(matching of time series indices are preserved using the random seed above)
#Only continue with test if error vectors have the same length
l1 <- length(e1)
l2 <- length(e2)
if(l1==l2){
set.seed(seed)
boot_e1 <- sample(e1, l1, replace = TRUE)
set.seed(seed)
boot_e2 <- sample(e2, l2, replace = TRUE)
} else {
print("Error vectors do not have the same length")
}
test <- dm.test(boot_e1, boot_e2, h = h, alternative = c("two.sided")) #Need to specify the forecast horizon(h), default is 1
return(c(test$p.value))
}
# FUNCTION 2: Repeated the dm test on a bootstrap sample N times (uses function 1)
#Question: Think of a more efficient way to do this using dplyr
repeat.dm.bootstrap <- function(e1,e2,N,h){
# Conducts N DM Test(forecast horizon h) on a bootstrapped sample from e1 and e2
# Returns a vector of N p.values from the bootstrap dm tests
library(forecast) #Needed by dm.bootstrap func. to do Diebold-Mariano Test
p.values <- NULL
for(i in 1:N){
rand.int <- sample.int(1000, 1)
p.values[i] <- dm.bootstrap(e1, e2, h, rand.int)
}
return(p.values)
}
#Test
p_vals <- repeat.dm.bootstrap(e_1, e_2, 1e5, h = 1) #Check if seed for rand.int is changing as a control measure
hist(p_vals)
####Delete this later
#Data
#log. returns for ARIMA
index.0 <- read.table("top40index.txt")
index.1 <- as.matrix(index.0)
index.2 <-(t(index.1))[2:1828,1]
index <- as.numeric(index.2)
log.return <- NULL
for(i in 1:length(index)-1){
x <- index[i]
y <- index[i+1]
log.return[i] <-log(y/x)
}
#data-frame for prophet package
start_date <- as.Date("2012-07-01")
end_date <- as.Date("2014-01-01") # Should be "2017-06-30" if we use the full time period, i.e w/o in- sample and out of sample
dates <- bizseq(start_date,end_date,"actual")
prophet_data <- data.frame(ds = dates,y = log.return[1:550]) #Note to self : remove [1:1766] If forecasting using full period
#Model estimation
library(forecast)
arima.fit <- auto.arima(data$return, approximation = F, trace = F) #This also builds the best ARIMA model and it turns out that you got it right. Its the arima(2,0,2) :)
prophet.fit <- prophet(df = prophet_data,yearly.seasonality = TRUE)
#Model Forecasts and residuals
#ARIMA
start_date <- 1766 #???
end_date <- 1825 # ???
period <- 1
arima_residuals <- list()
j = 1
for(i in seq(start_date, end_date, period)){
arima_model <- arima(data$returns[1:i], order = c(2,0,2),
optim.control = list(maxit = 1000))
arima_pred <- predict(arima_model, n.ahead = 5, newdata = data$returns[(i+1):(i+5)])
arima_error <- arima_pred$pred - data$returns[(i+1):(i+5)]
arima_residuals[[j]] <- data.frame(ret = data$returns[(i+1):(i+5)],
pred = as.numeric(arima_pred$pred),
error = as.numeric(arima_error))
j <- j + 1
}
do.call(rbind, arima_residuals)
arima_five_res <- do.call(rbind, arima_residuals)
plot(arima_five_res$ret, type = "l")
lines(arima_five_res$pred, col = 2)
#Prophet
pred_period_dates <- make_future_dataframe(proph_fit,periods = 1826-550) #Dates for prediction period
library("tidyr")
proph_forecast <- predict(proph_fit,pred_period_dates)
proph_in_sample_forecast <- proph_forecast$yhat[1:550]
prophet_residuals <- proph_in_sample_forecast - log.return[1:550]
#Bootstrap
e1 <- arima_five_res$error #Qn: There's 300 of them, how to I control this? Why are there NA's?
e2 <- prophet_residuals[1:300] #Truncated this so the length of the matrices match
p_vals <-repeat.dm.bootstrap(e1, e1, 1000, h = 1) #Check if seed for rand.int is changing as a control measure
hist(p_vals)
# Hanjo
#Model Forecasts and residuals
#ARIMA
arima_eval <- function(ret_data, start_date, end_date, period = 1){
library(lubridate)
start_date <- as.Date("2013-11-01") #???
end_date <- as.Date("2014-01-01") # ???
period <- paste0("+", period," day")
for_period <- period
arima_residuals <- list()
j = 1
for(i in seq(start_date, end_date, by = period)){
#training
train <- which(prophet_data$ds < start_date)
test <- which(prophet_data$ds %in% seq(from = start_date,
to = (start_date + (for_period-1)),
by = "day"))
arima_model <- arima(prophet_data$y[train], order = c(2,0,2),
optim.control = list(maxit = 1000))
arima_pred <- forecast(arima_model, h = for_period)
arima_error <- arima_pred$mean - prophet_data$y[test]
arima_residuals[[j]] <- data.frame(ret = prophet_data$y[test],
pred = as.numeric(arima_pred$mean),
error = as.numeric(arima_error))
j <- j + 1
}
do.call(rbind, arima_residuals)
arima_res <- do.call(rbind, arima_residuals)
return(arima_res)
}
arima_eval()