forked from nsmackler/dataviz
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlecture9.Rmd
141 lines (114 loc) · 4.05 KB
/
lecture9.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
```{r}
# Attach packages up front so the document knits in a fresh R session.
library(tidyverse)
# infer supplies specify(), hypothesize(), generate(), calculate(),
# visualize() and get_ci(), which the resampling chunks below call.
library(infer)
# Use the classic ggplot2 theme with a larger base font for every plot.
theme_set(theme_classic(base_size = 20))
```
using lapply to make a list of files (save as object)
```{r}
# Read every .txt file in the "movies" directory into a named list of tibbles.
# The directory is listed once so the data and the names cannot get out of
# step, and the pattern is anchored ("\\.txt$") so that "." is a literal dot
# and only a trailing extension is matched.
movie_paths <- list.files(
  path = "movies",
  pattern = "\\.txt$",
  full.names = TRUE
)
files <- lapply(movie_paths, function(file_name) {
  read_delim(file_name, delim = ",")
})
# Name each list element after its file, minus the ".txt" extension.
names(files) <- str_replace(
  string = basename(movie_paths),
  pattern = "\\.txt$",
  replacement = ""
)
names(files)
# Full join keeps rows from both tables; the title column is called
# "movie_title" in one table and "title" in the other.
movies_joined <- full_join(
  files[["movies_imdb"]],
  files[["movies_rottentom"]],
  by = c("movie_title" = "title")
)
```
bootstrapping for a confidence interval on a mean
```{r}
# Flag movies whose genre string contains "Action" and keep three columns.
movies_action <- files[["movies_imdb"]] %>%
  mutate(action = str_detect(genres, "Action")) %>%
  select(action, title_year, imdb_score)

# Fix the seed so everyone works with the same random sample.
set.seed(10)
movies_action_sample <- movies_action %>%
  drop_na(imdb_score) %>%
  sample_n(size = 50)

# Bootstrap distribution of the mean score (100 resamples), plotted.
movies_action_sample %>%
  specify(response = imdb_score) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  visualize()

# 95% percentile confidence interval for the mean from 1000 resamples.
movies_action_sample %>%
  specify(response = imdb_score) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  get_ci(level = 0.95, type = "percentile")
```
compare permutations to bootstrapping
```{r}
# Flag movies whose genre string contains "Action" and keep three columns.
movies_action <- files[["movies_imdb"]] %>%
  mutate(action = str_detect(genres, "Action")) %>%
  select(action, title_year, imdb_score)

# Fix the seed so everyone works with the same random sample.
set.seed(10)
# Draw 25 movies per group. Missing scores are removed *before* sampling
# (as in the previous chunk) so each group keeps 25 usable rows.
movies_action_sample <- movies_action %>%
  filter(!is.na(imdb_score)) %>%
  group_by(action) %>%
  sample_n(25) %>%
  ungroup()

# Permutation: shuffle the action labels under the independence null, so the
# difference in means is centred on zero.
permuted <- movies_action_sample %>%
  specify(formula = imdb_score ~ action) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in means", order = c("TRUE", "FALSE"))
permuted$random <- "permutation"

# Bootstrap: resample rows with replacement. No hypothesize() step here:
# a bootstrap distribution is not a null distribution, and infer warns about
# an untested workflow when "bootstrap" follows an independence null.
bootstrapped <- movies_action_sample %>%
  specify(formula = imdb_score ~ action) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("TRUE", "FALSE"))
bootstrapped$random <- "bootstrap"

# Stack the two sets of statistics and compare them in facetted histograms.
combined <- rbind(bootstrapped, permuted)
ggplot(combined, aes(x = stat)) +
  # bins = 30 is the ggplot2 default, stated explicitly to avoid the message.
  geom_histogram(fill = "grey25", bins = 30) +
  facet_wrap(~random, nrow = 2)
```
bootstrapping regression coefficients
first we can estimate a regression coefficient
```{r}
library(broom) # tidy() gives a data-frame summary of a model fit

# Fix the seed so everyone works with the same random sample.
set.seed(100)
# Keep the columns of interest, drop incomplete rows, then sample 100 movies.
movies_subset <- movies_joined %>%
  select(
    budget, title_year, movie_title, duration,
    imdb_score, metacritic, rotten_tomatoes
  ) %>%
  na.omit() %>%
  sample_n(size = 100)

# Regress the Rotten Tomatoes score on the IMDb score; show the default
# printout first, then the tidy coefficient table.
rt_fit <- lm(rotten_tomatoes ~ imdb_score, data = movies_subset)
rt_fit
tidy(rt_fit)
```
bootstrap or permute regression coefficients
```{r}
# Declare the model once; both resampling schemes below start from it.
slope_spec <- movies_subset %>%
  specify(formula = rotten_tomatoes ~ imdb_score)

# Bootstrap distribution of the slope (1000 resamples).
slope_spec %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "slope") %>%
  visualize()

# Permutation distribution of the slope under the independence null.
slope_spec %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope") %>%
  visualize()
```
but this can't be expanded to multiple predictors using `infer`
So we use the `mosaic` package
```{r}
library(mosaic)

# Refit the simple regression 100 times, each time on rows drawn with
# replacement from movies_subset; do() collects the coefficients.
lm_boot <- do(100) * lm(
  rotten_tomatoes ~ imdb_score,
  data = sample(movies_subset, replace = TRUE)
)
# Confidence intervals computed from the bootstrap fits.
confint(lm_boot)
```
what happens if we add in multiple variables?
```{r}
# Same bootstrap, now with two predictors and 1000 refits.
lm_boot <- do(1000) * lm(
  rotten_tomatoes ~ imdb_score + budget,
  data = sample(movies_subset, replace = TRUE)
)
head(lm_boot)
confint(lm_boot)

# Draw one grey line per bootstrap fit (its Intercept and imdb_score
# columns) behind the observed points and the fitted lm line.
ggplot(movies_subset, aes(x = imdb_score, y = rotten_tomatoes)) +
  geom_abline(
    intercept = lm_boot$Intercept,
    slope = lm_boot$imdb_score,
    col = "grey50",
    lwd = 3
  ) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
```
Plot all of the bootstrap iterations. What's another way to plot this?
```{r}
# Coefficients of the simple regression on the current subset. Computing them
# here keeps the blue line in step with the data; they were previously typed
# in by hand as -54.8 and 15.1 (presumably rounded from an earlier fit), which
# goes stale whenever the seed, the sample or the data change.
point_fit <- coef(lm(rotten_tomatoes ~ imdb_score, data = movies_subset))

# One light-grey line per bootstrap fit, the observed points on top, and the
# simple-regression line in blue.
ggplot(movies_subset, aes(x = imdb_score, y = rotten_tomatoes)) +
  geom_abline(
    intercept = lm_boot$Intercept,
    slope = lm_boot$imdb_score,
    col = "grey70",
    lwd = 3
  ) +
  geom_point() +
  geom_abline(
    intercept = point_fit[["(Intercept)"]],
    slope = point_fit[["imdb_score"]],
    col = "blue",
    lwd = 2
  )
```