forked from nsmackler/dataviz
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlecture4.Rmd
141 lines (109 loc) · 3.76 KB
/
lecture4.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
```{r}
library(tidyverse)
```
Read in the two movie data files (IMDB and Rotten Tomatoes).
```{r}
# Read the two comma-delimited movie tables from the `movies/` folder.
movies_imdb <- read_delim(
  "movies/movies_imdb.txt",
  delim = ","
)
movies_rottentom <- read_delim(
  "movies/movies_rottentom.txt",
  delim = ","
)
```
## Including Plots
You can also embed plots, for example:
```{r}
# Print only the title, release-year, duration and score columns.
movies_imdb %>%
  select(
    movie_title,
    title_year,
    duration,
    imdb_score
  )
```
```{r}
# Store the same four-column subset of the IMDB table for later use.
imdb_minimal <- movies_imdb %>%
  select(movie_title, title_year, duration, imdb_score)
```
```{r}
# Rebuild the four-column IMDB subset ...
imdb_minimal <- movies_imdb %>%
  select(movie_title, title_year, duration, imdb_score)

# ... and full-join it to the Rotten Tomatoes table, matching the IMDB
# `movie_title` column against the Rotten Tomatoes `title` column.
joined <- full_join(
  imdb_minimal,
  movies_rottentom,
  by = c("movie_title" = "title")
)
```
ggplot data
```{r}
# A ggplot object that holds only the data: no aesthetics, no layers.
plot_imdb <- ggplot(data = movies_imdb)
# Inspect what the plot object currently contains.
summary(plot_imdb)
```
ggplot aesthetics
```{r}
# Same data, now with release year mapped to x and IMDB score mapped to y.
plot_imdb <- ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score)
# The summary now also lists the aesthetic mapping.
summary(plot_imdb)
```
Add layers to a ggplot object with +
```{r}
# Build the same plot in two steps: start from the data alone ...
plot_imdb <- ggplot(data = movies_imdb)
# ... then attach the x/y aesthetic mapping to the stored object with `+`.
plot_imdb <- plot_imdb +
  aes(x = title_year, y = imdb_score)
summary(plot_imdb)
```
ggplot geoms
```{r}
# Add a point layer to the stored plot object.
plot_imdb <- plot_imdb +
  geom_point()
# Inspect the object (it now has a layer), then draw it.
summary(plot_imdb)
plot_imdb
```
back to the slides for a second.
some nice default themes
```{r}
# Apply the classic theme with a base font size of 20 to this one plot,
# then draw it.
plot_imdb <- plot_imdb +
  theme_classic(base_size = 20)
plot_imdb
```
set it globally
```{r}
# Make the classic theme (base font size 20) the session-wide default theme.
theme_set(
  theme_classic(base_size = 20)
)
```
and axis labels
```{r}
# Preview two axis-label variants without saving them ...
plot_imdb +
  labs(x = "movie release year", y = "IMDB score")
# ... ("\n" inserts a line break inside the x-axis label) ...
plot_imdb +
  labs(x = "movie release\n(year)", y = "IMDB score")
# ... then store the single-line version back into `plot_imdb`.
plot_imdb <- plot_imdb +
  labs(x = "movie release year", y = "IMDB score")
```
ggplot scale
```{r}
# Restrict the y axis to 0-10 and the x axis to 1975-2010.
# NOTE(review): limits set on a scale remove out-of-range observations
# rather than just zooming; confirm that is intended here (the zoom-only
# alternative would be coord_cartesian()).
plot_imdb +
  scale_y_continuous(limits = c(0, 10)) +
  scale_x_continuous(limits = c(1975, 2010))
```
ggplot statistics
Note: every stat has a default geom, and every geom has a default stat.
```{r}
# Overlay a smoother on the scatter plot, one variant per plot:
# 1. the default smoothing method,
plot_imdb + stat_smooth()
# 2. a linear-model ("lm") fit,
plot_imdb + stat_smooth(method = "lm")
# 3. the same linear fit with `se` switched off.
#    Spell out FALSE: `F` is an ordinary variable that can be reassigned,
#    whereas FALSE is a reserved word.
plot_imdb + stat_smooth(method = "lm", se = FALSE)
```
Overplotting is a problem: many points land on top of each other. How can we summarize the data instead?
```{r}
# Tackle overplotting with transparency: draw the scatter plot twice with
# increasingly transparent points, titling each plot with the alpha used.
plot_imdb <- ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score) +
  geom_point(alpha = 0.1) +
  labs(x = "movie release year", y = "IMDB score") +
  theme_classic(base_size = 20) +
  ggtitle("alpha == 0.1")
plot_imdb

plot_imdb <- ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score) +
  geom_point(alpha = 0.01) +
  labs(x = "movie release year", y = "IMDB score") +
  theme_classic(base_size = 20) +
  ggtitle("alpha == 0.01")
plot_imdb
```
another way
```{r}
# Alternative to transparency: bin the x/y plane and draw the 2-D bins.
ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score) +
  geom_bin2d() +
  labs(x = "movie release year", y = "IMDB score") +
  theme_classic(base_size = 20)
```
another way
```{r}
# Alternative: geom_count() draws one point per distinct x/y location,
# here made mostly transparent (alpha = 0.1).
ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score) +
  geom_count(alpha = 0.1) +
  labs(x = "movie release year", y = "IMDB score") +
  theme_classic(base_size = 20)
```
another way
```{r}
# Alternative: keep the raw points and overlay geom_quantile() on top.
# Layer order matters here: points first, quantile lines second.
ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score) +
  geom_point() +
  geom_quantile() +
  labs(x = "movie release year", y = "IMDB score") +
  theme_classic(base_size = 20)
```
Let's change the color of the points
Aesthetics: mapping vs. setting
```{r}
# Base plot (no layers yet) shared by the two examples below.
imdb_plot <- ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score) +
  labs(x = "movie release year", y = "IMDB score") +
  theme_classic(base_size = 20)

# Setting: `col` is given outside aes(), so it is a fixed property of the
# layer and "green" is used as the colour itself.
imdb_plot +
  geom_point(col = "green")

# Mapping: `col` is given inside aes(), so "green" is handled as a value to
# be mapped through the colour scale rather than as a colour name.
imdb_plot +
  geom_point(aes(col = "green"))
```
So how would we color them by their total budget?
```{r}
# Base plot: release year on x, IMDB score on y, labelled axes, classic theme.
imdb_plot <- ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score) +
  labs(x = "movie release year", y = "IMDB score") +
  theme_classic(base_size = 20)

# Map point colour to the `budget` column (a mapping, so it goes in aes()).
imdb_plot +
  geom_point(aes(col = budget))
```
Back to stats: how about separate plots for color and black-and-white movies?
```{r}
# NOTE(review): this chunk is introduced as "plots for color and b/w movies",
# but the code is the same as the budget-coloured scatter plot in the previous
# chunk: nothing here distinguishes color from black-and-white films or adds
# a stat. It looks like a copy-paste placeholder; confirm which column marks
# color vs. b/w and update the mapping accordingly.
imdb_plot <- ggplot(data = movies_imdb) +
  aes(x = title_year, y = imdb_score) +
  labs(x = "movie release year", y = "IMDB score") +
  theme_classic(base_size = 20)

# Points coloured by the `budget` column.
imdb_plot +
  geom_point(aes(col = budget))
```
Piping a tidyverse summary straight into ggplot
```{r}
# Summarise IMDB scores per release year (min, max, mean) and pipe the result
# straight into ggplot: the mean is drawn as a point, and the min-max range as
# an error bar.
# NOTE(review): min/max/mean are called without na.rm, so any missing
# imdb_score within a year would make that year's summaries NA; confirm the
# column has no missing values.
movies_imdb %>%
  group_by(title_year) %>%
  summarize(
    min_score = min(imdb_score),
    max_score = max(imdb_score),
    mean_score = mean(imdb_score)
  ) %>%
  ggplot(
    aes(
      x = title_year,
      y = mean_score,
      ymin = min_score,
      ymax = max_score
    )
  ) +
  geom_errorbar() +
  geom_point()
```