-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathmodule4.Rmd
150 lines (118 loc) · 4.23 KB
/
module4.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
---
output:
  pdf_document: default
  html_document: default
---
start your engines (load your packages)
```{r}
# Attach the tidyverse; the chunks below call its ggplot(), read_delim(),
# pivot_longer(), filter() and str_detect() functions.
library(tidyverse)

# Make theme_classic() with a base size of 20 the default theme for all plots.
theme_set(
  theme_classic(base_size = 20)
)
```
read in the files
```{r}
# Both movie tables are read as comma-delimited text from the movies/ folder.
movies_imdb <- read_delim(
  file = "movies/movies_imdb.txt",
  delim = ","
)
movies_rottentom <- read_delim(
  file = "movies/movies_rottentom.txt",
  delim = ","
)
```
read in drinks and convert it to long format with just beer, wine, and spirits
```{r}
# Read the tab-delimited drinks table.
drinks <- read_delim(file = "drinks/drinks_US.tsv", delim = "\t")

# Drop the total column, then stack the beer:spirits columns into long format:
# the former column name goes to `type`, its value to `gallons`.
drinks_long <- pivot_longer(
  select(drinks, -total),
  cols = beer:spirits,
  names_to = "type",
  values_to = "gallons"
)
drinks_long

# First look at the long table: gallons against year as points.
ggplot(drinks_long, aes(year, gallons)) +
  geom_point()
```
make some plots (what can we do to make this better?)
```{r}
# Scatter of gallons against year, with no visual distinction between types.
drinks_long %>%
  ggplot(aes(x = year, y = gallons)) +
  geom_point()
```
line plot? wow - that's weird...
```{r}
# Same mapping as the scatter, drawn with geom_line() and no grouping.
drinks_long %>%
  ggplot(aes(x = year, y = gallons)) +
  geom_line()
```
what if we use the "group" mapping... hmmm
```{r}
# Map `type` to the group aesthetic so geom_line() draws one line per type.
ggplot(drinks_long, aes(year, gallons, group = type)) +
  geom_line()
```
let's add some color so we know what drink is what
```{r}
# Colour each line by drink type; group = type is still given explicitly here.
ggplot(drinks_long, aes(year, gallons, group = type, colour = type)) +
  geom_line()

## note: ggplot2 groups by the discrete variables mapped in the plot, so a
## colour (or fill) mapping to `type` already splits the lines per type;
## leaving out group = type is effectively the same plot
ggplot(drinks_long, aes(year, gallons, colour = type)) +
  geom_line()
```
What about better colors? (What's wrong with this code? Why doesn't it work?)
```{r}
# Left commented out on purpose: this is the non-working example that the
# question above the chunk refers to. Uncomment it to reproduce the failure.
# NOTE(review): only two colours are supplied, while drinks_long was pivoted
# from the beer:spirits columns (beer, wine and spirits per the notes) --
# presumably that mismatch is the intended error; confirm by running.
#ggplot(data=drinks_long,aes(x=year,y=gallons,col=type))+geom_line()+scale_color_manual(values = c("black","red"))
```
What about better colors?
```{r}
# One coloured line per type, using the ColorBrewer "Set1" palette.
drinks_long %>%
  ggplot(aes(x = year, y = gallons, colour = type)) +
  geom_line() +
  scale_color_brewer(palette = "Set1")
```
What about conveying more information? (why are the colors not the RColorBrewer palette colors?)
```{r}
# Area chart filled by type. The colour scale below is kept exactly as in the
# exercise: it is the subject of the question asked above this chunk.
ggplot(drinks_long, aes(year, gallons, fill = type)) +
  geom_area() +
  scale_color_brewer(palette = "Set1")
```
well that didn't work as well as we thought:
BOOM:
```{r}
# Area chart filled by type, with the "Set1" palette applied to the fill scale.
drinks_long %>%
  ggplot(aes(x = year, y = gallons, fill = type)) +
  geom_area() +
  scale_fill_brewer(palette = "Set1")
```
we can also change the hue and saturation manually
```{r}
# Hue-based fill with both luminance (l) and chroma (c) set by hand.
ggplot(drinks_long, aes(year, gallons, fill = type)) +
  geom_area() +
  scale_fill_hue(l = 45, c = 50)

# Same chart with only the luminance overridden.
ggplot(drinks_long, aes(year, gallons, fill = type)) +
  geom_area() +
  scale_fill_hue(l = 45)
```
## Faceting
```{r}
# One panel per drink type, laid out as columns by facet_grid(~type).
drinks_long %>%
  ggplot(aes(x = year, y = gallons)) +
  geom_line() +
  facet_grid(~type)
```
arranging multiple figures with grid.arrange() from the gridExtra package
```{r}
# gridExtra supplies grid.arrange().
library(gridExtra)

# Two versions of the same area chart that differ only in the hue settings.
plot1 <- drinks_long %>%
  ggplot(aes(x = year, y = gallons, fill = type)) +
  geom_area() +
  scale_fill_hue(l = 45)
plot2 <- drinks_long %>%
  ggplot(aes(x = year, y = gallons, fill = type)) +
  geom_area() +
  scale_fill_hue(l = 45, c = 50)

# Place both plots in a single row.
grid.arrange(plot1, plot2, nrow = 1)
```
Can we save these figures? How should we save them?
```{r}
# Build the area chart once and keep it in a variable so the same object can
# be both displayed and written to disk.
drinks_plot <- ggplot(drinks_long, aes(x = year, y = gallons, fill = type)) +
  geom_area() +
  scale_fill_brewer(palette = "Set1")
drinks_plot

# Name `filename` and `plot` explicitly instead of relying on `file=`
# partially matching `filename` and on the plot landing in the second
# positional slot.
ggsave(
  filename = "~/Downloads/drinks.pdf",
  plot = drinks_plot,
  width = 10,
  height = 4
)
```
## STRINGS ON STRINGS ON STRINGS
We're going to use the stringr package
using str_detect and str_subset: which one returns a logical?
```{r}
# Same column and pattern passed to two different stringr verbs: compare the
# two outputs.
str_detect(string = movies_imdb$genres, pattern = "Action")
str_subset(string = movies_imdb$genres, pattern = "Action")

## what's up with this output? Why is it different than the line above?
str_subset(string = movies_imdb$genres, pattern = "action")
```
stringr and filter()
```{r}
# Keep only the rows whose genres string contains "Action".
action_movies <- filter(movies_imdb, str_detect(genres, "Action"))
action_movies

## looks like more action movies have been made recently, but is this true?
## What's our null expectation given the data?
action_movies %>%
  ggplot(aes(x = title_year)) +
  geom_histogram() +
  xlab("movie release (year)")
```
What about movies with a lead actor whose name begins with "Ch"
```{r}
# `^` anchors the pattern to the start of the name; the three columns of
# interest are then moved to the front, followed by everything else.
movies_imdb %>%
  filter(str_detect(string = actor_1_name, pattern = "^Ch")) %>%
  select(actor_1_name, title_year, movie_title, everything())
```
What about movies with a lead actor whose name ends with "son"
```{r}
# Lead actors whose name ends in "son". `$` anchors the pattern to the end of
# the string, so the whole suffix has to be spelled out: a bare "on$" would
# also keep every name that merely ends in "on".
movies_imdb %>%
  filter(str_detect(actor_1_name, "son$")) %>%
  select(actor_1_name, title_year, movie_title, everything())
```
EVERYTHING IS CASE SENSITIVE
use str_to_lower() and str_to_upper() to fix that
Dates are fun
```{r}
# lubridate supplies ymd() and the year()/month()/mday() accessors.
library(lubridate)

# The same date written several ways, each passed to ymd(): as a number,
# with slashes, and with an abbreviated or full month name.
ymd(20101215)
ymd("2010/12/15")
ymd("2010Dec15")
ymd("2010December15")

# Keep one parsed date and pull out its components.
date1 <- ymd("2010 Dec 15")
year(date1)
month(date1)
mday(date1)
```