-
Notifications
You must be signed in to change notification settings - Fork 0
/
Seminar2.R
120 lines (101 loc) · 4.17 KB
/
Seminar2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
library("tidyverse")
library("readr")
library("dplyr")
# Location of the participants CSV in the teaching_R GitHub repository.
urlfile <- "https://raw.githubusercontent.com/CWWhitney/teaching_R/master/participants_data.csv"
# read_csv() downloads http(s) paths itself, so no explicit url() connection
# has to be opened (or left for the reader to close).
participants_data <- read_csv(urlfile)
# Preview the first seven rows of the participants data.
participants_data %>%
  head(n = 7)
# List the column names, then print the compact structure of the data.
participants_data %>%
  names()
participants_data %>%
  str()
# Extract the gender column as a plain vector.
participants_data[["gender"]]
# Keep only the batch and age columns.
participants_data %>%
  select(batch, age)
# Keep every column except batch and age.
participants_data %>%
  select(-batch, -age)
# Rows for participants who work more than 5 hours a day.
participants_data %>%
  filter(working_hours_per_day > 5)
# Rows for participants who work more than 5 hours a day and whose
# first name has more than three letters (comma-separated conditions
# are combined with a logical AND).
participants_data %>%
  filter(
    working_hours_per_day > 5,
    letters_in_first_name > 3
  )
# Rename letters_in_first_name to name_length and
# km_home_to_office to commute.
participants_data %>%
  rename(
    name_length = letters_in_first_name,
    commute = km_home_to_office
  )
# Add labor_mean (working hours times the mean working hours of the group)
# and age_mean (age times the mean age of the group).
participants_data %>%
  mutate(
    labor_mean = working_hours_per_day * mean(working_hours_per_day),
    age_mean = age * mean(age)
  )
# Add response_speed: "slow" when the email response took more than one day,
# "fast" otherwise.
participants_data %>%
  mutate(
    response_speed = ifelse(days_to_email_response > 1, "slow", "fast")
  )
# Summarise the participants data: mean number of siblings and
# median years of study.
participants_data %>%
  summarise(
    mean(number_of_siblings),
    median(years_of_study)
  )
# Per-gender summary built with the magrittr pipe: mean days to email
# response, median letters in first name, and maximum years of study.
participants_data %>%
  group_by(gender) %>%
  summarise(
    mean(days_to_email_response),
    median(letters_in_first_name),
    max(years_of_study)
  )
# Label participants living more than 10 km from the office as "commuter"
# and everyone else as "local", then report per label the mean days to
# email response, median letters in first name, and maximum years of study.
participants_data %>%
  mutate(
    commute = ifelse(km_home_to_office > 10, "commuter", "local")
  ) %>%
  group_by(commute) %>%
  summarise(
    mean(days_to_email_response),
    median(letters_in_first_name),
    max(years_of_study)
  )
# One linear model per batch: split the data by batch, regress days to
# email response on working hours per day within each piece, summarise
# every fit, and pull out the R^2 values as a named numeric vector.
participants_data %>%
  split(.$batch) %>%
  map(function(batch_rows) {
    lm(days_to_email_response ~ working_hours_per_day, data = batch_rows)
  }) %>%
  map(summary) %>%
  map_dbl("r.squared")
# Your turn to perform
#
# Up until this point the code has been provided for you to work on.
# Now it is time for you to apply your newfound skills.
# Please work through the wrangling tasks we just went through.
# Use the diamonds data and make the steps in long format (i.e. assigning each step to an object)
# and in short format (i.e. with the magrittr pipeline):
# Long format: every wrangling step is assigned back to my_diamonds.
my_diamonds <- diamonds
# select: carat and price
my_diamonds <- select(my_diamonds, carat, price)
my_diamonds
# filter: only where carat is > 0.5
my_diamonds <- filter(my_diamonds, carat > 0.5)
my_diamonds
# rename: rename price as cost
# (rename() states the intent directly; the column set and order are the
# same as with mutate(cost = price, .keep = "unused").)
my_diamonds <- rename(my_diamonds, cost = price)
my_diamonds
# mutate: create a variable with "expensive" if greater than mean of cost
# and "cheap" otherwise
my_diamonds <- mutate(
  my_diamonds,
  deal = ifelse(cost > mean(cost), "expensive", "cheap")
)
my_diamonds
# group_by: split into cheap and expensive
my_diamonds <- group_by(my_diamonds, deal)
# summarize: give some summary statistics of your choice
summarize(my_diamonds, mean(cost), mean(carat))

# Short format: the same steps chained with the magrittr pipe.
diamonds %>%
  select(carat, price) %>%
  filter(carat > 0.5) %>%
  rename(cost = price) %>%
  mutate(deal = ifelse(cost > mean(cost), "expensive", "cheap")) %>%
  group_by(deal) %>%
  summarize(mean(cost), mean(carat))
# The diamonds data ships with the ggplot2 package.
# It is available in your R environment once ggplot2 (or tidyverse) is loaded.
# Look at the help file with ?diamonds to learn more about it.