Ch_Combined.Rmd

---
title: "1. Data Visualization"
date: "2024-03-10"
bibliography: references.bib
output: rmdformats::downcute
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
getwd()

library(tidyverse)
library(palmerpenguins)
library(ggthemes)
library(lvplot)
library(ggrepel)
library(patchwork)
library(rvest)
library(skimr)
library(janitor)
library(here)
library(ggdark)
library(tidytuesdayR)
library(ggeasy)
library(nycflights13)
library(Lahman)
library(tidymodels)

library(tidylog)
# Route tidylog's messages through a custom printer that renders them in bold green.
# NOTE(review): an earlier comment said "cyan", but the code below uses green$bold - confirm the intended color.
library(crayon) 
# Note: this helper is named `crayon`, the same as the package it uses.
crayon <- function(x) cat(green$bold(x), sep = "\n")
options("tidylog.display" = list(crayon))
```

```{r 1.2.3 Creating a ggplot}
penguins
glimpse(penguins)
?penguins
```

```{r basic plot}
ggplot(
  data = penguins, # data = optional
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) + # mapping = optional, inside first () so passed globally
  geom_point(mapping = aes(color = species, shape = species)) + # define colors for just points
  geom_smooth(method = "lm") +
  labs(
    title = "Body mass and flipper length",
    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species",
    shape = "Species",
    caption = "Data come from the palmerpenguins package"
  ) +
  scale_color_colorblind()
```

```{r exercises}
dim(penguins) # 344 rows, 8 columns

?penguins # a number denoting bill depth (millimeters) <- thickness

ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm)
) +
  geom_point() + # appearas positive
  labs(
    title = "Bill length and bill depth",
    x = "Bill length (mm)",
    y = "Bill depth (mm)"
  )

ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = species)
) +
  geom_point() +
  labs(
    title = "Bill length and species",
    x = "Bill length (mm)",
    y = "Species"
  )

ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = species)
) +
  geom_boxplot() +
  labs(
    title = "Bill length and species",
    x = "Bill length (mm)",
    y = "Species"
  )

# Same boxplot as above, but na.rm = TRUE removes the rows with missing
# bill lengths silently instead of with a warning.
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = species)
) +
  geom_boxplot(na.rm = TRUE) + # ignore missing values
  labs(
    title = "Bill length and species",
    x = "Bill length (mm)",
    y = "Species"
  )

colSums(is.na(penguins)) # get missing values

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(mapping = aes(color = bill_depth_mm)) +
  geom_smooth()


ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = island)
) +
  geom_point() +
  geom_smooth(se = FALSE)

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point() +
  geom_smooth()

ggplot() +
  geom_point(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  ) +
  geom_smooth(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  )
```
Chapter 1
```{r 1.3 GGPlot Cells}
# verbose
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point()

# concise
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()

# future pipe
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
```

```{r 1.4 Visualizing distributions}
ggplot(penguins, aes(x = species)) +
  geom_bar()

# reorder bt frequency by change to factor
ggplot(penguins, aes(x = fct_infreq(species))) +
  geom_bar()

# 1.4.2 A numerical variable
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 200)

# play with bin width
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 20)
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 2000)

# denskity plot
ggplot(penguins, aes(x = body_mass_g)) +
  geom_density()
```

```{r 1.4.3 Exercises}
ggplot(penguins, aes(y = species)) + # horizatonal when y =
  geom_bar()

ggplot(penguins, aes(x = species)) +
  geom_bar(color = "red") # outline red

ggplot(penguins, aes(x = species)) +
  geom_bar(fill = "red") # fill in red

# binwidth width of each bar in terms of x units

ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.05)

ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.50)
```

```{r 1.5.1 Visualizing relationships}
# boxplot
ggplot(penguins, aes(x = species, y = body_mass_g)) +
  geom_boxplot()

# density plot, customized the thickness of the lines using the linewidth argument in order to make them stand out a bit more against the background.
ggplot(penguins, aes(x = body_mass_g, color = species)) +
  geom_density(linewidth = 0.75)

# alpha aesthetic to add transparency to the filled density curves.
ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
  geom_density(alpha = 0.5)

# 1.5.2 Two categorical variables

# difficlut to interpet, different bar siaes
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar()

#  relative frequency plot created by setting position = "fill" in the geom, is more useful for comparing species distributions across islands since it’s not affected by the unequal numbers of penguins across the islands
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")

# 1.5.3 Two numerical variables

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()

# 3 or more variables, butcluttered and difficult ot read
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = island))

# solutoin1 facets!
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  facet_wrap(~island) # seperate plots via island
```

```{r 1.5.3 exercises}
mpg
?mpg

ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty)
) +
  geom_point()

ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty, color = cty)
) +
  geom_point()

ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty, color = cty, shape = drv)
) +
  geom_point()

ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty, color = cty, shape = drv, linewidth = cty) # linewidth is ignored
) +
  geom_point()

ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) +
  geom_point(aes(color = species)) +
  facet_wrap(~species)

ggplot(
  data = penguins,
  mapping = aes(
    x = bill_length_mm, y = bill_depth_mm,
    color = species, shape = species
  )
) +
  geom_point() +
  labs(color = "species") # lower case to match var name

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill") # differences between islands
ggplot(penguins, aes(x = species, fill = island)) +
  geom_bar(position = "fill") # differences between species
```

```{r 1.6 Saving your plots}
# ggsave() does not create a missing output folder when the document is
# knitted (non-interactive), so make sure "plots/" exists first.
dir.create("plots", showWarnings = FALSE)

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
# Saves the most recently displayed plot; the file type is taken from the
# extension. NOTE(review): the svg device presumably needs the svglite
# package installed - confirm.
ggsave(filename = "plots/penguin-plot.svg")
```

```{r 1.6 exercise}
ggplot(mpg, aes(x = class)) +
  geom_bar()
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()
ggsave("plots/mpg-plot.pdf") # saves last run, remember to use f1 for help
```

```{r}
my_variable <- 10
my_variable # typo! shouod be my_variable

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm")

# option shift k for keyboard shortcuts

my_bar_plot <- ggplot(mpg, aes(x = class)) +
  geom_bar()
my_bar_plot
my_scatter_plot <- ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()
my_scatter_plot
ggsave(filename = "plots/mpg-plot.png", plot = my_bar_plot) # my bar plot is saved
```

## Ch 3

```{r}


#  Conflicts with filter and lag. If you want to use the base version of these functions after loading dplyr, you’ll need to use their full names: stats::filter() and stats::lag().
```

```{r explore data}
?flights
flights
glimpse(flights) # <int> is short for integer, <dbl> is short for double (aka real numbers), <chr> for character (aka strings), and <dttm> for date-time.
```

```{r 3.1.3 dplyr basics}
flights |>
  filter(dest == "IAH") |>
  group_by(year, month, day) |>
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  )

# find all flights that departed more than 120 minutes (two hours) late:
flights |>
  filter(dep_delay > 120)


# Flights that departed on January 1
flights |>
  filter(month == 1 & day == 1)

# Flights that departed in January or February
flights |>
  filter(month == 1 | month == 2)

# A shorter way to select flights that departed in January or February
flights |>
  filter(month %in% c(1, 2))

flights |>
  filter(month %in% c(11, 12))

feb <- flights |>
  filter(month == 2)
feb

flights |>
  arrange(year, month, day, dep_delay, dep_time)

flights |>
  arrange(desc(dep_delay))

# 3.2.4 distinct()
flights |>
  distinct() # remove any duplicate rows

# Find all unique origin and destination pairs
flights |>
  distinct(origin, dest)

# eep other columns when filtering for unique rows, you can use the .keep_all = TRUE option.
flights |>
  distinct(origin, dest, .keep_all = TRUE)

flights |>
  count(origin, dest, sort = TRUE)

flights |>
  count(year, month, sort = TRUE)
```

```{r 3.2.5 Exercises}
# Had an arrival delay of two or more hours
flights |>
  filter(arr_delay >= 120)
# Flew to Houston (IAH or HOU)
flights |>
  filter(dest %in% c("IAH", "HOU"))
# Were operated by United, American, or Delta
flights |>
  filter(carrier %in% c("UA", "AA", "DL"))
# Departed in summer (July, August, and September)
flights |>
  filter(month %in% c(7, 8, 9))
# Arrived more than two hours late but didn't leave late
# (dep_delay <= 0 also covers flights that left early, not just exactly on time)
flights |>
  filter(arr_delay > 120 & dep_delay <= 0)

flights |>
  arrange(desc(dep_delay))

flights |>
  arrange(dep_time)

# Fastest flights = highest average speed. dep_time and arr_time are clock
# readings in HHMM form, so subtracting them does not give a duration;
# rank by distance per minute of air_time instead.
flights |>
  arrange(desc(distance / air_time)) |>
  relocate(distance, air_time)

flights |>
  distinct(month, day) |>
  arrange(month, day) |>
  count() #  OR nrow()

flights |>
  arrange(desc(distance)) |>
  relocate(distance, origin, dest) # JFK to HNL

flights |>
  arrange(distance) |>
  relocate(distance, origin, dest) # EWR to PHL

# Order does not matter
```

```{r 3.3 Columns}
# mutate
flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  ) |>
  relocate(gain, speed)

# Put the new columns on the left-hand side directly from mutate():
# .before = 1 inserts them ahead of the first column. The leading dot is
# required - a plain `before = 1` would just create a column named `before`.
flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .before = 1
  ) # same column layout as mutate() followed by relocate(gain, speed)

# add after day variable, . is a sign that .before is an argument to the mutate function, not a variable name
flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .after = day
  )

# only keep affected variables
df_delay_gain <- flights |>
  mutate(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours,
    .keep = "used"
  )

df_delay_gain

# 3.3.2 select()

# selet columns by name
flights |>
  select(year, month, day, carrier)

# select all columns between variables
flights |>
  select(flight:dest)

# select all columns except variables
flights |>
  select(!year:day) # Historically this operation was done with - instead of !, so you’re likely to see that in the wild. These two operators serve the same purpose but with subtle differences in behavior. We recommend using ! because it reads as “not” and combines well with & and |.

# select all columns with characters
flights |>
  select(where(is.character))

# rename variables
flights |>
  select(tail_num = tailnum) # new name on left, old on right

# further oin relocation
flights |>
  relocate(year:dep_time, .after = time_hour)
flights |>
  relocate(starts_with("arr"), .before = dep_time)
```

```{r 3.3.5 Exercises}
flights |>
  select(dep_time, sched_dep_time, dep_delay) # dep_delay diff between dep_time and sched

flights |>
  select(dep_time:dep_delay)

flights |>
  select(starts_with("dep") | starts_with("arr"))

flights |>
  select(month, month, month)

variables <- c("year", "month", "day", "dep_delay", "arr_delay")

flights |>
  select(any_of(variables))

flights |>
  select(contains("TIME")) # by default, case ignored

flights |>
  rename(air_time_min = air_time) |>
  relocate(air_time_min, .before = 1)

flights |>
  select(tailnum)
# arrange(arr_delay) # because tailnum was selected, so arr_delay was booted
```

```{r 3.4  Pipe and Groups}
flights |>
  filter(dest == "IAH") |>
  mutate(speed = distance / air_time * 60) |>
  select(year:day, dep_time, carrier, flight, speed) |>
  arrange(desc(speed))

flights |>
  group_by(month)

# 3.5.2 summarize()
flights |>
  group_by(month) |>
  summarize(
    avg_delay = mean(dep_delay) # uh oh, missing data!
  )

flights |>
  group_by(month) |>
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE)
  )

flights |>
  group_by(month) |>
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    n = n() # number of rows in each group
  )

# 3.5.3 The slice_ functions

# df |> slice_head(n = 1) takes the first row from each group.
# df |> slice_tail(n = 1) takes the last row in each group.
# df |> slice_min(x, n = 1) takes the row with the smallest value of column x.
# df |> slice_max(x, n = 1) takes the row with the largest value of column x.
# df |> slice_sample(n = 1) takes one random row.

flights |>
  group_by(dest) |>
  slice_max(arr_delay, n = 1) |> # takes the row with the largest value of column arr_delay
  relocate(dest) # Note that there are 105 destinations but we get 108 rows here. What’s up? slice_min() and slice_max() keep tied values so n = 1 means give us all rows with the highest value. If you want exactly one row per group you can set with_ties = FALSE.

# with_ties = FALSE keeps only the first of any tied rows, so every
# destination contributes exactly one row.
flights |>
  group_by(dest) |>
  slice_max(arr_delay, n = 1, with_ties = FALSE) |> # no ties, only one row per group
  relocate(dest)

# This is similar to computing the max delay with summarize(), but you get the whole corresponding row (or rows if there’s a tie) instead of the single summary statistic.

daily <- flights |>
  group_by(year, month, day) # multiple group variables, each summary peels off the last group.
daily

daily_flights <- daily |>
  summarize(n = n()) # explains how one group was peeled off (day), now only 2 left

daily_flights <- daily |>
  summarize(
    n = n(),
    .groups = "drop_last" # surpress warning about peeling offf groups
  )

# summarize ungrouped data
daily |>
  ungroup() |>
  summarize( # only one row becuase ungrouped data is treated as one group
    avg_delay = mean(dep_delay, na.rm = TRUE),
    flights = n()
  )

# 3.5.6 .by

flights |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = month # new and experimental
  )

# group by multiple variables
flights |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = c(origin, dest) # .by works with all verbs and has the advantage that you don’t need to use the .groups argument to suppress the grouping message or ungroup() when you’re done.
  )
```

```{r 3.5.7 Exercises}
flights |>
  group_by(carrier, dest) |>
  summarize(n())

# Which carrier has the worst average departure delay?
flights |>
  group_by(carrier) |>
  summarize(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE)
  ) |>
  arrange(desc(avg_dep_delay)) # F9 (Frontier Airlines) has the worst delays

# Most delayed departure for each destination (exactly one row per dest)
flights |>
  group_by(dest) |>
  slice_max(dep_delay, with_ties = FALSE) |>
  relocate(dest, dep_delay)

# Average departure delay by scheduled hour of the day
df_dep_time <- flights |>
  group_by(hour) |>
  summarize(
    avg_delay_time = mean(dep_delay, na.rm = TRUE)
  )

ggplot(df_dep_time, aes(x = hour, y = avg_delay_time)) +
  geom_smooth()

# flights |>
#   group_by(dest)
#   slice_min(dep_delay, n = -1)

# count() is roughly group_by() + summarize(n = n());
# sort = TRUE lists the largest groups first
flights |>
  count(carrier, sort = TRUE)

df <- tibble(
  x = 1:5,
  y = c("a", "b", "a", "a", "b"),
  z = c("K", "K", "L", "L", "K")
)

df

# prediction: rows keep their original order, so y reads a, b, a, a, b

df |>
  group_by(y) # only adds grouping metadata on y; the rows themselves are unchanged

# prediction: rows are sorted by y, so y reads a, a, a, b, b

df |>
  arrange(y)

# two rows, mean of all a's, mean of all b's, two columns

df |>
  group_by(y) |>
  summarize(mean_x = mean(x))

# group by all combinatoins of y z, so a K, a L, b K
df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))

# same\?
df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x), .groups = "drop") # result is not grouped

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x)) # show y z and summary mnean_x

df |>
  group_by(y, z) |>
  mutate(mean_x = mean(x)) # create new column plus all others
```

```{r 3.6 Case study: aggregates and sample size}
batters <- Lahman::Batting |>
  group_by(playerID) |>
  summarize(
    performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    n = sum(AB, na.rm = TRUE)
  )
batters


batters |>
  filter(n > 100) |>
  ggplot(aes(x = n, y = performance)) +
  geom_point(alpha = 1 / 10) +
  geom_smooth(se = FALSE)

batters |>
  arrange(desc(performance))
```

## Ch 4

```{r libraries}
library(nycflights13)
```

```{r}
flights %>%
  filter(dest == "IAH") %>%
  group_by(year, month, day) %>%
  summarize(
    n = n(),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(n > 10)

flights %>%
  filter(
    carrier == "UA",
    dest %in% c("IAH", "HOU"),
    sched_dep_time > 900,
    sched_arr_time < 2000
  ) %>%
  group_by(flight) %>%
  summarize(
    delay = mean(arr_delay, na.rm = TRUE),
    cancelled = sum(is.na(arr_delay)),
    n = n()
  ) %>%
  filter(n > 10)
```

## Ch 5

```{r 5.2 }
billboard

billboard |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank"
  )

billboard |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank",
    values_drop_na = TRUE
  )

billboard_longer <- billboard |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank",
    values_drop_na = TRUE
  ) |>
  mutate(
    week = parse_number(week)
  )
billboard_longer

billboard_longer |>
  ggplot(aes(x = week, y = rank, group = track)) +
  geom_line(alpha = 0.25) +
  scale_y_reverse()

df <- tribble(
  ~id, ~bp1, ~bp2,
  "A", 100, 120,
  "B", 140, 115,
  "C", 120, 125
)

df |>
  pivot_longer(
    cols = bp1:bp2,
    names_to = "measurement",
    values_to = "value"
  )

?who2

who2 |>
  pivot_longer(
    cols = !(country:year),
    names_to = c("diagnosis", "gender", "age"),
    names_sep = "_",
    values_to = "count"
  )

who2 |>
  pivot_longer(
    cols = !(country:year),
    names_to = c("diagnosis", "gender", "age"),
    names_sep = "_",
    values_to = "count"
  )
```

```{r 5.3 Legthening Data}
df <- tribble(
  ~id, ~bp1, ~bp2,
  "A", 100, 120,
  "B", 140, 115,
  "C", 120, 125
)

df |> pivot_longer(
  cols = bp1:bp2,
  names_to = "measurement",
  values_to = "amount"
)
```

```{r data and variable names}
household
?household

# Column names are split at "_": the first piece names the output column
# (the ".value" sentinel) and the second piece is stored in `child`.
# values_to is not supplied because pivot_longer() ignores it whenever
# names_to contains ".value" - the value column names come from the data.
household |>
  pivot_longer(
    cols = !family,
    names_to = c(".value", "child"),
    names_sep = "_",
    values_drop_na = TRUE
  )
```

```{r 5.4 Widening data}
cms_patient_experience |>
  distinct(measure_cd, measure_title)

cms_patient_experience |>
  pivot_wider(
    id_cols = starts_with("org"),
    names_from = measure_cd,
    values_from = prf_rate
  )
cms_patient_experience
```

```{r 5.4.1 How pivot wider works}
df <- tribble(
  ~id, ~measurement, ~value,
  "A",        "bp1",    100,
  "B",        "bp1",    140,
  "B",        "bp2",    115,
  "A",        "bp2",    120,
  "A",        "bp3",    105
)

# One row per id; each distinct value of `measurement` becomes its own column.
# (No trailing comma after the last argument: it would pass an empty argument.)
df |>
  pivot_wider(
    id_cols = id, # optional in this case
    names_from = measurement,
    values_from = value
  )

df |>
  distinct(measurement) |>
  pull() # extra column

df |>
  select(-measurement, -value) |> # all but measurement, value
  distinct() # A B

df |>
  select(-measurement, -value) |>
  distinct() |>
  mutate(x = NA, y = NA, z = NA)

df <- tribble(
  ~id, ~measurement, ~value,
  "A",        "bp1",    100,
  "A",        "bp1",    102,
  "A",        "bp2",    120,
  "B",        "bp1",    140,
  "B",        "bp2",    115
)

df |>
  summarise(
    n = n(),
    .by = c(id, measurement)
  ) |>
  filter(n > 1)

df |>
  # check to see if there are repeat combinations of ID and measurement and, if so, remove the repeat
  distinct(id, measurement, .keep_all = TRUE) |>
  pivot_wider(
    id_cols = id,
    names_from = measurement,
    values_from = value
  )
```

## Ch 6

```{r 6.1 shortcuts}
library(dplyr)
library(nycflights13)

not_cancelled <- flights |>
  filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled |>
  group_by(year, month, day) |>
  summarize(mean = mean(dep_delay))
```

```{r 6.2.2 Where does your analysis live?}
getwd()
```

```{r 6.2.4 Relative and absolute paths}
# always use relative paths! 👀
```

```{r 6.3 exercises}
# Edit a DF interactively!
library(DataEditR)

# mtcars_new <- data_edit(mtcars,
#   save_as = "mtcars_new.csv"
# )

library(ViewPipeSteps)
diamonds
# View pipe steps!
# diamonds %>%
#   select(carat, cut, color, clarity, price) %>%
#   group_by(color) %>%
#   summarise(n = n(), price = mean(price)) %>%
#   arrange(desc(color)) %>%
#   print_pipe_steps()
```

## Ch 7

```{r 7.2 Reading data from a file}
students <- read_csv("https://pos.it/r4ds-students-csv")
glimpse(students)

students <- read_csv("https://pos.it/r4ds-students-csv", na = c("N/A", "")) # capture both empty and N/A strings
glimpse(students)

# columns names annoying have backticks, remove them
students <- rename(
  students,
  student_id = `Student ID`,
  full_name = `Full Name`
)
glimpse(students)

# alterntive automatic method
library(janitor)
students <- read_csv("https://pos.it/r4ds-students-csv", na = c("N/A", ""))
glimpse(students)
students |> janitor::clean_names()

# change cats to factors, fix age
students <- students |>
  janitor::clean_names() |>
  mutate(
    meal_plan = factor(meal_plan), # changes from chr to fct
    age = parse_number(if_else(age == "five", "5", age)) # if age is "five", turn to "5" otherwise leave it along, parse all age strings as numbers
  )
glimpse(students)
```

```{r 7.2.2 Other arguments}
read_csv(
  "a,b,c
  1,2,3
  4,5,6"
)

# skip first two rows of csv.
read_csv(
  "The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3",
)

read_csv(
  "The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3",
  skip = 2
)

# ignore specific lines
read_csv(
  "/ A comment I want to skip
  x,y,z
  1,2,3",
  comment = "/"
)

# no col names
read_csv(
  "1,2,3
  4,5,6",
  col_names = FALSE # creates col names
)

# custom col names
read_csv(
  "1,2,3
  4,5,6",
  col_names = c("x", "y", "z")
)
```

```{r 7.2.3 Other file types}
# other readr readers: read_csv2() semicolon-separated, read_tsv() tab-separated, read_delim() any delimiter (guessed if not given), read_fwf() fixed width, read_table() whitespace-separated columns, read_log() Apache-style logs
```

```{r 7.2.4 Exercises}
# delim
# na, trim_ws, etc.
#  read_fwf() fwf_empty() - Guesses based on the positions of empty columns. fwf_widths() - Supply the widths of the columns. fwf_positions() - Supply paired vectors of start and end positions. fwf_cols() - Supply named arguments of paired start and end positions or column widths.

read_csv("x,y\n1,'a,b'", quote = "'")

read_csv("a,b,c\n1,2,3\n4,5,6") # was missing a column

annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)

annoying

getOne <- annoying |>
  select("1")

# Extracting the variable labeled as '1'
annoying |>
  pull(`1`)

# scatterplot one vs. two
annoying |>
  ggplot(aes(x = `2`, y = `1`)) +
  geom_point()

# Creating a new column called 3, which is 2 divided by 1.
annoying <- annoying |>
  mutate(
    `3` = `2` / `1`
  )

# Renaming the columns to one, two, and three
annoying |>
  rename("one" = `1`, "two" = `2`, "three" = `3`)
```

```{r 7.3 Controlling column types}
read_csv("
  logical,numeric,date,string
  TRUE,1,2021-01-15,abc
  false,4.5,2021-02-15,def
  T,Inf,2021-02-16,ghi
")

another_csv <- "
x,y,z
1,2,3"

read_csv(
  another_csv,
  col_types = cols(.default = col_character())
)

sales_files <- c(
  "https://pos.it/r4ds-01-sales",
  "https://pos.it/r4ds-02-sales",
  "https://pos.it/r4ds-03-sales"
)
read_csv(sales_files, id = "file") # id argument adds a new column called file to the resulting data frame that identifies the file the data come from.

sales_files <- list.files("data", pattern = "sales\\.csv$", full.names = TRUE)
sales_files
```

```{r 7.5 Writing to a file}
students
write_csv(students, "data/students-2.csv")
read_csv("data/students-2.csv") # note that we lose col type for meal_plan

# custom R's binary RDS
write_rds(students, "data/students.rds")
read_rds("data/students.rds")

# arrow binary for many languages
library(arrow)
students
# write_parquet(students, "students.parquet")
# read_parquet("students.parquet")
```

```{r 7.6 Data entry}
# by column which is a bit weird
tibble(
  x = c(1, 2, 5),
  y = c("h", "m", "g"),
  z = c(0.08, 0.83, 0.60)
)

# by row which is easier
tribble(
  ~x, ~y, ~z,
  1, "h", 0.08,
  2, "m", 0.83,
  5, "g", 0.60
)
```

## Ch 8

```{r}
y <- 1:4
mean(y)

dput(mtcars)
```

## Ch 9

```{r 9.2 Aesthetic mappings}
mpg
mpg |>
  ggplot(aes(x = displ, y = hwy, color = class)) +
  geom_point()

mpg |>
  ggplot(aes(x = displ, y = hwy, size = class)) +
  geom_point()

mpg |>
  ggplot(aes(x = displ, y = hwy, alpha = class)) +
  geom_point()

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "blue")
```

```{r 9.2.1 Exercises}
# Create a scatterplot of hwy vs. displ where the points are pink filled in triangles.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "pink", shape = 17)

# Why did the following code not result in a plot with blue points?
# Inside aes(), "blue" is treated as a data value to be mapped through the color scale, not as a literal color.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, color = "blue")) # color is mapped here, not set

# Fix: set the color outside of aes() so it is applied as a fixed value.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), color = "blue")

# what does the stroke aes do ?

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), stroke = 1) # adjust size, thickness

mpg |> ggplot(aes(x = displ, y = hwy, color = displ < 5)) +
  geom_point() # true / false with diff colors
```

```{r 9.3 Geometric objects}
ggplot(mpg, aes(x = displ, y = hwy, shape = drv)) +
  geom_point()

ggplot(mpg, aes(x = displ, y = hwy, linetype = drv)) +
  geom_smooth()

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(aes(linetype = drv))

# Left
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth()

# Middle
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(group = drv))

# Right
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(color = drv), show.legend = FALSE)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_point(
    data = mpg |> filter(class == "2seater"),
    color = "red"
  ) +
  geom_point(
    data = mpg |> filter(class == "2seater"),
    shape = "circle open", size = 3, color = "red"
  )

# Left
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(binwidth = 2)

# Middle
ggplot(mpg, aes(x = hwy)) +
  geom_density()

# Right
ggplot(mpg, aes(x = hwy)) +
  geom_boxplot()

library(ggridges)

ggplot(mpg, aes(x = hwy, y = drv, fill = drv, color = drv)) +
  geom_density_ridges(alpha = 0.5, show.legend = FALSE)
```

```{r 9.3.1 Exercises}
# What geom would you use to draw a line chart? A boxplot? A histogram? An area chart?
# line chart - geom_line()
# boxplot    - geom_boxplot()
# histogram  - geom_histogram()
# area chart - geom_area()

# Earlier in this chapter we used show.legend without explaining it:

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(color = drv), show.legend = FALSE)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(color = drv), show.legend = FALSE, se = FALSE)
# se = Display confidence interval around smooth? (TRUE by default, see level to control.)
# show.legend = FALSE hides the legend for this layer, leaving more room for the plot

# Recreate the R code necessary to generate the following graphs. Note that wherever a categorical variable is used in the plot, it’s drv

# one smooth over all points
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(stroke = 2) +
  geom_smooth(se = FALSE)

# one smooth per drv, all drawn in the same color
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(stroke = 2) +
  geom_smooth(aes(group = drv), se = FALSE)

# color mapped globally, so both points and smooths are colored by drv
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point(stroke = 2) +
  geom_smooth(se = FALSE, show.legend = TRUE)

# points colored by drv, smooths distinguished by linetype
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = drv), stroke = 2) +
  geom_smooth(aes(linetype = drv), se = FALSE, show.legend = TRUE)

# large white points underneath give each colored point a white halo;
# stroke is a fixed value, so it is set outside aes()
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(shape = 16, size = 7, color = "white") +
  geom_point(aes(color = drv), stroke = 2)
```

```{r 9.4 Facets}
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cyl)


ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl, scales = "free_y") # different scales in columns, helps visualize better
```

```{r 9.4.1 Exercises}
# What happens if you facet on a continuous variable?

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cty) # with a continuous variable you get one panel per unique value

# the number of rows returned here is the number of panels drawn above
mpg |> count(cty)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

# What do the empty cells in the plot above with facet_grid(drv ~ cyl) mean? Run the following code. How do they relate to the resulting plot?

ggplot(mpg) +
  geom_point(aes(x = drv, y = cyl))

# Certain drv/cyl combinations do not exist in the data, so their cells are empty.
# The filter below checks one of them (rear-wheel drive with 4 cylinders) and should return zero rows.

mpg %>%
  filter(drv == "r" & cyl == 4)


# What plots does the following code make? What does . do?

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(. ~ cyl)

# Take the first faceted plot in this section:

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~class, nrow = 2)

# What are the advantages to using faceting instead of the color aesthetic? What are the disadvantages? How might the balance change if you had a larger dataset?

# facets allow for different scales (y free) and alllow for more breathing spaces. Larger data sets might mean too many grids though.

# Read ?facet_wrap. What does nrow do? What does ncol do? What other options control the layout of the individual panels? Why doesn’t facet_grid() have nrow and ncol arguments?

?facet_wrap
# nrow, ncol	 Number of rows and columns.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~class, nrow = 4) # can make more vertical orientation, for instance, grid won't have these options as they are already set

# Which of the following plots makes it easier to compare engine size (displ) across cars with different drive trains? What does this say about when to place a faceting variable across rows or columns?

ggplot(mpg, aes(x = displ)) +
  geom_histogram() +
  facet_grid(drv ~ .) # by rows (much better)

ggplot(mpg, aes(x = displ)) +
  geom_histogram() +
  facet_grid(. ~ drv) # columns compete with x=displ

# Recreate the following plot using facet_wrap() instead of facet_grid(). How do the positions of the facet labels change?

# original: one row of panels per drv, strip labels on the right-hand side
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)

# facet_wrap() version: ncol = 1 keeps the panels stacked vertically;
# the strip labels move from the right-hand side to the top of each panel
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~drv, ncol = 1)

# staying with facet_grid(), switch = "y" moves the row labels to the left-hand side instead
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(drv ~ ., switch = "y")
```

```{r 9.5 Statistical transformations}
ggplot(diamonds, aes(x = cut)) +
  geom_bar()

?geom_bar

diamonds |>
  count(cut) |>
  ggplot(aes(x = cut, y = n)) +
  geom_bar(stat = "identity")

ggplot(diamonds, aes(x = cut, y = after_stat(prop), group = 1)) +
  geom_bar()

ggplot(diamonds) +
  stat_summary(
    aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )
```

```{r 9.5.1 Exercises}
# What is the default geom associated with stat_summary()? How could you rewrite the previous plot to use that geom function instead of the stat function?

# Uses "geom = pointrange" by default

diamonds |>
  group_by(cut) |>
  summarize(
    lower = min(depth),
    upper = max(depth),
    midpoint = median(depth)
  ) |>
  ggplot(aes(x = cut, y = midpoint)) +
  geom_pointrange(aes(ymin = lower, ymax = upper))

# What does geom_col() do? How is it different from geom_bar()?

# geom_col() takes bar heights from the y values in the data, while geom_bar() counts the rows in each x group

# bar height = number of diamonds per cut
ggplot(diamonds, aes(x = cut)) +
  geom_bar()

# bar height comes from the depth values supplied as y (stacked within each cut)
ggplot(diamonds, aes(x = cut, y = depth)) +
  geom_col()
# Most geoms and stats come in pairs that are almost always used in concert. Make a list of all the pairs. What do they have in common? (Hint: Read through the documentation.)

# geom	                stat
# geom_bar()	          stat_count()
# geom_bin2d()	        stat_bin_2d()
# geom_boxplot()        stat_boxplot()
# geom_contour_filled()	stat_contour_filled()
# geom_contour()	      stat_contour()
# geom_count()	        stat_sum()
# geom_density_2d()	    stat_density_2d()
# geom_density()	      stat_density()
# geom_dotplot()	      stat_bindot()
# geom_function()	      stat_function()
# geom_sf()	            stat_sf()
# geom_sf()	            stat_sf()
# geom_smooth()	        stat_smooth()
# geom_violin()	        stat_ydensity()
# geom_hex()	          stat_bin_hex()
# geom_qq_line()	      stat_qq_line()
# geom_qq()	            stat_qq()
# geom_quantile()	      stat_quantile()

# What variables does stat_smooth() compute? What arguments control its behavior?
?stat_smooth
# predicted value, lower CI from mean, upper CI from mean, and SE

# In our proportion bar chart, we needed to set group = 1. Why? In other words, what is the problem with these two graphs?

# In the first pair of plots, we see that setting group = 1 results in the marginal proportions of cuts being plotted. In the second pair of plots, setting group = color results in the proportions of colors within each cut being plotted.
# one variable
ggplot(diamonds, aes(x = cut, y = after_stat(prop))) +
  geom_bar()
ggplot(diamonds, aes(x = cut, y = after_stat(prop), group = 1)) +
  geom_bar() # after_stat(prop) represents the proportion of each category. The group = 1 argument is used to ensure the proportion are grouped

# two variables
ggplot(diamonds, aes(x = cut, fill = color, y = after_stat(prop))) +
  geom_bar()
ggplot(diamonds, aes(x = cut, fill = color, y = after_stat(prop), group = color)) +
  geom_bar()
```

```{r 9.6 Position adjustments}
# color = drv only tints the outline of each bar
mpg |>
  ggplot(aes(x = drv, color = drv)) +
  geom_bar()

# fill = drv paints the inside of each bar
mpg |>
  ggplot(aes(x = drv, fill = drv)) +
  geom_bar()

# Filling by a variable other than x (class) stacks the bars automatically
mpg |>
  ggplot(aes(x = drv, fill = class)) +
  geom_bar()

# position = "identity": every bar starts at zero, so the bars overlap and a
# low alpha is needed to see them all. Identity is more useful for 2d geoms,
# like points, where it is the default.
mpg |>
  ggplot(aes(x = drv, fill = class)) +
  geom_bar(position = "identity", alpha = 0.2)

# Alternative to alpha: no fill at all (fill = NA), outlines only
mpg |>
  ggplot(aes(x = drv, color = class)) +
  geom_bar(position = "identity", fill = NA)

# position = "fill" stacks like the default but stretches every stack to the
# same height, which makes proportions comparable across groups
mpg |>
  ggplot(aes(x = drv, fill = class)) +
  geom_bar(position = "fill")

# position = "dodge" puts overlapping bars side by side, which makes
# individual values comparable
mpg |>
  ggplot(aes(x = drv, fill = class)) +
  geom_bar(position = "dodge")

# Overplotting (many observations on the same spot): position = "jitter" adds
# a little random noise to every point so coincident points spread out
mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(position = "jitter")

# geom_jitter() is the shorthand for geom_point(position = "jitter")
mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_jitter()
```

```{r 9.6.1 Exercises}
# What is the problem with the following plot? How could you improve it?

mpg |>
  ggplot(aes(x = cty, y = hwy)) +
  geom_point()

# Many cars share the same (cty, hwy) pair, so points hide each other;
# jittering reveals them
mpg |>
  ggplot(aes(x = cty, y = hwy)) +
  geom_jitter()

# What, if anything, is the difference between the two plots? Why?

mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point()
mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(position = "identity")
# No difference: "identity" is already the default position for geom_point()

# What parameters to geom_jitter() control the amount of jittering?
?geom_jitter # width and height, both defaulting to 0.4, i.e. the jittered values occupy 80% (twice 0.4) of the implied bins. Categorical data is aligned on the integers, so a width or height of 0.5 spreads the data so far that the categories can no longer be told apart.

mpg |>
  ggplot(aes(x = cty, y = hwy)) +
  geom_jitter(width = 0.8, height = 0.8)

mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_jitter()
mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_count() # point size encodes how many observations overlap

# What’s the default position adjustment for geom_boxplot()? Create a visualization of the mpg dataset that demonstrates it.
?geom_boxplot # position = "dodge2"

# Dodging only shows when several boxes share one x position, so x must be
# discrete and a second discrete variable (fill = class) must split each x
# value into several boxes.
ggplot(mpg, aes(x = drv, y = hwy, fill = class)) +
  geom_boxplot()

# Spelling out the default gives the same plot.
ggplot(mpg, aes(x = drv, y = hwy, fill = class)) +
  geom_boxplot(position = "dodge2")

# For contrast: with position = "identity" the boxes for one drv are drawn on
# top of each other instead of side by side.
ggplot(mpg, aes(x = drv, y = hwy, fill = class)) +
  geom_boxplot(position = "identity")
```

```{r 9.7 Coordinate systems}
# Outline of New Zealand as polygon coordinates
nz <- map_data("nz")

# Plain cartesian coordinates
nz |>
  ggplot(aes(long, lat, group = group)) +
  geom_polygon(color = "black", fill = "white")

# coord_quickmap() sets the aspect ratio correctly for geographic maps
nz |>
  ggplot(aes(long, lat, group = group)) +
  geom_polygon(color = "black", fill = "white") +
  coord_quickmap()

# Square bar chart of clarity, reused below with different coordinate systems
bar <- ggplot(diamonds, aes(x = clarity, fill = clarity)) +
  geom_bar(width = 1, show.legend = FALSE) +
  theme(aspect.ratio = 1)

bar
bar + coord_flip() # x and y swapped
bar + coord_polar() # polar coordinates: Coxcomb chart
```

```{r}
# Turn a stacked bar chart into a pie chart using coord_polar().
ggplot(mpg, aes(x = "", fill = class)) + # a single stacked bar
  geom_bar() +
  coord_polar(theta = "y") # pie chart; theta is the variable mapped to the angle (x or y), default = x

# What’s the difference between coord_quickmap() and coord_map()?
?coord_quickmap # quickmap: approximate projection, good for countries near the equator; map: more computation, as no straight lines

# What does the following plot tell you about the relationship between city and highway mpg? Why is coord_fixed() important? What does geom_abline() do?

mpg |>
  ggplot(aes(x = cty, y = hwy)) +
  geom_point() +
  geom_abline() + # diagonal reference line where highway = city mileage; every point lies above it, so hwy mileage is always higher than city
  coord_fixed() # one unit on the y axis has the same length as one unit on the x axis
```

```{r 9.8 The layered grammar of graphics}
# ggplot(data = <DATA>) +
#   <GEOM_FUNCTION>(
#      mapping = aes(<MAPPINGS>),
#      stat = <STAT>,
#      position = <POSITION>
#   ) +
#   <COORDINATE_FUNCTION> +
#   <FACET_FUNCTION>
```

## Ch. 10: Exploratory data analysis

```{r 10.3 variations}
# Distribution of carat in wide bins
diamonds |>
  ggplot(aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

# Keep only diamonds under 3 carats and look again with very narrow bins
smaller <- filter(diamonds, carat < 3)

smaller |>
  ggplot(aes(x = carat)) +
  geom_histogram(binwidth = 0.01)

glimpse(smaller)

# Distribution of y: the unusual values are invisible at full scale
diamonds |>
  ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.5)

# Zoom in on small counts with coord_cartesian(); it also takes xlim for
# zooming the x-axis. ggplot2's xlim() and ylim() functions work slightly
# differently: they throw away the data outside the limits.
diamonds |>
  ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Pull out the rows with implausible y values, smallest first
unusual <- diamonds |>
  filter(y < 3 | y > 20) |>
  select(price, x, y, z) |>
  arrange(y)
unusual
```

```{r 10.3.3 Exercises}
# Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.

# Same binwidth for all three so the histograms are directly comparable.
ggplot(diamonds, aes(x = x)) +
  geom_histogram(binwidth = 0.5)
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)
ggplot(diamonds, aes(x = z)) +
  geom_histogram(binwidth = 0.5)

summary(diamonds$x) # max 10.7
summary(diamonds$y) # max 58.9
summary(diamonds$z) # max 31.8

# Per ?diamonds: x = length, y = width, z = depth (all in mm). z is typically
# the smallest of the three, i.e. it appears to be the vertical dimension.

# Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)

ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3)


# How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?

# One table with both counts: 0.99 carat -> 23, 1 carat -> 1558.
# Presumably weights just under 1 carat get rounded up, since a "1 carat"
# diamond sells better.
diamonds |>
  filter(carat %in% c(0.99, 1)) |>
  count(carat)

# Compare and contrast coord_cartesian() vs. xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?

# Un-zoomed baseline
ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3)

# xlim()/ylim() drop everything outside the limits before drawing, so bars
# that would extend beyond a limit disappear entirely.
ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3) +
  xlim(0, 10000) +
  ylim(0, 100)

# coord_cartesian() only zooms the view: all data is still binned and drawn,
# bars taller than the limit are simply cut off at the edge.
ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3) +
  coord_cartesian(xlim = c(0, 10000), ylim = c(0, 100))
```

```{r 10.4 Unusual values}
# Option 1 (not recommended): drop the rows with strange values entirely
diamonds2 <- diamonds |>
  filter(y >= 3, y <= 20)

# Option 2 (better): keep the rows but replace the strange values with NA
diamonds2 <- diamonds |>
  mutate(y = if_else(between(y, 3, 20), y, NA))

# ggplot2 warns that the rows with missing values were removed
diamonds2 |>
  ggplot(aes(x = x, y = y)) +
  geom_point()

# na.rm = TRUE suppresses that warning
diamonds2 |>
  ggplot(aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

# A missing dep_time means the flight was cancelled. Convert the scheduled
# departure time from HHMM to decimal hours and compare the two groups.
nycflights13::flights |>
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) |>
  ggplot(aes(x = sched_dep_time, color = cancelled)) +
  geom_freqpoly(binwidth = 0.25)
# However this plot isn’t great because there are many more non-cancelled flights than cancelled flights.
```

```{r 10.4.1 Exercises}
# What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference in how missing values are handled in histograms and bar charts?

# Histogram: the rows with a missing y are removed (with a warning), because
# an NA cannot be assigned to a numeric bin.
diamonds2 |> ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.5)

# Bar chart on the same continuous variable: NA has no position on a
# continuous axis, so it does not show up here either.
diamonds2 |> ggplot(aes(x = y)) +
  geom_bar()

# Bar chart on a categorical variable: NA is treated as one more category and
# gets its own bar. Every 10th cut is blanked out to demonstrate this.
diamonds |>
  mutate(
    cut = if_else(row_number() %% 10 == 0, NA_character_, as.character(cut))
  ) |>
  ggplot(aes(x = cut)) +
  geom_bar()

# What does na.rm = TRUE do in mean() and sum()?
diamonds2 |> summarise(
  ymean = mean(y) # NAs not removed, so the result is NA
)

diamonds2 |> summarise(
  ymean = mean(y, na.rm = TRUE) # NAs dropped before averaging
)

diamonds2 |> summarise(
  ysum = sum(y) # NAs not removed, so the result is NA
)

diamonds2 |> summarise(
  ysum = sum(y, na.rm = TRUE) # NAs dropped before summing
)

# Recreate the frequency plot of scheduled_dep_time colored by whether the flight was cancelled or not. Also facet by the cancelled variable. Experiment with different values of the scales variable in the faceting function to mitigate the effect of more non-cancelled flights than cancelled flights.

# sched_dep_time is stored as HHMM, so it is converted to decimal hours first;
# binwidth = 1 / 4 then means 15-minute bins.
nycflights13::flights |>
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + (sched_min / 60)
  ) |>
  ggplot(aes(x = sched_dep_time)) +
  geom_freqpoly(aes(color = cancelled), binwidth = 1 / 4) +
  facet_wrap(~cancelled, scales = "free_y") # each facet gets its own y range
```

```{r 10.5 Covariation}
# Raw counts: the default appearance of geom_freqpoly() is not that useful
# here because the height, determined by the overall count, differs so much
# across cuts that the shapes of the distributions are hard to compare.
diamonds |>
  ggplot(aes(x = price, color = cut)) +
  geom_freqpoly(linewidth = 0.75, binwidth = 500)

# Density instead: the count standardized so that the area under each
# frequency polygon is one. density is not a variable in diamonds, so it is
# computed by the stat and accessed with after_stat().
diamonds |>
  ggplot(aes(x = price, y = after_stat(density), color = cut)) +
  geom_freqpoly(linewidth = 0.75, binwidth = 500)

# Side-by-side boxplots are a visually simpler way to explore the same relationship
diamonds |>
  ggplot(aes(x = cut, y = price)) +
  geom_boxplot()

# Classes in their default (alphabetical) order: quite scattered
mpg |>
  ggplot(aes(x = class, y = hwy)) +
  geom_boxplot()

# Classes reordered by the median value of hwy
mpg |>
  ggplot(aes(x = fct_reorder(class, hwy, median), y = hwy)) +
  geom_boxplot()

# Flipped, to help with long class names
mpg |>
  ggplot(aes(x = fct_reorder(class, hwy, median), y = hwy)) +
  geom_boxplot() +
  coord_flip()
```

```{r 10.5.1.1 Exercises}
# Use what you’ve learned to improve the visualization of the departure times of cancelled vs. non-cancelled flights.

# sched_dep_time is stored as HHMM (e.g. 1359 is followed by 1400), so binning
# the raw number leaves artificial gaps. Convert to decimal hours first, then
# plot densities so the two very differently sized groups are comparable.
nycflights13::flights |>
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + (sched_min / 60)
  ) |>
  ggplot(aes(x = sched_dep_time, y = after_stat(density))) +
  geom_freqpoly(aes(color = cancelled), binwidth = 1 / 4)

# Based on EDA, what variable in the diamonds dataset appears to be most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive?

# TO DO

# Instead of exchanging the x and y variables, add coord_flip() as a new layer to the vertical boxplot to create a horizontal one. How does this compare to exchanging the variables?

# No visible difference between the two approaches
ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) +
  geom_boxplot() +
  coord_flip()

ggplot(mpg, aes(y = fct_reorder(class, hwy, median), x = hwy)) +
  geom_boxplot()

# One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs. cut. What do you learn? How do you interpret the plots?

diamonds |> ggplot(aes(x = cut, y = price)) +
  geom_lv()

# Useful for larger datasets: shows many more quantiles than a boxplot.

# Create a visualization of diamond prices vs. a categorical variable from the diamonds dataset using geom_violin(), then a faceted geom_histogram(), then a colored geom_freqpoly(), and then a colored geom_density(). Compare and contrast the four plots. What are the pros and cons of each method of visualizing the distribution of a numerical variable based on the levels of a categorical variable?

# Violin: no overlaps, but the shapes look alike above a price of about 5000
diamonds |> ggplot(aes(x = color, y = price)) +
  geom_violin()

# Faceted histogram: no overlaps, but each panel is quite small
diamonds |>
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 500, show.legend = FALSE) +
  facet_wrap(~color)

# Frequency polygon: not too bad, gets hard to read when counts are similar
diamonds |> ggplot(aes(x = price, color = color)) +
  geom_freqpoly(binwidth = 500)

# Density: curves overlap, so a semi-transparent fill keeps all of them
# visible; differences in shape are easier to see than with counts
diamonds |> ggplot(aes(x = price, fill = color)) +
  geom_density(alpha = 0.4)

# If you have a small dataset, it’s sometimes useful to use geom_jitter() to avoid overplotting to more easily see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does.

# See https://github.com/eclarke/ggbeeswarm
# geom_beeswarm() - offsets points just enough that they do not overlap (a beeswarm plot)
# geom_quasirandom() - similar to geom_jitter(), but reduces overplotting using a van der Corput sequence or Tukey texturing
```

```{r 10.5.2 Two categorical variables}
# Count the observations for each combination of levels of two categorical
# variables. The built-in geom_count() does this and sizes the points by count:
diamonds |>
  ggplot(aes(x = cut, y = color)) +
  geom_count()

# Alternative: compute the counts with dplyr, largest first
arrange(count(diamonds, color, cut), desc(n))

# ... and draw them as a heatmap
ggplot(count(diamonds, color, cut), aes(x = color, y = cut, fill = n)) +
  geom_tile()
```

```{r 10.5.2.1 Exercises}
# 10.5.2.1 Exercises
# How could you rescale the count dataset above to more clearly show the distribution of cut within color, or color within cut?

# Share of each cut within a color (sums to 1 per color)
diamonds |>
  count(color, cut) |>
  group_by(color) |>
  mutate(percent_cut = n / sum(n)) |>
  ungroup()

# Share of each color within a cut (sums to 1 per cut)
diamonds |>
  count(color, cut) |>
  group_by(cut) |>
  mutate(percent_color = n / sum(n)) |>
  ungroup()


# What different data insights do you get with a segmented bar chart if color is mapped to the x aesthetic and cut is mapped to the fill aesthetic? Calculate the counts that fall into each of the segments.

diamonds |> ggplot(aes(x = color, fill = cut)) +
  geom_bar()

# Same chart from explicit counts, with each segment labelled by its count
diamonds_counts <- diamonds |>
  count(color, cut)

ggplot(diamonds_counts, aes(x = color, y = n, fill = cut)) +
  geom_col() +
  geom_text(aes(label = n), position = position_stack(vjust = 0.2))

# Use geom_tile() together with dplyr to explore how average flight departure delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?

# Improvements applied here: destinations ordered by delay instead of
# alphabetically, months shown by name in calendar order, a single-hue
# gradient, and small axis text so the many destinations fit.
nycflights13::flights |>
  group_by(month, dest) |>
  # .groups = "drop" returns an ungrouped table, ready for reorder()
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") |>
  mutate(dest = reorder(dest, dep_delay)) |>
  ggplot(aes(
    x = factor(month.name[month], levels = month.name),
    y = dest,
    fill = dep_delay
  )) +
  geom_tile() +
  labs(x = "Month", y = "Destination", fill = "Departure Delay") +
  scale_fill_gradient(low = "white", high = "red") +
  theme(
    axis.text.y = element_text(size = 4),
    axis.text.x = element_text(size = 7),
    legend.position = "bottom"
  )
```

```{r 10.5.3 Two numerical variables}
# Plain scatterplot: heavy overplotting
smaller |>
  ggplot(aes(x = carat, y = price)) +
  geom_point()

# Transparency via alpha makes dense regions stand out
smaller |>
  ggplot(aes(x = carat, y = price)) +
  geom_point(alpha = 0.01)

# Rectangular 2d bins
smaller |>
  ggplot(aes(x = carat, y = price)) +
  geom_bin2d()

# Hexagonal 2d bins
smaller |>
  ggplot(aes(x = carat, y = price)) +
  geom_hex()

# Bin carat into 0.1-wide groups and draw one boxplot per bin
smaller |>
  ggplot(aes(x = carat, y = price, group = cut_width(carat, 0.1))) +
  geom_boxplot()
```

```{r 10.5.3.1 Exercises}
# Instead of summarizing the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs. cut_number()? How does that impact a visualization of the 2d distribution of carat and price?

# Visualize the distribution of carat, partitioned by price.

# How does the price distribution of very large diamonds compare to small diamonds? Is it as you expect, or does it surprise you?

# Combine two of the techniques you’ve learned to visualize the combined distribution of cut, carat, and price.

# Two dimensional plots reveal outliers that are not visible in one dimensional plots. For example, some points in the following plot have an unusual combination of x and y values, which makes the points outliers even though their x and y values appear normal when examined separately. Why is a scatterplot a better display than a binned plot for this case?

ggplot(filter(diamonds, x >= 4), aes(x = x, y = y)) +
  geom_point() +
  coord_cartesian(ylim = c(4, 11), xlim = c(4, 11))

# Instead of creating boxes of equal width with cut_width(), we could create boxes that contain roughly equal number of points with cut_number(). What are the advantages and disadvantages of this approach?

smaller |>
  ggplot(aes(x = carat, y = price, group = cut_number(carat, 20))) +
  geom_boxplot()
```

```{r patterns}
# Add log-transformed price and carat columns to diamonds
diamonds <- mutate(
  diamonds,
  log_price = log(price),
  log_carat = log(carat)
)

# Linear model of log(price) on log(carat)
diamonds_fit <- fit(linear_reg(), log_price ~ log_carat, data = diamonds)

# Attach the model output to the data and exponentiate the residuals to take
# them back from the log scale
diamonds_aug <- diamonds_fit |>
  augment(new_data = diamonds) |>
  mutate(.resid = exp(.resid))

# Residuals against carat
diamonds_aug |>
  ggplot(aes(x = carat, y = .resid)) +
  geom_point()

# Residuals against cut
diamonds_aug |>
  ggplot(aes(x = cut, y = .resid)) +
  geom_boxplot()
```

## Chapter 11

```{r labels}
# labs() sets axis titles, legend title, title, subtitle and caption in one call
mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(se = FALSE) +
  labs(
    title = "Fuel efficiency generally decreases with engine size",
    subtitle = "Two seaters (sports cars) are an exception because of their light weight",
    caption = "Data from fueleconomy.gov",
    x = "Engine displacement (L)",
    y = "Highway fuel economy (mpg)",
    color = "Car type"
  )

# Small example data: y is the running sum of x squared
df <- tibble(
  x = 1:10,
  y = cumsum(x^2)
)

# quote() lets the axis titles be rendered as mathematical expressions
df |>
  ggplot(aes(x, y)) +
  geom_point() +
  labs(
    x = quote(x[i]),
    y = quote(sum(x[i]^2, i == 1, n))
  )
```

```{r 11.2.1 Exercises}
# Create one plot on the fuel economy data with customized title, subtitle, caption, x, y, and color labels.
names(mpg)
?mpg
# City mileage against engine size, colored by fuel type (fl)
mpg |> ggplot(aes(x = displ, y = cty, color = fl)) +
  geom_point() +
  labs(
    x = "Engine displacement (L)",
    y = "City fuel economy (mpg)",
    color = "Fuel type",
    title = "City fuel economy generally decreases with engine size",
    subtitle = "Points are colored by fuel type",
    caption = "Data from fueleconomy.gov"
  )
```

Label exercises
```{r}
# Highway vs. city mileage, points colored by car class
mpg |> ggplot(aes(cty, hwy, color = class)) +
  geom_point() +
  # a fixed color overrides the class mapping, so one line is fitted to all
  # cars instead of one line per class
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(
    x = "MPG in the city",
    y = "MPG on the highway",
    color = "Car class",
    title = "Highway vs. city MPG",
    subtitle = "SUVs are terrible!"
  )
```

Recreate the following plot using the fuel economy data. Note that both the colors and shapes of points vary by type of drive train.
```{r labels exercise}
# Color and shape both encode the drive train; giving both legends the same
# title makes ggplot2 merge them into a single legend.
ggplot(mpg, aes(cty, hwy, color = factor(drv), shape = factor(drv))) +
  geom_point() +
  labs(
    title = "Highway vs. city MPG",
    subtitle = "SUVs are terrible!",
    x = "MPG in the city",
    y = "MPG on the highway",
    color = "Type of drive train",
    shape = "Type of drive train"
  )
```

11.3 Annotations!
```{r 11.3 annotations}
# For each drive type, take the car with the largest engine and attach a
# readable drive-type name to use as label text.
label_info <- mpg |>
  group_by(drv) |>
  arrange(desc(displ)) |>
  slice_head(n = 1) |>
  mutate(
    drive_type = case_when(
      drv == "4" ~ "4-wheel drive",
      drv == "f" ~ "front-wheel drive",
      drv == "r" ~ "rear-wheel drive"
    )
  ) |>
  select(displ, hwy, drv, drive_type)

label_info

# Label the smooth lines directly instead of using a legend; x, y and color
# are inherited from the plot-level mapping.
mpg |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE) +
  geom_text(
    data = label_info,
    aes(label = drive_type),
    hjust = "right", vjust = "bottom", size = 5, fontface = "bold"
  ) +
  theme(legend.position = "none")
#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Same plot again: hjust (horizontal justification) and vjust (vertical
# justification) control the alignment of the label.
mpg |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE) +
  geom_text(
    data = label_info,
    aes(label = drive_type),
    hjust = "right", vjust = "bottom", size = 5, fontface = "bold"
  ) +
  theme(legend.position = "none")

# geom_label_repel() from the ggrepel package addresses the overlap;
# fontface and size customize the look of the text labels, and
# theme(legend.position = "none") turns all the legends off.
mpg |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE) +
  geom_label_repel(
    data = label_info,
    aes(label = drive_type),
    nudge_y = 2, size = 5, fontface = "bold"
  ) +
  theme(legend.position = "none")

# Cars with unusually high mileage, or decent mileage despite a big engine
potential_outliers <- filter(mpg, hwy > 40 | (hwy > 20 & displ > 5))

mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  geom_text_repel(data = potential_outliers, aes(label = model)) +
  # redraw the outliers in red ...
  geom_point(data = potential_outliers, color = "red") +
  # ... and put a larger open circle around each of them
  geom_point(
    data = potential_outliers,
    shape = "circle open", size = 3, color = "red"
  )

# Wrap the annotation text at 30 characters so it fits next to the points
trend_text <- "Larger engine sizes tend to have lower fuel economy." |>
  str_wrap(width = 30)
trend_text

# annotate() adds one-off elements without building a data frame: a label
# holding trend_text, plus an arrow pointing along the downward trend.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  annotate(
    geom = "label", x = 3.5, y = 38,
    label = trend_text,
    hjust = "left", color = "red"
  ) +
  annotate(
    geom = "segment",
    x = 3, y = 35, xend = 5, yend = 25, color = "red",
    arrow = arrow(type = "closed")
  )
```

11.3.1 Exercises with Annotations
Use geom_text() with infinite positions to place text at the four corners of the plot.
```{r}
# One row per corner: +/-Inf pins the text to the panel edge, and hjust/vjust
# (0 = left/bottom, 1 = right/top) keep the text inside the panel.
corner_labels <- tribble(
  ~x,   ~y,   ~label,    ~hjust, ~vjust,
  -Inf, -Inf, "(x0,y0)", 0,      0,
  -Inf, Inf,  "(x0,y1)", 0,      1,
  Inf,  -Inf, "(x1,y0)", 1,      0,
  Inf,  Inf,  "(x1,y1)", 1,      1
)

annoation_plot <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_text(
    data = corner_labels,
    aes(x = x, y = y, label = label, hjust = hjust, vjust = vjust)
  )
annoation_plot
```
Use annotate() to add a point geom in the middle of your last plot without having to create a tibble. Customize the shape, size, or color of the point.
```{r}
# A single large pink square (shape 15) placed at fixed coordinates
annoation_plot +
  annotate("point", x = 4.5, y = 30, shape = 15, size = 15, color = "pink")
```

How do labels with geom_text() interact with faceting? How can you add a label to a single facet? How can you put a different label in each facet? (Hint: Think about the dataset that is being passed to geom_text().)
```{r}
# A label without the faceting variable is repeated in every facet
label_for_all <- tibble(
  label = "text for all",
  x = Inf,
  y = Inf,
  hjust = "inward",
  vjust = "inward",
  angle = 0
)

ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class) +
  geom_text(
    data = label_for_all,
    aes(x = x, y = y, label = label, hjust = hjust, vjust = vjust, angle = angle)
  )
```


Label for a single facet
```{r}
# Including the faceting variable (class) restricts the label to that facet
label_for_minivan <- tibble(
  class = "minivan",
  label = "text for minivan",
  x = Inf,
  y = Inf,
  hjust = "inward",
  vjust = "inward",
  angle = 0
)

ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class) +
  geom_text(
    data = label_for_minivan,
    aes(x = x, y = y, label = label, hjust = hjust, vjust = vjust, angle = angle)
  )
```


Labels for different facets
```{r}
# One label row per class, so every facet gets its own text
label_different <- mpg |>
  group_by(class) |>
  summarise(mean_mpg = round(mean(hwy), 1)) |>
  mutate(
    label = paste(class, "average highway mpg:", mean_mpg),
    x = Inf,
    y = Inf,
    hjust = "inward",
    vjust = "inward",
    angle = 0
  )

ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class) +
  geom_text(
    data = label_different,
    aes(x = x, y = y, label = label, hjust = hjust, vjust = vjust, angle = angle)
  )
```


Playing with hjust and vjust
```{r}
# Every combination of three hjust, vjust and angle values
td <- expand.grid(
  hjust = seq(0, 1, by = 0.5),
  vjust = seq(0, 1, by = 0.5),
  angle = c(0, 45, 90),
  text = "text"
)

# Each point is the anchor; the text shows how hjust/vjust/angle place it
# relative to that anchor. One facet per angle.
td |>
  ggplot(aes(x = hjust, y = vjust)) +
  geom_point() +
  geom_text(aes(label = text, hjust = hjust, vjust = vjust, angle = angle)) +
  facet_grid(. ~ angle) +
  scale_x_continuous(expand = c(0, 0.2), breaks = c(0, 0.5, 1)) +
  scale_y_continuous(expand = c(0, 0.2), breaks = c(0, 0.5, 1))
```

Corner Labels
```{r}
# Text for the four corners of the panel. +/-Inf pins each label to an edge;
# hjust follows x (0 at the left edge, 1 at the right edge) and vjust follows
# y (0 at the bottom, 1 at the top) so that every label is drawn inside the
# panel rather than outside it.
corner_labels <- tibble(
  x = c(-Inf, -Inf, Inf, Inf),
  y = c(-Inf, Inf, -Inf, Inf),
  label = c(
    "(x0,y0)", "(x0,y1)",
    "(x1,y0)", "(x1,y1)"
  ),
  hjust = c(0, 0, 1, 1),
  vjust = c(0, 1, 0, 1)
)

annoation_plot <- mpg |> ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  geom_text(data = corner_labels, aes(
    x = x, y = y, hjust = hjust, vjust = vjust, label = label
  ))
annoation_plot
```

What arguments to geom_label() control the appearance of the background box?
```{r}
# Per ?geom_label, the background box is controlled by label.padding (space
# around the text), label.r (corner radius), label.size (border thickness)
# and the fill aesthetic (background color). Two of them are set here.
# data = data.frame() keeps the layer from inheriting mpg, so the constant
# aesthetics below produce a single label.
mpg |> ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  geom_label(data = data.frame(), label.padding = unit(0.55, "lines"), label.size = 1, color = "blue", size = 10, aes(
    x = Inf, y = Inf, hjust = "inward", vjust = "inward", label = "Hello!"
  ))
```

What are the four arguments to arrow()? How do they work? Create a series of plots that demonstrate the most important options.

```{r}
# arrow() takes angle (width of the head), length (size of the head),
# ends ("first", "last" or "both") and type ("open" or "closed").
ggplot(mpg, aes(x = displ, y = cty)) +
  geom_point() +
  # red: open head at the end of the segment
  annotate(
    "segment",
    x = 3, xend = 5, y = 35, yend = 25, color = "red",
    arrow = arrow(angle = 45, ends = "last", type = "open")
  ) +
  # blue: large closed head at the start of the segment
  annotate(
    "segment",
    x = 2, xend = 4, y = 35, yend = 25, color = "blue",
    arrow = arrow(angle = 35, length = unit(1, "cm"), ends = "first", type = "closed")
  ) +
  # green: default-length closed head at the end of the segment
  annotate(
    "segment",
    x = 2, xend = 4, y = 35, yend = 15, color = "green",
    arrow = arrow(angle = 35, ends = "last", type = "closed")
  )
```

Scales
```{r}
# The default scales, written out explicitly
mpg |>
  ggplot(aes(x = displ, y = hwy, color = class)) +
  geom_point() +
  scale_x_continuous() +
  scale_y_continuous() +
  scale_color_discrete()
```
Axis ticks and legend keys
```{r}
# breaks sets where the y-axis ticks go: every 5 mpg from 15 to 40
mpg |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  scale_y_continuous(breaks = c(15, 20, 25, 30, 35, 40))
```
Removing Ticks
```{r}
# labels = NULL suppresses the axis tick labels; the named vector relabels
# the legend keys
mpg |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  scale_x_continuous(labels = NULL) +
  scale_y_continuous(labels = NULL) +
  scale_color_discrete(labels = c("4" = "4-wheel", "f" = "front", "r" = "rear"))

# Same plot with the legend keys left at their defaults
mpg |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  scale_x_continuous(labels = NULL) +
  scale_y_continuous(labels = NULL)
```

Dollars in the scale
```{r}
library(scales)
# label_dollar() (scales package) formats the price axis as currency
diamonds |>
  ggplot(aes(x = price, y = cut)) +
  geom_boxplot(alpha = 0.05) +
  scale_x_continuous(labels = label_dollar())
```
More custom dollars (1k, 7k, etc.)
```{r}
# Prices shown in thousands of dollars with a "K" suffix, ticks every $6,000
diamonds |>
  ggplot(aes(x = price, y = cut)) +
  geom_boxplot(alpha = 0.05) +
  scale_x_continuous(
    breaks = seq(1000, 19000, by = 6000),
    labels = label_dollar(scale = 1 / 1000, suffix = "K")
  )
```
Label percentages for bar charts
```{r}
# position = "fill" scales each bar to 1; label_percent() shows that as 0-100%
diamonds |>
  ggplot(aes(x = cut, fill = clarity)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = label_percent(), name = "Percentage")
```
Breaks for only a few data points
```{r}
# One row per presidential term; the x-axis ticks sit exactly at the term
# start dates and are labelled with two-digit years
ggplot(mutate(presidential, id = 33 + row_number()), aes(x = start, y = id)) +
  geom_point() +
  geom_segment(aes(xend = end, yend = id)) +
  scale_x_date(breaks = presidential$start, date_labels = "'%y", name = NULL)
```
Position of the legend
```{r}
# Base plot reused to try each legend position
base <- mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(aes(color = class))

base + theme(legend.position = "right") # the default
base + theme(legend.position = "left")
# Horizontal legends are split over several rows so they stay compact
base +
  guides(color = guide_legend(nrow = 3)) +
  theme(legend.position = "top")
base +
  guides(color = guide_legend(nrow = 2)) +
  theme(legend.position = "bottom")
```


```{r}
# override.aes enlarges the legend symbols without changing the plotted points
mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(se = FALSE) +
  guides(color = guide_legend(override.aes = list(size = 6), nrow = 2)) +
  theme(legend.position = "bottom")
```
11.4.4 Replacing a scale
```{r}
# Raw scales
diamonds |>
  ggplot(aes(x = carat, y = price)) +
  geom_bin2d()
# Better: log-transform both variables inside the mapping
diamonds |>
  ggplot(aes(x = log10(carat), y = log10(price))) +
  geom_bin2d()
```
adjust scale to log-transform
```{r}
# Transform the scales instead of the data, so the axes keep original units
diamonds |>
  ggplot(aes(x = carat, y = price)) +
  geom_bin2d() +
  scale_y_log10() +
  scale_x_log10()
```
change color scale
```{r}
# Replace the default discrete colors with the ColorBrewer "Set1" palette
mpg |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  scale_color_brewer(palette = "Set1")
```
Custom color scale (blue = Democratic, red = Republican)
```{r}
# scale_color_manual() maps each party to a fixed hex color
ggplot(
  mutate(presidential, id = 33 + row_number()),
  aes(x = start, y = id, color = party)
) +
  geom_point() +
  geom_segment(aes(xend = end, yend = id)) +
  scale_color_manual(values = c(Democratic = "#00AEF3", Republican = "#E81B23"))
```
dealing with color blindness
```{r}
# 10,000 random points from two independent standard normals
df <- tibble(
  x = rnorm(10000),
  y = rnorm(10000)
)

# Default continuous fill scale
df |>
  ggplot(aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  labs(x = NULL, y = NULL, title = "Default, continuous")

# Viridis, continuous variant (_c)
df |>
  ggplot(aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  scale_fill_viridis_c() +
  labs(x = NULL, y = NULL, title = "Viridis, continuous")

# Viridis, binned variant (_b)
df |>
  ggplot(aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  scale_fill_viridis_b() +
  labs(x = NULL, y = NULL, title = "Viridis, binned")
```
11.4.5 Zooming 
```{r}
# Limits set on the scales: data outside the limits is dropped before the
# smooth is fitted
mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(aes(color = drv)) +
  geom_smooth() +
  scale_y_continuous(limits = c(10, 25)) +
  scale_x_continuous(limits = c(5, 6))
```

Using coord_cartesian to set limits
```{r}
# Limits set on the coordinate system: only the view is zoomed, the smooth is
# still fitted to all the data
mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(aes(color = drv)) +
  geom_smooth() +
  coord_cartesian(ylim = c(10, 25), xlim = c(5, 6))
```

Expanding the limits (limits on individual scales)

```{r}
# Two subsets plotted separately: each plot picks its own axis and color
# scales, which makes them hard to compare
suv <- filter(mpg, class == "suv")
compact <- filter(mpg, class == "compact")

# Left
suv |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point()

# Right
compact |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point()
```
Share scales across multiple plots


```{r}
# Train the scales on the full mpg data so both subset plots share the same
# axes and colour mapping. Faceting would also work in this particular case,
# but this technique is useful more generally, e.g. when plots are spread over
# multiple pages of a report.
x_scale <- scale_x_continuous(limits = range(mpg$displ))
y_scale <- scale_y_continuous(limits = range(mpg$hwy))
col_scale <- scale_color_discrete(limits = unique(mpg$drv))

# Left
suv |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  x_scale +
  y_scale +
  col_scale

# Right
compact |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  x_scale +
  y_scale +
  col_scale
```

11.4.6 Exercises
Why doesn’t the following code override the default scale?

`geom_hex()` maps the bin count to the `fill` aesthetic, not `color`, so one needs to use `scale_fill_gradient()` instead of `scale_color_gradient()`.
```{r}
# Fresh random sample for the exercise.
df <- tibble(x = rnorm(10000), y = rnorm(10000))

# Both scales are added on purpose: the color scale is the one from the
# exercise, the fill scale is the one that actually restyles the hexagons.
df |>
  ggplot(aes(x, y)) +
  geom_hex() +
  scale_color_gradient(low = "white", high = "red") +
  scale_fill_gradient(low = "green", high = "blue") +
  coord_fixed()

```

What is the first argument to every scale? How does it compare to labs()?

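The first argument to every scale is `name`, which sets the title of the axis or legend that the scale draws. It does the same job as `labs()`: `labs(x = "Year")` and `scale_x_date(name = "Year")` give the same axis title. The difference is that `labs()` sets titles for several aesthetics in one call, while `name` is set on one scale at a time.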

Change the display of the presidential terms by:

```{r}
# Timeline of presidential terms with one x-axis break per term start.
# The plot is stored in `prez` so the remaining parts of this exercise can add
# layers to it. The original chunk ended in a dangling `prez +`, which is an
# incomplete expression and referred to an object that was never created.
prez <- presidential |>
  mutate(id = 33 + row_number()) |>
  ggplot(aes(x = start, y = id)) +
  geom_point() +
  geom_segment(aes(xend = end, yend = id)) +
  # label each break with an apostrophe and a two-digit year
  scale_x_date(name = NULL, breaks = presidential$start, date_labels = "'%y")

prez
```


Combining the two variants that customize colors and x axis breaks.

Improving the display of the y axis.

Labelling each term with the name of the president.

Adding informative plot labels.

Placing breaks every 4 years (this is trickier than it seems!).

First, create the following plot. Then, modify the code using override.aes to make the legend easier to see.

TBD

```{r}

# Starting plot from the exercise: alpha = 1/20 is applied to the legend keys
# as well as the points, so the legend is hard to read.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(aes(color = cut), alpha = 1/20)

# Answer: override.aes changes only how the legend keys are drawn. Here they
# become fully opaque and larger, while the plotted points keep alpha = 1/20.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(aes(color = cut), alpha = 1/20) +
  guides(color = guide_legend(override.aes = list(alpha = 1, size = 4)))
```

11.5 Themes

```{r 11.5 themes}
# Demonstrates theme(): legend placed inside the panel, bold title, and
# title/caption aligned to the whole plot.
# NOTE(review): recent ggplot2 releases deprecate a numeric legend.position in
# favour of legend.position = "inside" plus legend.position.inside. Confirm
# against the installed version before changing.
mpg |>
  ggplot(aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  labs(
    title = "Larger engine sizes tend to have lower fuel economy",
    caption = "Source: https://fueleconomy.gov."
  ) +
  theme(
    # title and caption
    plot.title = element_text(face = "bold"),
    plot.title.position = "plot",
    plot.caption = element_text(hjust = 0),
    plot.caption.position = "plot",
    # legend
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal",
    legend.box.background = element_rect(color = "black")
  )
```

```{r}
# Hex-bin plot of the random `df` defined in an earlier chunk, drawn with
# theme_dark() plus a tweak that makes the axis titles blue and bold.
# Changes from the original chunk:
#   * dropped the trailing empty argument in theme(..., ), which passes a
#     missing argument to theme();
#   * dropped scale_color_gradient(), which did nothing because the plot maps
#     no color aesthetic;
#   * dropped library(ggthemes), already attached in the setup chunk.
ggplot(df, aes(x, y)) +
  geom_hex() +
  scale_fill_gradient(low = "green", high = "blue") +
  coord_fixed() +
  theme_dark() +
  theme(
    axis.title = element_text(colour = "blue", face = "bold")
  )
  
```
11.6 Layout

```{r 11.6 layout}
# Three simple plots to compose with patchwork.
p1 <- mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  labs(title = "Plot 1")

p2 <- mpg |>
  ggplot(aes(x = drv, y = hwy)) +
  geom_boxplot() +
  labs(title = "Plot 2")

# Side by side. Note that this uses no new function from patchwork; the
# package adds new functionality to the `+` operator instead.
p1 + p2

p3 <- mpg |>
  ggplot(aes(x = cty, y = hwy)) +
  geom_point() +
  labs(title = "Plot 3")

# `|` places plots next to each other, `/` stacks them.
(p1 | p3) / p2
```

```{r}
# Five plots sharing the drv colour mapping. The box plots and the scatter
# suppress their own legends; the density plots provide the legend that is
# collected below.
p1 <- mpg |>
  ggplot(aes(x = drv, y = cty, color = drv)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Plot 1")

p2 <- mpg |>
  ggplot(aes(x = drv, y = hwy, color = drv)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Plot 2")

p3 <- mpg |>
  ggplot(aes(x = cty, color = drv, fill = drv)) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot 3")

p4 <- mpg |>
  ggplot(aes(x = hwy, color = drv, fill = drv)) +
  geom_density(alpha = 0.5) +
  labs(title = "Plot 4")

p5 <- mpg |>
  ggplot(aes(x = cty, y = hwy, color = drv)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~drv) +
  labs(title = "Plot 5")

# Four stacked rows: legend area, p1 + p2, p3 + p4, p5. `&` has lower
# precedence than `+`, so the theme() on the last line is applied to the whole
# composition.
(guide_area() / (p1 + p2) / (p3 + p4) / p5) +
  plot_annotation(
    title = "City and highway mileage for cars with different drive trains",
    caption = "Source: https://fueleconomy.gov."
  ) +
  plot_layout(guides = "collect", heights = c(1, 3, 2, 4)) &
  theme(legend.position = "top")
```


```{r}
# Redefine the three basic plots (p1-p3 were overwritten in the previous chunk).
p1 <- mpg |>
  ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  labs(title = "Plot 1")
p2 <- mpg |>
  ggplot(aes(x = drv, y = hwy)) +
  geom_boxplot() +
  labs(title = "Plot 2")
p3 <- mpg |>
  ggplot(aes(x = cty, y = hwy)) +
  geom_point() +
  labs(title = "Plot 3")

# Explicit parentheses: p1 and p2 share the top row, p3 fills the bottom row.
(p1 | p2) / p3
# Without parentheses, R's operator precedence applies (`/` binds tighter than
# `|`), so `p1 | p2 / p3` puts p1 on the left at full height next to p2 over p3.
# p1 | p2 / p3

```

```{r}
# p1 on top, p2 and p3 side by side underneath.
patchwork <- p1 / (p2 | p3)

# Tag the panels using the "Fig. " prefix, "." separator and ":" suffix.
patchwork +
  plot_annotation(
    tag_levels = c("A", 1),
    tag_prefix = "Fig. ",
    tag_sep = ".",
    tag_suffix = ":"
  )

```

Random packages (janitor, skimr)
```{r}
library(ryouwithme)

# Exploring: skimr summary and a janitor cross-tab of year by manufacturer.
skim(mpg)
tabyl(mpg, year, manufacturer)

# Cleaning: clean_names() makes the column names lowercase with underscores.
beaches <- sydneybeaches |>
  clean_names()
glimpse(beaches)

# Give the bacteria measurement column a shorter name.
beaches <- rename(beaches, beachbugs = enterococci_cfu_100ml)
```
Most beach bugs
```{r}
# For each of the two sites, keep the single row with the highest beachbugs
# reading (filter to the site, sort descending, take the first row).
worst_day_coogee <- beaches |>
  filter(site == "Coogee Beach") |>
  arrange(desc(beachbugs)) |>
  head(1)

worst_day_little_bay <- beaches |>
  filter(site == "Little Bay Beach") |>
  arrange(desc(beachbugs)) |>
  head(1)

# Values noted by the author when this was run:
worst_day_coogee$beachbugs # 1200
worst_day_little_bay$beachbugs # 4900
```

Question B: Does Coogee or Bondi have more extreme bacteria levels? Which beach has the worst bacteria levels on average?
```{r}
# Mean beachbugs reading per site, highest first.
bugs_beach <- beaches |>
  group_by(site) |>
  summarise(avg_bug = mean(beachbugs, na.rm = TRUE)) |>
  arrange(desc(avg_bug))

bugs_beach # Malabar has worst on average

# Restrict the comparison to sites whose name contains Coogee or Bondi.
bugs_beach |>
  filter(str_detect(site, "Coogee|Bondi")) # Coogee is worse
```
Total beachbugs observations per site 
```{r}
# Sum of all beachbugs readings per site (missing readings ignored),
# largest total first.
beaches |>
  group_by(site) |>
  summarise(tl_bug = sum(beachbugs, na.rm = TRUE)) |>
  arrange(desc(tl_bug))
```

Custom Theme
```{r}
# theme_jen(): a custom complete ggplot2 theme built on theme_bw().
#
# Takes no arguments. Returns a theme object to add to a plot with `+`.
# Elements listed below replace the theme_bw() versions wholesale
# (%+replace%), so any property not set here is not carried over from
# theme_bw() for that element.
#
# Line widths use `linewidth`; the `size` argument of element_line() and
# element_rect() is deprecated in current ggplot2 and emits a warning.
theme_jen <- function() {
  # single font family used for every text element
  font <- "Helvetica"

  theme_bw() %+replace%
    theme(
      # get rid of grid lines/borders
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      # add white space top, right, bottom, left
      plot.margin = unit(c(1, 1, 1, 1), "cm"),
      # custom axis title/text/lines
      axis.title = element_text(
        family = font,
        size = 14
      ),
      axis.text = element_text(
        family = font,
        size = 12
      ),
      # margin pulls text away from axis (top = 5, bottom = 10)
      axis.text.x = element_text(
        margin = margin(5, b = 10)
      ),
      # black axis lines
      axis.line = element_line(colour = "black", linewidth = rel(1)),
      # custom plot titles, subtitles, captions
      plot.title = element_text(
        family = font,
        size = 18,
        hjust = -0.1,
        vjust = 4
      ),
      plot.subtitle = element_text(
        family = font,
        size = 14,
        hjust = 0,
        vjust = 3
      ),
      plot.caption = element_text(
        family = font,
        size = 10,
        hjust = 1,
        vjust = 2
      ),
      # custom legend
      legend.title = element_text(
        family = font,
        size = 10,
        hjust = 0
      ),
      legend.text = element_text(
        family = font,
        size = 8,
        hjust = 0
      ),
      # no background on legend keys
      legend.key = element_blank(),
      # facet strips: white fill with a black outline
      strip.background = element_rect(
        fill = "white",
        colour = "black",
        linewidth = rel(2)
      ),
      complete = TRUE
    )
}
```
GGEasy
```{r}

# Load the TidyTuesday release dated 2021-01-05 (tidytuesdayR::tt_load
# downloads it, so this needs network access).
tt <- tt_load("2021-01-05")
# Pull the transit_cost table out of the loaded release.
cost <- tt$transit_cost

# ggeasy-style helper: returns a continuous y scale with its expansion set to
# c(0, 0). Takes no arguments; add the result to a plot with `+`.
easy_expand_y_axis <- function() {
  no_expansion <- c(0, 0)
  scale_y_continuous(expand = no_expansion)
}

# Bar chart of the five countries with the highest mean cost per km.
#
# Theme order matters: adding a complete theme replaces whatever theme the
# plot already has. In the original chunk easy_remove_legend() came before
# ggdark::dark_theme_dark(), so the legend removal was thrown away. It is now
# added last so it takes effect.
# NOTE(review): for the same reason theme_jen() is overridden by the dark
# theme that follows it. Keep whichever of the two themes is intended.
cost %>%
  group_by(country) %>%
  summarise(meancost = mean(cost_km_millions)) %>%
  arrange(-meancost) %>%
  head(5) %>%
  ggplot(aes(x = reorder(country, meancost), y = meancost, fill = country)) +
  geom_col() +
  labs(
    y = "Average cost per km (million)", x = "Country",
    title = "Countries with the most expensive transit projects",
    caption = "why is the US so $$$$?"
  ) +
  easy_expand_y_axis() +
  theme_jen() +
  ggdark::dark_theme_dark() +
  easy_remove_legend()

```

12  Logical vectors

```{r}
# Two logical columns derived from comparisons. .keep = "used" retains only
# the columns that were used to create the new ones.
mutate(
  flights,
  daytime = 600 < dep_time & dep_time < 2000,
  approx_ontime = abs(arr_delay) < 20,
  .keep = "used"
)
```

12.2.1 Floating point comparison (don't use == )

```{r}
# near() compares numbers with a tolerance instead of exact equality.
x <- c(4, 5)
near(x, y = c(4, 5))
```

12.2.2 Missing values (won't work with any comparison)

```{r}
# Rows where dep_time is missing.
filter(flights, is.na(dep_time))

# Flights on 1 January, sorted so rows with a missing dep_time come first
# (desc(is.na(dep_time)) puts TRUE before FALSE), then by dep_time.
flights |>
  filter(month == 1 & day == 1) |>
  arrange(desc(is.na(dep_time)), dep_time)
```

12.2.4 Exercises

How does dplyr::near() work? Type near to see the source code. Is sqrt(2)^2 near 2?

Uses built in tolerance to compare two numbers

```{r}
# Compare sqrt(2)^2 with 2 using near()'s tolerance rather than `==`.
near(sqrt(2)^2, 2)
```

Use mutate(), is.na(), and count() together to describe how the missing values in dep_time, sched_dep_time and dep_delay are connected.

```{r}
# Flag, for each flight, whether each of the three departure fields is missing.
missing_flights <- mutate(
  nycflights13::flights,
  dep_time_miss = is.na(dep_time),
  sched_dep_time_miss = is.na(sched_dep_time),
  dep_delay_miss = is.na(dep_delay)
)

# Count every combination of the three flags.
missing_flights |>
  count(dep_time_miss, sched_dep_time_miss, dep_delay_miss)
```
If departure time is missing, departure delay is also logically missing.

Multiple filters
```{r}
# %in% keeps rows whose month matches any value in the set (January, December).
filter(flights, month %in% c(1, 12))
```

12.3.4 Exercises
Find all flights where arr_delay is missing but dep_delay is not.

```{r}
# arr_delay missing while dep_delay is present. Comma-separated conditions in
# filter() are combined with AND.
flights |>
  filter(is.na(arr_delay), !is.na(dep_delay))
```


Find all flights where neither arr_time nor sched_arr_time are missing, but arr_delay is.

```{r}
# Both arrival times are present but arr_delay is missing.
flights |>
  filter(
    !is.na(arr_time),
    !is.na(sched_arr_time),
    is.na(arr_delay)
  )
```

How many flights have a missing dep_time? What other variables are missing in these rows? What might these rows represent?

```{r}
# Flights with no recorded departure time.
missing_dep_time <- filter(flights, is.na(dep_time))

# For every other column, count the missing values among those flights, then
# reshape to one row per column (category, count_of_missing), largest first.
other_missing_counts <- missing_dep_time |>
  summarize(across(-dep_time, \(col) sum(is.na(col)))) |>
  pivot_longer(
    cols = everything(),
    names_to = "category",
    values_to = "count_of_missing"
  ) |>
  arrange(desc(count_of_missing))
```

Assuming that a missing dep_time implies that a flight is cancelled, look at the number of cancelled flights per day. Is there a pattern? Is there a connection between the proportion of cancelled flights and the average delay of non-cancelled flights?

```{r}
# Daily cancellation summary, one row per calendar date:
#   cancelled      - number of flights with a missing dep_time
#   avg_delay      - mean dep_delay over flights that have one
#   prop_cancelled - cancelled / flights scheduled that day
# Grouping directly by the derived date gives the same rows and columns as
# grouping by year/month/day and then dropping those columns, without the
# leftover grouping message or the redundant distinct().
cancel_timeline <- flights |>
  mutate(date = as.Date(paste(year, month, day, sep = "-"))) |>
  group_by(date) |>
  summarize(
    cancelled = sum(is.na(dep_time)),
    avg_delay = mean(dep_delay, na.rm = TRUE),
    prop_cancelled = cancelled / n()
  )

# Cancellations per day. The y axis is a count, so it is labelled as one
# (the original label said "Percentage").
cancel_timeline |>
  ggplot(aes(x = date, y = cancelled)) +
  geom_point() +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +
  labs(
    x = "Date",
    y = "Number of cancelled flights",
    title = "Cancelled flights over time"
  )

# Daily proportion cancelled against the average delay of the flights that
# did depart.
cancel_timeline |>
  ggplot(aes(x = avg_delay, y = prop_cancelled)) +
  geom_point() +
  geom_smooth() +
  scale_y_continuous(labels = scales::percent) + # show proportions as percent
  labs(
    x = "Average delay of non-cancelled flights",
    y = "Proportion of cancelled flights",
    title = "Proportion of cancelled flights and average delay"
  )

```

Generally speaking, the longer the average delay of non-cancelled flights on a given day, the higher the proportion of cancelled flights.

## 12.5.4 Exercises

A number is even if it’s divisible by two, which in R you can find out with x %% 2 == 0. Use this fact and if_else() to determine whether each number between 0 and 20 is even or odd.

```{r}
# Label each integer from 0 to 20 by parity.
x <- seq(0, 20)
is_even <- x %% 2 == 0
y <- if_else(is_even, "even", "odd")
```

Given a vector of days like x <- c("Monday", "Saturday", "Wednesday"), use an if_else() statement to label them as weekends or weekdays.

```{r}
# make vector of Monday to Sunday
days <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
y <- if_else(days %in% c("Saturday", "Sunday"), "weekend", "weekday")
```

Use if_else() to compute the absolute value of a numeric vector called x.

```{r}
# Absolute value via if_else(): negate only the negative entries.
x <- c(-1, 0, 1)
y <- if_else(
  x < 0,
  true = -x,
  false = x
)
```

Write a case_when() statement that uses the month and day columns from flights to label a selection of important US holidays (e.g., New Years Day, 4th of July, Thanksgiving, and Christmas). First create a logical column that is either TRUE or FALSE, and then create a character column that either gives the name of the holiday or is NA.

```{r}
library(lubridate)

# Adds two columns to flights:
#   holiday    - character; name of the holiday on that month/day, else NA
#   is_holiday - logical; TRUE exactly when `holiday` is not NA
# The dates are listed once and is_holiday is derived from holiday, so the two
# columns cannot drift apart (the original repeated every condition in a
# second case_when() and used T/F). The fallback is the typed NA_character_,
# which every dplyr version accepts for a character result.
flights_holidays <- flights |>
  mutate(
    holiday = case_when(
      month == 12 & day == 25 ~ "Christmas",
      month == 12 & day == 31 ~ "New Years",
      month == 7 & day == 4 ~ "4th of July",
      month == 10 & day == 31 ~ "Halloween",
      TRUE ~ NA_character_
    ),
    is_holiday = !is.na(holiday)
  )
```

## 13  Numbers

```{r}
# parse_number() pulls the numeric part out of strings with surrounding text.
x <- c("123 4 ABC", "#3")
x |>
  parse_number()
```

```{r}
# Flights per carrier, most frequent first.
flights |>
  count(carrier, sort = TRUE)

# Flight count and mean departure delay per carrier.
flights |>
  group_by(carrier) |>
  summarise(
    count = n(),
    avg_delay = mean(dep_delay, na.rm = TRUE)
  )

# Number of distinct carriers per destination, most first.
flights |>
  group_by(dest) |>
  summarize(carriers = n_distinct(carrier)) |>
  arrange(desc(carriers))

# Total distance per plane, written out by hand ...
flights |>
  group_by(tailnum) |>
  summarize(miles = sum(distance))

# ... and the same total as a weighted count, sorted.
flights |>
  count(tailnum, wt = distance, name = "miles", sort = TRUE)

# Flights with a missing dep_time per destination, most first.
flights |>
  group_by(dest) |>
  summarize(n_cancelled = sum(is.na(dep_time))) |>
  arrange(desc(n_cancelled))
```

13.3.1 Exercises
How can you use count() to count the number of rows with a missing value for a given variable?
```{r}
# Keep the rows with a missing tailnum, then count them.
count(filter(flights, is.na(tailnum)))
```
Expand the following calls to count() to instead use group_by(), summarize(), and arrange():

```{r}
# count() shorthand ...
flights |>
  count(dest, sort = TRUE)

# ... and the same result spelled out with group_by/summarize/arrange.
flights |>
  group_by(dest) |>
  summarize(n = n()) |>
  arrange(desc(n))
```


```{r}
# Weighted count: n is the sum of distance per tailnum ...
flights |>
  count(tailnum, wt = distance)

# ... which is the same as summing distance within each tailnum group.
flights |>
  group_by(tailnum) |>
  summarise(n = sum(distance))
```

13.4 Numeric transformations

What will sum(is.na(x)) tell you? How about mean(is.na(x))?

```{r}
# Six values, two of them missing.
x <- c(1, 2, NA, 4, NA, 6)
# is.na() gives TRUE/FALSE; summing counts the TRUEs -> number missing (2)
sum(is.na(x))
# averaging the same logical vector -> proportion missing (2 / 6 = 1 / 3)
mean(is.na(x))

```


What does prod() return when applied to a logical vector? What logical summary function is it equivalent to? What does min() return when applied to a logical vector? What logical summary function is it equivalent to? Read the documentation and perform a few experiments.

13.4.8 Exercises

Explain in words what each line of the code used to generate Figure 13.1 does.

```{r}
flights |> 
  group_by(hour = sched_dep_time %/% 100) |>  
  # group by scheduled departure hour; integer division extracts it (1345 -> 13)
  summarize(prop_cancelled = mean(is.na(dep_time)), n = n()) |>
  # per hour: proportion of flights with a missing dep_time, and the flight count
  filter(hour > 1) |>
  # keep only hours greater than 1 (author's question: are the earliest hours all missing?)
  ggplot(aes(x = hour, y = prop_cancelled)) +
  # plot hour against the proportion cancelled
  geom_line(color = "grey50") + 
  # connect the dots with a grey line
  geom_point(aes(size = n))
  # draw bigger points for hours with more flights
```

What trigonometric functions does R provide? Guess some names and look up the documentation. Do they use degrees or radians?

sin(x), cos(x) etc. in radians

Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. You can see the basic problem by running the code below: there’s a gap between each hour.

```{r}
# 1 January flights: departure delay against scheduled departure time in its
# raw HHMM form.
ggplot(
  data = filter(flights, month == 1, day == 1),
  mapping = aes(x = sched_dep_time, y = dep_delay)
) +
  geom_point()
```

Convert them to a more truthful representation of time (either fractional hours or minutes since midnight).

```{r}
# Convert clock times stored as HHMM numbers (e.g. 1345 for 13:45) into
# minutes since midnight.
#
# time: numeric vector of HHMM values; NA is passed through as NA.
# Returns a numeric vector the same length as `time`, in the range 0..1439.
# Midnight written as 2400 maps to 0, so it sorts with 0000 instead of
# becoming minute 1440.
get_minutes_since_midnight <- function(time) {
  hours <- time %/% 100
  minutes <- time %% 100
  # wrap 24:00 (1440 minutes) around to 0
  (hours * 60 + minutes) %% 1440
}

# Add minutes-since-midnight versions of the actual and scheduled departure
# times.
flights_midnight <- mutate(
  flights,
  dep_time_midnight = get_minutes_since_midnight(dep_time),
  sched_dep_time_midnight = get_minutes_since_midnight(sched_dep_time)
)

# 1 January: actual against scheduled departure, both in minutes since midnight.
ggplot(
  data = filter(flights_midnight, month == 1, day == 1),
  mapping = aes(x = sched_dep_time_midnight, y = dep_time_midnight)
) +
  geom_point()
```


Round dep_time and arr_time to the nearest five minutes.

```{r}

# Round clock times stored as HHMM numbers to the nearest five minutes.
#
# time: numeric vector of HHMM values (e.g. 859 for 08:59); NA stays NA.
# Returns a numeric vector of HHMM values the same length as `time`.
#
# The rounding is done on total minutes since midnight, so the carry into the
# next hour (08:58 -> 09:00) falls out of the arithmetic. That keeps the
# function vectorised: the original used a scalar `if`, which fails on a
# vector or on NA and so could not be applied to the dep_time / arr_time
# columns this exercise asks about.
round_to_nearest_5_minutes <- function(time) {
  total_minutes <- (time %/% 100) * 60 + time %% 100
  rounded_total <- round(total_minutes / 5) * 5
  # convert back to HHMM
  (rounded_total %/% 60) * 100 + rounded_total %% 60
}

# Example usage:
time <- 859  # Example time in HHMM format
rounded_time <- round_to_nearest_5_minutes(time)
print(rounded_time)  # 900
```

13.5.4 Exercises
Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

```{r}
# The 10 most delayed departures. min_rank() ranks ascending, so the delay is
# wrapped in desc() to give the largest delay rank 1. Ties share the lowest
# rank, so a tie at rank 10 returns more than 10 rows. Flights with a missing
# dep_delay get an NA rank and are dropped by the filter.
# (The original ranked ascending and used head(), which returns 6 rows.)
flights |>
  mutate(most_delay = min_rank(desc(dep_delay)), .before = year) |>
  filter(most_delay <= 10) |>
  arrange(most_delay)
```

Which plane (tailnum) has the worst on-time record?
```{r}
# Total arrival delay accumulated by each plane, largest first.
flights |>
  group_by(tailnum) |>
  summarise(total_arr_delay = sum(arr_delay, na.rm = TRUE)) |>
  arrange(desc(total_arr_delay)) # N15910
```

What time of day should you fly if you want to avoid delays as much as possible?

```{r}
# Average departure delay by scheduled hour, smallest first. The mean is used
# instead of the sum because a sum grows with the number of flights in the
# hour, so it ranks quiet hours as "best" regardless of how delayed their
# flights are. n_flights is shown to flag hours backed by very few flights.
flights |>
  group_by(hour) |>
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    n_flights = n()
  ) |>
  arrange(avg_dep_delay) # author's conclusion: early morning
```


What does flights |> group_by(dest) |> filter(row_number() < 4) do? What does flights |> group_by(dest) |> filter(row_number(dep_delay) < 4) do?

```{r}
# With no argument, row_number() numbers rows by position within each group,
# so this keeps the first three rows per destination.
flights |>
  group_by(dest) |>
  filter(row_number() <= 3)
```

```{r}
# Given a column, row_number() ranks by that column within each group, so this
# keeps the three flights with the smallest departure delay per destination.
flights |>
  group_by(dest) |>
  filter(row_number(dep_delay) <= 3) |>
  view()
```

For each destination, compute the total minutes of delay. For each flight, compute the proportion of the total delay for its destination.

```{r}

flights |> glimpse()

# "Delay" here means a positive arrival delay; early and missing arrivals are
# left out so they do not cancel out real delays in the totals.

# Total minutes of arrival delay per destination.
flights |>
  filter(arr_delay > 0) |>
  group_by(dest) |>
  summarize(total_delay = sum(arr_delay))

# Each delayed flight's share of its destination's total delay. The original
# computed (dep_delay + arr_delay) / air_time, which is a per-flight ratio
# unrelated to the destination total.
flights |>
  filter(arr_delay > 0) |>
  group_by(dest) |>
  mutate(
    total_delay = sum(arr_delay),
    prop_delay = arr_delay / total_delay,
    .before = year
  ) |>
  ungroup()
```

Delays are typically temporally correlated: even once the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag(), explore how the average flight delay for an hour is related to the average delay for the previous hour.

```{r}
# Relate each hour's average departure delay to the previous hour's average.
# The original plotted (avg_delay - lag(avg_delay)) against avg_delay; the x
# variable then contains the y variable, so part of any trend is built in.
# Plotting the lagged average against the current one shows the relationship
# the exercise asks about. Hours where either average is undefined are
# dropped: the first hour has no predecessor, and an hour with no recorded
# delays has no mean.
flights |>
  group_by(hour) |>
  summarize(avg_delay = mean(dep_delay, na.rm = TRUE)) |>
  mutate(prev_avg_delay = lag(avg_delay)) |>
  filter(!is.na(avg_delay), !is.na(prev_avg_delay)) |>
  ggplot(aes(x = prev_avg_delay, y = avg_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Average delay in the previous hour",
    y = "Average delay in this hour"
  )
```

```{r}
# Mean departure delay and flight count for every (date, actual departure
# hour) pair, keeping only hours with more than five flights.
flights |>
  mutate(hour = dep_time %/% 100) |>
  group_by(year, month, day, hour) |>
  summarize(
    dep_delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) |>
  filter(n >= 6)
```

Look at each destination. Can you find flights that are suspiciously fast (i.e. flights that represent a potential data entry error)? Compute the air time of a flight relative to the shortest flight to that destination. Which flights were most delayed in the air?

```{r}
# First four rows per destination, sorted by air time with air_time moved to
# the front. The original was missing the pipe after filter(), so arrange()
# ran as a separate statement and failed because air_time was not in scope.
flights |>
  group_by(dest) |>
  filter(row_number() < 5) |>
  arrange(air_time) |>
  ungroup() |>
  relocate(air_time) |>
  print(n = 100)
  
# The single shortest flight (by air_time) to each destination.
flights |>
  group_by(dest) |>
  filter(row_number(air_time) == 1) |>
  ungroup() |>
  relocate(air_time, dep_time, arr_time, dest) |>
  view()

# Every flight's air time relative to the shortest recorded flight to the
# same destination.
flights |>
  group_by(dest) |>
  mutate(
    # shortest air time recorded for this destination
    shortest_flight = min(air_time, na.rm = TRUE),
    # this flight's air time as a multiple of that shortest time
    relative_shortest = air_time / shortest_flight
  ) |>
  ungroup() |>
  view()
  
```


Find all destinations that are flown by at least two carriers. Use those destinations to come up with a relative ranking of the carriers based on their performance for the same destination.

```{r}
# Flights to destinations served by at least two distinct carriers.
# The result stays grouped by dest, as in the original.
two_carriers <- flights |>
  group_by(dest) |>
  mutate(num_carriers = n_distinct(carrier)) |>
  filter(num_carriers >= 2)
   
  
```