Added code for last three screencasts

dgrtwo · Feb 23, 2021 · 129816b · 129816b
1 parent 29ed41a
commit 129816b
Show file tree

Hide file tree

Showing 3 changed files with 620 additions and 0 deletions.
diff --git a/2021_02_02_hbcu.Rmd b/2021_02_02_hbcu.Rmd
@@ -0,0 +1,191 @@
+---
+title: "TidyTemplate"
+date: 2021-02-02
+output: html_output
+---
+
+# TidyTuesday
+
+Join the R4DS Online Learning Community in the weekly #TidyTuesday event!
+Every week we post a raw dataset, a chart or article related to that dataset, and ask you to explore the data.
+While the dataset will be “tamed”, it will not always be tidy! As such you might need to apply various R for Data Science techniques to wrangle the data into a true tidy format.
+The goal of TidyTuesday is to apply your R skills, get feedback, explore other’s work, and connect with the greater #RStats community!
+As such we encourage everyone of all skills to participate!
+
+```{r setup, include=FALSE}
+
+knitr::opts_chunk$set(echo = TRUE)
+
+library(tidyverse)
+library(tidytuesdayR)
+theme_set(theme_light())
+library(scales)
+
+```
+
+# Load the weekly Data
+
+Dowload the weekly data and make available in the `tt` object.
+
+```{r Load}
+
+tt <- tt_load("2021-02-02")
+
+hbcu_all_long <- tt$hbcu_all %>%
+  gather(metric, enrollment, -Year) %>%
+  rename(year = Year)
+
+hbcu_all_long %>%
+  filter(str_detect(metric, " - ")) %>%
+  separate(metric, c("degree_length", "type"), sep = " - ") %>%
+  filter(degree_length != "Total") %>%
+  ggplot(aes(year, enrollment, color = type)) +
+  geom_line() +
+  facet_wrap(~ degree_length) +
+  labs(y = "# enrolled in HBCU",
+       color = "")
+
+hbcu_all_long %>%
+  filter(metric %in% c("Males", "Females")) %>%
+  ggplot(aes(year, enrollment, color = metric)) +
+  geom_line() +
+  expand_limits(y = 0) +
+  labs(y = "# enrolled in HBCU",
+       color = "")
+```
+
+* Most degrees from HBCU are 4-year, and 2-year degrees are almost entirely from public schools
+* More women than men enroll in HBCU, and that has been increasing over time since 1980s
+
+```{r}
+hbcu_black_long <- tt$hbcu_black %>%
+  gather(metric, black_enrollment, -Year) %>%
+  rename(year = Year)
+
+hbcu_compare_long <- hbcu_all_long %>%
+  full_join(hbcu_black_long, by = c("year", "metric")) %>%
+  mutate(pct_black = black_enrollment / enrollment)
+
+hbcu_compare_long %>%
+  filter(metric == "Total enrollment") %>%
+  ggplot(aes(year, pct_black)) +
+  geom_line() +
+  scale_y_continuous(labels = percent) +
+  expand_limits(y = 0) +
+  labs(y = "% of HBCU enrollment that is Black")
+
+
+hbcu_compare_long %>%
+  filter(metric %in% c("Males", "Females")) %>%
+  ggplot(aes(year, pct_black, color = metric)) +
+  geom_line() +
+  scale_y_continuous(labels = percent) +
+  expand_limits(y = 0) +
+  labs(y = "% of HBCU enrollment that is Black")
+
+hbcu_compare_long %>%
+  filter(str_detect(metric, "Total -")) %>%
+  mutate(metric = str_remove(metric, "Total - ")) %>%
+  ggplot(aes(year, pct_black, color = metric)) +
+  geom_line() +
+  scale_y_continuous(labels = percent) +
+  expand_limits(y = 0) +
+  labs(y = "% of HBCU enrollment that is Black",
+       color = "")
+```
+
+```{r}
+gather_race_ethnicity <- function(tbl) {
+  tbl %>%
+    mutate_if(is.character, parse_number) %>%
+    rename(year = Total) %>%
+    filter(!is.na(year)) %>%
+    gather(race_ethnicity, value, -year) %>%
+    mutate(column = ifelse(str_detect(race_ethnicity, "Standard Errors - "), "standard_error", "percent"),
+           race_ethnicity = str_remove(race_ethnicity, "Standard Errors - ")) %>%
+    spread(column, value) %>%
+    mutate(standard_error = abs(standard_error)) %>%
+    filter(!is.na(percent)) %>%
+    mutate(race_ethnicity = str_remove(race_ethnicity, "1$"),
+           percent = percent / 100,
+           standard_error = standard_error / 100)
+}
+
+hs_over_time <- tt$hs_students %>%
+  slice(-(1:3)) %>%
+  gather_race_ethnicity()
+
+bach_over_time <- tt$bach_students %>%
+  gather_race_ethnicity()
+
+education_over_time <- bind_rows(hs_over_time %>% mutate(degree = "High School"),
+                                 bach_over_time %>% mutate(degree = "Bachelor's"))
+
+hs_over_time %>%
+  mutate(race_ethnicity = fct_reorder(race_ethnicity, -percent)) %>%
+  ggplot(aes(year, percent, color = race_ethnicity)) +
+  geom_line() +
+  scale_y_continuous(labels = percent) +
+  labs(color = "Race/ethnicity",
+       y = "% of people aged >=25 who graduated HS") +
+  expand_limits(y = 0)
+
+bach_over_time %>%
+  mutate(race_ethnicity = fct_reorder(race_ethnicity, -percent)) %>%
+  ggplot(aes(year, percent, color = race_ethnicity)) +
+  geom_line() +
+  scale_y_continuous(labels = percent) +
+  labs(color = "Race/ethnicity",
+       y = "% of people aged >=25 who graduated a bachelor's program") +
+  expand_limits(y = 0)
+
+education_over_time %>%
+  filter(year >= 1940,
+         !str_detect(race_ethnicity, "Islander -")) %>%
+  mutate(degree = fct_relevel(degree, "High School"),
+         race_ethnicity = str_remove(race_ethnicity, "Total - ")) %>%
+  mutate(race_ethnicity = fct_reorder(race_ethnicity, percent, last, .desc = TRUE)) %>%
+  ggplot(aes(year, percent, color = race_ethnicity)) +
+  geom_line() +
+  facet_wrap(~ degree) +
+  scale_y_continuous(labels = percent) +
+  labs(x = "Year",
+       color = "Race/ethnicity",
+       y = "% of people aged >=25 who have this degree") +
+  expand_limits(y = 0)
+```
+
+Bring in a new dataset on fields
+
+```{r}
+a25 <- readxl::read_excel("~/Downloads/A-25.xls")
+
+a25_cleaned <- a25 %>%
+  select(-starts_with("...")) %>%
+  rename(field_gender = 1) %>%
+  mutate(group = cumsum(is.na(field_gender))) %>%
+  filter(!is.na(field_gender)) %>%
+  select(group, everything()) %>%
+  mutate(field_gender = str_remove(field_gender, " \\.\\.\\..*")) %>%
+  group_by(group) %>%
+  mutate(field = first(field_gender),
+         gender = ifelse(field_gender %in% c("Men", "Women"), field_gender, "Total")) %>%
+  ungroup() %>%
+  select(field, gender, everything()) %>%
+  select(-field_gender, -group)
+
+a25_cleaned %>%
+  select(field, gender, contains("HBCU")) %>%
+  rename(pct_hbcu_total = 3,
+         pct_hbcu_black = 4) %>%
+  filter(gender != "Total") %>%
+  mutate(field = fct_reorder(field, pct_hbcu_black, na.rm = TRUE),
+         pct_hbcu_black = pct_hbcu_black / 100) %>%
+  ggplot(aes(pct_hbcu_black, field, fill = gender)) +
+  geom_col(position = "dodge") +
+  scale_x_continuous(labels = percent) +
+  labs(x = "% of first degrees from an HBCU, among Black students",
+       y = "Field",
+       fill = "")
+```
+
diff --git a/2021_02_09_lifetime_earn.Rmd b/2021_02_09_lifetime_earn.Rmd
@@ -0,0 +1,190 @@
+---
+title: "TidyTemplate"
+date: 2021-02-09
+output: html_output
+---
+
+# TidyTuesday
+
+Join the R4DS Online Learning Community in the weekly #TidyTuesday event!
+Every week we post a raw dataset, a chart or article related to that dataset, and ask you to explore the data.
+While the dataset will be “tamed”, it will not always be tidy! As such you might need to apply various R for Data Science techniques to wrangle the data into a true tidy format.
+The goal of TidyTuesday is to apply your R skills, get feedback, explore other’s work, and connect with the greater #RStats community!
+As such we encourage everyone of all skills to participate!
+
+```{r setup, include=FALSE}
+
+knitr::opts_chunk$set(echo = TRUE)
+
+library(tidyverse)
+library(tidytuesdayR)
+library(scales)
+theme_set(theme_light())
+
+```
+
+# Load the weekly Data
+
+Dowload the weekly data and make available in the `tt` object.
+
+```{r Load}
+tt <- tt_load("2021-02-09")
+```
+
+```{r}
+# Let's make one graph of each dataset
+
+tt$lifetime_earn %>%
+  ggplot(aes(lifetime_earn, race, fill = gender)) +
+  geom_col(position = "dodge") +
+  scale_x_continuous(labels = dollar)
+
+plot_by_race <- function(data, column, labels = dollar, ...) {
+  last_year <- data %>%
+    group_by(race) %>%
+    top_n(1, year)
+  
+  data %>%
+    mutate(race = fct_reorder(race, -{{ column }}, last)) %>%
+    ggplot(aes(year, {{ column }}, color = race, ...)) +
+    geom_line() +
+    geom_text(aes(label = race, color = NULL),
+              hjust = 0, data = last_year,
+              nudge_x = .2) +
+    expand_limits(y = 0,
+                  x = 2020) +
+    scale_y_continuous(labels = labels) +
+    labs(x = "Year",
+         color = "Race") +
+    theme(legend.position = "none")
+}
+
+tt$student_debt %>%
+  plot_by_race(loan_debt_pct, labels = percent) +
+  labs(y = "% of families with student loan debt")
+
+tt$student_debt %>%
+  plot_by_race(loan_debt) +
+  labs(y = "Average family student loan debt for aged 25-55 (2016 dollars)")
+
+tt$retirement %>%
+  plot_by_race(retirement) +
+  labs(y = "Average family liquid retirement savings (2016 dollars)")
+
+tt$home_owner %>%
+  plot_by_race(home_owner_pct, labels = percent) +
+  labs(y = "Home ownership percentage")
+
+tt$race_wealth %>%
+  plot_by_race(wealth_family) +
+  facet_wrap(~ type, scales = "free_y") +
+  expand_limits(x = 2025) +
+  labs(y = "Family wealth (2016 dollars)")
+
+tt$income_time %>%
+  spread(percentile, income_family) %>%
+  ggplot(aes(year, `50th`, ymin = `10th`, ymax = `90th`)) +
+  geom_line() +
+  geom_ribbon(alpha = .25) +
+  expand_limits(y = 0) +
+  scale_y_continuous(labels = dollar) +
+  labs(x = "Year", y = "Family income (median with 10th and 90th percentiles)")
+
+tt$income_limits %>%
+  filter(dollar_type == "2019 Dollars",
+         !str_detect(race, "or in Combination")) %>%
+  distinct(race, year, income_quintile, .keep_all = TRUE) %>%
+  spread(income_quintile, income_dollars) %>%
+  mutate(race = fct_reorder(race, -Fourth)) %>%
+  ggplot(aes(year, ymin = Lowest, ymax = Fourth, fill = race)) +
+  geom_ribbon(alpha = .25) +
+  expand_limits(y = 0) +
+  scale_y_continuous(labels = dollar) +
+  labs(y = "20th-80th income quantiles")
+
+tt$income_limits %>%
+  filter(dollar_type == "2019 Dollars",
+         !str_detect(race, "or in Combination")) %>%
+  distinct(race, year, income_quintile, .keep_all = TRUE) %>%
+  mutate(income_quintile = fct_reorder(income_quintile, -income_dollars)) %>%
+  ggplot(aes(year, income_dollars, color = income_quintile)) +
+  geom_line() +
+  facet_wrap(~ race) +
+  scale_y_continuous(labels = dollar) +
+  labs(y = "Income quintile limit",
+       color = "")
+
+tt$income_limits %>%
+  filter(dollar_type == "2019 Dollars",
+         !str_detect(race, "or in Combination")) %>%
+  distinct(race, year, income_quintile, .keep_all = TRUE) %>%
+  mutate(income_quintile = fct_reorder(income_quintile, income_dollars),
+         race = fct_reorder(race, -income_dollars, last)) %>%
+  ggplot(aes(year, income_dollars, color = race)) +
+  geom_line() +
+  facet_wrap(~ income_quintile) +
+  scale_y_continuous(labels = dollar) +
+  labs(y = "Income quintile limit",
+       color = "")
+
+tt$income_mean %>%
+  filter(dollar_type == "2019 Dollars",
+         !str_detect(race, "or in Combination")) %>%
+  distinct(race, year, income_quintile, .keep_all = TRUE) %>%
+  mutate(income_quintile = fct_reorder(income_quintile, income_dollars),
+         race = fct_reorder(race, -income_dollars, last)) %>%
+  ggplot(aes(year, income_dollars, color = race)) +
+  geom_line() +
+  facet_wrap(~ income_quintile, scales = "free_y") +
+  scale_y_continuous(labels = dollar) +
+  expand_limits(y = 0) +
+  labs(y = "Income quintile",
+       color = "")
+
+# library(plotly)
+# ggplotly(g)
+```
+
+```{r}
+tt$income_aggregate %>%
+  filter(income_quintile != "Top 5%",
+         !str_detect(race, "Combination")) %>%
+  mutate(income_share = income_share / 100,
+         income_quintile = fct_inorder(income_quintile)) %>%
+  ggplot(aes(year, income_share, fill = income_quintile)) +
+  geom_area() +
+  facet_wrap(~ race) +
+  scale_y_continuous(labels = percent) +
+  labs(x = "",
+       y = "% share of income",
+       fill = "Income quintile",
+       title = "Income distribution over time")
+
+tt$income_aggregate %>%
+  filter(income_quintile == "Top 5%",
+         !str_detect(race, "Combination")) %>%
+  mutate(income_share = income_share / 100) %>%
+  plot_by_race(income_share, labels = percent) +
+  labs(y = "Share of income earned by the top 5%")
+
+tt$income_distribution %>%
+  filter(!str_detect(race, "Combination")) %>%
+  mutate(income_distribution = income_distribution / 100,
+         income_bracket = fct_inorder(income_bracket)) %>%
+  ggplot(aes(year, income_distribution, fill = income_bracket)) +
+  geom_area() +
+  facet_wrap(~ race) +
+  scale_y_continuous(labels = percent) +
+  labs(x = "",
+       y = "% share of income",
+       fill = "Income bracket",
+       title = "Income distribution over time")
+```
+
+```{r}
+tt$income_distribution %>%
+  View()
+```
+
+
+