diff --git a/2021_02_02_hbcu.Rmd b/2021_02_02_hbcu.Rmd new file mode 100644 index 0000000..5672eb1 --- /dev/null +++ b/2021_02_02_hbcu.Rmd @@ -0,0 +1,191 @@ +--- +title: "TidyTemplate" +date: 2021-02-02 +output: html_output +--- + +# TidyTuesday + +Join the R4DS Online Learning Community in the weekly #TidyTuesday event! +Every week we post a raw dataset, a chart or article related to that dataset, and ask you to explore the data. +While the dataset will be “tamed”, it will not always be tidy! As such you might need to apply various R for Data Science techniques to wrangle the data into a true tidy format. +The goal of TidyTuesday is to apply your R skills, get feedback, explore other’s work, and connect with the greater #RStats community! +As such we encourage everyone of all skills to participate! + +```{r setup, include=FALSE} + +knitr::opts_chunk$set(echo = TRUE) + +library(tidyverse) +library(tidytuesdayR) +theme_set(theme_light()) +library(scales) + +``` + +# Load the weekly Data + +Dowload the weekly data and make available in the `tt` object. + +```{r Load} + +tt <- tt_load("2021-02-02") + +hbcu_all_long <- tt$hbcu_all %>% + gather(metric, enrollment, -Year) %>% + rename(year = Year) + +hbcu_all_long %>% + filter(str_detect(metric, " - ")) %>% + separate(metric, c("degree_length", "type"), sep = " - ") %>% + filter(degree_length != "Total") %>% + ggplot(aes(year, enrollment, color = type)) + + geom_line() + + facet_wrap(~ degree_length) + + labs(y = "# enrolled in HBCU", + color = "") + +hbcu_all_long %>% + filter(metric %in% c("Males", "Females")) %>% + ggplot(aes(year, enrollment, color = metric)) + + geom_line() + + expand_limits(y = 0) + + labs(y = "# enrolled in HBCU", + color = "") +``` + +* Most degrees from HBCU are 4-year, and 2-year degrees are almost entirely from public schools +* More women than men enroll in HBCU, and that has been increasing over time since 1980s + +```{r} +hbcu_black_long <- tt$hbcu_black %>% + gather(metric, black_enrollment, -Year) %>% + rename(year = Year) + +hbcu_compare_long <- hbcu_all_long %>% + full_join(hbcu_black_long, by = c("year", "metric")) %>% + mutate(pct_black = black_enrollment / enrollment) + +hbcu_compare_long %>% + filter(metric == "Total enrollment") %>% + ggplot(aes(year, pct_black)) + + geom_line() + + scale_y_continuous(labels = percent) + + expand_limits(y = 0) + + labs(y = "% of HBCU enrollment that is Black") + + +hbcu_compare_long %>% + filter(metric %in% c("Males", "Females")) %>% + ggplot(aes(year, pct_black, color = metric)) + + geom_line() + + scale_y_continuous(labels = percent) + + expand_limits(y = 0) + + labs(y = "% of HBCU enrollment that is Black") + +hbcu_compare_long %>% + filter(str_detect(metric, "Total -")) %>% + mutate(metric = str_remove(metric, "Total - ")) %>% + ggplot(aes(year, pct_black, color = metric)) + + geom_line() + + scale_y_continuous(labels = percent) + + expand_limits(y = 0) + + labs(y = "% of HBCU enrollment that is Black", + color = "") +``` + +```{r} +gather_race_ethnicity <- function(tbl) { + tbl %>% + mutate_if(is.character, parse_number) %>% + rename(year = Total) %>% + filter(!is.na(year)) %>% + gather(race_ethnicity, value, -year) %>% + mutate(column = ifelse(str_detect(race_ethnicity, "Standard Errors - "), "standard_error", "percent"), + race_ethnicity = str_remove(race_ethnicity, "Standard Errors - ")) %>% + spread(column, value) %>% + mutate(standard_error = abs(standard_error)) %>% + filter(!is.na(percent)) %>% + mutate(race_ethnicity = str_remove(race_ethnicity, "1$"), + percent = percent / 100, + standard_error = standard_error / 100) +} + +hs_over_time <- tt$hs_students %>% + slice(-(1:3)) %>% + gather_race_ethnicity() + +bach_over_time <- tt$bach_students %>% + gather_race_ethnicity() + +education_over_time <- bind_rows(hs_over_time %>% mutate(degree = "High School"), + bach_over_time %>% mutate(degree = "Bachelor's")) + +hs_over_time %>% + mutate(race_ethnicity = fct_reorder(race_ethnicity, -percent)) %>% + ggplot(aes(year, percent, color = race_ethnicity)) + + geom_line() + + scale_y_continuous(labels = percent) + + labs(color = "Race/ethnicity", + y = "% of people aged >=25 who graduated HS") + + expand_limits(y = 0) + +bach_over_time %>% + mutate(race_ethnicity = fct_reorder(race_ethnicity, -percent)) %>% + ggplot(aes(year, percent, color = race_ethnicity)) + + geom_line() + + scale_y_continuous(labels = percent) + + labs(color = "Race/ethnicity", + y = "% of people aged >=25 who graduated a bachelor's program") + + expand_limits(y = 0) + +education_over_time %>% + filter(year >= 1940, + !str_detect(race_ethnicity, "Islander -")) %>% + mutate(degree = fct_relevel(degree, "High School"), + race_ethnicity = str_remove(race_ethnicity, "Total - ")) %>% + mutate(race_ethnicity = fct_reorder(race_ethnicity, percent, last, .desc = TRUE)) %>% + ggplot(aes(year, percent, color = race_ethnicity)) + + geom_line() + + facet_wrap(~ degree) + + scale_y_continuous(labels = percent) + + labs(x = "Year", + color = "Race/ethnicity", + y = "% of people aged >=25 who have this degree") + + expand_limits(y = 0) +``` + +Bring in a new dataset on fields + +```{r} +a25 <- readxl::read_excel("~/Downloads/A-25.xls") + +a25_cleaned <- a25 %>% + select(-starts_with("...")) %>% + rename(field_gender = 1) %>% + mutate(group = cumsum(is.na(field_gender))) %>% + filter(!is.na(field_gender)) %>% + select(group, everything()) %>% + mutate(field_gender = str_remove(field_gender, " \\.\\.\\..*")) %>% + group_by(group) %>% + mutate(field = first(field_gender), + gender = ifelse(field_gender %in% c("Men", "Women"), field_gender, "Total")) %>% + ungroup() %>% + select(field, gender, everything()) %>% + select(-field_gender, -group) + +a25_cleaned %>% + select(field, gender, contains("HBCU")) %>% + rename(pct_hbcu_total = 3, + pct_hbcu_black = 4) %>% + filter(gender != "Total") %>% + mutate(field = fct_reorder(field, pct_hbcu_black, na.rm = TRUE), + pct_hbcu_black = pct_hbcu_black / 100) %>% + ggplot(aes(pct_hbcu_black, field, fill = gender)) + + geom_col(position = "dodge") + + scale_x_continuous(labels = percent) + + labs(x = "% of first degrees from an HBCU, among Black students", + y = "Field", + fill = "") +``` + diff --git a/2021_02_09_lifetime_earn.Rmd b/2021_02_09_lifetime_earn.Rmd new file mode 100644 index 0000000..414e3e5 --- /dev/null +++ b/2021_02_09_lifetime_earn.Rmd @@ -0,0 +1,190 @@ +--- +title: "TidyTemplate" +date: 2021-02-09 +output: html_output +--- + +# TidyTuesday + +Join the R4DS Online Learning Community in the weekly #TidyTuesday event! +Every week we post a raw dataset, a chart or article related to that dataset, and ask you to explore the data. +While the dataset will be “tamed”, it will not always be tidy! As such you might need to apply various R for Data Science techniques to wrangle the data into a true tidy format. +The goal of TidyTuesday is to apply your R skills, get feedback, explore other’s work, and connect with the greater #RStats community! +As such we encourage everyone of all skills to participate! + +```{r setup, include=FALSE} + +knitr::opts_chunk$set(echo = TRUE) + +library(tidyverse) +library(tidytuesdayR) +library(scales) +theme_set(theme_light()) + +``` + +# Load the weekly Data + +Dowload the weekly data and make available in the `tt` object. + +```{r Load} +tt <- tt_load("2021-02-09") +``` + +```{r} +# Let's make one graph of each dataset + +tt$lifetime_earn %>% + ggplot(aes(lifetime_earn, race, fill = gender)) + + geom_col(position = "dodge") + + scale_x_continuous(labels = dollar) + +plot_by_race <- function(data, column, labels = dollar, ...) { + last_year <- data %>% + group_by(race) %>% + top_n(1, year) + + data %>% + mutate(race = fct_reorder(race, -{{ column }}, last)) %>% + ggplot(aes(year, {{ column }}, color = race, ...)) + + geom_line() + + geom_text(aes(label = race, color = NULL), + hjust = 0, data = last_year, + nudge_x = .2) + + expand_limits(y = 0, + x = 2020) + + scale_y_continuous(labels = labels) + + labs(x = "Year", + color = "Race") + + theme(legend.position = "none") +} + +tt$student_debt %>% + plot_by_race(loan_debt_pct, labels = percent) + + labs(y = "% of families with student loan debt") + +tt$student_debt %>% + plot_by_race(loan_debt) + + labs(y = "Average family student loan debt for aged 25-55 (2016 dollars)") + +tt$retirement %>% + plot_by_race(retirement) + + labs(y = "Average family liquid retirement savings (2016 dollars)") + +tt$home_owner %>% + plot_by_race(home_owner_pct, labels = percent) + + labs(y = "Home ownership percentage") + +tt$race_wealth %>% + plot_by_race(wealth_family) + + facet_wrap(~ type, scales = "free_y") + + expand_limits(x = 2025) + + labs(y = "Family wealth (2016 dollars)") + +tt$income_time %>% + spread(percentile, income_family) %>% + ggplot(aes(year, `50th`, ymin = `10th`, ymax = `90th`)) + + geom_line() + + geom_ribbon(alpha = .25) + + expand_limits(y = 0) + + scale_y_continuous(labels = dollar) + + labs(x = "Year", y = "Family income (median with 10th and 90th percentiles)") + +tt$income_limits %>% + filter(dollar_type == "2019 Dollars", + !str_detect(race, "or in Combination")) %>% + distinct(race, year, income_quintile, .keep_all = TRUE) %>% + spread(income_quintile, income_dollars) %>% + mutate(race = fct_reorder(race, -Fourth)) %>% + ggplot(aes(year, ymin = Lowest, ymax = Fourth, fill = race)) + + geom_ribbon(alpha = .25) + + expand_limits(y = 0) + + scale_y_continuous(labels = dollar) + + labs(y = "20th-80th income quantiles") + +tt$income_limits %>% + filter(dollar_type == "2019 Dollars", + !str_detect(race, "or in Combination")) %>% + distinct(race, year, income_quintile, .keep_all = TRUE) %>% + mutate(income_quintile = fct_reorder(income_quintile, -income_dollars)) %>% + ggplot(aes(year, income_dollars, color = income_quintile)) + + geom_line() + + facet_wrap(~ race) + + scale_y_continuous(labels = dollar) + + labs(y = "Income quintile limit", + color = "") + +tt$income_limits %>% + filter(dollar_type == "2019 Dollars", + !str_detect(race, "or in Combination")) %>% + distinct(race, year, income_quintile, .keep_all = TRUE) %>% + mutate(income_quintile = fct_reorder(income_quintile, income_dollars), + race = fct_reorder(race, -income_dollars, last)) %>% + ggplot(aes(year, income_dollars, color = race)) + + geom_line() + + facet_wrap(~ income_quintile) + + scale_y_continuous(labels = dollar) + + labs(y = "Income quintile limit", + color = "") + +tt$income_mean %>% + filter(dollar_type == "2019 Dollars", + !str_detect(race, "or in Combination")) %>% + distinct(race, year, income_quintile, .keep_all = TRUE) %>% + mutate(income_quintile = fct_reorder(income_quintile, income_dollars), + race = fct_reorder(race, -income_dollars, last)) %>% + ggplot(aes(year, income_dollars, color = race)) + + geom_line() + + facet_wrap(~ income_quintile, scales = "free_y") + + scale_y_continuous(labels = dollar) + + expand_limits(y = 0) + + labs(y = "Income quintile", + color = "") + +# library(plotly) +# ggplotly(g) +``` + +```{r} +tt$income_aggregate %>% + filter(income_quintile != "Top 5%", + !str_detect(race, "Combination")) %>% + mutate(income_share = income_share / 100, + income_quintile = fct_inorder(income_quintile)) %>% + ggplot(aes(year, income_share, fill = income_quintile)) + + geom_area() + + facet_wrap(~ race) + + scale_y_continuous(labels = percent) + + labs(x = "", + y = "% share of income", + fill = "Income quintile", + title = "Income distribution over time") + +tt$income_aggregate %>% + filter(income_quintile == "Top 5%", + !str_detect(race, "Combination")) %>% + mutate(income_share = income_share / 100) %>% + plot_by_race(income_share, labels = percent) + + labs(y = "Share of income earned by the top 5%") + +tt$income_distribution %>% + filter(!str_detect(race, "Combination")) %>% + mutate(income_distribution = income_distribution / 100, + income_bracket = fct_inorder(income_bracket)) %>% + ggplot(aes(year, income_distribution, fill = income_bracket)) + + geom_area() + + facet_wrap(~ race) + + scale_y_continuous(labels = percent) + + labs(x = "", + y = "% share of income", + fill = "Income bracket", + title = "Income distribution over time") +``` + +```{r} +tt$income_distribution %>% + View() +``` + + + diff --git a/2021_02_23_employment_earnings.Rmd b/2021_02_23_employment_earnings.Rmd new file mode 100644 index 0000000..f04f651 --- /dev/null +++ b/2021_02_23_employment_earnings.Rmd @@ -0,0 +1,239 @@ +--- +title: "TidyTemplate" +date: 2021-02-23 +output: html_output +--- + +# TidyTuesday + +Join the R4DS Online Learning Community in the weekly #TidyTuesday event! +Every week we post a raw dataset, a chart or article related to that dataset, and ask you to explore the data. +While the dataset will be “tamed”, it will not always be tidy! As such you might need to apply various R for Data Science techniques to wrangle the data into a true tidy format. +The goal of TidyTuesday is to apply your R skills, get feedback, explore other’s work, and connect with the greater #RStats community! +As such we encourage everyone of all skills to participate! + +```{r setup, include=FALSE} + +knitr::opts_chunk$set(echo = TRUE) + +library(tidyverse) +library(tidytuesdayR) +library(scales) +theme_set(theme_light()) + +``` + +# Load the weekly Data + +Dowload the weekly data and make available in the `tt` object. + +```{r Load} + +tt <- tt_load("2021-02-23") + +earn <- tt$earn + +employed <- tt$employed %>% + mutate(dimension = case_when( + race_gender == "TOTAL" ~ "Total", + race_gender %in% c("Men", "Women") ~ "Gender", + TRUE ~ "Race" + )) +``` + +```{r} +employed_cleaned <- employed %>% + filter(!is.na(employ_n)) %>% + mutate(industry = fct_lump(industry, 15, w = employ_n), + industry = fct_reorder(industry, employ_n, sum)) + +employed_cleaned %>% + filter(dimension == "Total") %>% + ggplot(aes(year, employ_n, fill = industry)) + + geom_col() + + scale_y_continuous(labels = comma) + + labs(y = "# employed in industry", + x = "Year") + +employed_cleaned %>% + filter(dimension == "Gender") %>% + group_by(industry, year, race_gender) %>% + summarize(employ_n = sum(employ_n)) %>% + ggplot(aes(year, employ_n, color = race_gender)) + + geom_line() + + facet_wrap(~ industry, scales = "free_y") + + expand_limits(y = 0) + + scale_y_continuous(labels = comma) + + labs(y = "# employed in industry", + x = "Year", + color = "Gender") + +employed_cleaned %>% + filter(year == 2020, + industry == "Wholesale and retail trade") %>% + View() +``` + +Not a ton of 2015-2019 variation; let's look at 2019 to 2020 + +```{r} +comparison <- employed_cleaned %>% + filter(year %in% c(2019, 2020)) %>% + mutate(major_occupation = paste(industry, major_occupation, sep = " - "), + minor_occupation = paste(major_occupation, minor_occupation, sep = " - ")) %>% + gather(level, occupation, industry, major_occupation, minor_occupation) %>% + group_by(dimension, race_gender, level, occupation, year) %>% + summarize(employ_n = sum(employ_n)) %>% + ungroup() %>% + arrange(year) %>% + group_by(dimension, level, occupation, race_gender) %>% + summarize(ratio = last(employ_n) / first(employ_n), + change = ratio - 1, + employed_2019 = first(employ_n)) %>% + group_by(dimension, level, occupation) %>% + mutate(total_2019 = sum(employed_2019)) %>% + ungroup() + +comparison %>% + View() + +comparison %>% + filter(dimension == "Total", level == "industry") %>% + mutate(occupation = fct_reorder(occupation, change)) %>% + ggplot(aes(change, occupation)) + + geom_col() + + scale_x_continuous(labels = percent) + + labs(title = "What industries suffered the most in 2020?", + x = "Shift in # employed from 2019 to 2020 (BLS)", + y = "") + +library(glue) + +compare_lollipop <- function(tbl) { + tbl %>% + mutate(occupation = glue("{ occupation } ({ comma(total_2019 / 1000) }K)"), + occupation = fct_reorder(occupation, change)) %>% + ggplot(aes(change, occupation)) + + geom_errorbarh(aes(xmin = 0, xmax = change, color = race_gender), + height = 0, + position = position_dodge(width = .7)) + + geom_point(aes(size = employed_2019, color = race_gender), + position = position_dodge(width = .7)) + + geom_vline(lty = 2, xintercept = 0) + + scale_x_continuous(labels = percent) + + scale_color_discrete(guide = guide_legend(reverse = TRUE)) + + scale_size_continuous(labels = comma, guide = FALSE) + + labs(x = "Shift in # employed from 2019 to 2020 (BLS)", + y = "", + color = "", + size = "# employed 2019") +} + +comparison %>% + filter(dimension == "Gender", level == "industry") %>% + compare_lollipop() + +comparison %>% + filter(dimension == "Race", level == "industry") %>% + compare_lollipop() + + labs(title = "What industries suffered the most in 2020?", + subtitle = "Size of point represents # employed in industry in 2019") + +comparison %>% + filter(dimension == "Race", level == "major_occupation") %>% + separate(occupation, c("industry", "occupation"), sep = " - ") %>% + filter(industry == "Construction", + employed_2019 >= 10000) %>% + compare_lollipop() + +comparison %>% + filter(dimension == "Gender", level == "major_occupation") %>% + separate(occupation, c("industry", "occupation"), sep = " - ") %>% + filter(industry == "Construction", + employed_2019 >= 10000) %>% + compare_lollipop() + +comparison %>% + filter(dimension == "Race", + level == "major_occupation") %>% + separate(occupation, c("industry", "occupation"), sep = " - ") %>% + filter(industry == "Leisure and hospitality", + employed_2019 >= 10000) %>% + compare_lollipop() + + labs(title = "Within 'Leisure and hospitality', what occupations suffered?") + +compare_2019_2020 %>% + filter(dimension == "Gender") %>% + mutate(industry = fct_reorder(industry, change)) %>% + ggplot(aes(change, industry, fill = race_gender)) + + geom_col(position = "dodge") + + scale_x_continuous(labels = percent) + + scale_fill_discrete(guide = guide_legend(reverse = TRUE)) + + labs(title = "What industries suffered the most in 2020?", + x = "Shift in # employed from 2019 to 2020 (BLS)", + y = "", + fill = "Gender") + +compare_2019_2020 %>% + filter(dimension == "Race") %>% + mutate(industry = fct_reorder(industry, change)) %>% + ggplot(aes(change, industry, fill = race_gender)) + + geom_col(position = "dodge") + + scale_x_continuous(labels = percent) + + scale_fill_discrete(guide = guide_legend(reverse = TRUE)) + + labs(title = "What industries suffered the most in 2020?", + x = "Shift in # employed from 2019 to 2020 (BLS)", + y = "", + fill = "Race") + +compare_2019_2020 %>% + filter(dimension == "Race") + +compare_2019_2020 %>% + filter(dimension == "Gender") %>% + mutate(industry = fct_reorder(industry, change)) %>% + ggplot(aes(change, industry)) + + geom_errorbarh(aes(xmin = 0, xmax = change, color = race_gender), + height = 0, + position = position_dodge(width = .7)) + + geom_point(aes(size = employed_2019, color = race_gender), + position = position_dodge(width = .7)) + + geom_vline(lty = 2, xintercept = 0) + + scale_x_continuous(labels = percent) + + scale_color_discrete(guide = guide_legend(reverse = TRUE)) + + scale_size_continuous(labels = comma, guide = FALSE) + + labs(title = "What industries suffered the most in 2020?", + subtitle = "Size of point represents # employed in industry in 2019", + x = "Shift in # employed from 2019 to 2020 (BLS)", + y = "", + color = "Gender", + size = "# employed 2019") +``` + +```{r} +library(ggrepel) + +compare_2019_2020 %>% + filter(dimension == "Total") %>% + mutate(industry = fct_reorder(industry, change)) %>% + ggplot(aes(employed_2019, change)) + + geom_point() + + geom_text_repel(aes(label = industry)) + + geom_hline(lty = 2, yintercept = 0) + + scale_x_continuous(labels = comma) + + scale_y_continuous(labels = percent) +``` + +```{r} +employed_cleaned %>% + filter(year %in% c(2019, 2020)) %>% + filter(industry == "Construction") %>% + group_by(dimension, race_gender, major_occupation, year) %>% + summarize(employ_n = sum(employ_n)) %>% + arrange(year) %>% + summarize(ratio = last(employ_n) / first(employ_n), + change = ratio - 1, + employed_2019 = first(employ_n)) %>% + ungroup() +``` +