forked from dgrtwo/data-screencasts
-
Notifications
You must be signed in to change notification settings - Fork 2
/
african-american-achievements.Rmd
131 lines (102 loc) · 2.59 KB
/
african-american-achievements.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
---
title: "African-American Achievements"
output: html_document
---
```{r setup, include=FALSE}
# Set the default chunk option `echo = TRUE` for every chunk in this document.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(tidyverse)

# Load the TidyTuesday release dated 2020-06-09 and keep a shortcut to its
# `science` table, which later chunks use directly.
tuesdata <- tidytuesdayR::tt_load("2020-06-09")
science <- tuesdata$science

# Use the light ggplot2 theme for every plot drawn afterwards.
theme_set(theme_light())
```
```{r}
# Open both raw tables in the data viewer for manual inspection.
# View() is an interactive tool, so it is skipped when the document is
# rendered non-interactively (e.g. knitted from the command line).
if (interactive()) {
  tuesdata$firsts %>%
    View()

  tuesdata$science %>%
    View()
}
```
```{r}
firsts <- tuesdata$firsts

# Histogram of the `year` column.
ggplot(firsts, aes(x = year)) +
  geom_histogram()

# Bar chart of the number of rows per category, with the category factor
# reordered by that count.
firsts %>%
  count(category, sort = TRUE) %>%
  mutate(category = fct_reorder(category, n)) %>%
  ggplot(aes(x = n, y = category)) +
  geom_col()
```
```{r}
# Clean the `person` column: drop the first "[" or "(" together with the rest
# of that line, then trim leading/trailing whitespace.
firsts <- tuesdata$firsts %>%
  mutate(
    person = person %>%
      str_remove("[\\[\\(].*") %>%
      str_trim()
  )
```
```{r}
library(plotly)
library(glue)

# Timeline with one point per row of `firsts`: year on the x axis, category on
# the y axis and as the point colour. The `text` aesthetic carries the label
# that ggplotly() is asked to show as the tooltip below.
g <- ggplot(
  firsts,
  aes(
    x = year,
    y = category,
    color = category,
    text = glue("{ year }: { accomplishment }\n{ person }")
  )
) +
  geom_point() +
  labs(
    x = "Year",
    y = "Category",
    title = "Timeline of some notable African-American achievements",
    subtitle = "Source: https://en.wikipedia.org/wiki/List_of_African-American_firsts"
  ) +
  # Hide the y-axis text, ticks and horizontal grid lines, and drop the legend.
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )

# Convert to an interactive plotly widget whose tooltip shows only `text`.
ggplotly(g, tooltip = "text")
```
### Science
```{r}
# Histogram of the `birth` column.
science %>%
  ggplot(aes(birth)) +
  geom_histogram()

# `occupation_s` is split on "; " into one row per piece; the pieces are
# title-cased and counted, most frequent first.
science %>%
  separate_rows(occupation_s, sep = "; ") %>%
  mutate(occupation = str_to_title(occupation_s)) %>%
  count(occupation, sort = TRUE)

# Names of the rows whose occupation contains "istician", ignoring case.
science %>%
  filter(str_detect(occupation_s, regex("istician", ignore_case = TRUE))) %>%
  pull(name)

# Inspect the matching rows in the data viewer. View() is an interactive tool,
# so it is skipped when the document is rendered non-interactively.
# NOTE(review): this match is case-sensitive, unlike the lookup just above;
# confirm that is intended.
if (interactive()) {
  science %>%
    filter(str_detect(occupation_s, "statistician")) %>%
    View()
}
```
```{r}
library(rvest)

# Call read_html() on every entry of `links` and store the results in a list
# column. possibly() turns a failed read into NULL instead of aborting the
# whole map; quiet = FALSE keeps the error messages visible.
science_html <- mutate(
  science,
  html = map(links, possibly(read_html, otherwise = NULL, quiet = FALSE))
)
```
```{r}
# Return the first `.vcard` node of a parsed page as a tibble.
# header = FALSE keeps the first table row as data instead of using it for
# column names.
extract_infobox <- function(page) {
  vcard <- html_node(page, ".vcard")
  as_tibble(html_table(vcard, header = FALSE))
}
# Long table of infobox entries: one row per (link, key, value).
infoboxes <- science_html %>%
  # Keep only the rows whose page was actually read.
  filter(map_lgl(html, ~ !is.null(.x))) %>%
  # A page on which extract_infobox() fails gets NULL here.
  mutate(infobox = map(html, possibly(extract_infobox, otherwise = NULL))) %>%
  select(link = links, infobox) %>%
  unnest(infobox) %>%
  # Drop rows where both cells are empty, and the "Scientific career" rows.
  filter(X1 != "" | X2 != "") %>%
  filter(X1 != "Scientific career") %>%
  rename(key = X1, value = X2)
# Wide table of infobox fields: one column per infobox key.
science_infoboxes <- infoboxes %>%
  # Label every row of a page with the first key of that page's infobox
  # (presumably the title row holding the person's name; verify).
  group_by(link) %>%
  mutate(name = first(key)) %>%
  # Keep only keys that occur in at least 10 rows overall.
  group_by(key) %>%
  filter(n() >= 10) %>%
  ungroup() %>%
  # Keep the first value for each (name, key) pair so every cell is unique.
  distinct(name, key, .keep_all = TRUE) %>%
  # pivot_wider() replaces the superseded spread(); names_sort = TRUE sorts the
  # new columns by key, as spread() did.
  pivot_wider(names_from = key, values_from = value, names_sort = TRUE) %>%
  janitor::clean_names()
```
```{r}
# Count the rows per `nationality` value, most frequent first.
count(science_infoboxes, nationality, sort = TRUE)
```