-
Notifications
You must be signed in to change notification settings - Fork 0
/
story_corps_interview_scraping.R
110 lines (91 loc) · 3.19 KB
/
story_corps_interview_scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Session-wide settings for this script.
options(
  # Bias number formatting towards fixed notation over scientific notation.
  scipen = 7,
  # Keep character data as character when data frames are built.
  stringsAsFactors = FALSE
)
library(httr)
library(dplyr)
# Negated membership test: for each element of `x`, TRUE when it has no match
# in `y`. Returns a logical vector the same length as `x`.
`%notin%` <- function(x, y) {
  # match() yields 0 for elements of `x` that do not occur in `y`.
  match(x, y, nomatch = 0L) == 0L
}
# Other endpoints noted alongside this script (not used below):
#   https://storycorps.org/wp-json/posts
#   https://storycorps.org/wp-json/stories

# Base URL of the paginated interviews endpoint; a page number is appended.
url <- "https://archive.storycorps.org/wp-json/interviews?page="
# Page numbers to request.
id <- seq_len(17311)
# Request the archive's stats endpoint, parse the response body with
# httr::content(), row-bind the parsed elements and return the result as a
# plain data.frame. Takes no arguments.
getStoryCorpsStats <- function() {
  stats_response <- GET("https://archive.storycorps.org/wp-json/stats")
  stats_payload <- content(stats_response)
  as.data.frame(bind_rows(stats_payload))
}
# Scrape the interviews endpoint page by page and accumulate the records in
# `sc_interview`. Relies on `url` (endpoint prefix) and `id` (page numbers)
# defined earlier in the script, and on httr / dplyr being attached.

# Return the unnamed values of the elements of `x` whose name equals `field`.
# For a list `x` with no such element (or a NULL `x`) the result is NULL,
# which data.frame() drops, so bind_rows() later fills that column with NA.
extract_named <- function(x, field) {
  unname(unlist(x[names(x) == field]))
}

# Convert one parsed interview record (a list) into a data.frame.
# `questions` holds every value named "text" found in the flattened
# `questions` element; when there are several, data.frame() recycles the
# length-one fields so each question text gets its own row. When no such value
# exists, `questions` is NA and a single row is produced.
interview_to_rows <- function(rec) {
  audio_length <- rec$interview_audio$length

  flat_questions <- unlist(rec$questions)
  question_text <- unname(flat_questions[names(flat_questions) %in% "text"])

  data.frame(
    id = rec$`_id`,
    title = rec$title,
    url = rec$url,
    description = rec$description,
    # Scalar if/else instead of ifelse(); `[1]` keeps the first value only,
    # which is what ifelse() with a length-one test returned.
    length = if (is.null(audio_length)) NA else audio_length[1],
    locality = extract_named(rec$location, "locality"),
    region = extract_named(rec$location, "region"),
    country = extract_named(rec$location, "country"),
    keywords = paste(rec$keywords, collapse = ", "),
    created_date = rec$created_date,
    plays = rec$plays,
    permit = extract_named(rec$permissions, "permit"),
    questions = if (length(question_text) == 0) NA else question_text,
    stringsAsFactors = FALSE
  )
}

begin <- Sys.time()
sc_interview <- data.frame()
for (i in id) {
  print(paste0("On page: ", i))

  response <- GET(paste0(url, i))
  # Fail with the HTTP status instead of trying to parse an error payload as
  # interview records.
  stop_for_status(response)
  tmp <- content(response)
  if (length(tmp) == 0) {
    stop("No data returned from API")
  }

  # Build all rows for the page, then bind once instead of growing row by row.
  dat <- bind_rows(lapply(tmp, interview_to_rows))

  # Merge after every page (newest page first, as before) so the rows
  # gathered so far remain in `sc_interview` if a later request aborts the run.
  sc_interview <- bind_rows(dat, sc_interview)
}
print(Sys.time() - begin)