-
Notifications
You must be signed in to change notification settings - Fork 0
/
gov_website.R
128 lines (110 loc) · 3.29 KB
/
gov_website.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
library(tidyverse)
library(jsonlite)
library(rvest)
library(httr2)
library(rio)
# Load every previously scraped result table and de-duplicate by record id,
# so already-seen records can be detected during the crawl below. All columns
# are coerced to character so pages with mixed column types bind cleanly.
done <- list.files(
  "/home/runner/work/auto_web_crawling/auto_web_crawling/data/table",
  pattern = "csv",
  full.names = TRUE
) |>
  map_dfr(\(path) {
    import(path, setclass = "tibble") |>
      mutate(across(everything(), as.character))
  }) |>
  distinct(id, .keep_all = TRUE)
# Fetch one page of the error-info search listing and return the raw JSON
# response as a string. insistently() retries the whole call with exponential
# backoff; possibly() converts any final failure into the sentinel "error!"
# so the calling loop never aborts.
extract_info <- possibly(
  insistently(
    \(page = 1) {
      url <- str_c(
        "https://zfwzzc.www.gov.cn/check_web/errorInfo_querySearch.action?term=&searchType=-1&pageNo=",
        page,
        "&pos=3&size=10&problemIdbg=-1"
      )
      req <- request(url)
      req <- req_timeout(req, 12)
      # httr2-level retries on top of the insistently() wrapper.
      req <- req_retry(req, max_tries = 5, max_seconds = 60, backoff = ~6)
      body <- resp_body_string(req_perform(req))
      # Be polite to the server: pause 1-2 seconds between requests.
      Sys.sleep(sample(1:2, 1))
      body
    },
    rate = rate_backoff(
      pause_base = 2,
      pause_cap = 60,
      pause_min = 1,
      max_times = 10,
      jitter = TRUE
    )
  ),
  otherwise = "error!"
)
# Total number of records the endpoint reports on page 1; this drives how
# many pages the crawl loop below visits (10 records per page).
total <- fromJSON(extract_info(1))[["iTotalDisplayRecords"]]
# info <- map(1:ceiling(total / 10), extract_info, .progress = TRUE)
# Crawl listing pages newest-first, stopping early once an entire page of
# ids is already in `done` (the listing is presumed ordered by recency —
# TODO confirm against the endpoint).
n_pages <- ceiling(as.numeric(total) / 10)
info <- vector("list", n_pages) # preallocate instead of growing in the loop
for (i in seq_len(n_pages)) {   # seq_len() is safe when n_pages == 0 (1:0 is not)
  info[[i]] <- extract_info(i)
  # possibly() yields the sentinel "error!" on failure; it is not valid JSON,
  # so skip it here instead of crashing inside fromJSON().
  if (identical(info[[i]], "error!")) next
  ids <- pluck(fromJSON(info[[i]]), "body", "id")
  # Stop once every id on the page is already known. all() also handles a
  # short final page, which the old rep(TRUE, 10) comparison never matched.
  if (length(ids) > 0 && all(ids %in% done$id)) break
}
# Drop failed fetches, parse each JSON page, and stack the "body" tables
# into one tibble with every column as character (keeps bind types uniform).
table <- info |>
  discard(\(x) identical(x, "error!")) |>
  map_dfr(\(page_json) {
    page_json |>
      fromJSON() |>
      pluck("body") |>
      mutate(across(everything(), as.character))
  })
# Persist today's snapshot; bom = TRUE so the UTF-8 CSV opens cleanly in Excel.
export(
  table,
  file = sprintf("data/table/%s_table.csv", Sys.Date()),
  bom = TRUE
)
# Fetch one detail page by URL and return the raw response body as a string.
# Same resilience strategy as extract_info: insistently() retries with
# backoff, possibly() maps any final failure to the sentinel "error!".
extract_contents <- possibly(
  insistently(
    \(link) {
      req <- request(link)
      req <- req_timeout(req, 12)
      # httr2-level retries on top of the insistently() wrapper.
      req <- req_retry(req, max_tries = 5, max_seconds = 60, backoff = ~6)
      body <- resp_body_string(req_perform(req))
      # Be polite to the server: pause 1-2 seconds between requests.
      Sys.sleep(sample(1:2, 1))
      body
    },
    rate = rate_backoff(
      pause_base = 2,
      pause_cap = 60,
      pause_min = 1,
      max_times = 10,
      jitter = TRUE
    )
  ),
  otherwise = "error!"
)
# Load all previously downloaded detail pages and de-duplicate by link, so
# only genuinely new links are fetched below.
done <- list.files(
  "/home/runner/work/auto_web_crawling/auto_web_crawling/data/contents",
  pattern = "contents",
  full.names = TRUE
) |>
  map_dfr(\(path) import(path, setclass = "tibble")) |>
  distinct(links, .keep_all = TRUE)
# Turn each record id into its detail-page URL and keep only links that have
# not been downloaded on a previous run.
table <- table |>
  transmute(
    links = sprintf(
      fmt = "https://zfwzzc.www.gov.cn/check_web/errorInfo_getErrorInfoList2.action?id=%s",
      id
    )
  ) |>
  anti_join(done, "links")

if (nrow(table) == 0) {
  # No new links today: write a marker file so the run still leaves a trace.
  export(
    tibble(info = "there is no new data"),
    file = sprintf("data/contents/%s_empty.csv", Sys.Date()),
    bom = TRUE
  )
} else {
  # Download each new detail page and save it alongside its link.
  contents <- table |>
    distinct(links, .keep_all = TRUE) |>
    mutate(contents = map_chr(links, extract_contents, .progress = TRUE))
  export(
    contents,
    file = sprintf("data/contents/%s_contents.csv", Sys.Date()),
    bom = TRUE
  )
}