Skip to content

Commit

Permalink
Update shanghai_petition.R
Browse files Browse the repository at this point in the history
  • Loading branch information
xinzhuohkust committed Jul 18, 2023
1 parent f1eab92 commit 22446d0
Showing 1 changed file with 22 additions and 5 deletions.
27 changes: 22 additions & 5 deletions shanghai_petition.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ library(rvest)
library(httr2)
library(rio)

# Read every file in the saved-table directory whose name matches "csv",
# stack them into one tibble, and keep a single row per record id.
done <- "/home/runner/work/auto_web_crawling/auto_web_crawling/data/table" |>
  list.files(pattern = "csv", full.names = TRUE) |>
  map_dfr(\(csv_path) import(csv_path, setclass = "tibble")) |>
  distinct(id, .keep_all = TRUE)

extract_info <- possibly(
\(page) {
link <- str_c("https://zfwzzc.www.gov.cn/check_web/errorInfo_querySearch.action?term=&searchType=-1&pageNo=", page, "&pos=3&size=10&problemIdbg=-1")
Expand Down Expand Up @@ -32,7 +36,20 @@ total <- extract_info(1) %>%
fromJSON() %>%
pluck("iTotalDisplayRecords")

info <- map(1:ceiling(total / 10), extract_info, .progress = TRUE)
# info <- map(1:ceiling(total / 10), extract_info, .progress = TRUE)

# Fetch result pages in order, stopping early once a page contains only
# records whose ids are already present in `done`.
# The listing returns 10 records per page (size=10 in the request URL), so
# the number of pages is the record count divided by 10, rounded up.
n_pages <- ceiling(total / 10)

info <- list()
for (i in seq_len(n_pages)) {
  info[[i]] <- extract_info(i)

  # A failed fetch is stored as the "error!" sentinel (the value filtered out
  # when `table` is built below). It is not valid JSON, so skip the
  # already-seen check for that page instead of aborting the whole crawl.
  if (identical(info[[i]], "error!")) {
    next
  }

  page_ids <- pluck(fromJSON(info[[i]]), "body", "id")

  # all() also handles a short final page (fewer than 10 ids); the length
  # guard keeps an empty page from counting as "everything already seen".
  if (length(page_ids) > 0 && all(page_ids %in% done$id)) {
    break
  }
}

# Discard pages recorded as "error!", then parse each remaining JSON page
# and row-bind its "body" element into one data frame.
table <- info |>
  keep(\(page_json) page_json != "error!") |>
  map_dfr(\(page_json) pluck(fromJSON(page_json), "body"))
Expand Down Expand Up @@ -65,16 +82,16 @@ extract_contents <- possibly(
otherwise = "error!"
)

done <- list.files("/home/runner/work/auto_web_crawling/auto_web_crawling/data/table", pattern = "table", full.names = TRUE) %>%
done <- list.files("/home/runner/work/auto_web_crawling/auto_web_crawling/data/contents", pattern = "contents", full.names = TRUE) %>%
map_dfr(~import(., setclass = "tibble")) %>%
distinct(id, .keep_all = TRUE)
distinct(links, .keep_all = TRUE)

table <- table %>%
anti_join(done, "id")
transmute(links = sprintf(fmt = "https://zfwzzc.www.gov.cn/check_web/errorInfo_getErrorInfoList2.action?id=%s", id)) %>%
anti_join(done, "links")

if(nrow(table) != 0) {
contents <- table %>%
transmute(links = sprintf(fmt = "https://zfwzzc.www.gov.cn/check_web/errorInfo_getErrorInfoList2.action?id=%s", id)) %>%
distinct(links, .keep_all = TRUE) %>%
mutate(contents = map_chr(links, extract_contents, .progress = TRUE))

Expand Down

0 comments on commit 22446d0

Please sign in to comment.