diff --git a/shanghai_petition.R b/shanghai_petition.R
index 8f88e29..7cdd246 100644
--- a/shanghai_petition.R
+++ b/shanghai_petition.R
@@ -4,6 +4,10 @@ library(rvest)
 library(httr2)
 library(rio)
 
+done <- list.files("/home/runner/work/auto_web_crawling/auto_web_crawling/data/table", pattern = "csv", full.names = TRUE) %>%
+  map_dfr(~import(., setclass = "tibble")) %>%
+  distinct(id, .keep_all = TRUE)
+
 extract_info <- possibly(
   \(page) {
     link <- str_c("https://zfwzzc.www.gov.cn/check_web/errorInfo_querySearch.action?term=&searchType=-1&pageNo=", page, "&pos=3&size=10&problemIdbg=-1")
@@ -32,7 +36,20 @@ total <- extract_info(1) %>%
   fromJSON() %>%
   pluck("iTotalDisplayRecords")
 
-info <- map(1:ceiling(total / 10), extract_info, .progress = TRUE)
+# info <- map(1:ceiling(total / 10), extract_info, .progress = TRUE)
+
+info <- list()
+for (i in 1:total) {
+  info[[i]] <- extract_info(i)
+
+  if (identical(
+    pluck(fromJSON(info[[i]]), "body", "id") %in% done$id,
+    rep(TRUE, 10)
+  )
+  ) {
+    break
+  }
+}
 
 table <- info[info != "error!"] %>%
   map_dfr(\(x) fromJSON(x) |> pluck("body"))
@@ -65,16 +82,16 @@ extract_contents <- possibly(
   otherwise = "error!"
 )
 
-done <- list.files("/home/runner/work/auto_web_crawling/auto_web_crawling/data/table", pattern = "table", full.names = TRUE) %>%
+done <- list.files("/home/runner/work/auto_web_crawling/auto_web_crawling/data/contents", pattern = "contents", full.names = TRUE) %>%
   map_dfr(~import(., setclass = "tibble")) %>%
-  distinct(id, .keep_all = TRUE)
+  distinct(links, .keep_all = TRUE)
 
 table <- table %>%
-  anti_join(done, "id")
+  transmute(links = sprintf(fmt = "https://zfwzzc.www.gov.cn/check_web/errorInfo_getErrorInfoList2.action?id=%s", id)) %>%
+  anti_join(done, "links")
 
 if(nrow(table) != 0) {
   contents <- table %>%
-    transmute(links = sprintf(fmt = "https://zfwzzc.www.gov.cn/check_web/errorInfo_getErrorInfoList2.action?id=%s", id)) %>%
     distinct(links, .keep_all = TRUE) %>%
     mutate(contents = map_chr(links, extract_contents, .progress = TRUE))