8
8
# ' @importFrom glue glue
9
9
# ' @importFrom xml2 read_html
10
10
# ' @importFrom dplyr %>% filter pull slice
11
- # ' @importFrom tibble tibble
12
11
# ' @importFrom rvest html_nodes html_attr html_text
13
- # ' @importFrom stringr str_trim str_remove
14
12
# ' @importFrom purrr map_dfr
15
13
# ' @importFrom rlang .data
16
14
# '
17
15
18
16
scrape_abs_catalogues <- function () {
19
17
20
18
# scrape the main page
21
- abs_stats_page <- xml2 :: read_html(" https://www.abs.gov.au/statistics" )
19
+ abs_stats_page <- xml2 :: read_html(" https://www.abs.gov.au/statistics" ,
20
+ user_agent = readabs_user_agent )
22
21
23
- main_page_data <- tibble :: tibble(
24
- heading = abs_stats_page %> % rvest :: html_nodes(" .field--type-ds h3" ) %> % rvest :: html_text() %> % stringr :: str_trim (),
25
- url_suffix = abs_stats_page %> % rvest :: html_nodes(" .card" ) %> % rvest :: html_attr(" href" ) %> % stringr :: str_trim ()
22
+ main_page_data <- dplyr :: tibble(
23
+ heading = abs_stats_page %> % rvest :: html_nodes(" .field--type-ds h3" ) %> % rvest :: html_text() %> % stringi :: stri_trim_both (),
24
+ url_suffix = abs_stats_page %> % rvest :: html_nodes(" .card" ) %> % rvest :: html_attr(" href" ) %> % stringi :: stri_trim_both ()
26
25
)
27
26
28
27
# scrape each page
@@ -31,15 +30,16 @@ scrape_abs_catalogues <- function() {
31
30
main_page_heading <- main_page_data $ heading [main_page_data $ url_suffix == sub_page_url_suffix ]
32
31
33
32
34
- sub_page <- xml2 :: read_html(glue :: glue(" https://www.abs.gov.au{sub_page_url_suffix}" ))
33
+ sub_page <- xml2 :: read_html(glue :: glue(" https://www.abs.gov.au{sub_page_url_suffix}" ),
34
+ user_agent = readabs_user_agent )
35
35
36
- sub_page_data <- tibble :: tibble(
36
+ sub_page_data <- dplyr :: tibble(
37
37
heading = main_page_heading ,
38
- sub_heading = sub_page %> % rvest :: html_nodes(" .abs-layout-title" ) %> % rvest :: html_text() %> % str_trim (),
38
+ sub_heading = sub_page %> % rvest :: html_nodes(" .abs-layout-title" ) %> % rvest :: html_text() %> % stringi :: stri_trim_both (),
39
39
catalogue = sub_page %> % rvest :: html_nodes(" #content .card" ) %> % rvest :: html_attr(" href" ) %> %
40
- stringr :: str_remove (sub_page_url_suffix ) %> %
41
- stringr :: str_remove (" /[^/]*$" ) %> %
42
- stringr :: str_remove (" /" ),
40
+ stringi :: stri_replace_all_fixed (sub_page_url_suffix , " " ) %> %
41
+ stringi :: stri_replace_all_regex (" /[^/]*$" , " " ) %> %
42
+ stringi :: stri_replace_all_fixed (" /" , " " ),
43
43
url = glue :: glue(" https://www.abs.gov.au{sub_page_url_suffix}/{catalogue}/latest-release" )
44
44
)
45
45
}
0 commit comments