diff --git a/NAMESPACE b/NAMESPACE
index f1fb57d..fadc287 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -11,6 +11,7 @@ export(extract_table_urls)
 export(gcp_tables_to_list)
 export(gcp_to_bq)
 export(scrape_odd_methodology)
+export(scrape_orr_methodology)
 export(tidied_df)
 export(year_column_check)
 import(bigrquery)
diff --git a/R/scrape_dynamic_methodology.R b/R/scrape_dynamic_methodology.R
index 51827a0..2d5eb84 100644
--- a/R/scrape_dynamic_methodology.R
+++ b/R/scrape_dynamic_methodology.R
@@ -1,7 +1,7 @@
-#' For a provided url, return those stats which have dynamically updating methdology links
+#' For a provided url, return those stats which have dynamically updating methodology links
 #' @name scrape_odd_methodology
 #' @param url URL of the web page to scrape urls from. Defaults to the gov.uk stats homepage
-#' @return a table containing two columns, the collection url and the methodoloy url
+#' @return a table containing two columns, the collection url and the methodology url
 #' @export
 #' @importFrom dplyr filter
 #' @importFrom purrr map list_rbind
diff --git a/R/scrape_orr_methodology.R b/R/scrape_orr_methodology.R
new file mode 100644
index 0000000..808463e
--- /dev/null
+++ b/R/scrape_orr_methodology.R
@@ -0,0 +1,24 @@
+#' For a provided ORR url, return those stats which have dynamically updating methodology links
+#' @name scrape_orr_methodology
+#' @param url URL of the ORR web page to scrape urls from
+#' @return a table containing two columns, the collection url and the methodology url
+#' @export
+#' @importFrom dplyr filter
+#' @importFrom purrr map list_rbind
+#' @importFrom magrittr "%>%"
+#'
+##Scrape the ORR dynamic methodology
+scrape_orr_methodology <- function(url){
+
+  ##Get the collection links
+  links <- scrape_links(url) %>%
+    ##Turn into a data frame
+    purrr::map(.f = ~tibble::tibble(methodology = .x)) %>%
+    purrr::list_rbind(names_to = "element_number") %>%
+    dplyr::filter(grepl("quality-report.pdf$", methodology, ignore.case = TRUE)) %>%
+    dplyr::mutate(collection = url,
+                  methodology = paste0("https://dataportal.orr.gov.uk", methodology)) %>%
+    dplyr::select(collection, methodology) %>%
+    unique()
+
+}
diff --git a/README.md b/README.md
index ae0fc5c..17d7dcf 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ The package complements and supports the development and maintenance of the Depa
 Currently, tinScrape is made up of the following main functions:
 
 * `extract_table_urls`
+* `extract_metadata`
 * `gcp_to_bq`
 * `scrape_dynamic_methodology`
 * `scrape_orr_tables`
@@ -43,6 +44,22 @@ For downloadable documents, such as Excel files, the `extract_table_urls()` func
 
 No argument is required to use this function as the Stats at DfT webpage is hardcoded into the function in the first instance.
 
+### extract_metadata
+
+You can call this function using the following line:
+
+```
+tinScrape::extract_metadata()
+```
+
+This function utilises the `download_cover()` function to read and extract a publication's metadata from the Cover sheet of an ODS file. For the purposes of TiN, this covers the email address and date of publication.
+
+The function reads in the data saved in GCS, identifies the Cover sheet, extracts the information and saves it in a dataframe. A separate chunk of code must then be run to export this information to BQ. This is provided below:
+
+`bigrquery::bq_table_upload("[name of BQ table]", tinScrape::extract_metadata(), create_disposition = "CREATE_IF_NEEDED", write_disposition = "WRITE_TRUNCATE")`
+
+Equivalent functions, `extract_orr_metadata()` and `download_orr_cover()`, exist to extract and export the metadata from ORR tables.
+
 ### gcp_to_bq
 
 You can call this function using the following line:
@@ -141,6 +158,8 @@ This function webscrapes the any technical notes that accompany a publication. T
 
 No argument is required to use this function as the Stats at DfT webpage is hardcoded into the function in the first instance.
 
+To scrape the ORR technical notes related to their tables, the `scrape_orr_methodology()` function is called.
+
 ### scrape_orr_tables
 
 You can call this function using the following line:
diff --git a/man/scrape_odd_methodology.Rd b/man/scrape_odd_methodology.Rd
index 5e29367..099e7e5 100644
--- a/man/scrape_odd_methodology.Rd
+++ b/man/scrape_odd_methodology.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/scrape_dynamic_methodology.R
 \name{scrape_odd_methodology}
 \alias{scrape_odd_methodology}
-\title{For a provided url, return those stats which have dynamically updating methdology links}
+\title{For a provided url, return those stats which have dynamically updating methodology links}
 \usage{
 scrape_odd_methodology(
   url =
@@ -13,8 +13,8 @@ scrape_odd_methodology(
 \item{url}{URL of the web page to scrape urls from. Defaults to the gov.uk stats homepage}
 }
 \value{
-a table containing two columns, the collection url and the methodoloy url
+a table containing two columns, the collection url and the methodology url
 }
 \description{
-For a provided url, return those stats which have dynamically updating methdology links
+For a provided url, return those stats which have dynamically updating methodology links
 }
diff --git a/man/scrape_orr_methodology.Rd b/man/scrape_orr_methodology.Rd
new file mode 100644
index 0000000..6daf29f
--- /dev/null
+++ b/man/scrape_orr_methodology.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/scrape_orr_methodology.R
+\name{scrape_orr_methodology}
+\alias{scrape_orr_methodology}
+\title{For a provided ORR url, return those stats which have dynamically updating methodology links}
+\usage{
+scrape_orr_methodology(url)
+}
+\arguments{
+\item{url}{URL of the ORR web page to scrape urls from}
+}
+\value{
+a table containing two columns, the collection url and the methodology url
+}
+\description{
+For a provided ORR url, return those stats which have dynamically updating methodology links
+}
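A minimal usage sketch of the newly exported `scrape_orr_methodology()` function (not part of this diff). The ORR data portal page shown is an illustrative assumption, and the BQ table name is the README's placeholder; substitute the page and table you actually use.

```
# Illustrative sketch only -- the ORR page URL below is an assumed example, not a
# value taken from this change; point it at the ORR data portal page to scrape.
library(tinScrape)

orr_url <- "https://dataportal.orr.gov.uk/statistics/usage/passenger-rail-usage"

# Returns a two-column table: collection (the page scraped) and methodology
# (absolute links to any quality-report.pdf files found on that page)
orr_methodology <- scrape_orr_methodology(orr_url)

# Optionally export the result to BQ, mirroring the extract_metadata() pattern in
# the README; "[name of BQ table]" remains a placeholder for the real table reference.
# bigrquery::bq_table_upload("[name of BQ table]", orr_methodology,
#                            create_disposition = "CREATE_IF_NEEDED",
#                            write_disposition = "WRITE_TRUNCATE")
```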