
Merge pull request #17 from department-for-transport-public/orr_methodology

Orr methodology
Mo-Dell authored Jan 23, 2025
2 parents dddde28 + d65ac3d commit 311c99a
Showing 6 changed files with 66 additions and 5 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -11,6 +11,7 @@ export(extract_table_urls)
export(gcp_tables_to_list)
export(gcp_to_bq)
export(scrape_odd_methodology)
export(scrape_orr_methodology)
export(tidied_df)
export(year_column_check)
import(bigrquery)
4 changes: 2 additions & 2 deletions R/scrape_dynamic_methodology.R
@@ -1,7 +1,7 @@
-#' For a provided url, return those stats which have dynamically updating methdology links
+#' For a provided url, return those stats which have dynamically updating methodology links
#' @name scrape_odd_methodology
#' @param url URL of the web page to scrape urls from. Defaults to the gov.uk stats homepage
-#' @return a table containing two columns, the collection url and the methodoloy url
+#' @return a table containing two columns, the collection url and the methodology url
#' @export
#' @importFrom dplyr filter
#' @importFrom purrr map list_rbind
24 changes: 24 additions & 0 deletions R/scrape_orr_methodology.R
@@ -0,0 +1,24 @@
#' For a provided ORR url, return those stats which have dynamically updating methodology links
#' @name scrape_orr_methodology
#' @param url URL of the ORR web page to scrape urls from
#' @return a table containing two columns, the collection url and the methodology url
#' @export
#' @importFrom dplyr filter
#' @importFrom purrr map list_rbind
#' @importFrom magrittr "%>%"
#'
##Scrape the ORR dynamic methodology
scrape_orr_methodology <- function(url){

##Get the collection links
links <- scrape_links(url) %>%
##Turn into a data frame
purrr::map(.f = ~tibble::tibble(methodology = .x)) %>%
purrr::list_rbind(names_to = "element_number") %>%
dplyr::filter(grepl("quality-report.pdf$", methodology, ignore.case = TRUE)) %>%
dplyr::mutate(collection = url,
methodology = paste0("https://dataportal.orr.gov.uk", methodology)) %>%
dplyr::select(collection, methodology) %>%
unique()

}
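
The new function relies on a `scrape_links()` helper that is not part of this diff. The sketch below is an assumption of what such a helper does, using rvest; it is for illustration only, not the package's actual implementation:

```
library(rvest)  ##rvest re-exports the magrittr pipe

##Assumed behaviour of scrape_links(): read the page and return the
##href attribute of every <a> element as a character vector
scrape_links <- function(url) {
  read_html(url) %>%
    html_elements("a") %>%
    html_attr("href")
}
```
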
19 changes: 19 additions & 0 deletions README.md
@@ -25,6 +25,7 @@ The package complements and supports the development and maintenance of the Depa
Currently, tinScrape is made up of the following main functions:

* `extract_table_urls`
* `extract_metadata`
* `gcp_to_bq`
* `scrape_dynamic_methodology`
* `scrape_orr_tables`
@@ -43,6 +44,22 @@ For downloadable documents, such as Excel files, the `extract_table_urls()` func

No argument is required to use this function as the Stats at DfT webpage is hardcoded into the function in the first instance.

### extract_metadata

You can call this function using the following line:

```
tinScrape::extract_metadata()
```

This function utilises the `download_cover()` function to read and extract a publication's metadata from the Cover sheet of an ODS file. For the purposes of TiN, this covers the email address and date of publication.

The function reads the data saved in GCS, identifies the Cover sheet, extracts the relevant information and saves it in a data frame. A separate line of code must then be run to export this information to BQ. This is provided below:

```
bigrquery::bq_table_upload("[name of BQ table]", tinScrape::extract_metadata(), create_disposition = "CREATE_IF_NEEDED", write_disposition = "WRITE_TRUNCATE")
```

Equivalent functions, `extract_orr_metadata()` and `download_orr_cover()`, exist to extract and export the metadata from ORR tables.
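
For illustration, the ORR flow would presumably mirror the one above. The sketch below is an assumption rather than documented usage: it assumes `extract_orr_metadata()` also takes no arguments and returns a data frame ready for upload, and the table name is a placeholder.

```
##Assumed usage: extract the ORR metadata and export it to BQ
orr_metadata <- tinScrape::extract_orr_metadata()

bigrquery::bq_table_upload("[name of BQ table]", orr_metadata, create_disposition = "CREATE_IF_NEEDED", write_disposition = "WRITE_TRUNCATE")
```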

### gcp_to_bq

You can call this function using the following line:
@@ -141,6 +158,8 @@ This function webscrapes any technical notes that accompany a publication. T

No argument is required to use this function as the Stats at DfT webpage is hardcoded into the function in the first instance.

For scraping the ORR technical notes related to their tables, the `scrape_orr_methodology()` function is used.
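
For illustration, a call might look like the line below; the argument is a placeholder for the URL of an ORR data portal page, not a value taken from the package documentation.

```
tinScrape::scrape_orr_methodology("[URL of ORR data portal page]")
```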

### scrape_orr_tables

You can call this function using the following line:
6 changes: 3 additions & 3 deletions man/scrape_odd_methodology.Rd


17 changes: 17 additions & 0 deletions man/scrape_orr_methodology.Rd

