From a577aa0757aa1c956a7aa737923047c8eaf6548d Mon Sep 17 00:00:00 2001 From: HughParsonage Date: Sat, 5 Oct 2019 22:11:33 +1000 Subject: [PATCH] Initial commit towards #49 --- DESCRIPTION | 2 ++ R/fst-utils.R | 64 +++++++++++++++++++++++++++++++++++++++++++ R/read_abs.R | 50 +++++++++++++++++++++++++++------ R/read_abs_local.R | 24 +++++++++++----- man/read_abs.Rd | 2 +- man/read_abs_local.Rd | 5 +++- 6 files changed, 130 insertions(+), 17 deletions(-) create mode 100644 R/fst-utils.R diff --git a/DESCRIPTION b/DESCRIPTION index eac604d..87b79a7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,6 +20,8 @@ Imports: readxl (>= 1.2.0), tibble (>= 1.4.99), dplyr (>= 0.8.0), + hutils (>= 1.5.0), + fst, XML, curl, purrr, diff --git a/R/fst-utils.R b/R/fst-utils.R new file mode 100644 index 0000000..766dfd2 --- /dev/null +++ b/R/fst-utils.R @@ -0,0 +1,64 @@ +#' Internal functions for working with the fst cache +#' @name fst-utils +#' @noRd +#' +#' @param cat_no,path As in \code{\link{read_abs}}. +#' +#' @return For `catno2fst` the path to the `fst` file to be saved or read, given +#' `cat_no` and `path`. +#' +#' `fst_available` returns `TRUE` if and only if an appropriate `fst` file is +#' available. +#' +#' `ext2ext` changes the extension of the provided file to a file in the same +#' path but with the provided extension. +#' + + +catno2fst <- function(cat_no, + path = Sys.getenv("R_READABS_PATH", unset = tempdir())) { + hutils::provide.file(file.path(path, + "fst", + paste0(gsub(".", "-", cat_no, fixed = TRUE), + ".fst")), + on_failure = stop("`path = ", normalizePath(path, winslash = "/"), "`, ", + "but it was not possible to write to this directory.")) +} + +fst_available <- function(cat_no, + path = Sys.getenv("R_READABS_PATH", unset = tempdir())) { + if (!requireNamespace("fst", quietly = TRUE) || + !dir.exists(path)) { + return(FALSE) + } + + file.fst <- catno2fst(cat_no, path) + + if (!file.exists(file.fst)) { + return(FALSE) + } + + # fst may be damaged. If it appears to be (i.e. fst metadata returns an error) + # return FALSE + + # nocov start + out <- tryCatch(inherits(fst::fst.metadata(file.fst), "fstmetadata"), + error = function(e) FALSE, + warning = function(e) FALSE) + # nocov end + out +} + +ext2ext <- function(file, new.ext) { + paste0(tools::file_path_sans_ext(file), new.ext) +} + + + + + + + + + + diff --git a/R/read_abs.R b/R/read_abs.R index b6e1c84..875e209 100644 --- a/R/read_abs.R +++ b/R/read_abs.R @@ -60,6 +60,7 @@ read_abs <- function(cat_no = NULL, show_progress_bars = TRUE, retain_files = TRUE){ + if(!is.logical(retain_files)){ stop("The `retain_files` argument to `read_abs()` must be either TRUE or FALSE.") } @@ -88,17 +89,27 @@ read_abs <- function(cat_no = NULL, stop("`metadata` argument must be either TRUE or FALSE") } - # create temp directory to temporarily store spreadsheets if retain_files == FALSE - if(!retain_files){ - path <- tempdir() + + if (fst_available(cat_no = cat_no, path = path)) { + out <- fst::read_fst(catno2fst(cat_no = cat_no, path = path)) + return(tibble::as_tibble(out)) } + + # satisfy CRAN ProductReleaseDate=SeriesID=NULL # create a subdirectory of 'path' corresponding to the catalogue number if specified - if(retain_files & !is.null(cat_no)){ - path <- file.path(path, cat_no) + if (retain_files && !is.null(cat_no)){ + .path <- file.path(path, cat_no) + } else { + # create temp directory to temporarily store spreadsheets if retain_files == FALSE + if(!retain_files) { + .path <- tempdir() + } else { + .path <- path + } } # check that R has access to the internet @@ -143,13 +154,14 @@ read_abs <- function(cat_no = NULL, # download tables corresponding to URLs message(paste0("Attempting to download files from ", download_message, ", ", xml_dfs$ProductTitle[1])) - purrr::walk(urls, download_abs, path = path, show_progress_bars = show_progress_bars) + purrr::walk(urls, download_abs, path = .path, show_progress_bars = show_progress_bars) # extract the sheets to a list filenames <- base::basename(urls) message("Extracting data from downloaded spreadsheets") sheets <- purrr::map2(filenames, table_titles, - .f = extract_abs_sheets, path = path) + .f = extract_abs_sheets, + path = .path) # remove one 'layer' of the list, so that each sheet is its own element in the list sheets <- unlist(sheets, recursive = FALSE) @@ -160,7 +172,7 @@ read_abs <- function(cat_no = NULL, # remove spreadsheets from disk if `retain_files` == FALSE if(!retain_files){ # delete downloaded files - file.remove(file.path(path, filenames)) + file.remove(file.path(.path, filenames)) } # if series_id is specified, remove all other series_ids @@ -174,6 +186,28 @@ read_abs <- function(cat_no = NULL, } + # if fst is available, write the cache to the /fst/ file + if (retain_files && requireNamespace("fst", quietly = TRUE)) { + fst::write_fst(sheet, + catno2fst(cat_no = cat_no, + path = path)) + + if (metadata) { + fstMD5 <- tools::md5sum(catno2fst(cat_no = cat_no, path = path)) + theProductReleaseDate <- + if (utils::hasName(sheet, "ProductReleaseDate")) { + (sheet[["ProductReleaseDate"]])[1L] + } else { + NA_real_ + } + write.dcf(tibble::tibble(fst_MD5 = fstMD5, + ProductReleaseDate = theProductReleaseDate), + ext2ext(catno2fst(cat_no = cat_no, + path = path), + ".dcf")) + } + } + # return a data frame sheet diff --git a/R/read_abs_local.R b/R/read_abs_local.R index e0a6d90..e1a3c27 100644 --- a/R/read_abs_local.R +++ b/R/read_abs_local.R @@ -24,6 +24,9 @@ #' If nothing is specified in `filenames` or `cat_no`, #' `read_abs_local()` will attempt to read all .xls files in the directory specified with `path`. #' +#' @param use_fst logical. If `TRUE` (the default) then, if an `fst` file of the +#' tidy data frame has already been saved in `path`, it is read immediately. +#' #' @param metadata logical. If `TRUE` (the default), a tidy data frame including #' ABS metadata (series name, table name, etc.) is included in the output. If #' `FALSE`, metadata is dropped. @@ -42,14 +45,15 @@ read_abs_local <- function(cat_no = NULL, filenames = NULL, path = Sys.getenv("R_READABS_PATH", unset = tempdir()), + use_fst = TRUE, metadata = TRUE){ # Error catching - if(is.null(filenames) & is.null(path)){ + if(is.null(filenames) && is.null(path)){ stop("You must specify a value to filenames and/or path.") } - if(!is.null(filenames) & class(filenames) != "character") { + if(!is.null(filenames) && !is.character(filenames)) { stop("if a value is given to `filenames`, it must be specified as a character vector, such as '6202001.xls' or c('6202001.xls', '6202005.xls')") } @@ -57,10 +61,20 @@ read_abs_local <- function(cat_no = NULL, warning(paste0("`path` not specified.\nLooking for ABS time series files in ", getwd())) } - if(!is.null(cat_no) & !is.character(cat_no)){ + if(!is.null(cat_no) && !is.character(cat_no)){ stop("If `cat_no` is specified, it must be a string such as '6202.0'") } + if(!is.logical(metadata) || length(metadata) != 1L || is.na(metadata)){ + stop("`metadata` argument must be either TRUE or FALSE") + } + + # Retrieve cache if available + if (is.null(filenames) && isTRUE(use_fst) && fst_available(cat_no, path)) { + out <- fst::read_fst(catno2fst(cat_no = cat_no, path = path)) + return(tibble::as_tibble(out)) + } + # If catalogue number is specifid, that takes precedence if(!is.null(cat_no)){ @@ -91,10 +105,6 @@ read_abs_local <- function(cat_no = NULL, } - if(!is.logical(metadata)){ - stop("`metadata` argument must be either TRUE or FALSE") - } - # Create filenames for local ABS time series files filenames <- base::basename(filenames) diff --git a/man/read_abs.Rd b/man/read_abs.Rd index 644271e..866105e 100644 --- a/man/read_abs.Rd +++ b/man/read_abs.Rd @@ -24,7 +24,7 @@ to specifying `cat_no`.} spreadsheets. By default, `path` takes the value set in the environment variable "R_READABS_PATH". To check the value of this variable, run \code{Sys.getenv("R_READABS_PATH")}. If this variable is not set, the files downloaded by read_abs() -will be stored in a temporary directory (`tempdir()`).} +will be stored in a temporary directory (\code{tempdir()}).} \item{metadata}{logical. If `TRUE` (the default), a tidy data frame including ABS metadata (series name, table name, etc.) is included in the output. If diff --git a/man/read_abs_local.Rd b/man/read_abs_local.Rd index 19e2e76..13a3735 100644 --- a/man/read_abs_local.Rd +++ b/man/read_abs_local.Rd @@ -6,7 +6,7 @@ \usage{ read_abs_local(cat_no = NULL, filenames = NULL, path = Sys.getenv("R_READABS_PATH", unset = tempdir()), - metadata = TRUE) + use_fst = TRUE, metadata = TRUE) } \arguments{ \item{cat_no}{character; a single catalogue number such as "6202.0". When `cat_no` @@ -25,6 +25,9 @@ Default is `Sys.getenv("R_READABS_PATH", unset = tempdir())`. If nothing is specified in `filenames` or `cat_no`, `read_abs_local()` will attempt to read all .xls files in the directory specified with `path`.} +\item{use_fst}{logical. If `TRUE` (the default) then, if an `fst` file of the +tidy data frame has already been saved in `path`, it is read immediately.} + \item{metadata}{logical. If `TRUE` (the default), a tidy data frame including ABS metadata (series name, table name, etc.) is included in the output. If `FALSE`, metadata is dropped.}