Skip to content

Commit

Permalink
Initial commit towards #49
Browse files Browse the repository at this point in the history
  • Loading branch information
HughParsonage committed Oct 5, 2019
1 parent 1e8b37e commit a577aa0
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 17 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ Imports:
readxl (>= 1.2.0),
tibble (>= 1.4.99),
dplyr (>= 0.8.0),
hutils (>= 1.5.0),
fst,
XML,
curl,
purrr,
Expand Down
64 changes: 64 additions & 0 deletions R/fst-utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#' Internal functions for working with the fst cache
#' @name fst-utils
#' @noRd
#'
#' @param cat_no,path As in \code{\link{read_abs}}.
#'
#' @return For `catno2fst` the path to the `fst` file to be saved or read, given
#' `cat_no` and `path`.
#'
#' `fst_available` returns `TRUE` if and only if an appropriate `fst` file is
#' available.
#'
#' `ext2ext` changes the extension of the provided file to a file in the same
#' path but with the provided extension.
#'


catno2fst <- function(cat_no,
path = Sys.getenv("R_READABS_PATH", unset = tempdir())) {
hutils::provide.file(file.path(path,
"fst",
paste0(gsub(".", "-", cat_no, fixed = TRUE),
".fst")),
on_failure = stop("`path = ", normalizePath(path, winslash = "/"), "`, ",
"but it was not possible to write to this directory."))
}

fst_available <- function(cat_no,
path = Sys.getenv("R_READABS_PATH", unset = tempdir())) {
if (!requireNamespace("fst", quietly = TRUE) ||
!dir.exists(path)) {
return(FALSE)
}

file.fst <- catno2fst(cat_no, path)

if (!file.exists(file.fst)) {
return(FALSE)
}

# fst may be damaged. If it appears to be (i.e. fst metadata returns an error)
# return FALSE

# nocov start
out <- tryCatch(inherits(fst::fst.metadata(file.fst), "fstmetadata"),
error = function(e) FALSE,
warning = function(e) FALSE)
# nocov end
out
}

ext2ext <- function(file, new.ext) {
paste0(tools::file_path_sans_ext(file), new.ext)
}










50 changes: 42 additions & 8 deletions R/read_abs.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ read_abs <- function(cat_no = NULL,
show_progress_bars = TRUE,
retain_files = TRUE){


if(!is.logical(retain_files)){
stop("The `retain_files` argument to `read_abs()` must be either TRUE or FALSE.")
}
Expand Down Expand Up @@ -88,17 +89,27 @@ read_abs <- function(cat_no = NULL,
stop("`metadata` argument must be either TRUE or FALSE")
}

# create temp directory to temporarily store spreadsheets if retain_files == FALSE
if(!retain_files){
path <- tempdir()

if (fst_available(cat_no = cat_no, path = path)) {
out <- fst::read_fst(catno2fst(cat_no = cat_no, path = path))
return(tibble::as_tibble(out))
}



# satisfy CRAN
ProductReleaseDate=SeriesID=NULL

# create a subdirectory of 'path' corresponding to the catalogue number if specified
if(retain_files & !is.null(cat_no)){
path <- file.path(path, cat_no)
if (retain_files && !is.null(cat_no)){
.path <- file.path(path, cat_no)
} else {
# create temp directory to temporarily store spreadsheets if retain_files == FALSE
if(!retain_files) {
.path <- tempdir()
} else {
.path <- path
}
}

# check that R has access to the internet
Expand Down Expand Up @@ -143,13 +154,14 @@ read_abs <- function(cat_no = NULL,
# download tables corresponding to URLs
message(paste0("Attempting to download files from ", download_message,
", ", xml_dfs$ProductTitle[1]))
purrr::walk(urls, download_abs, path = path, show_progress_bars = show_progress_bars)
purrr::walk(urls, download_abs, path = .path, show_progress_bars = show_progress_bars)

# extract the sheets to a list
filenames <- base::basename(urls)
message("Extracting data from downloaded spreadsheets")
sheets <- purrr::map2(filenames, table_titles,
.f = extract_abs_sheets, path = path)
.f = extract_abs_sheets,
path = .path)

# remove one 'layer' of the list, so that each sheet is its own element in the list
sheets <- unlist(sheets, recursive = FALSE)
Expand All @@ -160,7 +172,7 @@ read_abs <- function(cat_no = NULL,
# remove spreadsheets from disk if `retain_files` == FALSE
if(!retain_files){
# delete downloaded files
file.remove(file.path(path, filenames))
file.remove(file.path(.path, filenames))
}

# if series_id is specified, remove all other series_ids
Expand All @@ -174,6 +186,28 @@ read_abs <- function(cat_no = NULL,

}

# if fst is available, write the cache to the <path>/fst/ file
if (retain_files && requireNamespace("fst", quietly = TRUE)) {
fst::write_fst(sheet,
catno2fst(cat_no = cat_no,
path = path))

if (metadata) {
fstMD5 <- tools::md5sum(catno2fst(cat_no = cat_no, path = path))
theProductReleaseDate <-
if (utils::hasName(sheet, "ProductReleaseDate")) {
(sheet[["ProductReleaseDate"]])[1L]
} else {
NA_real_
}
write.dcf(tibble::tibble(fst_MD5 = fstMD5,
ProductReleaseDate = theProductReleaseDate),
ext2ext(catno2fst(cat_no = cat_no,
path = path),
".dcf"))
}
}

# return a data frame
sheet

Expand Down
24 changes: 17 additions & 7 deletions R/read_abs_local.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
#' If nothing is specified in `filenames` or `cat_no`,
#' `read_abs_local()` will attempt to read all .xls files in the directory specified with `path`.
#'
#' @param use_fst logical. If `TRUE` (the default) then, if an `fst` file of the
#' tidy data frame has already been saved in `path`, it is read immediately.
#'
#' @param metadata logical. If `TRUE` (the default), a tidy data frame including
#' ABS metadata (series name, table name, etc.) is included in the output. If
#' `FALSE`, metadata is dropped.
Expand All @@ -42,25 +45,36 @@
read_abs_local <- function(cat_no = NULL,
filenames = NULL,
path = Sys.getenv("R_READABS_PATH", unset = tempdir()),
use_fst = TRUE,
metadata = TRUE){

# Error catching
if(is.null(filenames) & is.null(path)){
if(is.null(filenames) && is.null(path)){
stop("You must specify a value to filenames and/or path.")
}

if(!is.null(filenames) & class(filenames) != "character") {
if(!is.null(filenames) && !is.character(filenames)) {
stop("if a value is given to `filenames`, it must be specified as a character vector, such as '6202001.xls' or c('6202001.xls', '6202005.xls')")
}

if(is.null(path)){
warning(paste0("`path` not specified.\nLooking for ABS time series files in ", getwd()))
}

if(!is.null(cat_no) & !is.character(cat_no)){
if(!is.null(cat_no) && !is.character(cat_no)){
stop("If `cat_no` is specified, it must be a string such as '6202.0'")
}

if(!is.logical(metadata) || length(metadata) != 1L || is.na(metadata)){
stop("`metadata` argument must be either TRUE or FALSE")
}

# Retrieve cache if available
if (is.null(filenames) && isTRUE(use_fst) && fst_available(cat_no, path)) {
out <- fst::read_fst(catno2fst(cat_no = cat_no, path = path))
return(tibble::as_tibble(out))
}

# If catalogue number is specifid, that takes precedence
if(!is.null(cat_no)){

Expand Down Expand Up @@ -91,10 +105,6 @@ read_abs_local <- function(cat_no = NULL,

}

if(!is.logical(metadata)){
stop("`metadata` argument must be either TRUE or FALSE")
}

# Create filenames for local ABS time series files

filenames <- base::basename(filenames)
Expand Down
2 changes: 1 addition & 1 deletion man/read_abs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/read_abs_local.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit a577aa0

Please sign in to comment.