Merge pull request #33 from grattan/private-data-location

add ability to look for microdata in other locations
grattan · Feb 6, 2021 · ebf826e · ebf826e
2 parents a053956 + 3233b8d
commit ebf826e
Show file tree

Hide file tree

Showing 14 changed files with 161 additions and 21 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,7 +15,7 @@ License: GPL-3
 Depends: R (>= 3.5.0)
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 Imports: 
     magrittr,
     jsonlite,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(add_microdata_location)
 export(check_dropbox_access)
 export(find_filename)
 export(get_dropbox_location)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,8 @@
 # grattandata 0.1.2
 * GitHub Actions is now used rather than Travis-CI for continuous integration
 (automated package testing)
+* `add_microdata_location()` lets a user call `read_microdata()` to load files
+that are stored outside the Grattan data warehouse.
 
 # grattandata 0.1.1
 * `read_microdata()` now has the option of loading .fst files, which 

diff --git a/R/add_microdata_location.R b/R/add_microdata_location.R
@@ -0,0 +1,36 @@
+#' Add a specified location for microdata not in the Grattan data warehouse
+#'
+#' @description Some microdata, such as HILDA, has access restrictions that
+#' mean it cannot be stored on Dropbox, even with the security controls we
+#' have in place. This function allows you to store microdata somewhere else,
+#' but still load it with the `read_microdata()` function. This increases
+#' reproducibility within Grattan. `read_microdata()` will look both in the
+#' data warehouse and in the location you add with `add_microdata_location()`.
+#'
+#' @param path File path to non-data warehouse microdata location, such as
+#' `file.path("~", "my_data", "hilda")`. Must be a single, non-empty character
+#' string. The path is converted to an absolute path before it is stored, so
+#' it remains valid if the working directory changes later in the session.
+#'
+#' @return Invisibly returns `TRUE`. The function is called for its side
+#' effect: it sets the environment variable `"R_GRATTANDATA_LOCATION"`. When
+#' this variable is set, `grattandata` functions including `read_microdata()`
+#' will look for the data file(s) you request in the specified location as well
+#' as in the Grattan data warehouse. A warning is given if `path` is not an
+#' existing directory.
+#'
+#' @note You can only set one location (in addition to the Grattan data
+#' warehouse). All subfolders of the defined location will be included in
+#' the search path for `read_microdata()`.
+#'
+#' We recommend that you use the `add_microdata_location()` function in your
+#' scripts rather than defining the `"R_GRATTANDATA_LOCATION"` environment
+#' variable elsewhere, to improve clarity/reproducibility of your code.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' add_microdata_location(path = file.path("documents", "hilda"))
+#'
+#' read_microdata("hilda_file.dta")
+#' }
+add_microdata_location <- function(path) {
+  # Fail early with a clear message; Sys.setenv() would otherwise either raise
+  # a cryptic length error or silently store the string "NA"
+  if (!is.character(path) || length(path) != 1L || is.na(path) ||
+    !nzchar(path)) {
+    stop("`path` must be a single, non-empty character string.",
+      call. = FALSE
+    )
+  }
+
+  # Expand "~" and resolve relative paths now, so the stored location does not
+  # depend on the working directory at the time the data is read
+  path <- normalizePath(path, winslash = "/", mustWork = FALSE)
+
+  if (!dir.exists(path)) {
+    warning("The directory '", path, "' does not exist; ",
+      "no additional microdata will be found there.",
+      call. = FALSE
+    )
+  }
+
+  Sys.setenv("R_GRATTANDATA_LOCATION" = path)
+
+  invisible(TRUE)
+}
diff --git a/R/dropbox_utils.R b/R/dropbox_utils.R
@@ -92,3 +92,12 @@ get_data_warehouse_path <- function() {
   data_warehouse_path
 
 }
+
+# Return the directories that should be searched for microdata: the Grattan
+# data warehouse, followed by the user-defined location stored in the
+# "R_GRATTANDATA_LOCATION" environment variable (if one has been set).
+get_data_path <- function() {
+  data_warehouse_path <- get_data_warehouse_path()
+  user_data_path <- Sys.getenv("R_GRATTANDATA_LOCATION")
+
+  # Sys.getenv() returns "" when the variable is unset; do not add an empty
+  # entry to the search path in that case
+  if (!nzchar(user_data_path)) {
+    return(data_warehouse_path)
+  }
+
+  c(data_warehouse_path, user_data_path)
+}
diff --git a/R/find_alias.R b/R/find_alias.R
@@ -10,7 +10,7 @@ find_alias <- function(filename) {
   .filename <- basename(filename)
   .filename <- tolower(.filename)
 
-  data_warehouse_path <- get_data_warehouse_path()
+  data_warehouse_path <- get_data_path()
 
   all_files <- list.files(data_warehouse_path, recursive = TRUE)
   all_files <- all_files[!tolower(file_ext(all_files)) %in% unused_extensions]

diff --git a/R/find_filename.R b/R/find_filename.R
@@ -23,7 +23,7 @@
 # either fails with appropriate errors or returns filename
 find_filename <- function(filename) {
 
-  data_warehouse_path <- get_data_warehouse_path()
+  data_warehouse_path <- get_data_path()
 
   # Ensure that the extension requested by user isn't filtered out
   supplied_ext <- file_ext(filename)
@@ -35,7 +35,7 @@ find_filename <- function(filename) {
   # First, exclude files with given extensions ('unused_extensions')
   # unused_extensions is an internal data object; see data-raw
 
-  all_files <- list.files(data_warehouse_path, recursive = TRUE)
+  all_files <- list.files(data_warehouse_path, full.names = TRUE, recursive = TRUE)
   all_files <- all_files[!tolower(file_ext(all_files)) %in% unused_extensions]
 
   # Exclude folders that match these names
@@ -85,7 +85,7 @@ find_filename <- function(filename) {
     ))
   }
 
-  path <- file.path(data_warehouse_path, matched_files)
+  path <- matched_files
 
   # Check if file exists -----
   if (!file.exists(path)) {

diff --git a/README.Rmd b/README.Rmd
@@ -20,13 +20,14 @@ library(grattandata)
 [![R-CMD-check](https://github.com/grattan/grattandata/workflows/R-CMD-check/badge.svg)](https://github.com/grattan/grattandata/actions)
 <!-- badges: end -->
 
+
 Easily load microdata from the Grattan Institute data warehouse in R. Users will require access to the Grattan Institute data warehouse.
 
 ## Get access to the data warehouse
 
-Speak to Jonathan, Will, or Matt to get access to the data warehouse.
+Speak to a Grattan R user to get access to the data warehouse. Post in `#r_at_grattan` Slack if you're not sure who to speak to.
 
-Note that access to some parts of the warehouse requires you to be an approved user of the relevant microdata.
+Note that access to some parts (most) of the warehouse requires you to be an approved user of the relevant microdata.
 
 ## Installation
 
@@ -70,5 +71,16 @@ vista <- read_microdata("VISTA12_16_")
 
 You can now  identify which file you want to load, and be more specific with the fragment that you pass to `read_microdata()`.
 
+### Data stored elsewhere
+Some data - like HILDA - can't be stored on Dropbox with our other microdata. The function `add_microdata_location()` enables you to tell the {grattandata} package where to look for this off-warehouse microdata. You use it like this:
+
+```{r eval=FALSE}
+add_microdata_location(path = file.path("documents", "hilda"))
+
+read_microdata("hilda_wave1.dta")
+read_microdata("hilda_wave2.dta")
+```
+
+### Package vignette
 For more, see the package vignette by typing `browseVignettes("grattandata")`. This should open a tab in your web browser - click 'HTML'.
 
diff --git a/README.md b/README.md
@@ -3,18 +3,21 @@
 
 # grattandata
 
-[![Build
-Status](https://travis-ci.org/grattan/grattandata.svg?branch=master)](https://travis-ci.org/grattan/grattandata)
+<!-- badges: start -->
+
+[![R-CMD-check](https://github.com/grattan/grattandata/workflows/R-CMD-check/badge.svg)](https://github.com/grattan/grattandata/actions)
+<!-- badges: end -->
 
 Easily load microdata from the Grattan Institute data warehouse in R.
 Users will require access to the Grattan Institute data warehouse.
 
 ## Get access to the data warehouse
 
-Speak to Jonathan, Will, or Matt to get access to the data warehouse.
+Speak to a Grattan R user to get access to the data warehouse. Post in
+`#r_at_grattan` Slack if you’re not sure who to speak to.
 
-Note that access to some parts of the warehouse requires you to be an
-approved user of the relevant microdata.
+Note that access to some parts (most) of the warehouse requires you to
+be an approved user of the relevant microdata.
 
 ## Installation
 
@@ -61,17 +64,33 @@ files match your fragment. For example:
 vista <- read_microdata("VISTA12_16_")
 #> Error in find_filename(filename): Multiple files were found with VISTA12_16_ in the filename. .
 #>  The matches are:
-#> victoria/vista/2012-2016/csv/H_VISTA12_16_SA1_V1.csv
-#> victoria/vista/2012-2016/csv/JTE_VISTA12_16_sa1_V1.csv
-#> victoria/vista/2012-2016/csv/JTW_VISTA12_16_SA1_V1.csv
-#> victoria/vista/2012-2016/csv/P_VISTA12_16_SA1_V1.csv
-#> victoria/vista/2012-2016/csv/S_VISTA12_16_SA1_V1.csv
-#> victoria/vista/2012-2016/csv/T_VISTA12_16_SA1_V1.csv
+#> victoria/vista/2012-2016/csv/2012_to_2016/H_VISTA12_16_SA1_V1.csv
+#> victoria/vista/2012-2016/csv/2012_to_2016/JTE_VISTA12_16_sa1_V1.csv
+#> victoria/vista/2012-2016/csv/2012_to_2016/JTW_VISTA12_16_SA1_V1.csv
+#> victoria/vista/2012-2016/csv/2012_to_2016/P_VISTA12_16_SA1_V1.csv
+#> victoria/vista/2012-2016/csv/2012_to_2016/S_VISTA12_16_SA1_V1.csv
+#> victoria/vista/2012-2016/csv/2012_to_2016/T_VISTA12_16_SA1_V1.csv
 ```
 
 You can now identify which file you want to load, and be more specific
 with the fragment that you pass to `read_microdata()`.
 
+### Data stored elsewhere
+
+Some data - like HILDA - can’t be stored on Dropbox with our other
+microdata. The function `add_microdata_location()` enables you to tell
+the {grattandata} package where to look for this off-warehouse
+microdata. You use it like this:
+
+``` r
+add_microdata_location(path = file.path("documents", "hilda"))
+
+read_microdata("hilda_wave1.dta")
+read_microdata("hilda_wave2.dta")
+```
+
+### Package vignette
+
 For more, see the package vignette by typing
 `browseVignettes("grattandata")`. This should open a tab in your web
 browser - click ‘HTML’.
diff --git a/man/add_microdata_location.Rd b/man/add_microdata_location.Rd
diff --git a/tests/testthat/test-add_microdata_location.R b/tests/testthat/test-add_microdata_location.R
@@ -0,0 +1,19 @@
+test_that("read_microdata() can load non-warehouse data from path defined with add_microdata_location()", {
+  skip_on_cran()
+  skip_on_ci()
+
+  # add_microdata_location() changes a session-wide environment variable;
+  # put back whatever was there so later tests are not affected
+  old_location <- Sys.getenv("R_GRATTANDATA_LOCATION", unset = NA)
+  on.exit(
+    {
+      if (is.na(old_location)) {
+        Sys.unsetenv("R_GRATTANDATA_LOCATION")
+      } else {
+        Sys.setenv("R_GRATTANDATA_LOCATION" = old_location)
+      }
+    },
+    add = TRUE
+  )
+
+  temp_data <- tempfile(fileext = ".csv")
+  on.exit(unlink(temp_data), add = TRUE)
+
+  # Write a small fake dataset to a location outside the data warehouse
+  temp_dir <- dirname(temp_data)
+  fake_data <- data.frame(x = sample(0:9, size = 1000, replace = TRUE))
+  write.csv(fake_data, temp_data, row.names = FALSE)
+
+  manually_loaded_data <- rio::import(temp_data, setclass = "tbl_df")
+
+  add_microdata_location(path = temp_dir)
+
+  loaded_with_package <- read_microdata(basename(temp_data))
+
+  expect_equal(manually_loaded_data, loaded_with_package)
+})
diff --git a/tests/testthat/test-find_alias.R b/tests/testthat/test-find_alias.R
@@ -1,7 +1,7 @@
 test_that("find_alias behaves as expected", {
 
   skip_on_cran()
-  c
+  skip_on_ci()
 
   expect_match(find_alias("survey of income and housing"), 
                "sih")

diff --git a/tests/testthat/test-find_filename.R b/tests/testthat/test-find_filename.R
@@ -1,5 +1,5 @@
 test_that("find_filename finds filename", {
-  skip_on_travis()
+  skip_on_ci()
   skip_on_cran()
 
   expect_is(find_filename("SIH15bh.dta"), "character")

diff --git a/tests/testthat/test-read_microdata.R b/tests/testthat/test-read_microdata.R
@@ -20,7 +20,7 @@ test_that("read_microdata loads SIH 2015-16", {
 })
 
 test_that("read_microdata fails with multiple matches", {
-  skip_on_travis()
+  skip_on_ci()
   skip_on_cran()
 
   expect_error(read_microdata("SIH15BH"))