From 7b45e92ba56573e3f882c665f25ff2e27fc93940 Mon Sep 17 00:00:00 2001 From: Peter Owen Date: Wed, 16 Aug 2023 11:38:59 +1000 Subject: [PATCH 01/11] add data_subclass column --- R/seifa.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/seifa.R b/R/seifa.R index 9013760..bcc8ebe 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -119,7 +119,7 @@ get_seifa <- function(structure = c('sa1','sa2','lga','postcode','suburb'), if (file.exists(filename)) { ind <- map(sheet_names, ~ get_seifa_index_sheet(filename, .x, structure, year), .id = 'seifa_index') %>% - list_rbind() + list_rbind(names_to = 'data_subclass') return(ind) } else { warning('Download of ABS file failed. Please check your internet connection and try again.') From 0548a7f445ba11c57fe48372823e83b1e3f793a8 Mon Sep 17 00:00:00 2001 From: Peter Owen Date: Wed, 16 Aug 2023 15:47:16 +1000 Subject: [PATCH 02/11] allow for seifa summary --- R/seifa.R | 254 +++++++++++++++++++++-------------- man/get_seifa.Rd | 2 +- man/get_seifa_index_sheet.Rd | 11 +- tests/testthat/test-seifa.R | 94 ++++++++----- 4 files changed, 226 insertions(+), 135 deletions(-) diff --git a/R/seifa.R b/R/seifa.R index bcc8ebe..65cc300 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -1,5 +1,3 @@ - - #' @title Import SEIFA Data from ABS #' @description The function will download all SEIFA data, for a specified spatial structure, #' to a temporary excel file and then merge sheets into a single `data.frame`. 
This `data.frame` @@ -43,62 +41,75 @@ #' #' @examples #' \dontrun{ -#' get_seifa(structure = 'lga', data_subclass = 'irsed', year = 2016) +#' get_seifa(structure = "lga", data_subclass = "irsed", year = 2016) #' } #' -get_seifa <- function(structure = c('sa1','sa2','lga','postcode','suburb'), - data_subclass = c('irsed', 'irsead', 'ier', 'ieo'), +get_seifa <- function(structure = c("sa1", "sa2", "lga", "postcode", "suburb"), + data_subclass = c("irsed", "irsead", "ier", "ieo"), year = NULL) { - # TODO: 2006 SEIFA has the Statistical Local Area (SLA) structure, not the # Statistical Level Areas (SA1, SA2) structures. Would need to update logic to # handle 2006. - release_years = c(2011, 2016, 2021) + release_years <- c(2011, 2016, 2021) + + stopifnot( + "data_subclass must be either: + 1. some combination of: 'irsed', 'irsead', 'ier', 'ieo' + 2. just the value 'summary'" = + all(data_subclass %in% c("irsed", "irsead", "ier", "ieo")) | + (data_subclass == "summary" & length(data_subclass) == 1) + ) - stopifnot(all(data_subclass %in% c('irsed', 'irsead', 'ier', 'ieo'))) # match excel sheet names to data_subclass - sheet_names <- c('irsed' = 'Table 2', - 'irsead' = 'Table 3', - 'ier' = 'Table 4', - 'ieo' = 'Table 5') + sheet_names <- c( + "summary" = "Table 1", + "irsed" = "Table 2", + "irsead" = "Table 3", + "ier" = "Table 4", + "ieo" = "Table 5" + ) sheet_names <- sheet_names[data_subclass] - # match spatial structures to specific urls + # match spatial structures to specific urls structure <- match.arg(structure, several.ok = FALSE) - urls <- list( '2011' = c( 'sa1' = 'https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20sa1%20indexes.xls&2033.0.55.001&Data%20Cubes&9828E2819C30D96DCA257B43000E923E&0&2011&05.04.2013&Latest', - 'sa2' = 'https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20SA2%20Indexes.xls&2033.0.55.001&Data%20Cubes&76D0BC44356DC34ACA257B3B001A4913&0&2011&12.11.2014&Latest', - 'lga' = 
'https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20lga%20indexes.xls&2033.0.55.001&Data%20Cubes&28EF8569335AC7CDCA257BAB00136B0F&0&2011&18.07.2013&Latest', - 'postcode' = 'https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20POA%20Indexes.xls&2033.0.55.001&Data%20Cubes&209B3364525C82CCCA257B3B001A4D56&0&2011&12.11.2014&Latest', - 'suburb' = 'https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20ssc%20indexes.xls&2033.0.55.001&Data%20Cubes&F40D0630B245D5DCCA257B43000EA0F1&0&2011&05.04.2013&Latest'), - - '2016' = c( 'sa1' = 'https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20sa1%20indexes.xls&2033.0.55.001&Data%20Cubes&40A0EFDE970A1511CA25825D000F8E8D&0&2016&27.03.2018&Latest', - 'sa2' = 'https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20sa2%20indexes.xls&2033.0.55.001&Data%20Cubes&C9F7AD36397CB43DCA25825D000F917C&0&2016&27.03.2018&Latest', - 'lga' = 'https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20lga%20indexes.xls&2033.0.55.001&Data%20Cubes&5604C75C214CD3D0CA25825D000F91AE&0&2016&27.03.2018&Latest', - 'postcode' = 'https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20poa%20indexes.xls&2033.0.55.001&Data%20Cubes&DC124D1DAC3D9FDDCA25825D000F9267&0&2016&27.03.2018&Latest', - 'suburb' = 'https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20ssc%20indexes.xls&2033.0.55.001&Data%20Cubes&863031D939DE8105CA25825D000F91D2&0&2016&27.03.2018&Latest'), - - '2021' = c( 'sa1' = 'https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Statistical%20Area%20Level%201%2C%20Indexes%2C%20SEIFA%202021.xlsx', - 'sa2' = 'https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Statistical%20Area%20Level%202%2C%20Indexes%2C%20SEIFA%202021.xlsx', - 'lga' = 
'https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Local%20Government%20Area%2C%20Indexes%2C%20SEIFA%202021.xlsx', - 'postcode' = 'https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Postal%20Area%2C%20Indexes%2C%20SEIFA%202021.xlsx', - 'suburb' = 'https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Suburbs%20and%20Localities%2C%20Indexes%2C%20SEIFA%202021.xlsx' ) - - ) - - - if( is.null(year) ){ - year = as.character(max(release_years)) - }else{ - if(! (is.numeric(year) | is.character(year) ) ){ - stop('year must either be an integer or character string.') + urls <- list( + "2011" = c( + "sa1" = "https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20sa1%20indexes.xls&2033.0.55.001&Data%20Cubes&9828E2819C30D96DCA257B43000E923E&0&2011&05.04.2013&Latest", + "sa2" = "https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20SA2%20Indexes.xls&2033.0.55.001&Data%20Cubes&76D0BC44356DC34ACA257B3B001A4913&0&2011&12.11.2014&Latest", + "lga" = "https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20lga%20indexes.xls&2033.0.55.001&Data%20Cubes&28EF8569335AC7CDCA257BAB00136B0F&0&2011&18.07.2013&Latest", + "postcode" = "https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20POA%20Indexes.xls&2033.0.55.001&Data%20Cubes&209B3364525C82CCCA257B3B001A4D56&0&2011&12.11.2014&Latest", + "suburb" = "https://www.abs.gov.au/AUSSTATS/subscriber.nsf/log?openagent&2033.0.55.001%20ssc%20indexes.xls&2033.0.55.001&Data%20Cubes&F40D0630B245D5DCCA257B43000EA0F1&0&2011&05.04.2013&Latest" + ), + "2016" = c( + "sa1" = "https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20sa1%20indexes.xls&2033.0.55.001&Data%20Cubes&40A0EFDE970A1511CA25825D000F8E8D&0&2016&27.03.2018&Latest", + "sa2" = 
"https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20sa2%20indexes.xls&2033.0.55.001&Data%20Cubes&C9F7AD36397CB43DCA25825D000F917C&0&2016&27.03.2018&Latest", + "lga" = "https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20lga%20indexes.xls&2033.0.55.001&Data%20Cubes&5604C75C214CD3D0CA25825D000F91AE&0&2016&27.03.2018&Latest", + "postcode" = "https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20poa%20indexes.xls&2033.0.55.001&Data%20Cubes&DC124D1DAC3D9FDDCA25825D000F9267&0&2016&27.03.2018&Latest", + "suburb" = "https://www.abs.gov.au/ausstats/subscriber.nsf/log?openagent&2033055001%20-%20ssc%20indexes.xls&2033.0.55.001&Data%20Cubes&863031D939DE8105CA25825D000F91D2&0&2016&27.03.2018&Latest" + ), + "2021" = c( + "sa1" = "https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Statistical%20Area%20Level%201%2C%20Indexes%2C%20SEIFA%202021.xlsx", + "sa2" = "https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Statistical%20Area%20Level%202%2C%20Indexes%2C%20SEIFA%202021.xlsx", + "lga" = "https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Local%20Government%20Area%2C%20Indexes%2C%20SEIFA%202021.xlsx", + "postcode" = "https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Postal%20Area%2C%20Indexes%2C%20SEIFA%202021.xlsx", + "suburb" = "https://www.abs.gov.au/statistics/people/people-and-communities/socio-economic-indexes-areas-seifa-australia/2021/Suburbs%20and%20Localities%2C%20Indexes%2C%20SEIFA%202021.xlsx" + ) + ) + + + if (is.null(year)) { + year <- as.character(max(release_years)) + } else { + if (!(is.numeric(year) | is.character(year))) { + stop("year must either be an integer or character string.") } year <- as.character(year) - if(! 
any(year %in% as.character(release_years))){ - stop('year is not a valid release year, please check SEIFA webpage.') + if (!any(year %in% as.character(release_years))) { + stop("year is not a valid release year, please check SEIFA webpage.") } } @@ -106,26 +117,28 @@ get_seifa <- function(structure = c('sa1','sa2','lga','postcode','suburb'), # Get file extension if possible, otherwise assume xls. url_ext <- tools::file_ext(sub("\\?.+", "", url)) - if(url_ext == ""){url_ext <- 'xls'} + if (url_ext == "") { + url_ext <- "xls" + } - filename <- tempfile(fileext = paste0('.',url_ext) ) + filename <- tempfile(fileext = paste0(".", url_ext)) try({ - download.file(url, destfile = filename, mode = 'wb') - message(paste0('ABS ', toupper(structure),' file downloaded to: \n'), - paste0(' ', filename), - appendLF = TRUE) + download.file(url, destfile = filename, mode = "wb") + message(paste0("ABS ", toupper(structure), " file downloaded to: \n"), + paste0(" ", filename), + appendLF = TRUE + ) }) if (file.exists(filename)) { - ind <- map(sheet_names, ~ get_seifa_index_sheet(filename, .x, structure, year), .id = 'seifa_index') %>% - list_rbind(names_to = 'data_subclass') + ind <- map(sheet_names, ~ get_seifa_index_sheet(filename, .x, structure, data_subclass, year), .id = "seifa_index") %>% + list_rbind(names_to = "data_subclass") return(ind) } else { - warning('Download of ABS file failed. Please check your internet connection and try again.') + warning("Download of ABS file failed. 
Please check your internet connection and try again.") return(NULL) } - } @@ -148,77 +161,114 @@ get_seifa <- function(structure = c('sa1','sa2','lga','postcode','suburb'), #' @examples #' \dontrun{ #' -#' get_seifa_index_sheet('downloaded_filename.xls', sheetname = 'Table 2', structure = 'lga') +#' get_seifa_index_sheet("downloaded_filename.xls", sheetname = "Table 2", structure = "lga") #' } #' -get_seifa_index_sheet <- function(filename, sheetname, structure = c('sa1','sa2','lga','postcode','suburb'), year) { - +get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2", "lga", "postcode", "suburb"), data_subclass, year) { structure <- match.arg(structure, several.ok = FALSE) - column_names <- c('area_code', - 'area_name', - 'population', - 'score', - 'blank1', - 'rank_aus', - 'decile_aus', - 'percentile_aus', - 'blank2', - 'state', - 'rank_state', - 'decile_state', - 'percentile_state', - 'min_score_sa1_area', - 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score') + column_names <- c( + "area_code", + "area_name", + "population", + "score", + "blank1", + "rank_aus", + "decile_aus", + "percentile_aus", + "blank2", + "state", + "rank_state", + "decile_state", + "percentile_state", + "min_score_sa1_area", + "max_score_sa1_area", + "percent_usual_resident_pop_without_sa1_score" + ) # Add column for SEIFA releases >= 2016 with structures suburb or postcode. 
- if (structure %in% c('suburb','postcode') && year >= 2016 ) { - column_names <- c(column_names, 'caution_poor_sa1_representation') + if (structure %in% c("suburb", "postcode") && year >= 2016) { + column_names <- c(column_names, "caution_poor_sa1_representation") } - if (structure == 'postcode') { - column_names <- column_names[-grep('area_name', column_names)] - if(year >= 2016){ - column_names <- c(column_names, 'postcode_crosses_state_boundary') + if (structure == "postcode") { + column_names <- column_names[-grep("area_name", column_names)] + if (year >= 2016) { + column_names <- c(column_names, "postcode_crosses_state_boundary") } } - if (structure == 'sa1') { - column_names <- c('sa1_7_code', - 'sa1_11_code', - 'population', - 'score', - 'blank1', - 'rank_aus', - 'decile_aus', - 'percentile_aus', - 'blank2', - 'state', - 'rank_state', - 'decile_state', - 'percentile_state') + if (structure == "sa1") { + column_names <- c( + "sa1_7_code", + "sa1_11_code", + "population", + "score", + "blank1", + "rank_aus", + "decile_aus", + "percentile_aus", + "blank2", + "state", + "rank_state", + "decile_state", + "percentile_state" + ) # remove sa1_11_code column for 2011 release. 
- if( year == 2011) { - column_names <- column_names[-grep('sa1_11_code', column_names)] - }else if( year == 2021) { - column_names <- column_names[-grep('sa1_7_code', column_names)] + if (year == 2011) { + column_names <- column_names[-grep("sa1_11_code", column_names)] + } else if (year == 2021) { + column_names <- column_names[-grep("sa1_7_code", column_names)] + } + } + + if (length(data_subclass) == 1) { + if (data_subclass == "summary") { + if (year != 2011) { + column_names <- c( + "area_code", + "area_name", + "irsed_score", + "irsed_decile", + "irsead_score", + "irsead_decile", + "ier_score", + "ier_decile", + "ieo_score", + "ieo_decile", + "population" + ) + } else if (year == 2011) { + column_names <- c( + "area_code", + "irsed_score", + "irsed_decile", + "irsead_score", + "irsead_decile", + "ier_score", + "ier_decile", + "ieo_score", + "ieo_decile", + "population" + ) + } } } suppressWarnings({ df <- read_excel(filename, - sheetname, - skip = 6, - col_names = column_names, - na = c("", "NA") ) %>% - dplyr::filter(across(ends_with('_code'), ~ !is.na(.x))) %>% - select(-starts_with('blank')) %>% - mutate(structure = structure) %>% + sheetname, + skip = 6, + col_names = column_names, + na = c("", "NA") + ) %>% + dplyr::filter(across(ends_with("_code"), ~ !is.na(.x))) %>% + select(-starts_with("blank")) %>% + mutate(structure = structure, + year = year) %>% relocate(structure) }) return(df) - } diff --git a/man/get_seifa.Rd b/man/get_seifa.Rd index ca2c2f7..7a6b679 100644 --- a/man/get_seifa.Rd +++ b/man/get_seifa.Rd @@ -53,7 +53,7 @@ For All ABS SEIFA spreadsheets go to \href{https://www.abs.gov.au/AUSSTATS/abs@. 
} \examples{ \dontrun{ - get_seifa(structure = 'lga', data_subclass = 'irsed', year = 2016) +get_seifa(structure = "lga", data_subclass = "irsed", year = 2016) } } diff --git a/man/get_seifa_index_sheet.Rd b/man/get_seifa_index_sheet.Rd index b15f9f6..afcf52f 100644 --- a/man/get_seifa_index_sheet.Rd +++ b/man/get_seifa_index_sheet.Rd @@ -8,6 +8,7 @@ get_seifa_index_sheet( filename, sheetname, structure = c("sa1", "sa2", "lga", "postcode", "suburb"), + data_subclass, year ) } @@ -19,6 +20,14 @@ get_seifa_index_sheet( \item{structure}{character spatial structure of the data to be parsed. The spatial structure is important as the shape of the data in the ABS spreadsheets if different for some structures.} +\item{data_subclass}{character vector matching available SEIFA indexes: +\itemize{ + \item{irsed}{ - Index of Relative Socio-economic Disadvantage} + \item{irsead}{ - Index of Relative Socio-economic Advantage and Disadvantage} + \item{ier}{ - Index of Economic Resources} + \item{ieo}{ - Index of Education and Occupation} +}} + \item{year}{a character string or numeric of the release year of SEIFA object, eg "2016"; 2011.} } \value{ @@ -32,7 +41,7 @@ from \url{https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/2033.0.55.0012016 \examples{ \dontrun{ - get_seifa_index_sheet('downloaded_filename.xls', sheetname = 'Table 2', structure = 'lga') +get_seifa_index_sheet("downloaded_filename.xls", sheetname = "Table 2", structure = "lga") } } diff --git a/tests/testthat/test-seifa.R b/tests/testthat/test-seifa.R index c8b4268..05b174b 100644 --- a/tests/testthat/test-seifa.R +++ b/tests/testthat/test-seifa.R @@ -1,5 +1,6 @@ # Define columns for each spreadsheet. 
-column_names <- list( '2011' = list( 'sa1' = c('structure', +column_names <- list( '2011' = list( 'sa1' = c('data_subclass', + 'structure', 'sa1_7_code', 'population', 'score', @@ -9,8 +10,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'state', 'rank_state', 'decile_state', - 'percentile_state'), - 'sa2' =c('structure', + 'percentile_state', + 'year'), + 'sa2' =c('data_subclass', + 'structure', 'area_code', 'area_name', 'population', @@ -24,8 +27,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'percentile_state', 'min_score_sa1_area', 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score'), - 'lga' = c('structure', + 'percent_usual_resident_pop_without_sa1_score', + 'year'), + 'lga' = c('data_subclass', + 'structure', 'area_code', 'area_name', 'population', @@ -39,8 +44,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'percentile_state', 'min_score_sa1_area', 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score'), - 'postcode' = c('structure', + 'percent_usual_resident_pop_without_sa1_score', + 'year'), + 'postcode' = c('data_subclass', + 'structure', 'area_code', 'population', 'score', @@ -53,8 +60,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'percentile_state', 'min_score_sa1_area', 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score'), - 'suburb' = c('structure', + 'percent_usual_resident_pop_without_sa1_score', + 'year'), + 'suburb' = c('data_subclass', + 'structure', 'area_code', 'area_name', 'population', @@ -68,9 +77,11 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'percentile_state', 'min_score_sa1_area', 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score') + 'percent_usual_resident_pop_without_sa1_score', + 'year') ), - '2016' = list( 'sa1' = c('structure', + '2016' = list( 'sa1' = c('data_subclass', + 'structure', 'sa1_7_code', 'sa1_11_code', 'population', @@ -81,8 +92,10 @@ column_names <- 
list( '2011' = list( 'sa1' = c('structure', 'state', 'rank_state', 'decile_state', - 'percentile_state'), - 'sa2' =c('structure', + 'percentile_state', + 'year'), + 'sa2' =c('data_subclass', + 'structure', 'area_code', 'area_name', 'population', @@ -96,8 +109,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'percentile_state', 'min_score_sa1_area', 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score'), - 'lga' = c('structure', + 'percent_usual_resident_pop_without_sa1_score', + 'year'), + 'lga' = c('data_subclass', + 'structure', 'area_code', 'area_name', 'population', @@ -111,8 +126,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'percentile_state', 'min_score_sa1_area', 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score'), - 'postcode' = c('structure', + 'percent_usual_resident_pop_without_sa1_score', + 'year'), + 'postcode' = c('data_subclass', + 'structure', 'area_code', 'population', 'score', @@ -127,8 +144,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'max_score_sa1_area', 'percent_usual_resident_pop_without_sa1_score', 'caution_poor_sa1_representation', - 'postcode_crosses_state_boundary'), - 'suburb' = c('structure', + 'postcode_crosses_state_boundary', + 'year'), + 'suburb' = c('data_subclass', + 'structure', 'area_code', 'area_name', 'population', @@ -143,9 +162,11 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'min_score_sa1_area', 'max_score_sa1_area', 'percent_usual_resident_pop_without_sa1_score', - 'caution_poor_sa1_representation') + 'caution_poor_sa1_representation', + 'year') ), - '2021' = list( 'sa1' = c('structure', + '2021' = list( 'sa1' = c('data_subclass', + 'structure', 'sa1_11_code', 'population', 'score', @@ -155,8 +176,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'state', 'rank_state', 'decile_state', - 'percentile_state'), - 'sa2' =c('structure', + 'percentile_state', + 'year'), + 'sa2' =c('data_subclass', + 
'structure', 'area_code', 'area_name', 'population', @@ -170,8 +193,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'percentile_state', 'min_score_sa1_area', 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score'), - 'lga' = c('structure', + 'percent_usual_resident_pop_without_sa1_score', + 'year'), + 'lga' = c('data_subclass', + 'structure', 'area_code', 'area_name', 'population', @@ -185,8 +210,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'percentile_state', 'min_score_sa1_area', 'max_score_sa1_area', - 'percent_usual_resident_pop_without_sa1_score'), - 'postcode' = c('structure', + 'percent_usual_resident_pop_without_sa1_score', + 'year'), + 'postcode' = c('data_subclass', + 'structure', 'area_code', 'population', 'score', @@ -201,8 +228,10 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'max_score_sa1_area', 'percent_usual_resident_pop_without_sa1_score', 'caution_poor_sa1_representation', - 'postcode_crosses_state_boundary'), - 'suburb' = c('structure', + 'postcode_crosses_state_boundary', + 'year'), + 'suburb' = c('data_subclass', + 'structure', 'area_code', 'area_name', 'population', @@ -217,7 +246,8 @@ column_names <- list( '2011' = list( 'sa1' = c('structure', 'min_score_sa1_area', 'max_score_sa1_area', 'percent_usual_resident_pop_without_sa1_score', - 'caution_poor_sa1_representation') + 'caution_poor_sa1_representation', + 'year') ) ) @@ -297,9 +327,11 @@ test_that('sa1 spreadsheet can be parsed for 2016 release', { mustWork = TRUE), 'Table 2', 'sa1', + 'irsed', year = '2016') expect_is(df, 'data.frame') - expect_equal(colnames(df), column_names[['2016']][['sa1']]) + ### data_subclass is added in the next step + expect_equal(colnames(df), column_names[['2016']][['sa1']][2:length(column_names[['2016']][['sa1']])] ) } ) From 4327c257af23ff11541ac37042d0492a9ea7babd Mon Sep 17 00:00:00 2001 From: Peter Owen Date: Wed, 16 Aug 2023 15:57:31 +1000 Subject: [PATCH 03/11] consider '-' as 
missing value in spreadsheets --- R/seifa.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/seifa.R b/R/seifa.R index 65cc300..086edd4 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -261,7 +261,7 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 sheetname, skip = 6, col_names = column_names, - na = c("", "NA") + na = c("", "NA",'-') ) %>% dplyr::filter(across(ends_with("_code"), ~ !is.na(.x))) %>% select(-starts_with("blank")) %>% From 4ec81441d422b9722dec7d7aa6085d5bd6bf23fb Mon Sep 17 00:00:00 2001 From: Peter Owen Date: Wed, 16 Aug 2023 18:13:00 +1000 Subject: [PATCH 04/11] add cases for sa1s column changes --- R/seifa.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/seifa.R b/R/seifa.R index 086edd4..5be6b05 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -225,10 +225,9 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 if (length(data_subclass) == 1) { if (data_subclass == "summary") { - if (year != 2011) { + if (year == 2011 | (year == 2021 & structure == "sa1" )) { column_names <- c( "area_code", - "area_name", "irsed_score", "irsed_decile", "irsead_score", @@ -239,9 +238,11 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 "ieo_decile", "population" ) - } else if (year == 2011) { + } + else { column_names <- c( "area_code", + "area_name", "irsed_score", "irsed_decile", "irsead_score", @@ -256,6 +257,8 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 } } + + suppressWarnings({ df <- read_excel(filename, sheetname, From a62e35ea5ca7d82da41de7f0f65a9e4f97052099 Mon Sep 17 00:00:00 2001 From: Peter Owen Date: Thu, 17 Aug 2023 13:01:46 +1000 Subject: [PATCH 05/11] update columns for seifa summary tables --- R/seifa.R | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/R/seifa.R b/R/seifa.R index 5be6b05..c6cc5c0 100644 --- a/R/seifa.R +++ 
b/R/seifa.R @@ -225,7 +225,9 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 if (length(data_subclass) == 1) { if (data_subclass == "summary") { - if (year == 2011 | (year == 2021 & structure == "sa1" )) { + if ((year == 2011) | + (year == 2021 & structure == "sa1" ) + ){ column_names <- c( "area_code", "irsed_score", @@ -239,6 +241,38 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 "population" ) } + else if(year == 2016 & structure == 'suburb'){ + column_names <- c( + "area_code", + "area_name", + "irsed_score", + "irsed_decile", + "irsead_score", + "irsead_decile", + "ier_score", + "ier_decile", + "ieo_score", + "ieo_decile", + "population", + 'data_warning' + ) + } + else if (year %in% c(2016,2021) & structure == 'postcode') { + column_names <- c( + "area_code", + "irsed_score", + "irsed_decile", + "irsead_score", + "irsead_decile", + "ier_score", + "ier_decile", + "ieo_score", + "ieo_decile", + "population", + 'data_warning', + 'postcode_crosses_state_boundaries' + ) + } else { column_names <- c( "area_code", From 79c8bd70414bd292073878ba09b2a9c358ca1253 Mon Sep 17 00:00:00 2001 From: Peter Owen Date: Thu, 17 Aug 2023 13:59:00 +1000 Subject: [PATCH 06/11] tidy style --- R/seifa.R | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/R/seifa.R b/R/seifa.R index c6cc5c0..9499bae 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -226,8 +226,8 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 if (length(data_subclass) == 1) { if (data_subclass == "summary") { if ((year == 2011) | - (year == 2021 & structure == "sa1" ) - ){ + (year == 2021 & structure == "sa1") + ) { column_names <- c( "area_code", "irsed_score", @@ -240,8 +240,7 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 "ieo_decile", "population" ) - } - else if(year == 2016 & structure == 'suburb'){ + } else if (year == 2016 & 
structure == "suburb") { column_names <- c( "area_code", "area_name", @@ -254,10 +253,9 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 "ieo_score", "ieo_decile", "population", - 'data_warning' + "data_warning" ) - } - else if (year %in% c(2016,2021) & structure == 'postcode') { + } else if (year %in% c(2016, 2021) & structure == "postcode") { column_names <- c( "area_code", "irsed_score", @@ -269,11 +267,10 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 "ieo_score", "ieo_decile", "population", - 'data_warning', - 'postcode_crosses_state_boundaries' + "data_warning", + "postcode_crosses_state_boundaries" ) - } - else { + } else { column_names <- c( "area_code", "area_name", @@ -298,12 +295,14 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 sheetname, skip = 6, col_names = column_names, - na = c("", "NA",'-') + na = c("", "NA", "-") ) %>% dplyr::filter(across(ends_with("_code"), ~ !is.na(.x))) %>% select(-starts_with("blank")) %>% - mutate(structure = structure, - year = year) %>% + mutate( + structure = structure, + year = year + ) %>% relocate(structure) }) From 5366453457ef9290a01c8fca68242393c8236d4c Mon Sep 17 00:00:00 2001 From: Peter Owen Date: Thu, 17 Aug 2023 15:03:54 +1000 Subject: [PATCH 07/11] update pkgdown --- pkgdown/_pkgdown.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index c29a864..48b0d6d 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -27,6 +27,7 @@ reference: - starts_with("asced") - starts_with("asc") - auholidays + - school_terms - title: "Importing ABS Data" desc: > Functions for retrieving ABS data @@ -34,6 +35,7 @@ reference: - read_absmap - get_seifa - get_seifa_index_sheet + - read_correspondence_tbl - title: "Helper functions" desc: > Functions for cleaning data and working with datasets From 52fd632518fde3800c4f6670f994c44463bdcbc6 Mon Sep 17 00:00:00 2001 
From: peteowen1 Date: Wed, 7 Feb 2024 11:03:15 +1100 Subject: [PATCH 08/11] make area_code a character variable --- R/seifa.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/seifa.R b/R/seifa.R index 9499bae..2003062 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -300,6 +300,7 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 dplyr::filter(across(ends_with("_code"), ~ !is.na(.x))) %>% select(-starts_with("blank")) %>% mutate( + area_code = as.character(area_code), structure = structure, year = year ) %>% From 99f3587e531d10f691559aa1ee4cf8cf21053361 Mon Sep 17 00:00:00 2001 From: peteowen1 Date: Wed, 7 Feb 2024 11:24:23 +1100 Subject: [PATCH 09/11] conditional area_code update only mutate area_code if the column exists also change filtering from across to if_any as per dplyr guidelines --- R/seifa.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/seifa.R b/R/seifa.R index 2003062..0aed079 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -297,15 +297,18 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 col_names = column_names, na = c("", "NA", "-") ) %>% - dplyr::filter(across(ends_with("_code"), ~ !is.na(.x))) %>% + dplyr::filter(if_any(ends_with("_code"), ~ !is.na(.x))) %>% select(-starts_with("blank")) %>% mutate( - area_code = as.character(area_code), structure = structure, year = year ) %>% + mutate(across( + .cols = any_of("area_code"), # Specify the column name + .fns = ~ as.character(.) 
# Conditionally convert to character + )) %>% relocate(structure) }) - + return(df) } From 9f28f44426a76e889fe1c589f5480cf5ab0504a2 Mon Sep 17 00:00:00 2001 From: peteowen1 Date: Wed, 7 Feb 2024 14:21:47 +1100 Subject: [PATCH 10/11] change to ends_with _code might as well make all area codes change to character for consistency (now including the sa1_7 and sa1_11 columns) also filter out if code and/or name is null, not just code --- R/seifa.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/seifa.R b/R/seifa.R index 0aed079..86bf780 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -297,14 +297,14 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 col_names = column_names, na = c("", "NA", "-") ) %>% - dplyr::filter(if_any(ends_with("_code"), ~ !is.na(.x))) %>% + dplyr::filter(if_any(ends_with(c("_name","_code")), ~ !is.na(.x))) %>% select(-starts_with("blank")) %>% mutate( structure = structure, year = year ) %>% mutate(across( - .cols = any_of("area_code"), # Specify the column name + .cols = any_of(ends_with("_code")), # Specify the column name .fns = ~ as.character(.) 
# Conditionally convert to character )) %>% relocate(structure) From 1bdf270f45ddf57be586a5382dff321bd618fdee Mon Sep 17 00:00:00 2001 From: peteowen1 Date: Wed, 7 Feb 2024 14:24:37 +1100 Subject: [PATCH 11/11] needs if_all not if_any MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit filter works best to remove row unless it has a code and a name (this helps remove the © Commonwealth of Australia 2023 row) --- R/seifa.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/seifa.R b/R/seifa.R index 86bf780..be6b3e9 100644 --- a/R/seifa.R +++ b/R/seifa.R @@ -297,7 +297,7 @@ get_seifa_index_sheet <- function(filename, sheetname, structure = c("sa1", "sa2 col_names = column_names, na = c("", "NA", "-") ) %>% - dplyr::filter(if_any(ends_with(c("_name","_code")), ~ !is.na(.x))) %>% + dplyr::filter(if_all(ends_with(c("_name","_code")), ~ !is.na(.x))) %>% select(-starts_with("blank")) %>% mutate( structure = structure,