Skip to content

Commit

Permalink
speed up unpack_nested_data #42
Browse files Browse the repository at this point in the history
handle mixed atomic/data frame column and better explanations in comments
  • Loading branch information
William Dearden authored and jameslamb committed Mar 6, 2018
1 parent b3cffef commit ada870c
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 26 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ vignettes/*\.pdf

# system files
.*\.DS_Store
^.*\.Rproj$
71 changes: 45 additions & 26 deletions R/elasticsearch_parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ chomp_aggs <- function(aggs_json = NULL) {
#' unpackedDT <- unpack_nested_data(chomped_df = sampleChompedDT
#' , col_to_unpack = "details.pastPurchases")
#' print(unpackedDT)
unpack_nested_data <- function(chomped_df, col_to_unpack) {
unpack_nested_data <- function(chomped_df, col_to_unpack) {

# Input checks
if (!("data.table" %in% class(chomped_df))) {
Expand All @@ -390,45 +390,64 @@ unpack_nested_data <- function(chomped_df, col_to_unpack) {
log_fatal(msg)
}

# Avoid side effects
outDT <- data.table::copy(chomped_df)

# Get the column to unpack
listDT <- outDT[[col_to_unpack]]

# Make each row a data.table
listDT <- lapply(listDT, data.table::as.data.table)

# Remove the empty ones... important, due to data.table 1.10.4 bug
oldIDs <- which(sapply(listDT, nrow) != 0)
listDT <- listDT[oldIDs]

# Bind them together with an ID to match to the other data
newDT <- data.table::rbindlist(listDT, fill = TRUE, idcol = TRUE)

# If we tried to unpack an empty column, fail
if (nrow(newDT) == 0) {
# Check for empty column
if (all(lengths(listDT) == 0)) {
msg <- "The column given to unpack_nested_data had no data in it."
log_fatal(msg)
}

# Fix the ID because we may have removed some empty elements due to that bug
newDT[, .id := oldIDs[.id]]
listDT[lengths(listDT) == 0] <- NA

# Merge
outDT[, .id := .I]
outDT <- newDT[outDT, on = ".id"]
is_df <- purrr::map_lgl(listDT, is.data.frame)
is_atomic <- purrr::map_lgl(listDT, is.atomic)
is_na <- is.na(listDT)

# Bind packed column into one data.table
if (all(is_atomic)) {
newDT <- data.table::as.data.table(unlist(listDT))
} else if (all(is_df | is_atomic)) {
# If the packed column contains a mixture of data tables, we need to
# to convert the atomic vectors to data.tables

# Find column name to use for NA vectors
first_df <- min(which(is_df))
col_name <- names(listDT[[first_df]])[1]

# Convert non data.frame rows to data.table and assign name to rows
# with no name
prep_row <- function(x) {
if (is.atomic(x)) {
x <- data.table::as.data.table(x)
if (is.na(x)) names(x) <- col_name
else names(x) <- col_to_unpack
}
x
}
newDT <- purrr::map(listDT, prep_row)

newDT <- data.table::rbindlist(newDT, fill = TRUE)
} else {
msg <- paste0("Each row in column ", col_to_unpack, " must be a data frame or a vector.")
futile.logger::flog.fatal(msg)
stop(msg)
}

# Remove the id column and the original column
outDT <- outDT[, !c(".id", col_to_unpack), with = FALSE]
# Create the unpacked data.table by replicating the originally unpacked
# columns by the number of rows in each entry in the original unpacked column
times_to_replicate <- pmax(purrr::map_int(listDT, NROW), 1)
# Replicate the rows of the data.table by entries of times_to_replicate but drop col_to_unpack
replicatedDT <- chomped_df[rep(1:nrow(chomped_df), times_to_replicate)]
replicatedDT[, col_to_unpack] <- NULL

# Rename unpacked column if it didn't get a name
# Then bind the replicated columns with the unpacked column
outDT <- data.table::data.table(newDT, replicatedDT)
if ("V1" %in% names(outDT)) {
data.table::setnames(outDT, "V1", col_to_unpack)
}

return(outDT)

}

#' @title Hits to data.tables
Expand Down
29 changes: 29 additions & 0 deletions tests/testthat/test-elasticsearch_parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,35 @@ futile.logger::flog.threshold(0)
regexp = "The column given to unpack_nested_data had no data in it")}
)

test_that("unpack_nested_data should break if the column contains a non data frame/vector", {
DT <- data.table::data.table(x = 1:2, y = list(list(2), 3))
expect_error(unpack_nested_data(chomped_df = DT, col_to_unpack = "y")
, regexp = "must be a data frame or a vector")
})

test_that("unpack_nested_data should handle NA and empty rows", {
DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table(w = 5:6, z = 7:8)))
DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table(w = 5:6, z = 7:8)))
unpackedDT <- data.table::data.table(
w = c(NA, 5, 6)
, z = c(NA, 7, 8)
, x = c(1, 2, 2)
)
expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
expect_equal(unpack_nested_data(DT2, col_to_unpack = "y"), unpackedDT)
})

test_that("unpack_nested_data should handle mixed atomic/data frame column", {
DT <- data.table::data.table(x = 1:2, y = list(1, data.table(w = 5:6, z = 7:8)))
unpackedDT <- data.table::data.table(
y = c(1, NA, NA)
, w = c(NA, 5, 6)
, z = c(NA, 7, 8)
, x = c(1, 2, 2)
)
expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
})

#---- 5. .ConvertToSec

# .ConvertToSec should work for seconds
Expand Down

0 comments on commit ada870c

Please sign in to comment.