Skip to content

Commit

Permalink
Made it a merge instead of rep
Browse files: browse the repository at this point in the history
  • Loading branch information
Austin Dickey committed Apr 12, 2018
1 parent f9b4516 commit 721cd9c
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 30 deletions.
31 changes: 16 additions & 15 deletions R/elasticsearch_parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -378,10 +378,6 @@ unpack_nested_data <- function(chomped_df, col_to_unpack) {
msg <- "For unpack_nested_data, chomped_df must be a data.table"
log_fatal(msg)
}
if (".id" %in% names(chomped_df)) {
msg <- "For unpack_nested_data, chomped_df cannot have a column named '.id'"
log_fatal(msg)
}
if (!("character" %in% class(col_to_unpack)) || length(col_to_unpack) != 1) {
msg <- "For unpack_nested_data, col_to_unpack must be a character of length 1"
log_fatal(msg)
Expand All @@ -391,7 +387,15 @@ unpack_nested_data <- function(chomped_df, col_to_unpack) {
log_fatal(msg)
}

listDT <- chomped_df[[col_to_unpack]]
inDT <- data.table::copy(chomped_df)

# Define a column name to store original row ID
joinCol <- uuid::UUIDgenerate()
inDT[, (joinCol) := .I]

# Take out the packed column
listDT <- inDT[[col_to_unpack]]
inDT[, (col_to_unpack) := NULL]

# Check for empty column
if (all(purrr::map_int(listDT, NROW) == 0)) {
Expand All @@ -409,6 +413,7 @@ unpack_nested_data <- function(chomped_df, col_to_unpack) {
# Bind packed column into one data.table
if (all(is_atomic)) {
newDT <- data.table::as.data.table(unlist(listDT))
newDT[, (joinCol) := rep(seq_along(listDT), lengths(listDT))]
} else if (all(is_df | is_list | is_na)) {
# Find name to use for NA columns
first_df <- min(which(is_df))
Expand All @@ -422,25 +427,21 @@ unpack_nested_data <- function(chomped_df, col_to_unpack) {

# If the packed column contains data.tables, we use rbindlist
newDT <- purrr::map_if(listDT, is_na, .prep_na_row, col_name = col_name)
newDT <- data.table::rbindlist(newDT, fill = TRUE)
newDT <- data.table::rbindlist(newDT, fill = TRUE, idcol = joinCol)
} else {
msg <- paste0("Each row in column ", col_to_unpack, " must be a data frame or a vector.")
log_fatal(msg)
}

# Create the unpacked data.table by replicating the originally unpacked
# columns by the number of rows in each entry in the original unpacked column
# We don't use newDT because it doesn't have the original row lengths
times_to_replicate <- pmax(purrr::map_int(listDT, NROW), 1)
# Replicate the rows of the data.table by entries of times_to_replicate but drop col_to_unpack
replicatedDT <- chomped_df[rep(1:nrow(chomped_df), times_to_replicate)]
replicatedDT[, col_to_unpack] <- NULL
# Then bind the replicated columns with the unpacked column
outDT <- data.table::data.table(newDT, replicatedDT)
# Join it back in
outDT <- inDT[newDT, on = joinCol]
outDT[, (joinCol) := NULL]

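# When every entry was atomic, the unpacked values sit in a column named "V1"
# (from as.data.table(unlist(listDT))); rename it back to col_to_unpack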
if ("V1" %in% names(outDT)) {
data.table::setnames(outDT, "V1", col_to_unpack)
}

return(outDT)
}

Expand Down
23 changes: 8 additions & 15 deletions tests/testthat/test-elasticsearch_parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -845,8 +845,8 @@ futile.logger::flog.threshold(0)
, col_to_unpack = "details.appData")
expect_true("data.table" %in% class(unpackedDT))
expect_equivalent(dim(unpackedDT), c(7, 8))
expect_named(unpackedDT, c('appName', 'minutes', 'value', 'typovalue', 'dateTime', 'username',
'details.interactions', 'details.userType'))
expect_named(unpackedDT, c('dateTime', 'username', 'details.interactions',
'details.userType', 'appName', 'minutes', 'value', 'typovalue'))
expect_identical(unpackedDT$appName, c('farmville', 'candy_crush', 'angry_birds',
'minesweeper', 'pokemon_go', 'pokemon_stay',
'block_dude'))
Expand All @@ -868,8 +868,8 @@ futile.logger::flog.threshold(0)
, col_to_unpack = "details.minutes")
expect_true("data.table" %in% class(unpackedDT))
expect_equivalent(dim(unpackedDT), c(8, 5))
expect_named(unpackedDT, c('details.minutes', 'dateTime', 'username',
'details.interactions', 'details.userType'))
expect_named(unpackedDT, c('dateTime', 'username', 'details.interactions',
'details.userType', 'details.minutes'))
expect_equivalent(unpackedDT$details.minutes, c(500, 350, 422, NA, 28, 190, 1, 796))
expect_identical(unpackedDT$username, c(rep("Austin1", 3), "Austin2", rep("Austin3", 4)))
})
Expand All @@ -881,13 +881,6 @@ futile.logger::flog.threshold(0)
regexp = "chomped_df must be a data.table")}
)

# Should break if chomped_df already has a column named ".id"
test_that("unpack_nested_data should break if you pass a data.table with column '.id'",
{expect_error(unpack_nested_data(chomped_df = data.table::data.table(wow = 7, .id = 8)
, col_to_unpack = "blah"),
regexp = "chomped_df cannot have a column named '.id'")}
)

# Should break if col_to_unpack is not a string
test_that("unpack_nested_data should break if col_to_unpack is not a string",
{expect_error(unpack_nested_data(chomped_df = data.table::data.table(wow = 7)
Expand Down Expand Up @@ -923,12 +916,12 @@ futile.logger::flog.threshold(0)
})

test_that("unpack_nested_data should handle NA and empty rows", {
DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table(w = 5:6, z = 7:8)))
DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table(w = 5:6, z = 7:8)))
DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table::data.table(w = 5:6, z = 7:8)))
DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table::data.table(w = 5:6, z = 7:8)))
unpackedDT <- data.table::data.table(
w = c(NA, 5, 6)
x = c(1, 2, 2)
, w = c(NA, 5, 6)
, z = c(NA, 7, 8)
, x = c(1, 2, 2)
)
expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
expect_equal(unpack_nested_data(DT2, col_to_unpack = "y"), unpackedDT)
Expand Down

0 comments on commit 721cd9c

Please sign in to comment.