speed up unpack_nested_data #42

handle mixed atomic/data frame column and better explanations in comments
uptake · Mar 6, 2018 · ada870c · ada870c
1 parent b3cffef
commit ada870c
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 26 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -45,3 +45,4 @@ vignettes/*\.pdf
 
 # system files
 .*\.DS_Store
+^.*\.Rproj$
diff --git a/R/elasticsearch_parsers.R b/R/elasticsearch_parsers.R
@@ -370,7 +370,7 @@ chomp_aggs <- function(aggs_json = NULL) {
 #' unpackedDT <- unpack_nested_data(chomped_df = sampleChompedDT
 #'                                  , col_to_unpack = "details.pastPurchases")
 #' print(unpackedDT)
-unpack_nested_data <- function(chomped_df, col_to_unpack) {
+unpack_nested_data <- function(chomped_df, col_to_unpack)  {
 
     # Input checks
     if (!("data.table" %in% class(chomped_df))) {
@@ -390,45 +390,64 @@ unpack_nested_data <- function(chomped_df, col_to_unpack) {
         log_fatal(msg)
     }
 
-    # Avoid side effects
     outDT <- data.table::copy(chomped_df)
-
-    # Get the column to unpack
     listDT <- outDT[[col_to_unpack]]
 
-    # Make each row a data.table
-    listDT <- lapply(listDT, data.table::as.data.table)
-
-    # Remove the empty ones... important, due to data.table 1.10.4 bug
-    oldIDs <- which(sapply(listDT, nrow) != 0)
-    listDT <- listDT[oldIDs]
-
-    # Bind them together with an ID to match to the other data
-    newDT <- data.table::rbindlist(listDT, fill = TRUE, idcol = TRUE)
-
-    # If we tried to unpack an empty column, fail
-    if (nrow(newDT) == 0) {
+    # Check for empty column
+    if (all(lengths(listDT) == 0)) {
         msg <- "The column given to unpack_nested_data had no data in it."
         log_fatal(msg)
     }
 
-    # Fix the ID because we may have removed some empty elements due to that bug
-    newDT[, .id := oldIDs[.id]]
+    listDT[lengths(listDT) == 0] <- NA
 
-    # Merge
-    outDT[, .id := .I]
-    outDT <- newDT[outDT, on = ".id"]
+    is_df <- purrr::map_lgl(listDT, is.data.frame)
+    is_atomic <- purrr::map_lgl(listDT, is.atomic)
+    is_na <- is.na(listDT)
+
+    # Bind packed column into one data.table
+    if (all(is_atomic)) {
+        newDT <- data.table::as.data.table(unlist(listDT))
+    } else if (all(is_df | is_atomic)) {
+        # If the packed column contains a mixture of data tables, we need to 
+        # to convert the atomic vectors to data.tables
+
+        # Find column name to use for NA vectors
+        first_df <- min(which(is_df))
+        col_name <- names(listDT[[first_df]])[1]
+
+        # Convert non data.frame rows to data.table and assign name to rows
+        # with no name
+        prep_row <- function(x) {
+            if (is.atomic(x)) {
+                x <- data.table::as.data.table(x)
+                if (is.na(x)) names(x) <- col_name
+                else names(x) <- col_to_unpack
+            } 
+            x
+        }
+        newDT <- purrr::map(listDT, prep_row)
+
+        newDT <- data.table::rbindlist(newDT, fill = TRUE)
+    } else {
+        msg <- paste0("Each row in column ", col_to_unpack, " must be a data frame or a vector.")
+        futile.logger::flog.fatal(msg)
+        stop(msg)
+    }
 
-    # Remove the id column and the original column
-    outDT <- outDT[, !c(".id", col_to_unpack), with = FALSE]
+    # Create the unpacked data.table by replicating the originally unpacked
+    # columns by the number of rows in each entry in the original unpacked column
+    times_to_replicate <- pmax(purrr::map_int(listDT, NROW), 1)
+    # Replicate the rows of the data.table by entries of times_to_replicate but drop col_to_unpack
+    replicatedDT <- chomped_df[rep(1:nrow(chomped_df), times_to_replicate)]
+    replicatedDT[, col_to_unpack] <- NULL
 
-    # Rename unpacked column if it didn't get a name
+    # Then bind the replicated columns with the unpacked column
+    outDT <- data.table::data.table(newDT, replicatedDT)
     if ("V1" %in% names(outDT)) {
         data.table::setnames(outDT, "V1", col_to_unpack)
     }
-
     return(outDT)
-
 }
 
 #' @title Hits to data.tables

diff --git a/tests/testthat/test-elasticsearch_parsers.R b/tests/testthat/test-elasticsearch_parsers.R
@@ -916,6 +916,35 @@ futile.logger::flog.threshold(0)
                             regexp = "The column given to unpack_nested_data had no data in it")}
     )
 
+    test_that("unpack_nested_data should break if the column contains a non data frame/vector", {
+        DT <- data.table::data.table(x = 1:2, y = list(list(2), 3))
+        expect_error(unpack_nested_data(chomped_df = DT, col_to_unpack = "y")
+                     , regexp = "must be a data frame or a vector")
+    })
+
+    test_that("unpack_nested_data should handle NA and empty rows", {
+        DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table(w = 5:6, z = 7:8)))
+        DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table(w = 5:6, z = 7:8)))
+        unpackedDT <- data.table::data.table(
+            w = c(NA, 5, 6)
+            , z = c(NA, 7, 8)
+            , x = c(1, 2, 2)
+        )
+        expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
+        expect_equal(unpack_nested_data(DT2, col_to_unpack = "y"), unpackedDT)
+    })
+
+    test_that("unpack_nested_data should handle mixed atomic/data frame column", {
+        DT <- data.table::data.table(x = 1:2, y = list(1, data.table(w = 5:6, z = 7:8)))
+        unpackedDT <- data.table::data.table(
+            y = c(1, NA, NA)
+            , w = c(NA, 5, 6)
+            , z = c(NA, 7, 8)
+            , x = c(1, 2, 2)
+        )
+        expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
+    })
+
 #---- 5. .ConvertToSec
 
     # .ConvertToSec should work for seconds
Original file line number	Diff line number	Diff line change
Expand Up		@@ -45,3 +45,4 @@ vignettes/*\.pdf

		# system files
		.*\.DS_Store
		^.*\.Rproj$