Made it a merge (data.table join on a row-ID column) instead of rep()-based row replication

uptake · Apr 12, 2018 · 721cd9c · 721cd9c
1 parent f9b4516
commit 721cd9c
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 30 deletions.
diff --git a/R/elasticsearch_parsers.R b/R/elasticsearch_parsers.R
@@ -378,10 +378,6 @@ unpack_nested_data <- function(chomped_df, col_to_unpack)  {
         msg <- "For unpack_nested_data, chomped_df must be a data.table"
         log_fatal(msg)
     }
-    if (".id" %in% names(chomped_df)) {
-        msg <- "For unpack_nested_data, chomped_df cannot have a column named '.id'"
-        log_fatal(msg)
-    }
     if (!("character" %in% class(col_to_unpack)) || length(col_to_unpack) != 1) {
         msg <- "For unpack_nested_data, col_to_unpack must be a character of length 1"
         log_fatal(msg)
@@ -391,7 +387,15 @@ unpack_nested_data <- function(chomped_df, col_to_unpack)  {
         log_fatal(msg)
     }
 
-    listDT <- chomped_df[[col_to_unpack]]
+    inDT <- data.table::copy(chomped_df)
+
+    # Define a column name to store original row ID
+    joinCol <- uuid::UUIDgenerate()
+    inDT[, (joinCol) := .I]
+
+    # Take out the packed column
+    listDT <- inDT[[col_to_unpack]]
+    inDT[, (col_to_unpack) := NULL]
 
     # Check for empty column
     if (all(purrr::map_int(listDT, NROW) == 0)) {
@@ -409,6 +413,7 @@ unpack_nested_data <- function(chomped_df, col_to_unpack)  {
     # Bind packed column into one data.table
     if (all(is_atomic)) {
         newDT <- data.table::as.data.table(unlist(listDT))
+        newDT[, (joinCol) := rep(seq_along(listDT), lengths(listDT))]
     } else if (all(is_df | is_list | is_na)) {
 	    # Find name to use for NA columns
         first_df <- min(which(is_df))
@@ -422,25 +427,21 @@ unpack_nested_data <- function(chomped_df, col_to_unpack)  {
 
 	    # If the packed column contains data.tables, we use rbindlist
         newDT <- purrr::map_if(listDT, is_na, .prep_na_row, col_name = col_name)
-        newDT <- data.table::rbindlist(newDT, fill = TRUE)
+        newDT <- data.table::rbindlist(newDT, fill = TRUE, idcol = joinCol)
     } else {
         msg <- paste0("Each row in column ", col_to_unpack, " must be a data frame or a vector.")
         log_fatal(msg)
     }
 
-    # Create the unpacked data.table by replicating the originally unpacked
-    # columns by the number of rows in each entry in the original unpacked column
-    # We don't use newDT because it doesn't have the original row lengths
-    times_to_replicate <- pmax(purrr::map_int(listDT, NROW), 1)
-    # Replicate the rows of the data.table by entries of times_to_replicate but drop col_to_unpack
-    replicatedDT <- chomped_df[rep(1:nrow(chomped_df), times_to_replicate)]
-    replicatedDT[, col_to_unpack] <- NULL
-    # Then bind the replicated columns with the unpacked column
-    outDT <- data.table::data.table(newDT, replicatedDT)
+    # Join the unpacked rows back onto the remaining columns via the original row ID
+    outDT <- inDT[newDT, on = joinCol]
+    outDT[, (joinCol) := NULL]
 
+    # In the all-atomic case the unpacked values are in a column named V1; rename it to col_to_unpack
     if ("V1" %in% names(outDT)) {
         data.table::setnames(outDT, "V1", col_to_unpack)
     }
+
     return(outDT)
 }
 

diff --git a/tests/testthat/test-elasticsearch_parsers.R b/tests/testthat/test-elasticsearch_parsers.R
@@ -845,8 +845,8 @@ futile.logger::flog.threshold(0)
                                                , col_to_unpack = "details.appData")
               expect_true("data.table" %in% class(unpackedDT))
               expect_equivalent(dim(unpackedDT), c(7, 8))
-              expect_named(unpackedDT, c('appName', 'minutes', 'value', 'typovalue', 'dateTime', 'username',
-                           'details.interactions', 'details.userType'))
+              expect_named(unpackedDT, c('dateTime', 'username', 'details.interactions', 
+                                         'details.userType', 'appName', 'minutes', 'value', 'typovalue'))
               expect_identical(unpackedDT$appName, c('farmville', 'candy_crush', 'angry_birds',
                                                      'minesweeper', 'pokemon_go', 'pokemon_stay',
                                                      'block_dude'))
@@ -868,8 +868,8 @@ futile.logger::flog.threshold(0)
                                                , col_to_unpack = "details.minutes")
               expect_true("data.table" %in% class(unpackedDT))
               expect_equivalent(dim(unpackedDT), c(8, 5))
-              expect_named(unpackedDT, c('details.minutes', 'dateTime', 'username',
-                                         'details.interactions', 'details.userType'))
+              expect_named(unpackedDT, c('dateTime', 'username', 'details.interactions', 
+                                         'details.userType', 'details.minutes'))
               expect_equivalent(unpackedDT$details.minutes, c(500, 350, 422, NA, 28, 190, 1, 796))
               expect_identical(unpackedDT$username, c(rep("Austin1", 3), "Austin2", rep("Austin3", 4)))
               })
@@ -881,13 +881,6 @@ futile.logger::flog.threshold(0)
                             regexp = "chomped_df must be a data.table")}
              )
 
-    # Should break if chomped_df already has a column named ".id"
-    test_that("unpack_nested_data should break if you pass a data.table with column '.id'",
-              {expect_error(unpack_nested_data(chomped_df = data.table::data.table(wow = 7, .id = 8)
-                                               , col_to_unpack = "blah"),
-                            regexp = "chomped_df cannot have a column named '.id'")}
-             )
-
     # Should break if col_to_unpack is not a string
     test_that("unpack_nested_data should break if col_to_unpack is not a string",
               {expect_error(unpack_nested_data(chomped_df = data.table::data.table(wow = 7)
@@ -923,12 +916,12 @@ futile.logger::flog.threshold(0)
     })
 
     test_that("unpack_nested_data should handle NA and empty rows", {
-        DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table(w = 5:6, z = 7:8)))
-        DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table(w = 5:6, z = 7:8)))
+        DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table::data.table(w = 5:6, z = 7:8)))
+        DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table::data.table(w = 5:6, z = 7:8)))
         unpackedDT <- data.table::data.table(
-            w = c(NA, 5, 6)
+            x = c(1, 2, 2)
+            , w = c(NA, 5, 6)
             , z = c(NA, 7, 8)
-            , x = c(1, 2, 2)
         )
         expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT)
         expect_equal(unpack_nested_data(DT2, col_to_unpack = "y"), unpackedDT)