Merge pull request #13 from carriedaymont/fix-github-tests

Fix GitHub tests, including explicit cluster creation to make parallel processing work on Windows; fixes #12
mitre · Nov 27, 2020 · a6e30ba
2 parents fbd61a7 + a58d21b
commit a6e30ba
Show file tree

Hide file tree

Showing 10 changed files with 53 additions and 37 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -20,6 +20,8 @@ Imports:
     Hmisc (>= 4.4-0),
     labelled (>= 2.5.0),
     magrittr (>= 1.5)
+Depends:
+    R (>= 2.10)
 License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true

diff --git a/NAMESPACE b/NAMESPACE
@@ -16,5 +16,6 @@ import(dplyr, except = c(last, first, summarize, src, between))
 import(foreach)
 import(labelled)
 import(magrittr)
+import(parallel)
 import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize))
 import(tidyr, except = extract)
diff --git a/R/cdc.R b/R/cdc.R
@@ -131,12 +131,6 @@ ext_bmiz <- function(data,
                      bmi = "bmi",
                      adjust.integer.age = T,
                      ref.data.path = "") {
-  library(data.table, quietly = T)
-  library(dplyr, quietly = T)
-  library(Hmisc, quietly = T)
-  library(magrittr, quietly = T)
-  library(labelled, quietly = T)
-
   setDT(data)
 
   setnames(data,

diff --git a/R/growth.R b/R/growth.R
@@ -1598,6 +1598,7 @@ cleanbatch <- function(data.df,
 #' matching same ageday measurements for the other parameter. Options include "default" (standard growthcleanr approach),
 #' and "flag.both" (in case of two measurements of one type without matching values for the other parameter, flag both
 #' for exclusion if beyond threshold)
+#' @param height.tolerance.cm maximum decrease in height tolerated for sequential measurements
 #' @param error.load.mincount minimum count of exclusions on parameter before
 #' considering excluding all measurements. Defaults to 2.
 #' @param error.load.threshold threshold of percentage of excluded measurement count to included measurement
@@ -1648,6 +1649,7 @@ cleanbatch <- function(data.df,
 #' @rawNamespace import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize))
 #' @import foreach
 #' @import doParallel
+#' @import parallel
 #' @examples
 #' # Run calculation using a small subset of given data
 #' df_stats <- as.data.frame(syngrowth)
@@ -1660,17 +1662,17 @@ cleanbatch <- function(data.df,
 #'                          measurement = df_stats$measurement)
 #'
 #' # Once processed you can filter data based on result value
-#' df_stats <- cbind(df_stats, "clean_result" == clean_stats)
-#' clean_df_stats <- df_stats[, df_stats$clean_result == "Include"]
+#' df_stats <- cbind(df_stats, "clean_result" = clean_stats)
+#' clean_df_stats <- df_stats[df_stats$clean_result == "Include",]
 #'
-#' # Parallel processing: run using 3 cores and batches
-#' df_stats<-cleangrowth(subjid = df_stats$subjid,
-#'                       param = df_stats$param,
-#'                       agedays = df_stats$agedays,
-#'                       sex = df_stats$sex,
-#'                       measurement = df_stats$measurement,
-#'                       parallel = TRUE,
-#'                       num.batches = 2)
+#' # Parallel processing: run using 2 cores and batches
+#' clean_stats <- cleangrowth(subjid = df_stats$subjid,
+#'                            param = df_stats$param,
+#'                            agedays = df_stats$agedays,
+#'                            sex = df_stats$sex,
+#'                            measurement = df_stats$measurement,
+#'                            parallel = TRUE,
+#'                            num.batches = 2)
 cleangrowth <- function(subjid,
                         param,
                         agedays,
@@ -1707,10 +1709,17 @@ cleangrowth <- function(subjid,
 
   # if parallel processing is desired, load additional modules
   if (parallel) {
-    registerDoParallel(cores = num.batches)
     if (is.na(num.batches)) {
       num.batches <- getDoParWorkers()
     }
+    # variables needed for parallel workers
+    var_for_par <- c("temporary_duplicates", "valid", "swap_parameters",
+                     "na_as_false", "ewma", "read_anthro", "as_matrix_delta",
+                     "sd_median")
+
+    cl <- makeCluster(num.batches)
+    clusterExport(cl = cl, varlist = var_for_par, envir = environment())
+    registerDoParallel(cl)
   } else {
     if (is.na(num.batches))
       num.batches <- 1
@@ -1986,7 +1995,7 @@ cleangrowth <- function(subjid,
       error.load.threshold = error.load.threshold,
       error.load.mincount = error.load.mincount
     )
-    stopImplicitCluster()
+    stopCluster(cl)
   }
   if (!quietly)
     cat(sprintf("[%s] Done!\n", Sys.time()))

diff --git a/R/utils.R b/R/utils.R
@@ -9,7 +9,7 @@
 #' @param df data frame to split
 #' @param fname new name for each of the split files to start with
 #' @param fdir directory to put each of the split files (default working directory)
-#' @param min_row minimum number of rows for each split file (default 10000)
+#' @param min_nrow minimum number of rows for each split file (default 10000)
 #' @param keepcol the column name (default "subjid") to use to keep records with the same values together in the same single split file
 #'
 #' @return the count number referring to the last split file written
@@ -33,11 +33,11 @@
 splitinput <-
   function(df,
            fname = deparse(substitute(df)),
-           fdir = "",
+           fdir = ".",
            min_nrow = 10000,
            keepcol = 'subjid') {
     # first, check if the given directory exists
-    if (fdir != "" & is.character(fdir) & !dir.exists(fdir)){
+    if (fdir != "." & is.character(fdir) & !dir.exists(fdir)){
       stop("invalid directory")
     }
 
@@ -144,6 +144,7 @@ recode_sex <- function(input_data,
 #' @param agedays name of age (in days) descriptor column
 #' @param param name of parameter column to identify each type of measurement
 #' @param measurement name of measurement column containing the actual measurement data
+#' @param clean_value name of column of cleaned values from growthcleanr::cleangrowth()
 #' @param include_all Determines whether the function keeps all exclusion codes. If TRUE, all exclusion types are kept and the inclusion_types argument is ignored. Defaults to FALSE.
 #' @param inclusion_types Vector indicating which exclusion codes from the cleaning algorithm should be included in the data, given that include_all is FALSE. For all options, see growthcleanr::cleangrowth(). Defaults to c("Include").
 #'

diff --git a/README.md b/README.md
@@ -1,5 +1,7 @@
 # growthcleanr
 
+![R-CMD-check](https://github.com/carriedaymont/growthcleanr/workflows/R-CMD-check/badge.svg?branch=main)
+
 R package for cleaning data from Electronic Health Record systems, focused on
 cleaning height and weight measurements.
 
@@ -66,6 +68,7 @@ following packages:
 * `data.table`
 * `foreach`
 * `doParallel`
+* `parallel`
 * `dplyr`
 * `Hmisc`
 * `labelled`

diff --git a/man/cleangrowth.Rd b/man/cleangrowth.Rd
diff --git a/man/longwide.Rd b/man/longwide.Rd
diff --git a/man/splitinput.Rd b/man/splitinput.Rd
diff --git a/man/syngrowth.Rd b/man/syngrowth.Rd