From eb76dd43c5a93905bbd2d8b0b2071ec690a43405 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Tue, 24 Nov 2020 17:12:44 -0500 Subject: [PATCH 01/14] Fix parallel workers for cleangrowth --- R/growth.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/R/growth.R b/R/growth.R index 6c9999dd..841c421d 100644 --- a/R/growth.R +++ b/R/growth.R @@ -1648,6 +1648,7 @@ cleanbatch <- function(data.df, #' @rawNamespace import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize)) #' @import foreach #' @import doParallel +#' @import parallel #' @examples #' # Run calculation using a small subset of given data #' df_stats <- as.data.frame(syngrowth) @@ -1707,10 +1708,16 @@ cleangrowth <- function(subjid, # if parallel processing is desired, load additional modules if (parallel) { - registerDoParallel(cores = num.batches) if (is.na(num.batches)) { num.batches <- getDoParWorkers() } + # variables needed for parallel workers + var_for_par <- c("temporary_duplicates", "valid", "swap_parameters", + "na_as_false") + + cl <- makeCluster(num.batches) + clusterExport(cl = cl, varlist = var_for_par) + registerDoParallel(cl) } else { if (is.na(num.batches)) num.batches <- 1 @@ -1968,7 +1975,8 @@ cleangrowth <- function(subjid, .(batch), cleanbatch, .parallel = parallel, - .paropts = list(.packages = "data.table"), + .paropts = list(.packages = "data.table", + .export = var_for_par), log.path = log.path, quietly = quietly, parallel = parallel, @@ -1986,7 +1994,7 @@ cleangrowth <- function(subjid, error.load.threshold = error.load.threshold, error.load.mincount = error.load.mincount ) - stopImplicitCluster() + stopCluster(cl) } if (!quietly) cat(sprintf("[%s] Done!\n", Sys.time())) From af1a275b541f1e5edebe9d1427ad9cbf0c2c0451 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Tue, 24 Nov 2020 17:15:58 -0500 Subject: [PATCH 02/14] Add parallel dependency --- DESCRIPTION | 3 ++- NAMESPACE | 1 + README.md | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index ea2497d6..b841978b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,4 +1,4 @@ -Package: growthcleanr +liPackage: growthcleanr Type: Package Title: Growth Measurements Cleaner Version: 1.2.2 @@ -17,6 +17,7 @@ Imports: dplyr (>= 1.0.1), foreach (>= 1.5.0), doParallel (>= 1.0.15), + parallel (>=4.0.3), Hmisc (>= 4.4-0), labelled (>= 2.5.0), magrittr (>= 1.5) diff --git a/NAMESPACE b/NAMESPACE index 1956714d..1a3e85e0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,5 +16,6 @@ import(dplyr, except = c(last, first, summarize, src, between)) import(foreach) import(labelled) import(magrittr) +import(parallel) import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize)) import(tidyr, except = extract) diff --git a/README.md b/README.md index e65e8005..7528317e 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ following packages: * `data.table` * `foreach` * `doParallel` +* `parallel` * `dplyr` * `Hmisc` * `labelled` From 663a47152f69f834036aba2420a1b4c196969594 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Tue, 24 Nov 2020 17:20:59 -0500 Subject: [PATCH 03/14] Fix parallel example --- R/growth.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/growth.R b/R/growth.R index 841c421d..c02a2d19 100644 --- a/R/growth.R +++ b/R/growth.R @@ -1661,17 +1661,17 @@ cleanbatch <- function(data.df, #' measurement = df_stats$measurement) #' #' # Once processed you can filter data based on result value -#' df_stats <- cbind(df_stats, "clean_result" == clean_stats) -#' clean_df_stats <- df_stats[, df_stats$clean_result == "Include"] +#' df_stats <- cbind(df_stats, "clean_result" = clean_stats) +#' clean_df_stats <- df_stats[df_stats$clean_result == "Include",] #' #' # Parallel processing: run using 3 cores and batches -#' df_stats<-cleangrowth(subjid = df_stats$subjid, -#' param = df_stats$param, -#' agedays = df_stats$agedays, -#' sex = df_stats$sex, -#' measurement = df_stats$measurement, -#' parallel = TRUE, -#' num.batches = 2) +#' clean_stats <- cleangrowth(subjid = df_stats$subjid, +#' param = df_stats$param, +#' agedays = df_stats$agedays, +#' sex = df_stats$sex, +#' measurement = df_stats$measurement, +#' parallel = TRUE, +#' num.batches = 2) cleangrowth <- function(subjid, param, agedays, From 29f85dab47a94cbba55e29d252e658c17b60f0d1 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Tue, 24 Nov 2020 17:21:32 -0500 Subject: [PATCH 04/14] Update syngrowth documentation --- man/syngrowth.Rd | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/man/syngrowth.Rd b/man/syngrowth.Rd index 127ad4a8..4ef36192 100644 --- a/man/syngrowth.Rd +++ b/man/syngrowth.Rd @@ -4,8 +4,10 @@ \name{syngrowth} \alias{syngrowth} \title{syngrowth} -\format{A data frame with six variables: \code{id}, \code{subjid}, -\code{sex}, \code{agedays}, \code{param}, and \code{measurement}} +\format{ +A data frame with six variables: \code{id}, \code{subjid}, +\code{sex}, \code{agedays}, \code{param}, and \code{measurement} +} \usage{ syngrowth } From 9277bfefa441d5d84e994405cbb816a5ede33761 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Tue, 24 Nov 2020 17:48:59 -0500 Subject: [PATCH 05/14] Fix typo --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b841978b..18f83439 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,4 +1,4 @@ -liPackage: growthcleanr +Package: growthcleanr Type: Package Title: Growth Measurements Cleaner Version: 1.2.2 @@ -17,7 +17,7 @@ Imports: dplyr (>= 1.0.1), foreach (>= 1.5.0), doParallel (>= 1.0.15), - parallel (>=4.0.3), + parallel (>= 4.0.3), Hmisc (>= 4.4-0), labelled (>= 2.5.0), magrittr (>= 1.5) From 85badc3769bf197e1d3caf4dabba801a8308ac29 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Tue, 24 Nov 2020 17:56:41 -0500 Subject: [PATCH 06/14] Update documentation --- man/cleangrowth.Rd | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/man/cleangrowth.Rd b/man/cleangrowth.Rd index 0bc36364..f7fe6ac1 100644 --- a/man/cleangrowth.Rd +++ b/man/cleangrowth.Rd @@ -132,15 +132,15 @@ clean_stats <-cleangrowth(subjid = df_stats$subjid, measurement = df_stats$measurement) # Once processed you can filter data based on result value -df_stats <- cbind(df_stats, "clean_result" == clean_stats) -clean_df_stats <- df_stats[, df_stats$clean_result == "Include"] +df_stats <- cbind(df_stats, "clean_result" = clean_stats) +clean_df_stats <- df_stats[df_stats$clean_result == "Include",] # Parallel processing: run using 3 cores and batches -df_stats<-cleangrowth(subjid = df_stats$subjid, - param = df_stats$param, - agedays = df_stats$agedays, - sex = df_stats$sex, - measurement = df_stats$measurement, - parallel = TRUE, - num.batches = 2) +clean_stats <- cleangrowth(subjid = df_stats$subjid, + param = df_stats$param, + agedays = df_stats$agedays, + sex = df_stats$sex, + measurement = df_stats$measurement, + parallel = TRUE, + num.batches = 2) } From 01425ef12662e0df72f72b3ae2995c4a0f081e62 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Wed, 25 Nov 2020 09:23:57 -0500 Subject: [PATCH 07/14] Fix comments and parallel environment --- R/growth.R | 7 +++---- man/cleangrowth.Rd | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/R/growth.R b/R/growth.R index c02a2d19..b7ca9c0b 100644 --- a/R/growth.R +++ b/R/growth.R @@ -1664,7 +1664,7 @@ cleanbatch <- function(data.df, #' df_stats <- cbind(df_stats, "clean_result" = clean_stats) #' clean_df_stats <- df_stats[df_stats$clean_result == "Include",] #' -#' # Parallel processing: run using 3 cores and batches +#' # Parallel processing: run using 2 cores and batches #' clean_stats <- cleangrowth(subjid = df_stats$subjid, #' param = df_stats$param, #' agedays = df_stats$agedays, @@ -1716,7 +1716,7 @@ cleangrowth <- function(subjid, "na_as_false") cl <- makeCluster(num.batches) - clusterExport(cl = cl, varlist = var_for_par) + clusterExport(cl = cl, varlist = var_for_par, envir = environment()) registerDoParallel(cl) } else { if (is.na(num.batches)) @@ -1975,8 +1975,7 @@ cleangrowth <- function(subjid, .(batch), cleanbatch, .parallel = parallel, - .paropts = list(.packages = "data.table", - .export = var_for_par), + .paropts = list(.packages = "data.table"), log.path = log.path, quietly = quietly, parallel = parallel, diff --git a/man/cleangrowth.Rd b/man/cleangrowth.Rd index f7fe6ac1..c3f535ba 100644 --- a/man/cleangrowth.Rd +++ b/man/cleangrowth.Rd @@ -135,7 +135,7 @@ clean_stats <-cleangrowth(subjid = df_stats$subjid, df_stats <- cbind(df_stats, "clean_result" = clean_stats) clean_df_stats <- df_stats[df_stats$clean_result == "Include",] -# Parallel processing: run using 3 cores and batches +# Parallel processing: run using 2 cores and batches clean_stats <- cleangrowth(subjid = df_stats$subjid, param = df_stats$param, agedays = df_stats$agedays, From 30b49a9154dfe49f4d3d3692bcbdecdb46f34c0e Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Wed, 25 Nov 2020 09:53:10 -0500 Subject: [PATCH 08/14] Fix path specification in splitinput --- R/utils.R | 4 ++-- man/splitinput.Rd | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/utils.R b/R/utils.R index 5013f630..08963c32 100644 --- a/R/utils.R +++ b/R/utils.R @@ -33,11 +33,11 @@ splitinput <- function(df, fname = deparse(substitute(df)), - fdir = "", + fdir = ".", min_nrow = 10000, keepcol = 'subjid') { # first, check if the given directory exists - if (fdir != "" & is.character(fdir) & !dir.exists(fdir)){ + if (fdir != "." & is.character(fdir) & !dir.exists(fdir)){ stop("invalid directory") } diff --git a/man/splitinput.Rd b/man/splitinput.Rd index 35ad739d..2974a5d7 100644 --- a/man/splitinput.Rd +++ b/man/splitinput.Rd @@ -7,7 +7,7 @@ splitinput( df, fname = deparse(substitute(df)), - fdir = "", + fdir = ".", min_nrow = 10000, keepcol = "subjid" ) From c03eff94b2530d37ddc066ce53b38b51695ba9b4 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Wed, 25 Nov 2020 10:00:49 -0500 Subject: [PATCH 09/14] Update documentation for missing variables --- R/growth.R | 1 + R/utils.R | 3 ++- man/cleangrowth.Rd | 2 ++ man/longwide.Rd | 2 ++ man/splitinput.Rd | 4 ++-- 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/R/growth.R b/R/growth.R index b7ca9c0b..25c198d7 100644 --- a/R/growth.R +++ b/R/growth.R @@ -1598,6 +1598,7 @@ cleanbatch <- function(data.df, #' matching same ageday measurements for the other parameter. Options include "default" (standard growthcleanr approach), #' and "flag.both" (in case of two measurements of one type without matching values for the other parameter, flag both #' for exclusion if beyond threshold) +#' @param height.tolerance.cm maximum decrease in height tolerated for sequential measurements #' @param error.load.mincount minimum count of exclusions on parameter before #' considering excluding all measurements. Defaults to 2. #' @param error.load.threshold threshold of percentage of excluded measurement count to included measurement diff --git a/R/utils.R b/R/utils.R index 08963c32..5d9212fe 100644 --- a/R/utils.R +++ b/R/utils.R @@ -9,7 +9,7 @@ #' @param df data frame to split #' @param fname new name for each of the split files to start with #' @param fdir directory to put each of the split files (default working directory) -#' @param min_row minimum number of rows for each split file (default 10000) +#' @param min_nrow minimum number of rows for each split file (default 10000) #' @param keepcol the column name (default "subjid") to use to keep records with the same values together in the same single split file #' #' @return the count number referring to the last split file written @@ -144,6 +144,7 @@ recode_sex <- function(input_data, #' @param agedays name of age (in days) descriptor column #' @param param name of parameter column to identify each type of measurement #' @param measurement name of measurement column containing the actual measurement data +#' @param clean_value name of column of cleaned values from growthcleanr::cleangrowth() #' @param include_all Determines whether the function keeps all exclusion codes. If TRUE, all exclusion types are kept and the inclusion_types argument is ignored. Defaults to FALSE. #' @param inclusion_types Vector indicating which exclusion codes from the cleaning algorithm should be included in the data, given that include_all is FALSE. For all options, see growthcleanr::cleangrowth(). Defaults to c("Include"). #' diff --git a/man/cleangrowth.Rd b/man/cleangrowth.Rd index c3f535ba..59a737b2 100644 --- a/man/cleangrowth.Rd +++ b/man/cleangrowth.Rd @@ -60,6 +60,8 @@ matching same ageday measurements for the other parameter. Options include "defa and "flag.both" (in case of two measurements of one type without matching values for the other parameter, flag both for exclusion if beyond threshold)} +\item{height.tolerance.cm}{maximum decrease in height tolerated for sequential measurements} + \item{error.load.mincount}{minimum count of exclusions on parameter before considering excluding all measurements. Defaults to 2.} diff --git a/man/longwide.Rd b/man/longwide.Rd index e3ed183a..ef731bb5 100644 --- a/man/longwide.Rd +++ b/man/longwide.Rd @@ -32,6 +32,8 @@ longwide( \item{measurement}{name of measurement column containing the actual measurement data} +\item{clean_value}{name of column of cleaned values from growthcleanr::cleangrowth()} + \item{include_all}{Determines whether the function keeps all exclusion codes. If TRUE, all exclusion types are kept and the inclusion_types argument is ignored. Defaults to FALSE.} \item{inclusion_types}{Vector indicating which exclusion codes from the cleaning algorithm should be included in the data, given that include_all is FALSE. For all options, see growthcleanr::cleangrowth(). Defaults to c("Include").} diff --git a/man/splitinput.Rd b/man/splitinput.Rd index 2974a5d7..1548527f 100644 --- a/man/splitinput.Rd +++ b/man/splitinput.Rd @@ -19,9 +19,9 @@ splitinput( \item{fdir}{directory to put each of the split files (default working directory)} -\item{keepcol}{the column name (default "subjid") to use to keep records with the same values together in the same single split file} +\item{min_nrow}{minimum number of rows for each split file (default 10000)} -\item{min_row}{minimum number of rows for each split file (default 10000)} +\item{keepcol}{the column name (default "subjid") to use to keep records with the same values together in the same single split file} } \value{ the count number referring to the last split file written From fbac35f92eaeae880b7b7c618be8e53a537af532 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Wed, 25 Nov 2020 10:00:59 -0500 Subject: [PATCH 10/14] Add R dependency --- DESCRIPTION | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 18f83439..accc07f3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,8 @@ Imports: Hmisc (>= 4.4-0), labelled (>= 2.5.0), magrittr (>= 1.5) +Depends: + R (>= 2.10) License: MIT + file LICENSE Encoding: UTF-8 LazyData: true From 9b2db4069b561473e186680657d8bed986259023 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Wed, 25 Nov 2020 10:43:53 -0500 Subject: [PATCH 11/14] Add more support functions for parallelism --- R/growth.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/growth.R b/R/growth.R index 25c198d7..6f7c18cd 100644 --- a/R/growth.R +++ b/R/growth.R @@ -1714,7 +1714,8 @@ cleangrowth <- function(subjid, } # variables needed for parallel workers var_for_par <- c("temporary_duplicates", "valid", "swap_parameters", - "na_as_false") + "na_as_false", "ewma", "read_anthro", "as_matrix_delta", + "sd_median") cl <- makeCluster(num.batches) clusterExport(cl = cl, varlist = var_for_par, envir = environment()) From 0a02076b94d59dd8126ac0812a3b68814388711a Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Wed, 25 Nov 2020 11:05:29 -0500 Subject: [PATCH 12/14] Remove library calls --- R/cdc.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/R/cdc.R b/R/cdc.R index a4aef362..13777654 100644 --- a/R/cdc.R +++ b/R/cdc.R @@ -131,12 +131,6 @@ ext_bmiz <- function(data, bmi = "bmi", adjust.integer.age = T, ref.data.path = "") { - library(data.table, quietly = T) - library(dplyr, quietly = T) - library(Hmisc, quietly = T) - library(magrittr, quietly = T) - library(labelled, quietly = T) - setDT(data) setnames(data, From 6ed4614fa62d535ee574d970dc9d576157fac2a9 Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Wed, 25 Nov 2020 11:53:54 -0500 Subject: [PATCH 13/14] Remove parallel dependency (a base function) --- DESCRIPTION | 1 - 1 file changed, 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index accc07f3..2480a94b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,6 @@ Imports: dplyr (>= 1.0.1), foreach (>= 1.5.0), doParallel (>= 1.0.15), - parallel (>= 4.0.3), Hmisc (>= 4.4-0), labelled (>= 2.5.0), magrittr (>= 1.5) From a58d21b4b178fdeb2c4eb87274a5ab068b8e53ab Mon Sep 17 00:00:00 2001 From: Hannah De los Santos Date: Wed, 25 Nov 2020 12:25:11 -0500 Subject: [PATCH 14/14] Add check button to readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7528317e..cf7e8168 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # growthcleanr +![R-CMD-check](https://github.com/carriedaymont/growthcleanr/workflows/R-CMD-check/badge.svg?branch=main) + R package for cleaning data from Electronic Health Record systems, focused on cleaning height and weight measurements.