diff --git a/DESCRIPTION b/DESCRIPTION index 36d68bdd..1505acc9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,7 +16,9 @@ Imports: farff, Rcpp (>= 1.0.5), RcppParallel, - stringr + stringr, + caret, + pracma Encoding: UTF-8 LinkingTo: Rcpp, BH (>= 1.51.0), RcppParallel RoxygenNote: 7.2.3 diff --git a/NAMESPACE b/NAMESPACE index 1bc1103e..fc2aaccd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,15 @@ # Generated by roxygen2: do not edit by hand -export(aurocEXPLORE) -export(changeSetting) -export(getSetting) +export(modelsCurveExplore) export(predictExplore) -export(saveData) -export(settingsExplore) +export(rocCurveExplore) export(trainExplore) import(Rcpp) import(checkmate) importFrom(RcppParallel,RcppParallelLibs) +importFrom(caret,confusionMatrix) importFrom(farff,writeARFF) +importFrom(pracma,trapz) importFrom(stringr,str_extract) importFrom(stringr,str_replace_all) importFrom(stringr,str_split_fixed) diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index 1831d0d6..2b4ecd25 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -6,7 +6,6 @@ #' #' @return A parameter value, character. #' @importFrom stringr str_extract str_replace_all -#' @export getSetting <- function(settings, parameter, type = "value") { extraction <- stringr::str_extract(settings, paste0(parameter , "=.*?\u000A"))[[1]] extraction <- stringr::str_replace_all(extraction, "\\n", "") @@ -33,8 +32,6 @@ getSetting <- function(settings, parameter, type = "value") { #' #' @return A setting parameter value #' @importFrom utils write.table -#' -#' @export changeSetting <- function(settings, parameter, input, default_setting) { current_setting <- getSetting(settings, parameter, type = "complete") @@ -77,8 +74,6 @@ changeSetting <- function(settings, parameter, input, default_setting) { #' #' @importFrom farff writeARFF #' @importFrom utils write.table -#' -#' @export saveData <- function(output_path, train_data, file_name) { # Save data as arff file @@ -96,12 +91,4 @@ saveData <- function(output_path, train_data, file_name) { row.names = FALSE) # TODO: Support other file formats? -} - -simple_auc <- function(TPR, FPR){ - # inputs already sorted, best scores first - # TODO: different computation? is it same as standard packages (how LASSO computed)? - dFPR <- c(diff(FPR), 0) - dTPR <- c(diff(TPR), 0) - sum(TPR * dFPR) + sum(dTPR * dFPR)/2 -} +} \ No newline at end of file diff --git a/R/MainFunctions.R b/R/MainFunctions.R index 369ff2c9..b556af11 100644 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -16,8 +16,9 @@ #' @param ClassFeature String, should be name of one of columns in data train. Always provided by the user. The string should be enclused in single quotation marks, e.g. 'class' #' @param PositiveClass 1 or string (?) (should be one of elements of column 'ClassFeature' in data train). Always provided by the user. The string should be enclused in single quotation marks, e.g. 'class' #' @param FeatureInclude Empty or string (should be name of one of columns in data train) -#' @param Maximize One of list with strings, list = "ACCURACY", ... +#' @param Maximize One of list with strings, list = "ACCURACY", "SENSITIVITY", "SPECIFICITY", ... #' @param Accuracy Float 0-0.999 -> default = 0 (if 0, make empty = computationally more beneficial) +#' @param BalancedAccuracy Float 0-0.999 -> default = 0 (if 0, make empty = computationally more beneficial) #' @param Specificity float 0-0.999, default = 0 #' @param PrintSettings True or False #' @param PrintPerformance True or False @@ -43,17 +44,18 @@ trainExplore <- function(train_data = NULL, FeatureInclude = "", Maximize = "ACCURACY", Accuracy = 0, + BalancedAccuracy = 0, Specificity = 0, PrintSettings = TRUE, PrintPerformance = TRUE, Subsumption = TRUE, BranchBound = TRUE, Parallel = FALSE) { - + if (!dir.exists(output_path)) { dir.create(output_path, recursive = TRUE) - } - + } + # Create output folder if(!endsWith(output_path, "/")) { warning("Output path should end with /, add this") @@ -68,7 +70,7 @@ trainExplore <- function(train_data = NULL, OutputFile <- paste0(output_path, file_name, ".result") } else { checkmate::checkFileExists(OutputFile, - add = errorMessage) + add = errorMessage) } # check settings_path @@ -91,6 +93,7 @@ trainExplore <- function(train_data = NULL, checkString(FeatureInclude), checkString(Maximize), checkDouble(Accuracy), + checkDouble(BalancedAccuracy), checkDouble(Specificity), checkLogical(PrintSettings), checkLogical(PrintPerformance), @@ -101,13 +104,14 @@ trainExplore <- function(train_data = NULL, combine = "and" ) checkmate::reportAssertions(collection = errorMessage) - + PrintSettings <- ifelse(PrintSettings == TRUE, "yes", "no") PrintPerformance <- ifelse(PrintPerformance == TRUE, "yes", "no") Subsumption <- ifelse(Subsumption == TRUE, "yes", "no") BranchBound <- ifelse(BranchBound == TRUE, "yes", "no") Parallel <- ifelse(Parallel == TRUE, "yes", "no") - Accuracy <- ifelse(Accuracy == 0, "", Specificity) + Accuracy <- ifelse(Accuracy == 0, "", Accuracy) + BalancedAccuracy <- ifelse(BalancedAccuracy == 0, "", BalancedAccuracy) Specificity <- ifelse(Specificity == 0, "", Specificity) # Create project setting @@ -146,6 +150,7 @@ trainExplore <- function(train_data = NULL, FeatureInclude = FeatureInclude, Maximize = Maximize, Accuracy = Accuracy, + BalancedAccuracy = BalancedAccuracy, Specificity = Specificity, PrintSettings = PrintSettings, PrintPerformance = PrintPerformance, @@ -163,7 +168,7 @@ trainExplore <- function(train_data = NULL, # Load model rule_string <- stringr::str_extract(results, "Best candidate \\(overall\\):.*?\u000A") - + # Clean string rule_string <- stringr::str_replace(rule_string, "Best candidate \\(overall\\):", "") rule_string <- stringr::str_replace_all(rule_string, " ", "") @@ -200,7 +205,6 @@ trainExplore <- function(train_data = NULL, #' #' @return Settings path #' @import checkmate -#' @export settingsExplore <- function(settings, output_path, # C++ cannot handle spaces in file path well, avoid those file_name, @@ -215,6 +219,7 @@ settingsExplore <- function(settings, FeatureInclude = "", Maximize = "ACCURACY", Accuracy = 0, + BalancedAccuracy = 0, Specificity = 0, PrintSettings = "yes", PrintPerformance = "yes", @@ -222,7 +227,7 @@ settingsExplore <- function(settings, BranchBound = "yes", Parallel = "no") { - + # Insert location training data and cutoff file if train_data is entered if (!is.null(train_data)) { settings <- changeSetting(settings, parameter = "DataFile", input = paste0(output_path, file_name, ".arff")) @@ -240,6 +245,7 @@ settingsExplore <- function(settings, settings <- changeSetting(settings, parameter = "FeatureInclude", input = FeatureInclude) settings <- changeSetting(settings, parameter = "Maximize", input = Maximize) settings <- changeSetting(settings, parameter = "Accuracy", input = Accuracy) + settings <- changeSetting(settings, parameter = "BalancedAccuracy", input = BalancedAccuracy) settings <- changeSetting(settings, parameter = "Specificity", input = Specificity) settings <- changeSetting(settings, parameter = "PrintSettings", input = PrintSettings) settings <- changeSetting(settings, parameter = "PrintPerformance", input = PrintPerformance) @@ -303,7 +309,7 @@ predictExplore <- function(model, test_data) { } -#' aucrocExplore +#' modelsCurveExplore # TODO: update documentation? #' #' @param output_path A string declaring the path to the settings #' @param train_data Train data @@ -312,33 +318,91 @@ predictExplore <- function(model, test_data) { #' @param ... List of arguments #' #' @import checkmate -#' @return auroc +#' @return models for different sensitivities/specificities #' @export -aurocEXPLORE <- function(output_path, train_data, settings_path, file_name, ...) { - # TODO: check with latest implementation in PLP +modelsCurveExplore <- function(train_data = NULL, + settings_path = NULL, + output_path, + file_name = "train_data", + OutputFile = NULL, + StartRulelength = 1, + EndRulelength = 3, + OperatorMethod = "EXHAUSTIVE", + CutoffMethod = "RVAC", + ClassFeature = "'class'", + PositiveClass = "'Iris-versicolor'", + FeatureInclude = "", + Maximize = "ACCURACY", + Accuracy = 0, + BalancedAccuracy = 0, + Specificity = 0, + PrintSettings = TRUE, + PrintPerformance = TRUE, + Subsumption = TRUE, + BranchBound = TRUE, + Parallel = FALSE) { + # TODO: only input required variables? # Range of specificities to check - specificities <- seq(from = 0.01, to = 0.99, by = 0.02) + constraints <- c(seq(0.05,0.65,0.1), seq(0.75,0.97,0.02)) - # Set specificity constraint and maximize sensitivity - sensitivities <- rep(NA, length(specificities)) - for (s in 1:length(specificities)) { # s <- 0.1 - - model <- trainExplore(output_path = output_path, train_data = train_data, settings_path = settings_path, Maximize = "SENSITIVITY", Specificity = specificities[s], ...) + modelsCurve <- tryCatch({ + models <- sapply(constraints, function(constraint) { + print(paste0("Model for specificity: ", as.character(constraint))) + + # Fit EXPLORE + model <- Explore::trainExplore(output_path = file.path(output_path, "modelsCurve"), train_data = train_data, + settings_path = settings_path, + file_name = paste0("explore_specificity", as.character(constraint)), + OutputFile = OutputFile, + StartRulelength = StartRulelength, EndRulelength = EndRulelength, + OperatorMethod = OperatorMethod, CutoffMethod = CutoffMethod, + ClassFeature = ClassFeature, PositiveClass = PositiveClass, + FeatureInclude = FeatureInclude, Maximize = "SENSITIVITY", + Accuracy = Accuracy, BalancedAccuracy = BalancedAccuracy, Specificity = constraint, + PrintSettings = PrintSettings, PrintPerformance = PrintPerformance, + Subsumption = Subsumption, BranchBound = BranchBound, + Parallel = Parallel) + + return(model) + }) + }, + finally = warning("No model for specificity.") + ) + + return(modelsCurve) +} + + +#' rocCurveExplore +#' +#' @return auc value for EXPLORE +#' @export +#' @importFrom caret confusionMatrix +#' @importFrom pracma trapz +rocCurveExplore <- function(modelsCurve, data, labels) { # labels <- cohort$outcomeCount + + # TODO: input checks? + + # Combine all these results + curve_TPR <- c(1,0) + curve_FPR <- c(1,0) + + for (c in length(modelsCurve):1) { + model <- modelsCurve[c] - # Extract sensitivity from results file - results <- paste(readLines(paste0(output_path, "train_data.result")), collapse="\n") + # Predict using train and test + predict <- tryCatch(as.numeric(Explore::predictExplore(model = model, test_data = data))) - sensitivity <- stringr::str_extract_all(results, "Train-set: .*?\u000A")[[1]] - sensitivity <- stringr::str_extract(results, "SE:.*? ")[[1]] - sensitivity <- stringr::str_remove_all(sensitivity, "SE:") - sensitivity <- stringr::str_replace_all(sensitivity, " ", "") + # Compute metrics + conf_matrix <- table(factor(predict, levels = c(0,1)), factor(labels, levels = c(0,1))) # binary prediction + performance <- caret::confusionMatrix(conf_matrix, positive = '1') - sensitivities[s] <- as.numeric(sensitivity) + curve_TPR[c+2] <- performance$byClass['Sensitivity'] + curve_FPR[c+2] <- 1 - performance$byClass['Specificity'] } - auroc <- simple_auc(TPR = rev(sensitivities), FPR = rev(1 - specificities)) - # plot(1-specificities, sensitivities) + roc <- pracma::trapz(curve_FPR[length(curve_FPR):1],curve_TPR[length(curve_TPR):1]) - return(auroc) + return (roc) } diff --git a/inst/examples/iris.project b/inst/examples/iris.project index 627a13f8..1c907e75 100644 --- a/inst/examples/iris.project +++ b/inst/examples/iris.project @@ -24,6 +24,7 @@ FeatureRule= [Constraints] Maximize=ACCURACY Accuracy= +BalancedAccuracy= Specificity= [Output] OutputMethod=BEST diff --git a/inst/settings/template.project b/inst/settings/template.project index 7b639c0f..cec9bc2e 100755 --- a/inst/settings/template.project +++ b/inst/settings/template.project @@ -24,6 +24,7 @@ FeatureRule= [Constraints] Maximize=ACCURACY Accuracy= +BalancedAccuracy= Specificity= [Output] OutputMethod=BEST diff --git a/man/Explore-package.Rd b/man/Explore-package.Rd index 627b0d1b..16e8f97f 100644 --- a/man/Explore-package.Rd +++ b/man/Explore-package.Rd @@ -23,13 +23,5 @@ Authors: \item Aniek Markus \email{a.markus@erasmusmc.nl} (\href{https://orcid.org/0000-0001-5779-4794}{ORCID}) } -} -\seealso{ -Useful links: -\itemize{ - \item \url{https://cbarbozaerasmus.github.io/Explore/} - \item \url{https://mi-erasmusmc.github.io/Explore/} -} - } \keyword{internal} diff --git a/man/aurocEXPLORE.Rd b/man/aurocEXPLORE.Rd deleted file mode 100644 index f7d85d71..00000000 --- a/man/aurocEXPLORE.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MainFunctions.R -\name{aurocEXPLORE} -\alias{aurocEXPLORE} -\title{aucrocExplore} -\usage{ -aurocEXPLORE(output_path, train_data, settings_path, file_name, ...) -} -\arguments{ -\item{output_path}{A string declaring the path to the settings} - -\item{train_data}{Train data} - -\item{settings_path}{A string declaring the path to the settings} - -\item{file_name}{A string declaring the the path to the file name} - -\item{...}{List of arguments} -} -\value{ -auroc -} -\description{ -aucrocExplore -} diff --git a/man/modelsCurveExplore.Rd b/man/modelsCurveExplore.Rd new file mode 100644 index 00000000..0f0b79c1 --- /dev/null +++ b/man/modelsCurveExplore.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MainFunctions.R +\name{modelsCurveExplore} +\alias{modelsCurveExplore} +\title{modelsCurveExplore # TODO: update documentation?} +\usage{ +modelsCurveExplore( + train_data = NULL, + settings_path = NULL, + output_path, + file_name = "train_data", + OutputFile = NULL, + StartRulelength = 1, + EndRulelength = 3, + OperatorMethod = "EXHAUSTIVE", + CutoffMethod = "RVAC", + ClassFeature = "'class'", + PositiveClass = "'Iris-versicolor'", + FeatureInclude = "", + Maximize = "ACCURACY", + Accuracy = 0, + BalancedAccuracy = 0, + Specificity = 0, + PrintSettings = TRUE, + PrintPerformance = TRUE, + Subsumption = TRUE, + BranchBound = TRUE, + Parallel = FALSE +) +} +\arguments{ +\item{train_data}{Train data} + +\item{settings_path}{A string declaring the path to the settings} + +\item{output_path}{A string declaring the path to the settings} + +\item{file_name}{A string declaring the the path to the file name} + +\item{...}{List of arguments} +} +\value{ +models for different sensitivities/specificities +} +\description{ +modelsCurveExplore # TODO: update documentation? +} diff --git a/man/rocCurveExplore.Rd b/man/rocCurveExplore.Rd new file mode 100644 index 00000000..9f60a4e2 --- /dev/null +++ b/man/rocCurveExplore.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MainFunctions.R +\name{rocCurveExplore} +\alias{rocCurveExplore} +\title{rocCurveExplore} +\usage{ +rocCurveExplore(modelsCurve, data, labels) +} +\value{ +auc value for EXPLORE +} +\description{ +rocCurveExplore +} diff --git a/man/settingsExplore.Rd b/man/settingsExplore.Rd index 554aa1f8..a423f8c0 100644 --- a/man/settingsExplore.Rd +++ b/man/settingsExplore.Rd @@ -19,6 +19,7 @@ settingsExplore( FeatureInclude = "", Maximize = "ACCURACY", Accuracy = 0, + BalancedAccuracy = 0, Specificity = 0, PrintSettings = "yes", PrintPerformance = "yes", diff --git a/man/trainExplore.Rd b/man/trainExplore.Rd index a0148b37..05e73e3b 100644 --- a/man/trainExplore.Rd +++ b/man/trainExplore.Rd @@ -19,6 +19,7 @@ trainExplore( FeatureInclude = "", Maximize = "ACCURACY", Accuracy = 0, + BalancedAccuracy = 0, Specificity = 0, PrintSettings = TRUE, PrintPerformance = TRUE, @@ -52,10 +53,12 @@ trainExplore( \item{FeatureInclude}{Empty or string (should be name of one of columns in data train)} -\item{Maximize}{One of list with strings, list = "ACCURACY", ...} +\item{Maximize}{One of list with strings, list = "ACCURACY", "SENSITIVITY", "SPECIFICITY", ...} \item{Accuracy}{Float 0-0.999 -> default = 0 (if 0, make empty = computationally more beneficial)} +\item{BalancedAccuracy}{Float 0-0.999 -> default = 0 (if 0, make empty = computationally more beneficial)} + \item{Specificity}{float 0-0.999, default = 0} \item{PrintSettings}{True or False} diff --git a/tests/testthat/test-MainFunctions.R b/tests/testthat/test-MainFunctions.R index 1879cbd7..de5f6fc5 100644 --- a/tests/testthat/test-MainFunctions.R +++ b/tests/testthat/test-MainFunctions.R @@ -95,11 +95,17 @@ test_that("compute AUC", { } output_path <- paste0(output_path, "/") data <- farff::readARFF(data_path) - auroc <- Explore::aurocEXPLORE(output_path = output_path, - train_data = data, - settings_path = settings_path, - ClassFeature = "'class'", - PositiveClass = '"Iris-versicolor"') + + modelsCurve <- Explore::modelsCurveExplore(output_path = output_path, + train_data = data, + settings_path = settings_path, + ClassFeature = "'class'", + PositiveClass = '"Iris-versicolor"') + + auroc <- Explore::rocCurveExplore(modelsCurve = modelsCurve, + data = data, + labels = ifelse(data["class"] == "Iris-versicolor", 1, 0)) + expect_equal(class(auroc), "numeric") expect_true(auroc < 100) expect_true(auroc > 0)