Variable selection - update all methods #15

Open · wants to merge 30 commits into base: variable_selection

Changes from all commits · 30 commits
7051ecd add manual CV for IHT (egillax, Aug 8, 2023)
0c64c23 fix hyperParamSearch (egillax, Aug 8, 2023)
6b59ce2 fix notes and warnings (egillax, Aug 8, 2023)
a4b7cd0 fix docs (egillax, Aug 8, 2023)
4d0bb7f print out progress (egillax, Aug 21, 2023)
245edf2 add timing for debugging (egillax, Aug 21, 2023)
180cea8 added boruta (egillax, Aug 22, 2023)
8d94f2d fix namespace (egillax, Aug 22, 2023)
ae918ef add boruta docs (egillax, Aug 22, 2023)
2273cde add python checks and more parameters (egillax, Aug 22, 2023)
5b88f76 add timing to runPlp (egillax, Aug 22, 2023)
c1f1ca3 add missing nJobs (egillax, Aug 22, 2023)
43f30c8 use sparse boruta (egillax, Aug 23, 2023)
76d1ac9 fix python env check (egillax, Aug 30, 2023)
81da03a docs for boruta (egillax, Sep 5, 2023)
36e6cb6 Update univariate + forward/backward selection (AniekMarkus, Oct 26, 2023)
264c620 Move NJMIM to main script (AniekMarkus, Oct 26, 2023)
a94ca73 Remove python univariate selection (AniekMarkus, Oct 26, 2023)
913330f Merge remote-tracking branch 'origin/variable_selection_boruta' into … (AniekMarkus, Oct 26, 2023)
7a0b461 Add boruta to file variable selection (AniekMarkus, Oct 26, 2023)
0ce773f Merge remote-tracking branch 'origin/variable_selection_iht_cv' into … (AniekMarkus, Oct 26, 2023)
e829364 Update docs (AniekMarkus, Oct 26, 2023)
b6a0693 Consistency names (AniekMarkus, Oct 31, 2023)
f36920c Correct tests feature engineering + importance (AniekMarkus, Nov 2, 2023)
7c385c8 handle case k < #covariates (egillax, Nov 7, 2023)
3ee55bd Correction documentation (AniekMarkus, Nov 9, 2023)
749c97a Rename param covariateIdsInclude to covariateIdsSelected for new methods (AniekMarkus, Nov 10, 2023)
de8dff9 Add check for K in NJMIM (AniekMarkus, Nov 14, 2023)
e52c65f Update shiny config to match OhdsiShinyModules (AniekMarkus, Dec 18, 2023)
76ac0f5 Update to fix shiny viewer (AniekMarkus, Dec 18, 2023)
NAMESPACE (6 changes: 3 additions & 3 deletions)
@@ -9,6 +9,7 @@ export(addMultipleDiagnosePlpToDatabase)
export(addMultipleRunPlpToDatabase)
export(addRunPlpToDatabase)
export(averagePrecision)
export(borutaSettings)
export(brierScore)
export(calibrationLine)
export(computeAuc)
@@ -34,7 +35,6 @@ export(createSampleSettings)
export(createStudyPopulation)
export(createStudyPopulationSettings)
export(createTempModelLoc)
export(createUnivariateFeatureSelection)
export(createValidationSettings)
export(diagnoseMultiplePlp)
export(diagnosePlp)
@@ -105,13 +105,13 @@ export(setPythonEnvironment)
export(setRandomForest)
export(setRidgeRegression)
export(setSVM)
export(setStepwiseSelection)
export(setUnivariateSelection)
export(simulatePlpData)
export(sklearnFromJson)
export(sklearnToJson)
export(splitData)
export(stepwiseSettings)
export(toSparseM)
export(univariateSettings)
export(validateMultiplePlp)
export(viewDatabaseResultPlp)
export(viewMultiplePlp)
R/CyclopsModels.R (158 changes: 150 additions & 8 deletions)
@@ -39,28 +39,28 @@ fitCyclopsModel <- function(
time = .data$survivalTime
)

covariates <- filterCovariateIds(param, trainData$covariateData)
trainData$covariateData$covariates <- filterCovariateIds(param, trainData$covariateData)

if (!is.null(param$priorCoefs)) {
sourceCoefs <- param$priorCoefs %>%
dplyr::filter(abs(.data$betas)>0 & .data$covariateIds != "(Intercept)")

newCovariates <- covariates %>%
newCovariates <- trainData$covariateData$covariates %>%
dplyr::filter(.data$covariateId %in% !!sourceCoefs$covariateIds) %>%
dplyr::mutate(newCovariateId = .data$covariateId*-1) %>%
dplyr::select(-"covariateId") %>%
dplyr::rename(covariateId = .data$newCovariateId) %>%
dplyr::collect()

Andromeda::appendToTable(covariates, newCovariates)
Andromeda::appendToTable(trainData$covariateData$covariates, newCovariates)

}

start <- Sys.time()

cyclopsData <- Cyclops::convertToCyclopsData(
outcomes = trainData$covariateData$labels,
covariates = covariates,
covariates = trainData$covariateData$covariates,
addIntercept = settings$addIntercept,
modelType = modelTypeToCyclopsModelType(settings$modelType),
checkRowIds = FALSE,
@@ -97,7 +97,8 @@
ParallelLogger::logInfo("Determining initialRidgeVariance")
Cyclops::fitCyclopsModel(cyclopsData,
prior = normalPrior,
control = normalControl)},
control = normalControl,
warnings = FALSE)},
finally = ParallelLogger::logInfo("Done."))
param$priorParams$initialRidgeVariance <- fit$variance
}
@@ -132,9 +133,17 @@
)},
finally = ParallelLogger::logInfo('Done.')
)
} else if (settings$manualCV) {
result <- doCyclopsCVPenalty(data = trainData,
prior = prior,
fixedCoefficients = fixedCoefficients,
startingCoefficients = startingCoefficients,
nTries = settings$nTries)
fit <- result$modelFit
hyperParamSearch <- result$hyperParamSearch
} else{
fit <- tryCatch({
ParallelLogger::logInfo('Running Cyclops with fixed varience')
ParallelLogger::logInfo('Running Cyclops with fixed variance')
Cyclops::fitCyclopsModel(cyclopsData, prior = prior)},
finally = ParallelLogger::logInfo('Done.'))
}
@@ -173,7 +182,6 @@ fitCyclopsModel <- function(
prediction$evaluationType <- 'Train'

# get cv AUC if exists
cvPerFold <- data.frame()
if(!is.null(modelTrained$cv)){
cvPrediction <- do.call(rbind, lapply(modelTrained$cv, function(x){x$predCV}))
cvPrediction$evaluationType <- 'CV'
@@ -199,6 +207,7 @@

# remove the cv from the model:
modelTrained$cv <- NULL
hyperParamSearch <- cvPerFold
}

result <- list(
Expand Down Expand Up @@ -237,7 +246,7 @@ fitCyclopsModel <- function(
variance = modelTrained$priorVariance,
log_likelihood = modelTrained$log_likelihood
),
hyperParamSearch = cvPerFold
hyperParamSearch = hyperParamSearch
),

covariateImportance = variableImportance
@@ -542,4 +551,137 @@ reparamTransferCoefs <- function(inCoefs) {
coefs <- data.frame(betas = coefs, covariateIds = rownames(coefs), row.names = NULL)

return(coefs)
}

#' Do a simple manual cross-validation to determine the best penalty
#'
#' @details
#' Tries a sequence of penalties from the `BIC` penalty down to `penaltyRatio` * `BIC`. How many
#' penalties to try is determined by `nTries`. Uses cross-validation to determine the optimal
#' penalty based on `AUC`.
#'
#' @param data The training data
#' @param prior Cyclops prior to use
#' @param fixedCoefficients What coefficients (if any) should be fixed
#' @param startingCoefficients What coefficients (if any) should have some starting value
#' @param penaltyRatio Controls the lowest penalty to try, as a ratio of the `BIC` penalty.
#' Trying very low penalties will increase computation time considerably.
#' @param nTries How many penalties to try. Default: 10
#' @return A list with the fitted Cyclops model (`modelFit`) and a data frame of the
#' cross-validation results per penalty (`hyperParamSearch`)
doCyclopsCVPenalty <- function(data,
prior,
fixedCoefficients,
startingCoefficients,
penaltyRatio = 0.1,
nTries = 10) {

# first penalty to try is the strict BIC
startingPenalty <- log(nrow(data$labels)) / 2

penalties <- seq(from = startingPenalty,
to = penaltyRatio * startingPenalty,
length.out = nTries)

nFolds <- length(unique(data$folds$index))
hyperParamSearch <- data.frame(matrix(nrow=length(penalties), ncol=nFolds+2))
colnames(hyperParamSearch) <- c("Penalty", "avg_CV", unlist(lapply(seq_len(nFolds),
function(x) paste0("Fold_", x))))

ParallelLogger::logInfo("Performing hyperparameter tuning to determine best penalty")

start <- Sys.time()

data$covariateData$folds <- data$folds # folds needs to be in Andromeda for the joins to work
on.exit(data$covariateData$folds <- NULL)

data$covariateData$covariates <- data$covariateData$covariates %>%
dplyr::inner_join(data$covariateData$folds, by='rowId') %>% dplyr::collect()

data$covariateData$labels <- data$covariateData$labels %>%
dplyr::inner_join(data$covariateData$folds, by='rowId') %>% dplyr::collect()

for (i in seq_along(penalties)) {
hyperParamSearch[i, "Penalty"] <- penalties[i]
prior$penalty <- penalties[i]

itStart <- Sys.time()
for (fold in seq_len(nFolds)) {

trainData <- data$covariateData$covariates %>%
dplyr::filter(.data$index!=fold) %>%
dplyr::select(c("rowId", "covariateId", "covariateValue")) %>%
dplyr::collect()

trainOutcomes <- data$covariateData$labels %>%
dplyr::filter(.data$index!=fold) %>%
dplyr::select(-.data$index) %>%
dplyr::collect()

cyclopsData <- Cyclops::convertToCyclopsData(outcomes = trainOutcomes,
covariates = trainData,
modelType = 'lr',
addIntercept = TRUE,
checkRowIds = FALSE,
normalize = NULL,
quiet=TRUE)

fit <- Cyclops::fitCyclopsModel(
cyclopsData = cyclopsData,
prior = prior,
fixedCoefficients = fixedCoefficients,
startingCoefficients = startingCoefficients,
warnings = FALSE)


# predict on held out fold
testData <- data$covariateData$covariates %>%
dplyr::filter(.data$index==fold) %>%
dplyr::select(c("rowId", "covariateId", "covariateValue")) %>%
dplyr::mutate(covariateId=as.numeric(.data$covariateId)) %>%
dplyr::collect()

testOutcomes <- data$covariateData$labels %>%
dplyr::filter(.data$index==fold) %>%
dplyr::select(-.data$index) %>%
dplyr::collect()

preds <- stats::predict(fit, newCovariates = testData,
newOutcomes = testOutcomes)

# calculate performance
prediction <- data.frame(outcomeCount=testOutcomes$outcomeCount,
value=preds)
attr(prediction, "metaData")$modelType <- "binary"
auc <- computeAuc(prediction)
hyperParamSearch[i, paste0("Fold_", fold)] <- auc
}
hyperParamSearch[i, "avg_CV"] <- mean(as.numeric(hyperParamSearch[i, seq(3,3 + nFolds - 1)]))
itDelta <- Sys.time() - itStart
ParallelLogger::logInfo(paste0('Hyperparameter iteration no: ', i, ' Penalty: ', signif(prior$penalty, 3),
' AUC: ', signif(hyperParamSearch[i, "avg_CV"], 3),
' Iteration Time: ', signif(itDelta, 3), " ", attr(itDelta, "units")))
}

bestIndex <- which.max(hyperParamSearch$avg_CV)
bestPenalty <- hyperParamSearch[bestIndex, "Penalty"]
delta <- Sys.time() - start
ParallelLogger::logInfo(paste0("HyperParameter tuning took ", signif(delta, 3), " ", attr(delta, "units")))
ParallelLogger::logInfo(paste0("Best penalty value is: ", signif(bestPenalty, 4),
" With performance of: ", signif(hyperParamSearch[bestIndex, "avg_CV"])))
# refit at best value

prior$penalty <- bestPenalty
cyclopsData <- Cyclops::convertToCyclopsData(data$covariateData$labels,
data$covariateData$covariates,
checkRowIds = FALSE,
quiet=TRUE)
ParallelLogger::logInfo("Refitting model with best penalty on training set")
fit <- Cyclops::fitCyclopsModel(
cyclopsData = cyclopsData,
prior = prior,
fixedCoefficients = fixedCoefficients,
startingCoefficients = startingCoefficients
)

results <- list(modelFit = fit,
hyperParamSearch = hyperParamSearch)
return(results)
}
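To make the search concrete, here is a minimal sketch of the penalty grid this function explores, assuming a hypothetical training set of 5,000 labelled subjects and the defaults `penaltyRatio = 0.1` and `nTries = 10` (the subject count and printed values are illustrative only, not taken from this PR):

```r
# Minimal sketch of the penalty grid searched by doCyclopsCVPenalty.
# nSubjects is a hypothetical training-set size, not a value from the PR.
nSubjects <- 5000
startingPenalty <- log(nSubjects) / 2        # strict BIC penalty, log(n) / 2
penalties <- seq(from = startingPenalty,
                 to = 0.1 * startingPenalty, # penaltyRatio = 0.1 (default)
                 length.out = 10)            # nTries = 10 (default)
round(penalties, 3)
# 4.259 3.833 3.407 2.981 2.555 2.129 1.703 1.278 0.852 0.426
```

The grid is linear on the penalty scale; each of the `nTries` values is evaluated with k-fold cross-validation, and the penalty with the highest mean AUC is refit on the full training set.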
R/CyclopsSettings.R (20 changes: 15 additions & 5 deletions)
@@ -169,16 +169,18 @@ setCoxModel <- function(
#'
#' @param K The maximum number of non-zero predictors
#' @param penalty Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value.
#' If set to `auto` it will do CV to determine the best penalty
#' @param seed An option to add a seed when training the model
#' @param exclude A vector of numbers or covariateId names to exclude from prior
#' @param forceIntercept Logical: Force intercept coefficient into regularization
#' @param fitBestSubset Logical: Fit final subset with no regularization
#' @param initialRidgeVariance integer or character vector. If set to auto will fit Ridge regression using
#' cross validation to determine best initialRidgeVariance value.
#' @param initialRidgeVariance integer or character vector. If set to `auto` will fit Ridge regression using
#' cross validation to determine best `initialRidgeVariance` value.
#' @param tolerance numeric
#' @param maxIterations integer
#' @param threshold numeric
#' @param delta numeric
#' @param nTries If `penalty` is `auto`, how many penalties to include in the grid search
#'
#' @examples
#' model.lr <- setLassoLogisticRegression()
@@ -194,21 +196,23 @@ setIterativeHardThresholding<- function(
tolerance = 1e-08,
maxIterations = 10000,
threshold = 1e-06,
delta = 0
delta = 0,
nTries = 10
){

ensure_installed("IterativeHardThresholding")

if(K<1)
stop('Invalid maximum number of predictors')
if(!(penalty %in% c("aic", "bic") || is.numeric(penalty)))
stop('Penalty must be "aic", "bic" or numeric')
if(!(penalty %in% c("aic", "bic", "auto") || is.numeric(penalty)))
stop('Penalty must be "aic", "bic", "auto" or numeric')
if(!is.logical(forceIntercept))
stop("forceIntercept must be of type: logical")
if(!is.logical(fitBestSubset))
stop("fitBestSubset must be of type: logical")
if(!inherits(x = seed, what = c('numeric','NULL','integer')))
stop('Invalid seed')



# set seed
@@ -242,6 +246,12 @@
name = "Iterative Hard Thresholding"
)

if (penalty == "auto") {
attr(param, 'settings')$manualCV <- TRUE
attr(param, 'settings')$useControl <- FALSE
attr(param, 'settings')$nTries <- nTries
}

attr(param, 'modelType') <- 'binary'
attr(param, 'saveType') <- 'RtoJson'
