Skip to content

Commit

Permalink
Merge pull request #123 from pegasystems/polished_offline_reports
Browse files Browse the repository at this point in the history
Made the off line reports shine even more
  • Loading branch information
operdeck authored Sep 22, 2023
2 parents 8181c45 + e3f4d77 commit 9fb3376
Show file tree
Hide file tree
Showing 28 changed files with 659 additions and 243 deletions.
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,8 @@ examples/**/*.html
r/*.csv
r/tests/testthat/*.html
r/tests/testthat/*.pdf
/*.ipynb

# Python
**/*.ipynb
**/*.gz
*.pyc
build/
*.egg-info/
Expand Down
201 changes: 117 additions & 84 deletions examples/datamart/modelreport.Rmd

Large diffs are not rendered by default.

141 changes: 132 additions & 9 deletions examples/datamart/offlinereports.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,140 @@
# Example preprocessing to create off-line model reports for
# CDH Sample, for CreditCards offers in Outbound channels
# Example R script to create off-line model reports. There is a similar
# but much less complex bash script to do the same. Any language can really
# be used here - the fact that the off-line model reports are in an R notebook
# does not mean the preprocessing and batch processing needs to be in R as well.

# You can run this script from R, from R Studio, VS Code or really any editor
# of your choice. You will need to change some of the paths defined here.

library(pdstools)
library(data.table)
library(rmarkdown)
library(arrow)

# Pandoc is needed by RMarkdown and part of RStudio. If you run this
# script outside of RStudio you'll need to make sure pandoc is installed
# and known to R / markdown. For now this is the best I could think of. To
# make it slightly more generic, dir can be a character vector of paths:
if (!rmarkdown::pandoc_available()) {
rmarkdown::find_pandoc(dir = c("/opt/anaconda3/bin"))
cat("Pandoc:", rmarkdown::pandoc_available(), fill = T)
}

customer <- "SampleCustomer" # just for titles, change to your customer name
datamart_datasets_folder <- "~/Downloads" # will pick the latest from there

# Path to the checked out versions of the notebooks. You'll need them locally
# so make sure to to a "clone" of the PDS Tools GitHub repository at
# https://github.com/pegasystems/pega-datascientist-tools. Update the path
# below to reflect the folder where you cloned the repo.

pdstools_repo_folder <- "~/Documents/pega/pega-datascientist-tools"

healthcheck_notebook_R <- file.path(pdstools_repo_folder, "examples/datamart/healthcheck.Rmd")
offlinemodelreport_notebook_R <- file.path(pdstools_repo_folder, "examples/datamart/modelreport.Rmd")

# Read the latest ADM Model export files from the Downloads folder
dm <- ADMDatamart("~/Downloads",
working_folder <- tempdir(TRUE)
output_folder <- file.path(getwd(), "reports")
if (!dir.exists(output_folder)) dir.create(output_folder)

# Read ADM Datamart from the folder specified above. You can also give
# explicit paths to both dataset. See help on ADMDatamart, in R Studio
# with ?pdstools::ADMDatamart or online at
# https://pegasystems.github.io/pega-datascientist-tools/R/reference/ADMDatamart.html

# Example of a function you can implement to highlight certain
# types of predictors based on their names. By default the system will
# highlight IH.* and Param.* predictors - simply splitting on the first dot,
# but you can customize this as shown below:

myPredictorCategorization <- function(name)
{
if (startsWith(name, "Param.ExtGroup")) return("External Model")
if (endsWith(name, "Score")) return("External Model")
if (endsWith(name, "RiskCode")) return("External Model")

return(defaultPredictorCategorization(name))
}

dm <- ADMDatamart(datamart_datasets_folder,
# optional predictor categorization, see above
predictorCategorization = myPredictorCategorization,

# filtering the data to be used
filterModelData = function(mdls) {
return(mdls[ConfigurationName == "OmniAdaptiveModel" & Group == "CreditCards" & Direction == "Outbound"])
return(mdls[ConfigurationName %in% c("OmniAdaptiveModel") &
Group == "CreditCards" &
Direction == "Outbound"])
})

# Write back a CSV with only the model data of interest
write.csv(dm$modeldata, "~/Downloads/models.csv", row.names = F)
# Write back temp files with the filtered data - not strictly necessary, you
# can also refer to the full files in the call to the notebooks.

tempModelFile <- tempfile(fileext = "_mdls.arrow", tmpdir = working_folder)
arrow::write_ipc_file(dm$modeldata, sink = tempModelFile)
tempPredictorFile <- tempfile(fileext = "_preds.arrow", tmpdir = working_folder)
arrow::write_ipc_file(dm$predictordata, sink = tempPredictorFile)

# Create Health Check (legacy R version - now superseded by the new Python version)

rmarkdown::render(healthcheck_notebook_R,
params = list(
modelfile = tempModelFile,
predictordatafile = tempPredictorFile,
title = paste("ADM Health Check", customer, sep = " - "),
subtitle = "legacy R version"
),
output_dir = working_folder,
output_file = paste("ADM Health Check ", customer, ".html", sep = ""),
quiet = FALSE, intermediates_dir = working_folder
)

# Individual Model reports

# In real life situations you probably want to select a subset of the
# models, not run a model report for every possible ADM instance, which
# would typically be in the 100's or 1000's.

# Below we select 5 of the models from every channel with the largest
# response counts. This is just a simple example that can easily be
# extended.

recentModels <- filterLatestSnapshotOnly(dm$modeldata)[Positives > 10]
recentModels[, PosRank := frank(-Positives, ties.method="random"), by=c("Direction", "Channel", "ConfigurationName")]
ids <- recentModels[PosRank <= 5, "ModelID"]

# Associate a name with the model IDs
modelNames <- sapply(ids, function(id) {
make.names(paste(
sapply(unique(dm$modeldata[
ModelID == id,
c("ConfigurationName", "Channel", "Direction", "Issue", "Group", "Name", "Treatment")
]), as.character),
collapse = "_"
))
})

# Create a report for every of these models
for (n in seq_along(ids)) {
id <- ids[order(modelNames)][n]
modelName <- modelNames[id]

cat("Model:", modelName, n, "of", length(ids), fill = T)

localModelReportHTMLFile <- paste0(customer, "_", modelName, ".html")

rmarkdown::render(offlinemodelreport_notebook_R,
params = list(
predictordatafile = tempPredictorFile,
modeldescription = modelName,
modelid = id
),
output_dir = output_folder,
output_file = localModelReportHTMLFile,
quiet = F, intermediates_dir = working_folder
)
}

cat("Done. Output is in", output_folder, fill=T)


# Write back a CSV with only the predictor data for the models of interest
write.csv(dm$predictordata, "~/Downloads/predictors.csv", row.names = F)
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export(getScoringModelFromJSONFactoryString)
export(hasMultipleSnapshots)
export(lift)
export(plotBinning)
export(plotBinningLift)
export(plotCumulativeGains)
export(plotCumulativeLift)
export(plotPerformanceOverTime)
Expand Down
6 changes: 6 additions & 0 deletions r/R/adm.R
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,12 @@ admVarImp <- function(datamart, facets = NULL, filter = function(x) {filterClass

getActiveRanges <- function(dm)
{
is_AUC_activerange <- is_AUC_fullrange <- nClassifierBins <- is_full_indexrange <- active_index_max <- NULL # Trick to silence warnings from R CMD Check
active_index_min <- activeRangeAUC <- fullRangeAUC <- Performance <- reportedAUC <- NULL
ModelID <- sumMaxLogOdds <- score_max <- NULL
EntryType <- BinPositives <- BinNegatives <- totalPos <- totalNeg <- NULL
logOddsMin <- logOddsMax <- score_min <- classifierLogOffset <- sumMinLogOdds <- nActivePredictors <- NULL

# log odds contribution of the bins, including Laplace smoothing
binLogOdds <- function(binpos, binneg) {
nbins = length(binpos)
Expand Down
Loading

0 comments on commit 9fb3376

Please sign in to comment.