Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Made the off-line reports shine even more #123

Merged
merged 1 commit into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,8 @@ examples/**/*.html
r/*.csv
r/tests/testthat/*.html
r/tests/testthat/*.pdf
/*.ipynb

# Python
**/*.ipynb
**/*.gz
*.pyc
build/
*.egg-info/
Expand Down
201 changes: 117 additions & 84 deletions examples/datamart/modelreport.Rmd

Large diffs are not rendered by default.

141 changes: 132 additions & 9 deletions examples/datamart/offlinereports.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,140 @@
# Example preprocessing to create off-line model reports for
# CDH Sample, for CreditCards offers in Outbound channels
# Example R script to create off-line model reports. There is a similar
# but much less complex bash script to do the same. Any language can really
# be used here - the fact that the off-line model reports are in an R notebook
# does not mean the preprocessing and batch processing needs to be in R as well.

# You can run this script from R, from R Studio, VS Code or really any editor
# of your choice. You will need to change some of the paths defined here.

library(pdstools)
library(data.table)
library(rmarkdown)
library(arrow)

# Pandoc is needed by RMarkdown and part of RStudio. If you run this
# script outside of RStudio you'll need to make sure pandoc is installed
# and known to R / markdown. For now this is the best I could think of. To
# make it slightly more generic, dir can be a character vector of paths:
# Ensure pandoc is available for rmarkdown::render. When running outside
# RStudio, point find_pandoc at the extra search path(s) below, then
# report whether pandoc could be located.
if (!rmarkdown::pandoc_available()) {
  rmarkdown::find_pandoc(dir = c("/opt/anaconda3/bin"))
  # Use TRUE, never the reassignable alias T
  cat("Pandoc:", rmarkdown::pandoc_available(), fill = TRUE)
}

customer <- "SampleCustomer" # just for titles, change to your customer name
datamart_datasets_folder <- "~/Downloads" # will pick the latest from there

# Path to the checked out versions of the notebooks. You'll need them locally
# so make sure to do a "clone" of the PDS Tools GitHub repository at
# https://github.com/pegasystems/pega-datascientist-tools. Update the path
# below to reflect the folder where you cloned the repo.

pdstools_repo_folder <- "~/Documents/pega/pega-datascientist-tools"

healthcheck_notebook_R <- file.path(pdstools_repo_folder, "examples/datamart/healthcheck.Rmd")
offlinemodelreport_notebook_R <- file.path(pdstools_repo_folder, "examples/datamart/modelreport.Rmd")

# Read the latest ADM Model export files from the Downloads folder
dm <- ADMDatamart("~/Downloads",
working_folder <- tempdir(TRUE)
output_folder <- file.path(getwd(), "reports")
if (!dir.exists(output_folder)) dir.create(output_folder)

# Read ADM Datamart from the folder specified above. You can also give
# explicit paths to both datasets. See help on ADMDatamart, in R Studio
# with ?pdstools::ADMDatamart or online at
# https://pegasystems.github.io/pega-datascientist-tools/R/reference/ADMDatamart.html

# Example of a function you can implement to highlight certain
# types of predictors based on their names. By default the system will
# highlight IH.* and Param.* predictors - simply splitting on the first dot,
# but you can customize this as shown below:

# Categorize a predictor by its name. Predictors whose names match the
# external-model patterns below are labeled "External Model"; all other
# names fall through to the package's default categorization (which
# splits on the first dot).
myPredictorCategorization <- function(name) {
  isExternalModel <- startsWith(name, "Param.ExtGroup") ||
    endsWith(name, "Score") ||
    endsWith(name, "RiskCode")
  if (isExternalModel) {
    return("External Model")
  }
  defaultPredictorCategorization(name)
}

# Load the ADM Datamart (model + predictor snapshots) from the folder
# configured above, with a custom predictor categorization and a filter
# restricting the model data to one configuration/group/direction.
dm <- ADMDatamart(datamart_datasets_folder,
# optional predictor categorization, see above
predictorCategorization = myPredictorCategorization,

# filtering the data to be used
filterModelData = function(mdls) {
# NOTE(review): there are two consecutive return() calls here; only the
# first ever executes. This looks like a diff/merge artifact - keep
# whichever filter expression is intended and delete the other.
return(mdls[ConfigurationName == "OmniAdaptiveModel" & Group == "CreditCards" & Direction == "Outbound"])
return(mdls[ConfigurationName %in% c("OmniAdaptiveModel") &
Group == "CreditCards" &
Direction == "Outbound"])
})

# NOTE(review): the CSV write below appears to be a leftover from an
# older version of this script, superseded by the arrow temp files that
# follow - confirm and remove if no longer needed.
# Write back a CSV with only the model data of interest
write.csv(dm$modeldata, "~/Downloads/models.csv", row.names = F)
# Write back temp files with the filtered data - not strictly necessary, you
# can also refer to the full files in the call to the notebooks.

# Arrow IPC snapshots of the filtered model and predictor data; these
# paths are passed as params to the notebooks rendered below.
tempModelFile <- tempfile(fileext = "_mdls.arrow", tmpdir = working_folder)
arrow::write_ipc_file(dm$modeldata, sink = tempModelFile)
tempPredictorFile <- tempfile(fileext = "_preds.arrow", tmpdir = working_folder)
arrow::write_ipc_file(dm$predictordata, sink = tempPredictorFile)

# Create Health Check (legacy R version - now superseded by the new Python version)

# Render the (legacy R) Health Check notebook against the filtered
# datamart snapshots written above. The customer name is woven into the
# title and the output file name.
# NOTE(review): output_dir here is the temporary working_folder, while
# the per-model reports below go to output_folder - confirm the health
# check HTML is intentionally left in the temp folder.
rmarkdown::render(healthcheck_notebook_R,
params = list(
modelfile = tempModelFile,
predictordatafile = tempPredictorFile,
title = paste("ADM Health Check", customer, sep = " - "),
subtitle = "legacy R version"
),
output_dir = working_folder,
output_file = paste("ADM Health Check ", customer, ".html", sep = ""),
quiet = FALSE, intermediates_dir = working_folder
)

# Individual Model reports

# In real life situations you probably want to select a subset of the
# models, not run a model report for every possible ADM instance, which
# would typically be in the 100's or 1000's.

# Below we select 5 of the models from every channel with the largest
# response counts. This is just a simple example that can easily be
# extended.

# Keep only the latest snapshot per model, and only models with more
# than 10 positive responses.
recentModels <- filterLatestSnapshotOnly(dm$modeldata)[Positives > 10]
# Rank models by positive count within each direction/channel/model
# configuration; ties are broken randomly, so reruns may pick a
# different subset when counts are equal.
recentModels[, PosRank := frank(-Positives, ties.method="random"), by=c("Direction", "Channel", "ConfigurationName")]
# Top 5 per group.
# NOTE(review): in data.table syntax this returns a one-column
# data.table, not a character vector of IDs - confirm that the
# sapply/seq_along uses of `ids` below iterate over model IDs as
# intended; `recentModels[PosRank <= 5][["ModelID"]]` may be what is
# meant.
ids <- recentModels[PosRank <= 5, "ModelID"]

# Associate a name with the model IDs
# Build a file-system-safe display name for each selected model by
# concatenating its context key fields (configuration, channel, issue,
# group, action name, treatment) with underscores.
modelNames <- sapply(ids, function(id) {
make.names(paste(
sapply(unique(dm$modeldata[
ModelID == id,
c("ConfigurationName", "Channel", "Direction", "Issue", "Group", "Name", "Treatment")
]), as.character),
collapse = "_"
))
})

# Create a report for every of these models
for (n in seq_along(ids)) {
id <- ids[order(modelNames)][n]
modelName <- modelNames[id]

cat("Model:", modelName, n, "of", length(ids), fill = T)

localModelReportHTMLFile <- paste0(customer, "_", modelName, ".html")

rmarkdown::render(offlinemodelreport_notebook_R,
params = list(
predictordatafile = tempPredictorFile,
modeldescription = modelName,
modelid = id
),
output_dir = output_folder,
output_file = localModelReportHTMLFile,
quiet = F, intermediates_dir = working_folder
)
}

cat("Done. Output is in", output_folder, fill=T)


# Write back a CSV with only the predictor data for the models of interest
write.csv(dm$predictordata, "~/Downloads/predictors.csv", row.names = F)
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export(getScoringModelFromJSONFactoryString)
export(hasMultipleSnapshots)
export(lift)
export(plotBinning)
export(plotBinningLift)
export(plotCumulativeGains)
export(plotCumulativeLift)
export(plotPerformanceOverTime)
Expand Down
6 changes: 6 additions & 0 deletions r/R/adm.R
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,12 @@ admVarImp <- function(datamart, facets = NULL, filter = function(x) {filterClass

getActiveRanges <- function(dm)
{
is_AUC_activerange <- is_AUC_fullrange <- nClassifierBins <- is_full_indexrange <- active_index_max <- NULL # Trick to silence warnings from R CMD Check
active_index_min <- activeRangeAUC <- fullRangeAUC <- Performance <- reportedAUC <- NULL
ModelID <- sumMaxLogOdds <- score_max <- NULL
EntryType <- BinPositives <- BinNegatives <- totalPos <- totalNeg <- NULL
logOddsMin <- logOddsMax <- score_min <- classifierLogOffset <- sumMinLogOdds <- nActivePredictors <- NULL

# log odds contribution of the bins, including Laplace smoothing
binLogOdds <- function(binpos, binneg) {
nbins = length(binpos)
Expand Down
Loading