Skip to content

Commit

Permalink
feat: re-added model info
Browse files Browse the repository at this point in the history
  • Loading branch information
jlwalke2 committed Aug 5, 2024
1 parent d42cf8e commit 303b371
Show file tree
Hide file tree
Showing 2 changed files with 423 additions and 0 deletions.
250 changes: 250 additions & 0 deletions src/sasctl/utils/model_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
#!/usr/bin/env python
# encoding: utf-8
#
# Copyright © 2023, SAS Institute Inc., Cary, NC, USA. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Union

import pandas as pd


def get_model_info(model, X, y):
    """Build a metadata wrapper for a trained model and its sample data.

    Dispatch is based on the module in which the model's class is defined.
    Only classes defined under the ``sklearn.`` package are recognized.

    Parameters
    ----------
    model : object
        A trained model.
    X : array-like
        Sample of the data used to train the model.
    y : array-like
        Sample of the output produced by the model.

    Returns
    -------
    ModelInfo
        A `SklearnModelInfo` wrapping `model`, `X` and `y`.

    Raises
    ------
    ValueError
        If `model` is not a recognized type.

    """
    defining_module = model.__class__.__module__

    # Guard clause: anything that does not come from scikit-learn is rejected.
    if not defining_module.startswith("sklearn."):
        raise ValueError(f"Unrecognized model type {model} received.")

    return SklearnModelInfo(model, X, y)


class ModelInfo(ABC):
    """Base class for storing model metadata.

    Subclasses implement the abstract properties; `analytic_function` and
    `description` are derived from them and need not be overridden.

    Attributes
    ----------
    algorithm : str
    analytic_function : str or None
        "classification" if `is_classifier`, "prediction" if `is_regressor`,
        otherwise None.
    description : str
        The result of ``str(model)``.
    is_binary_classifier : bool
    is_classifier
    is_regressor
    is_clusterer
    model : object
        The model instance that the information was extracted from.
    model_params : {str: any}
        Dictionary of parameter names and values.
    output_column_names : list of str
        Variable names associated with the outputs of `model`.
    predict_function : callable
        The method on `model` that is called to produce predicted values.
    target_values : list of str
        Class labels returned by a classification model.  For binary classification models
        this is just the label of the targeted event level.
    threshold : float or None
        The cutoff value used in a binary classification model to determine which class an
        observation belongs to.  Returns None if not a binary classification model.

    """

    @property
    @abstractmethod
    def algorithm(self) -> str:
        """Name of the algorithm used by the model."""
        return

    @property
    def analytic_function(self) -> Union[str, None]:
        """Kind of analysis the model performs.

        Returns
        -------
        str or None
            "classification" for classifiers, "prediction" for regressors and
            None for anything else (e.g. a clustering model).

        """
        if self.is_classifier:
            return "classification"
        if self.is_regressor:
            return "prediction"
        # No analytic function is defined here for other model kinds.
        return None

    @property
    def description(self) -> str:
        """String representation of the wrapped model."""
        return str(self.model)

    @property
    @abstractmethod
    def is_binary_classifier(self) -> bool:
        """Whether the model is a classifier with exactly two classes."""
        return

    @property
    @abstractmethod
    def is_classifier(self) -> bool:
        """Whether the model is a classification model."""
        return

    @property
    @abstractmethod
    def is_clusterer(self) -> bool:
        """Whether the model is a clustering model."""
        return

    @property
    @abstractmethod
    def is_regressor(self) -> bool:
        """Whether the model is a regression model."""
        return

    @property
    @abstractmethod
    def model(self) -> object:
        """The model instance that the information was extracted from."""
        return

    @property
    @abstractmethod
    def model_params(self) -> Dict[str, Any]:
        """Dictionary of parameter names and values."""
        return

    @property
    @abstractmethod
    def output_column_names(self) -> List[str]:
        """Variable names associated with the outputs of the model."""
        return

    @property
    @abstractmethod
    def predict_function(self) -> Callable:
        """The method on the model that is called to produce predicted values."""
        return

    @property
    @abstractmethod
    def target_values(self):
        """Class labels returned by a classification model.

        For a binary classification model this is the "target event": the
        single value that indicates the targeted event has occurred.
        """
        return

    @property
    @abstractmethod
    def threshold(self) -> Union[float, None]:
        """Cutoff used by a binary classifier to assign a class, or None."""
        return


class SklearnModelInfo(ModelInfo):
    """Stores model information for a scikit-learn model instance.

    The kind of model is inferred from fitted attributes: a model exposing
    ``classes_`` is treated as a classifier, one exposing ``cluster_centers_``
    as a clusterer, and anything else is assumed to be a regressor provided
    the sample output has a single column.

    Parameters
    ----------
    model : object
        The trained model to describe.
    X : array-like
        Sample of the model input.  Converted to a DataFrame and stored.
    y : array-like
        Sample of the model output.  Converted to a DataFrame and stored.  If it
        carries no column/series name, default names are assigned.

    Raises
    ------
    ValueError
        If `model` is not recognized as a classifier, regressor or clusterer, or
        if `y` is unnamed, has several columns and `model` is not a classifier.

    """

    # Map class names from sklearn to algorithm names used by SAS
    _algorithm_mappings = {
        "LogisticRegression": "Logistic regression",
        "LinearRegression": "Linear regression",
        "SVC": "Support vector machine",
        "SVR": "Support vector machine",
        "GradientBoostingClassifier": "Gradient boosting",
        "GradientBoostingRegressor": "Gradient boosting",
        "RandomForestClassifier": "Forest",
        "RandomForestRegressor": "Forest",
        "DecisionTreeClassifier": "Decision tree",
        "DecisionTreeRegressor": "Decision tree",
    }

    def __init__(self, model, X, y):
        # Ensure input/output is a DataFrame for consistency
        X_df = pd.DataFrame(X)
        y_df = pd.DataFrame(y)

        is_classifier = hasattr(model, "classes_")
        is_binary_classifier = is_classifier and len(model.classes_) == 2
        is_clusterer = hasattr(model, "cluster_centers_")

        # If not a classifier or a clustering algorithm and output is a single column, then
        # assume it's a regression algorithm
        is_regressor = not is_classifier and not is_clusterer and y_df.shape[1] == 1

        if not is_classifier and not is_regressor and not is_clusterer:
            raise ValueError(f"Unexpected model type {model} received.")

        self._is_classifier = is_classifier
        self._is_binary_classifier = is_binary_classifier
        self._is_regressor = is_regressor
        self._is_clusterer = is_clusterer
        self._model = model

        # The sample output carries usable names if it has columns (DataFrame-like)
        # or a non-None name (named Series).  An unnamed pandas Series still has a
        # `name` attribute, but its value is None and the resulting DataFrame column
        # would be labelled 0, so it is treated the same as a bare array.
        has_names = hasattr(y, "columns") or getattr(y, "name", None) is not None

        if not has_names:
            # If example output doesn't contain column names then our DataFrame equivalent
            # also lacks good column names.  Assign reasonable names for use downstream.
            if y_df.shape[1] == 1:
                y_df.columns = ["I_Target"]
            elif is_classifier:
                # Output is probability of each label.  Name columns according to classes.
                y_df.columns = [f"P_{class_}" for class_ in model.classes_]
            else:
                # This *shouldn't* happen unless a cluster algorithm somehow produces wide output.
                raise ValueError("Unrecognized model output format.")

        # Store the data sets for reference later.
        self._X = X_df
        self._y = y_df

    @property
    def algorithm(self) -> str:
        """Algorithm name for the model.

        The class name of the model (or of its ``_final_estimator`` when that
        attribute exists, i.e. the last step in a Pipeline) is translated through
        `_algorithm_mappings`.  Unmapped class names are returned unchanged.
        """
        # Get the model or the last step in the Pipeline
        estimator = getattr(self.model, "_final_estimator", self.model)
        class_name = type(estimator).__name__

        # Convert the class name to an algorithm, or return the class name if no match.
        return self._algorithm_mappings.get(class_name, class_name)

    @property
    def is_binary_classifier(self) -> bool:
        """True if the model exposes ``classes_`` with exactly two entries."""
        return self._is_binary_classifier

    @property
    def is_classifier(self) -> bool:
        """True if the model exposes a ``classes_`` attribute."""
        return self._is_classifier

    @property
    def is_clusterer(self) -> bool:
        """True if the model exposes a ``cluster_centers_`` attribute."""
        return self._is_clusterer

    @property
    def is_regressor(self) -> bool:
        """True if neither classifier nor clusterer and the output is one column."""
        return self._is_regressor

    @property
    def model(self):
        """The model instance passed to the constructor."""
        return self._model

    @property
    def model_params(self) -> Dict[str, Any]:
        """Parameters reported by the model's ``get_params()`` method."""
        return self.model.get_params()

    @property
    def output_column_names(self) -> List[str]:
        """Column labels of the stored sample output."""
        return list(self._y.columns)

    @property
    def predict_function(self) -> Callable:
        """Method of the model that produces the stored output format.

        Returns ``predict_proba`` when the sample output has more than one
        column and the model provides that method, otherwise ``predict``.
        """
        # If desired output has multiple columns then we can assume it's the probability values
        if self._y.shape[1] > 1 and hasattr(self.model, "predict_proba"):
            return self.model.predict_proba

        # Otherwise it's the single value from .predict()
        return self.model.predict

    @property
    def target_values(self):
        """Class labels of a classifier, or None for other models.

        For a binary classifier only the last entry of ``classes_`` is returned
        (as a one-element list); for other classifiers all of ``classes_``.
        """
        if self.is_binary_classifier:
            return [self.model.classes_[-1]]
        if self.is_classifier:
            return list(self.model.classes_)
        return None

    @property
    def threshold(self) -> Union[float, None]:
        """Classification cutoff: 0.5 for binary classifiers, otherwise None."""
        # sklearn seems to always use 0.5 as a cutoff for .predict()
        if self.is_binary_classifier:
            return 0.5
        return None
Loading

0 comments on commit 303b371

Please sign in to comment.