-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
423 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,250 @@ | ||
#!/usr/bin/env python | ||
# encoding: utf-8 | ||
# | ||
# Copyright © 2023, SAS Institute Inc., Cary, NC, USA. All Rights Reserved. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import Any, Callable, Dict, List, Union | ||
|
||
import pandas as pd | ||
|
||
|
||
def get_model_info(model, X, y):
    """Extract metadata about a model and its associated data sets.

    Parameters
    ----------
    model : object
        A trained model.
    X : array-like
        Sample of the data used to train the model.
    y : array-like
        Sample of the output produced by the model.

    Returns
    -------
    ModelInfo

    Raises
    ------
    ValueError
        If `model` is not a recognized type.

    """
    # Dispatch on the package that defines the model's class.  `__module__` may
    # be missing or None for dynamically created classes; treat that as
    # "unrecognized" so callers get the documented ValueError rather than an
    # AttributeError from calling .startswith() on None.
    module_name = getattr(model.__class__, "__module__", None) or ""

    if module_name.startswith("sklearn."):
        return SklearnModelInfo(model, X, y)

    raise ValueError(f"Unrecognized model type {model} received.")
|
||
|
||
class ModelInfo(ABC):
    """Base class for storing model metadata.

    Subclasses supply the abstract properties below; `analytic_function` and
    `description` are derived from them.

    Attributes
    ----------
    algorithm : str
    analytic_function : str or None
        "classification" for classifiers, "prediction" for regressors,
        otherwise None.
    description : str
        String representation of `model`.
    is_binary_classifier : bool
    is_classifier : bool
    is_regressor : bool
    is_clusterer : bool
    model : object
        The model instance that the information was extracted from.
    model_params : {str: any}
        Dictionary of parameter names and values.
    output_column_names : list of str
        Variable names associated with the outputs of `model`.
    predict_function : callable
        The method on `model` that is called to produce predicted values.
    target_values : list of str
        Class labels returned by a classification model.  For binary classification models
        this is just the label of the targeted event level.
    threshold : float or None
        The cutoff value used in a binary classification model to determine which class an
        observation belongs to.  Returns None if not a binary classification model.

    """

    @property
    @abstractmethod
    def algorithm(self) -> str:
        """Name of the algorithm implemented by the model."""

    @property
    def analytic_function(self) -> Union[str, None]:
        """Analytic function of the model, derived from the type flags.

        Returns None when the model is neither a classifier nor a regressor.
        """
        if self.is_classifier:
            return "classification"
        if self.is_regressor:
            return "prediction"
        return None

    @property
    def description(self) -> str:
        """String representation of the wrapped model."""
        return str(self.model)

    @property
    @abstractmethod
    def is_binary_classifier(self) -> bool:
        """Whether the model is a binary classification model."""

    @property
    @abstractmethod
    def is_classifier(self) -> bool:
        """Whether the model is a classification model."""

    @property
    @abstractmethod
    def is_clusterer(self) -> bool:
        """Whether the model is a clustering model."""

    @property
    @abstractmethod
    def is_regressor(self) -> bool:
        """Whether the model is a regression model."""

    @property
    @abstractmethod
    def model(self) -> object:
        """The model instance that the information was extracted from."""

    @property
    @abstractmethod
    def model_params(self) -> Dict[str, Any]:
        """Dictionary of parameter names and values."""

    @property
    @abstractmethod
    def output_column_names(self) -> List[str]:
        """Variable names associated with the outputs of the model."""

    @property
    @abstractmethod
    def predict_function(self) -> Callable:
        """The method on the model that is called to produce predicted values."""

    @property
    @abstractmethod
    def target_values(self):
        """Class labels returned by a classification model.

        For binary classification this is the "target event": the value that
        indicates the target event has occurred.
        """

    @property
    @abstractmethod
    def threshold(self) -> Union[float, None]:
        """Cutoff used by a binary classifier to assign a class, or None."""
|
||
|
||
class SklearnModelInfo(ModelInfo):
    """Stores model information for a scikit-learn model instance.

    Parameters
    ----------
    model : object
        A scikit-learn estimator.  The presence of the `classes_` and
        `cluster_centers_` attributes is used to infer the model type.
    X : array-like
        Sample of the model input.  Stored internally as a DataFrame.
    y : array-like
        Sample of the model output.  Stored internally as a DataFrame; column
        names are generated when `y` does not carry any.

    Raises
    ------
    ValueError
        If the model type cannot be determined, or if unnamed multi-column
        output is received from a model that is not a classifier.

    """

    # Map class names from sklearn to algorithm names used by SAS
    _algorithm_mappings = {
        "LogisticRegression": "Logistic regression",
        "LinearRegression": "Linear regression",
        "SVC": "Support vector machine",
        "SVR": "Support vector machine",
        "GradientBoostingClassifier": "Gradient boosting",
        "GradientBoostingRegressor": "Gradient boosting",
        "RandomForestClassifier": "Forest",
        "RandomForestRegressor": "Forest",
        "DecisionTreeClassifier": "Decision tree",
        "DecisionTreeRegressor": "Decision tree",
    }

    def __init__(self, model, X, y):
        # Ensure input/output is a DataFrame for consistency
        X_df = pd.DataFrame(X)
        y_df = pd.DataFrame(y)

        is_classifier = hasattr(model, "classes_")
        is_binary_classifier = is_classifier and len(model.classes_) == 2
        is_clusterer = hasattr(model, "cluster_centers_")

        # If not a classifier or a clustering algorithm and output is a single column, then
        # assume it's a regression algorithm
        is_regressor = not is_classifier and not is_clusterer and y_df.shape[1] == 1

        if not is_classifier and not is_regressor and not is_clusterer:
            raise ValueError(f"Unexpected model type {model} received.")

        self._is_classifier = is_classifier
        self._is_binary_classifier = is_binary_classifier
        self._is_regressor = is_regressor
        self._is_clusterer = is_clusterer
        self._model = model

        # An object exposing `name` but with it set to None (e.g. an unnamed
        # pandas Series) carries no usable column name: the DataFrame built
        # from it gets a default integer label.  Treat it like unnamed output.
        y_has_names = getattr(y, "name", None) is not None or hasattr(y, "columns")

        if not y_has_names:
            # If example output doesn't contain column names then our DataFrame equivalent
            # also lacks good column names.  Assign reasonable names for use downstream.
            if y_df.shape[1] == 1:
                y_df.columns = ["I_Target"]
            elif self.is_classifier:
                # Output is probability of each label.  Name columns according to classes.
                y_df.columns = [f"P_{class_}" for class_ in model.classes_]
            else:
                # This *shouldn't* happen unless a cluster algorithm somehow produces wide output.
                raise ValueError("Unrecognized model output format.")

        # Store the data sets for reference later.
        self._X = X_df
        self._y = y_df

    @property
    def algorithm(self) -> str:
        """Algorithm name used by SAS, or the estimator class name if unmapped."""
        # Get the model or the last step in the Pipeline
        estimator = getattr(self.model, "_final_estimator", self.model)
        class_name = type(estimator).__name__

        # Convert the class name to an algorithm, or return the class name if no match.
        return self._algorithm_mappings.get(class_name, class_name)

    @property
    def is_binary_classifier(self) -> bool:
        """True if the model exposes exactly two entries in `classes_`."""
        return self._is_binary_classifier

    @property
    def is_classifier(self) -> bool:
        """True if the model has a `classes_` attribute."""
        return self._is_classifier

    @property
    def is_clusterer(self) -> bool:
        """True if the model has a `cluster_centers_` attribute."""
        return self._is_clusterer

    @property
    def is_regressor(self) -> bool:
        """True if neither classifier nor clusterer and output is one column."""
        return self._is_regressor

    @property
    def model(self) -> object:
        """The model instance passed to the constructor."""
        return self._model

    @property
    def model_params(self) -> Dict[str, Any]:
        """Parameters reported by the model's `get_params()` method."""
        return self.model.get_params()

    @property
    def output_column_names(self) -> List[str]:
        """Column labels of the stored output DataFrame."""
        return list(self._y.columns)

    @property
    def predict_function(self) -> Callable:
        """Bound method on the model that produces the stored output format."""
        # If desired output has multiple columns then we can assume it's the probability values
        if self._y.shape[1] > 1 and hasattr(self.model, "predict_proba"):
            return self.model.predict_proba

        # Otherwise it's the single value from .predict()
        return self.model.predict

    @property
    def target_values(self):
        """Class labels of a classifier; None for other model types.

        For binary classifiers only the last entry of `classes_` is returned.
        """
        if self.is_binary_classifier:
            return [self.model.classes_[-1]]
        if self.is_classifier:
            return list(self.model.classes_)
        return None

    @property
    def threshold(self) -> Union[float, None]:
        """Cutoff for binary classifiers (0.5); None for other model types."""
        # sklearn seems to always use 0.5 as a cutoff for .predict()
        if self.is_binary_classifier:
            return 0.5
        return None
Oops, something went wrong.