Added some tests for model cards + fixed some model card errors
djm21 committed Mar 12, 2024
1 parent c20c162 commit 3913c30
Showing 2 changed files with 206 additions and 28 deletions.
55 changes: 27 additions & 28 deletions src/sasctl/pzmm/write_json_files.py
@@ -2374,7 +2374,7 @@ def generate_outcome_average(
):
"""
Generates the outcome average of the training data. For Interval targets, the event average
-        is generated. For Classification targets, the event average is returned.
+        is generated. For Classification targets, the event percentage is returned.
Parameters
----------
@@ -2395,17 +2395,23 @@
dict
Returns a dictionary with a key value pair that represents the outcome average.
"""
+        import numbers
output_var = train_data.drop(input_variables, axis=1)
if target_type == "classification":
value_counts = output_var[output_var.columns[0]].value_counts()
return {'eventPercentage': value_counts[target_value]/sum(value_counts)}
-        elif target_type == "interval":
-            return {'eventAverage': sum(value_counts[value_counts.columns[0]]) / len(value_counts)}
+        if not isinstance(output_var[output_var.columns[0]].iloc[0], numbers.Number):
+            raise ValueError("Detected output column is not numeric. Please ensure that " +
+                             "the correct output column is being passed, and that no extra columns " +
+                             "are in front of the output column. This function assumes that the first " +
+                             "non-input column is the output column.")
+        return {'eventAverage': sum(output_var[output_var.columns[0]]) / len(output_var)}
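For reference, a minimal sketch of the corrected behavior, mirroring the new tests added below (assumptions: `generate_outcome_average` remains a static method on `JSONFiles`, imported here under the alias `jf` as the test module appears to use it):

```python
import pandas as pd
from sasctl.pzmm.write_json_files import JSONFiles as jf

# Interval target: the mean of the first non-input column
df = pd.DataFrame({"input": [3, 2, 1], "output": [1, 2, 3]})
jf.generate_outcome_average(df, ["input"], "interval")           # {'eventAverage': 2.0}

# Classification target: the fraction of rows equal to target_value
df = pd.DataFrame({"input": [3, 2], "output": [0, 1]})
jf.generate_outcome_average(df, ["input"], "classification", 1)  # {'eventPercentage': 0.5}
```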

@staticmethod
def get_selection_statistic_value(
-        model_files,
-        selection_statistic
+        model_files: Union[str, Path, dict],
+        selection_statistic: str = "_GINI_"
):
"""
Finds the value of the chosen selection statistic in dmcas_fitstat.json, which should have been
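The docstring is cut off in this view, but the tests added below pin down the contract. A sketch of the accepted call patterns (the dict layout is copied from those tests; `jf` is the same hypothetical alias as above):

```python
from sasctl.pzmm.write_json_files import JSONFiles as jf

# In-memory form: a dict of model files keyed by file name
model_file_dict = {
    "dmcas_fitstat.json": {
        "data": [{"dataMap": {"_GINI_": 1, "_C_": 2, "_TAU_": None, "_DataRole_": "TRAIN"}}]
    }
}
jf.get_selection_statistic_value(model_file_dict)          # 1, via the "_GINI_" default
jf.get_selection_statistic_value(model_file_dict, "_C_")   # 2
# A statistic that is absent or None, such as "_TAU_", raises RuntimeError;
# a str or Path pointing at the directory holding dmcas_fitstat.json works the same way.
```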
@@ -2493,10 +2499,11 @@ def update_model_properties(
)
with open(Path(model_files) / PROP, 'r+') as properties_json:
model_properties = json.load(properties_json)
-            if not isinstance(update_dict[key], str):
-                model_files[PROP][key] = str(round(update_dict[key], 14))
-            else:
-                model_files[PROP][key] = update_dict[key]
+            for key in update_dict:
+                if not isinstance(update_dict[key], str):
+                    model_properties[key] = str(round(update_dict[key], 14))
+                else:
+                    model_properties[key] = update_dict[key]
properties_json.seek(0)
properties_json.write(json.dumps(model_properties, indent=4, cls=NpEncoder))
properties_json.truncate()
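The corrected loop now walks every key in `update_dict` and writes into the loaded JSON rather than into the path argument. A short sketch of the resulting behavior, taken from the new tests below (dict input shown; a directory path is handled the same way):

```python
from sasctl.pzmm.write_json_files import JSONFiles as jf

model_files = {"ModelProperties.json": {"example": "property"}}
jf.update_model_properties(model_files, {"newer": "thing", "number": 0.123456789012345})
# Strings pass through unchanged; numbers are rounded to 14 places and stringified:
# {'example': 'property', 'newer': 'thing', 'number': '0.12345678901234'}
```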
@@ -2537,14 +2544,6 @@ def generate_variable_importance(
caslib: str, optional
The caslib the training data will be stored on. The default value is "Public"
"""
-        try:
-            sess = current_session()
-            conn = sess.as_swat()
-        except ImportError:
-            raise RuntimeError(
-                "The `swat` package is required to generate fit statistics, ROC, and "
-                "Lift charts with the calculate_model_statistics function."
-            )
# Remove target variable from training data by selecting only input variable columns
x_train_data = train_data[interval_vars + class_vars]
# Upload scored training data to run variable importance on
@@ -2573,12 +2572,12 @@
"name": 'BIN',
"inputs": [{"name": var} for var in interval_vars],
"targets": [{"name": "Prediction"}],
"discretize":{
"method":method,
"arguments":{
"minNBins":1,
"maxNBins":8,
"treeCrit":treeCrit,
"discretize": {
"method": method,
"arguments": {
"minNBins": 1,
"maxNBins": 8,
"treeCrit": treeCrit,
"contingencyTblOpts":{"inputsMethod": 'BUCKET', "inputsNLevels": 100},
"overrides": {"minNObsInBin": 5, "binMissing": True, "noDataLowerUpperBound": True}
}
@@ -2589,12 +2588,12 @@
"name": 'BIN_NOM',
"inputs": [{"name": var} for var in class_vars],
"targets": [{"name": "Prediction"}],
"catTrans":{
"method":method,
"arguments":{
"minNBins":1,
"maxNBins":8,
"treeCrit":treeCrit,
"catTrans": {
"method": method,
"arguments": {
"minNBins": 1,
"maxNBins": 8,
"treeCrit": treeCrit,
"overrides": {"minNObsInBin": 5, "binMissing": True}
}
}
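The `BIN` and `BIN_NOM` templates above have the shape of CAS `dataPreprocess` request packages. As a rough, hypothetical sketch of how such a template could be submitted over a swat connection (only `current_session().as_swat()` appears in this diff, in the block removed above; the action call, table name, and variable names here are illustrative assumptions, not lifted from the source):

```python
from sasctl import current_session

conn = current_session().as_swat()     # CAS connection via swat, as in the removed block
conn.loadactionset("dataPreprocess")   # assumed: the binning actions live in this action set

request = {                            # mirrors the 'BIN' template above
    "name": "BIN",
    "inputs": [{"name": "var1"}],      # hypothetical interval input
    "targets": [{"name": "Prediction"}],
    "discretize": {"method": "DTREE", "arguments": {"minNBins": 1, "maxNBins": 8}},
}
# Hypothetical submission; the real function builds and submits these packages internally
conn.dataPreprocess.transform(
    table={"name": "scored_train_data", "caslib": "Public"},
    requestPackages=[request],
    casOut={"name": "binned", "caslib": "Public", "replace": True},
)
```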
179 changes: 179 additions & 0 deletions tests/unit/test_write_json_files.py
@@ -16,11 +16,14 @@
import warnings
from pathlib import Path
from unittest.mock import patch
import math

import numpy as np
import pandas as pd
import pytest
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import sasctl.pzmm as pzmm
@@ -43,6 +46,37 @@
{"name": "REASON_HomeImp", "type": "integer"},
]

class BadModel:
attr = None

@pytest.fixture
def bad_model():
return BadModel()


@pytest.fixture
def train_data():
"""Returns the Iris data set as (X, y)"""
raw = datasets.load_iris()
iris = pd.DataFrame(raw.data, columns=raw.feature_names)
iris = iris.join(pd.DataFrame(raw.target))
iris.columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species"]
iris["Species"] = iris["Species"].astype("category")
iris.Species.cat.categories = raw.target_names
return iris.iloc[:, 0:4], iris["Species"]


@pytest.fixture
def sklearn_model(train_data):
"""Returns a simple Scikit-Learn model"""
X, y = train_data
with warnings.catch_warnings():
warnings.simplefilter("ignore")
model = LogisticRegression(
multi_class="multinomial", solver="lbfgs", max_iter=1000
)
model.fit(X, y)
return model

@pytest.fixture
def change_dir():
@@ -849,3 +883,148 @@ def test_errors(self):
jf.assess_model_bias(
score_table, sensitive_values, actual_values
)


class TestModelCardGeneration(unittest.TestCase):
def test_generate_outcome_average_interval(self):
df = pd.DataFrame({"input": [3, 2, 1], "output": [1, 2, 3]})
assert (
jf.generate_outcome_average(df, ["input"], "interval") ==
{'eventAverage': 2.0}
)

def test_generate_outcome_average_classification(self):
df = pd.DataFrame({"input": [3, 2], "output": [0, 1]})
event_percentage = jf.generate_outcome_average(df, ["input"], "classification", 1)
assert('eventPercentage' in event_percentage)

def test_generate_outcome_average_interval_non_numeric_output(self):
df = pd.DataFrame({"input": [3, 2, 1], "output": ["one", "two", "three"]})
with pytest.raises(ValueError):
jf.generate_outcome_average(df, ["input"], "interval")


class TestGetSelectionStatisticValue(unittest.TestCase):
model_file_dict = {
"dmcas_fitstat.json": {
"data": [
{
"dataMap": {
"_GINI_": 1,
"_C_": 2,
"_TAU_": None,
"_DataRole_": "TRAIN"
}
}
]
}
}
tmp_dir = tempfile.TemporaryDirectory()
with open(Path(tmp_dir.name) / "dmcas_fitstat.json", "w+") as f:
f.write(json.dumps(model_file_dict['dmcas_fitstat.json']))

def test_get_statistic_dict_default(self):
selection_statistic = jf.get_selection_statistic_value(self.model_file_dict)
assert(selection_statistic == 1)

def test_get_statistic_dict_custom(self):
selection_statistic = jf.get_selection_statistic_value(self.model_file_dict, "_C_")
assert(selection_statistic == 2)

def test_get_blank_statistic_dict(self):
with pytest.raises(RuntimeError):
jf.get_selection_statistic_value(self.model_file_dict, "_TAU_")

def test_get_statistics_path_default(self):
selection_statistic = jf.get_selection_statistic_value(Path(self.tmp_dir.name))
assert(selection_statistic == 1)

def test_get_statistics_path_custom(self):
selection_statistic = jf.get_selection_statistic_value(Path(self.tmp_dir.name), "_C_")
assert(selection_statistic == 2)

def test_get_blank_statistic_path(self):
with pytest.raises(RuntimeError):
jf.get_selection_statistic_value(Path(self.tmp_dir.name), "_TAU_")

def test_get_statistics_str_default(self):
selection_statistic = jf.get_selection_statistic_value(self.tmp_dir.name)
assert (selection_statistic == 1)

def test_get_statistics_str_custom(self):
selection_statistic = jf.get_selection_statistic_value(self.tmp_dir.name, "_C_")
assert (selection_statistic == 2)

def test_get_blank_statistic_str(self):
with pytest.raises(RuntimeError):
jf.get_selection_statistic_value(self.tmp_dir.name, "_TAU_")


class TestUpdateModelProperties(unittest.TestCase):
def setUp(self):
self.model_file_dict = {
"ModelProperties.json":
{
"example": "property"
}
}
self.tmp_dir = tempfile.TemporaryDirectory()
with open(Path(self.tmp_dir.name) / "ModelProperties.json", "w+") as f:
f.write(json.dumps(self.model_file_dict['ModelProperties.json']))

def tearDown(self):
self.tmp_dir.cleanup()

def test_update_model_properties_dict(self):
update_dict = {'new': 'arg', 'newer': 'thing'}
jf.update_model_properties(self.model_file_dict, update_dict)
assert(self.model_file_dict['ModelProperties.json']['example'] == 'property')
assert(self.model_file_dict['ModelProperties.json']['new'] == 'arg')
assert(self.model_file_dict['ModelProperties.json']['newer'] == 'thing')

def test_update_model_properties_dict_overwrite(self):
update_dict = {'new': 'arg', 'example': 'thing'}
jf.update_model_properties(self.model_file_dict, update_dict)
assert (self.model_file_dict['ModelProperties.json']['example'] == 'thing')
assert (self.model_file_dict['ModelProperties.json']['new'] == 'arg')

def test_update_model_properties_dict_number(self):
update_dict = {"number": 1}
jf.update_model_properties(self.model_file_dict, update_dict)
assert (self.model_file_dict['ModelProperties.json']['number'] == '1')

def test_update_model_properties_dict_round_number(self):
update_dict = {'number': 0.123456789012345}
jf.update_model_properties(self.model_file_dict, update_dict)
assert (self.model_file_dict['ModelProperties.json']['number'] == '0.12345678901234')

def test_update_model_properties_str(self):
update_dict = {'new': 'arg', 'newer': 'thing'}
jf.update_model_properties(self.tmp_dir.name, update_dict)
with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f:
model_properties = json.load(f)
assert(model_properties['example'] == 'property')
assert(model_properties['new'] == 'arg')
assert(model_properties['newer'] == 'thing')

def test_update_model_properties_str_overwrite(self):
update_dict = {'new': 'arg', 'example': 'thing'}
jf.update_model_properties(self.tmp_dir.name, update_dict)
with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f:
model_properties = json.load(f)
assert (model_properties['example'] == 'thing')
assert (model_properties['new'] == 'arg')

def test_update_model_properties_str_number(self):
update_dict = {"number": 1}
jf.update_model_properties(self.tmp_dir.name, update_dict)
with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f:
model_properties = json.load(f)
assert (model_properties['number'] == '1')

def test_update_model_properties_str_round_number(self):
update_dict = {'number': 0.123456789012345}
jf.update_model_properties(self.tmp_dir.name, update_dict)
with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f:
model_properties = json.load(f)
assert (model_properties['number'] == '0.12345678901234')
