From 3913c309d9a50c77563f2fbf234158d76d21092d Mon Sep 17 00:00:00 2001 From: djm21 Date: Tue, 12 Mar 2024 14:59:16 -0500 Subject: [PATCH] Added some tests for model cards + fixed some model card errors --- src/sasctl/pzmm/write_json_files.py | 55 +++++---- tests/unit/test_write_json_files.py | 179 ++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+), 28 deletions(-) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index 4345b6eb..70a3ee9b 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -2374,7 +2374,7 @@ def generate_outcome_average( ): """ Generates the outcome average of the training data. For Interval targets, the event average - is generated. For Classification targets, the event average is returned. + is generated. For Classification targets, the event percentage is returned. Parameters ---------- @@ -2395,17 +2395,23 @@ def generate_outcome_average( dict Returns a dictionary with a key value pair that represents the outcome average. """ + import numbers output_var = train_data.drop(input_variables, axis=1) if target_type == "classification": value_counts = output_var[output_var.columns[0]].value_counts() return {'eventPercentage': value_counts[target_value]/sum(value_counts)} elif target_type == "interval": - return {'eventAverage': sum(value_counts[value_counts.columns[0]]) / len(value_counts)} + if not isinstance(output_var[output_var.columns[0]].iloc[0], numbers.Number): + raise ValueError("Detected output column is not numeric. Please ensure that " + + "the correct output column is being passed, and that no extra columns " + + "are in front of the output column. 
This function assumes that the first " + + "non-input column is the output column.") + return {'eventAverage': sum(output_var[output_var.columns[0]]) / len(output_var)} @staticmethod def get_selection_statistic_value( - model_files, - selection_statistic + model_files: Union[str, Path, dict], + selection_statistic: str = "_GINI_" ): """ Finds the value of the chosen selection statistic in dmcas_fitstat.json, which should have been @@ -2493,10 +2499,11 @@ def update_model_properties( ) with open(Path(model_files) / PROP, 'r+') as properties_json: model_properties = json.load(properties_json) - if not isinstance(update_dict[key], str): - model_files[PROP][key] = str(round(update_dict[key], 14)) - else: - model_files[PROP][key] = update_dict[key] + for key in update_dict: + if not isinstance(update_dict[key], str): + model_properties[key] = str(round(update_dict[key], 14)) + else: + model_properties[key] = update_dict[key] properties_json.seek(0) properties_json.write(json.dumps(model_properties, indent=4, cls=NpEncoder)) properties_json.truncate() @@ -2537,14 +2544,6 @@ def generate_variable_importance( caslib: str, optional The caslib the training data will be stored on. The default value is "Public" """ - try: - sess = current_session() - conn = sess.as_swat() - except ImportError: - raise RuntimeError( - "The `swat` package is required to generate fit statistics, ROC, and " - "Lift charts with the calculate_model_statistics function." 
- ) # Remove target variable from training data by selecting only input variable columns x_train_data = train_data[interval_vars + class_vars] # Upload scored training data to run variable importance on @@ -2573,12 +2572,12 @@ def generate_variable_importance( "name": 'BIN', "inputs": [{"name": var} for var in interval_vars], "targets": [{"name": "Prediction"}], - "discretize":{ - "method":method, - "arguments":{ - "minNBins":1, - "maxNBins":8, - "treeCrit":treeCrit, + "discretize": { + "method": method, + "arguments": { + "minNBins": 1, + "maxNBins": 8, + "treeCrit": treeCrit, "contingencyTblOpts":{"inputsMethod": 'BUCKET', "inputsNLevels": 100}, "overrides": {"minNObsInBin": 5, "binMissing": True, "noDataLowerUpperBound": True} } @@ -2589,12 +2588,12 @@ def generate_variable_importance( "name": 'BIN_NOM', "inputs": [{"name": var} for var in class_vars], "targets": [{"name": "Prediction"}], - "catTrans":{ - "method":method, - "arguments":{ - "minNBins":1, - "maxNBins":8, - "treeCrit":treeCrit, + "catTrans": { + "method": method, + "arguments": { + "minNBins": 1, + "maxNBins": 8, + "treeCrit": treeCrit, "overrides": {"minNObsInBin": 5, "binMissing": True} } } diff --git a/tests/unit/test_write_json_files.py b/tests/unit/test_write_json_files.py index 412759f2..19cbd4f9 100644 --- a/tests/unit/test_write_json_files.py +++ b/tests/unit/test_write_json_files.py @@ -16,11 +16,14 @@ import warnings from pathlib import Path from unittest.mock import patch +import math import numpy as np import pandas as pd import pytest from sklearn.model_selection import train_test_split +from sklearn import datasets +from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier import sasctl.pzmm as pzmm @@ -43,6 +46,37 @@ {"name": "REASON_HomeImp", "type": "integer"}, ] +class BadModel: + attr = None + +@pytest.fixture +def bad_model(): + return BadModel() + + +@pytest.fixture +def train_data(): + """Returns the Iris data set as (X, y)""" + raw = 
datasets.load_iris() + iris = pd.DataFrame(raw.data, columns=raw.feature_names) + iris = iris.join(pd.DataFrame(raw.target)) + iris.columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species"] + iris["Species"] = iris["Species"].astype("category") + iris.Species.cat.categories = raw.target_names + return iris.iloc[:, 0:4], iris["Species"] + + +@pytest.fixture +def sklearn_model(train_data): + """Returns a simple Scikit-Learn model""" + X, y = train_data + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + model = LogisticRegression( + multi_class="multinomial", solver="lbfgs", max_iter=1000 + ) + model.fit(X, y) + return model @pytest.fixture def change_dir(): @@ -849,3 +883,148 @@ def test_errors(self): jf.assess_model_bias( score_table, sensitive_values, actual_values ) + + +class TestModelCardGeneration(unittest.TestCase): + def test_generate_outcome_average_interval(self): + df = pd.DataFrame({"input": [3, 2, 1], "output": [1, 2, 3]}) + assert ( + jf.generate_outcome_average(df, ["input"], "interval") == + {'eventAverage': 2.0} + ) + + def test_generate_outcome_average_classification(self): + df = pd.DataFrame({"input": [3, 2], "output": [0, 1]}) + event_percentage = jf.generate_outcome_average(df, ["input"], "classification", 1) + assert('eventPercentage' in event_percentage) + + def test_generate_outcome_average_interval_non_numeric_output(self): + df = pd.DataFrame({"input": [3, 2, 1], "output": ["one", "two", "three"]}) + with pytest.raises(ValueError): + jf.generate_outcome_average(df, ["input"], "interval") + + +class TestGetSelectionStatisticValue(unittest.TestCase): + model_file_dict = { + "dmcas_fitstat.json": { + "data": [ + { + "dataMap": { + "_GINI_": 1, + "_C_": 2, + "_TAU_": None, + "_DataRole_": "TRAIN" + } + } + ] + } + } + tmp_dir = tempfile.TemporaryDirectory() + with open(Path(tmp_dir.name) / "dmcas_fitstat.json", "w+") as f: + f.write(json.dumps(model_file_dict['dmcas_fitstat.json'])) + + def 
test_get_statistic_dict_default(self): + selection_statistic = jf.get_selection_statistic_value(self.model_file_dict) + assert(selection_statistic == 1) + + def test_get_statistic_dict_custom(self): + selection_statistic = jf.get_selection_statistic_value(self.model_file_dict, "_C_") + assert(selection_statistic == 2) + + def test_get_blank_statistic_dict(self): + with pytest.raises(RuntimeError): + jf.get_selection_statistic_value(self.model_file_dict, "_TAU_") + + def test_get_statistics_path_default(self): + selection_statistic = jf.get_selection_statistic_value(Path(self.tmp_dir.name)) + assert(selection_statistic == 1) + + def test_get_statistics_path_custom(self): + selection_statistic = jf.get_selection_statistic_value(Path(self.tmp_dir.name), "_C_") + assert(selection_statistic == 2) + + def test_get_blank_statistic_path(self): + with pytest.raises(RuntimeError): + jf.get_selection_statistic_value(Path(self.tmp_dir.name), "_TAU_") + + def test_get_statistics_str_default(self): + selection_statistic = jf.get_selection_statistic_value(self.tmp_dir.name) + assert (selection_statistic == 1) + + def test_get_statistics_str_custom(self): + selection_statistic = jf.get_selection_statistic_value(self.tmp_dir.name, "_C_") + assert (selection_statistic == 2) + + def test_get_blank_statistic_str(self): + with pytest.raises(RuntimeError): + jf.get_selection_statistic_value(self.tmp_dir.name, "_TAU_") + + +class TestUpdateModelProperties(unittest.TestCase): + def setUp(self): + self.model_file_dict = { + "ModelProperties.json": + { + "example": "property" + } + } + self.tmp_dir = tempfile.TemporaryDirectory() + with open(Path(self.tmp_dir.name) / "ModelProperties.json", "w+") as f: + f.write(json.dumps(self.model_file_dict['ModelProperties.json'])) + + def tearDown(self): + self.tmp_dir.cleanup() + + def test_update_model_properties_dict(self): + update_dict = {'new': 'arg', 'newer': 'thing'} + jf.update_model_properties(self.model_file_dict, update_dict) + 
assert(self.model_file_dict['ModelProperties.json']['example'] == 'property') + assert(self.model_file_dict['ModelProperties.json']['new'] == 'arg') + assert(self.model_file_dict['ModelProperties.json']['newer'] == 'thing') + + def test_update_model_properties_dict_overwrite(self): + update_dict = {'new': 'arg', 'example': 'thing'} + jf.update_model_properties(self.model_file_dict, update_dict) + assert (self.model_file_dict['ModelProperties.json']['example'] == 'thing') + assert (self.model_file_dict['ModelProperties.json']['new'] == 'arg') + + def test_update_model_properties_dict_number(self): + update_dict = {"number": 1} + jf.update_model_properties(self.model_file_dict, update_dict) + assert (self.model_file_dict['ModelProperties.json']['number'] == '1') + + def test_update_model_properties_dict_round_number(self): + update_dict = {'number': 0.123456789012345} + jf.update_model_properties(self.model_file_dict, update_dict) + assert (self.model_file_dict['ModelProperties.json']['number'] == '0.12345678901234') + + def test_update_model_properties_str(self): + update_dict = {'new': 'arg', 'newer': 'thing'} + jf.update_model_properties(self.tmp_dir.name, update_dict) + with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f: + model_properties = json.load(f) + assert(model_properties['example'] == 'property') + assert(model_properties['new'] == 'arg') + assert(model_properties['newer'] == 'thing') + + def test_update_model_properties_str_overwrite(self): + update_dict = {'new': 'arg', 'example': 'thing'} + jf.update_model_properties(self.tmp_dir.name, update_dict) + with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f: + model_properties = json.load(f) + assert (model_properties['example'] == 'thing') + assert (model_properties['new'] == 'arg') + + def test_update_model_properties_str_number(self): + update_dict = {"number": 1} + jf.update_model_properties(self.tmp_dir.name, update_dict) + with open(Path(self.tmp_dir.name) / 
'ModelProperties.json', 'r') as f: + model_properties = json.load(f) + assert (model_properties['number'] == '1') + + def test_update_model_properties_str_round_number(self): + update_dict = {'number': 0.123456789012345} + jf.update_model_properties(self.tmp_dir.name, update_dict) + with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f: + model_properties = json.load(f) + assert (model_properties['number'] == '0.12345678901234') \ No newline at end of file