Added some tests for model cards + fixed some model card errors
djm21 committed Mar 12, 2024
1 parent c20c162 commit 3913c30
Showing 2 changed files with 206 additions and 28 deletions.
55 changes: 27 additions & 28 deletions src/sasctl/pzmm/write_json_files.py
@@ -2374,7 +2374,7 @@ def generate_outcome_average(
):
"""
Generates the outcome average of the training data. For Interval targets, the event average
-        is generated. For Classification targets, the event average is returned.
+        is generated. For Classification targets, the event percentage is returned.
Parameters
----------
@@ -2395,17 +2395,23 @@
dict
Returns a dictionary with a key value pair that represents the outcome average.
"""
+        import numbers
output_var = train_data.drop(input_variables, axis=1)
if target_type == "classification":
value_counts = output_var[output_var.columns[0]].value_counts()
return {'eventPercentage': value_counts[target_value]/sum(value_counts)}
-        elif target_type == "interval":
-            return {'eventAverage': sum(value_counts[value_counts.columns[0]]) / len(value_counts)}
+        if not isinstance(output_var[output_var.columns[0]].iloc[0], numbers.Number):
+            raise ValueError("Detected output column is not numeric. Please ensure that " +
+                             "the correct output column is being passed, and that no extra columns " +
+                             "are in front of the output column. This function assumes that the first " +
+                             "non-input column is the output column.")
+        return {'eventAverage': sum(output_var[output_var.columns[0]]) / len(output_var)}
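For reference, a minimal sketch of the corrected behavior, mirroring the new tests added below (assumptions: `generate_outcome_average` remains a static method on `JSONFiles`, imported here under the alias `jf` as the test module appears to use it):

```python
import pandas as pd
from sasctl.pzmm.write_json_files import JSONFiles as jf

# Interval target: the mean of the first non-input column
df = pd.DataFrame({"input": [3, 2, 1], "output": [1, 2, 3]})
jf.generate_outcome_average(df, ["input"], "interval")           # {'eventAverage': 2.0}

# Classification target: the fraction of rows equal to target_value
df = pd.DataFrame({"input": [3, 2], "output": [0, 1]})
jf.generate_outcome_average(df, ["input"], "classification", 1)  # {'eventPercentage': 0.5}
```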

@staticmethod
def get_selection_statistic_value(
-        model_files,
-        selection_statistic
+        model_files: Union[str, Path, dict],
+        selection_statistic: str = "_GINI_"
):
"""
Finds the value of the chosen selection statistic in dmcas_fitstat.json, which should have been
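The docstring is cut off in this view, but the tests added below pin down the contract. A sketch of the accepted call patterns (the dict layout is copied from those tests; `jf` is the same hypothetical alias as above):

```python
from sasctl.pzmm.write_json_files import JSONFiles as jf

# In-memory form: a dict of model files keyed by file name
model_file_dict = {
    "dmcas_fitstat.json": {
        "data": [{"dataMap": {"_GINI_": 1, "_C_": 2, "_TAU_": None, "_DataRole_": "TRAIN"}}]
    }
}
jf.get_selection_statistic_value(model_file_dict)          # 1, via the "_GINI_" default
jf.get_selection_statistic_value(model_file_dict, "_C_")   # 2
# A statistic that is absent or None, such as "_TAU_", raises RuntimeError;
# a str or Path pointing at the directory holding dmcas_fitstat.json works the same way.
```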
@@ -2493,10 +2499,11 @@ def update_model_properties(
)
with open(Path(model_files) / PROP, 'r+') as properties_json:
model_properties = json.load(properties_json)
-            if not isinstance(update_dict[key], str):
-                model_files[PROP][key] = str(round(update_dict[key], 14))
-            else:
-                model_files[PROP][key] = update_dict[key]
+            for key in update_dict:
+                if not isinstance(update_dict[key], str):
+                    model_properties[key] = str(round(update_dict[key], 14))
+                else:
+                    model_properties[key] = update_dict[key]
properties_json.seek(0)
properties_json.write(json.dumps(model_properties, indent=4, cls=NpEncoder))
properties_json.truncate()
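The corrected loop now walks every key in `update_dict` and writes into the loaded JSON rather than into the path argument. A short sketch of the resulting behavior, taken from the new tests below (dict input shown; a directory path is handled the same way):

```python
from sasctl.pzmm.write_json_files import JSONFiles as jf

model_files = {"ModelProperties.json": {"example": "property"}}
jf.update_model_properties(model_files, {"newer": "thing", "number": 0.123456789012345})
# Strings pass through unchanged; numbers are rounded to 14 places and stringified:
# {'example': 'property', 'newer': 'thing', 'number': '0.12345678901234'}
```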
@@ -2537,14 +2544,6 @@ def generate_variable_importance(
caslib: str, optional
The caslib the training data will be stored on. The default value is "Public"
"""
-        try:
-            sess = current_session()
-            conn = sess.as_swat()
-        except ImportError:
-            raise RuntimeError(
-                "The `swat` package is required to generate fit statistics, ROC, and "
-                "Lift charts with the calculate_model_statistics function."
-            )
# Remove target variable from training data by selecting only input variable columns
x_train_data = train_data[interval_vars + class_vars]
# Upload scored training data to run variable importance on
@@ -2573,12 +2572,12 @@
"name": 'BIN',
"inputs": [{"name": var} for var in interval_vars],
"targets": [{"name": "Prediction"}],
"discretize":{
"method":method,
"arguments":{
"minNBins":1,
"maxNBins":8,
"treeCrit":treeCrit,
"discretize": {
"method": method,
"arguments": {
"minNBins": 1,
"maxNBins": 8,
"treeCrit": treeCrit,
"contingencyTblOpts":{"inputsMethod": 'BUCKET', "inputsNLevels": 100},
"overrides": {"minNObsInBin": 5, "binMissing": True, "noDataLowerUpperBound": True}
}
@@ -2589,12 +2588,12 @@
"name": 'BIN_NOM',
"inputs": [{"name": var} for var in class_vars],
"targets": [{"name": "Prediction"}],
"catTrans":{
"method":method,
"arguments":{
"minNBins":1,
"maxNBins":8,
"treeCrit":treeCrit,
"catTrans": {
"method": method,
"arguments": {
"minNBins": 1,
"maxNBins": 8,
"treeCrit": treeCrit,
"overrides": {"minNObsInBin": 5, "binMissing": True}
}
}
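The `BIN` and `BIN_NOM` templates above have the shape of CAS `dataPreprocess` request packages. As a rough, hypothetical sketch of how such a template could be submitted over a swat connection (only `current_session().as_swat()` appears in this diff, in the block removed above; the action call, table name, and variable names here are illustrative assumptions, not lifted from the source):

```python
from sasctl import current_session

conn = current_session().as_swat()     # CAS connection via swat, as in the removed block
conn.loadactionset("dataPreprocess")   # assumed: the binning actions live in this action set

request = {                            # mirrors the 'BIN' template above
    "name": "BIN",
    "inputs": [{"name": "var1"}],      # hypothetical interval input
    "targets": [{"name": "Prediction"}],
    "discretize": {"method": "DTREE", "arguments": {"minNBins": 1, "maxNBins": 8}},
}
# Hypothetical submission; the real function builds and submits these packages internally
conn.dataPreprocess.transform(
    table={"name": "scored_train_data", "caslib": "Public"},
    requestPackages=[request],
    casOut={"name": "binned", "caslib": "Public", "replace": True},
)
```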
179 changes: 179 additions & 0 deletions tests/unit/test_write_json_files.py
@@ -16,11 +16,14 @@
import warnings
from pathlib import Path
from unittest.mock import patch
import math

import numpy as np
import pandas as pd
import pytest
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import sasctl.pzmm as pzmm
@@ -43,6 +46,37 @@
{"name": "REASON_HomeImp", "type": "integer"},
]

class BadModel:
attr = None

@pytest.fixture
def bad_model():
return BadModel()


@pytest.fixture
def train_data():
"""Returns the Iris data set as (X, y)"""
raw = datasets.load_iris()
iris = pd.DataFrame(raw.data, columns=raw.feature_names)
iris = iris.join(pd.DataFrame(raw.target))
iris.columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species"]
iris["Species"] = iris["Species"].astype("category")
iris.Species.cat.categories = raw.target_names
return iris.iloc[:, 0:4], iris["Species"]


@pytest.fixture
def sklearn_model(train_data):
"""Returns a simple Scikit-Learn model"""
X, y = train_data
with warnings.catch_warnings():
warnings.simplefilter("ignore")
model = LogisticRegression(
multi_class="multinomial", solver="lbfgs", max_iter=1000
)
model.fit(X, y)
return model

@pytest.fixture
def change_dir():
@@ -849,3 +883,148 @@ def test_errors(self):
jf.assess_model_bias(
score_table, sensitive_values, actual_values
)


class TestModelCardGeneration(unittest.TestCase):
def test_generate_outcome_average_interval(self):
df = pd.DataFrame({"input": [3, 2, 1], "output": [1, 2, 3]})
assert (
jf.generate_outcome_average(df, ["input"], "interval") ==
{'eventAverage': 2.0}
)

def test_generate_outcome_average_classification(self):
df = pd.DataFrame({"input": [3, 2], "output": [0, 1]})
event_percentage = jf.generate_outcome_average(df, ["input"], "classification", 1)
assert('eventPercentage' in event_percentage)

def test_generate_outcome_average_interval_non_numeric_output(self):
df = pd.DataFrame({"input": [3, 2, 1], "output": ["one", "two", "three"]})
with pytest.raises(ValueError):
jf.generate_outcome_average(df, ["input"], "interval")


class TestGetSelectionStatisticValue(unittest.TestCase):
model_file_dict = {
"dmcas_fitstat.json": {
"data": [
{
"dataMap": {
"_GINI_": 1,
"_C_": 2,
"_TAU_": None,
"_DataRole_": "TRAIN"
}
}
]
}
}
tmp_dir = tempfile.TemporaryDirectory()
with open(Path(tmp_dir.name) / "dmcas_fitstat.json", "w+") as f:
f.write(json.dumps(model_file_dict['dmcas_fitstat.json']))

def test_get_statistic_dict_default(self):
selection_statistic = jf.get_selection_statistic_value(self.model_file_dict)
assert(selection_statistic == 1)

def test_get_statistic_dict_custom(self):
selection_statistic = jf.get_selection_statistic_value(self.model_file_dict, "_C_")
assert(selection_statistic == 2)

def test_get_blank_statistic_dict(self):
with pytest.raises(RuntimeError):
jf.get_selection_statistic_value(self.model_file_dict, "_TAU_")

def test_get_statistics_path_default(self):
selection_statistic = jf.get_selection_statistic_value(Path(self.tmp_dir.name))
assert(selection_statistic == 1)

def test_get_statistics_path_custom(self):
selection_statistic = jf.get_selection_statistic_value(Path(self.tmp_dir.name), "_C_")
assert(selection_statistic == 2)

def test_get_blank_statistic_path(self):
with pytest.raises(RuntimeError):
jf.get_selection_statistic_value(Path(self.tmp_dir.name), "_TAU_")

def test_get_statistics_str_default(self):
selection_statistic = jf.get_selection_statistic_value(self.tmp_dir.name)
assert (selection_statistic == 1)

def test_get_statistics_str_custom(self):
selection_statistic = jf.get_selection_statistic_value(self.tmp_dir.name, "_C_")
assert (selection_statistic == 2)

def test_get_blank_statistic_str(self):
with pytest.raises(RuntimeError):
jf.get_selection_statistic_value(self.tmp_dir.name, "_TAU_")


class TestUpdateModelProperties(unittest.TestCase):
def setUp(self):
self.model_file_dict = {
"ModelProperties.json":
{
"example": "property"
}
}
self.tmp_dir = tempfile.TemporaryDirectory()
with open(Path(self.tmp_dir.name) / "ModelProperties.json", "w+") as f:
f.write(json.dumps(self.model_file_dict['ModelProperties.json']))

def tearDown(self):
self.tmp_dir.cleanup()

def test_update_model_properties_dict(self):
update_dict = {'new': 'arg', 'newer': 'thing'}
jf.update_model_properties(self.model_file_dict, update_dict)
assert(self.model_file_dict['ModelProperties.json']['example'] == 'property')
assert(self.model_file_dict['ModelProperties.json']['new'] == 'arg')
assert(self.model_file_dict['ModelProperties.json']['newer'] == 'thing')

def test_update_model_properties_dict_overwrite(self):
update_dict = {'new': 'arg', 'example': 'thing'}
jf.update_model_properties(self.model_file_dict, update_dict)
assert (self.model_file_dict['ModelProperties.json']['example'] == 'thing')
assert (self.model_file_dict['ModelProperties.json']['new'] == 'arg')

def test_update_model_properties_dict_number(self):
update_dict = {"number": 1}
jf.update_model_properties(self.model_file_dict, update_dict)
assert (self.model_file_dict['ModelProperties.json']['number'] == '1')

def test_update_model_properties_dict_round_number(self):
update_dict = {'number': 0.123456789012345}
jf.update_model_properties(self.model_file_dict, update_dict)
assert (self.model_file_dict['ModelProperties.json']['number'] == '0.12345678901234')

def test_update_model_properties_str(self):
update_dict = {'new': 'arg', 'newer': 'thing'}
jf.update_model_properties(self.tmp_dir.name, update_dict)
with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f:
model_properties = json.load(f)
assert(model_properties['example'] == 'property')
assert(model_properties['new'] == 'arg')
assert(model_properties['newer'] == 'thing')

def test_update_model_properties_str_overwrite(self):
update_dict = {'new': 'arg', 'example': 'thing'}
jf.update_model_properties(self.tmp_dir.name, update_dict)
with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f:
model_properties = json.load(f)
assert (model_properties['example'] == 'thing')
assert (model_properties['new'] == 'arg')

def test_update_model_properties_str_number(self):
update_dict = {"number": 1}
jf.update_model_properties(self.tmp_dir.name, update_dict)
with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f:
model_properties = json.load(f)
assert (model_properties['number'] == '1')

def test_update_model_properties_str_round_number(self):
update_dict = {'number': 0.123456789012345}
jf.update_model_properties(self.tmp_dir.name, update_dict)
with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f:
model_properties = json.load(f)
assert (model_properties['number'] == '0.12345678901234')
