From 53cb8bb9a337fe9082ea04fb9c1ba731152d5004 Mon Sep 17 00:00:00 2001 From: djm21 Date: Wed, 6 Mar 2024 18:22:26 -0600 Subject: [PATCH 1/9] Added functionality for Model Cards in write_json_files. --- .../dmcas_relativeimportance.json | 58 +++ src/sasctl/pzmm/write_json_files.py | 445 ++++++++++++++++++ 2 files changed, 503 insertions(+) create mode 100644 src/sasctl/pzmm/template_files/dmcas_relativeimportance.json diff --git a/src/sasctl/pzmm/template_files/dmcas_relativeimportance.json b/src/sasctl/pzmm/template_files/dmcas_relativeimportance.json new file mode 100644 index 00000000..c64cae9f --- /dev/null +++ b/src/sasctl/pzmm/template_files/dmcas_relativeimportance.json @@ -0,0 +1,58 @@ +{ + "creationTimeStamp" : "0001-01-01T00:00:00Z", + "modifiedTimeStamp" : "0001-01-01T00:00:00Z", + "revision" : 0, + "name" : "dmcas_relativeimportance", + "version" : 0, + "order" : 0, + "parameterMap" : { + "LABEL" : { + "label" : "Variable Label", + "length" : 256, + "order" : 1, + "parameter" : "LABEL", + "preformatted" : false, + "type" : "char", + "values" : [ "LABEL" ] + }, + "LEVEL" : { + "label" : "Variable Level", + "length" : 10, + "order" : 5, + "parameter" : "LEVEL", + "preformatted" : false, + "type" : "char", + "values" : [ "LEVEL" ] + }, + "ROLE" : { + "label" : "Role", + "length" : 32, + "order" : 4, + "parameter" : "ROLE", + "preformatted" : false, + "type" : "char", + "values" : [ "ROLE" ] + }, + "RelativeImportance" : { + "label" : "Relative Importance", + "length" : 8, + "order" : 3, + "parameter" : "RelativeImportance", + "preformatted" : false, + "type" : "num", + "values" : [ "RelativeImportance" ] + }, + "Variable" : { + "label" : "Variable Name", + "length" : 255, + "order" : 2, + "parameter" : "Variable", + "preformatted" : false, + "type" : "char", + "values" : [ "Variable" ] + } + }, + "data" : [], + "xInteger" : false, + "yInteger" : false + } \ No newline at end of file diff --git a/src/sasctl/pzmm/write_json_files.py 
b/src/sasctl/pzmm/write_json_files.py index 2d2fc327..2618c209 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -57,6 +57,7 @@ class NpEncoder(json.JSONEncoder): LIFT = "dmcas_lift.json" MAXDIFFERENCES = "maxDifferences.json" GROUPMETRICS = "groupMetrics.json" +VARIMPORTANCES = 'dmcas_relativeimportance.json' def _flatten(nested_list: Iterable) -> Generator[Any, None, None]: @@ -2198,3 +2199,447 @@ def remove_standard_library_packages(package_list: List[str]) -> List[str]: package for package in package_list if package not in py10stdlib ] return package_list + + @classmethod + def generate_model_card( + cls, + model_prefix: str, + model_files: Union[str, Path, dict], + algorithm: str, + train_data: pd.DataFrame, + train_predictions: Union[pd.Series, list], + target_type: str = "Interval", + target_value: Union[str, int, float, None] = None, + interval_vars: Optional[list] = [], + class_vars: Optional[list] = [], + selection_statistic: str = "_GINI_", + server: str = "cas-shared-default", + caslib: str = "Public", + ): + """ + Generates everything required for the model card feature within SAS Model Manager. + + This includes uploading the training data to CAS, updating ModelProperties.json to have + some extra properties, and generating dmcas_relativeimportance.json. + + Parameters + ---------- + model_prefix : string + The prefix used to name files relating to the model. This is used to provide a unique + name to the training data table when it is uploaded to CAS. + model_files : string, Path, or dict + Either the directory location of the model files (string or Path object), or + a dictionary containing the contents of all the model files. + algorithm : str + The name of the algorithm used to generate the model. + train_data: pandas.DataFrame + Training data that contains all input variables as well as the target variable. 
+ train_predictions : pandas.Series, list + List of predictions made by the model on the training data. + target_type : string + Type the model is targeting. Currently supports "Classification" and "Interval" types. + The default value is "Interval". + target_value : string, int, float, optional + Value the model is targeting for Classification models. This argument is not needed for + Interval models. The default value is None. + interval_vars : list, optional + A list of interval variables. The default value is an empty list. + class_vars : list, optional + A list of classification variables. The default value is an empty list. + selection_statistic: str, optional + The selection statistic chosen to score the model against other models. Can be any of the + following values: "_RASE_", "_NObs_", "_GINI_", "_GAMMA_", "_MCE_", "_ASE_", "_MCLL_", + "_KS_", "_KSPostCutoff_", "_DIV_", "_TAU_", "_KSCut_", or "_C_". The default value is "_GINI_". + server: str, optional + The CAS server the training data will be stored on. The default value is "cas-shared-default" + caslib: str, optional + The caslib the training data will be stored on. The default value is "Public" + """ + if not target_value and target_type == "Classification": + raise RuntimeError( + "For the model card data to be properly generated on a Classification " + "model, a target value is required." + ) + if target_type not in ["Classification", "Interval"]: + raise RuntimeError( + "Only Classification and Interval target types are currently accepted." + ) + if selection_statistic not in cls.valid_params: + raise RuntimeError( + "The selection statistic must be a value generated in dmcas_fitstat.json. See " + "the documentation for a list of valid selection statistic values." + ) + if not algorithm: + raise RuntimeError( + "Either a given algorithm or a model is required for the model card." 
+ ) + try: + sess = current_session() + conn = sess.as_swat() + except ImportError: + raise RuntimeError( + "The `swat` package is required to generate fit statistics, ROC, and " + "Lift charts with the calculate_model_statistics function." + ) + + # Upload training table to CAS. The location of the training table is returned. + training_table = cls.upload_training_data( + conn, + model_prefix, + train_data, + server, + caslib + ) + + # Generates the event percentage for Classification targets, and the event average + # for Interval targets + update_dict = cls.generate_outcome_average( + train_data=train_data, + input_variables=interval_vars + class_vars, + target_type=target_type, + target_value=target_value + ) + + # Formats all new ModelProperties information into one dictionary that can be used to update the json file + update_dict['trainTable'] = training_table + update_dict['selectionStatistic'] = selection_statistic + update_dict['algorithm'] = algorithm + update_dict['selectionStatisticValue'] = cls.get_selection_statistic_value(model_files, selection_statistic) + cls.update_model_properties(model_files, update_dict) + + # Generates dmcas_relativeimportance.json file + cls.generate_variable_importance( + conn, + model_files, + train_data, + train_predictions, + target_type, + interval_vars, + class_vars, + caslib + ) + + @staticmethod + def upload_training_data( + conn, + model_prefix: str, + train_data: pd.DataFrame, + server: str = "cas-shared-default", + caslib: str = 'Public' + ): + """ + Uploads training data to CAS server. + + Parameters + ---------- + conn + SWAT connection. Used to connect to CAS server. + model_prefix : string + The prefix used to name files relating to the model. This is used to provide a unique + name to the training data table when it is uploaded to CAS. + train_data: pandas.DataFrame + Training data that contains all input variables as well as the target variable. 
+ server: str, optional + The CAS server the training data will be stored on. The default value is "cas-shared-default" + caslib: str, optional + The caslib the training data will be stored on. The default value is "Public" + + Returns + ------- + string + Returns a string that represents the location of the training table within CAS. + """ + # Upload raw training data to caslib so that data can be analyzed + train_data_name = model_prefix + "_train_data" + upload_train_data = conn.upload( + train_data, + casout={"name": train_data_name, "caslib": caslib}, + promote=True + ) + + if upload_train_data.status is not None: + raise RuntimeError( + f'A table with the name {train_data_name} already exists in the specified caslib. Please ' + 'either delete/rename the old table or give a new name to the current table.' + ) + + return server + '/' + caslib + '/' + train_data_name + + @staticmethod + def generate_outcome_average( + train_data: pd.DataFrame, + input_variables: list, + target_type, + target_value: Union[str, int, float] = None + ): + """ + Generates the outcome average of the training data. For Interval targets, the event average + is generated. For Classification targets, the event average is returned. + + Parameters + ---------- + train_data: pandas.DataFrame + Training data that contains all input variables as well as the target variable. If multiple + non-input variables are included, the function will assume that the first non-input variable row + is the output. + input_variables: list + A list of all input variables used by the model. Used to isolate the output variable. + target_type : string + Type the model is targeting. Currently supports "Classification" and "Interval" types. + target_value : string, int, float, optional + Value the model is targeting for Classification models. This argument is not needed for + Interval models. The default value is None. 
+ + Returns + ------- + dict + Returns a dictionary with a key value pair that represents the outcome average. + """ + output_var = train_data.drop(input_variables, axis=1) + if target_type == "Classification": + value_counts = output_var[output_var.columns[0]].value_counts() + return {'eventPercentage': value_counts[target_value]/sum(value_counts)} + elif target_type == "Interval": + return {'eventAverage': sum(value_counts[value_counts.columns[0]]) / len(value_counts)} + + @staticmethod + def get_selection_statistic_value( + model_files, + selection_statistic + ): + """ + Finds the value of the chosen selection statistic in dmcas_fitstat.json, which should have been + generated before this function has been called. + + Parameters + ---------- + model_files : string, Path, or dict + Either the directory location of the model files (string or Path object), or + a dictionary containing the contents of all the model files. + selection_statistic: str, optional + The selection statistic chosen to score the model against other models. Can be any of the + following values: "_RASE_", "_NObs_", "_GINI_", "_GAMMA_", "_MCE_", "_ASE_", "_MCLL_", + "_KS_", "_KSPostCutoff_", "_DIV_", "_TAU_", "_KSCut_", or "_C_". The default value is "_GINI_". + + Returns + ------- + float + Returns the numerical value associated with the chosen selection statistic. + """ + if isinstance(model_files, dict): + if FITSTAT not in model_files: + raise RuntimeError( + "The dmcas_fitstat.json file must be generated before the model card data " + "can be generated." + ) + for fitstat in model_files[FITSTAT]['data']: + if fitstat['dataMap']['_DataRole_'] == "TRAIN": + if selection_statistic not in fitstat['dataMap'] or fitstat['dataMap'][selection_statistic] == None: + raise RuntimeError( + "The chosen selection statistic was not generated properly. Please ensure the value has been " + "properly created then try again."
+ ) + return fitstat['dataMap'][selection_statistic] + else: + if not Path.exists(Path(model_files) / FITSTAT): + raise RuntimeError( + "The dmcas_fitstat.json file must be generated before the model card data " + "can be generated." + ) + with open(Path(model_files) / FITSTAT, 'r') as fitstat_json: + fitstat_dict = json.load(fitstat_json) + for fitstat in fitstat_dict['data']: + if fitstat['dataMap']['_DataRole_'] == "TRAIN": + if selection_statistic not in fitstat['dataMap'] or fitstat['dataMap'][selection_statistic] == None: + raise RuntimeError( + "The chosen selection statistic was not generated properly. Please ensure the value has been " + "properly created then try again." + ) + return fitstat['dataMap'][selection_statistic] + + @staticmethod + def update_model_properties( + model_files, + update_dict + ): + """ + Updates the ModelProperties.json file to include properties listed in the update_dict dictionary. + + Parameters + ---------- + model_files : string, Path, or dict + Either the directory location of the model files (string or Path object), or + a dictionary containing the contents of all the model files. + update_dict : dictionary + A dictionary containing the key-value pairs that represent properties to be added + to the ModelProperties.json file. + """ + if isinstance(model_files, dict): + if PROP not in model_files: + raise RuntimeError( + "The ModelProperties.json file must be generated before the model card data " + "can be generated." + ) + for key, value in update_dict: + model_files[PROP][key] = value + else: + if not Path.exists(Path(model_files) / PROP): + raise RuntimeError( + "The ModelProperties.json file must be generated before the model card data " + "can be generated." 
+ ) + with open(Path(model_files) / PROP, 'r+') as properties_json: + model_properties = json.load(properties_json) + for key, value in update_dict: + model_properties[key] = value + properties_json.seek(0) + properties_json.write(json.dumps(model_properties, indent=4, cls=NpEncoder)) + properties_json.truncate() + + @classmethod + def generate_variable_importance( + cls, + conn, + model_files: Union[str, Path, dict], + train_data: pd.DataFrame, + train_predictions: Union[pd.Series, list], + target_type: str = "interval", + interval_vars: Optional[list] = [], + class_vars: Optional[list] = [], + caslib: str = "Public", + ): + """ + Generates the dmcas_relativeimportance.json file, which is used to determine variable importance + + Parameters + ---------- + conn + A SWAT connection used to connect to the user's CAS server + model_files : string, Path, or dict + Either the directory location of the model files (string or Path object), or + a dictionary containing the contents of all the model files. + train_data: pandas.DataFrame + Training data that contains all input variables as well as the target variable. + train_predictions : pandas.Series, list + List of predictions made by the model on the training data. + target_type : string, optional + Type the model is targeting. Currently supports "Classification" and "Interval" types. + The default value is "Interval". + interval_vars : list, optional + A list of interval variables. The default value is an empty list. + class_vars : list, optional + A list of classification variables. The default value is an empty list. + caslib: str, optional + The caslib the training data will be stored on. The default value is "Public" + """ + try: + sess = current_session() + conn = sess.as_swat() + except ImportError: + raise RuntimeError( + "The `swat` package is required to generate fit statistics, ROC, and " + "Lift charts with the calculate_model_statistics function." 
+ ) + # Remove target variable from training data by selecting only input variable columns + x_train_data = train_data[interval_vars + class_vars] + # Upload scored training data to run variable importance on + x_train_data.insert(0, "Prediction", train_predictions, True) + conn.upload( + x_train_data, + casout={"name": "train_data", "replace": True, "caslib": caslib} + ) + + # Load actionset necessary to generate variable importance + conn.loadactionset('dataPreprocess') + request_packages = list() + if target_type == "classification": + method = "DTREE" + treeCrit = "Entropy" + elif target_type == "interval": + method = "RTREE" + treeCrit = 'RSS' + else: + raise RuntimeError( + "The selected model type is unsupported. Currently, only models that have interval or classification target types are supported." + ) + request_packages = list() + if interval_vars: + request_packages.append({ + "name": 'BIN', + "inputs": [{"name": var} for var in interval_vars], + "targets": [{"name": "Prediction"}], + "discretize":{ + "method":method, + "arguments":{ + "minNBins":1, + "maxNBins":8, + "treeCrit":treeCrit, + "contingencyTblOpts":{"inputsMethod": 'BUCKET', "inputsNLevels": 100}, + "overrides": {"minNObsInBin": 5, "binMissing": True, "noDataLowerUpperBound": True} + } + } + }) + if class_vars: + request_packages.append({ + "name": 'BIN_NOM', + "inputs": [{"name": var} for var in class_vars], + "targets": [{"name": "Prediction"}], + "catTrans":{ + "method":method, + "arguments":{ + "minNBins":1, + "maxNBins":8, + "treeCrit":treeCrit, + "overrides": {"minNObsInBin": 5, "binMissing": True} + } + } + }) + var_data = conn.dataPreprocess.transform( + table={"name": "test_data", "caslib": caslib}, + requestPackages=request_packages, + evaluationStats=True, + percentileMaxIterations=10, + percentileTolerance=0.00001, + distinctCountLimit=5000, + sasVarNameLength=True, + outputTableOptions={"inputVarPrintOrder": True}, + sasProcClient=True + ) + var_importances = 
var_data['VarTransInfo'][['Variable', 'RelVarImportance']] + var_importances = var_importances.sort_values(by=['RelVarImportance'], ascending=False).reset_index(drop=True) + relative_importances = list() + for index, row in var_importances.iterrows(): + if row['Variable'] in interval_vars: + level = "INTERVAL" + elif row['Variable'] in class_vars: + level = "NOMINAL" + relative_importances.append({ + "dataMap" : { + "LABEL": "", + "LEVEL": level, + "ROLE": "INPUT", + "RelativeImportance": str(row['RelVarImportance']), + "Variable": row['Variable'] + }, + "rowNumber": index+1 + }) + with open('./dmcas_relativeimportance.json', 'r') as f: + relative_importance_json = json.load(f) + relative_importance_json['data'] = relative_importances + + if isinstance(model_files, dict): + model_files[VARIMPORTANCES] = json.dumps(relative_importance_json, indent=4, cls=NpEncoder) + if cls.notebook_output: + print( + f"{VARIMPORTANCES} was successfully written and saved to " + f"model files dictionary." + ) + else: + with open(Path(model_files) / VARIMPORTANCES, 'w') as json_file: + json_file.write(json.dumps(relative_importance_json, indent=4, cls=NpEncoder)) + if cls.notebook_output: + print( + f"{VARIMPORTANCES} was successfully written and saved to " + f"{Path(model_files) / VARIMPORTANCES}" + + ) \ No newline at end of file From 850f05498e0bed264715cccd9010b391f2752666 Mon Sep 17 00:00:00 2001 From: djm21 Date: Wed, 6 Mar 2024 19:53:14 -0600 Subject: [PATCH 2/9] fixed some bugs to allow for model card files to generate correctly. 
--- src/sasctl/pzmm/write_json_files.py | 34 +++++++++++++++-------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index 2618c209..d905acde 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -2208,7 +2208,7 @@ def generate_model_card( algorithm: str, train_data: pd.DataFrame, train_predictions: Union[pd.Series, list], - target_type: str = "Interval", + target_type: str = "interval", target_value: Union[str, int, float, None] = None, interval_vars: Optional[list] = [], class_vars: Optional[list] = [], @@ -2237,10 +2237,10 @@ def generate_model_card( train_predictions : pandas.Series, list List of predictions made by the model on the training data. target_type : string - Type the model is targeting. Currently supports "Classification" and "Interval" types. + Type the model is targeting. Currently supports "classification" and "interval" types. The default value is "Interval". target_value : string, int, float, optional - Value the model is targeting for Classification models. This argument is not needed for + Value the model is targeting for classification models. This argument is not needed for Interval models. The default value is None. interval_vars : list, optional A list of interval variables. The default value is an empty list. @@ -2255,14 +2255,14 @@ def generate_model_card( caslib: str, optional The caslib the training data will be stored on. The default value is "Public" """ - if not target_value and target_type == "Classification": + if not target_value and target_type == "classification": raise RuntimeError( - "For the model card data to be properly generated on a Classification " + "For the model card data to be properly generated on a classification " "model, a target value is required." 
) - if target_type not in ["Classification", "Interval"]: + if target_type not in ["classification", "interval"]: raise RuntimeError( - "Only Classification and Interval target types are currently accepted." + "Only classification and interval target types are currently accepted." ) if selection_statistic not in cls.valid_params: raise RuntimeError( @@ -2396,10 +2396,10 @@ def generate_outcome_average( Returns a dictionary with a key value pair that represents the outcome average. """ output_var = train_data.drop(input_variables, axis=1) - if target_type == "Classification": + if target_type == "classification": value_counts = output_var[output_var.columns[0]].value_counts() return {'eventPercentage': value_counts[target_value]/sum(value_counts)} - elif target_type == "Interval": + elif target_type == "interval": return {'eventAverage': sum(value_counts[value_counts.columns[0]]) / len(value_counts)} @staticmethod @@ -2480,8 +2480,8 @@ def update_model_properties( "The ModelProperties.json file must be generated before the model card data " "can be generated." 
) - for key, value in update_dict: - model_files[PROP][key] = value + for key in update_dict: + model_files[PROP][key] = update_dict[key] else: if not Path.exists(Path(model_files) / PROP): raise RuntimeError( @@ -2490,8 +2490,8 @@ def update_model_properties( ) with open(Path(model_files) / PROP, 'r+') as properties_json: model_properties = json.load(properties_json) - for key, value in update_dict: - model_properties[key] = value + for key in update_dict: + model_properties[key] = update_dict[key] properties_json.seek(0) properties_json.write(json.dumps(model_properties, indent=4, cls=NpEncoder)) properties_json.truncate() @@ -2595,7 +2595,7 @@ def generate_variable_importance( } }) var_data = conn.dataPreprocess.transform( - table={"name": "test_data", "caslib": caslib}, + table={"name": "train_data", "caslib": caslib}, requestPackages=request_packages, evaluationStats=True, percentileMaxIterations=10, @@ -2623,7 +2623,10 @@ def generate_variable_importance( }, "rowNumber": index+1 }) - with open('./dmcas_relativeimportance.json', 'r') as f: + json_template_path = ( + Path(__file__).resolve().parent / f"template_files/{VARIMPORTANCES}" + ) + with open(json_template_path, 'r') as f: relative_importance_json = json.load(f) relative_importance_json['data'] = relative_importances @@ -2641,5 +2644,4 @@ def generate_variable_importance( print( f"{VARIMPORTANCES} was successfully written and saved to " f"{Path(model_files) / VARIMPORTANCES}" - ) \ No newline at end of file From c20c1629aef3aabd20765da4318f6fe1e33eb7dd Mon Sep 17 00:00:00 2001 From: djm21 Date: Sun, 10 Mar 2024 22:21:28 -0500 Subject: [PATCH 3/9] Updated update_model_properties to give proper type/length to model properties --- src/sasctl/pzmm/write_json_files.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index d905acde..4345b6eb 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ 
b/src/sasctl/pzmm/write_json_files.py @@ -2481,7 +2481,10 @@ def update_model_properties( "can be generated." ) for key in update_dict: - model_files[PROP][key] = update_dict[key] + if not isinstance(update_dict[key], str): + model_files[PROP][key] = str(round(update_dict[key], 14)) + else: + model_files[PROP][key] = update_dict[key] else: if not Path.exists(Path(model_files) / PROP): raise RuntimeError( @@ -2490,8 +2493,10 @@ def update_model_properties( ) with open(Path(model_files) / PROP, 'r+') as properties_json: model_properties = json.load(properties_json) - for key in update_dict: - model_properties[key] = update_dict[key] + if not isinstance(update_dict[key], str): + model_files[PROP][key] = str(round(update_dict[key], 14)) + else: + model_files[PROP][key] = update_dict[key] properties_json.seek(0) properties_json.write(json.dumps(model_properties, indent=4, cls=NpEncoder)) properties_json.truncate() From 3913c309d9a50c77563f2fbf234158d76d21092d Mon Sep 17 00:00:00 2001 From: djm21 Date: Tue, 12 Mar 2024 14:59:16 -0500 Subject: [PATCH 4/9] Added some tests for model cards + fixed some model card errors --- src/sasctl/pzmm/write_json_files.py | 55 +++++---- tests/unit/test_write_json_files.py | 179 ++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+), 28 deletions(-) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index 4345b6eb..70a3ee9b 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -2374,7 +2374,7 @@ def generate_outcome_average( ): """ Generates the outcome average of the training data. For Interval targets, the event average - is generated. For Classification targets, the event average is returned. + is generated. For Classification targets, the event percentage is returned. Parameters ---------- @@ -2395,17 +2395,23 @@ def generate_outcome_average( dict Returns a dictionary with a key value pair that represents the outcome average. 
""" + import numbers output_var = train_data.drop(input_variables, axis=1) if target_type == "classification": value_counts = output_var[output_var.columns[0]].value_counts() return {'eventPercentage': value_counts[target_value]/sum(value_counts)} elif target_type == "interval": - return {'eventAverage': sum(value_counts[value_counts.columns[0]]) / len(value_counts)} + if not isinstance(output_var[output_var.columns[0]].iloc[0], numbers.Number): + raise ValueError("Detected output column is not numeric. Please ensure that " + + "the correct output column is being passed, and that no extra columns " + + "are in front of the output column. This function assumes that the first " + + "non-input column is the output column.jf") + return {'eventAverage': sum(output_var[output_var.columns[0]]) / len(output_var)} @staticmethod def get_selection_statistic_value( - model_files, - selection_statistic + model_files: Union[str, Path, dict], + selection_statistic: str = "_GINI_" ): """ Finds the value of the chosen selection statistic in dmcas_fitstat.json, which should have been @@ -2493,10 +2499,11 @@ def update_model_properties( ) with open(Path(model_files) / PROP, 'r+') as properties_json: model_properties = json.load(properties_json) - if not isinstance(update_dict[key], str): - model_files[PROP][key] = str(round(update_dict[key], 14)) - else: - model_files[PROP][key] = update_dict[key] + for key in update_dict: + if not isinstance(update_dict[key], str): + model_properties[key] = str(round(update_dict[key], 14)) + else: + model_properties[key] = update_dict[key] properties_json.seek(0) properties_json.write(json.dumps(model_properties, indent=4, cls=NpEncoder)) properties_json.truncate() @@ -2537,14 +2544,6 @@ def generate_variable_importance( caslib: str, optional The caslib the training data will be stored on. 
The default value is "Public" """ - try: - sess = current_session() - conn = sess.as_swat() - except ImportError: - raise RuntimeError( - "The `swat` package is required to generate fit statistics, ROC, and " - "Lift charts with the calculate_model_statistics function." - ) # Remove target variable from training data by selecting only input variable columns x_train_data = train_data[interval_vars + class_vars] # Upload scored training data to run variable importance on @@ -2573,12 +2572,12 @@ def generate_variable_importance( "name": 'BIN', "inputs": [{"name": var} for var in interval_vars], "targets": [{"name": "Prediction"}], - "discretize":{ - "method":method, - "arguments":{ - "minNBins":1, - "maxNBins":8, - "treeCrit":treeCrit, + "discretize": { + "method": method, + "arguments": { + "minNBins": 1, + "maxNBins": 8, + "treeCrit": treeCrit, "contingencyTblOpts":{"inputsMethod": 'BUCKET', "inputsNLevels": 100}, "overrides": {"minNObsInBin": 5, "binMissing": True, "noDataLowerUpperBound": True} } @@ -2589,12 +2588,12 @@ def generate_variable_importance( "name": 'BIN_NOM', "inputs": [{"name": var} for var in class_vars], "targets": [{"name": "Prediction"}], - "catTrans":{ - "method":method, - "arguments":{ - "minNBins":1, - "maxNBins":8, - "treeCrit":treeCrit, + "catTrans": { + "method": method, + "arguments": { + "minNBins": 1, + "maxNBins": 8, + "treeCrit": treeCrit, "overrides": {"minNObsInBin": 5, "binMissing": True} } } diff --git a/tests/unit/test_write_json_files.py b/tests/unit/test_write_json_files.py index 412759f2..19cbd4f9 100644 --- a/tests/unit/test_write_json_files.py +++ b/tests/unit/test_write_json_files.py @@ -16,11 +16,14 @@ import warnings from pathlib import Path from unittest.mock import patch +import math import numpy as np import pandas as pd import pytest from sklearn.model_selection import train_test_split +from sklearn import datasets +from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier 
import sasctl.pzmm as pzmm @@ -43,6 +46,37 @@ {"name": "REASON_HomeImp", "type": "integer"}, ] +class BadModel: + attr = None + +@pytest.fixture +def bad_model(): + return BadModel() + + +@pytest.fixture +def train_data(): + """Returns the Iris data set as (X, y)""" + raw = datasets.load_iris() + iris = pd.DataFrame(raw.data, columns=raw.feature_names) + iris = iris.join(pd.DataFrame(raw.target)) + iris.columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Species"] + iris["Species"] = iris["Species"].astype("category") + iris.Species.cat.categories = raw.target_names + return iris.iloc[:, 0:4], iris["Species"] + + +@pytest.fixture +def sklearn_model(train_data): + """Returns a simple Scikit-Learn model""" + X, y = train_data + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + model = LogisticRegression( + multi_class="multinomial", solver="lbfgs", max_iter=1000 + ) + model.fit(X, y) + return model @pytest.fixture def change_dir(): @@ -849,3 +883,148 @@ def test_errors(self): jf.assess_model_bias( score_table, sensitive_values, actual_values ) + + +class TestModelCardGeneration(unittest.TestCase): + def test_generate_outcome_average_interval(self): + df = pd.DataFrame({"input": [3, 2, 1], "output": [1, 2, 3]}) + assert ( + jf.generate_outcome_average(df, ["input"], "interval") == + {'eventAverage': 2.0} + ) + + def test_generate_outcome_average_classification(self): + df = pd.DataFrame({"input": [3, 2], "output": [0, 1]}) + event_percentage = jf.generate_outcome_average(df, ["input"], "classification", 1) + assert('eventPercentage' in event_percentage) + + def test_generate_outcome_average_interval_non_numeric_output(self): + df = pd.DataFrame({"input": [3, 2, 1], "output": ["one", "two", "three"]}) + with pytest.raises(ValueError): + jf.generate_outcome_average(df, ["input"], "interval") + + +class TestGetSelectionStatisticValue(unittest.TestCase): + model_file_dict = { + "dmcas_fitstat.json": { + "data": [ + { + "dataMap": { + 
"_GINI_": 1, + "_C_": 2, + "_TAU_": None, + "_DataRole_": "TRAIN" + } + } + ] + } + } + tmp_dir = tempfile.TemporaryDirectory() + with open(Path(tmp_dir.name) / "dmcas_fitstat.json", "w+") as f: + f.write(json.dumps(model_file_dict['dmcas_fitstat.json'])) + + def test_get_statistic_dict_default(self): + selection_statistic = jf.get_selection_statistic_value(self.model_file_dict) + assert(selection_statistic == 1) + + def test_get_statistic_dict_custom(self): + selection_statistic = jf.get_selection_statistic_value(self.model_file_dict, "_C_") + assert(selection_statistic == 2) + + def test_get_blank_statistic_dict(self): + with pytest.raises(RuntimeError): + jf.get_selection_statistic_value(self.model_file_dict, "_TAU_") + + def test_get_statistics_path_default(self): + selection_statistic = jf.get_selection_statistic_value(Path(self.tmp_dir.name)) + assert(selection_statistic == 1) + + def test_get_statistics_path_custom(self): + selection_statistic = jf.get_selection_statistic_value(Path(self.tmp_dir.name), "_C_") + assert(selection_statistic == 2) + + def test_get_blank_statistic_path(self): + with pytest.raises(RuntimeError): + jf.get_selection_statistic_value(Path(self.tmp_dir.name), "_TAU_") + + def test_get_statistics_str_default(self): + selection_statistic = jf.get_selection_statistic_value(self.tmp_dir.name) + assert (selection_statistic == 1) + + def test_get_statistics_str_custom(self): + selection_statistic = jf.get_selection_statistic_value(self.tmp_dir.name, "_C_") + assert (selection_statistic == 2) + + def test_get_blank_statistic_str(self): + with pytest.raises(RuntimeError): + jf.get_selection_statistic_value(self.tmp_dir.name, "_TAU_") + + +class TestUpdateModelProperties(unittest.TestCase): + def setUp(self): + self.model_file_dict = { + "ModelProperties.json": + { + "example": "property" + } + } + self.tmp_dir = tempfile.TemporaryDirectory() + with open(Path(self.tmp_dir.name) / "ModelProperties.json", "w+") as f: + 
f.write(json.dumps(self.model_file_dict['ModelProperties.json'])) + + def tearDown(self): + self.tmp_dir.cleanup() + + def test_update_model_properties_dict(self): + update_dict = {'new': 'arg', 'newer': 'thing'} + jf.update_model_properties(self.model_file_dict, update_dict) + assert(self.model_file_dict['ModelProperties.json']['example'] == 'property') + assert(self.model_file_dict['ModelProperties.json']['new'] == 'arg') + assert(self.model_file_dict['ModelProperties.json']['newer'] == 'thing') + + def test_update_model_properties_dict_overwrite(self): + update_dict = {'new': 'arg', 'example': 'thing'} + jf.update_model_properties(self.model_file_dict, update_dict) + assert (self.model_file_dict['ModelProperties.json']['example'] == 'thing') + assert (self.model_file_dict['ModelProperties.json']['new'] == 'arg') + + def test_update_model_properties_dict_number(self): + update_dict = {"number": 1} + jf.update_model_properties(self.model_file_dict, update_dict) + assert (self.model_file_dict['ModelProperties.json']['number'] == '1') + + def test_update_model_properties_dict_round_number(self): + update_dict = {'number': 0.123456789012345} + jf.update_model_properties(self.model_file_dict, update_dict) + assert (self.model_file_dict['ModelProperties.json']['number'] == '0.12345678901234') + + def test_update_model_properties_str(self): + update_dict = {'new': 'arg', 'newer': 'thing'} + jf.update_model_properties(self.tmp_dir.name, update_dict) + with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f: + model_properties = json.load(f) + assert(model_properties['example'] == 'property') + assert(model_properties['new'] == 'arg') + assert(model_properties['newer'] == 'thing') + + def test_update_model_properties_str_overwrite(self): + update_dict = {'new': 'arg', 'example': 'thing'} + jf.update_model_properties(self.tmp_dir.name, update_dict) + with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f: + model_properties = json.load(f) + 
assert (model_properties['example'] == 'thing') + assert (model_properties['new'] == 'arg') + + def test_update_model_properties_str_number(self): + update_dict = {"number": 1} + jf.update_model_properties(self.tmp_dir.name, update_dict) + with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f: + model_properties = json.load(f) + assert (model_properties['number'] == '1') + + def test_update_model_properties_str_round_number(self): + update_dict = {'number': 0.123456789012345} + jf.update_model_properties(self.tmp_dir.name, update_dict) + with open(Path(self.tmp_dir.name) / 'ModelProperties.json', 'r') as f: + model_properties = json.load(f) + assert (model_properties['number'] == '0.12345678901234') \ No newline at end of file From fc85adce542add5fd44d41c0da61f43bd7eb054a Mon Sep 17 00:00:00 2001 From: djm21 Date: Mon, 18 Mar 2024 11:55:37 -0500 Subject: [PATCH 5/9] changed "interval" to "prediction" + changed calculate_model_statistics to allow for prediction models --- src/sasctl/pzmm/write_json_files.py | 123 +++++++++++++++++----------- 1 file changed, 75 insertions(+), 48 deletions(-) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index 70a3ee9b..1d00bd0e 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -1174,6 +1174,7 @@ def calculate_model_statistics( train_data: Union[DataFrame, List[list], Type["numpy.array"]] = None, test_data: Union[DataFrame, List[list], Type["numpy.array"]] = None, json_path: Union[str, Path, None] = None, + target_type: str = "classification" ) -> Union[dict, None]: """ Calculates fit statistics (including ROC and Lift curves) from datasets and then @@ -1214,6 +1215,9 @@ def calculate_model_statistics( Dataset pertaining to the test data. The default value is None. json_path : str or Path, optional Location for the output JSON files. The default value is None. + target_type: str, optional + Type of target the model is trying to find. 
Currently supports "classification" + and "prediction" types. The default value is "classification". Returns ------- @@ -1260,18 +1264,26 @@ def calculate_model_statistics( data, casout={"name": "assess_dataset", "replace": True, "caslib": "Public"}, ) - - conn.percentile.assess( - table={"name": "assess_dataset", "caslib": "Public"}, - response="predict", - pVar="predict_proba", - event=str(target_value), - pEvent=str(prob_value) if prob_value else str(0.5), - inputs="actual", - fitStatOut={"name": "FitStat", "replace": True, "caslib": "Public"}, - rocOut={"name": "ROC", "replace": True, "caslib": "Public"}, - casout={"name": "Lift", "replace": True, "caslib": "Public"}, - ) + if target_type == 'classification': + conn.percentile.assess( + table={"name": "assess_dataset", "caslib": "Public"}, + response="predict", + pVar="predict_proba", + event=str(target_value), + pEvent=str(prob_value) if prob_value else str(0.5), + inputs="actual", + fitStatOut={"name": "FitStat", "replace": True, "caslib": "Public"}, + rocOut={"name": "ROC", "replace": True, "caslib": "Public"}, + casout={"name": "Lift", "replace": True, "caslib": "Public"}, + ) + else: + conn.percentile.assess( + table={"name": "assess_dataset", "caslib": "Public"}, + response="predict", + inputs="actual", + fitStatOut={"name": "FitStat", "replace": True, "caslib": "Public"}, + casout={"name": "Lift", "replace": True, "caslib": "Public"} + ) fitstat_dict = ( pd.DataFrame(conn.CASTable("FitStat", caslib="Public").to_frame()) @@ -1280,11 +1292,11 @@ def calculate_model_statistics( .to_dict() ) json_dict[0]["data"][i]["dataMap"].update(fitstat_dict) - - roc_df = pd.DataFrame(conn.CASTable("ROC", caslib="Public").to_frame()) - roc_dict = cls.apply_dataframe_to_json(json_dict[1]["data"], i, roc_df) - for j in range(len(roc_dict)): - json_dict[1]["data"][j].update(roc_dict[j]) + if target_type == 'classification': + roc_df = pd.DataFrame(conn.CASTable("ROC", caslib="Public").to_frame()) + roc_dict = 
cls.apply_dataframe_to_json(json_dict[1]["data"], i, roc_df) + for j in range(len(roc_dict)): + json_dict[1]["data"][j].update(roc_dict[j]) lift_df = pd.DataFrame(conn.CASTable("Lift", caslib="Public").to_frame()) lift_dict = cls.apply_dataframe_to_json(json_dict[2]["data"], i, lift_df, 1) @@ -1293,19 +1305,26 @@ if json_path: for i, name in enumerate([FITSTAT, ROC, LIFT]): - with open(Path(json_path) / name, "w") as json_file: - json_file.write(json.dumps(json_dict[i], indent=4, cls=NpEncoder)) - if cls.notebook_output: - print( - f"{name} was successfully written and saved to " - f"{Path(json_path) / name}" - ) + if not (name == ROC and target_type == "prediction"): + with open(Path(json_path) / name, "w") as json_file: + json_file.write(json.dumps(json_dict[i], indent=4, cls=NpEncoder)) + if cls.notebook_output: + print( + f"{name} was successfully written and saved to " + f"{Path(json_path) / name}" + ) else: - return { - FITSTAT: json.dumps(json_dict[0], indent=4, cls=NpEncoder), - ROC: json.dumps(json_dict[1], indent=4, cls=NpEncoder), - LIFT: json.dumps(json_dict[2], indent=4, cls=NpEncoder), - } + if target_type == 'classification': + return { + FITSTAT: json.dumps(json_dict[0], indent=4, cls=NpEncoder), + ROC: json.dumps(json_dict[1], indent=4, cls=NpEncoder), + LIFT: json.dumps(json_dict[2], indent=4, cls=NpEncoder), + } + else: + return { + FITSTAT: json.dumps(json_dict[0], indent=4, cls=NpEncoder), + LIFT: json.dumps(json_dict[2], indent=4, cls=NpEncoder), + } @staticmethod def check_for_data( @@ -2208,11 +2227,11 @@ def generate_model_card( algorithm: str, train_data: pd.DataFrame, train_predictions: Union[pd.Series, list], - target_type: str = "interval", + target_type: str = "classification", target_value: Union[str, int, float, None] = None, interval_vars: Optional[list] = [], class_vars: Optional[list] = [], - selection_statistic: str = "_GINI_", + selection_statistic: Optional[str] = None, server: str = "cas-shared-default", 
caslib: str = "Public", ): @@ -2237,19 +2256,22 @@ def generate_model_card( train_predictions : pandas.Series, list List of predictions made by the model on the training data. target_type : string - Type the model is targeting. Currently supports "classification" and "interval" types. - The default value is "Interval". + Type of target the model is trying to find. Currently supports "classification" and "prediction" types. + The default value is "classification". target_value : string, int, float, optional Value the model is targeting for classification models. This argument is not needed for - Interval models. The default value is None. + prediction models. The default value is None. interval_vars : list, optional A list of interval variables. The default value is an empty list. class_vars : list, optional A list of classification variables. The default value is an empty list. selection_statistic: str, optional - The selection statistic chosen to score the model against other models. Can be any of the - following values: "_RASE_", "_NObs_", "_GINI_", "_GAMMA_", "_MCE_", "_ASE_", "_MCLL_", - "_KS_", "_KSPostCutoff_", "_DIV_", "_TAU_", "_KSCut_", or "_C_". The default value is "_GINI_". + The selection statistic chosen to score the model against other models. Classification + models can take any of the following values: "_RASE_", "_GINI_", "_GAMMA_", "_MCE_", + "_ASE_", "_MCLL_", "_KS_", "_KSPostCutoff_", "_DIV_", "_TAU_", "_KSCut_", or "_C_". + Prediction models can take any of the following values: "_ASE_", "_DIV_", "_RASE_", "_MAE_", + "_RMAE_", "_MSLE_", "_RMSLE_" The default value is "_KS_" for classification models and + "_ASE_" for prediction models. server: str, optional The CAS server the training data will be stored on. The default value is "cas-shared-default" caslib: str, optional @@ -2260,10 +2282,15 @@ def generate_model_card( "For the model card data to be properly generated on a classification " "model, a target value is required." 
) - if target_type not in ["classification", "interval"]: + if target_type not in ["classification", "prediction"]: raise RuntimeError( - "Only classification and interval target types are currently accepted." + "Only classification and prediction target types are currently accepted." ) + if selection_statistic is None: + if target_type == 'classification': + selection_statistic = '_KS_' + elif target_type == 'prediction': + selection_statistic = "_ASE_" if selection_statistic not in cls.valid_params: raise RuntimeError( "The selection statistic must be a value generated in dmcas_fitstat.json. See " @@ -2292,7 +2319,7 @@ def generate_model_card( ) # Generates the event percentage for Classification targets, and the event average - # for Interval targets + # for prediction targets update_dict = cls.generate_outcome_average( train_data=train_data, input_variables=interval_vars + class_vars, @@ -2373,7 +2400,7 @@ def generate_outcome_average( target_value: Union[str, int, float] = None ): """ - Generates the outcome average of the training data. For Interval targets, the event average + Generates the outcome average of the training data. For prediction targets, the event average is generated. For Classification targets, the event percentage is returned. Parameters @@ -2385,10 +2412,10 @@ input_variables: list A list of all input variables used by the model. Used to isolate the output variable. target_type : string - Type the model is targeting. Currently supports "Classification" and "Interval" types. + Type the model is targeting. Currently supports "classification" and "prediction" types. target_value : string, int, float, optional Value the model is targeting for Classification models. This argument is not needed for - Interval models. The default value is None. + prediction models. The default value is None. 
Returns ------- @@ -2400,7 +2427,7 @@ def generate_outcome_average( if target_type == "classification": value_counts = output_var[output_var.columns[0]].value_counts() return {'eventPercentage': value_counts[target_value]/sum(value_counts)} - elif target_type == "interval": + elif target_type == "prediction": if not isinstance(output_var[output_var.columns[0]].iloc[0], numbers.Number): raise ValueError("Detected output column is not numeric. Please ensure that " + "the correct output column is being passed, and that no extra columns " + @@ -2515,7 +2542,7 @@ def generate_variable_importance( model_files: Union[str, Path, dict], train_data: pd.DataFrame, train_predictions: Union[pd.Series, list], - target_type: str = "interval", + target_type: str = "classification", interval_vars: Optional[list] = [], class_vars: Optional[list] = [], caslib: str = "Public", @@ -2535,8 +2562,8 @@ def generate_variable_importance( train_predictions : pandas.Series, list List of predictions made by the model on the training data. target_type : string, optional - Type the model is targeting. Currently supports "Classification" and "Interval" types. - The default value is "Interval". + Type the model is targeting. Currently supports "classification" and "prediction" types. + The default value is "classification". interval_vars : list, optional A list of interval variables. The default value is an empty list. class_vars : list, optional @@ -2564,7 +2591,7 @@ def generate_variable_importance( treeCrit = 'RSS' else: raise RuntimeError( - "The selected model type is unsupported. Currently, only models that have interval or classification target types are supported." + "The selected model type is unsupported. Currently, only models that have prediction or classification target types are supported." 
) request_packages = list() if interval_vars: From 7b14935a7545e0a84aca143c8222d46ef50ead94 Mon Sep 17 00:00:00 2001 From: djm21 Date: Tue, 26 Mar 2024 01:06:20 -0400 Subject: [PATCH 6/9] Update pzmm_binary_classification_model_import notebook to include model card generation --- ...m_binary_classification_model_import.ipynb | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/pzmm_binary_classification_model_import.ipynb b/examples/pzmm_binary_classification_model_import.ipynb index e540b703..458b5014 100644 --- a/examples/pzmm_binary_classification_model_import.ipynb +++ b/examples/pzmm_binary_classification_model_import.ipynb @@ -740,7 +740,7 @@ ], "source": [ "import getpass\n", - "def write_model_stats(x_train, y_train, test_predict, test_proba, y_test, model, path):\n", + "def write_model_stats(x_train, y_train, test_predict, test_proba, y_test, model, path, prefix):\n", " # Calculate train predictions\n", " train_predict = model.predict(x_train)\n", " train_proba = model.predict_proba(x_train)\n", @@ -757,6 +757,20 @@ " test_data=test_data, \n", " json_path=path\n", " )\n", + "\n", + " full_training_data = pd.concat([y_train.reset_index(drop=True), x_train.reset_index(drop=True)], axis=1)\n", + "\n", + " pzmm.JSONFiles.generate_model_card(\n", + " model_prefix=prefix,\n", + " model_files = path,\n", + " algorithm = str(type(model).__name__),\n", + " train_data = full_training_data,\n", + " train_predictions=train_predict,\n", + " target_type='classification',\n", + " target_value=1,\n", + " interval_vars=predictor_columns,\n", + " selection_statistic='_RASE_',\n", + " )\n", " \n", "username = getpass.getpass()\n", "password = getpass.getpass()\n", @@ -766,8 +780,8 @@ "\n", "test_predict = [y_dtc_predict, y_rfc_predict, y_gbc_predict]\n", "test_proba = [y_dtc_proba, y_rfc_proba, y_gbc_proba]\n", - "for (mod, pred, proba, path) in zip(model, test_predict, test_proba, zip_folder):\n", - " write_model_stats(x_train, 
y_train, pred, proba, y_test, mod, path)" + "for (mod, pred, proba, path, prefix) in zip(model, test_predict, test_proba, zip_folder, model_prefix):\n", + " write_model_stats(x_train, y_train, pred, proba, y_test, mod, path, prefix)" ] }, { From e7c37918169909dc2b6819534462730fa6a1613f Mon Sep 17 00:00:00 2001 From: djm21 Date: Mon, 8 Apr 2024 16:34:25 -0500 Subject: [PATCH 7/9] added dmcas_misc file generation capability --- src/sasctl/pzmm/write_json_files.py | 82 ++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index 1d00bd0e..04da0260 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -58,6 +58,7 @@ class NpEncoder(json.JSONEncoder): MAXDIFFERENCES = "maxDifferences.json" GROUPMETRICS = "groupMetrics.json" VARIMPORTANCES = 'dmcas_relativeimportance.json' +MISC = 'dmcas_misc.json' def _flatten(nested_list: Iterable) -> Generator[Any, None, None]: @@ -1174,7 +1175,8 @@ def calculate_model_statistics( train_data: Union[DataFrame, List[list], Type["numpy.array"]] = None, test_data: Union[DataFrame, List[list], Type["numpy.array"]] = None, json_path: Union[str, Path, None] = None, - target_type: str = "classification" + target_type: str = "classification", + cutoff: Optional[float] = None ) -> Union[dict, None]: """ Calculates fit statistics (including ROC and Lift curves) from datasets and then @@ -2345,6 +2347,12 @@ def generate_model_card( class_vars, caslib ) + + # Generates dmcas_misc.json file + cls.generate_misc( + conn, + model_files + ) @staticmethod def upload_training_data( @@ -2675,4 +2683,76 @@ def generate_variable_importance( print( f"{VARIMPORTANCES} was successfully written and saved to " f"{Path(model_files) / VARIMPORTANCES}" + ) + + @classmethod + def generate_misc( + cls, + conn, + model_files: Union[str, Path, dict] + ): + """ + Generates the dmcas_misc.json file, which is used to 
record correct and incorrect classification counts at a cutoff of 0.5 + + Parameters + ---------- + conn + A SWAT connection used to connect to the user's CAS server + model_files : string, Path, or dict + Either the directory location of the model files (string or Path object), or + a dictionary containing the contents of all the model files. + """ + if isinstance(model_files, dict): + if ROC not in model_files: + raise RuntimeError( + "The dmcas_roc.json file must be generated before the model card data " + "can be generated." + ) + roc_table = model_files[ROC] + else: + if not Path.exists(Path(model_files) / ROC): + raise RuntimeError( + "The dmcas_roc.json file must be generated before the model card data " + "can be generated." + ) + with open(Path(model_files) / ROC, 'r') as roc_file: + roc_table = json.load(roc_file) + correct_text = ["CORRECT", "INCORRECT", "CORRECT", "INCORRECT"] + outcome_values = ['1', '0', '0', '1'] + misc_data = list() + # Iterates through ROC table to get TRAIN, TEST, and VALIDATE data with a cutoff of .5 + for i in range(50, 300, 100): + roc_data = roc_table['data'][i]['dataMap'] + correctness_values = [roc_data['_TP_'], roc_data['_FP_'], roc_data['_TN_'], roc_data['_FN_']] + for (c_text, c_val, o_val) in zip(correct_text, correctness_values, outcome_values): + misc_data.append({ + "CorrectText": c_text, + "Outcome": o_val, + "_Count_": c_val, + "_DataRole_": roc_data['_DataRole_'], + "_cutoffSource_": "Default", + "_cutoff_": "0.5" + }) + + json_template_path = ( + Path(__file__).resolve().parent / f"template_files/{MISC}" + ) + with open(json_template_path, 'r') as f: + misc_json = json.load(f) + misc_json['data'] = misc_data + + if isinstance(model_files, dict): + model_files[MISC] = json.dumps(misc_json, indent=4, cls=NpEncoder) + if cls.notebook_output: + print( + f"{MISC} was successfully written and saved to " + f"model files dictionary." 
+ ) + else: + with open(Path(model_files) / MISC, 'w') as json_file: + json_file.write(json.dumps(misc_json, indent=4, cls=NpEncoder)) + if cls.notebook_output: + print( + f"{MISC} was successfully written and saved to " + f"{Path(model_files) / MISC}" ) \ No newline at end of file From 4b7ebca1ff87babd624a0dfbad8638153d1d0936 Mon Sep 17 00:00:00 2001 From: djm21 Date: Tue, 9 Apr 2024 14:32:04 -0500 Subject: [PATCH 8/9] fixed formatting for misc json --- src/sasctl/pzmm/write_json_files.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index 04da0260..15f9700a 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ b/src/sasctl/pzmm/write_json_files.py @@ -2724,14 +2724,18 @@ def generate_misc( roc_data = roc_table['data'][i]['dataMap'] correctness_values = [roc_data['_TP_'], roc_data['_FP_'], roc_data['_TN_'], roc_data['_FN_']] for (c_text, c_val, o_val) in zip(correct_text, correctness_values, outcome_values): - misc_data.append({ - "CorrectText": c_text, - "Outcome": o_val, - "_Count_": c_val, - "_DataRole_": roc_data['_DataRole_'], - "_cutoffSource_": "Default", - "_cutoff_": "0.5" - }) + misc_data.append( + { + "dataMap": { + "CorrectText": c_text, + "Outcome": o_val, + "_Count_": c_val, + "_DataRole_": roc_data['_DataRole_'], + "_cutoffSource_": "Default", + "_cutoff_": "0.5" + }, + "rowNumber": len(misc_data) + 1 + }) json_template_path = ( Path(__file__).resolve().parent / f"template_files/{MISC}" From 48d2ab5df54d2e94c2e7de62df51b0ece823baa8 Mon Sep 17 00:00:00 2001 From: djm21 Date: Tue, 9 Apr 2024 14:47:22 -0500 Subject: [PATCH 9/9] Added changes to fitstat for classification models --- src/sasctl/pzmm/write_json_files.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py index 15f9700a..fdcd7334 100644 --- a/src/sasctl/pzmm/write_json_files.py +++ 
b/src/sasctl/pzmm/write_json_files.py @@ -1299,6 +1299,16 @@ def calculate_model_statistics( roc_dict = cls.apply_dataframe_to_json(json_dict[1]["data"], i, roc_df) for j in range(len(roc_dict)): json_dict[1]["data"][j].update(roc_dict[j]) + if(roc_dict[j]["dataMap"]["_KS_"] == 1): + fitstat_data = { + "_KS_": roc_dict[j]["dataMap"]["_KS_"], + "_KS2_": roc_dict[j]["dataMap"]["_KS2_"], + "_C_": roc_dict[j]["dataMap"]["_C_"], + "_Gini_": roc_dict[j]["dataMap"]["_Gini_"], + "_Gamma_": roc_dict[j]["dataMap"]["_Gamma_"], + "_Tau_": roc_dict[j]["dataMap"]["_Tau_"] + } + json_dict[0]["data"][i]["dataMap"].update(fitstat_data) lift_df = pd.DataFrame(conn.CASTable("Lift", caslib="Public").to_frame()) lift_dict = cls.apply_dataframe_to_json(json_dict[2]["data"], i, lift_df, 1)