diff --git a/raiutils/raiutils/data_processing/__init__.py b/raiutils/raiutils/data_processing/__init__.py
new file mode 100644
index 0000000000..a1e1a58ba4
--- /dev/null
+++ b/raiutils/raiutils/data_processing/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+
+"""Module for defining common utilities related to data processing."""
+from .data_processing_utils import (convert_to_list,
+                                    convert_to_string_list_dict,
+                                    serialize_json_safe)
+
+__all__ = ['convert_to_list',
+           'convert_to_string_list_dict',
+           'serialize_json_safe']
diff --git a/raiutils/raiutils/data_processing/data_processing_utils.py b/raiutils/raiutils/data_processing/data_processing_utils.py
new file mode 100644
index 0000000000..3bbbf7b96b
--- /dev/null
+++ b/raiutils/raiutils/data_processing/data_processing_utils.py
@@ -0,0 +1,181 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+
+import datetime
+import json
+from typing import Any, Dict, List
+
+import numpy as np
+import pandas as pd
+from scipy.sparse import issparse
+from sklearn.utils import check_consistent_length
+
+_DF_COLUMN_BAD_NAME = "DataFrame column names must be strings."\
+    " Name '{0}' is of type {1}"
+_LIST_NONSCALAR = "Lists must be of scalar types"
+_TOO_MANY_DIMS = "Array must have at most two dimensions"
+
+
+def convert_to_list(array, custom_err_msg=None):
+    """Convert an array to a list.
+
+    Inputs of any other type are returned unchanged.
+
+    :param array: An array like python object.
+    :type array: pd.DataFrame or pd.Series or np.ndarray or
+        pd.Index or scipy sparse array
+    :param custom_err_msg: A custom error message to use.
+    :type custom_err_msg: str
+    :return: Python List.
+    :rtype: list
+    :raises ValueError: If a sparse array has more than 1000 columns.
+    """
+    if issparse(array):
+        if array.shape[1] > 1000:
+            if custom_err_msg is None:
+                raise ValueError("Exceeds maximum number of features for "
+                                 "visualization (1000)")
+            else:
+                raise ValueError(custom_err_msg)
+        return array.toarray().tolist()
+    if isinstance(array, (pd.DataFrame, pd.Series)):
+        return array.values.tolist()
+    if isinstance(array, (np.ndarray, pd.Index)):
+        return array.tolist()
+    return array
+
+
+def convert_to_string_list_dict(
+        base_name_format: str,
+        ys,
+        sample_array) -> Dict[str, List]:
+    """Convert the given input to a string-list dictionary.
+
+    This function is used to convert arrays in a variety of types
+    into a dictionary mapping column names to regular Python lists
+    (in preparation for JSON serialization). It is a modification
+    of the feature processing code in :class:`fairlearn.metrics.MetricFrame`.
+
+    The array to be converted is passed in :code:`ys`, and a variety
+    of types are supported. The :code:`sample_array` argument is
+    used in a call to :func:`sklearn.utils.check_consistent_length`
+    to ensure that the resultant lists are of the right length.
+    Finally `base_name_format` is used to generate sequential
+    keys for the dictionary if none are in the supplied :code:`ys`.
+    It must be of the form :code:`'Base String {0}'`, with the
+    :code:`{0}` being replaced by a sequential integer.
+
+    It is not possible to list out all the possible underlying types
+    for :code:`ys`. A brief summary:
+        - :class:`pd.Series`
+        - :class:`pd.DataFrame`
+        - A simple Python list
+        - A Python dictionary with string keys and values which are
+          convertible to lists
+        - Anything convertible to a :class:`np.ndarray`
+
+    Note that the values of a dictionary input are converted as-is,
+    without the length check against :code:`sample_array`.
+
+    :param base_name_format: A custom name format to use.
+    :type base_name_format: str
+    :param ys: An array like python object.
+    :type ys: pd.DataFrame or pd.Series or list or dictionary
+    :param sample_array: An array like python object.
+    :type sample_array: pd.DataFrame or pd.Series or list or dictionary
+    :return: A dictionary of string and lists.
+    :rtype: Dict[str, List]
+    :raises ValueError: If a DataFrame column name is not a string, a
+        list holds non-scalar items, an array has more than two
+        dimensions, or a length check against `sample_array` fails.
+    """
+    result = {}
+
+    if isinstance(ys, pd.Series):
+        check_consistent_length(ys, sample_array)
+        if ys.name is not None:
+            result[ys.name] = convert_to_list(ys)
+        else:
+            result[base_name_format.format(0)] = convert_to_list(ys)
+    elif isinstance(ys, pd.DataFrame):
+        for i, col_name in enumerate(ys.columns):
+            if not isinstance(col_name, str):
+                msg = _DF_COLUMN_BAD_NAME.format(col_name, type(col_name))
+                raise ValueError(msg)
+            column = ys.iloc[:, i]
+            check_consistent_length(column, sample_array)
+            result[col_name] = convert_to_list(column)
+    elif isinstance(ys, list):
+        # An empty list has no first element to inspect; treat it as an
+        # empty list of scalars instead of failing with an IndexError.
+        if not ys or np.isscalar(ys[0]):
+            f_arr = np.atleast_1d(np.squeeze(np.asarray(ys)))
+            assert len(f_arr.shape) == 1  # Sanity check
+            check_consistent_length(f_arr, sample_array)
+            result[base_name_format.format(0)] = convert_to_list(f_arr)
+        else:
+            raise ValueError(_LIST_NONSCALAR)
+    elif isinstance(ys, dict):
+        for k, v in ys.items():
+            result[k] = convert_to_list(v)
+    else:
+        # Assume it's something which can go into np.asarray
+        f_arr = np.squeeze(np.asarray(ys, dtype=object))
+        if len(f_arr.shape) == 1:
+            check_consistent_length(f_arr, sample_array)
+            result[base_name_format.format(0)] = convert_to_list(f_arr)
+        elif len(f_arr.shape) == 2:
+            # Work similarly to pd.DataFrame(data=ndarray)
+            for i in range(f_arr.shape[1]):
+                col = f_arr[:, i]
+                check_consistent_length(col, sample_array)
+                result[base_name_format.format(i)] = convert_to_list(col)
+        else:
+            raise ValueError(_TOO_MANY_DIMS)
+
+    return result
+
+
+def serialize_json_safe(o: Any):
+    """
+    Convert a value into something that is safe to parse as JSON.
+
+    Non-finite floats (NaN and infinity), including numpy scalars,
+    are replaced by 0 and dates are converted to strings. Values of
+    unrecognized types are returned unchanged.
+
+    :param o: Object to make JSON safe.
+    :type o: Any
+    :return: Serialized object.
+    :rtype: Any
+    """
+    if type(o) in {bool, int, float, str, type(None)}:
+        if isinstance(o, float):
+            if np.isinf(o) or np.isnan(o):
+                return 0
+        # need to escape double quoted string values
+        # and other special characters for json
+        if isinstance(o, str):
+            return json.dumps(o)[1:-1]
+        return o
+    elif isinstance(o, datetime.date):
+        # datetime.datetime and pd.Timestamp are subclasses of datetime.date
+        return str(o)
+    elif isinstance(o, dict):
+        return {k: serialize_json_safe(v) for k, v in o.items()}
+    elif isinstance(o, list):
+        return [serialize_json_safe(v) for v in o]
+    elif isinstance(o, tuple):
+        return tuple(serialize_json_safe(v) for v in o)
+    elif isinstance(o, np.ndarray):
+        return serialize_json_safe(o.tolist())
+    elif hasattr(o, 'item'):
+        value = o.item()  # numpy types
+        # np.float64('nan') must be sanitized like a builtin float
+        if isinstance(value, float) and not np.isfinite(value):
+            return 0
+        return value
+    elif hasattr(o, '__dict__'):
+        return serialize_json_safe(o.__dict__)  # objects
+    else:
+        return o
diff --git a/raiutils/raiutils/models/__init__.py b/raiutils/raiutils/models/__init__.py
new file mode 100644
index 0000000000..f9cae2bbe1
--- /dev/null
+++ b/raiutils/raiutils/models/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+
+"""Module for defining common utilities related to models."""
+from .model_utils import SKLearn, is_classifier
+
+__all__ = ['is_classifier', 'SKLearn']
diff --git a/raiutils/raiutils/models/model_utils.py b/raiutils/raiutils/models/model_utils.py
new file mode 100644
index 0000000000..eb499a4872
--- /dev/null
+++ b/raiutils/raiutils/models/model_utils.py
@@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+
+
+class SKLearn(object):
+    """Provide scikit-learn related constants."""
+
+    EXAMPLES = 'examples'
+    LABELS = 'labels'
+    PREDICT = 'predict'
+    PREDICTIONS = 'predictions'
+    PREDICT_PROBA = 'predict_proba'
+
+
+def is_classifier(model):
+    """Check if the model is a classifier.
+
+    A model counts as a classifier when it exposes a ``predict_proba``
+    attribute that is not None.
+
+    :param model: The model to inspect; may be None.
+    :type model: object
+    :return: True if the model is a classifier, False otherwise.
+    :rtype: bool
+    """
+    return getattr(model, SKLearn.PREDICT_PROBA, None) is not None
diff --git a/raiutils/requirements.txt b/raiutils/requirements.txt
index fd7d3e06f0..d993101f8d 100644
--- a/raiutils/requirements.txt
+++ b/raiutils/requirements.txt
@@ -1 +1,5 @@
-requests==2.25.1
\ No newline at end of file
+numpy
+pandas
+requests
+scikit-learn
+scipy
diff --git a/raiutils/tests/test_data_processing_utils.py b/raiutils/tests/test_data_processing_utils.py
new file mode 100644
index 0000000000..c70b3a0261
--- /dev/null
+++ b/raiutils/tests/test_data_processing_utils.py
@@ -0,0 +1,284 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+
+import datetime
+import json
+
+import numpy as np
+import pandas as pd
+import pytest
+from scipy.sparse import csr_matrix
+
+from raiutils.data_processing import (convert_to_list,
+                                      convert_to_string_list_dict,
+                                      serialize_json_safe)
+
+
+class TestConvertToStringListDict:
+    def test_unnamed_series(self):
+        input = pd.Series(data=[0, 1, 2])
+        sample_array = [4, 5, 6]
+        result = convert_to_string_list_dict("Base {0}", input, sample_array)
+        assert isinstance(result, dict)
+        assert len(result) == 1
+        assert "Base 0" in result
+        arr = result["Base 0"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [0, 1, 2])
+
+    def test_named_series(self):
+        input = pd.Series(data=[1, 3, 5], name="Something")
+        sample_array = [4, 5, 6]
+        result = convert_to_string_list_dict("Base {0}", input, sample_array)
+        assert isinstance(result, dict)
+        assert len(result) == 1
+        assert "Something" in result
+        arr = result["Something"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [1, 3, 5])
+
+    def test_dataframe(self):
+        input = pd.DataFrame.from_dict({"a": [0, 1, 2], "b": [4, 5, 6]})
+        sample_array = [3, 6, 9]
+        result = convert_to_string_list_dict("Base {0}", input, sample_array)
+        assert isinstance(result, dict)
+        assert len(result) == 2
+        assert "a" in result
+        arr = result["a"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [0, 1, 2])
+        assert "b" in result
+        arr = result["b"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [4, 5, 6])
+
+    def test_simplelist(self):
+        input = [0, 1, 4]
+        sample_array = [2, 3, 4]
+        result = convert_to_string_list_dict("Base {0}", input, sample_array)
+        assert isinstance(result, dict)
+        assert len(result) == 1
+        assert "Base 0" in result
+        arr = result["Base 0"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [0, 1, 4])
+
+    def test_empty_list(self):
+        result = convert_to_string_list_dict("Base {0}", [], [])
+        assert result == {"Base 0": []}
+
+        with pytest.raises(ValueError):
+            convert_to_string_list_dict("Base {0}", [], [1, 2, 3])
+
+    def test_dict(self):
+        input = {"a": np.array([0, 1, 2]), "b": pd.Series(data=[3, 4, 5])}
+        sample_array = [2, 3, 4]
+        result = convert_to_string_list_dict("Base {0}", input, sample_array)
+        assert isinstance(result, dict)
+        assert len(result) == 2
+        assert "a" in result
+        arr = result["a"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [0, 1, 2])
+        assert "b" in result
+        arr = result["b"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [3, 4, 5])
+
+    def test_numpy1d(self):
+        input = np.array([0, 1, 4])
+        sample_array = [2, 3, 4]
+        result = convert_to_string_list_dict("Base {0}", input, sample_array)
+        assert isinstance(result, dict)
+        assert len(result) == 1
+        assert "Base 0" in result
+        arr = result["Base 0"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [0, 1, 4])
+
+    def test_numpy2d(self):
+        # Note transpose on the end
+        input = np.array([[0, 1, 4], [2, 6, 7]]).T
+        sample_array = [2, 3, 4]
+        result = convert_to_string_list_dict("Base {0}", input, sample_array)
+        assert isinstance(result, dict)
+        assert len(result) == 2
+        assert "Base 0" in result
+        arr = result["Base 0"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [0, 1, 4])
+        assert "Base 1" in result
+        arr = result["Base 1"]
+        assert isinstance(arr, list)
+        assert np.array_equal(arr, [2, 6, 7])
+
+
+class TestConvertToList:
+    def test_pandas_dataframe_to_list(self):
+        input_dataframe = pd.DataFrame.from_dict(
+            {"a": [0, 1, 2], "b": [4, 5, 6]}
+        )
+        expected_list = [[0, 4], [1, 5], [2, 6]]
+        input_as_list = convert_to_list(input_dataframe)
+
+        assert input_as_list is not None
+        assert input_as_list == expected_list
+
+    def test_array_to_list(self):
+        input_array = np.array([[0, 4], [1, 5], [2, 6]])
+        expected_list = [[0, 4], [1, 5], [2, 6]]
+        input_as_list = convert_to_list(input_array)
+
+        assert input_as_list is not None
+        assert input_as_list == expected_list
+
+    def test_list_to_list(self):
+        input_list = [[0, 4], [1, 5], [2, 6]]
+        expected_list = [[0, 4], [1, 5], [2, 6]]
+        input_as_list = convert_to_list(input_list)
+
+        assert input_as_list is not None
+        assert input_as_list == expected_list
+
+    def test_series_to_list(self):
+        input_series = pd.Series(data=[[0, 4], [1, 5], [2, 6]])
+        expected_list = [[0, 4], [1, 5], [2, 6]]
+        input_as_list = convert_to_list(input_series)
+
+        assert input_as_list is not None
+        assert input_as_list == expected_list
+
+    def test_index_to_list(self):
+        input_index = pd.Index(data=[[0, 4], [1, 5], [2, 6]])
+        expected_list = [[0, 4], [1, 5], [2, 6]]
+        input_as_list = convert_to_list(input_index)
+
+        assert input_as_list is not None
+        assert input_as_list == expected_list
+
+    def test_csr_matrix_to_list(self):
+        input_sparse_matrix = csr_matrix((3, 10000),
+                                         dtype=np.int8)
+        with pytest.raises(ValueError) as ve:
+            convert_to_list(input_sparse_matrix)
+        assert "Exceeds maximum number of features for " + \
+            "visualization (1000)" in str(ve.value)
+
+        with pytest.raises(ValueError) as ve:
+            convert_to_list(input_sparse_matrix,
+                            custom_err_msg="Error occurred")
+        assert "Error occurred" in str(ve.value)
+
+        row = np.array([0, 0, 1, 2, 2, 2])
+        col = np.array([0, 2, 2, 0, 1, 2])
+        data = np.array([1, 2, 3, 4, 5, 6])
+        sparse_matrix = csr_matrix((data, (row, col)), shape=(3, 3))
+        expected_list = [[1, 0, 2],
+                         [0, 0, 3],
+                         [4, 5, 6]]
+        input_as_list = convert_to_list(sparse_matrix)
+
+        assert input_as_list is not None
+        assert input_as_list == expected_list
+
+
+class TestSerializationUtilities:
+
+    def test_embedded_object(self):
+        class A:
+            def __init__(self):
+                self.a_data = 'a'
+
+        class B:
+            def __init__(self):
+                self.b_data = A()
+
+        result = serialize_json_safe({'B': B()})
+        assert result == {'B': {'b_data': {'a_data': 'a'}}}
+
+    def test_numpy(self):
+        result = serialize_json_safe(np.array([1, 2, 3]))
+        assert result == [1, 2, 3]
+
+    def test_unknown(self):
+        c = complex(1, 2)
+        result = serialize_json_safe([c, 42])
+        assert result == [c, 42]
+
+    def test_strings_with_special_chars(self):
+        special_chars_dict = {"hello": "world\"with\"quotes",
+                              "hi": ["a", "list", "of",
+                                     "special\t\"\r\nblah",
+                                     "chars"]}
+        result = json.dumps(special_chars_dict, default=serialize_json_safe)
+        assert result == ("{\"hello\": \"world\\\"with\\\"quotes\", " +
+                          "\"hi\": [\"a\", \"list\", \"of\", " +
+                          "\"special\\t\\\"\\r\\nblah\", \"chars\"]}")
+        deserialized_special_chars_dict = json.loads(result)
+        assert special_chars_dict == deserialized_special_chars_dict
+
+    def test_serialize_json_safe_basic(self):
+        values = [0, 1, 2, 3, 4, 5]
+        result = serialize_json_safe(values)
+        assert result == [0, 1, 2, 3, 4, 5]
+
+        values = ['a', 'b', 'a', 'c', 'a', 'b']
+        result = serialize_json_safe(values)
+        assert result == ['a', 'b', 'a', 'c', 'a', 'b']
+
+    def test_serialize_json_safe_missing(self):
+        values = [0, np.nan, 2, 3, 4, 5]
+        result = serialize_json_safe(values)
+        assert result == [0, 0, 2, 3, 4, 5]
+
+        values = [0, np.inf, 2, 3, 4, 5]
+        result = serialize_json_safe(values)
+        assert result == [0, 0, 2, 3, 4, 5]
+
+        values = ['a', 'b', 'a', np.nan, 'a', 'b']
+        result = serialize_json_safe(values)
+        assert result == ['a', 'b', 'a', 0, 'a', 'b']
+
+    def test_serialize_json_safe_aggregate_types(self):
+        o = {
+            'a': [1, 2, 3],
+            'c': 'b'
+        }
+        result = serialize_json_safe(o)
+        assert result == o
+
+        o = ('a', [1, 2, 3])
+        result = serialize_json_safe(o)
+        assert result == o
+
+        values = np.array([[1, 2, 3], [4, 5, 6]])
+        result = serialize_json_safe(values)
+        assert result == values.tolist()
+
+    def test_serialize_timestamp(self):
+        datetime_str = "2020-10-10"
+        datetime_object = datetime.datetime.strptime(datetime_str, "%Y-%m-%d")
+        result = serialize_json_safe(datetime_object)
+        assert datetime_str in result
+
+    def test_serialize_via_json_timestamp(self):
+        timestamp_obj = pd.Timestamp(2020, 1, 1)
+        assert isinstance(timestamp_obj, pd.Timestamp)
+        result = json.dumps(serialize_json_safe(timestamp_obj))
+        assert result is not None
+        assert "2020" in result
+
+        timestamp_obj_array = np.array([pd.Timestamp(2020, 1, 1)])
+        result = json.dumps(serialize_json_safe(timestamp_obj_array))
+        assert result is not None
+        assert "2020" in result
+
+    def test_serialize_numpy_scalar_missing(self):
+        assert serialize_json_safe(np.float64(np.nan)) == 0
+        assert serialize_json_safe(np.float32(np.inf)) == 0
+        assert serialize_json_safe(np.float64(1.5)) == 1.5
+        assert serialize_json_safe(np.int64(3)) == 3
+
+    def test_serialize_date(self):
+        result = serialize_json_safe(datetime.date(2020, 10, 10))
+        assert result == "2020-10-10"
diff --git a/raiutils/tests/test_model_utils.py b/raiutils/tests/test_model_utils.py
new file mode 100644
index 0000000000..c192b973d6
--- /dev/null
+++ b/raiutils/tests/test_model_utils.py
@@ -0,0 +1,35 @@
+# Copyright (c) Microsoft Corporation
+# Licensed under the MIT License.
+
+from raiutils.models import is_classifier
+
+
+class Classifier:
+    def predict_proba(self):
+        pass
+
+    def predict(self):
+        pass
+
+
+class Regressor:
+    def predict(self):
+        pass
+
+
+class TestIsClassifier:
+    def test_classifier(self):
+        classifier = Classifier()
+        assert is_classifier(classifier)
+
+    def test_regressor(self):
+        regressor = Regressor()
+        assert not is_classifier(regressor)
+
+    def test_none_model(self):
+        assert not is_classifier(None)
+
+    def test_predict_proba_none(self):
+        model = Classifier()
+        model.predict_proba = None
+        assert not is_classifier(model)