diff --git a/docs/notebooks/Tutorial.ipynb b/docs/notebooks/Tutorial.ipynb
index ade67e7..77888ec 100644
--- a/docs/notebooks/Tutorial.ipynb
+++ b/docs/notebooks/Tutorial.ipynb
@@ -639,7 +639,23 @@
     "cell_type": "markdown",
     "metadata": {},
     "source": [
-     "As we'd hope, the `'Noise'` attribute is shown to be not very useful."
+     "As we'd hope, the `'Noise'` attribute is shown to be not very useful.\n",
+     "\n",
+     "The relative importance of your features (dataset columns) for making accurate predictions is not a perfectly well-defined thing. Accordingly, there are several ways to measure feature importance. The `feature_importances` function aggregates three different measures of feature importance. The underlying models it uses depend on the type of task.\n",
+     "\n",
+     "**Classification tasks** use the following:\n",
+     "\n",
+     "- A logistic regression model (using the absolute values of the coefficients).\n",
+     "- A random forest classifier (based on [mean decrease in impurity](https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html) or Gini importance).\n",
+     "- A K-nearest neighbours classifier (based on [permutation feature importance](https://scikit-learn.org/stable/modules/permutation_importance.html) with an F1 score objective).\n",
+     "\n",
+     "**Regression tasks** are assessed with the following:\n",
+     "\n",
+     "- A linear regression (using the absolute values of the coefficients).\n",
+     "- A random forest regressor, again using Gini importance.\n",
+     "- A K-nearest neighbours regressor, again with permutation importance but with a mean squared error objective.\n",
+     "\n",
+     "The aggregation function sums the normalized scores of the tests and normalizes the result so that it sums to one."
     ]
    },
    {
diff --git a/src/redflag/importance.py b/src/redflag/importance.py
index c237331..8e99667 100644
--- a/src/redflag/importance.py
+++ b/src/redflag/importance.py
@@ -23,7 +23,7 @@
 import numpy as np
 from numpy.typing import ArrayLike
 from sklearn.inspection import permutation_importance
-from sklearn.linear_model import Lasso
+from sklearn.linear_model import LinearRegression
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.neighbors import KNeighborsRegressor
@@ -32,31 +32,33 @@
 from .target import is_continuous
 from .utils import split_and_standardize
+from .utils import aggregate
 
 
 def feature_importances(X: ArrayLike, y: ArrayLike=None,
-                        n: int=3, task: Optional[str]=None,
+                        task: Optional[str]=None,
                         random_state: Optional[int]=None,
-                        standardize: bool=True) -> np.ndarray:
+                        ) -> np.ndarray:
     """
-    Measure feature importances on a task, given X and y.
+    Estimate feature importances on a supervised task, given X and y.
 
     Classification tasks are assessed with logistic regression, a random
     forest, and KNN permutation importance. Regression tasks are assessed with
-    lasso regression, a random forest, and KNN permutation importance. In each
-    case, the `n` normalized importances with the most variance are averaged.
+    linear regression, a random forest, and KNN permutation importance.
+
+    The scores from these assessments are normalized, summed, and the
+    sum is normalized again so that the result adds up to one.
+
+    See the Tutorial in the documentation for more information.
 
     Args:
         X (array): an array representing the data.
         y (array or None): an array representing the target. If None, the
             task is assumed to be an unsupervised clustering task.
-        n (int): the number of tests to average. Only the n tests with the
-            highest variance across features are kept.
         task (str or None): either 'classification' or 'regression'. If None,
             the task will be inferred from the labels and a warning will show
             the assumption being made.
         random_state (int or None): the random state to use.
-        standardize (bool): whether to standardize the data. Default is True.
 
     Returns:
         array: The importance of the features, in the order in which they
@@ -66,7 +68,7 @@ def feature_importances(X: ArrayLike, y: ArrayLike=None,
         >>> X = [[0, 0, 0], [0, 1, 1], [0, 2, 0], [0, 3, 1], [0, 4, 0], [0, 5, 1], [0, 7, 0], [0, 8, 1], [0, 8, 0]]
         >>> y = [5, 15, 25, 35, 45, 55, 80, 85, 90]
         >>> feature_importances(X, y, task='regression', random_state=42)
-        array([0. , 0.99416839, 0.00583161])
+        array([0. , 0.9831828, 0.0168172])
         >>> y = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c']
         >>> x0, x1, x2 = feature_importances(X, y, task='classification', random_state=42)
         >>> x1 > x2 > x0  # See Issue #49 for why this test is like this.
@@ -79,8 +81,7 @@ def feature_importances(X: ArrayLike, y: ArrayLike=None,
         task = 'regression' if is_continuous(y) else 'classification'
 
     # Split the data and ensure it is standardized.
-    if standardize:
-        X, X_train, X_val, y, y_train, y_val = split_and_standardize(X, y, random_state=random_state)
+    X, X_train, X_val, y, y_train, y_val = split_and_standardize(X, y, random_state=random_state)
 
     # Train three models and gather the importances.
     imps: list = []
@@ -91,23 +92,16 @@ def feature_importances(X: ArrayLike, y: ArrayLike=None,
         r = permutation_importance(model, X_val, y_val, n_repeats=8, scoring='f1_weighted', random_state=random_state)
         imps.append(r.importances_mean)
     elif task == 'regression':
-        # Need data to be scaled, but don't necessarily want to scale entire dataset.
-        imps.append(np.abs(Lasso(random_state=random_state).fit(X, y).coef_))
+        imps.append(np.abs(LinearRegression().fit(X, y).coef_))
         imps.append(RandomForestRegressor(random_state=random_state).fit(X, y).feature_importances_)
         model = KNeighborsRegressor().fit(X_train, y_train)
         r = permutation_importance(model, X_val, y_val, n_repeats=8, scoring='neg_mean_squared_error', random_state=random_state)
-        if not all(r.importances_mean < 0):
-            r.importances_mean[r.importances_mean < 0] = 1e-9
-        imps.append(r.importances_mean)
+        imps.append(r.importances_mean)
 
+    # Eliminate negative values and aggregate.
     imps = np.array(imps)
-
-    # Normalize the rows by the sum of *only positive* elements.
-    normalizer = np.where(imps>0, imps, 0).sum(axis=1)
-    imps /= normalizer[:, None]
-
-    # Drop imps with smallest variance and take mean of what's left.
-    return np.nanmean(sorted(imps, key=lambda row: np.std(row))[-n:], axis=0)
+    imps[imps < 0] = 0
+    return aggregate(imps, normalize_input=True, normalize_output=True)
 
 
 def least_important_features(importances: ArrayLike,
diff --git a/src/redflag/utils.py b/src/redflag/utils.py
index 19df25e..70925f8 100644
--- a/src/redflag/utils.py
+++ b/src/redflag/utils.py
@@ -642,3 +642,97 @@ def has_monotonic(a: ArrayLike, tolerance: int=3) -> np.ndarray:
     zeros = get_idx(np.diff(a) == 0)
     flats = [list(x)+[x[-1]+1, x[-1]+2] for x in consecutive(zeros) if x.size >= tolerance]
     return np.array(list(flatten(flats)), dtype=int)
+
+
+def aggregate(arr,
+              absolute=False,
+              rank_input=False,
+              rank_output=False,
+              normalize_input=False,
+              normalize_output=False,
+              ):
+    """
+    Compute the Borda count ranking from an N x M matrix representing
+    N sets of rankings (or scores) for M candidates. This function
+    aggregates the scores for each candidate and optionally normalizes
+    them so that they sum to 1. The absolute value of the scores is
+    considered if `absolute` is set to True.
+
+    If you want to aggregate rankings rather than the raw scores, set
+    `rank_input=True`; each row is then converted to a ranking before
+    aggregation. If `normalize_output` is False (the default), the result
+    is a Borda-style count.
+
+    If your score arrays contain negative numbers and you want a
+    large negative number to be considered 'strong', then set
+    `absolute` to `True`.
+
+    Args:
+        arr (array-like): An N x M matrix where N is the number of sets
+            of rankings or scores, and M is the number of candidates. Each
+            element represents the score of a candidate in a particular set.
+
+        absolute (bool, optional): If True, the absolute value of each
+            score is considered. This is useful when a large negative
+            number should be considered as a strong score. Defaults to False.
+
+        rank_input (bool, optional): If True, each row of the input is
+            transformed into a ranking (such as [4 (best), 2, 3, ...]) before
+            aggregation. Defaults to False.
+
+        rank_output (bool, optional): If True, the output will be the
+            rankings of the aggregated scores instead of the scores themselves,
+            in a rank format (such as [3 (best), 1, 2, ...]). Defaults to False.
+
+        normalize_input (bool, optional): If True, each row of the input
+            array will be normalized before aggregation. This is useful when
+            the input array contains values in different ranges or units and
+            should be normalized to a common scale. Defaults to False.
+
+        normalize_output (bool, optional): If True, the aggregated scores
+            will be normalized so that they sum to 1. This is useful for
+            understanding the relative importance of each candidate's score.
+            Defaults to False.
+
+    Returns:
+        numpy.ndarray: An array of length M containing the aggregated (and
+            optionally normalized) scores for each of the M candidates.
+
+    Example:
+        >>> scores = ([
+        ...     [ 1, 0.25, 0],
+        ...     [ 4, 1.5, 0],
+        ...     [ 1, -0.25, 0]
+        ... ])
+        >>> aggregate(scores, normalize_output=True)
+        array([0.8, 0.2, 0. ])
+        >>> aggregate(scores, absolute=True, normalize_output=True)
+        array([0.75, 0.25, 0. ])
+        >>> scores = ([
+        ...     [ 1, 0.25, 0],
+        ...     [ 4, 1.5, 0],
+        ...     [ 1, -0.25, 0]
+        ... ])
+        >>> aggregate(scores, rank_input=True, rank_output=True)
+        array([2, 1, 0])
+    """
+    arr = np.abs(arr) if absolute else np.array(arr)
+
+    if rank_input:
+        arr = np.argsort(np.argsort(arr, axis=1), axis=1)
+
+    if normalize_input:
+        s = arr.sum(axis=1)
+        s[s == 0] = 1e-12  # Avoid division by zero.
+        arr = (arr.T / s).T
+        return aggregate(arr, normalize_output=normalize_output)
+
+    scores = np.atleast_2d(arr).sum(axis=0)
+
+    if rank_output:
+        scores = np.argsort(np.argsort(scores))
+    elif normalize_output:
+        s = np.sum(scores) or 1e-12
+        scores = scores / s
+
+    return scores
diff --git a/tests/test_pandas.py b/tests/test_pandas.py
index 4adb447..78df780 100644
--- a/tests/test_pandas.py
+++ b/tests/test_pandas.py
@@ -58,4 +58,4 @@ def test_series_continuous_report():
 
 def test_feature_importances_docstring():
     s = pd.DataFrame([c, r]).redflag.feature_importances.__doc__
-    assert s.strip().startswith("Measure feature importances on a task, given X and y.")
+    assert s.strip().startswith("Estimate feature importances on a supervised task, given X and y.")
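
Note (illustrative sketch, not part of the patch): the aggregation described in the
new Tutorial cell and used by `feature_importances()` boils down to "normalize each
model's scores, sum them per feature, then normalize the sums". The snippet below
reproduces that by hand with made-up importance vectors and checks it against the
new `redflag.utils.aggregate()` introduced above.

    import numpy as np
    from redflag.utils import aggregate

    # Made-up importance vectors for three features, standing in for the
    # linear model coefficients, the random forest importances, and the KNN
    # permutation importances (already clipped at zero).
    imps = np.array([
        [1.0, 3.0, 0.0],
        [2.0, 2.0, 0.0],
        [0.0, 1.0, 1.0],
    ])

    # What feature_importances() now does with the collected scores.
    agg = aggregate(imps, normalize_input=True, normalize_output=True)

    # The same thing by hand: normalize rows, sum per column, normalize the sum.
    rows = imps / imps.sum(axis=1, keepdims=True)
    by_hand = rows.sum(axis=0) / rows.sum()

    assert np.allclose(agg, by_hand)
    assert np.isclose(agg.sum(), 1.0)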