From d018ba3ddeea0cd494129d8a3a65edc49f02a912 Mon Sep 17 00:00:00 2001 From: Maksim Terpilovskii Date: Sun, 20 Oct 2024 11:40:24 +0200 Subject: [PATCH] reimplemented multiple tests, improved type hints, linted and formatted code with ruff --- README.rst | 2 + scikit_posthocs/__init__.py | 43 +- scikit_posthocs/_omnibus.py | 147 ++-- scikit_posthocs/_posthocs.py | 894 +++++++++++++------------ tests/test_posthocs.py | 1220 +++++++++++++++++++++++++--------- 5 files changed, 1492 insertions(+), 814 deletions(-) diff --git a/README.rst b/README.rst index c143b51..3354a5c 100644 --- a/README.rst +++ b/README.rst @@ -21,6 +21,8 @@ .. image:: https://img.shields.io/conda/vn/conda-forge/scikit-posthocs.svg :target: https://anaconda.org/conda-forge/scikit-posthocs +=============== + **scikit-posthocs** is a Python package that provides post hoc tests for pairwise multiple comparisons that are usually performed in statistical data analysis to assess the differences between group levels if a statistically diff --git a/scikit_posthocs/__init__.py b/scikit_posthocs/__init__.py index 5c5889c..5489657 100644 --- a/scikit_posthocs/__init__.py +++ b/scikit_posthocs/__init__.py @@ -1,22 +1,43 @@ -__version__ = '0.9.1' +__version__ = "0.10.0" from scikit_posthocs._global import global_simes_test, global_f_test from scikit_posthocs._omnibus import test_osrt, test_durbin, test_mackwolfe from scikit_posthocs._posthocs import ( - posthoc_anderson, posthoc_conover, posthoc_conover_friedman, - posthoc_dscf, posthoc_dunn, posthoc_durbin, - posthoc_mannwhitney, posthoc_miller_friedman, posthoc_nemenyi, - posthoc_nemenyi_friedman, posthoc_npm_test, posthoc_quade, - posthoc_scheffe, posthoc_siegel_friedman, posthoc_tamhane, - posthoc_ttest, posthoc_tukey, posthoc_tukey_hsd, - posthoc_vanwaerden, posthoc_wilcoxon, posthoc_dunnett, - __convert_to_df, __convert_to_block_df, + posthoc_anderson, + posthoc_conover, + posthoc_conover_friedman, + posthoc_dscf, + posthoc_dunn, + posthoc_durbin, + posthoc_mannwhitney, + posthoc_miller_friedman, + posthoc_nemenyi, + posthoc_nemenyi_friedman, + posthoc_npm_test, + posthoc_quade, + posthoc_scheffe, + posthoc_siegel_friedman, + posthoc_tamhane, + posthoc_ttest, + posthoc_tukey, + posthoc_tukey_hsd, + posthoc_vanwaerden, + posthoc_wilcoxon, + posthoc_dunnett, + __convert_to_df, + __convert_to_block_df, ) from scikit_posthocs._plotting import ( - sign_array, sign_plot, sign_table, critical_difference_diagram, + sign_array, + sign_plot, + sign_table, + critical_difference_diagram, ) from scikit_posthocs._outliers import ( - outliers_gesd, outliers_grubbs, outliers_iqr, outliers_tietjen, + outliers_gesd, + outliers_grubbs, + outliers_iqr, + outliers_tietjen, ) diff --git a/scikit_posthocs/_omnibus.py b/scikit_posthocs/_omnibus.py index f2b7b5c..cb35f33 100644 --- a/scikit_posthocs/_omnibus.py +++ b/scikit_posthocs/_omnibus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Union, List, Tuple +from typing import Optional, Union, List, Tuple, cast import itertools as it import numpy as np import scipy.stats as ss @@ -10,14 +10,14 @@ def test_mackwolfe( - data: Union[List, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - p: int = None, - n_perm: int = 100, - sort: bool = False) -> Tuple[float, float]: - - '''Mack-Wolfe Test for Umbrella Alternatives. 
+ data: Union[List, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + p: Optional[int] = None, + n_perm: int = 100, + sort: bool = False, +) -> Tuple[float, float]: + """Mack-Wolfe Test for Umbrella Alternatives. In dose-finding studies one may assume an increasing treatment effect with increasing dose level. However, the test subject may actually succumb to @@ -71,25 +71,26 @@ def test_mackwolfe( -------- >>> x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]] >>> sp.posthoc_mackwolfe(x) - ''' + """ x, _val_col, _group_col = __convert_to_df(data, val_col, group_col) if not sort: x[_group_col] = Categorical( - x[_group_col], categories=x[_group_col].unique(), ordered=True) + x[_group_col], categories=x[_group_col].unique(), ordered=True + ) x.sort_values(by=[_group_col], ascending=True, inplace=True) k = x[_group_col].unique().size if p and p > k: print("Selected 'p' > number of groups:", str(p), " > ", str(k)) - return False + return (np.nan, np.nan) elif p is not None and p < 1: print("Selected 'p' < 1: ", str(p)) - return False + return (np.nan, np.nan) Rij = x[_val_col].rank() - n = x.groupby(_group_col)[_val_col].count() + n = cast(Series, x.groupby(_group_col)[_val_col].count()) def _fn(Ri, Rj): return np.sum(Ri.apply(lambda x: Rj[Rj > x].size)) @@ -100,44 +101,53 @@ def _ustat(Rij, g, k): for i in range(k): for j in range(i): - U[i, j] = _fn(Rij[x[_group_col] == levels[i]], Rij[x[_group_col] == levels[j]]) - U[j, i] = _fn(Rij[x[_group_col] == levels[j]], Rij[x[_group_col] == levels[i]]) + U[i, j] = _fn( + Rij[x[_group_col] == levels[i]], Rij[x[_group_col] == levels[j]] + ) + U[j, i] = _fn( + Rij[x[_group_col] == levels[j]], Rij[x[_group_col] == levels[i]] + ) return U - def _ap(p, U): - tmp1 = 0. + def _ap(p, U) -> float: + tmp1 = 0.0 if p > 0: for i in range(p): - for j in range(i+1, p+1): + for j in range(i + 1, p + 1): tmp1 += U[i, j] - tmp2 = 0. + tmp2 = 0.0 if p < k: for i in range(p, k): - for j in range(i+1, k): + for j in range(i + 1, k): tmp2 += U[j, i] return tmp1 + tmp2 - def _n1(p, n): - return np.sum(n[:p+1]) + def _n1(p: int, n: Series) -> float: + return np.sum(n[: p + 1]) - def _n2(p, n): + def _n2(p: int, n: Series) -> float: return np.sum(n[p:k]) - def _mean_at(p, n): + def _mean_at(p, n) -> float: N1 = _n1(p, n) N2 = _n2(p, n) - return (N1**2. + N2**2. - np.sum(n**2.) - n.iloc[p]**2.)/4. + return (N1**2.0 + N2**2.0 - np.sum(n**2.0) - n.iloc[p] ** 2.0) / 4.0 - def _var_at(p, n): + def _var_at(p: int, n: Series) -> float: N1 = _n1(p, n) N2 = _n2(p, n) N = np.sum(n) - var = (2. * (N1**3 + N2**3) + 3. * (N1**2 + N2**2) - - np.sum(n**2 * (2*n + 3.)) - n.iloc[p]**2. * (2. * n.iloc[p] + 3.) + - 12. * n.iloc[p] * N1 * N2 - 12. * n.iloc[p] ** 2. * N) / 72. 
+ var = ( + 2.0 * (N1**3 + N2**3) + + 3.0 * (N1**2 + N2**2) + - np.sum(n**2 * (2 * n + 3.0)) + - n.iloc[p] ** 2.0 * (2.0 * n.iloc[p] + 3.0) + + 12.0 * n.iloc[p] * N1 * N2 + - 12.0 * n.iloc[p] ** 2.0 * N + ) / 72.0 return var if p: @@ -147,19 +157,18 @@ def _var_at(p, n): est = _ap(p, U) mean = _mean_at(p, n) sd = np.sqrt(_var_at(p, n)) - stat = (est - mean)/sd - p_value = ss.norm.sf(stat) + stat = (est - mean) / sd + p_value = ss.norm.sf(stat).item() else: U = _ustat(Rij, x[_group_col], k) Ap = np.array([_ap(i, U) for i in range(k)]).ravel() mean = np.array([_mean_at(i, n) for i in range(k)]).ravel() var = np.array([_var_at(i, n) for i in range(k)]).ravel() A = (Ap - mean) / np.sqrt(var) - stat = np.max(A) + stat = float(np.max(A)) mt = [] for _ in range(n_perm): - ix = Series(np.random.permutation(Rij)) uix = _ustat(ix, x[_group_col], k) apix = np.array([_ap(i, uix) for i in range(k)]) @@ -167,17 +176,18 @@ def _var_at(p, n): mt.append(np.max(astarix)) mt = np.array(mt) - p_value = mt[mt > stat] / n_perm + p_value = mt[mt > stat].size / n_perm return p_value, stat def test_osrt( - data: Union[List, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - sort: bool = False) -> Tuple[float, float, int]: - '''Hayter's one-sided studentised range test (OSRT) + data: Union[List, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + sort: bool = False, +) -> Tuple[float, float, int]: + """Hayter's one-sided studentised range test (OSRT) Tests a hypothesis against an ordered alternative for normal data with equal variances [1]_. @@ -223,11 +233,13 @@ def test_osrt( >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]}) >>> x = x.melt(var_name='groups', value_name='values') >>> sp.test_osrt(x, val_col='values', group_col='groups') - ''' + """ x, _val_col, _group_col = __convert_to_df(data, val_col, group_col) if not sort: - x[_group_col] = Categorical(x[_group_col], categories=x[_group_col].unique(), ordered=True) + x[_group_col] = Categorical( + x[_group_col], categories=x[_group_col].unique(), ordered=True + ) x.sort_values(by=[_group_col], ascending=True, inplace=True) groups = np.unique(x[_group_col]) @@ -245,13 +257,13 @@ def test_osrt( for i in range(k): for j in range(ni.iloc[i]): c += 1 - sigma2 += (x[_val_col].iat[c] - xi[i])**2. / df + sigma2 += (x[_val_col].iat[c] - xi[i]) ** 2.0 / df sigma = np.sqrt(sigma2) def compare(i, j): dif = xi.loc[groups[j]] - xi.loc[groups[i]] - A = sigma / np.sqrt(2.) * np.sqrt(1. / ni[groups[j]] + 1. / ni[groups[i]]) + A = sigma / np.sqrt(2.0) * np.sqrt(1.0 / ni[groups[j]] + 1.0 / ni[groups[i]]) qval = np.abs(dif) / A return qval @@ -267,14 +279,14 @@ def compare(i, j): def test_durbin( - data: Union[List, np.ndarray, DataFrame], - y_col: Union[str, int] = None, - block_col: Union[str, int] = None, - group_col: Union[str, int] = None, - melted: bool = False, - sort: bool = True) -> Tuple[float, float, int]: - - '''Durbin's test whether k groups (or treatments) in a two-way + data: Union[List, np.ndarray, DataFrame], + y_col: Optional[Union[str, int]] = None, + block_col: Optional[Union[str, int]] = None, + group_col: Optional[Union[str, int]] = None, + melted: bool = False, + sort: bool = True, +) -> Tuple[float, float, int]: + """Durbin's test whether k groups (or treatments) in a two-way balanced incomplete block design (BIBD) have identical effects. See references for additional information [1]_, [2]_. 
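With the change above, `test_mackwolfe` signals an out-of-range peak index `p` by returning `(np.nan, np.nan)` instead of `False`, so callers should check for NaN rather than truthiness. A minimal usage sketch (illustrative only, not part of the patch; assumes `scikit_posthocs` is importable as `sp`)::

    import numpy as np
    import scikit_posthocs as sp

    x = [[22, 23, 35], [60, 59, 54], [98, 78, 50],
         [60, 82, 59], [22, 44, 33], [23, 21, 25]]
    p_value, stat = sp.test_mackwolfe(x, p=2)  # a priori known peak group
    if np.isnan(p_value):
        print("Peak index 'p' was out of range; no test performed.")
    else:
        print(f"Mack-Wolfe: p = {p_value:.4f}, statistic = {stat:.3f}")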
@@ -331,11 +343,15 @@ def test_durbin( -------- >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]]) >>> sp.test_durbin(x) - ''' + """ if melted and not all([block_col, group_col, y_col]): - raise ValueError('block_col, group_col, y_col should be explicitly specified if using melted data') + raise ValueError( + "block_col, group_col, y_col should be explicitly specified if using melted data" + ) - x, _y_col, _group_col, _block_col = __convert_to_block_df(data, y_col, group_col, block_col, melted) + x, _y_col, _group_col, _block_col = __convert_to_block_df( + data, y_col, group_col, block_col, melted + ) groups = x[_group_col].unique() blocks = x[_block_col].unique() @@ -350,21 +366,20 @@ def test_durbin( r = np.unique(x.groupby(_group_col).count()) k = np.unique(x.groupby(_block_col).count()) if r.size > 1 and k.size > 1: - raise ValueError('Data appear to be unbalanced. Please correct your input data') + raise ValueError("Data appear to be unbalanced. Please correct your input data") else: - r = r.item() - k = k.item() - x['y_ranks'] = x.groupby(_block_col)[_y_col].rank() - x['y_ranks_sum'] = x.groupby(_group_col)['y_ranks'].sum() + r = float(r.item()) + k = float(k.item()) + x["y_ranks"] = x.groupby(_block_col)[_y_col].rank() + x["y_ranks_sum"] = x.groupby(_group_col)["y_ranks"].sum() - A = np.sum(x['y_ranks'] ** 2.) - C = (b * k * (k + 1)**2.) / 4. - D = np.sum(x['y_ranks_sum'] ** 2.) - r * C - T1 = (t - 1.) / (A - C) * D + A = float(np.sum(x["y_ranks"] ** 2.0)) + C = float(b * k * (k + 1) ** 2.0) / 4.0 + D = float(np.sum(x["y_ranks_sum"] ** 2.0)) - r * C + T1 = (t - 1.0) / (A - C) * D stat = T1 df = t - 1 - pval = ss.chi2.sf(stat, df) + pval = ss.chi2.sf(stat, df).item() return pval, stat, df - diff --git a/scikit_posthocs/_posthocs.py b/scikit_posthocs/_posthocs.py index 872a9d2..e352572 100644 --- a/scikit_posthocs/_posthocs.py +++ b/scikit_posthocs/_posthocs.py @@ -1,20 +1,20 @@ import itertools as it -from typing import Tuple, Union, Literal +from typing import Optional, Tuple, Union, Literal import numpy as np import scipy.stats as ss from statsmodels.sandbox.stats.multicomp import multipletests -from statsmodels.stats.multicomp import pairwise_tukeyhsd from statsmodels.stats.libqsturng import psturng from pandas import DataFrame, Series, MultiIndex def __convert_to_df( - a: Union[list, np.ndarray, DataFrame], - val_col: str = 'vals', - group_col: str = 'groups', - val_id: int = None, - group_id: int = None) -> Tuple[DataFrame, str, str]: - '''Hidden helper method to create a DataFrame with input data for further + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = "vals", + group_col: Optional[str] = "groups", + val_id: Optional[int] = None, + group_id: Optional[int] = None, +) -> Tuple[DataFrame, str, str]: + """Hidden helper method to create a DataFrame with input data for further processing. Parameters @@ -59,32 +59,33 @@ def __convert_to_df( ----- Inferrence algorithm for determining `val_id` and `group_id` args is rather simple, so it is better to specify them explicitly to prevent errors. 
- ''' + """ if not group_col: - group_col = 'groups' + group_col = "groups" if not val_col: - val_col = 'vals' + val_col = "vals" if isinstance(a, DataFrame): x = a.copy() if not {group_col, val_col}.issubset(a.columns): raise ValueError( - 'Specify correct column names using `group_col` and `val_col` args') + "Specify correct column names using `group_col` and `val_col` args" + ) return x, val_col, group_col elif isinstance(a, list) or (isinstance(a, np.ndarray) and not a.shape.count(2)): grps_len = map(len, a) - grps = list(it.chain(*[[i+1] * l for i, l in enumerate(grps_len)])) + grps = list( + it.chain(*[[i + 1] * grp_len for i, grp_len in enumerate(grps_len)]) + ) vals = list(it.chain(*a)) return DataFrame({val_col: vals, group_col: grps}), val_col, group_col elif isinstance(a, np.ndarray): - # cols ids not defined # trying to infer if not all([val_id, group_id]): - if np.argmax(a.shape): a = a.T @@ -95,74 +96,81 @@ def __convert_to_df( __group_col = np.argmin(ax) else: raise ValueError( - 'Cannot infer input format.\nPlease specify `val_id` and `group_id` args') + "Cannot infer input format.\nPlease specify `val_id` and `group_id` args" + ) - cols = {__val_col: val_col, - __group_col: group_col} + cols = {__val_col: val_col, __group_col: group_col} else: - cols = {val_id: val_col, - group_id: group_col} + cols = {val_id: val_col, group_id: group_col} - cols_vals = dict(sorted(cols.items())).values() - return DataFrame(a, columns=cols_vals), val_col, group_col + cols_vals = np.array(dict(sorted(cols.items())).values()) + return DataFrame(a, columns=cols_vals).dropna(), val_col, group_col def __convert_to_block_df( - a, - y_col: str = None, - group_col: str = None, - block_col: str = None, - melted: bool = False) -> DataFrame: + a, + y_col: Optional[Union[str, int]] = None, + group_col: Optional[Union[str, int]] = None, + block_col: Optional[Union[str, int]] = None, + melted: bool = False, +) -> Tuple[DataFrame, str, str, str]: # TODO: refactor conversion of block data to DataFrame if melted and not all([i is not None for i in [block_col, group_col, y_col]]): raise ValueError( - '`block_col`, `group_col`, `y_col` should be explicitly specified if using melted data') + "`block_col`, `group_col`, `y_col` should be explicitly specified if using melted data" + ) if isinstance(a, DataFrame) and not melted: x = a.copy(deep=True) - group_col = 'groups' - block_col = 'blocks' - y_col = 'y' + group_col = "groups" + block_col = "blocks" + y_col = "y" x.columns.name = group_col x.index.name = block_col - x = x.reset_index().melt(id_vars=block_col, var_name=group_col, value_name=y_col) + x = x.reset_index().melt( + id_vars=block_col, var_name=group_col, value_name=y_col + ) elif isinstance(a, DataFrame) and melted: - x = DataFrame.from_dict({'groups': a[group_col], - 'blocks': a[block_col], - 'y': a[y_col]}) + x = DataFrame.from_dict( + {"groups": a[group_col], "blocks": a[block_col], "y": a[y_col]} + ) elif not isinstance(a, DataFrame): x = np.array(a) - x = DataFrame(x, index=np.arange( - x.shape[0]), columns=np.arange(x.shape[1])) + x = DataFrame(x, index=np.arange(x.shape[0]), columns=np.arange(x.shape[1])) if not melted: - group_col = 'groups' - block_col = 'blocks' - y_col = 'y' + group_col = "groups" + block_col = "blocks" + y_col = "y" x.columns.name = group_col x.index.name = block_col - x = x.reset_index().melt(id_vars=block_col, var_name=group_col, value_name=y_col) + x = x.reset_index().melt( + id_vars=block_col, var_name=group_col, value_name=y_col + ) else: - 
x.rename(columns={group_col: 'groups', - block_col: 'blocks', y_col: 'y'}, inplace=True) + x.rename( + columns={group_col: "groups", block_col: "blocks", y_col: "y"}, + inplace=True, + ) - group_col = 'groups' - block_col = 'blocks' - y_col = 'y' + group_col = "groups" + block_col = "blocks" + y_col = "y" return x, y_col, group_col, block_col def posthoc_conover( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - p_adjust: str = None, - sort: bool = True) -> DataFrame: - '''Post hoc pairwise test for multiple comparisons of mean rank sums + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + p_adjust: Optional[str] = None, + sort: bool = True, +) -> DataFrame: + """Post hoc pairwise test for multiple comparisons of mean rank sums (Conover´s test). May be used after Kruskal-Wallis one-way analysis of variance by ranks to do pairwise comparisons [1]_. @@ -219,13 +227,14 @@ def posthoc_conover( -------- >>> x = [[1,2,3,5,1], [12,31,54, np.nan], [10,12,6,74,11]] >>> sp.posthoc_conover(x, p_adjust = 'holm') - ''' + """ + def compare_conover(i, j): diff = np.abs(x_ranks_avg.loc[i] - x_ranks_avg.loc[j]) - B = 1. / x_lens.loc[i] + 1. / x_lens.loc[j] - D = (n - 1. - h_cor) / (n - x_len) + B = 1.0 / x_lens.loc[i] + 1.0 / x_lens.loc[j] + D = (n - 1.0 - h_cor) / (n - x_len) t_value = diff / np.sqrt(S2 * B * D) - p_value = 2. * ss.t.sf(np.abs(t_value), df=n-x_len) + p_value = 2.0 * ss.t.sf(np.abs(t_value), df=n - x_len) return p_value x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) @@ -236,25 +245,25 @@ def compare_conover(i, j): x_len = x_groups_unique.size x_lens = x.groupby(_group_col)[_val_col].count() - x['ranks'] = x[_val_col].rank() - x_ranks_avg = x.groupby(_group_col)['ranks'].mean() - x_ranks_sum = x.groupby(_group_col)['ranks'].sum() + x["ranks"] = x[_val_col].rank() + x_ranks_avg = x.groupby(_group_col)["ranks"].mean() + x_ranks_sum = x.groupby(_group_col)["ranks"].sum().to_numpy() # ties - vals = x.groupby('ranks').count()[_val_col].values + vals = x.groupby("ranks").count()[_val_col].to_numpy() tie_sum = np.sum(vals[vals != 1] ** 3 - vals[vals != 1]) tie_sum = 0 if not tie_sum else tie_sum - x_ties = np.min([1., 1. - tie_sum / (n ** 3. - n)]) + x_ties = np.min([1.0, 1.0 - tie_sum / (n**3.0 - n)]) - h = (12. / (n * (n + 1.))) * \ - np.sum(x_ranks_sum**2 / x_lens) - 3. * (n + 1.) + h = (12.0 / (n * (n + 1.0))) * np.sum(x_ranks_sum**2 / x_lens) - 3.0 * (n + 1.0) h_cor = h / x_ties if x_ties == 1: - S2 = n * (n + 1.) / 12. + S2 = n * (n + 1.0) / 12.0 else: - S2 = (1. / (n - 1.)) * \ - (np.sum(x['ranks'] ** 2.) - (n * (((n + 1.)**2.) / 4.))) + S2 = (1.0 / (n - 1.0)) * ( + np.sum(x["ranks"] ** 2.0) - (n * (((n + 1.0) ** 2.0) / 4.0)) + ) vs = np.zeros((x_len, x_len)) tri_upper = np.triu_indices(vs.shape[0], 1) @@ -275,12 +284,13 @@ def compare_conover(i, j): def posthoc_dunn( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - p_adjust: str = None, - sort: bool = True) -> DataFrame: - '''Post hoc pairwise test for multiple comparisons of mean rank sums + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + p_adjust: Optional[str] = None, + sort: bool = True, +) -> DataFrame: + """Post hoc pairwise test for multiple comparisons of mean rank sums (Dunn's test). May be used after Kruskal-Wallis one-way analysis of variance by ranks to do pairwise comparisons [1]_, [2]_. 
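`posthoc_conover` above is meant to follow a significant Kruskal-Wallis omnibus test; a minimal sketch of that workflow (illustrative only, not part of the patch)::

    import scipy.stats as ss
    import scikit_posthocs as sp

    groups = [[1, 2, 3, 5, 1], [12, 31, 54, 62, 12], [10, 12, 6, 74, 11]]
    H, p = ss.kruskal(*groups)  # omnibus test across all groups
    if p < 0.05:
        pc = sp.posthoc_conover(groups, p_adjust="holm")
        print(pc)  # symmetric DataFrame of Holm-adjusted pairwise p-values
        # sp.sign_plot(pc) can be used afterwards to visualise the matrix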
@@ -339,13 +349,14 @@ def posthoc_dunn( >>> x = [[1,2,3,5,1], [12,31,54, np.nan], [10,12,6,74,11]] >>> sp.posthoc_dunn(x, p_adjust = 'holm') - ''' + """ + def compare_dunn(i, j): diff = np.abs(x_ranks_avg.loc[i] - x_ranks_avg.loc[j]) - A = n * (n + 1.) / 12. - B = (1. / x_lens.loc[i] + 1. / x_lens.loc[j]) + A = n * (n + 1.0) / 12.0 + B = 1.0 / x_lens.loc[i] + 1.0 / x_lens.loc[j] z_value = diff / np.sqrt((A - x_ties) * B) - p_value = 2. * ss.norm.sf(np.abs(z_value)) + p_value = 2.0 * ss.norm.sf(np.abs(z_value)) return p_value x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) @@ -356,14 +367,14 @@ def compare_dunn(i, j): x_len = x_groups_unique.size x_lens = x.groupby(_group_col)[_val_col].count() - x['ranks'] = x[_val_col].rank() - x_ranks_avg = x.groupby(_group_col)['ranks'].mean() + x["ranks"] = x[_val_col].rank() + x_ranks_avg = x.groupby(_group_col)["ranks"].mean() # ties - vals = x.groupby('ranks').count()[_val_col].values + vals = x.groupby("ranks").count()[_val_col].to_numpy() tie_sum = np.sum(vals[vals != 1] ** 3 - vals[vals != 1]) tie_sum = 0 if not tie_sum else tie_sum - x_ties = tie_sum / (12. * (n - 1)) + x_ties = tie_sum / (12.0 * (n - 1)) vs = np.zeros((x_len, x_len)) combs = it.combinations(range(x_len), 2) @@ -384,12 +395,13 @@ def compare_dunn(i, j): def posthoc_nemenyi( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - dist: str = 'chi', - sort: bool = True) -> DataFrame: - '''Post hoc pairwise test for multiple comparisons of mean rank sums + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + dist: str = "chi", + sort: bool = True, +) -> DataFrame: + """Post hoc pairwise test for multiple comparisons of mean rank sums (Nemenyi's test). May be used after Kruskal-Wallis one-way analysis of variance by ranks to do pairwise comparisons [1]_. @@ -436,18 +448,19 @@ def posthoc_nemenyi( -------- >>> x = [[1,2,3,5,1], [12,31,54, np.nan], [10,12,6,74,11]] >>> sp.posthoc_nemenyi(x) - ''' + """ + def compare_stats_chi(i, j): diff = np.abs(x_ranks_avg.loc[i] - x_ranks_avg.loc[j]) - A = n * (n + 1.) / 12. - B = 1. / x_lens.loc[i] + 1. / x_lens.loc[j] - chi = diff ** 2. / (A * B) + A = n * (n + 1.0) / 12.0 + B = 1.0 / x_lens.loc[i] + 1.0 / x_lens.loc[j] + chi = diff**2.0 / (A * B) return chi def compare_stats_tukey(i, j): diff = np.abs(x_ranks_avg.loc[i] - x_ranks_avg.loc[j]) - B = (1. / x_lens.loc[i] + 1. / x_lens.loc[j]) - q = diff / np.sqrt((n * (n + 1.) / 12.) * B) + B = 1.0 / x_lens.loc[i] + 1.0 / x_lens.loc[j] + q = diff / np.sqrt((n * (n + 1.0) / 12.0) * B) return q x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) @@ -458,14 +471,14 @@ def compare_stats_tukey(i, j): x_len = x_groups_unique.size x_lens = x.groupby(_group_col)[_val_col].count() - x['ranks'] = x[_val_col].rank() - x_ranks_avg = x.groupby(_group_col)['ranks'].mean() + x["ranks"] = x[_val_col].rank() + x_ranks_avg = x.groupby(_group_col)["ranks"].mean() # ties - vals = x.groupby('ranks').count()[_val_col].values + vals = x.groupby("ranks").count()[_val_col].to_numpy() tie_sum = np.sum(vals[vals != 1] ** 3 - vals[vals != 1]) tie_sum = 0 if not tie_sum else tie_sum - x_ties = np.min([1., 1. - tie_sum / (n ** 3. 
- n)]) + x_ties = np.min([1.0, 1.0 - tie_sum / (n**3.0 - n)]) vs = np.zeros((x_len, x_len)) combs = it.combinations(range(x_len), 2) @@ -474,19 +487,21 @@ def compare_stats_tukey(i, j): tri_lower = np.tril_indices(vs.shape[0], -1) vs[:, :] = 0 - if dist == 'chi': + if dist == "chi": for i, j in combs: - vs[i, j] = compare_stats_chi( - x_groups_unique[i], x_groups_unique[j]) / x_ties + vs[i, j] = ( + compare_stats_chi(x_groups_unique[i], x_groups_unique[j]) / x_ties + ) vs[tri_upper] = ss.chi2.sf(vs[tri_upper], x_len - 1) - elif dist == 'tukey': + elif dist == "tukey": for i, j in combs: vs[i, j] = compare_stats_tukey( - x_groups_unique[i], x_groups_unique[j]) * np.sqrt(2.) + x_groups_unique[i], x_groups_unique[j] + ) * np.sqrt(2.0) - vs[tri_upper] = psturng(vs[tri_upper], x_len, np.inf) + vs[tri_upper] = ss.studentized_range.sf(vs[tri_upper], x_len, np.inf) vs[tri_lower] = np.transpose(vs)[tri_lower] np.fill_diagonal(vs, 1) @@ -495,13 +510,14 @@ def compare_stats_tukey(i, j): def posthoc_nemenyi_friedman( - a: Union[list, np.ndarray, DataFrame], - y_col: str = None, - block_col: str = None, - group_col: str = None, - melted: bool = False, - sort: bool = False) -> DataFrame: - '''Calculate pairwise comparisons using Nemenyi post hoc test for + a: Union[list, np.ndarray, DataFrame], + y_col: Optional[str] = None, + block_col: Optional[str] = None, + group_col: Optional[str] = None, + melted: bool = False, + sort: bool = False, +) -> DataFrame: + """Calculate pairwise comparisons using Nemenyi post hoc test for unreplicated blocked data. This test is usually conducted post hoc if significant results of the Friedman's test are obtained. The statistics refer to upper quantiles of the studentized range distribution (Tukey) [1]_, @@ -574,24 +590,25 @@ def posthoc_nemenyi_friedman( >>> # and columns are groups. >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]]) >>> sp.posthoc_nemenyi_friedman(x) - ''' + """ + def compare_stats(i, j): dif = np.abs(R[groups[i]] - R[groups[j]]) - qval = dif / np.sqrt(k * (k + 1.) / (6. * n)) + qval = dif / np.sqrt(k * (k + 1.0) / (6.0 * n)) return qval x, _y_col, _group_col, _block_col = __convert_to_block_df( - a, y_col, group_col, block_col, melted) - x = x.sort_values(by=[_group_col, _block_col], - ascending=True) if sort else x + a, y_col, group_col, block_col, melted + ) + x = x.sort_values(by=[_group_col, _block_col], ascending=True) if sort else x x.dropna(inplace=True) groups = x[_group_col].unique() k = groups.size n = x[_block_col].unique().size - x['mat'] = x.groupby(_block_col)[_y_col].rank() - R = x.groupby(_group_col)['mat'].mean() + x["mat"] = x.groupby(_block_col)[_y_col].rank() + R = x.groupby(_group_col)["mat"].mean() vs = np.zeros((k, k)) combs = it.combinations(range(k), 2) @@ -602,22 +619,23 @@ def compare_stats(i, j): for i, j in combs: vs[i, j] = compare_stats(i, j) - vs *= np.sqrt(2.) 
- vs[tri_upper] = psturng(vs[tri_upper], k, np.inf) + vs *= np.sqrt(2.0) + vs[tri_upper] = ss.studentized_range.sf(vs[tri_upper], k, np.inf) vs[tri_lower] = np.transpose(vs)[tri_lower] np.fill_diagonal(vs, 1) return DataFrame(vs, index=groups, columns=groups) def posthoc_conover_friedman( - a: Union[list, np.ndarray, DataFrame], - y_col: str = None, - block_col: str = None, - group_col: str = None, - melted: bool = False, - sort: bool = False, - p_adjust: str = None) -> DataFrame: - '''Calculate pairwise comparisons using Conover post hoc test for unreplicated + a: Union[list, np.ndarray, DataFrame], + y_col: Optional[str] = None, + block_col: Optional[str] = None, + group_col: Optional[str] = None, + melted: bool = False, + sort: bool = False, + p_adjust: Optional[str] = None, +) -> DataFrame: + """Calculate pairwise comparisons using Conover post hoc test for unreplicated blocked data. This test is usually conducted post hoc after significant results of the Friedman test. The statistics refer to the Student t distribution [1]_, [2]_. @@ -700,37 +718,38 @@ def posthoc_conover_friedman( -------- >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]]) >>> sp.posthoc_conover_friedman(x) - ''' + """ + def compare_stats(i, j): dif = np.abs(R.loc[groups[i]] - R.loc[groups[j]]) tval = dif / np.sqrt(A) / np.sqrt(B) - pval = 2. * ss.t.sf(np.abs(tval), df=(m*n*k - k - n + 1)) + pval = 2.0 * ss.t.sf(np.abs(tval), df=(m * n * k - k - n + 1)) return pval def compare_tukey(i, j): dif = np.abs(R.loc[groups[i]] - R.loc[groups[j]]) - qval = np.sqrt(2.) * dif / (np.sqrt(A) * np.sqrt(B)) + qval = np.sqrt(2.0) * dif / (np.sqrt(A) * np.sqrt(B)) pval = psturng(qval, k, np.inf) return pval x, _y_col, _group_col, _block_col = __convert_to_block_df( - a, y_col, group_col, block_col, melted) - x = x.sort_values(by=[_group_col, _block_col], - ascending=True) if sort else x + a, y_col, group_col, block_col, melted + ) + x = x.sort_values(by=[_group_col, _block_col], ascending=True) if sort else x x.dropna(inplace=True) groups = x[_group_col].unique() k = groups.size n = x[_block_col].unique().size - x['mat'] = x.groupby(_block_col)[_y_col].rank() - R = x.groupby(_group_col)['mat'].sum() - A1 = (x['mat'] ** 2).sum() + x["mat"] = x.groupby(_block_col)[_y_col].rank() + R = x.groupby(_group_col)["mat"].sum() + A1 = (x["mat"] ** 2).sum() m = 1 - S2 = m/(m*k - 1.) * (A1 - m*k*n*(m*k + 1.)**2./4.) - T2 = 1. / S2 * np.sum((R - n * m * ((m * k + 1.) / 2.))**2.) - A = S2 * (2. * n * (m * k - 1.)) / (m * n * k - k - n + 1.) - B = 1. 
- T2 / (n * (m * k - 1.)) + S2 = m / (m * k - 1.0) * (A1 - m * k * n * (m * k + 1.0) ** 2.0 / 4.0) + T2 = 1.0 / S2 * np.sum((R.to_numpy() - n * m * ((m * k + 1.0) / 2.0)) ** 2.0) + A = S2 * (2.0 * n * (m * k - 1.0)) / (m * n * k - k - n + 1.0) + B = 1.0 - T2 / (n * (m * k - 1.0)) vs = np.zeros((k, k)) combs = it.combinations(range(k), 2) @@ -739,7 +758,7 @@ def compare_tukey(i, j): tri_lower = np.tril_indices(vs.shape[0], -1) vs[:, :] = 0 - if p_adjust == 'single-step': + if p_adjust == "single-step": for i, j in combs: vs[i, j] = compare_tukey(i, j) else: @@ -755,11 +774,14 @@ def compare_tukey(i, j): def posthoc_npm_test( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - sort: bool = False) -> DataFrame: - '''Calculate pairwise comparisons using Nashimoto and Wright´s all-pairs + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + alternative: Literal["greater", "less"] = "greater", + nperm: int = 1000, + sort: bool = False, +) -> DataFrame: + """Calculate pairwise comparisons using Nashimoto and Wright´s all-pairs comparison procedure (NPM test) for simply ordered mean ranksums. NPM test is basically an extension of Nemenyi´s procedure for testing @@ -772,9 +794,9 @@ def posthoc_npm_test( DataFrame. val_col : str, optional - Name of a DataFrame column that contains dependent variable values (test - or response variable). Values should have a non-nominal scale. Must be - specified if `a` is a pandas DataFrame object. + Name of a DataFrame column that contains dependent variable values + (test or response variable). Values should have a non-nominal scale. + Must be specified if `a` is a pandas DataFrame object. group_col : str, optional Name of a DataFrame column that contains independent variable values @@ -784,19 +806,12 @@ def posthoc_npm_test( sort : bool, optional If True, sort data by block and group columns. - p_adjust : str, optional - Method for adjusting p values. See `statsmodels.sandbox.stats.multicomp` - for details. Available methods are: - 'bonferroni' : one-step correction - 'sidak' : one-step correction - 'holm-sidak' : step-down method using Sidak adjustments - 'holm' : step-down method using Bonferroni adjustments - 'simes-hochberg' : step-up method (independent) - 'hommel' : closed method based on Simes tests (non-negative) - 'fdr_bh' : Benjamini/Hochberg (non-negative) - 'fdr_by' : Benjamini/Yekutieli (negative) - 'fdr_tsbh' : two stage fdr correction (non-negative) - 'fdr_tsbky' : two stage fdr correction (non-negative) + alternative : str, optional + Alternative hypothesis being tested. + Can be either "greater" (by default) or "less". + + nperm : int, optional + Number of permutations to perform for calculating p values. Returns ------- @@ -805,9 +820,7 @@ def posthoc_npm_test( Notes ----- - The p values are estimated from the studentized range distribution. If - the medians are already increasingly ordered, than the NPM-test - simplifies to the ordinary Nemenyi test + An asymetric permutation test is conducted for calculating p values. 
References ---------- @@ -821,52 +834,74 @@ def posthoc_npm_test( [110,112,123,130,145], [132,141,156,160,172]]) >>> sp.posthoc_npm_test(x) - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col], ascending=True) if sort else x + if alternative == "less": + x[_val_col] *= -1 groups = x[_group_col].unique() - x['ranks'] = x[_val_col].rank() - ri = x.groupby(_group_col)['ranks'].mean() - ni = x.groupby(_group_col)[_val_col].count() k = groups.size n = x.shape[0] - sigma = np.sqrt(n * (n + 1) / 12.) - df = np.inf - - def compare(m, u): - a = [(ri.loc[groups[u]]-ri.loc[groups[_mi]])/(sigma/np.sqrt(2) * - np.sqrt(1./ni.loc[groups[_mi]] + 1./ni.loc[groups[u]])) for _mi in m] - return np.array(a) - - stat = np.zeros((k, k)) - - for i in range(k-1): - for j in range(i+1, k): - u = j - m = np.arange(i, u) - tmp = compare(m, u) - stat[j, i] = np.max(tmp) - - stat[stat < 0] = 0 - - p_values = psturng(stat, k, df) - tri_lower = np.tril_indices(p_values.shape[0], -1) - p_values[tri_lower] = p_values.T[tri_lower] - + sigma = np.sqrt(n * (n + 1) / 12.0) + + def compare(x, ix): + x0 = x.copy() + x0.loc[:, _val_col] = x0.loc[ix, _val_col].values + x0["ranks"] = x0[_val_col].rank() + ri = x0.groupby(_group_col)["ranks"].mean() + ni = x0.groupby(_group_col)[_val_col].count() + is_balanced = all(ni == n) + stat = np.ones((k, k)) + + for i in range(k - 1): + for j in range(i + 1, k): + m = np.arange(i, j) + if is_balanced: + tmp = [ + (ri.loc[groups[j]] - ri.loc[groups[_mi]]) / (sigma / np.sqrt(n)) + for _mi in m + ] + else: + tmp = [ + (ri.loc[groups[j]] - ri.loc[groups[_mi]]) + / ( + sigma + * np.sqrt( + 1.0 / ni.loc[groups[_mi]] + 1.0 / ni.loc[groups[j]] + ) + ) + for _mi in m + ] + stat[j, i] = np.max(tmp) + return stat + + stat = compare(x, x.index) + + mt = np.zeros((nperm, k, k)) + for i in range(nperm): + ix = np.random.permutation(x.index) + mt[i] = compare(x, ix) + + p_values = (mt >= stat).sum(axis=0) / nperm + + tri_upper = np.triu_indices(p_values.shape[0], 1) + p_values[tri_upper] = np.transpose(p_values)[tri_upper] np.fill_diagonal(p_values, 1) + return DataFrame(p_values, index=groups, columns=groups) def posthoc_siegel_friedman( - a: Union[list, np.ndarray, DataFrame], - y_col: str = None, - block_col: str = None, - group_col: str = None, - p_adjust: str = None, - melted: bool = False, - sort: bool = False) -> DataFrame: - '''Siegel and Castellan´s All-Pairs Comparisons Test for Unreplicated Blocked + a: Union[list, np.ndarray, DataFrame], + y_col: Optional[str] = None, + block_col: Optional[str] = None, + group_col: Optional[str] = None, + p_adjust: Optional[str] = None, + melted: bool = False, + sort: bool = False, +) -> DataFrame: + """Siegel and Castellan´s All-Pairs Comparisons Test for Unreplicated Blocked Data. See authors' paper for additional information [1]_. Parameters @@ -938,14 +973,16 @@ def posthoc_siegel_friedman( -------- >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]]) >>> sp.posthoc_siegel_friedman(x) - ''' + """ + def compare_stats(i, j): dif = np.abs(R[groups[i]] - R[groups[j]]) - zval = dif / np.sqrt(k * (k + 1.) / (6. 
* n)) + zval = dif / np.sqrt(k * (k + 1.0) / (6.0 * n)) return zval x, y_col, group_col, block_col = __convert_to_block_df( - a, y_col, group_col, block_col, melted) + a, y_col, group_col, block_col, melted + ) x = x.sort_values(by=[group_col, block_col], ascending=True) if sort else x x.dropna(inplace=True) @@ -953,8 +990,8 @@ def compare_stats(i, j): k = groups.size n = x[block_col].unique().size - x['mat'] = x.groupby(block_col)[y_col].rank() - R = x.groupby(group_col)['mat'].mean() + x["mat"] = x.groupby(block_col)[y_col].rank() + R = x.groupby(group_col)["mat"].mean() vs = np.zeros((k, k), dtype=float) combs = it.combinations(range(k), 2) @@ -965,8 +1002,8 @@ def compare_stats(i, j): for i, j in combs: vs[i, j] = compare_stats(i, j) - vs = 2. * ss.norm.sf(np.abs(vs)) - vs[vs > 1] = 1. + vs = 2.0 * ss.norm.sf(np.abs(vs)) + vs[vs > 1] = 1.0 if p_adjust: vs[tri_upper] = multipletests(vs[tri_upper], method=p_adjust)[1] @@ -977,13 +1014,14 @@ def compare_stats(i, j): def posthoc_miller_friedman( - a: Union[list, np.ndarray, DataFrame], - y_col: str = None, - block_col: str = None, - group_col: str = None, - melted: bool = False, - sort: bool = False) -> DataFrame: - '''Miller´s All-Pairs Comparisons Test for Unreplicated Blocked Data. + a: Union[list, np.ndarray, DataFrame], + y_col: Optional[str] = None, + block_col: Optional[str] = None, + group_col: Optional[str] = None, + melted: bool = False, + sort: bool = False, +) -> DataFrame: + """Miller´s All-Pairs Comparisons Test for Unreplicated Blocked Data. The p-values are computed from the chi-square distribution [1]_, [2]_, [3]_. @@ -1049,14 +1087,16 @@ def posthoc_miller_friedman( -------- >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]]) >>> sp.posthoc_miller_friedman(x) - ''' + """ + def compare_stats(i, j): dif = np.abs(R[groups[i]] - R[groups[j]]) - qval = dif / np.sqrt(k * (k + 1.) / (6. * n)) + qval = dif / np.sqrt(k * (k + 1.0) / (6.0 * n)) return qval x, y_col, group_col, block_col = __convert_to_block_df( - a, y_col, group_col, block_col, melted) + a, y_col, group_col, block_col, melted + ) x = x.sort_values(by=[group_col, block_col], ascending=True) if sort else x x.dropna(inplace=True) @@ -1064,8 +1104,8 @@ def compare_stats(i, j): k = groups.size n = x[block_col].unique().size - x['mat'] = x.groupby(block_col)[y_col].rank() - R = x.groupby(group_col)['mat'].mean() + x["mat"] = x.groupby(block_col)[y_col].rank() + R = x.groupby(group_col)["mat"].mean() vs = np.zeros((k, k), dtype=float) combs = it.combinations(range(k), 2) @@ -1075,7 +1115,7 @@ def compare_stats(i, j): for i, j in combs: vs[i, j] = compare_stats(i, j) - vs = vs ** 2 + vs = vs**2 vs = ss.chi2.sf(vs, k - 1) vs[tri_lower] = np.transpose(vs)[tri_lower] @@ -1084,14 +1124,15 @@ def compare_stats(i, j): def posthoc_durbin( - a: Union[list, np.ndarray, DataFrame], - y_col: str = None, - block_col: str = None, - group_col: str = None, - p_adjust: str = None, - melted: bool = False, - sort: bool = False) -> DataFrame: - '''Pairwise post hoc test for multiple comparisons of rank sums according to + a: Union[list, np.ndarray, DataFrame], + y_col: Optional[str] = None, + block_col: Optional[str] = None, + group_col: Optional[str] = None, + p_adjust: Optional[str] = None, + melted: bool = False, + sort: bool = False, +) -> DataFrame: + """Pairwise post hoc test for multiple comparisons of rank sums according to Durbin and Conover for a two-way balanced incomplete block design (BIBD). See references for additional information [1]_, [2]_. 
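The reworked `posthoc_npm_test` above estimates p values by permutation, so repeated calls give slightly different results unless NumPy's global random state is seeded first. A minimal sketch (illustrative only, not part of the patch), reusing the data from the docstring example::

    import numpy as np
    import scikit_posthocs as sp

    x = np.array([[102, 109, 114, 120, 124],
                  [110, 112, 123, 130, 145],
                  [132, 141, 156, 160, 172]])
    np.random.seed(0)  # permutations are drawn from NumPy's global RNG
    p_values = sp.posthoc_npm_test(x, alternative="greater", nperm=1000)
    print(p_values)  # DataFrame of permutation p-values for the ordered alternative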
@@ -1161,14 +1202,15 @@ def posthoc_durbin( -------- >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]]) >>> sp.posthoc_durbin(x) - ''' + """ x, y_col, group_col, block_col = __convert_to_block_df( - a, y_col, group_col, block_col, melted) + a, y_col, group_col, block_col, melted + ) def compare_stats(i, j): dif = np.abs(rj[groups[i]] - rj[groups[j]]) tval = dif / denom - pval = 2. * ss.t.sf(np.abs(tval), df=df) + pval = 2.0 * ss.t.sf(np.abs(tval), df=df) return pval x = x.sort_values(by=[block_col, group_col], ascending=True) if sort else x @@ -1179,14 +1221,13 @@ def compare_stats(i, j): b = x[block_col].unique().size r = b k = t - x['y_ranked'] = x.groupby(block_col)[y_col].rank() - rj = x.groupby(group_col)['y_ranked'].sum() - A = (x['y_ranked'] ** 2).sum() - C = (b * k * (k + 1) ** 2) / 4. - D = (rj ** 2).sum() - r * C + x["y_ranked"] = x.groupby(block_col)[y_col].rank() + rj = x.groupby(group_col)["y_ranked"].sum() + A = (x["y_ranked"] ** 2).sum() + C = (b * k * (k + 1) ** 2) / 4.0 + D = (rj.to_numpy() ** 2).sum() - r * C T1 = (t - 1) / (A - C) * D - denom = np.sqrt(((A - C) * 2 * r) / (b * k - b - t + 1) - * (1 - T1 / (b * (k - 1)))) + denom = np.sqrt(((A - C) * 2 * r) / (b * k - b - t + 1) * (1 - T1 / (b * (k - 1)))) df = b * k - b - t + 1 vs = np.zeros((t, t), dtype=float) @@ -1208,13 +1249,14 @@ def compare_stats(i, j): def posthoc_anderson( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - midrank: bool = True, - p_adjust: str = None, - sort: bool = False) -> DataFrame: - '''Anderson-Darling Pairwise Test for k-samples. Tests the null hypothesis + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + midrank: bool = True, + p_adjust: Optional[str] = None, + sort: bool = False, +) -> DataFrame: + """Anderson-Darling Pairwise Test for k-samples. Tests the null hypothesis that k-samples are drawn from the same population without having to specify the distribution function of that population [1]_. @@ -1269,7 +1311,7 @@ def posthoc_anderson( -------- >>> x = np.array([[2.9, 3.0, 2.5, 2.6, 3.2], [3.8, 2.7, 4.0, 2.4], [2.8, 3.4, 3.7, 2.2, 2.0]]) >>> sp.posthoc_anderson(x) - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col], ascending=True) if sort else x @@ -1283,8 +1325,13 @@ def posthoc_anderson( vs[:, :] = 0 for i, j in combs: - vs[i, j] = ss.anderson_ksamp([x.loc[x[_group_col] == groups[i], _val_col], - x.loc[x[_group_col] == groups[j], _val_col]], midrank=midrank)[2] + vs[i, j] = ss.anderson_ksamp( + [ + x.loc[x[_group_col] == groups[i], _val_col], + x.loc[x[_group_col] == groups[j], _val_col], + ], + midrank=midrank, + )[2] if p_adjust: vs[tri_upper] = multipletests(vs[tri_upper], method=p_adjust)[1] @@ -1295,15 +1342,16 @@ def posthoc_anderson( def posthoc_quade( - a: Union[list, np.ndarray, DataFrame], - y_col: str = None, - block_col: str = None, - group_col: str = None, - dist: str = 't', - p_adjust: str = None, - melted: bool = False, - sort: bool = False) -> DataFrame: - '''Calculate pairwise comparisons using Quade's post hoc test for + a: Union[list, np.ndarray, DataFrame], + y_col: Optional[str] = None, + block_col: Optional[str] = None, + group_col: Optional[str] = None, + dist: str = "t", + p_adjust: Optional[str] = None, + melted: bool = False, + sort: bool = False, +) -> DataFrame: + """Calculate pairwise comparisons using Quade's post hoc test for unreplicated blocked data. 
This test is usually conducted if significant results were obtained by the omnibus test [1]_, [2]_, [3]_. @@ -1384,21 +1432,23 @@ def posthoc_quade( -------- >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]]) >>> sp.posthoc_quade(x) - ''' + """ + def compare_stats_t(i, j): dif = np.abs(S[groups[i]] - S[groups[j]]) tval = dif / denom - pval = 2. * ss.t.sf(np.abs(tval), df=(b - 1) * (k - 1)) + pval = 2.0 * ss.t.sf(np.abs(tval), df=(b - 1) * (k - 1)) return pval def compare_stats_norm(i, j): dif = np.abs(W[groups[i]] * ff - W[groups[j]] * ff) zval = dif / denom - pval = 2. * ss.norm.sf(np.abs(zval)) + pval = 2.0 * ss.norm.sf(np.abs(zval)) return pval x, y_col, group_col, block_col = __convert_to_block_df( - a, y_col, group_col, block_col, melted) + a, y_col, group_col, block_col, melted + ) x = x.sort_values(by=[block_col, group_col], ascending=True) if sort else x x.dropna(inplace=True) @@ -1407,17 +1457,18 @@ def compare_stats_norm(i, j): k = len(groups) b = x[block_col].unique().size - x['r'] = x.groupby(block_col)[y_col].rank() - q = (x.groupby(block_col)[y_col].max() - - x.groupby(block_col)[y_col].min()).rank() - x['rr'] = x['r'] - (k + 1)/2 - x['s'] = x.apply(lambda row: row['rr'] * q[row[block_col]], axis=1) - x['w'] = x.apply(lambda row: row['r'] * q[row[block_col]], axis=1) + x["r"] = x.groupby(block_col)[y_col].rank() + q = Series( + x.groupby(block_col)[y_col].max() - x.groupby(block_col)[y_col].min().to_numpy() + ).rank() + x["rr"] = x["r"] - (k + 1) / 2 + x["s"] = x.apply(lambda row: row["rr"] * q[row[block_col]], axis=1) + x["w"] = x.apply(lambda row: row["r"] * q[row[block_col]], axis=1) - A = (x['s'] ** 2).sum() - S = x.groupby(group_col)['s'].sum() - B = np.sum(S ** 2) / b - W = x.groupby(group_col)['w'].sum() + A = (x["s"] ** 2).sum() + S = x.groupby(group_col)["s"].sum() + B = np.sum(S.to_numpy() ** 2) / b + W = x.groupby(group_col)["w"].sum() vs = np.zeros((k, k), dtype=float) combs = it.combinations(range(k), 2) @@ -1426,7 +1477,7 @@ def compare_stats_norm(i, j): tri_lower = np.tril_indices(vs.shape[0], -1) vs[:, :] = 0 - if dist == 't': + if dist == "t": denom = np.sqrt((2 * b * (A - B)) / ((b - 1) * (k - 1))) for i, j in combs: @@ -1434,9 +1485,10 @@ def compare_stats_norm(i, j): else: n = b * k - denom = np.sqrt((k * (k + 1.) * (2. * n + 1.) * - (k-1.)) / (18. * n * (n + 1.))) - ff = 1. / (b * (b + 1.)/2.) + denom = np.sqrt( + (k * (k + 1.0) * (2.0 * n + 1.0) * (k - 1.0)) / (18.0 * n * (n + 1.0)) + ) + ff = 1.0 / (b * (b + 1.0) / 2.0) for i, j in combs: vs[i, j] = compare_stats_norm(i, j) @@ -1450,12 +1502,13 @@ def compare_stats_norm(i, j): def posthoc_vanwaerden( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - sort: bool = False, - p_adjust: str = None) -> DataFrame: - '''Van der Waerden's test for pairwise multiple comparisons between group + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + sort: bool = False, + p_adjust: Optional[str] = None, +) -> DataFrame: + """Van der Waerden's test for pairwise multiple comparisons between group levels. See references for additional information [1]_, [2]_. 
Parameters @@ -1520,7 +1573,7 @@ def posthoc_vanwaerden( -------- >>> x = np.array([[10,'a'], [59,'a'], [76,'b'], [10, 'b']]) >>> sp.posthoc_vanwaerden(x, val_col = 0, group_col = 1) - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col], ascending=True) if sort else x @@ -1528,12 +1581,12 @@ def posthoc_vanwaerden( n = x[_val_col].size k = groups.size r = ss.rankdata(x[_val_col]) - x['z_scores'] = ss.norm.ppf(r / (n + 1)) + x["z_scores"] = ss.norm.ppf(r / (n + 1)) - aj = x.groupby(_group_col)['z_scores'].sum() - nj = x.groupby(_group_col)['z_scores'].count() - s2 = (1. / (n - 1.)) * (x['z_scores'] ** 2.).sum() - sts = (1. / s2) * np.sum(aj ** 2. / nj) + aj = x.groupby(_group_col)["z_scores"].sum().to_numpy() + nj = x.groupby(_group_col)["z_scores"].count() + s2 = (1.0 / (n - 1.0)) * (x["z_scores"] ** 2.0).sum() + sts = (1.0 / s2) * np.sum(aj**2.0 / nj) A = aj / nj vs = np.zeros((k, k), dtype=float) @@ -1545,9 +1598,9 @@ def posthoc_vanwaerden( def compare_stats(i, j): dif = np.abs(A[groups[i]] - A[groups[j]]) - B = 1. / nj[groups[i]] + 1. / nj[groups[j]] - tval = dif / np.sqrt(s2 * (n - 1. - sts)/(n - k) * B) - pval = 2. * ss.t.sf(np.abs(tval), df=n - k) + B = 1.0 / nj[groups[i]] + 1.0 / nj[groups[j]] + tval = dif / np.sqrt(s2 * (n - 1.0 - sts) / (n - k) * B) + pval = 2.0 * ss.t.sf(np.abs(tval), df=n - k) return pval for i, j in combs: @@ -1561,13 +1614,15 @@ def compare_stats(i, j): return DataFrame(vs, index=groups, columns=groups) -def posthoc_dunnett(a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - control: str = None, - alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided', - sort: bool = False, - to_matrix: bool = True) -> Union[Series, DataFrame]: +def posthoc_dunnett( + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + control: Optional[str] = None, + alternative: Literal["two-sided", "less", "greater"] = "two-sided", + sort: bool = False, + to_matrix: bool = True, +) -> Union[Series, DataFrame]: """ Dunnett's test [1, 2, 3] for multiple comparisons against a control group, used after parametric ANOVA. The control group is specified by the `control` parameter. @@ -1625,7 +1680,9 @@ def posthoc_dunnett(a: Union[list, np.ndarray, DataFrame], control_data = x_embedded.loc[control] treatment_data = x_embedded.drop(control) - pvals = ss.dunnett(*treatment_data, control=control_data, alternative=alternative).pvalue + pvals = ss.dunnett( + *treatment_data, control=control_data, alternative=alternative + ).pvalue multi_index = MultiIndex.from_product([[control], treatment_data.index.tolist()]) dunnett_sr = Series(pvals, index=multi_index) @@ -1634,7 +1691,7 @@ def posthoc_dunnett(a: Union[list, np.ndarray, DataFrame], return dunnett_sr else: - levels = x.index.unique().tolist() + levels = x.index.unique().to_numpy() result_df = DataFrame(index=levels, columns=levels) for pair in dunnett_sr.index: @@ -1646,14 +1703,15 @@ def posthoc_dunnett(a: Union[list, np.ndarray, DataFrame], def posthoc_ttest( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - pool_sd: bool = False, - equal_var: bool = True, - p_adjust: str = None, - sort: bool = False) -> DataFrame: - '''Pairwise T test for multiple comparisons of independent groups. 
May be + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + pool_sd: bool = False, + equal_var: bool = True, + p_adjust: Optional[str] = None, + sort: bool = False, +) -> DataFrame: + """Pairwise T test for multiple comparisons of independent groups. May be used after a parametric ANOVA to do pairwise comparisons. Parameters @@ -1722,7 +1780,7 @@ def posthoc_ttest( array([[-1. , 0.04600899, 0.31269089], [ 0.04600899, -1. , 0.6327077 ], [ 0.31269089, 0.6327077 , -1. ]]) - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col], ascending=True) if sort else x @@ -1740,22 +1798,23 @@ def posthoc_ttest( ni = xg.count() m = xg.mean() sd = xg.std(ddof=1) - deg_f = ni - 1. + deg_f = ni - 1.0 total_deg_f = np.sum(deg_f) - pooled_sd = np.sqrt(np.sum(sd ** 2. * deg_f) / total_deg_f) + pooled_sd = np.sqrt(np.sum(sd**2.0 * deg_f) / total_deg_f) def compare_pooled(i, j): diff = m.iloc[i] - m.iloc[j] - se_diff = pooled_sd * np.sqrt(1. / ni.iloc[i] + 1. / ni.iloc[j]) + se_diff = pooled_sd * np.sqrt(1.0 / ni.iloc[i] + 1.0 / ni.iloc[j]) t_value = diff / se_diff - return 2. * ss.t.cdf(-np.abs(t_value), total_deg_f) + return 2.0 * ss.t.cdf(-np.abs(t_value), total_deg_f) for i, j in combs: vs[i, j] = compare_pooled(i, j) else: for i, j in combs: - vs[i, j] = ss.ttest_ind(xg.get_group(groups[i]), xg.get_group( - groups[j]), equal_var=equal_var)[1] + vs[i, j] = ss.ttest_ind( + xg.get_group(groups[i]), xg.get_group(groups[j]), equal_var=equal_var + )[1] if p_adjust: vs[tri_upper] = multipletests(vs[tri_upper], method=p_adjust)[1] @@ -1766,10 +1825,12 @@ def compare_pooled(i, j): def posthoc_tukey_hsd( - x: Union[list, np.ndarray, DataFrame], - g: str, - alpha: float = 0.05) -> DataFrame: - '''Pairwise comparisons with TukeyHSD confidence intervals. This is a + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + sort: bool = True, +) -> DataFrame: + """Pairwise comparisons with TukeyHSD confidence intervals. This is a convenience function to make statsmodels `pairwise_tukeyhsd` method more applicable for further use. 
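`posthoc_dunnett` above wraps `scipy.stats.dunnett` to compare every treatment group against a single control group. A minimal sketch (illustrative only, not part of the patch; the melted column names are just an example)::

    import pandas as pd
    import scikit_posthocs as sp

    df = pd.DataFrame({"a": [1, 2, 3, 5, 1],
                       "b": [12, 31, 54, 62, 12],
                       "c": [10, 12, 6, 74, 11]}).melt(var_name="groups", value_name="values")
    # with to_matrix=False a Series indexed by (control, treatment) pairs is returned
    res = sp.posthoc_dunnett(df, val_col="values", group_col="groups",
                             control="a", to_matrix=False)
    print(res)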
@@ -1800,36 +1861,31 @@ def posthoc_tukey_hsd( >>> x = [[1,2,3,4,5], [35,31,75,40,21], [10,6,9,6,1]] >>> g = [['a'] * 5, ['b'] * 5, ['c'] * 5] >>> sp.posthoc_tukey_hsd(np.concatenate(x), np.concatenate(g)) - ''' - result = pairwise_tukeyhsd(x, g, alpha=alpha) - groups = np.array(result.groupsunique, dtype=str) - groups_len = len(groups) - - vs = np.zeros((groups_len, groups_len), dtype=int) + """ + x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) + x = x.sort_values(by=[_group_col, _val_col], ascending=True) if sort else x + groups = x[_group_col].unique() - for a in result.summary()[1:]: - a0 = str(a[0]) - a1 = str(a[1]) - a0i = np.where(groups == a0)[0][0] - a1i = np.where(groups == a1)[0][0] - vs[a0i, a1i] = 1 if str(a[-1]) == 'True' else 0 + results = ss.tukey_hsd( + *[ + x.loc[idx, _val_col].to_numpy() + for idx in x.groupby(_group_col).groups.values() + ] + ) - vsu = np.triu(vs) - np.fill_diagonal(vsu, 1) - tri_lower = np.tril_indices(vsu.shape[0], -1) - vsu[tri_lower] = np.transpose(vsu)[tri_lower] - return DataFrame(vsu, index=groups, columns=groups) + return DataFrame(results.pvalue, index=groups, columns=groups) def posthoc_mannwhitney( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - use_continuity: bool = True, - alternative: str = 'two-sided', - p_adjust: str = None, - sort: bool = True) -> DataFrame: - '''Pairwise comparisons with Mann-Whitney rank test. + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + use_continuity: bool = True, + alternative: str = "two-sided", + p_adjust: Optional[str] = None, + sort: bool = True, +) -> DataFrame: + """Pairwise comparisons with Mann-Whitney rank test. Parameters ---------- @@ -1888,7 +1944,7 @@ def posthoc_mannwhitney( -------- >>> x = [[1,2,3,4,5], [35,31,75,40,21], [10,6,9,6,1]] >>> sp.posthoc_mannwhitney(x, p_adjust = 'holm') - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col, _val_col], ascending=True) if sort else x @@ -1907,7 +1963,8 @@ def posthoc_mannwhitney( xg.get_group(groups[i]), xg.get_group(groups[j]), use_continuity=use_continuity, - alternative=alternative)[1] + alternative=alternative, + )[1] if p_adjust: vs[tri_upper] = multipletests(vs[tri_upper], method=p_adjust)[1] @@ -1918,15 +1975,16 @@ def posthoc_mannwhitney( def posthoc_wilcoxon( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - method: str = 'auto', - zero_method: str = 'wilcox', - correction: bool = False, - p_adjust: str = None, - sort: bool = False) -> DataFrame: - '''Pairwise comparisons with Wilcoxon signed-rank test. + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + method: str = "auto", + zero_method: str = "wilcox", + correction: bool = False, + p_adjust: Optional[str] = None, + sort: bool = False, +) -> DataFrame: + """Pairwise comparisons with Wilcoxon signed-rank test. It is a non-parametric version of the paired T-test for use with non-parametric ANOVA. 
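As reimplemented above, `posthoc_tukey_hsd` now follows the common `(a, val_col, group_col)` interface and returns the pairwise p-values computed by `scipy.stats.tukey_hsd` instead of the 0/1 significance matrix produced by the old statsmodels-based version. A minimal sketch of the new interface (illustrative only, not part of the patch)::

    import pandas as pd
    import scikit_posthocs as sp

    df = pd.DataFrame({"groups": ["a"] * 5 + ["b"] * 5 + ["c"] * 5,
                       "values": [1, 2, 3, 4, 5, 35, 31, 75, 40, 21, 10, 6, 9, 6, 1]})
    p_values = sp.posthoc_tukey_hsd(df, val_col="values", group_col="groups")
    print(p_values)  # symmetric DataFrame of Tukey HSD p-values
    # a boolean significance matrix can be derived with, e.g., p_values < 0.05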
@@ -1998,7 +2056,7 @@ def posthoc_wilcoxon( -------- >>> x = [[1,2,3,4,5], [35,31,75,40,21], [10,6,9,6,1]] >>> sp.posthoc_wilcoxon(x) - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col, _val_col], ascending=True) if sort else x @@ -2018,7 +2076,8 @@ def posthoc_wilcoxon( xg.get_group(groups[j]), zero_method=zero_method, method=method, - correction=correction)[1] + correction=correction, + )[1] if p_adjust: vs[tri_upper] = multipletests(vs[tri_upper], method=p_adjust)[1] @@ -2028,11 +2087,12 @@ def posthoc_wilcoxon( def posthoc_scheffe( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - sort: bool = False) -> DataFrame: - '''Scheffe's all-pairs comparisons test for normally distributed data with equal + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + sort: bool = False, +) -> DataFrame: + """Scheffe's all-pairs comparisons test for normally distributed data with equal group variances. For all-pairs comparisons in an one-factorial layout with normally distributed residuals and equal variances Scheffe's test can be performed with parametric ANOVA [1]_, [2]_, [3]_. @@ -2084,7 +2144,7 @@ def posthoc_scheffe( >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]}) >>> x = x.melt(var_name='groups', value_name='values') >>> sp.posthoc_scheffe(x, val_col='values', group_col='groups') - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col], ascending=True) if sort else x @@ -2094,12 +2154,12 @@ def posthoc_scheffe( xi = x_grouped.mean() si = x_grouped.var() n = ni.sum() - sin = 1. / (n - groups.size) * np.sum(si * (ni - 1.)) + sin = 1.0 / (n - groups.size) * np.sum(si * (ni - 1.0)) def compare(i, j): dif = xi.loc[i] - xi.loc[j] - A = sin * (1. / ni.loc[i] + 1. / ni.loc[j]) * (groups.size - 1.) - f_val = dif ** 2. / A + A = sin * (1.0 / ni.loc[i] + 1.0 / ni.loc[j]) * (groups.size - 1.0) + f_val = dif**2.0 / A return f_val vs = np.zeros((groups.size, groups.size), dtype=float) @@ -2112,19 +2172,20 @@ def compare(i, j): vs[i, j] = compare(groups[i], groups[j]) vs[tri_lower] = np.transpose(vs)[tri_lower] - p_values = ss.f.sf(vs, groups.size - 1., n - groups.size) + p_values = ss.f.sf(vs, groups.size - 1.0, n - groups.size) np.fill_diagonal(p_values, 1) return DataFrame(p_values, index=groups, columns=groups) def posthoc_tamhane( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - welch: bool = True, - sort: bool = False) -> DataFrame: - '''Tamhane's T2 all-pairs comparison test for normally distributed data with + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + welch: bool = True, + sort: bool = False, +) -> DataFrame: + """Tamhane's T2 all-pairs comparison test for normally distributed data with unequal variances. Tamhane's T2 test can be performed for all-pairs comparisons in an one-factorial layout with normally distributed residuals but unequal groups variances. 
A total of m = k(k-1)/2 hypotheses can be @@ -2177,7 +2238,7 @@ def posthoc_tamhane( >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]}) >>> x = x.melt(var_name='groups', value_name='values') >>> sp.posthoc_tamhane(x, val_col='values', group_col='groups') - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col], ascending=True) if sort else x @@ -2192,25 +2253,35 @@ def compare(i, j): A = si[i] / ni[i] + si[j] / ni[j] t_val = dif / np.sqrt(A) if welch: - df = A ** 2. / (si[i] ** 2. / (ni[i] ** 2. * (ni[i] - 1.)) + - si[j] ** 2. / (ni[j] ** 2. * (ni[j] - 1.))) + df = A**2.0 / ( + si[i] ** 2.0 / (ni[i] ** 2.0 * (ni[i] - 1.0)) + + si[j] ** 2.0 / (ni[j] ** 2.0 * (ni[j] - 1.0)) + ) else: # checks according to Tamhane (1979, p. 474) - ok1 = (9./10. <= ni[i]/ni[j]) and (ni[i]/ni[j] <= 10./9.) - ok2 = (9./10. <= (si[i] / ni[i]) / (si[j] / ni[j])) and\ - ((si[i] / ni[i]) / (si[j] / ni[j]) <= 10./9.) - ok3 = (4./5. <= ni[i]/ni[j]) and (ni[i]/ni[j] <= 5./4.) and\ - (1./2. <= (si[i] / ni[i]) / (si[j] / ni[j])) and\ - ((si[i] / ni[i]) / (si[j] / ni[j]) <= 2.) - ok4 = (2./3. <= ni[i]/ni[j]) and (ni[i]/ni[j] <= 3./2.) and\ - (3./4. <= (si[i] / ni[i]) / (si[j] / ni[j]))\ - and ((si[i] / ni[i]) / (si[j] / ni[j]) <= 4./3.) + ok1 = (9.0 / 10.0 <= ni[i] / ni[j]) and (ni[i] / ni[j] <= 10.0 / 9.0) + ok2 = (9.0 / 10.0 <= (si[i] / ni[i]) / (si[j] / ni[j])) and ( + (si[i] / ni[i]) / (si[j] / ni[j]) <= 10.0 / 9.0 + ) + ok3 = ( + (4.0 / 5.0 <= ni[i] / ni[j]) + and (ni[i] / ni[j] <= 5.0 / 4.0) + and (1.0 / 2.0 <= (si[i] / ni[i]) / (si[j] / ni[j])) + and ((si[i] / ni[i]) / (si[j] / ni[j]) <= 2.0) + ) + ok4 = ( + (2.0 / 3.0 <= ni[i] / ni[j]) + and (ni[i] / ni[j] <= 3.0 / 2.0) + and (3.0 / 4.0 <= (si[i] / ni[i]) / (si[j] / ni[j])) + and ((si[i] / ni[i]) / (si[j] / ni[j]) <= 4.0 / 3.0) + ) OK = any([ok1, ok2, ok3, ok4]) if not OK: print( - "Sample sizes or standard errors are not balanced. T2 test is recommended.") - df = ni[i] + ni[j] - 2. - p_val = 2. * ss.t.sf(np.abs(t_val), df=df) + "Sample sizes or standard errors are not balanced. T2 test is recommended." + ) + df = ni[i] + ni[j] - 2.0 + p_val = 2.0 * ss.t.sf(np.abs(t_val), df=df) return p_val vs = np.zeros((groups.size, groups.size), dtype=float) @@ -2223,7 +2294,7 @@ def compare(i, j): for i, j in combs: vs[i, j] = compare(groups[i], groups[j]) - vs[tri_upper] = 1. - (1. - vs[tri_upper]) ** groups.size + vs[tri_upper] = 1.0 - (1.0 - vs[tri_upper]) ** groups.size vs[tri_lower] = np.transpose(vs)[tri_lower] vs[vs > 1] = 1 @@ -2232,11 +2303,12 @@ def compare(i, j): def posthoc_tukey( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - sort: bool = False) -> DataFrame: - '''Performs Tukey's all-pairs comparisons test for normally distributed data + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + sort: bool = False, +) -> DataFrame: + """Performs Tukey's all-pairs comparisons test for normally distributed data with equal group variances. For all-pairs comparisons in an one-factorial layout with normally distributed residuals and equal variances Tukey's test can be performed. 
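# Illustrative sketch, not part of the patch: the Welch-type t statistic and degrees of
# freedom used inside posthoc_tamhane's compare() above (welch=True), followed by the
# Sidak-type step 1 - (1 - p)**k applied to the upper triangle. The two samples come
# from the docstring example; the group count k is an assumption for demonstration.
import numpy as np
import scipy.stats as ss

a = np.array([1, 2, 3, 5, 1], dtype=float)
b = np.array([12, 31, 54, 62, 12], dtype=float)
k = 3                                            # total number of groups in the layout

A = a.var(ddof=1) / a.size + b.var(ddof=1) / b.size
t_val = (a.mean() - b.mean()) / np.sqrt(A)
df = A**2 / (
    a.var(ddof=1) ** 2 / (a.size**2 * (a.size - 1))
    + b.var(ddof=1) ** 2 / (b.size**2 * (b.size - 1))
)
p_raw = 2.0 * ss.t.sf(np.abs(t_val), df=df)      # two-sided Welch p-value
p_adj = min(1.0, 1.0 - (1.0 - p_raw) ** k)       # adjustment used by posthoc_tamhane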
A total of m = k(k-1)/2 hypotheses can be @@ -2284,7 +2356,7 @@ def posthoc_tukey( >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]}) >>> x = x.melt(var_name='groups', value_name='values') >>> sp.posthoc_tukey(x, val_col='values', group_col='groups') - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col], ascending=True) if sort else x groups = x[_group_col].unique() @@ -2294,11 +2366,11 @@ def posthoc_tukey( n = ni.sum() xi = x_grouped.mean() si = x_grouped.var() - sin = 1. / (n - groups.size) * np.sum(si * (ni - 1)) + sin = 1.0 / (n - groups.size) * np.sum(si * (ni - 1)) def compare(i, j): dif = xi[i] - xi[j] - A = sin * 0.5 * (1. / ni.loc[i] + 1. / ni.loc[j]) + A = sin * 0.5 * (1.0 / ni.loc[i] + 1.0 / ni.loc[j]) q_val = dif / np.sqrt(A) return q_val @@ -2312,8 +2384,7 @@ def compare(i, j): for i, j in combs: vs[i, j] = compare(groups[i], groups[j]) - vs[tri_upper] = psturng(np.abs(vs[tri_upper]), - groups.size, n - groups.size) + vs[tri_upper] = psturng(np.abs(vs[tri_upper]), groups.size, n - groups.size) vs[tri_lower] = np.transpose(vs)[tri_lower] np.fill_diagonal(vs, 1) @@ -2321,11 +2392,12 @@ def compare(i, j): def posthoc_dscf( - a: Union[list, np.ndarray, DataFrame], - val_col: str = None, - group_col: str = None, - sort: bool = False) -> DataFrame: - '''Dwass, Steel, Critchlow and Fligner all-pairs comparison test for a + a: Union[list, np.ndarray, DataFrame], + val_col: Optional[str] = None, + group_col: Optional[str] = None, + sort: bool = False, +) -> DataFrame: + """Dwass, Steel, Critchlow and Fligner all-pairs comparison test for a one-factorial layout with non-normally distributed residuals. As opposed to the all-pairs comparison procedures that depend on Kruskal ranks, the DSCF test is basically an extension of the U-test as re-ranking is conducted for @@ -2379,7 +2451,7 @@ def posthoc_dscf( >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]}) >>> x = x.melt(var_name='groups', value_name='values') >>> sp.posthoc_dscf(x, val_col='values', group_col='groups') - ''' + """ x, _val_col, _group_col = __convert_to_df(a, val_col, group_col) x = x.sort_values(by=[_group_col], ascending=True) if sort else x groups = x[_group_col].unique() @@ -2389,22 +2461,22 @@ def posthoc_dscf( def get_ties(x): t = x.value_counts().values - c = np.sum((t ** 3 - t) / 12.) + c = np.sum((t**3 - t) / 12.0) return c def compare(i, j): ni = n.loc[i] nj = n.loc[j] x_raw = x.loc[(x[_group_col] == i) | (x[_group_col] == j)].copy() - x_raw['ranks'] = x_raw.loc[:, _val_col].rank() - r = x_raw.groupby(_group_col)['ranks'].sum().loc[[j, i]] - u = np.array([nj * ni + (nj * (nj + 1) / 2), - nj * ni + (ni * (ni + 1) / 2)]) - r + x_raw["ranks"] = x_raw.loc[:, _val_col].rank() + r = x_raw.groupby(_group_col)["ranks"].sum().loc[[j, i]] + u = np.array([nj * ni + (nj * (nj + 1) / 2), nj * ni + (ni * (ni + 1) / 2)]) - r u_min = np.min(u) s = ni + nj - var = (nj*ni/(s*(s - 1.))) * \ - ((s**3 - s)/12. - get_ties(x_raw['ranks'])) - p = np.sqrt(2.) * (u_min - nj * ni / 2.) 
/ np.sqrt(var) + var = (nj * ni / (s * (s - 1.0))) * ( + (s**3 - s) / 12.0 - get_ties(x_raw["ranks"]) + ) + p = np.sqrt(2.0) * (u_min - nj * ni / 2.0) / np.sqrt(var) return p vs = np.zeros((k, k)) diff --git a/tests/test_posthocs.py b/tests/test_posthocs.py index d601caf..c546747 100644 --- a/tests/test_posthocs.py +++ b/tests/test_posthocs.py @@ -11,73 +11,89 @@ import numpy as np import matplotlib.axes as ma from pandas import DataFrame, Series -if os.environ.get('DISPLAY', '') == '': - print('No display found. Using non-interactive Agg backend') - mpl.use('Agg') -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +if os.environ.get("DISPLAY", "") == "": + print("No display found. Using non-interactive Agg backend") + mpl.use("Agg") +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -class TestPosthocs(unittest.TestCase): +class TestPosthocs(unittest.TestCase): # Global tests def test_global_simes_test(self): - a = np.array([0.9, 0.1, 0.01, 0.99, 1., 0.02, 0.04]) + a = np.array([0.9, 0.1, 0.01, 0.99, 1.0, 0.02, 0.04]) result = spg.global_simes_test(a) self.assertAlmostEqual(result, 0.07) def test_global_f_test(self): - a = np.array([0.9, 0.1, 0.01, 0.99, 1., 0.02, 0.04]) + a = np.array([0.9, 0.1, 0.01, 0.99, 1.0, 0.02, 0.04]) result, _ = spg.global_f_test(a) self.assertAlmostEqual(result, 0.01294562) # Plotting tests def test_sign_array(self): - - p_values = np.array([[0., 0.00119517, 0.00278329], - [0.00119517, 0., 0.18672227], - [0.00278329, 0.18672227, 0.]]) + p_values = np.array( + [ + [0.0, 0.00119517, 0.00278329], + [0.00119517, 0.0, 0.18672227], + [0.00278329, 0.18672227, 0.0], + ] + ) test_results = splt.sign_array(p_values) correct_results = np.array([[1, 1, 1], [1, 1, 0], [1, 0, 1]]) self.assertTrue(np.all(test_results == correct_results)) def test_sign_table(self): + p_values = np.array( + [ + [1.0, 0.00119517, 0.00278329], + [0.00119517, 1.0, 0.18672227], + [0.00278329, 0.18672227, 1.0], + ] + ) - p_values = np.array([[1., 0.00119517, 0.00278329], - [0.00119517, 1., 0.18672227], - [0.00278329, 0.18672227, 1.]]) - - correct_results = np.array([['-', '**', '**'], - ['**', '-', 'NS'], - ['**', 'NS', '-']], dtype=object) - correct_resultsl = np.array([['-', '', ''], - ['**', '-', ''], - ['**', 'NS', '-']], dtype=object) - correct_resultsu = np.array([['-', '**', '**'], - ['', '-', 'NS'], - ['', '', '-']], dtype=object) + correct_results = np.array( + [["-", "**", "**"], ["**", "-", "NS"], ["**", "NS", "-"]], dtype=object + ) + correct_resultsl = np.array( + [["-", "", ""], ["**", "-", ""], ["**", "NS", "-"]], dtype=object + ) + correct_resultsu = np.array( + [["-", "**", "**"], ["", "-", "NS"], ["", "", "-"]], dtype=object + ) with self.assertRaises(ValueError): splt.sign_table(p_values, lower=False, upper=False) - self.assertTrue(np.all(splt.sign_table(p_values, lower=False, upper=True) == correct_resultsu)) - self.assertTrue(np.all(splt.sign_table(p_values, lower=True, upper=False) == correct_resultsl)) - self.assertTrue(np.all(splt.sign_table(p_values, lower=True, upper=True) == correct_results)) + self.assertTrue( + np.all( + splt.sign_table(p_values, lower=False, upper=True) == correct_resultsu + ) + ) + self.assertTrue( + np.all( + splt.sign_table(p_values, lower=True, upper=False) == correct_resultsl + ) + ) + self.assertTrue( + np.all(splt.sign_table(p_values, lower=True, upper=True) == correct_results) + ) def test_sign_plot(self): - - x = np.array([[1, 1, 1], - [1, 1, 0], - [1, 0, 1]]) + x = np.array([[1, 1, 1], 
[1, 1, 0], [1, 0, 1]]) a = splt.sign_plot(x, flat=True, labels=False) with self.assertRaises(ValueError): splt.sign_plot(x.astype(float), flat=True, labels=False) self.assertTrue(isinstance(a, ma._axes.Axes)) def test_sign_plot_nonflat(self): - - x = np.array([[1., 0.00119517, 0.00278329], - [0.00119517, 1., 0.18672227], - [0.00278329, 0.18672227, 1.]]) + x = np.array( + [ + [1.0, 0.00119517, 0.00278329], + [0.00119517, 1.0, 0.18672227], + [0.00278329, 0.18672227, 1.0], + ] + ) a, cbar = splt.sign_plot(x, cbar=True, labels=False) with self.assertRaises(ValueError): @@ -85,61 +101,63 @@ def test_sign_plot_nonflat(self): with self.assertRaises(ValueError): splt.sign_plot(x.astype(np.int64), labels=False) - self.assertTrue(isinstance(a, ma._axes.Axes) and isinstance(cbar, mpl.colorbar.ColorbarBase)) + self.assertTrue( + isinstance(a, ma._axes.Axes) and isinstance(cbar, mpl.colorbar.ColorbarBase) + ) def test_find_maximal_cliques_input_validation(self): with self.assertRaisesRegex(ValueError, ".*indices do not match"): splt._find_maximal_cliques( DataFrame( [[0, 1], [1, 0]], - index=['a', 'b'], - columns=['b', 'a'], + index=["a", "b"], + columns=["b", "a"], ) ) with self.assertRaises(ValueError, msg="Input matrix must be binary"): splt._find_maximal_cliques(DataFrame([[0, 3], [3, 0]])) - with self.assertRaisesRegex(ValueError, '.*empty and symmetric'): + with self.assertRaisesRegex(ValueError, ".*empty and symmetric"): splt._find_maximal_cliques(DataFrame()) - with self.assertRaisesRegex(ValueError, '.*empty and symmetric'): + with self.assertRaisesRegex(ValueError, ".*empty and symmetric"): splt._find_maximal_cliques(DataFrame([[1, 0], [1, 0]])) def test_find_maximal_cliques_1x1(self): - adj_matrix = DataFrame([[0]], columns=['a'], index=['a']) - expected = [{'a'}] + adj_matrix = DataFrame([[0]], columns=["a"], index=["a"]) + expected = [{"a"}] self.assertEqual(splt._find_maximal_cliques(adj_matrix), expected) def test_find_maximal_cliques_2x2(self): adj_matrix = DataFrame( [[0, 1], [1, 0]], - columns=['a', 'b'], - index=['a', 'b'], + columns=["a", "b"], + index=["a", "b"], ) - expected = [{'a', 'b'}] + expected = [{"a", "b"}] self.assertEqual(splt._find_maximal_cliques(adj_matrix), expected) def test_find_maximal_cliques_3x3(self): adj_matrix = DataFrame( - [[0, 0, 1], - [0, 0, 0], - [1, 0, 0]], - columns=['a', 'b', 'c'], - index=['a', 'b', 'c'], + [[0, 0, 1], [0, 0, 0], [1, 0, 0]], + columns=["a", "b", "c"], + index=["a", "b", "c"], ) - expected = [{'a', 'c'}, {'b'}] + expected = [{"a", "c"}, {"b"}] self.assertEqual( set(map(frozenset, splt._find_maximal_cliques(adj_matrix))), set(map(frozenset, expected)), ) def test_find_maximal_cliques_6x6(self): - adj_matrix = DataFrame([ - [0, 1, 0, 0, 0, 0], - [1, 0, 1, 1, 1, 0], - [0, 1, 0, 1, 1, 0], - [0, 1, 1, 0, 1, 0], - [0, 1, 1, 1, 0, 0], - [0, 0, 0, 0, 0, 0], - ]) + adj_matrix = DataFrame( + [ + [0, 1, 0, 0, 0, 0], + [1, 0, 1, 1, 1, 0], + [0, 1, 0, 1, 1, 0], + [0, 1, 1, 0, 1, 0], + [0, 1, 1, 1, 0, 0], + [0, 0, 0, 0, 0, 0], + ] + ) expected = [{0, 1}, {1, 2, 3, 4}, {5}] self.assertEqual( set(map(frozenset, splt._find_maximal_cliques(adj_matrix))), @@ -147,28 +165,29 @@ def test_find_maximal_cliques_6x6(self): ) def test_cd_diagram_number_of_artists(self): - index = list('abcdef') + index = list("abcdef") ranks = Series([2.1, 1.2, 4.5, 3.2, 5.7, 6.5], index=index) sig_matrix = DataFrame( - [[0.08, 0.08, 0.01, 0.01, 0.01, 0.01], - [0.08, 0.08, 0.08, 0.08, 0.08, 0.01], - [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], - [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], 
- [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], - [0.01, 0.01, 0.01, 0.01, 0.01, 0.08]], + [ + [0.08, 0.08, 0.01, 0.01, 0.01, 0.01], + [0.08, 0.08, 0.08, 0.08, 0.08, 0.01], + [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], + [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], + [0.01, 0.08, 0.08, 0.08, 0.08, 0.01], + [0.01, 0.01, 0.01, 0.01, 0.01, 0.08], + ], index=index, columns=index, ) output = splt.critical_difference_diagram(ranks, sig_matrix) - self.assertEqual(len(output['markers']), len(ranks)) - self.assertEqual(len(output['elbows']), len(ranks)) - self.assertEqual(len(output['labels']), len(ranks)) - self.assertEqual(len(output['crossbars']), 2) + self.assertEqual(len(output["markers"]), len(ranks)) + self.assertEqual(len(output["elbows"]), len(ranks)) + self.assertEqual(len(output["labels"]), len(ranks)) + self.assertEqual(len(output["crossbars"]), 2) # Outliers tests def test_outliers_iqr(self): - x = np.array([4, 5, 6, 10, 12, 4, 3, 1, 2, 3, 23, 5, 3]) x_filtered = np.array([4, 5, 6, 10, 4, 3, 1, 2, 3, 5, 3]) @@ -176,104 +195,246 @@ def test_outliers_iqr(self): outliers_indices = np.array([4, 10]) outliers = np.array([12, 23]) - test_outliers = so.outliers_iqr(x, ret='outliers') - test_outliers_indices = so.outliers_iqr(x, ret='outliers_indices') - test_indices = so.outliers_iqr(x, ret='indices') - test_filtered = so.outliers_iqr(x, ret='filtered') + test_outliers = so.outliers_iqr(x, ret="outliers") + test_outliers_indices = so.outliers_iqr(x, ret="outliers_indices") + test_indices = so.outliers_iqr(x, ret="indices") + test_filtered = so.outliers_iqr(x, ret="filtered") - self.assertTrue(np.all(test_outliers == outliers) - and np.all(test_outliers_indices == outliers_indices) - and np.all(test_indices == indices) - and np.all(test_filtered == x_filtered)) + self.assertTrue( + np.all(test_outliers == outliers) + and np.all(test_outliers_indices == outliers_indices) + and np.all(test_indices == indices) + and np.all(test_filtered == x_filtered) + ) def test_outliers_grubbs(self): - - x = np.array([199.31, 199.53, 200.19, 200.82, - 201.92, 201.95, 202.18, 245.57]) + x = np.array([199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18, 245.57]) test_results = so.outliers_grubbs(x) correct_results = np.array( - [199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18]) + [199.31, 199.53, 200.19, 200.82, 201.92, 201.95, 202.18] + ) self.assertTrue(so.outliers_grubbs(x, hypo=True)) self.assertTrue(np.all(test_results == correct_results)) def test_outliers_tietjen(self): - - x = np.array([-1.40, -0.44, -0.30, -0.24, -0.22, -0.13, -0.05, 0.06, 0.10, - 0.18, 0.20, 0.39, 0.48, 0.63, 1.01]) + x = np.array( + [ + -1.40, + -0.44, + -0.30, + -0.24, + -0.22, + -0.13, + -0.05, + 0.06, + 0.10, + 0.18, + 0.20, + 0.39, + 0.48, + 0.63, + 1.01, + ] + ) test_results = so.outliers_tietjen(x, 2) - correct_results = np.array([-0.44, -0.3, -0.24, -0.22, -0.13, -0.05, 0.06, - 0.1, 0.18, 0.2, 0.39, 0.48, 0.63]) + correct_results = np.array( + [ + -0.44, + -0.3, + -0.24, + -0.22, + -0.13, + -0.05, + 0.06, + 0.1, + 0.18, + 0.2, + 0.39, + 0.48, + 0.63, + ] + ) self.assertTrue(so.outliers_tietjen(x, 2, hypo=True)) self.assertTrue(np.all(test_results == correct_results)) def test_outliers_gesd(self): - - x = np.array([-0.25, 0.68, 0.94, 1.15, 1.2, 1.26, 1.26, 1.34, 1.38, - 1.43, 1.49, 1.49, 1.55, 1.56, 1.58, 1.65, 1.69, 1.7, 1.76, 1.77, - 1.81, 1.91, 1.94, 1.96, 1.99, 2.06, 2.09, 2.1, 2.14, 2.15, 2.23, - 2.24, 2.26, 2.35, 2.37, 2.4, 2.47, 2.54, 2.62, 2.64, 2.9, 2.92, - 2.92, 2.93, 3.21, 3.26, 3.3, 3.59, 3.68, 4.3, 4.64, 5.34, 5.42, - 6.01]) + 
x = np.array( + [ + -0.25, + 0.68, + 0.94, + 1.15, + 1.2, + 1.26, + 1.26, + 1.34, + 1.38, + 1.43, + 1.49, + 1.49, + 1.55, + 1.56, + 1.58, + 1.65, + 1.69, + 1.7, + 1.76, + 1.77, + 1.81, + 1.91, + 1.94, + 1.96, + 1.99, + 2.06, + 2.09, + 2.1, + 2.14, + 2.15, + 2.23, + 2.24, + 2.26, + 2.35, + 2.37, + 2.4, + 2.47, + 2.54, + 2.62, + 2.64, + 2.9, + 2.92, + 2.92, + 2.93, + 3.21, + 3.26, + 3.3, + 3.59, + 3.68, + 4.3, + 4.64, + 5.34, + 5.42, + 6.01, + ] + ) correct_mask = np.zeros_like(x, dtype=bool) correct_mask[-3:] = True test_results = so.outliers_gesd(x, 5) test_mask_results = so.outliers_gesd(x, 5, hypo=True) - correct_results = np.array([-0.25, 0.68, 0.94, 1.15, 1.2, 1.26, - 1.26, 1.34, 1.38, 1.43, 1.49, 1.49, 1.55, 1.56, 1.58, - 1.65, 1.69, 1.7 , 1.76, 1.77, 1.81, 1.91, 1.94, 1.96, - 1.99, 2.06, 2.09, 2.1 , 2.14, 2.15, 2.23, 2.24, 2.26, - 2.35, 2.37, 2.4 , 2.47, 2.54, 2.62, 2.64, 2.9 , 2.92, - 2.92, 2.93, 3.21, 3.26, 3.3 , 3.59, 3.68, 4.3 , 4.64]) + correct_results = np.array( + [ + -0.25, + 0.68, + 0.94, + 1.15, + 1.2, + 1.26, + 1.26, + 1.34, + 1.38, + 1.43, + 1.49, + 1.49, + 1.55, + 1.56, + 1.58, + 1.65, + 1.69, + 1.7, + 1.76, + 1.77, + 1.81, + 1.91, + 1.94, + 1.96, + 1.99, + 2.06, + 2.09, + 2.1, + 2.14, + 2.15, + 2.23, + 2.24, + 2.26, + 2.35, + 2.37, + 2.4, + 2.47, + 2.54, + 2.62, + 2.64, + 2.9, + 2.92, + 2.92, + 2.93, + 3.21, + 3.26, + 3.3, + 3.59, + 3.68, + 4.3, + 4.64, + ] + ) self.assertTrue(isinstance(so.outliers_gesd(x, 5, report=True), np.ndarray)) self.assertTrue(np.all(test_results == correct_results)) self.assertTrue(np.all(test_mask_results == correct_mask)) - # Statistical tests df = sb.load_dataset("exercise") - df[df.columns[df.dtypes == 'category']] = df[df.columns[df.dtypes == 'category']].astype(object) - df_bn = np.array([[4,3,4,4,5,6,3], - [1,2,3,5,6,7,7], - [1,2,6,4,1,5,1]]) + df[df.columns[df.dtypes == "category"]] = df[ + df.columns[df.dtypes == "category"] + ].astype(object) + df_bn = np.array( + [[4, 3, 4, 4, 5, 6, 3], [1, 2, 3, 5, 6, 7, 7], [1, 2, 6, 4, 1, 5, 1]] + ) # DataFrame conversion tests def test_convert_to_block_df(self): - a = np.array([[0, 0, 4], - [1, 0, 1], - [2, 0, 1], - [0, 1, 3], - [1, 1, 2], - [2, 1, 2], - [0, 2, 4], - [1, 2, 3], - [2, 2, 6], - [0, 3, 4], - [1, 3, 5], - [2, 3, 4], - [0, 4, 5], - [1, 4, 6], - [2, 4, 1], - [0, 5, 6], - [1, 5, 7], - [2, 5, 5], - [0, 6, 3], - [1, 6, 7], - [2, 6, 1]], dtype=float) - df_a = DataFrame(a, columns=['blk_col', 'grp_col', 'y_col']) - - result = sp.posthoc_nemenyi_friedman(a, y_col=2, group_col=1, block_col=0, melted=True)[0].values + a = np.array( + [ + [0, 0, 4], + [1, 0, 1], + [2, 0, 1], + [0, 1, 3], + [1, 1, 2], + [2, 1, 2], + [0, 2, 4], + [1, 2, 3], + [2, 2, 6], + [0, 3, 4], + [1, 3, 5], + [2, 3, 4], + [0, 4, 5], + [1, 4, 6], + [2, 4, 1], + [0, 5, 6], + [1, 5, 7], + [2, 5, 5], + [0, 6, 3], + [1, 6, 7], + [2, 6, 1], + ], + dtype=float, + ) + df_a = DataFrame(a, columns=["blk_col", "grp_col", "y_col"]) + + result = sp.posthoc_nemenyi_friedman( + a, y_col=2, group_col=1, block_col=0, melted=True + )[0].values result2 = sp.posthoc_nemenyi_friedman(self.df_bn)[0].values - result3 = sp.posthoc_nemenyi_friedman(df_a, y_col='y_col', group_col='grp_col', block_col='blk_col', melted=True)[0].values + result3 = sp.posthoc_nemenyi_friedman( + df_a, y_col="y_col", group_col="grp_col", block_col="blk_col", melted=True + )[0].values self.assertTrue(np.allclose(result, result2)) self.assertTrue(np.allclose(result, result3)) self.assertTrue(np.allclose(result2, result3)) # Omnibox tests def test_osrt(self): - df = 
DataFrame(dict(zip(['a', 'b', 'c'], self.df_bn.tolist()))).melt() - p,_,_ = som.test_osrt(df, val_col='value', group_col='variable') + df = DataFrame(dict(zip(["a", "b", "c"], self.df_bn.tolist()))).melt() + p, _, _ = som.test_osrt(df, val_col="value", group_col="variable") result = 0.3157646 - self.assertTrue(np.allclose(p, result, atol=1.e-3)) + self.assertTrue(np.allclose(p, result, atol=1.0e-3)) def test_durbin(self): r_result = np.array([0.205758, 8.468354, 6]) @@ -281,337 +442,744 @@ def test_durbin(self): self.assertTrue(np.allclose(result, r_result)) def test_mackwolfe(self): - x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]] + x = [ + [22, 23, 35], + [60, 59, 54], + [98, 78, 50], + [60, 82, 59], + [22, 44, 33], + [23, 21, 25], + ] result, _ = som.test_mackwolfe(x, p=2) - self.assertFalse(som.test_mackwolfe(x, p=20)) - self.assertFalse(som.test_mackwolfe(x, p=0)) + self.assertEqual(som.test_mackwolfe(x, p=20), (np.nan, np.nan)) + self.assertEqual(som.test_mackwolfe(x, p=0), (np.nan, np.nan)) self.assertTrue(np.allclose(result, 0.0006812725)) def test_mackwolfe_nperm(self): - x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]] + x = [ + [22, 23, 35], + [60, 59, 54], + [98, 78, 50], + [60, 82, 59], + [22, 44, 33], + [23, 21, 25], + ] _, stat = som.test_mackwolfe(x, n_perm=50) self.assertTrue(np.allclose(stat, 3.2024699769846983)) - # Post hoc tests def test_posthoc_anderson(self): + r_results = np.array( + [ + [1, 1.35079e-02, 8.64418e-09], + [1.35079e-02, 1, 1.644534e-05], + [8.64418e-09, 1.644534e-05, 1], + ] + ) - r_results = np.array([[1, 1.35079e-02, 8.64418e-09], - [1.35079e-02, 1, 1.644534e-05], - [8.64418e-09, 1.644534e-05, 1]]) - - results = sp.posthoc_anderson(self.df, val_col = 'pulse', group_col = 'kind', p_adjust = 'holm') - self.assertTrue(np.allclose(results.values, r_results, atol=3.e-3)) + results = sp.posthoc_anderson( + self.df, val_col="pulse", group_col="kind", p_adjust="holm" + ) + self.assertTrue(np.allclose(results.values, r_results, atol=3.0e-3)) def test_posthoc_conover(self): + r_results = np.array( + [ + [1, 9.354690e-11, 1.131263e-02], + [9.354690e-11, 1, 5.496288e-06], + [1.131263e-02, 5.496288e-06, 1], + ] + ) - r_results = np.array([[1, 9.354690e-11, 1.131263e-02], - [9.354690e-11, 1, 5.496288e-06], - [1.131263e-02, 5.496288e-06, 1]]) - - results = sp.posthoc_conover(self.df, val_col = 'pulse', group_col = 'kind', p_adjust = 'holm').values + results = sp.posthoc_conover( + self.df, val_col="pulse", group_col="kind", p_adjust="holm" + ).values self.assertTrue(np.allclose(results, r_results)) - def test_posthoc_dunn(self): + r_results = np.array( + [ + [1, 9.570998e-09, 4.390066e-02], + [9.570998e-09, 1, 1.873208e-04], + [4.390066e-02, 1.873208e-04, 1], + ] + ) - r_results = np.array([[1, 9.570998e-09, 4.390066e-02], - [9.570998e-09, 1, 1.873208e-04], - [4.390066e-02, 1.873208e-04, 1]]) - - results = sp.posthoc_dunn(self.df, val_col = 'pulse', group_col = 'kind', p_adjust = 'holm').values + results = sp.posthoc_dunn( + self.df, val_col="pulse", group_col="kind", p_adjust="holm" + ).values self.assertTrue(np.allclose(results, r_results)) def test_posthoc_nemenyi(self): + r_results = np.array( + [ + [1, 2.431833e-08, 1.313107e-01], + [2.431833e-08, 1, 4.855675e-04], + [1.313107e-01, 4.855675e-04, 1], + ] + ) - r_results = np.array([[1, 2.431833e-08, 1.313107e-01], - [2.431833e-08, 1, 4.855675e-04], - [1.313107e-01, 4.855675e-04, 1]]) - - results = sp.posthoc_nemenyi(self.df, 
val_col = 'pulse', group_col = 'kind').values + results = sp.posthoc_nemenyi(self.df, val_col="pulse", group_col="kind").values self.assertTrue(np.allclose(results, r_results)) def test_posthoc_nemenyi_tukey(self): + r_results = np.array( + [ + [1, 9.793203e-09, 1.088785e-01], + [9.793203e-09, 1, 0.0002789016], + [1.088785e-01, 0.0002789016, 1], + ] + ) - r_results = np.array([[1, 9.793203e-09, 1.088785e-01], - [9.793203e-09, 1, 0.0002789016], - [1.088785e-01, 0.0002789016, 1]]) - - results = sp.posthoc_nemenyi(self.df, val_col = 'pulse', group_col = 'kind', dist = 'tukey').values - self.assertTrue(np.allclose(results, r_results, atol=1.e-3)) - + results = sp.posthoc_nemenyi( + self.df, val_col="pulse", group_col="kind", dist="tukey" + ).values + print(results) + self.assertTrue(np.allclose(results, r_results, atol=1.0e-3)) def test_posthoc_nemenyi_friedman(self): - - p_results = np.array([[1., 0.9, 0.82163255, 0.9, 0.9, 0.21477876, 0.9], - [0.9, 1., 0.87719193, 0.9, 0.9, 0.25967965, 0.9], - [0.82163255, 0.87719193, 1., 0.9, 0.9, 0.9, 0.9], - [0.9, 0.9, 0.9, 1., 0.9, 0.87719193, 0.9], - [0.9, 0.9, 0.9, 0.9, 1., 0.87719193, 0.9], - [0.21477876, 0.25967965, 0.9, 0.87719193, 0.87719193, 1., 0.54381888], - [0.9, 0.9, 0.9, 0.9, 0.9, 0.54381888, 1.]]) + p_results = np.array( + [ + [ + 1.0, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + [ + 0.9999999, + 1.0, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + [ + 0.8414506, + 0.8833015, + 1.0, + np.nan, + np.nan, + np.nan, + np.nan, + ], + [0.9177741, 0.9449086, 0.9999962, 1.0, np.nan, np.nan, np.nan], + [0.9177741, 0.9449086, 0.9999962, 1.0000000, 1.0, np.nan, np.nan], + [0.2147827, 0.2597539, 0.9449086, 0.8833015, 0.8833015, 1.0, np.nan], + [0.9976902, 0.9991770, 0.9888953, 0.9976902, 0.9976902, 0.5511935, 1.0], + ] + ) + tri_upper = np.triu_indices(p_results.shape[0], 1) + p_results[tri_upper] = np.transpose(p_results)[tri_upper] results = sp.posthoc_nemenyi_friedman(self.df_bn) + print(p_results) + print(results) self.assertTrue(np.allclose(results, p_results)) - def test_posthoc_conover_friedman(self): - results = sp.posthoc_conover_friedman(self.df_bn) - p_results = np.array([[1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - [0.9147508, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], - [0.1518030, 0.18071036, 1.0000000, np.nan, np.nan, np.nan, np.nan], - [0.2140927, 0.25232845, 0.8305955, 1.000000, np.nan, np.nan, np.nan], - [0.2140927, 0.25232845, 0.8305955, 1.000000, 1.000000, np.nan, np.nan], - [0.0181602, 0.02222747, 0.2523284, 0.1807104, 0.1807104, 1.00009000, np.nan], - [0.5242303, 0.59465124, 0.3989535, 0.5242303, 0.5242303, 0.05991984, 1.000000]]) + p_results = np.array( + [ + [1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [0.9147508, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], + [0.1518030, 0.18071036, 1.0000000, np.nan, np.nan, np.nan, np.nan], + [0.2140927, 0.25232845, 0.8305955, 1.000000, np.nan, np.nan, np.nan], + [0.2140927, 0.25232845, 0.8305955, 1.000000, 1.000000, np.nan, np.nan], + [ + 0.0181602, + 0.02222747, + 0.2523284, + 0.1807104, + 0.1807104, + 1.00009000, + np.nan, + ], + [ + 0.5242303, + 0.59465124, + 0.3989535, + 0.5242303, + 0.5242303, + 0.05991984, + 1.000000, + ], + ] + ) tri_upper = np.triu_indices(p_results.shape[0], 1) p_results[tri_upper] = np.transpose(p_results)[tri_upper] np.fill_diagonal(p_results, 1) self.assertTrue(np.allclose(results, p_results)) def test_posthoc_conover_friedman_tukey(self): - - results = 
sp.posthoc_conover_friedman(self.df_bn, p_adjust='single-step') - p_results = np.array([[1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - [0.99999986, 1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan], - [0.72638075, 0.7905289, 1.0000000, np.nan, np.nan, np.nan, np.nan], - [0.84667448, 0.8934524, 0.9999910, 1.0000000, np.nan, np.nan, np.nan], - [0.84667448, 0.8934524, 0.9999910, 1.0000000, 1.0000000, np.nan, np.nan], - [0.09013677, 0.1187580, 0.8934524, 0.7905289, 0.7905289, 1.0000000, np.nan], - [0.99482447, 0.9981178, 0.9763466, 0.9948245, 0.9948245, 0.3662675, 1.000000]]) + results = sp.posthoc_conover_friedman(self.df_bn, p_adjust="single-step") + p_results = np.array( + [ + [1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [0.99999986, 1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan], + [0.72638075, 0.7905289, 1.0000000, np.nan, np.nan, np.nan, np.nan], + [0.84667448, 0.8934524, 0.9999910, 1.0000000, np.nan, np.nan, np.nan], + [ + 0.84667448, + 0.8934524, + 0.9999910, + 1.0000000, + 1.0000000, + np.nan, + np.nan, + ], + [ + 0.09013677, + 0.1187580, + 0.8934524, + 0.7905289, + 0.7905289, + 1.0000000, + np.nan, + ], + [ + 0.99482447, + 0.9981178, + 0.9763466, + 0.9948245, + 0.9948245, + 0.3662675, + 1.000000, + ], + ] + ) p_results[p_results > 0.9] = 0.9 tri_upper = np.triu_indices(p_results.shape[0], 1) p_results[tri_upper] = np.transpose(p_results)[tri_upper] np.fill_diagonal(p_results, 1) - self.assertTrue(np.allclose(results, p_results, atol=3.e-2)) + self.assertTrue(np.allclose(results, p_results, atol=3.0e-2)) def test_posthoc_conover_friedman_non_melted(self): - df = DataFrame(self.df_bn) results = sp.posthoc_conover_friedman(df, melted=False) - p_results = np.array([[1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - [0.9147508, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], - [0.1518030, 0.18071036, 1.0000000, np.nan, np.nan, np.nan, np.nan], - [0.2140927, 0.25232845, 0.8305955, 1.000000, np.nan, np.nan, np.nan], - [0.2140927, 0.25232845, 0.8305955, 1.000000, 1.000000, np.nan, np.nan], - [0.0181602, 0.02222747, 0.2523284, 0.1807104, 0.1807104, 1.00009000, np.nan], - [0.5242303, 0.59465124, 0.3989535, 0.5242303, 0.5242303, 0.05991984, 1.000000]]) + p_results = np.array( + [ + [1.0000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [0.9147508, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], + [0.1518030, 0.18071036, 1.0000000, np.nan, np.nan, np.nan, np.nan], + [0.2140927, 0.25232845, 0.8305955, 1.000000, np.nan, np.nan, np.nan], + [0.2140927, 0.25232845, 0.8305955, 1.000000, 1.000000, np.nan, np.nan], + [ + 0.0181602, + 0.02222747, + 0.2523284, + 0.1807104, + 0.1807104, + 1.00009000, + np.nan, + ], + [ + 0.5242303, + 0.59465124, + 0.3989535, + 0.5242303, + 0.5242303, + 0.05991984, + 1.000000, + ], + ] + ) tri_upper = np.triu_indices(p_results.shape[0], 1) p_results[tri_upper] = np.transpose(p_results)[tri_upper] np.fill_diagonal(p_results, 1) self.assertTrue(np.allclose(results, p_results)) - def test_posthoc_miller_friedman(self): - results = sp.posthoc_miller_friedman(self.df_bn) - p_results = np.array([[1.0, 1.0, 0.9411963, 0.9724396000000001, 0.9724396000000001, 0.4717981, 0.9993864], - [1.0, 1.0, 0.9588993, 0.9823818000000001, 0.9823818000000001, 0.5256257, 0.9997869], - [0.9411963, 0.9588993, 1.0, 0.9999991, 0.9999991, 0.9823818000000001, 0.9968575999999999], - [0.9724396000000001, 0.9823818000000001, 0.9999991, 1.0, 1.0, 0.9588993, 0.9993864], - [0.9724396000000001, 0.9823818000000001, 0.9999991, 1.0, 1.0, 
0.9588993, 0.9993864], - [0.4717981, 0.5256257, 0.9823818000000001, 0.9588993, 0.9588993, 1.0, 0.7803545999999999], - [0.9993864, 0.9997869, 0.9968575999999999, 0.9993864, 0.9993864, 0.7803545999999999, 1.0]]) + p_results = np.array( + [ + [ + 1.0, + 1.0, + 0.9411963, + 0.9724396000000001, + 0.9724396000000001, + 0.4717981, + 0.9993864, + ], + [ + 1.0, + 1.0, + 0.9588993, + 0.9823818000000001, + 0.9823818000000001, + 0.5256257, + 0.9997869, + ], + [ + 0.9411963, + 0.9588993, + 1.0, + 0.9999991, + 0.9999991, + 0.9823818000000001, + 0.9968575999999999, + ], + [ + 0.9724396000000001, + 0.9823818000000001, + 0.9999991, + 1.0, + 1.0, + 0.9588993, + 0.9993864, + ], + [ + 0.9724396000000001, + 0.9823818000000001, + 0.9999991, + 1.0, + 1.0, + 0.9588993, + 0.9993864, + ], + [ + 0.4717981, + 0.5256257, + 0.9823818000000001, + 0.9588993, + 0.9588993, + 1.0, + 0.7803545999999999, + ], + [ + 0.9993864, + 0.9997869, + 0.9968575999999999, + 0.9993864, + 0.9993864, + 0.7803545999999999, + 1.0, + ], + ] + ) self.assertTrue(np.allclose(results, p_results)) - def test_posthoc_siegel_friedman(self): - results = sp.posthoc_siegel_friedman(self.df_bn) - p_results = np.array([[1.000000, 0.92471904, 0.18587673, 0.25683926, 0.25683926, 0.01816302, 0.57075039], - [0.92471904, 1.0000000, 0.2193026, 0.2986177, 0.2986177, 0.0233422, 0.6366016], - [0.18587673, 0.2193026, 1.0000000, 0.8501067, 0.8501067, 0.2986177, 0.4496918], - [0.25683926, 0.2986177, 0.8501067, 1.000000, 1.0000000, 0.2193026, 0.5707504], - [0.25683926, 0.2986177, 0.8501067, 1.0000000, 1.0000000, 0.2193026, 0.5707504], - [0.01816302, 0.0233422, 0.2986177, 0.2193026, 0.2193026, 1.000000, 0.07260094], - [0.57075039, 0.6366016, 0.4496918, 0.5707504, 0.5707504, 0.07260094, 1.000000]]) + p_results = np.array( + [ + [ + 1.000000, + 0.92471904, + 0.18587673, + 0.25683926, + 0.25683926, + 0.01816302, + 0.57075039, + ], + [ + 0.92471904, + 1.0000000, + 0.2193026, + 0.2986177, + 0.2986177, + 0.0233422, + 0.6366016, + ], + [ + 0.18587673, + 0.2193026, + 1.0000000, + 0.8501067, + 0.8501067, + 0.2986177, + 0.4496918, + ], + [ + 0.25683926, + 0.2986177, + 0.8501067, + 1.000000, + 1.0000000, + 0.2193026, + 0.5707504, + ], + [ + 0.25683926, + 0.2986177, + 0.8501067, + 1.0000000, + 1.0000000, + 0.2193026, + 0.5707504, + ], + [ + 0.01816302, + 0.0233422, + 0.2986177, + 0.2193026, + 0.2193026, + 1.000000, + 0.07260094, + ], + [ + 0.57075039, + 0.6366016, + 0.4496918, + 0.5707504, + 0.5707504, + 0.07260094, + 1.000000, + ], + ] + ) self.assertTrue(np.allclose(results, p_results)) - def test_posthoc_durbin(self): - results = sp.posthoc_durbin(self.df_bn, p_adjust = 'holm') - - p_results = np.array([[1.000000, 1.000000, 1.0, 1.0, 1.0, 0.381364, 1.0], - [1.000000, 1.000000, 1.0, 1.0, 1.0, 0.444549, 1.0], - [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], - [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], - [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], - [0.381364, 0.444549, 1.0, 1.0, 1.0, 1.000000, 1.0], - [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0]]) + results = sp.posthoc_durbin(self.df_bn, p_adjust="holm") + + p_results = np.array( + [ + [1.000000, 1.000000, 1.0, 1.0, 1.0, 0.381364, 1.0], + [1.000000, 1.000000, 1.0, 1.0, 1.0, 0.444549, 1.0], + [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], + [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], + [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], + [0.381364, 0.444549, 1.0, 1.0, 1.0, 1.000000, 1.0], + [1.000000, 1.000000, 1.0, 1.0, 1.0, 1.000000, 1.0], + ] + ) self.assertTrue(np.allclose(results, 
p_results)) def test_posthoc_quade(self): results = sp.posthoc_quade(self.df_bn) - p_results = np.array([[1.00000000, 0.67651326, 0.15432143, 0.17954686, 0.2081421 , 0.02267043, 0.2081421], - [ 0.67651326,1.00000000, 0.29595042, 0.33809987, 0.38443835, 0.0494024 , 0.38443835], - [ 0.15432143, 0.29595042,1.00000000, 0.92586499, 0.85245022, 0.29595042, 0.85245022], - [ 0.17954686, 0.33809987, 0.92586499,1.00000000, 0.92586499, 0.25789648, 0.92586499], - [ 0.2081421 , 0.38443835, 0.85245022, 0.92586499,1.00000000, 0.22378308, 1.00000000], - [ 0.02267043, 0.0494024 , 0.29595042, 0.25789648, 0.22378308,1.00000000, 0.22378308], - [ 0.2081421 , 0.38443835, 0.85245022, 0.92586499, 1.00000000, 0.22378308,1.00000000]]) + p_results = np.array( + [ + [ + 1.00000000, + 0.67651326, + 0.15432143, + 0.17954686, + 0.2081421, + 0.02267043, + 0.2081421, + ], + [ + 0.67651326, + 1.00000000, + 0.29595042, + 0.33809987, + 0.38443835, + 0.0494024, + 0.38443835, + ], + [ + 0.15432143, + 0.29595042, + 1.00000000, + 0.92586499, + 0.85245022, + 0.29595042, + 0.85245022, + ], + [ + 0.17954686, + 0.33809987, + 0.92586499, + 1.00000000, + 0.92586499, + 0.25789648, + 0.92586499, + ], + [ + 0.2081421, + 0.38443835, + 0.85245022, + 0.92586499, + 1.00000000, + 0.22378308, + 1.00000000, + ], + [ + 0.02267043, + 0.0494024, + 0.29595042, + 0.25789648, + 0.22378308, + 1.00000000, + 0.22378308, + ], + [ + 0.2081421, + 0.38443835, + 0.85245022, + 0.92586499, + 1.00000000, + 0.22378308, + 1.00000000, + ], + ] + ) self.assertTrue(np.allclose(results, p_results)) def test_posthoc_quade_norm(self): - - results = sp.posthoc_quade(self.df_bn, dist='normal') - - p_results = np.array([[1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - [ 0.5693540320,1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], - [ 0.0430605548, 0.145913303,1.00000000, np.nan, np.nan, np.nan, np.nan], - [ 0.0578705783, 0.184285855, 0.8993796,1.00000000, np.nan, np.nan, np.nan], - [ 0.0766885196, 0.229662468, 0.8003530, 0.8993796,1.00000000, np.nan, np.nan], - [ 0.0005066018, 0.003634715, 0.1459133, 0.1139777, 0.08782032,1.00000000, np.nan], - [ 0.0766885196, 0.229662468, 0.8003530, 0.8993796, 1.00000000, 0.08782032,1.00000000]]) + results = sp.posthoc_quade(self.df_bn, dist="normal") + + p_results = np.array( + [ + [1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [0.5693540320, 1.00000000, np.nan, np.nan, np.nan, np.nan, np.nan], + [0.0430605548, 0.145913303, 1.00000000, np.nan, np.nan, np.nan, np.nan], + [ + 0.0578705783, + 0.184285855, + 0.8993796, + 1.00000000, + np.nan, + np.nan, + np.nan, + ], + [ + 0.0766885196, + 0.229662468, + 0.8003530, + 0.8993796, + 1.00000000, + np.nan, + np.nan, + ], + [ + 0.0005066018, + 0.003634715, + 0.1459133, + 0.1139777, + 0.08782032, + 1.00000000, + np.nan, + ], + [ + 0.0766885196, + 0.229662468, + 0.8003530, + 0.8993796, + 1.00000000, + 0.08782032, + 1.00000000, + ], + ] + ) tri_upper = np.triu_indices(p_results.shape[0], 1) p_results[tri_upper] = np.transpose(p_results)[tri_upper] self.assertTrue(np.allclose(results, p_results)) def test_posthoc_npm_test(self): - results = sp.posthoc_npm_test(self.df_bn) + data = np.array( + [ + [2.4, 3, 3, 2.2, 2.2, 2.2, 2.2, 2.8, 2, 3], + [2.8, 2.2, 3.8, 9.4, 8.4, 3, 3.2, 4.4, 3.2, 7.4], + [9.8, 3.2, 5.8, 7.8, 2.6, 2.2, 6.2, 9.4, 7.8, 3.4], + [7, 9.8, 9.4, 8.8, 8.8, 3.4, 9, 8.4, 2.4, 7.8], + ] + ) - p_results = np.array([[1., .9, .9], - [.9, 1., .9], - [.9, .9, 1.]]) + results = sp.posthoc_npm_test(data) - self.assertTrue(np.allclose(results, p_results)) + 
print(results) + p_results = np.array( + [ + [1.0, 0.0077, 0.0020, 2e-16], + [0.0077, 1.0, 0.2884, 0.0854], + [0.0020, 0.2884, 1.0, 0.1385], + [2e-16, 0.0854, 0.1385, 1.0], + ] + ) + self.assertTrue(np.allclose(results, p_results, rtol=1)) def test_posthoc_vanwaerden(self): - r_results = np.array([[1, 1.054709e-02, 6.476665e-11], - [1.054709e-02, 1, 4.433141e-06], - [6.476665e-11, 4.433141e-06, 1]]) + r_results = np.array( + [ + [1, 1.054709e-02, 6.476665e-11], + [1.054709e-02, 1, 4.433141e-06], + [6.476665e-11, 4.433141e-06, 1], + ] + ) - results = sp.posthoc_vanwaerden(self.df, val_col = 'pulse', group_col = 'kind', p_adjust='holm') + results = sp.posthoc_vanwaerden( + self.df, val_col="pulse", group_col="kind", p_adjust="holm" + ) self.assertTrue(np.allclose(results, r_results)) - def test_posthoc_dscf(self): - r_results = np.array([[1, 4.430682e-02, 9.828003e-08], - [4.430682e-02, 1, 5.655274e-05], - [9.828003e-08, 5.655274e-05, 1]]) + r_results = np.array( + [ + [1, 4.430682e-02, 9.828003e-08], + [4.430682e-02, 1, 5.655274e-05], + [9.828003e-08, 5.655274e-05, 1], + ] + ) - results = sp.posthoc_dscf(self.df, val_col = 'pulse', group_col = 'kind') + results = sp.posthoc_dscf(self.df, val_col="pulse", group_col="kind") self.assertTrue(np.allclose(results, r_results, atol=0.001)) - def test_posthoc_ttest(self): + r_results = np.array( + [ + [1, 9.757069e-03, 4.100954e-07], + [9.757069e-03, 1, 1.556010e-05], + [4.100954e-07, 1.556010e-05, 1], + ] + ) - r_results = np.array([[1, 9.757069e-03, 4.100954e-07], - [9.757069e-03, 1, 1.556010e-05], - [4.100954e-07, 1.556010e-05, 1]]) - - results = sp.posthoc_ttest(self.df, val_col = 'pulse', group_col = 'kind', equal_var = False, p_adjust='holm') + results = sp.posthoc_ttest( + self.df, val_col="pulse", group_col="kind", equal_var=False, p_adjust="holm" + ) self.assertTrue(np.allclose(results, r_results)) def test_posthoc_ttest_pooled(self): + x = [[1, 2, 3, 5, 1], [12, 31, 54, 50, 40], [10, 12, 6, 74, 11]] + r_results = np.array( + [ + [1, 0.04226866, 0.24706893], + [0.04226866, 1, 0.2482456], + [0.24706893, 0.2482456, 1], + ] + ) - x = [[1,2,3,5,1], [12,31,54,50,40], [10,12,6,74,11]] - r_results = np.array([[1, 0.04226866, 0.24706893], - [0.04226866, 1, 0.2482456], - [0.24706893, 0.2482456, 1]]) - - results = sp.posthoc_ttest(x, equal_var = False, p_adjust='holm', pool_sd=True) + results = sp.posthoc_ttest(x, equal_var=False, p_adjust="holm", pool_sd=True) self.assertTrue(np.allclose(results, r_results)) - def test_posthoc_tukey_hsd(self): - - x = [[1,2,3,4,5], [35,31,75,40,21], [10,6,9,6,1]] - g = [['a'] * 5, ['b'] * 5, ['c'] * 5] - results = sp.posthoc_tukey_hsd(np.concatenate(x), np.concatenate(g)) - n_results = np.array([[1, 1, 0], [1, 1, 1], [0, 1, 1]]) + x = [[1, 2, 3, 4, 5], [35, 31, 75, 40, 21], [10, 6, 9, 6, 1]] + results = sp.posthoc_tukey_hsd(x) + n_results = np.array( + [ + [1.0, 0.000991287, 0.897449027], + [0.000991287, 1.0, 0.00210909], + [0.897449027, 0.00210909, 1.0], + ] + ) self.assertTrue(np.allclose(n_results, results)) - def test_posthoc_mannwhitney(self): + r_results = np.array( + [ + [1, 3.420508e-08, 1.714393e-02], + [3.420508e-08, 1, 1.968352e-05], + [1.714393e-02, 1.968352e-05, 1], + ] + ) - r_results = np.array([[1, 3.420508e-08, 1.714393e-02], - [3.420508e-08, 1, 1.968352e-05], - [1.714393e-02, 1.968352e-05, 1]]) - - results = sp.posthoc_mannwhitney(self.df, val_col = 'pulse', group_col = 'kind').values + results = sp.posthoc_mannwhitney( + self.df, val_col="pulse", group_col="kind" + ).values 
self.assertTrue(np.allclose(results, r_results)) def test_posthoc_mannwhitney_ndarray(self): - - _x = [[1,2,3,5,1], [12,31,54,50,40], [10,12,6,74,11]] + _x = [[1, 2, 3, 5, 1], [12, 31, 54, 50, 40], [10, 12, 6, 74, 11]] x = np.array(_x) - g = np.repeat([0,1,2], 5) + g = np.repeat([0, 1, 2], 5) nd = np.column_stack((x.ravel(), g)) - xdf = DataFrame(dict(zip(list('abc'), _x))).melt(var_name='groups', value_name='vals') - results = sp.posthoc_mannwhitney(xdf, val_col = 'vals', group_col = 'groups').values + xdf = DataFrame(dict(zip(list("abc"), _x))).melt( + var_name="groups", value_name="vals" + ) + results = sp.posthoc_mannwhitney(xdf, val_col="vals", group_col="groups").values nd_results = sp.posthoc_mannwhitney(nd, val_col=0, group_col=1).values self.assertTrue(np.allclose(nd_results, results)) - def test_posthoc_wilcoxon(self): + r_results = np.array( + [ + [1, 2.337133e-03, 2.857818e-06], + [2.337133e-03, 1, 1.230888e-05], + [2.857818e-06, 1.230888e-05, 1], + ] + ) - r_results = np.array([[1, 2.337133e-03, 2.857818e-06], - [2.337133e-03, 1, 1.230888e-05], - [2.857818e-06, 1.230888e-05, 1]]) - - results = sp.posthoc_wilcoxon(self.df.sort_index(), val_col = 'pulse', group_col = 'kind') + results = sp.posthoc_wilcoxon( + self.df.sort_index(), val_col="pulse", group_col="kind" + ) self.assertTrue(np.allclose(results, r_results, atol=1e-4)) - def test_posthoc_scheffe(self): + r_results = np.array( + [ + [1.0, 3.378449e-01, 3.047472e-10], + [3.378449e-01, 1.0, 2.173209e-07], + [3.047472e-10, 2.173209e-07, 1.0], + ] + ) - r_results = np.array([[1., 3.378449e-01, 3.047472e-10], - [3.378449e-01, 1., 2.173209e-07], - [3.047472e-10, 2.173209e-07, 1.]]) - - results = sp.posthoc_scheffe(self.df.sort_index(), val_col = 'pulse', group_col = 'kind') + results = sp.posthoc_scheffe( + self.df.sort_index(), val_col="pulse", group_col="kind" + ) self.assertTrue(np.allclose(results, r_results)) - def test_posthoc_tamhane(self): + r_results = np.array( + [ + [1, 2.898653e-02, 4.100954e-07], + [2.898653e-02, 1, 2.333996e-05], + [4.100954e-07, 2.333996e-05, 1], + ] + ) - r_results = np.array([[1, 2.898653e-02, 4.100954e-07], - [2.898653e-02, 1, 2.333996e-05], - [4.100954e-07, 2.333996e-05, 1]]) - - results = sp.posthoc_tamhane(self.df.sort_index(), val_col = 'pulse', group_col = 'kind') + results = sp.posthoc_tamhane( + self.df.sort_index(), val_col="pulse", group_col="kind" + ) self.assertTrue(np.allclose(results, r_results)) def test_posthoc_tamhane_nw(self): + r_results = np.array( + [ + [1, 2.883219e-02, 4.780682e-08], + [2.883219e-02, 1, 8.643683e-06], + [4.780682e-08, 8.643683e-06, 1], + ] + ) - r_results = np.array([[1, 2.883219e-02, 4.780682e-08], - [2.883219e-02, 1, 8.643683e-06], - [4.780682e-08, 8.643683e-06, 1]]) - - results = sp.posthoc_tamhane(self.df.sort_index(), val_col = 'pulse', group_col = 'kind', welch = False) + results = sp.posthoc_tamhane( + self.df.sort_index(), val_col="pulse", group_col="kind", welch=False + ) self.assertTrue(np.allclose(results, r_results)) - def test_posthoc_tukey(self): - r_results = np.array([[1, 3.042955e-01, 4.308631e-10], - [3.042955e-01, 1, 9.946571e-08], - [4.308631e-10, 9.946571e-08, 1]]) - - results = sp.posthoc_tukey(self.df.sort_index(), val_col = 'pulse', group_col = 'kind') - self.assertTrue(np.allclose(results, r_results, atol=1.e-3)) + r_results = np.array( + [ + [1, 3.042955e-01, 4.308631e-10], + [3.042955e-01, 1, 9.946571e-08], + [4.308631e-10, 9.946571e-08, 1], + ] + ) + results = sp.posthoc_tukey( + self.df.sort_index(), val_col="pulse", 
group_col="kind" + ) + self.assertTrue(np.allclose(results, r_results, atol=1.0e-3)) def test_posthoc_dunnett(self): r_results = [8.125844e-11, 2.427434e-01] - results = sp.posthoc_dunnett(self.df.sort_index(), val_col='pulse', group_col='kind', - control='rest', to_matrix=False) + results = sp.posthoc_dunnett( + self.df.sort_index(), + val_col="pulse", + group_col="kind", + control="rest", + to_matrix=False, + ) # scipy use randomized Quasi-Monte Carlo integration of the multivariate-t distribution # to compute the p-values. The result may vary slightly from run to run. # we run the test 1000 times (maximum absolute tolerance = 1.e-4 for example data) is_close = [] for i in range(1000): - is_close.append(np.allclose(results, r_results, atol=1.e-4)) + is_close.append(np.allclose(results, r_results, atol=1.0e-4)) is_close = all(is_close) self.assertTrue(is_close) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() -
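# Illustrative follow-up, not part of the patch: as the updated omnibus tests above
# assert, test_mackwolfe now returns (nan, nan) rather than False when the peak index
# `p` is out of range, so callers should check for NaN explicitly. The observations
# below are the same toy data used in test_mackwolfe.
import numpy as np
import scikit_posthocs as sp

x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]]
p_value, stat = sp.test_mackwolfe(x, p=20)   # p exceeds the number of groups (k = 6)
assert np.isnan(p_value) and np.isnan(stat)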