diff --git a/.github/workflows/python-runtests-basic.yml b/.github/workflows/python-runtests-basic.yml index 74bb0753..178f01cf 100644 --- a/.github/workflows/python-runtests-basic.yml +++ b/.github/workflows/python-runtests-basic.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v2 diff --git a/wqio/datacollections.py b/wqio/datacollections.py index 923aa563..6908a4b8 100644 --- a/wqio/datacollections.py +++ b/wqio/datacollections.py @@ -20,10 +20,11 @@ _Stat = namedtuple("_stat", ["stat", "pvalue"]) -def _dist_compare(x, y, stat_comp_func): +def _dist_compare(x, y, stat_comp_func, **test_opts): if (len(x) == len(y)) and numpy.equal(x, y).all(): return _Stat(numpy.nan, numpy.nan) - return stat_comp_func(x, y, alternative="two-sided") + + return stat_comp_func(x, y, **test_opts) class DataCollection: @@ -130,6 +131,8 @@ def __init__( **{self.cencol: dataframe[self.qualcol].isin(self.ndval)} ).reset_index() + self.pbarfxn = tqdm if (self.showpbar and tqdm) else utils.misc.no_op + @cache_readonly def tidy(self): if self.useros: @@ -323,7 +326,7 @@ def mean(self): @cache_readonly def std_dev(self): - return self.generic_stat(numpy.std, statname="std. dev.", use_bootstrap=False) + return self.generic_stat(numpy.std, statname="std. dev.", use_bootstrap=False, ddof=1) def percentile(self, percentile): """Return the percentiles (0 - 100) for the data.""" @@ -342,7 +345,7 @@ def logmean(self): @cache_readonly def logstd_dev(self): return self.generic_stat( - lambda x, axis=0: numpy.std(numpy.log(x), axis=axis), + lambda x, axis=0: numpy.std(numpy.log(x), axis=axis, ddof=1), use_bootstrap=False, statname="Log-std. dev.", ) @@ -359,65 +362,93 @@ def geostd_dev(self): geostd.columns.names = ["station", "Geo-std. dev."] return geostd - @cache_readonly - def shapiro(self): + def shapiro(self, **opts): + """ + Run the Shapiro-Wilk test for normality on the datasets. + + Requires at least 3 observations in each dataset. + + See `scipy.stats.shapiro` for info on kwargs you can pass. + """ return self.generic_stat( stats.shapiro, use_bootstrap=False, has_pvalue=True, statname="shapiro", filterfxn=lambda x: x.shape[0] > 3, + **opts, ) - @cache_readonly - def shapiro_log(self): + def shapiro_log(self, **opts): + """ + Run the Shapiro-Wilk test for normality on log-transformed datasets. + + Requires at least 3 observations in each dataset. + + See `scipy.stats.shapiro` for info on kwargs you can pass. + """ return self.generic_stat( lambda x: stats.shapiro(numpy.log(x)), use_bootstrap=False, has_pvalue=True, filterfxn=lambda x: x.shape[0] > 3, statname="log-shapiro", + **opts, ) - @cache_readonly - def lilliefors(self): + def lilliefors(self, **opts): + """ + Run the Lilliefors test for normality on the datasets. + + Requires at least 3 observations in each dataset. + + See `statsmodels.api.stats.lilliefors` for info on kwargs you can pass. + """ return self.generic_stat( sm.stats.lilliefors, use_bootstrap=False, has_pvalue=True, statname="lilliefors", + **opts, ) - @cache_readonly - def lilliefors_log(self): + def lilliefors_log(self, **opts): + """ + Run the Lilliefors test for normality on the log-transformed datasets. + + Requires at least 3 observations in each dataset. + + See `statsmodels.api.stats.lilliefors` for info on kwargs you can pass. 
+ """ return self.generic_stat( lambda x: sm.stats.lilliefors(numpy.log(x)), use_bootstrap=False, has_pvalue=True, statname="log-lilliefors", + **opts, ) - @cache_readonly - def anderson_darling(self): + def anderson_darling(self, **opts): raise NotImplementedError return self.generic_stat( utils.anderson_darling, use_bootstrap=False, has_pvalue=True, statname="anderson-darling", + **opts, ) - @cache_readonly - def anderson_darling_log(self): + def anderson_darling_log(self, **opts): raise NotImplementedError return self.generic_stat( lambda x: utils.anderson_darling(numpy.log(x)), use_bootstrap=False, has_pvalue=True, statname="log-anderson-darling", + **opts, ) - def comparison_stat(self, statfxn, statname=None, paired=False, **statopts): + def comparison_stat_twoway(self, statfxn, statname=None, paired=False, **statopts): """Generic function to apply comparative hypothesis tests to the groups of the ``DataCollection``. @@ -430,7 +461,7 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts): statname : string, optional Name of the statistic. Included as a column name in the final dataframe. - apired : bool, optional + paired : bool, optional Set to ``True`` if ``statfxn`` requires paired data. **statopts : optional kwargs Additional keyword arguments that will be passed to @@ -455,9 +486,9 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts): >>> dc = DataCollection(df, rescol='res', qualcol='qual', ... stationcol='loc', paramcol='param', ... ndval='<') - >>> mwht = dc.comparison_stat(stats.mannwhitneyu, - ... statname='mann_whitney', - ... alternative='two-sided') + >>> mwht = dc.comparison_stat_twoway(stats.mannwhitneyu, + ... statname='mann_whitney', + ... alternative='two-sided') """ @@ -475,46 +506,134 @@ def comparison_stat(self, statfxn, statname=None, paired=False, **statopts): index_cols = meta_columns + station_columns results = generator( - data, meta_columns, self.stationcol, rescol, statfxn, statname=statname, **statopts + data, + meta_columns, + self.stationcol, + rescol, + statfxn, + statname=statname, + pbarfxn=self.pbarfxn, + **statopts, ) return pandas.DataFrame.from_records(results).set_index(index_cols) - @cache_readonly - def mann_whitney(self): - return self.comparison_stat( - partial(_dist_compare, stat_comp_func=stats.mannwhitneyu), + def comparison_stat_allway(self, statfxn, statname, control=None, **statopts): + results = utils.numutils._group_comp_stat_generator( + self.tidy, + self.groupcols_comparison, + self.stationcol, + self.rescol, + statfxn, + statname=statname, + control=control, + pbarfxn=self.pbarfxn, + **statopts, + ) + return pandas.DataFrame.from_records(results).set_index(self.groupcols_comparison) + + def mann_whitney(self, **opts): + """ + Run the Mann-Whitney U test across datasets. + + See `scipy.stats.mannwhitneyu` for available options. + """ + return self.comparison_stat_twoway( + partial(_dist_compare, stat_comp_func=stats.mannwhitneyu, **opts), statname="mann_whitney", ) - @cache_readonly - def ranksums(self): - return self.comparison_stat(stats.ranksums, statname="rank_sums") + def ranksums(self, **opts): + """ + Run the unpaired Wilcoxon rank-sum test across datasets. - @cache_readonly - def t_test(self): - return self.comparison_stat(stats.ttest_ind, statname="t_test", equal_var=False) + See `scipy.stats.ranksums` for available options. 
+ """ + return self.comparison_stat_twoway(stats.ranksums, statname="rank_sums", **opts) - @cache_readonly - def levene(self): - return self.comparison_stat(stats.levene, statname="levene", center="median") + def t_test(self, **opts): + """ + Run the T-test for independent scores. - @cache_readonly - def wilcoxon(self): - return self.comparison_stat( + See `scipy.stats.ttest_ind` for available options. + """ + return self.comparison_stat_twoway(stats.ttest_ind, statname="t_test", **opts) + + def levene(self, **opts): + """ + Run the Levene test for equal variances + + See `scipy.stats.levene` for available options. + """ + return self.comparison_stat_twoway(stats.levene, statname="levene", **opts) + + def wilcoxon(self, **opts): + """ + Run the paired Wilcoxon rank-sum test across paired dataset. + + See `scipy.stats.wilcoxon` for available options. + """ + return self.comparison_stat_twoway( partial(_dist_compare, stat_comp_func=stats.wilcoxon), statname="wilcoxon", paired=True, + **opts, ) - @cache_readonly - def kendall(self): - return self.comparison_stat(stats.kendalltau, statname="kendalltau", paired=True) + def kendall(self, **opts): + """ + Run the paired Kendall-tau test across paired dataset. - @cache_readonly - def spearman(self): - return self.comparison_stat(stats.spearmanr, statname="spearmanrho", paired=True) + See `scipy.stats.kendalltau` for available options. + """ + return self.comparison_stat_twoway( + stats.kendalltau, statname="kendalltau", paired=True, **opts + ) + + def spearman(self, **opts): + """ + Run the paired Spearman-rho test across paired dataset. + + See `scipy.stats.spearmanr` for available options. + """ + return self.comparison_stat_twoway( + stats.spearmanr, statname="spearmanrho", paired=True, **opts + ) + + def kruskal_wallis(self, **opts): + """ + Run the paired Kruskal-Wallos H-test across paired dataset. + + See `scipy.stats.kruskal` for available options. + """ + return self.comparison_stat_allway(stats.kruskal, statname="K-W H", control=None, **opts) + + def f_test(self, **opts): + """ + One-way ANOVA test across datasets + + See `scipy.stats.f_oneway` for available options. 
+ """ + return self.comparison_stat_allway(stats.f_oneway, statname="f-test", control=None, **opts) + + def tukey_hsd(self) -> tuple[pandas.DataFrame, pandas.DataFrame]: + """ + Tukey Honestly Significant Difference (HSD) test across stations + other groups + for each parameter + """ + hsd = utils.tukey_hsd( + self.tidy, self.rescol, self.stationcol, self.paramcol, *self.othergroups + ) + scores = utils.process_tukey_hsd_scores(hsd, self.stationcol, self.paramcol) + return hsd, scores + + def dunn(self): + """ + Dunn test across the different stations for each pollutant + """ + return self.tidy.groupby(by=[self.paramcol]).apply( + lambda g: utils.dunn_test(g, self.rescol, self.stationcol, *self.othergroups).scores + ) - @cache_readonly def theilslopes(self, logs=False): raise NotImplementedError diff --git a/wqio/features.py b/wqio/features.py index d727d66e..5272ffc5 100644 --- a/wqio/features.py +++ b/wqio/features.py @@ -1599,20 +1599,18 @@ def scatterplot( raise ValueError(f"`eqn_pos` must be on of {list.positions.keys()}") # annotate axes with stats + slope = utils.sig_figs(modelres.params[1], n=3) + icept = utils.sig_figs(modelres.params[0], n=3) ax.annotate( - r"$\log(y) = {} \, \log(x) + {}$".format( - utils.sig_figs(modelres.params[1], n=3), - utils.sig_figs(modelres.params[0], n=3), - ), + rf"$\log(y) = {slope} \, \log(x) + {icept}$", (txt_x, txt_y), xycoords="axes fraction", ) + slope_pval = utils.process_p_vals(modelres.pvalues[1]) + icept_pval = utils.process_p_vals(modelres.pvalues[0]) ax.annotate( - "Slope p-value: {}\nIntercept p-value: {}".format( - utils.process_p_vals(modelres.pvalues[1]), - utils.process_p_vals(modelres.pvalues[0]), - ), + f"Slope p-value: {slope_pval}\nIntercept p-value: {icept_pval}", (txt_x, txt_y - vert_offset), xycoords="axes fraction", ) diff --git a/wqio/tests/__init__.py b/wqio/tests/__init__.py index 179885ce..8018b58f 100644 --- a/wqio/tests/__init__.py +++ b/wqio/tests/__init__.py @@ -1,6 +1,5 @@ import warnings - -from pkg_resources import resource_filename +from importlib import resources from wqio.tests.helpers import requires @@ -12,7 +11,7 @@ @requires(pytest, "pytest") def test(*args): - options = [resource_filename("wqio", "")] + options = [str(resources.files("wqio"))] options.extend(list(args)) return pytest.main(options) diff --git a/wqio/tests/_data/sed.pkl b/wqio/tests/_data/sed.pkl new file mode 100644 index 00000000..b17859de Binary files /dev/null and b/wqio/tests/_data/sed.pkl differ diff --git a/wqio/tests/_data/wq.pkl b/wqio/tests/_data/wq.pkl new file mode 100644 index 00000000..677d3198 Binary files /dev/null and b/wqio/tests/_data/wq.pkl differ diff --git a/wqio/tests/helpers.py b/wqio/tests/helpers.py index 177c1a3d..8587ffd2 100644 --- a/wqio/tests/helpers.py +++ b/wqio/tests/helpers.py @@ -6,13 +6,13 @@ from collections import namedtuple from contextlib import contextmanager from functools import wraps +from importlib import resources from io import StringIO import numpy import pandas import pytest from packaging.version import Version -from pkg_resources import resource_filename def get_img_tolerance(): @@ -154,8 +154,21 @@ def comp_statfxn(x, y): return stat(result, result * 0.25) +def group_statfxn(*x): + stat = namedtuple("teststat", ("statistic", "pvalue")) + result = max([max(_) for _ in x]) + return stat(result, 0.25) + + +def group_statfxn_with_control(*x, control): + stat = namedtuple("teststat", ("statistic", "pvalue")) + x = [*x, control] + result = max([max(_) for _ in x]) + return stat(result, 0.25) + + 
def test_data_path(filename): - path = resource_filename("wqio.tests._data", filename) + path = resources.files("wqio.tests._data").joinpath(filename) return path diff --git a/wqio/tests/test_datacollections.py b/wqio/tests/test_datacollections.py index 19becdb7..7fbcdf14 100644 --- a/wqio/tests/test_datacollections.py +++ b/wqio/tests/test_datacollections.py @@ -59,6 +59,26 @@ def dc(): return dc +@pytest.fixture +def dc_small(): + df = helpers.make_dc_data_complex() + dc = DataCollection( + df, + rescol="res", + qualcol="qual", + stationcol="loc", + paramcol="param", + ndval="<", + othergroups=None, + pairgroups=["state", "bmp"], + useros=True, + filterfxn=lambda g: g.name[1] in ["A", "B", "C"], + bsiter=10000, + ) + + return dc + + @pytest.fixture def dc_noNDs(): df = helpers.make_dc_data_complex() @@ -196,12 +216,12 @@ def test_std_dev(dc): station,Inflow,Outflow,Reference result,std. dev.,std. dev.,std. dev. param,,, - A,3.58649,8.719371,5.527633 - B,12.360099,13.60243,10.759285 - C,0.353755,1.691208,0.493325 - D,4.811938,2.849393,2.248178 - E,2.55038,1.096698,2.789238 - F,34.447565,5.361033,3.398367 + A,3.675059,8.92456,5.671232 + B,12.625938,13.922531,11.054115 + C,0.361364,1.727582,0.503498 + D,4.915433,2.90815,2.303697 + E,2.620266,1.132665,2.861698 + F,35.298251,5.476337,3.502956 """ check_stat(known_csv, dc.std_dev) @@ -260,12 +280,12 @@ def test_logstd_dev(dc): station,Inflow,Outflow,Reference result,Log-std. dev.,Log-std. dev.,Log-std. dev. param,,, - A,1.374026,1.343662,1.225352 - B,1.430381,2.07646,1.662001 - C,0.818504,1.263631,1.057177 - D,1.530871,1.187246,1.277927 - E,1.264403,1.121038,1.474431 - F,2.324063,1.516331,1.701596 + A,1.407957,1.375282,1.257185 + B,1.461146,2.125324,1.707543 + C,0.836108,1.290809,1.078977 + D,1.563796,1.211728,1.309485 + E,1.29905,1.157803,1.512734 + F,2.381456,1.548944,1.753965 """ check_stat(known_csv, dc.logstd_dev) @@ -292,12 +312,12 @@ def test_geostd_dev(dc): station,Inflow,Outflow,Reference Geo-std. dev.,Log-std. dev.,Log-std. dev.,Log-std. dev. 
param,,, - A,3.951225,3.833055,3.405365 - B,4.180294,7.976181,5.269843 - C,2.267105,3.538244,2.878234 - D,4.622199,3.278041,3.589191 - E,3.540977,3.068036,4.368548 - F,10.217099,4.55548,5.48269 + A,4.087597,3.956193,3.515511 + B,4.310897,8.375613,5.515395 + C,2.30737,3.635725,2.941668 + D,4.776921,3.359284,3.704266 + E,3.665813,3.182932,4.539124 + F,10.820642,4.706498,5.777465 """ check_stat(known_csv, dc.geostd_dev) @@ -315,7 +335,7 @@ def test_shapiro(dc): E,1.7e-05,0.654137,0.004896,0.818813,0.000165,0.74917 F,0.0,0.292916,2e-06,0.634671,0.000167,0.713968 """ - check_stat(known_csv, dc.shapiro) + check_stat(known_csv, dc.shapiro()) @helpers.seed @@ -331,7 +351,7 @@ def test_shapiro_log(dc): E,0.955088913,0.479993254,0.95211035,0.523841977,0.963425279,0.61430341 F,0.97542423,0.847370088,0.982230783,0.933124721,0.966197193,0.749036908 """ - check_stat(known_csv, dc.shapiro_log) + check_stat(known_csv, dc.shapiro_log()) @helpers.seed @@ -347,7 +367,7 @@ def test_lilliefors(dc): E,0.341398,0.001,0.239314,0.016156,0.233773,0.005575 F,0.419545,0.001,0.331315,0.001,0.284249,0.001 """ - check_stat(known_csv, dc.lilliefors) + check_stat(known_csv, dc.lilliefors()) @helpers.seed @@ -363,19 +383,19 @@ def test_lilliefors_log(dc): E,0.135066,0.471868,0.145523,0.477979,0.091649,0.928608 F,0.144208,0.306945,0.084633,0.927419,0.085869,0.980029 """ - check_stat(known_csv, dc.lilliefors_log) + check_stat(known_csv, dc.lilliefors_log()) @helpers.seed def test_anderson_darling(dc): with helpers.raises(NotImplementedError): - _ = dc.anderson_darling + _ = dc.anderson_darling() @helpers.seed def test_anderson_darling_log(dc): with helpers.raises(NotImplementedError): - _ = dc.anderson_darling_log + _ = dc.anderson_darling_log() @helpers.seed @@ -403,35 +423,35 @@ def test_mann_whitney(dc): F,Outflow,303.0,,236.0,0.2505911218,,0.4045186043 F,Reference,185.0,172.0,,0.8601783903,0.4045186043 """ - check_stat(known_csv, dc.mann_whitney, comp=True) + check_stat(known_csv, dc.mann_whitney(), comp=True) @helpers.seed def test_t_test(dc): known_csv = """\ - ,,pvalue,pvalue,pvalue,t_test,t_test,t_test - loc_2,,Inflow,Outflow,Reference,Inflow,Outflow,Reference - param,loc_1,,,,,, - A,Inflow,,0.2178424157,0.4563196599,,-1.2604458127,-0.7539785777 - A,Outflow,0.2178424157,,0.5240147979,1.2604458127,,0.643450194 - A,Reference,0.4563196599,0.5240147979,,0.7539785777,-0.643450194, - B,Inflow,,0.8430007638,0.3898358794,,0.1992705833,0.869235357 - B,Outflow,0.8430007638,,0.5491097882,-0.1992705833,,0.6043850808 - B,Reference,0.3898358794,0.5491097882,,-0.869235357,-0.6043850808, - C,Inflow,,0.1847386316,0.8191392537,,-1.3639360123,-0.2300373632 - C,Outflow,0.1847386316,,0.2179907667,1.3639360123,,1.2615982727 - C,Reference,0.8191392537,0.2179907667,,0.2300373632,-1.2615982727, - D,Inflow,,0.5484265023,0.344783812,,0.6056706932,0.9582600001 - D,Outflow,0.5484265023,,0.6299742693,-0.6056706932,,0.4851636024 - D,Reference,0.344783812,0.6299742693,,-0.9582600001,-0.4851636024, - E,Inflow,,0.2304569921,0.6770414622,,1.2287029977,-0.4198288251 - E,Outflow,0.2304569921,,0.1023435465,-1.2287029977,,-1.6935358498 - E,Reference,0.6770414622,0.1023435465,,0.4198288251,1.6935358498, - F,Inflow,,0.422008391,0.3549979666,,0.8190789273,0.9463539528 - F,Outflow,0.422008391,,0.4988994144,-0.8190789273,,0.6826435968 - F,Reference,0.3549979666,0.4988994144,,-0.9463539528,-0.6826435968 + ,,t_test,t_test,t_test,pvalue,pvalue,pvalue + loc_2,,Inflow,Outflow,Reference,Inflow,Outflow,Reference + param,loc_1,,,,,, + 
A,Inflow,,-1.239311,-0.761726,,0.222278,0.450806 + A,Outflow,1.239311,,0.630254,0.222278,,0.532113 + A,Reference,0.761726,-0.630254,,0.450806,0.532113, + B,Inflow,,0.200136,0.855661,,0.842296,0.397158 + B,Outflow,-0.200136,,0.594193,0.842296,,0.555814 + B,Reference,-0.855661,-0.594193,,0.397158,0.555814, + C,Inflow,,-1.363936,-0.228508,,0.179225,0.820243 + C,Outflow,1.363936,,1.283982,0.179225,,0.205442 + C,Reference,0.228508,-1.283982,,0.820243,0.205442, + D,Inflow,,0.61178,0.917349,,0.543631,0.364076 + D,Outflow,-0.61178,,0.475391,0.543631,,0.63686 + D,Reference,-0.917349,-0.475391,,0.364076,0.63686, + E,Inflow,,1.156605,-0.418858,,0.255738,0.677741 + E,Outflow,-1.156605,,-1.558039,0.255738,,0.128485 + E,Reference,0.418858,1.558039,,0.677741,0.128485, + F,Inflow,,0.874261,0.851029,,0.386833,0.400379 + F,Outflow,-0.874261,,0.63432,0.386833,,0.529575 + F,Reference,-0.851029,-0.63432,,0.400379,0.529575, """ - check_stat(known_csv, dc.t_test, comp=True) + check_stat(known_csv, dc.t_test(), comp=True) @helpers.seed @@ -459,7 +479,7 @@ def test_levene(dc): F,Outflow,0.841984453,,0.25881357,0.363948105,,0.613802541 F,Reference,0.734809611,0.25881357,,0.396999375,0.613802541, """ - check_stat(known_csv, dc.levene, comp=True) + check_stat(known_csv, dc.levene(), comp=True) @helpers.seed @@ -488,7 +508,7 @@ def test_wilcoxon(dc): F,Reference,22.0,28.0,,0.952765,0.656642, """ with pytest.warns(UserWarning): - check_stat(known_csv, dc.wilcoxon, comp=True) + check_stat(known_csv, dc.wilcoxon(), comp=True) @helpers.seed @@ -516,7 +536,7 @@ def test_ranksums(dc): F,Outflow,0.2459307,,0.3971011,1.1602902,,0.8468098 F,Reference,0.8486619,0.3971011,,0.190826,-0.8468098, """ - check_stat(known_csv, dc.ranksums, comp=True) + check_stat(known_csv, dc.ranksums(), comp=True) @helpers.seed @@ -545,7 +565,7 @@ def test_kendall(dc): F,Outflow,0.0,,0.388889,1.0,,0.063 F,Reference,0.07231,0.388889,,0.763851,0.063, """ - check_stat(known_csv, dc.kendall, comp=True) + check_stat(known_csv, dc.kendall(), comp=True) @helpers.seed @@ -573,13 +593,69 @@ def test_spearman(dc): F,Outflow,0.9645303744,,0.0560590794,-0.0106277141,,0.5028571429 F,Reference,0.6759971848,0.0560590794,,0.1348767061,0.5028571429 """ - check_stat(known_csv, dc.spearman, comp=True) + check_stat(known_csv, dc.spearman(), comp=True) + + +@helpers.seed +def test_kruskal_wallis(dc): + result = dc.kruskal_wallis() + _expected = [ + {"K-W H": 1.798578, "pvalue": 0.406859, "param": "A"}, + {"K-W H": 5.421338, "pvalue": 0.066492, "param": "B"}, + {"K-W H": 0.29261, "pvalue": 0.863894, "param": "C"}, + {"K-W H": 0.39957, "pvalue": 0.818907, "param": "D"}, + {"K-W H": 0.585441, "pvalue": 0.746231, "param": "E"}, + {"K-W H": 1.483048, "pvalue": 0.476387, "param": "F"}, + ] + expected = pandas.DataFrame(_expected).set_index("param") + pandas.testing.assert_frame_equal(result, expected) + + +@helpers.seed +def test_f_test(dc): + result = dc.f_test() + _expected = [ + {"f-test": 0.861339, "pvalue": 0.427754, "param": "A"}, + {"f-test": 0.343445, "pvalue": 0.710663, "param": "B"}, + {"f-test": 1.653142, "pvalue": 0.198833, "param": "C"}, + {"f-test": 0.526782, "pvalue": 0.592927, "param": "D"}, + {"f-test": 1.114655, "pvalue": 0.335738, "param": "E"}, + {"f-test": 0.738629, "pvalue": 0.482134, "param": "F"}, + ] + expected = pandas.DataFrame(_expected).set_index("param") + pandas.testing.assert_frame_equal(result, expected) + + +def test_tukey_hsd_smoke_test(dc): + hsd, scores = dc.tukey_hsd() + assert isinstance(hsd, pandas.DataFrame) + assert isinstance(scores, 
pandas.DataFrame) + + assert hsd.index.names == ["loc 1", "loc 2", "param"] + assert hsd.columns.tolist() == [ + "HSD Stat", + "p-value", + "CI-Low", + "CI-High", + "is_diff", + "sign_of_diff", + "score", + ] + + assert scores.index.names == ["param"] + assert scores.columns.tolist() == ["Inflow", "Outflow", "Reference"] + + +def test_dunn(dc): + dunnres = dc.dunn() + assert dunnres.index.tolist() == list("ABCDEF") + assert dunnres.columns.tolist() == ["Inflow", "Outflow", "Reference"] @helpers.seed def test_theilslopes(dc): with helpers.raises(NotImplementedError): - _ = dc.theilslopes + _ = dc.theilslopes() def test_inventory(dc): @@ -765,7 +841,7 @@ def test_selectDatasets(dc): def test_dist_compare_wrapper(x, all_same, func): y = [5, 5, 5, 5, 5] with mock.patch.object(stats, func.__name__) as _test: - result = _dist_compare(x, y, _test) + result = _dist_compare(x, y, _test, alternative="two-sided") if all_same: assert numpy.isnan(result.stat) assert numpy.isnan(result.pvalue) diff --git a/wqio/tests/utils_tests/test_numutils.py b/wqio/tests/utils_tests/test_numutils.py index b7ae2275..82ae2740 100644 --- a/wqio/tests/utils_tests/test_numutils.py +++ b/wqio/tests/utils_tests/test_numutils.py @@ -2,6 +2,7 @@ from collections import namedtuple from io import StringIO from textwrap import dedent +from typing import Any, Literal import numpy import numpy.testing as nptest @@ -9,6 +10,8 @@ import pandas.testing as pdtest import pytest import statsmodels.api as sm +from numpy._typing._array_like import NDArray +from pandas import DataFrame from scipy import stats from wqio.tests import helpers @@ -92,7 +95,7 @@ def test_process_p_vals(fxn, pval, expected, error_to_raise): (1.01, (None, None), ValueError), ], ) -def test_translate_p_vals(pval, expected, as_emoji, error_to_raise): +def test_translate_p_vals(pval, expected, as_emoji: bool, error_to_raise): with helpers.raises(error_to_raise): result = numutils.translate_p_vals(pval, as_emoji=as_emoji) assert result == expected[as_emoji] @@ -125,7 +128,7 @@ def test_anderson_darling(): @pytest.mark.parametrize("which", ["good", "bad"]) -def test_processAndersonDarlingResults(which): +def test_processAndersonDarlingResults(which: Literal["good"] | Literal["bad"]): fieldnames = ["statistic", "critical_values", "significance_level"] AndersonResult = namedtuple("AndersonResult", fieldnames) ARs = { @@ -215,7 +218,7 @@ def units_norm_data(): return raw, expected -def test_normalize_units(units_norm_data): +def test_normalize_units(units_norm_data: tuple[DataFrame, DataFrame]): unitsmap = {"ug/L": 1e-6, "mg/L": 1e-3, "g/L": 1e0} targetunits = {"Lead, Total": "ug/L", "Cadmium, Total": "mg/L"} @@ -226,7 +229,7 @@ def test_normalize_units(units_norm_data): pdtest.assert_frame_equal(result, expected) -def test_normalize_units_bad_targetunits(units_norm_data): +def test_normalize_units_bad_targetunits(units_norm_data: tuple[DataFrame, DataFrame]): unitsmap = {"ug/L": 1e-6, "mg/L": 1e-3, "g/L": 1e0} targetunits = {"Lead, Total": "ug/L"} @@ -243,7 +246,7 @@ def test_normalize_units_bad_targetunits(units_norm_data): ) -def test_normalize_units_bad_normalization(units_norm_data): +def test_normalize_units_bad_normalization(units_norm_data: tuple[DataFrame, DataFrame]): unitsmap = {"mg/L": 1e-3, "g/L": 1e0} targetunits = {"Lead, Total": "ug/L", "Cadmium, Total": "mg/L"} @@ -260,7 +263,7 @@ def test_normalize_units_bad_normalization(units_norm_data): ) -def test_normalize_units_bad_conversion(units_norm_data): +def 
test_normalize_units_bad_conversion(units_norm_data: tuple[DataFrame, DataFrame]): unitsmap = {"ug/L": 1e-6, "mg/L": 1e-3, "g/L": 1e0} targetunits = {"Lead, Total": "ng/L", "Cadmium, Total": "mg/L"} @@ -292,7 +295,7 @@ def test_test_pH2concentration(pH, expected, error): @helpers.seed @pytest.mark.parametrize("error", [None, ValueError]) -def test_compute_theilslope_default(error): +def test_compute_theilslope_default(error: types.NoneType | type[ValueError]): with helpers.raises(error): y = helpers.getTestROSData()["res"].values x = numpy.arange(len(y) - 1) if error else None @@ -443,7 +446,7 @@ def fit_data(): (None, "junk", ValueError), ], ) -def test_fit_line(fit_data, fitlogs, fitprobs, error): +def test_fit_line(fit_data: dict[str, NDArray[Any]], fitlogs, fitprobs, error): xy = { (None, None): (fit_data["zscores"], fit_data["data"]), ("y", None): (fit_data["zscores"], fit_data["data"]), @@ -483,13 +486,13 @@ def test_fit_line(fit_data, fitlogs, fitprobs, error): assert isinstance(res, sm.regression.linear_model.RegressionResultsWrapper) -def test_fit_line_through_origin(fit_data): +def test_fit_line_through_origin(fit_data: dict[str, NDArray[Any]]): x, y = fit_data["zscores"], fit_data["data"] x_, y_, res = numutils.fit_line(x, y, through_origin=True) assert res.params[0] == 0 -def test_fit_line_with_xhat(fit_data): +def test_fit_line_with_xhat(fit_data: dict[str, NDArray[Any]]): x, y = fit_data["zscores"], fit_data["data"] x_, y_, res = numutils.fit_line(x, y, xhat=[-2, -1, 0, 1, 2]) expected = [-0.566018, 4.774419, 10.114857, 15.455295, 20.795733] @@ -685,6 +688,50 @@ def test__comp_stat_generator_single_group_col(): ) +@pytest.mark.parametrize( + ("control", "statfxn"), + [(None, helpers.group_statfxn), ("Reference", helpers.group_statfxn_with_control)], +) +def test__group_comp_stat_generator(control, statfxn): + df = helpers.make_dc_data_complex().reset_index().loc[lambda df: df["state"].isin(["CA", "OR"])] + gen = numutils._group_comp_stat_generator( + df, ["bmp", "state"], "loc", "res", statfxn, statname="max_of_max", control=control + ) + + assert isinstance(gen, types.GeneratorType) + result = pandas.DataFrame(gen).sort_values(by=["bmp", "state"]).reset_index(drop=True) + + expected_vals = [ + 10.50949742, + 14.77713121, + 23.29366747, + 25.31733815, + 15.0478646, + 163.01000855, + 81.9363735, + 12.89370847, + 16.76919938, + 5.25222615, + 8.75794985, + 24.71916248, + 13.87664484, + 15.05442994, + ] + expected = ( + pandas.DataFrame( + { + "bmp": list(map(str, range(1, 8))) * 2, + "state": (["CA"] * 7) + (["OR"] * 7), + "max_of_max": expected_vals, + "pvalue": [0.25] * 14, + } + ) + .sort_values(by=["bmp", "state"]) + .reset_index(drop=True) + ) + pandas.testing.assert_frame_equal(result, expected, rtol=0.01) + + def test__paired_stat_generator(): df = helpers.make_dc_data_complex().unstack(level="loc") gen = numutils._paired_stat_generator(df, ["param"], "loc", "res", helpers.comp_statfxn) @@ -755,3 +802,122 @@ def test_remove_outliers(): x = numpy.random.normal(0, 4, size=37) assert numutils.remove_outliers(x).shape == expected_shape + + +def test_tukey_hsd_functions(): + expected_records = [ + { + "chemical_name": "Copper", + "Loc_0": -2.0, + "Loc_1": 6.0, + "Loc_2": -2.0, + "Loc_3": -4.0, + "Loc_4": 3.0, + "Loc_5": -1.0, + "Loc_6": 0.0, + }, + { + "chemical_name": "Di(2-ethylhexyl)phthalate", + "Loc_0": 3.0, + "Loc_1": 5.0, + "Loc_2": -2.0, + "Loc_3": -2.0, + "Loc_4": -2.0, + "Loc_5": -1.0, + "Loc_6": -1.0, + }, + { + "chemical_name": "Indeno(1,2,3-cd)pyrene", + "Loc_0": 
2.0, + "Loc_1": 0.0, + "Loc_2": 6.0, + "Loc_3": -2.0, + "Loc_4": -2.0, + "Loc_5": -4.0, + "Loc_6": 0.0, + }, + { + "chemical_name": "Lead", + "Loc_0": 0.0, + "Loc_1": 6.0, + "Loc_2": -2.0, + "Loc_3": -3.0, + "Loc_4": 4.0, + "Loc_5": -3.0, + "Loc_6": -2.0, + }, + { + "chemical_name": "Phenanthrene", + "Loc_0": 1.0, + "Loc_1": 0.0, + "Loc_2": 1.0, + "Loc_3": -3.0, + "Loc_4": 0.0, + "Loc_5": 0.0, + "Loc_6": 1.0, + }, + { + "chemical_name": "Pyrene", + "Loc_0": 0.0, + "Loc_1": -1.0, + "Loc_2": 3.0, + "Loc_3": -2.0, + "Loc_4": -2.0, + "Loc_5": -2.0, + "Loc_6": 4.0, + }, + { + "chemical_name": "Total Suspended Solids", + "Loc_0": -1.0, + "Loc_1": -1.0, + "Loc_2": -1.0, + "Loc_3": -1.0, + "Loc_4": -1.0, + "Loc_5": -1.0, + "Loc_6": 6.0, + }, + { + "chemical_name": "Zinc", + "Loc_0": 0.0, + "Loc_1": 1.0, + "Loc_2": -1.0, + "Loc_3": -6.0, + "Loc_4": -1.0, + "Loc_5": 4.0, + "Loc_6": 3.0, + }, + ] + expected = pandas.DataFrame(expected_records).set_index("chemical_name") + wq = pandas.read_pickle(helpers.test_data_path("wq.pkl")) + hsd = numutils.tukey_hsd(wq, "res", "location", "chemical_name") + result = numutils.process_tukey_hsd_scores(hsd, "location", "chemical_name") + pandas.testing.assert_frame_equal(result, expected, check_names=False) + + +def test_rank_stats(): + copper = pandas.read_pickle(helpers.test_data_path("sed.pkl")).loc[ + lambda df: df["chemical_name"] == "Copper" + ] + result = numutils.rank_stats(copper, "res", "location") + expected = pandas.DataFrame( + [ + {"location": "Loc_0", "count": 8, "sum": 178.0, "mean": 22.25}, + {"location": "Loc_1", "count": 11, "sum": 459.5, "mean": 41.77272727272727}, + {"location": "Loc_2", "count": 10, "sum": 248.0, "mean": 24.8}, + {"location": "Loc_3", "count": 11, "sum": 78.0, "mean": 7.090909090909091}, + {"location": "Loc_4", "count": 9, "sum": 503.0, "mean": 55.888888888888886}, + {"location": "Loc_5", "count": 12, "sum": 424.5, "mean": 35.375}, + ] + ) + pandas.testing.assert_frame_equal(result, expected) + + +def test_dunn_scores(): + copper = pandas.read_pickle(helpers.test_data_path("sed.pkl")).loc[ + lambda df: df["chemical_name"] == "Copper" + ] + dunnres = numutils.dunn_test(copper, "res", "location") + expected = pandas.Series( + data=[-1, 1, -1, -3, 3, 1], index=["Loc_0", "Loc_1", "Loc_2", "Loc_3", "Loc_4", "Loc_5"] + ) + pandas.testing.assert_series_equal(dunnres.scores, expected, check_names=False) diff --git a/wqio/utils/numutils.py b/wqio/utils/numutils.py index d6d3a89e..f3c39f44 100644 --- a/wqio/utils/numutils.py +++ b/wqio/utils/numutils.py @@ -1,16 +1,20 @@ import itertools from collections import namedtuple +from collections.abc import Callable from textwrap import dedent import numpy +import pandas import statsmodels.api as sm from probscale.algo import _estimate_from_fit from scipy import stats +from scipy.stats._hypotests import TukeyHSDResult from wqio import validate from wqio.utils import misc TheilStats = namedtuple("TheilStats", ("slope", "intercept", "low_slope", "high_slope")) +DunnResult = namedtuple("DunnResult", ("rank_stats", "results", "scores")) def sig_figs(x, n, expthresh=5, tex=False, pval=False, forceint=False): @@ -338,8 +342,9 @@ def normalize_units( if normalization.isnull().any(): nulls = df[normalization.isnull()][unitcol].unique() - msg += "Some normalization factors could not be mapped to the {} column ({})\n".format( - unitcol, nulls + msg += ( + "Some normalization factors could not " + f"be mapped to the {unitcol} column ({nulls})\n" ) if conversion.isnull().any(): @@ -625,7 +630,14 @@ def 
remove_outliers(x, factor=1.5): def _comp_stat_generator( - df, groupcols, pivotcol, rescol, statfxn, statname=None, pbarfxn=None, **statopts + df: pandas.DataFrame, + groupcols: list[str], + pivotcol: str, + rescol: str, + statfxn: Callable, + statname: str = "stat", + pbarfxn: Callable = misc.no_op, + **statopts, ): """Generator of records containing results of comparitive statistical functions. @@ -657,8 +669,6 @@ def _comp_stat_generator( comp_stats : pandas.DataFrame """ - if not pbarfxn: - pbarfxn = misc.no_op groupcols = validate.at_least_empty_list(groupcols) if statname is None: @@ -682,8 +692,48 @@ def _comp_stat_generator( yield row +def _group_comp_stat_generator( + df: pandas.DataFrame, + groupcols: list[str], + pivotcol: str, + rescol: str, + statfxn: Callable, + pbarfxn: Callable = misc.no_op, + statname: str = "stat", + control: str | None = None, + **statopts, +): + groupcols = validate.at_least_empty_list(groupcols) + + for names, main_group in pbarfxn(df.groupby(by=groupcols)): + subsets = { + pivot: subgroup[rescol].to_numpy() + for pivot, subgroup in main_group.groupby(by=pivotcol) + } + + _pivots = list(subsets.keys()) + _samples = list(subsets.values()) + + if control is not None: + control_sample = _samples.pop(_pivots.index(control)) + result = statfxn(*_samples, control=control_sample, **statopts) + + else: + result = statfxn(*_samples, **statopts) + + row = {**dict(zip(groupcols, names)), statname: result.statistic, "pvalue": result.pvalue} + yield row + + def _paired_stat_generator( - df, groupcols, pivotcol, rescol, statfxn, statname=None, pbarfxn=None, **statopts + df: pandas.DataFrame, + groupcols: list[str], + pivotcol: str, + rescol: str, + statfxn: Callable, + statname: str = "stat", + pbarfxn: Callable = misc.no_op, + **statopts, ): """Generator of records containing results of comparitive statistical functions specifically for paired data. 
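Reviewer note: the new `_group_comp_stat_generator` above yields one record per combination of `groupcols`, splitting each group along `pivotcol` into one sample per level and passing those samples to `statfxn` (optionally popping one level out as the `control` sample). A hedged sketch of how it is consumed, mirroring the new tests; the frame and column names here are made up:

    import pandas
    from scipy import stats
    from wqio.utils import numutils

    df = pandas.DataFrame({
        "param": ["A"] * 6 + ["B"] * 6,
        "loc": ["In", "In", "Out", "Out", "Ref", "Ref"] * 2,
        "res": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0],
    })

    # yields e.g. {"param": "A", "K-W H": <statistic>, "pvalue": <p>}
    records = numutils._group_comp_stat_generator(
        df, ["param"], "loc", "res", stats.kruskal, statname="K-W H"
    )
    results = pandas.DataFrame.from_records(records).set_index("param")

This is the same record shape that `DataCollection.comparison_stat_allway` assembles before setting its index.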
@@ -715,8 +765,6 @@
     comp_stats : pandsa.DataFrame
 
     """
-    if not pbarfxn:
-        pbarfxn = misc.no_op
     groupcols = validate.at_least_empty_list(groupcols)
 
     if statname is None:
@@ -740,3 +788,195 @@
         stat = statfxn(x, y, **statopts)
         row.update({statname: stat[0], "pvalue": stat.pvalue})
         yield row
+
+
+def _tukey_res_to_df(
+    names: dict[int, str], hsd_res: TukeyHSDResult, group_prefix: str
+) -> pandas.DataFrame:
+    """Converts Scipy's TukeyHSDResult to a dataframe
+
+    Parameters
+    ----------
+    names : dict of {int: str}
+        Labels of the groups present in the Tukey HSD results,
+        keyed by their position in the result matrices
+    hsd_res : TukeyHSDResult
+        Tukey HSD result to be converted to a dataframe
+    group_prefix : str
+        Prefix that describes the nature of the groups (e.g., "Loc")
+
+    Returns
+    -------
+    hsd_df : pandas.DataFrame
+
+    """
+    # the confidence intervals are the same for every pair, so compute them once
+    ci_bands = hsd_res.confidence_interval()
+    rows = []
+    for i, n1 in enumerate(names):
+        for j, n2 in enumerate(names):
+            if i != j:
+                row = {
+                    f"{group_prefix} 1": names[n1],
+                    f"{group_prefix} 2": names[n2],
+                    "HSD Stat": hsd_res.statistic[i, j],
+                    "p-value": hsd_res.pvalue[i, j],
+                    "CI-Low": ci_bands.low[i, j],
+                    "CI-High": ci_bands.high[i, j],
+                }
+
+                rows.append(row)
+
+    df = pandas.DataFrame(rows).set_index([f"{group_prefix} 1", f"{group_prefix} 2"])
+    return df
+
+
+def tukey_hsd(
+    df: pandas.DataFrame,
+    rescol: str,
+    compcol: str,
+    paramcol: str,
+    *othergroups: str,
+):
+    """
+    Run the Tukey HSD test on a dataframe based on groupings
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    rescol : str
+        Name of the column that contains the values of interest
+    compcol : str
+        Name of the column that defines the groups to be compared
+        (i.e., treatment vs control)
+    paramcol : str
+        Name of the column that contains the measured parameter
+    *othergroups : str
+        Names of any other columns that need to be considered when
+        defining the groups.
+
+    Returns
+    -------
+    hsd_df : pandas.DataFrame
+
+    """
+    groupcols = [paramcol, *othergroups]
+    scores = []
+    for name, g in df.groupby(by=groupcols):
+        locs = {loc: subg[rescol].values for loc, subg in g.groupby(compcol) if subg.shape[0] > 1}
+        subset_names = {n: loc for n, loc in enumerate(locs)}
+        res = stats.tukey_hsd(*[v for v in locs.values()])
+        df_res = _tukey_res_to_df(subset_names, res, group_prefix=compcol)
+
+        keys = {g: n for g, n in zip(groupcols, name)}
+        scores.append(
+            df_res.assign(
+                is_diff=lambda df: df["p-value"].lt(0.05).astype(int),
+                sign_of_diff=lambda df: numpy.sign(df["HSD Stat"]).astype(int),
+                score=lambda df: df["is_diff"] * df["sign_of_diff"],
+                **keys,
+            ).set_index(groupcols, append=True)
+        )
+
+    return pandas.concat(scores, ignore_index=False, axis="index")
+
+
+def process_tukey_hsd_scores(
+    hsd_df: pandas.DataFrame, compcol: str, paramcol: str
+) -> pandas.DataFrame:
+    """
+    Converts a Tukey HSD results dataframe into scores that describe
+    the magnitude of each group's values relative to the other groups.
+
+    Generally speaking:
+      * -7 to -5 -> significantly lower
+      * -5 to -3 -> moderately lower
+      * -3 to -1 -> slightly lower
+      * -1 to +1 -> neutral
+      * +1 to +3 -> slightly higher
+      * +3 to +5 -> moderately higher
+      * +5 to +7 -> significantly higher
+
+    Parameters
+    ----------
+    hsd_df : pandas.DataFrame
+        Dataframe returned by `tukey_hsd`
+    compcol : str
+        Name of the column that defines the groups being compared
+    paramcol : str
+        Name of the column that contains the measured parameter
+
+    Returns
+    -------
+    scores : pandas.DataFrame
+
+    """
+    return (
+        hsd_df["score"]
+        .unstack(level=f"{compcol} 2")
+        .fillna(0)
+        .groupby(level=paramcol, as_index=False, group_keys=False)
+        .apply(lambda g: g.sum(axis="columns"))
+        .unstack(level=f"{compcol} 1")
+    )
+
+
+def rank_stats(
+    df: pandas.DataFrame,
+    rescol: str,
+    compcol: str,
+    *othergroups: str,
+) -> pandas.DataFrame:
+    """
+    Compute basic rank statistics (count, mean, sum) of a dataframe
+    based on logical groupings. Assumes a single parameter is in the
+    dataframe.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    rescol : str
+        Name of the column that contains the values of interest
+    compcol : str
+        Name of the column that defines the groups to be compared
+        (i.e., treatment vs control)
+    *othergroups : str
+        Names of any other columns that need to be considered when
+        defining the groups.
+
+    Returns
+    -------
+    rank_stats : pandas.DataFrame
+
+    """
+
+    return (
+        df.assign(rank=lambda df: df[rescol].rank())
+        .groupby([compcol, *othergroups])["rank"]
+        .agg(["count", "sum", "mean"])
+        .reset_index()
+    )
+
+
+def _dunn_result(rs: pandas.DataFrame, group_prefix: str):
+    # critical two-sided z-score at alpha = 0.001667
+    threshold = stats.norm.isf(0.001667 / 2)
+    N = rs["count"].sum()
+    results = (
+        rs.join(rs, how="cross", lsuffix="_1", rsuffix="_2")
+        .assign(
+            diff=lambda df: df["mean_1"] - df["mean_2"],
+            scale=lambda df: ((N * (N + 1) / 12) * (1 / df["count_1"] + 1 / df["count_2"])) ** 0.5,
+            dunn_z=lambda df: df["diff"] / df["scale"],
+            score=lambda df: numpy.where(
+                numpy.abs(df["dunn_z"]) <= threshold, 0, numpy.sign(df["dunn_z"])
+            ).astype(int),
+        )
+        .loc[:, [f"{group_prefix}_1", f"{group_prefix}_2", "dunn_z", "score"]]
+    )
+
+    return results
+
+
+def _dunn_scores(dr: pandas.DataFrame, group_prefix: str) -> pandas.Series:
+    scores = dr.pivot(index=f"{group_prefix}_2", columns=f"{group_prefix}_1", values="score").sum()
+    return scores
+
+
+def dunn_test(df: pandas.DataFrame, rescol: str, compcol: str, *othergroups: str) -> DunnResult:
+    rs = rank_stats(df, rescol, compcol, *othergroups)
+    results = _dunn_result(rs, compcol)
+    scores = _dunn_scores(results, compcol)
+    return DunnResult(rs, results, scores)
diff --git a/wqio/viz.py b/wqio/viz.py
index 5dbc4e42..16f85e3a 100644
--- a/wqio/viz.py
+++ b/wqio/viz.py
@@ -200,7 +200,7 @@ def jointplot(
     return jg
 
 
-def whiskers_and_fliers(x, q1=None, q3=None, transformout=None):
+def whiskers_and_fliers(x, q1=None, q3=None, flierfactor=1.5, transformout=None):
     """Computes extent of whiskers and fliers on optionally
     transformed data for box and whisker plots.
 
@@ -210,6 +210,8 @@
     x : array-like
         Sequence of optionally transformed data.
     q1, q3 : floats, optional
        First and third quartiles of the optionally transformed data.
+    flierfactor : float, optional (default=1.5)
+        Multiplier of the IQR that sets the fences beyond which
+        points are flagged as fliers
     transformout : callable, optional
        Function to un-transform the results back into the original
        space of the data.
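Reviewer note: the new `flierfactor` argument scales the IQR fences computed in the hunk that follows. A standalone sketch of that arithmetic (the data is made up):

    import numpy

    numpy.random.seed(0)
    x = numpy.sort(numpy.random.lognormal(size=37))
    q1, q3 = numpy.percentile(x, [25, 75])
    iqr = q3 - q1

    for flierfactor in (1.5, 3.0):  # 1.5 reproduces the old hard-coded fences
        loval = q1 - flierfactor * iqr
        hival = q3 + flierfactor * iqr
        fliers = x[(x < loval) | (x > hival)]
        print(flierfactor, fliers.size)  # a larger factor flags fewer (or as many) fliers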
@@ -243,12 +245,12 @@ def transformout(x): iqr = q3 - q1 # get low extreme - loval = q1 - (1.5 * iqr) + loval = q1 - (flierfactor * iqr) whislo = numpy.compress(x >= loval, x) whislo = q1 if len(whislo) == 0 or numpy.min(whislo) > q1 else numpy.min(whislo) # get high extreme - hival = q3 + (1.5 * iqr) + hival = q3 + (flierfactor * iqr) whishi = numpy.compress(x <= hival, x) whishi = q3 if len(whishi) == 0 or numpy.max(whishi) < q3 else numpy.max(whishi)
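Closing reviewer note: the Dunn additions in `numutils` compose as `rank_stats` -> `_dunn_result` -> `_dunn_scores`, bundled by `dunn_test` into the new `DunnResult` namedtuple. A hedged usage sketch against a made-up single-parameter frame:

    import pandas
    from wqio.utils import numutils

    df = pandas.DataFrame({
        "location": ["Loc_0"] * 5 + ["Loc_1"] * 5,
        "res": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
    })

    res = numutils.dunn_test(df, "res", "location")
    res.rank_stats  # per-group count/sum/mean of the pooled ranks
    res.results     # pairwise Dunn z-scores with -1/0/+1 significance scores
    res.scores      # per-group sums of those scores, as used by DataCollection.dunn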