Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding mode as aggfunc #194

Merged
merged 5 commits into from
Feb 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions acro/acro_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def olsr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("olsr()")
command: str = utils.get_command("olsr()", stack())
Expand All @@ -122,7 +123,7 @@ def olsr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down Expand Up @@ -227,6 +228,7 @@ def logitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("logitr()")
command: str = utils.get_command("logitr()", stack())
Expand All @@ -235,7 +237,7 @@ def logitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down Expand Up @@ -340,6 +342,7 @@ def probitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data must define __getitem__ with the keys in the formula terms
args and kwargs are passed on to the model instantiation. E.g.,
a numpy structured or rec array, a dictionary, or a pandas DataFrame.
Arguments are passed in the same order as statsmodels.
"""
logger.debug("probitr()")
command: str = utils.get_command("probitr()", stack())
Expand All @@ -348,7 +351,7 @@ def probitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
data=data,
subset=subset,
drop_cols=drop_cols,
*args,
*args, # noqa: B026
**kwargs,
)
results = model.fit()
Expand Down
159 changes: 112 additions & 47 deletions acro/acro_tables.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line 1102 where you see if all the values in a series are unique.
sorting might take a while - could you just use https://pandas.pydata.org/docs/reference/api/pandas.Series.nunique.html

Copy link
Contributor

@jim-smith jim-smith Jan 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the levels of indentation and the number of lines in this method near where you calculate threshold masks etc. (lines 742 onwards), I wonder if it is time to refactor some of this code into a series of functions called something like get_mask_for_aggfunc()

Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import logging
import os
import secrets
from collections.abc import Callable
from inspect import stack

Expand All @@ -20,12 +21,30 @@
logger = logging.getLogger("acro")


AGGFUNC: dict[str, str] = {
def mode_aggfunc(values: Series):
    """Return the mode of a pandas Series, choosing randomly among ties.

    If the series is multimodal, one of the modes is selected at random
    using `secrets` (a cryptographically secure source) so the selection
    cannot be predicted by an observer.

    Parameters
    ----------
    values : Series
        A pandas Series for which to calculate the mode.

    Returns
    -------
    scalar
        The mode of the series. If there are multiple modes, one of them
        is randomly selected and returned. NaN is returned when the
        series has no mode (it is empty or all-NA).
    """
    modes = values.mode()
    if modes.empty:
        # an empty/all-NA group has no mode; returning NaN keeps the
        # table cell blank instead of raising IndexError
        return float("nan")
    # use positional indexing (iloc): mode() currently returns a fresh
    # RangeIndex, but label-based indexing would break if that changed
    return modes.iloc[secrets.randbelow(len(modes))]


# Mapping from the user-facing aggregation name to the object handed to
# pandas: built-in aggregations are passed by name (str), custom ones as
# callables (e.g. "mode", which pandas has no string alias for).
AGGFUNC: dict[str, str | Callable] = dict(
    mean="mean",
    median="median",
    sum="sum",
    std="std",
    count="count",
    mode=mode_aggfunc,
)

# aggregation function parameters
Expand Down Expand Up @@ -137,9 +156,11 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals
dropna,
normalize,
)
# delete empty rows and columns from table
table, comments = delete_empty_rows_columns(table)

comments: list[str] = []
# do not delete empty rows and columns from table if the aggfunc is mode
if agg_func is not mode_aggfunc:
# delete empty rows and columns from table
table, comments = delete_empty_rows_columns(table)
masks = create_crosstab_masks(
index,
columns,
Expand Down Expand Up @@ -742,7 +763,7 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals

if agg_func is not None:
# create lists with single entry for when there is only one aggfunc
count_funcs: list[str] = [AGGFUNC["count"]]
count_funcs: list[str | Callable] = [AGGFUNC["count"]]
neg_funcs: list[Callable] = [agg_negative]
pperc_funcs: list[Callable] = [agg_p_percent]
nk_funcs: list[Callable] = [agg_nk]
Expand All @@ -756,49 +777,67 @@ def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals
nk_funcs.extend([agg_nk for i in range(1, num)])
missing_funcs.extend([agg_missing for i in range(1, num)])
# threshold check- doesn't matter what we pass for value
if agg_func is mode_aggfunc:
# check that all observations dont have the same value
logger.info(
"If there are multiple modes, one of them is randomly selected and displayed."
)
masks["all-values-are-same"] = pd.crosstab( # type: ignore
index,
columns,
values,
aggfunc=agg_values_are_same,
margins=margins,
dropna=dropna,
)
else:
t_values = pd.crosstab( # type: ignore
index,
columns,
values=values,
rownames=rownames,
colnames=colnames,
aggfunc=count_funcs,
margins=margins,
margins_name=margins_name,
dropna=dropna,
normalize=normalize,
)

t_values = pd.crosstab( # type: ignore
index,
columns,
values=values,
rownames=rownames,
colnames=colnames,
aggfunc=count_funcs,
margins=margins,
margins_name=margins_name,
dropna=dropna,
normalize=normalize,
)

# drop empty columns and rows
if dropna or margins:
empty_cols_mask = t_values.sum(axis=0) == 0
empty_rows_mask = t_values.sum(axis=1) == 0
# drop empty columns and rows
if dropna or margins:
empty_cols_mask = t_values.sum(axis=0) == 0
empty_rows_mask = t_values.sum(axis=1) == 0

t_values = t_values.loc[:, ~empty_cols_mask]
t_values = t_values.loc[~empty_rows_mask, :]
t_values = t_values.loc[:, ~empty_cols_mask]
t_values = t_values.loc[~empty_rows_mask, :]

t_values = t_values < THRESHOLD
masks["threshold"] = t_values
# check for negative values -- currently unsupported
negative = pd.crosstab( # type: ignore
index, columns, values, aggfunc=neg_funcs, margins=margins
)
if negative.to_numpy().sum() > 0:
masks["negative"] = negative
# p-percent check
masks["p-ratio"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=pperc_funcs, margins=margins, dropna=dropna
)
# nk values check
masks["nk-rule"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=nk_funcs, margins=margins, dropna=dropna
)
# check for missing values -- currently unsupported
if CHECK_MISSING_VALUES:
masks["missing"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=missing_funcs, margins=margins
t_values = t_values < THRESHOLD
masks["threshold"] = t_values
# check for negative values -- currently unsupported
negative = pd.crosstab( # type: ignore
index, columns, values, aggfunc=neg_funcs, margins=margins
)
if negative.to_numpy().sum() > 0:
masks["negative"] = negative
# p-percent check
masks["p-ratio"] = pd.crosstab( # type: ignore
index,
columns,
values,
aggfunc=pperc_funcs,
margins=margins,
dropna=dropna,
)
# nk values check
masks["nk-rule"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=nk_funcs, margins=margins, dropna=dropna
)
# check for missing values -- currently unsupported
if CHECK_MISSING_VALUES:
masks["missing"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=missing_funcs, margins=margins
)
else:
# threshold check- doesn't matter what we pass for value
t_values = pd.crosstab( # type: ignore
Expand Down Expand Up @@ -908,7 +947,7 @@ def rounded_survival_table(survival_table):
return survival_table


def get_aggfunc(aggfunc: str | None) -> str | None:
def get_aggfunc(aggfunc: str | None) -> str | Callable | None:
"""Checks whether an aggregation function is allowed and returns the
appropriate function.

Expand Down Expand Up @@ -940,7 +979,7 @@ def get_aggfunc(aggfunc: str | None) -> str | None:

def get_aggfuncs(
aggfuncs: str | list[str] | None,
) -> str | list[str] | None:
) -> str | Callable | list[str | Callable] | None:
"""Checks whether a list of aggregation functions is allowed and returns
the appropriate functions.

Expand All @@ -963,7 +1002,7 @@ def get_aggfuncs(
logger.debug("aggfuncs: %s", function)
return function
if isinstance(aggfuncs, list):
functions: list[str] = []
functions: list[str | Callable] = []
for function_name in aggfuncs:
function = get_aggfunc(function_name)
if function is not None:
Expand Down Expand Up @@ -1076,6 +1115,24 @@ def agg_threshold(vals: Series) -> bool:
return vals.count() < THRESHOLD


def agg_values_are_same(vals: Series) -> bool:
    """Aggregation function that returns whether all observations in a
    group have the same value.

    Parameters
    ----------
    vals : Series
        Series to check for a single unique value.

    Returns
    -------
    bool
        Whether all the (non-NA) values are the same.
    """
    # NA values are ignored; exactly one distinct non-NA value means all
    # observations are the same (nunique() is 0 for an empty/all-NA group)
    return vals.nunique(dropna=True) == 1


def apply_suppression(
table: DataFrame, masks: dict[str, DataFrame]
) -> tuple[DataFrame, DataFrame]:
Expand Down Expand Up @@ -1147,6 +1204,7 @@ def get_table_sdc(masks: dict[str, DataFrame], suppress: bool) -> dict:
sdc["summary"]["threshold"] = 0
sdc["summary"]["p-ratio"] = 0
sdc["summary"]["nk-rule"] = 0
sdc["summary"]["all-values-are-same"] = 0
for name, mask in masks.items():
sdc["summary"][name] = int(np.nansum(mask.to_numpy()))
# positions of cells to be suppressed
Expand All @@ -1155,6 +1213,7 @@ def get_table_sdc(masks: dict[str, DataFrame], suppress: bool) -> dict:
sdc["cells"]["threshold"] = []
sdc["cells"]["p-ratio"] = []
sdc["cells"]["nk-rule"] = []
sdc["cells"]["all-values-are-same"] = []
for name, mask in masks.items():
true_positions = np.column_stack(np.where(mask.values))
for pos in true_positions:
Expand Down Expand Up @@ -1198,6 +1257,12 @@ def get_summary(sdc: dict) -> tuple[str, str]:
if sdc_summary["nk-rule"] > 0:
summary += f"nk-rule: {sdc_summary['nk-rule']} cells {sup}; "
status = "fail"
if sdc_summary["all-values-are-same"] > 0:
summary += (
f"all-values-are-same: {sdc_summary['all-values-are-same']} "
f"cells {sup}; "
)
status = "fail"
if summary != "":
summary = f"{status}; {summary}"
else:
Expand Down
Binary file added data/nursery_dataset.dta
Binary file not shown.
Loading
Loading