Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding mode as aggfunc #194

Merged
merged 5 commits into from
Feb 20, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 99 additions & 47 deletions acro/acro_tables.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line 1102 where you see if all the values in a series are unique.
sorting might take a while - could you just use https://pandas.pydata.org/docs/reference/api/pandas.Series.nunique.html

Copy link
Contributor

@jim-smith jim-smith Jan 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the levels of indentation and the nu,mbersof line in this method near where you calculate threshold masks etc., (lines 742 onwards) , I wonder if it is time to refactor some of this code into a series of functions called something like get_mask_for_aggfuncc()

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import logging
import os
import random
from collections.abc import Callable
from inspect import stack

Expand All @@ -19,12 +20,18 @@
logger = logging.getLogger("acro")


AGGFUNC: dict[str, str] = {
def mode_aggfunc(values):
modes = values.mode()
return random.choice(modes)


AGGFUNC: dict[str, str | Callable] = {
"mean": "mean",
"median": "median",
"sum": "sum",
"std": "std",
"count": "count",
"mode": mode_aggfunc,
}

# aggregation function parameters
Expand Down Expand Up @@ -136,9 +143,11 @@
dropna,
normalize,
)
# delete empty rows and columns from table
table, comments = delete_empty_rows_columns(table)

comments: list[str] = []
# do not delete empty rows and columns from table if the aggfunc is mode
if agg_func is not mode_aggfunc:
# delete empty rows and columns from table
table, comments = delete_empty_rows_columns(table)
masks = create_crosstab_masks(
index,
columns,
Expand Down Expand Up @@ -741,7 +750,7 @@

if agg_func is not None:
# create lists with single entry for when there is only one aggfunc
count_funcs: list[str] = [AGGFUNC["count"]]
count_funcs: list[str | Callable] = [AGGFUNC["count"]]
neg_funcs: list[Callable] = [agg_negative]
pperc_funcs: list[Callable] = [agg_p_percent]
nk_funcs: list[Callable] = [agg_nk]
Expand All @@ -755,49 +764,64 @@
nk_funcs.extend([agg_nk for i in range(1, num)])
missing_funcs.extend([agg_missing for i in range(1, num)])
# threshold check- doesn't matter what we pass for value
if agg_func is mode_aggfunc:
# check that all observations dont have the same value
masks["all-values-are-same"] = pd.crosstab( # type: ignore
index,
columns,
values,
aggfunc=agg_values_are_same,
margins=margins,
dropna=dropna,
)
else:
t_values = pd.crosstab( # type: ignore
index,
columns,
values=values,
rownames=rownames,
colnames=colnames,
aggfunc=count_funcs,
margins=margins,
margins_name=margins_name,
dropna=dropna,
normalize=normalize,
)

t_values = pd.crosstab( # type: ignore
index,
columns,
values=values,
rownames=rownames,
colnames=colnames,
aggfunc=count_funcs,
margins=margins,
margins_name=margins_name,
dropna=dropna,
normalize=normalize,
)

# drop empty columns and rows
if dropna or margins:
empty_cols_mask = t_values.sum(axis=0) == 0
empty_rows_mask = t_values.sum(axis=1) == 0
# drop empty columns and rows
if dropna or margins:
empty_cols_mask = t_values.sum(axis=0) == 0
empty_rows_mask = t_values.sum(axis=1) == 0

t_values = t_values.loc[:, ~empty_cols_mask]
t_values = t_values.loc[~empty_rows_mask, :]
t_values = t_values.loc[:, ~empty_cols_mask]
t_values = t_values.loc[~empty_rows_mask, :]

t_values = t_values < THRESHOLD
masks["threshold"] = t_values
# check for negative values -- currently unsupported
negative = pd.crosstab( # type: ignore
index, columns, values, aggfunc=neg_funcs, margins=margins
)
if negative.to_numpy().sum() > 0:
masks["negative"] = negative
# p-percent check
masks["p-ratio"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=pperc_funcs, margins=margins, dropna=dropna
)
# nk values check
masks["nk-rule"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=nk_funcs, margins=margins, dropna=dropna
)
# check for missing values -- currently unsupported
if CHECK_MISSING_VALUES:
masks["missing"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=missing_funcs, margins=margins
t_values = t_values < THRESHOLD
masks["threshold"] = t_values
# check for negative values -- currently unsupported
negative = pd.crosstab( # type: ignore
index, columns, values, aggfunc=neg_funcs, margins=margins
)
if negative.to_numpy().sum() > 0:
masks["negative"] = negative
# p-percent check
masks["p-ratio"] = pd.crosstab( # type: ignore
index,
columns,
values,
aggfunc=pperc_funcs,
margins=margins,
dropna=dropna,
)
# nk values check
masks["nk-rule"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=nk_funcs, margins=margins, dropna=dropna
)
# check for missing values -- currently unsupported
if CHECK_MISSING_VALUES:
masks["missing"] = pd.crosstab( # type: ignore
index, columns, values, aggfunc=missing_funcs, margins=margins
)
else:
# threshold check- doesn't matter what we pass for value
t_values = pd.crosstab( # type: ignore
Expand Down Expand Up @@ -907,7 +931,7 @@
return survival_table


def get_aggfunc(aggfunc: str | None) -> str | None:
def get_aggfunc(aggfunc: str | None) -> str | Callable | None:
"""Checks whether an aggregation function is allowed and returns the
appropriate function.

Expand Down Expand Up @@ -939,7 +963,7 @@

def get_aggfuncs(
aggfuncs: str | list[str] | None,
) -> str | list[str] | None:
) -> str | Callable | list[str | Callable] | None:
"""Checks whether a list of aggregation functions is allowed and returns
the appropriate functions.

Expand All @@ -962,7 +986,7 @@
logger.debug("aggfuncs: %s", function)
return function
if isinstance(aggfuncs, list):
functions: list[str] = []
functions: list[str | Callable] = []
for function_name in aggfuncs:
function = get_aggfunc(function_name)
if function is not None:
Expand Down Expand Up @@ -1075,6 +1099,26 @@
return vals.count() < THRESHOLD


def agg_values_are_same(vals: Series) -> bool:
"""Aggregation function that returns whether all observations having
the same value.

Parameters
----------
vals : Series
Series to calculate if all the values are the same.

Returns
-------
bool
Whether the values are the same.
"""
# order observations
sorted_vals = vals.sort_values(ascending=False)
# check x[1] ≠ x[N]
return sorted_vals.iloc[0] == sorted_vals.iloc[-1]


def apply_suppression(
table: DataFrame, masks: dict[str, DataFrame]
) -> tuple[DataFrame, DataFrame]:
Expand Down Expand Up @@ -1146,6 +1190,7 @@
sdc["summary"]["threshold"] = 0
sdc["summary"]["p-ratio"] = 0
sdc["summary"]["nk-rule"] = 0
sdc["summary"]["all-values-are-same"] = 0
for name, mask in masks.items():
sdc["summary"][name] = int(np.nansum(mask.to_numpy()))
# positions of cells to be suppressed
Expand All @@ -1154,6 +1199,7 @@
sdc["cells"]["threshold"] = []
sdc["cells"]["p-ratio"] = []
sdc["cells"]["nk-rule"] = []
sdc["cells"]["all-values-are-same"] = []
for name, mask in masks.items():
true_positions = np.column_stack(np.where(mask.values))
for pos in true_positions:
Expand Down Expand Up @@ -1197,6 +1243,12 @@
if sdc_summary["nk-rule"] > 0:
summary += f"nk-rule: {sdc_summary['nk-rule']} cells {sup}; "
status = "fail"
if sdc_summary["all-values-are-same"] > 0:
summary += (

Check warning on line 1247 in acro/acro_tables.py

View check run for this annotation

Codecov / codecov/patch

acro/acro_tables.py#L1247

Added line #L1247 was not covered by tests
f"all-values-are-same: {sdc_summary['all-values-are-same']} "
f"cells {sup}; "
)
status = "fail"

Check warning on line 1251 in acro/acro_tables.py

View check run for this annotation

Codecov / codecov/patch

acro/acro_tables.py#L1251

Added line #L1251 was not covered by tests
if summary != "":
summary = f"{status}; {summary}"
else:
Expand Down
Loading
Loading