Skip to content

Commit

Permalink
Fixing of association rule latest version (#1121)
Browse files Browse the repository at this point in the history
* Updated FPGrowth/FPMax and Association Rules to handle missing values in the input data

* Re-structure and document code

* Update unit tests

* Update CHANGELOG.md

* Modify the corresponding documentation in Jupyter notebooks

* Final modifications

* Fix association rules and corresponding tests

* Fix typos

* Fixing memory usage increase

* Fixing memory usage increase

---------

Co-authored-by: Sebastian Raschka <[email protected]>
  • Loading branch information
zazass8 and rasbt authored Jan 25, 2025
1 parent 71f2531 commit f951cbb
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2418,13 +2418,16 @@
},
{
"cell_type": "code",

"execution_count": 20,

"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [

"/tmp/ipykernel_34953/2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n",
" df.iloc[idx[i], col[i]] = np.nan\n",
"/tmp/ipykernel_34953/2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n",
Expand All @@ -2438,6 +2441,7 @@
"/tmp/ipykernel_34953/2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n",
" df.iloc[idx[i], col[i]] = np.nan\n",
"/tmp/ipykernel_34953/2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n",

" df.iloc[idx[i], col[i]] = np.nan\n"
]
},
Expand Down Expand Up @@ -2489,6 +2493,7 @@
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",

" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
Expand Down Expand Up @@ -2710,6 +2715,7 @@
]
},
"execution_count": 21,

"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -2718,6 +2724,7 @@
"frequent_itemsets = fpgrowth(df, min_support=0.6, null_values = True, use_colnames=True)\n",
"# frequent_itemsets = fpmax(df, min_support=0.6, null_values = True, use_colnames=True)\n",
"rules = association_rules(frequent_itemsets, len(df), df, null_values = True, metric=\"confidence\", min_threshold=0.8)\n",

"rules"
]
},
Expand Down
11 changes: 7 additions & 4 deletions mlxtend/frequent_patterns/association_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

def association_rules(
df: pd.DataFrame,
num_itemsets: int,
num_itemsets: Optional[int] = 1,
df_orig: Optional[pd.DataFrame] = None,
null_values=False,
metric="confidence",
Expand All @@ -54,8 +54,8 @@ def association_rules(
df_orig : pandas DataFrame (default: None)
DataFrame with original input data. Only provided when null_values exist
num_itemsets : int
Number of transactions in original input data
num_itemsets : int (default: 1)
Number of transactions in original input data (df_orig)
null_values : bool (default: False)
In case there are null values as NaNs in the original input data
Expand Down Expand Up @@ -119,6 +119,10 @@ def association_rules(
if null_values and df_orig is None:
raise TypeError("If null values exist, df_orig must be provided.")

# if null values exist, num_itemsets must be provided
if null_values and num_itemsets == 1:
raise TypeError("If null values exist, num_itemsets must be provided.")

# check for valid input
fpc.valid_input_check(df_orig, null_values)

Expand Down Expand Up @@ -285,7 +289,6 @@ def certainty_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_):
# if the input dataframe is complete
if not null_values:
disAC, disA, disC, dis_int, dis_int_ = 0, 0, 0, 0, 0
num_itemsets = 1

else:
an = list(antecedent)
Expand Down
2 changes: 1 addition & 1 deletion mlxtend/frequent_patterns/fpcommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def setup_fptree(df, min_support):
)

item_support = np.array(
np.sum(np.logical_or(df.values == 1, df.values is True), axis=0)
np.nansum(df.values, axis=0)
/ (float(num_itemsets) - np.nansum(disabled, axis=0))
)
item_support = item_support.reshape(-1)
Expand Down

0 comments on commit f951cbb

Please sign in to comment.