Fixing of association rule lastest version (#1121)

* Updated FPGrowth/FPMax and Association Rules with the existence of missing values * Re-structure and document code * Update unit tests * Update CHANGELOG.md * Modify the corresponding documentation in Jupyter notebooks * Final modifications * Fix association rules and corresponding tests * Fix typos * Fixing memory usage increase * Fixing memory usage increase --------- Co-authored-by: Sebastian Raschka <[email protected]>
rasbt · Jan 25, 2025 · f951cbb · f951cbb
1 parent 71f2531
commit f951cbb
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 5 deletions.
diff --git a/docs/sources/user_guide/frequent_patterns/association_rules.ipynb b/docs/sources/user_guide/frequent_patterns/association_rules.ipynb
@@ -2418,13 +2418,16 @@
   },
   {
    "cell_type": "code",
+
    "execution_count": 20,
+
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+
       "/tmp/ipykernel_34953/2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n",
       "  df.iloc[idx[i], col[i]] = np.nan\n",
       "/tmp/ipykernel_34953/2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n",
@@ -2438,6 +2441,7 @@
       "/tmp/ipykernel_34953/2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n",
       "  df.iloc[idx[i], col[i]] = np.nan\n",
       "/tmp/ipykernel_34953/2823279667.py:23: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.\n",
+
       "  df.iloc[idx[i], col[i]] = np.nan\n"
      ]
     },
@@ -2489,6 +2493,7 @@
        "      <td>True</td>\n",
        "      <td>False</td>\n",
        "      <td>NaN</td>\n",
+
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
@@ -2710,6 +2715,7 @@
       ]
      },
      "execution_count": 21,
+
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2718,6 +2724,7 @@
     "frequent_itemsets = fpgrowth(df, min_support=0.6, null_values = True, use_colnames=True)\n",
     "# frequent_itemsets = fpmax(df, min_support=0.6, null_values = True, use_colnames=True)\n",
     "rules = association_rules(frequent_itemsets, len(df), df, null_values = True, metric=\"confidence\", min_threshold=0.8)\n",
+
     "rules"
    ]
   },

diff --git a/mlxtend/frequent_patterns/association_rules.py b/mlxtend/frequent_patterns/association_rules.py
@@ -34,7 +34,7 @@
 
 def association_rules(
     df: pd.DataFrame,
-    num_itemsets: int,
+    num_itemsets: Optional[int] = 1,
     df_orig: Optional[pd.DataFrame] = None,
     null_values=False,
     metric="confidence",
@@ -54,8 +54,8 @@ def association_rules(
     df_orig : pandas DataFrame (default: None)
       DataFrame with original input data. Only provided when null_values exist
 
-    num_itemsets : int
-      Number of transactions in original input data
+    num_itemsets : int (default: 1)
+      Number of transactions in original input data (df_orig)
 
     null_values : bool (default: False)
       In case there are null values as NaNs in the original input data
@@ -119,6 +119,10 @@ def association_rules(
     if null_values and df_orig is None:
         raise TypeError("If null values exist, df_orig must be provided.")
 
+    # if null values exist, num_itemsets must be provided
+    if null_values and num_itemsets == 1:
+        raise TypeError("If null values exist, num_itemsets must be provided.")
+
     # check for valid input
     fpc.valid_input_check(df_orig, null_values)
 
@@ -285,7 +289,6 @@ def certainty_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_):
                         # if the input dataframe is complete
                         if not null_values:
                             disAC, disA, disC, dis_int, dis_int_ = 0, 0, 0, 0, 0
-                            num_itemsets = 1
 
                         else:
                             an = list(antecedent)

diff --git a/mlxtend/frequent_patterns/fpcommon.py b/mlxtend/frequent_patterns/fpcommon.py
@@ -31,7 +31,7 @@ def setup_fptree(df, min_support):
     )
 
     item_support = np.array(
-        np.sum(np.logical_or(df.values == 1, df.values is True), axis=0)
+        np.nansum(df.values, axis=0)
         / (float(num_itemsets) - np.nansum(disabled, axis=0))
     )
     item_support = item_support.reshape(-1)