From 115278bac14d7fc278885c0722da03f1c3b91604 Mon Sep 17 00:00:00 2001 From: Steve Harenberg Date: Tue, 6 Aug 2019 03:44:59 -0400 Subject: [PATCH] fix fpmax issue (#570) with fptrees that contain no nodes (#573) * fix fpmax issue (#570) with fptrees that contain no nodes * Add additional unit test for pattern mining. Also refactored tests. * update changelog * bump version to 0.18.0dev0 * add unit test for min_support=0. --- docs/sources/CHANGELOG.md | 25 ++++ mlxtend/frequent_patterns/apriori.py | 5 + mlxtend/frequent_patterns/fpcommon.py | 4 +- mlxtend/frequent_patterns/fpgrowth.py | 5 + mlxtend/frequent_patterns/fpmax.py | 9 +- .../frequent_patterns/tests/test_apriori.py | 35 ++++- .../frequent_patterns/tests/test_fpbase.py | 132 +++++++++++++----- .../frequent_patterns/tests/test_fpgrowth.py | 31 +++- mlxtend/frequent_patterns/tests/test_fpmax.py | 61 +++++++- 9 files changed, 256 insertions(+), 51 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index 4e9d021d5..36c6f9656 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -7,6 +7,31 @@ The CHANGELOG for the current development version is available at --- +### Version 0.18.0 (TBD) + +##### Downloads + +- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.18.0.zip) + +- [Source code (tar.gz)](https://github.com/rasbt/mlxtend/archive/v0.18.0.tar.gz) + +##### New Features + +- - + +##### Changes + +- - + +##### Bug Fixes + +- Behavior of `fpgrowth` and `apriori` is now consistent for edge cases such as `min_support=0`. ([#573](https://github.com/rasbt/mlxtend/pull/573) via [Steve Harenberg](https://github.com/harenbergsd)) +- `fpmax` returns an empty data frame now instead of raising an error if the frequent itemset set is empty. 
([#573](https://github.com/rasbt/mlxtend/pull/573) via [Steve Harenberg](https://github.com/harenbergsd)) + + + + + ### Version 0.17.0 (07/19/2019) ##### Downloads diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py index fa28ff472..e0822c301 100644 --- a/mlxtend/frequent_patterns/apriori.py +++ b/mlxtend/frequent_patterns/apriori.py @@ -143,6 +143,11 @@ def _support(_x, _n_rows, _is_sparse): out = (np.sum(_x, axis=0) / _n_rows) return np.array(out).reshape(-1) + if min_support <= 0.: + raise ValueError('`min_support` must be a positive ' + 'number within the interval `(0, 1]`. ' + 'Got %s.' % min_support) + idxs = np.where((df.values != 1) & (df.values != 0)) if len(idxs[0]) > 0: val = df.values[idxs[0][0], idxs[1][0]] diff --git a/mlxtend/frequent_patterns/fpcommon.py b/mlxtend/frequent_patterns/fpcommon.py index 02ce50711..b1bca1b97 100644 --- a/mlxtend/frequent_patterns/fpcommon.py +++ b/mlxtend/frequent_patterns/fpcommon.py @@ -123,6 +123,8 @@ def insert_itemset(self, itemset, count=1): count : int The number of occurrences of the itemset. """ + self.root.count += count + if len(itemset) == 0: return @@ -162,7 +164,7 @@ def print_status(self, count, colnames): class FPNode(object): - def __init__(self, item, count=1, parent=None): + def __init__(self, item, count=0, parent=None): self.item = item self.count = count self.parent = parent diff --git a/mlxtend/frequent_patterns/fpgrowth.py b/mlxtend/frequent_patterns/fpgrowth.py index aad2778c5..2920b6384 100644 --- a/mlxtend/frequent_patterns/fpgrowth.py +++ b/mlxtend/frequent_patterns/fpgrowth.py @@ -60,6 +60,11 @@ def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): """ fpc.valid_input_check(df) + if min_support <= 0.: + raise ValueError('`min_support` must be a positive ' + 'number within the interval `(0, 1]`. ' + 'Got %s.' 
% min_support) + colname_map = None if use_colnames: colname_map = {idx: item for idx, item in enumerate(df.columns)} diff --git a/mlxtend/frequent_patterns/fpmax.py b/mlxtend/frequent_patterns/fpmax.py index 1294d94b1..e7140c913 100644 --- a/mlxtend/frequent_patterns/fpmax.py +++ b/mlxtend/frequent_patterns/fpmax.py @@ -61,6 +61,11 @@ def fpmax(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): """ fpc.valid_input_check(df) + if min_support <= 0.: + raise ValueError('`min_support` must be a positive ' + 'number within the interval `(0, 1]`. ' + 'Got %s.' % min_support) + colname_map = None if use_colnames: colname_map = {idx: item for idx, item in enumerate(df.columns)} @@ -78,6 +83,8 @@ def fpmax_step(tree, minsup, mfit, colnames, max_len, verbose): count = 0 items = list(tree.nodes.keys()) largest_set = sorted(tree.cond_items+items, key=mfit.rank.get) + if len(largest_set) == 0: + return if tree.is_path(): if not mfit.contains(largest_set): count += 1 @@ -85,7 +92,7 @@ def fpmax_step(tree, minsup, mfit, colnames, max_len, verbose): mfit.cache = largest_set mfit.insert_itemset(largest_set) if max_len is None or len(largest_set) <= max_len: - support = min([tree.nodes[i][0].count for i in items]) + support = tree.root.count yield support, largest_set if verbose: diff --git a/mlxtend/frequent_patterns/tests/test_apriori.py b/mlxtend/frequent_patterns/tests/test_apriori.py index 16e80a025..9bf1cd1c4 100644 --- a/mlxtend/frequent_patterns/tests/test_apriori.py +++ b/mlxtend/frequent_patterns/tests/test_apriori.py @@ -6,7 +6,8 @@ import unittest import numpy as np -from mlxtend.frequent_patterns.tests.test_fpbase import FPTestAll +from test_fpbase import FPTestEdgeCases, FPTestErrors, \ + FPTestEx1All, FPTestEx2All, FPTestEx3All from mlxtend.frequent_patterns import apriori @@ -14,17 +15,27 @@ def apriori_wrapper_low_memory(*args, **kwargs): return apriori(*args, **kwargs, low_memory=True) -class TestApriori(unittest.TestCase, FPTestAll): +class 
TestEdgeCases(unittest.TestCase, FPTestEdgeCases): def setUp(self): - FPTestAll.setUp(self, apriori) + FPTestEdgeCases.setUp(self, apriori) -class TestAprioriLowMemory(unittest.TestCase, FPTestAll): +class TestErrors(unittest.TestCase, FPTestErrors): def setUp(self): - FPTestAll.setUp(self, apriori_wrapper_low_memory) + FPTestErrors.setUp(self, apriori) -class TestAprioriBinaryInput(unittest.TestCase, FPTestAll): +class TestApriori(unittest.TestCase, FPTestEx1All): + def setUp(self): + FPTestEx1All.setUp(self, apriori) + + +class TestAprioriLowMemory(unittest.TestCase, FPTestEx1All): + def setUp(self): + FPTestEx1All.setUp(self, apriori_wrapper_low_memory) + + +class TestAprioriBoolInput(unittest.TestCase, FPTestEx1All): def setUp(self): one_ary = np.array( [[False, False, False, True, False, True, True, True, True, @@ -37,4 +48,14 @@ def setUp(self): True, True], [False, True, False, True, True, True, False, False, True, False, False]]) - FPTestAll.setUp(self, apriori, one_ary=one_ary) + FPTestEx1All.setUp(self, apriori, one_ary=one_ary) + + +class TestEx2(unittest.TestCase, FPTestEx2All): + def setUp(self): + FPTestEx2All.setUp(self, apriori) + + +class TestEx3(unittest.TestCase, FPTestEx3All): + def setUp(self): + FPTestEx3All.setUp(self, apriori) diff --git a/mlxtend/frequent_patterns/tests/test_fpbase.py b/mlxtend/frequent_patterns/tests/test_fpbase.py index ec69463c5..c0500c775 100644 --- a/mlxtend/frequent_patterns/tests/test_fpbase.py +++ b/mlxtend/frequent_patterns/tests/test_fpbase.py @@ -7,6 +7,7 @@ import numpy as np from numpy.testing import assert_array_equal from mlxtend.utils import assert_raises +from mlxtend.preprocessing import TransactionEncoder import pandas as pd import sys from contextlib import contextmanager @@ -24,30 +25,37 @@ def captured_output(): sys.stdout, sys.stderr = old_out, old_err -class FPTestBase(object): +class FPTestEdgeCases(object): """ - Base testing class for frequent pattern mining. 
This class should include - setup and tests common to all methods (e.g., error for improper input) + Base class for testing edge cases for pattern mining. """ - def setUp(self, fpalgo, one_ary=None): - if one_ary is None: - self.one_ary = np.array( - [[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1], - [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1], - [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0], - [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1], - [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]]) + def setUp(self, fpalgo): + self.fpalgo = fpalgo - else: - self.one_ary = one_ary + def test_empty(self): + df = pd.DataFrame([[]]) + res_df = self.fpalgo(df) + expect = pd.DataFrame([], columns=['support', 'itemsets']) + compare_dataframes(res_df, expect) + +class FPTestErrors(object): + """ + Base class for testing expected errors for pattern mining. + """ + + def setUp(self, fpalgo): + self.one_ary = np.array( + [[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1], + [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1], + [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]]) self.cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', 'Kidney Beans', 'Milk', 'Nutmeg', 'Onion', 'Unicorn', 'Yogurt'] - self.df = pd.DataFrame(self.one_ary, columns=self.cols) - self.fpalgo = fpalgo def test_itemsets_type(self): @@ -84,6 +92,31 @@ def test_sparsedataframe_notzero_column(self): '`df.columns = [str(i) for i in df.columns`].', self.fpalgo, dfs) + +class FPTestEx1(object): + """ + Base class for testing frequent pattern mining on a small example. 
+ """ + + def setUp(self, fpalgo, one_ary=None): + if one_ary is None: + self.one_ary = np.array( + [[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1], + [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1], + [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]]) + else: + self.one_ary = one_ary + + self.cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream', + 'Kidney Beans', 'Milk', + 'Nutmeg', 'Onion', 'Unicorn', 'Yogurt'] + + self.df = pd.DataFrame(self.one_ary, columns=self.cols) + + self.fpalgo = fpalgo + def test_frozenset_selection(self): res_df = self.fpalgo(self.df, use_colnames=True) assert res_df.values.shape == self.fpalgo(self.df).values.shape @@ -117,9 +150,9 @@ def test_with_fill_values(fill_value): test_with_fill_values(False) -class FPTestAll(FPTestBase): +class FPTestEx1All(FPTestEx1): def setUp(self, fpalgo, one_ary=None): - FPTestBase.setUp(self, fpalgo, one_ary=one_ary) + FPTestEx1.setUp(self, fpalgo, one_ary=one_ary) def test_default(self): res_df = self.fpalgo(self.df) @@ -162,27 +195,62 @@ def test_low_memory_flag(self): assert True -class FPTestMaximal(FPTestBase): - def setUp(self, fpalgo, one_ary=None): - FPTestBase.setUp(self, fpalgo, one_ary=one_ary) +class FPTestEx2(object): + """ + Base class for testing frequent pattern mining on a small example. 
+ """ - def test_default(self): - res_df = self.fpalgo(self.df) - expect = pd.DataFrame([[0.6, frozenset([5, 6])], - [0.6, frozenset([5, 10])], - [0.6, frozenset([3, 5, 8])]], + def setUp(self): + database = [['a'], ['b'], ['c', 'd'], ['e']] + te = TransactionEncoder() + te_ary = te.fit(database).transform(database) + + self.df = pd.DataFrame(te_ary, columns=te.columns_) + + +class FPTestEx2All(FPTestEx2): + def setUp(self, fpalgo): + self.fpalgo = fpalgo + FPTestEx2.setUp(self) + + def test_output(self): + res_df = self.fpalgo(self.df, min_support=0.001, use_colnames=True) + expect = pd.DataFrame([[0.25, frozenset(['a'])], + [0.25, frozenset(['b'])], + [0.25, frozenset(['c'])], + [0.25, frozenset(['d'])], + [0.25, frozenset(['e'])], + [0.25, frozenset(['c', 'd'])]], columns=['support', 'itemsets']) compare_dataframes(res_df, expect) - def test_max_len(self): - res_df1 = self.fpalgo(self.df) - max_len = np.max(res_df1['itemsets'].apply(len)) - assert max_len == 3 - res_df2 = self.fpalgo(self.df, max_len=2) - max_len = np.max(res_df2['itemsets'].apply(len)) - assert max_len == 2 +class FPTestEx3(object): + """ + Base class for testing frequent pattern mining on a small example. + """ + + def setUp(self): + database = [['a'], ['b'], ['c', 'd'], ['e']] + te = TransactionEncoder() + te_ary = te.fit(database).transform(database) + + self.df = pd.DataFrame(te_ary, columns=te.columns_) + + +class FPTestEx3All(FPTestEx3): + def setUp(self, fpalgo): + self.fpalgo = fpalgo + FPTestEx3.setUp(self) + + def test_output3(self): + assert_raises(ValueError, + '`min_support` must be a positive ' + 'number within the interval `(0, 1]`. Got 0.0.', + self.fpalgo, + self.df, + min_support=0.) 
def compare_dataframes(df1, df2): diff --git a/mlxtend/frequent_patterns/tests/test_fpgrowth.py b/mlxtend/frequent_patterns/tests/test_fpgrowth.py index 4c4b69fc3..463c1ce1b 100644 --- a/mlxtend/frequent_patterns/tests/test_fpgrowth.py +++ b/mlxtend/frequent_patterns/tests/test_fpgrowth.py @@ -1,15 +1,26 @@ import unittest import numpy as np -from mlxtend.frequent_patterns.tests.test_fpbase import FPTestAll +from test_fpbase import FPTestEdgeCases, FPTestErrors, \ + FPTestEx1All, FPTestEx2All, FPTestEx3All from mlxtend.frequent_patterns import fpgrowth -class TestFPGrowth(unittest.TestCase, FPTestAll): +class TestEdgeCases(unittest.TestCase, FPTestEdgeCases): def setUp(self): - FPTestAll.setUp(self, fpgrowth) + FPTestEdgeCases.setUp(self, fpgrowth) -class TestFPGrowth2(unittest.TestCase, FPTestAll): +class TestErrors(unittest.TestCase, FPTestErrors): + def setUp(self): + FPTestErrors.setUp(self, fpgrowth) + + +class TestEx1(unittest.TestCase, FPTestEx1All): + def setUp(self): + FPTestEx1All.setUp(self, fpgrowth) + + +class TestEx1BoolInput(unittest.TestCase, FPTestEx1All): def setUp(self): one_ary = np.array( [[False, False, False, True, False, True, True, True, True, @@ -22,4 +33,14 @@ def setUp(self): True, True], [False, True, False, True, True, True, False, False, True, False, False]]) - FPTestAll.setUp(self, fpgrowth, one_ary=one_ary) + FPTestEx1All.setUp(self, fpgrowth, one_ary=one_ary) + + +class TestEx2(unittest.TestCase, FPTestEx2All): + def setUp(self): + FPTestEx2All.setUp(self, fpgrowth) + + +class TestEx3(unittest.TestCase, FPTestEx3All): + def setUp(self): + FPTestEx3All.setUp(self, fpgrowth) diff --git a/mlxtend/frequent_patterns/tests/test_fpmax.py b/mlxtend/frequent_patterns/tests/test_fpmax.py index 3151c889b..949fa5bc9 100644 --- a/mlxtend/frequent_patterns/tests/test_fpmax.py +++ b/mlxtend/frequent_patterns/tests/test_fpmax.py @@ -1,15 +1,46 @@ import unittest +import pandas as pd import numpy as np -from 
mlxtend.frequent_patterns.tests.test_fpbase import FPTestMaximal from mlxtend.frequent_patterns import fpmax +from test_fpbase import FPTestEdgeCases, FPTestErrors, FPTestEx1, FPTestEx2, \ + FPTestEx3All +from test_fpbase import compare_dataframes -class TestFPMax(unittest.TestCase, FPTestMaximal): +class TestEdgeCases(unittest.TestCase, FPTestEdgeCases): def setUp(self): - FPTestMaximal.setUp(self, fpmax) + FPTestEdgeCases.setUp(self, fpmax) -class TestFPMax2(unittest.TestCase, FPTestMaximal): +class TestErrors(unittest.TestCase, FPTestErrors): + def setUp(self): + FPTestErrors.setUp(self, fpmax) + + +class TestEx1(unittest.TestCase, FPTestEx1): + def setUp(self, one_ary=None): + FPTestEx1.setUp(self, fpmax, one_ary=one_ary) + + def test_default(self): + res_df = fpmax(self.df) + expect = pd.DataFrame([[0.6, frozenset([5, 6])], + [0.6, frozenset([5, 10])], + [0.6, frozenset([3, 5, 8])]], + columns=['support', 'itemsets']) + + compare_dataframes(res_df, expect) + + def test_max_len(self): + res_df1 = fpmax(self.df) + max_len = np.max(res_df1['itemsets'].apply(len)) + assert max_len == 3 + + res_df2 = fpmax(self.df, max_len=2) + max_len = np.max(res_df2['itemsets'].apply(len)) + assert max_len == 2 + + +class TestEx1BoolInput(TestEx1): def setUp(self): one_ary = np.array( [[False, False, False, True, False, True, True, True, True, @@ -22,4 +53,24 @@ def setUp(self): True, True], [False, True, False, True, True, True, False, False, True, False, False]]) - FPTestMaximal.setUp(self, fpmax, one_ary=one_ary) + FPTestEx1.setUp(self, fpmax, one_ary=one_ary) + + +class TestEx2(unittest.TestCase, FPTestEx2): + def setUp(self): + FPTestEx2.setUp(self) + + def test_output(self): + res_df = fpmax(self.df, min_support=0.001, use_colnames=True) + expect = pd.DataFrame([[0.25, frozenset(['a'])], + [0.25, frozenset(['b'])], + [0.25, frozenset(['c', 'd'])], + [0.25, frozenset(['e'])]], + columns=['support', 'itemsets']) + + compare_dataframes(res_df, expect) + + +class 
TestEx3(unittest.TestCase, FPTestEx3All): + def setUp(self): + FPTestEx3All.setUp(self, fpmax)