Skip to content

Commit

Permalink
fix fpmax issue (#570) with fptrees that contain no nodes (#573)
Browse files Browse the repository at this point in the history
* fix fpmax issue (#570) with fptrees that contain no nodes

* Add additional unit test for pattern mining. Also refactored tests.

* update changelog

* bump version to 0.18.0dev0

* add unit test for min_support=0.
  • Loading branch information
harenbergsd authored and rasbt committed Aug 6, 2019
1 parent ac0f0c1 commit 115278b
Show file tree
Hide file tree
Showing 9 changed files with 256 additions and 51 deletions.
25 changes: 25 additions & 0 deletions docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,31 @@ The CHANGELOG for the current development version is available at

---

### Version 0.18.0 (TBD)

##### Downloads

- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.18.0.zip)

- [Source code (tar.gz)](https://github.com/rasbt/mlxtend/archive/v0.18.0.tar.gz)

##### New Features

- -

##### Changes

- -

##### Bug Fixes

- Behavior of `fpgrowth` and `apriori` is now consistent for edge cases such as `min_support=0`. ([#573](https://github.com/rasbt/mlxtend/pull/573) via [Steve Harenberg](https://github.com/harenbergsd))
- `fpmax` now returns an empty data frame instead of raising an error if the set of frequent itemsets is empty. ([#573](https://github.com/rasbt/mlxtend/pull/573) via [Steve Harenberg](https://github.com/harenbergsd))





### Version 0.17.0 (07/19/2019)

##### Downloads
Expand Down
5 changes: 5 additions & 0 deletions mlxtend/frequent_patterns/apriori.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@ def _support(_x, _n_rows, _is_sparse):
out = (np.sum(_x, axis=0) / _n_rows)
return np.array(out).reshape(-1)

if min_support <= 0.:
raise ValueError('`min_support` must be a positive '
'number within the interval `(0, 1]`. '
'Got %s.' % min_support)

idxs = np.where((df.values != 1) & (df.values != 0))
if len(idxs[0]) > 0:
val = df.values[idxs[0][0], idxs[1][0]]
Expand Down
4 changes: 3 additions & 1 deletion mlxtend/frequent_patterns/fpcommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ def insert_itemset(self, itemset, count=1):
count : int
The number of occurrences of the itemset.
"""
self.root.count += count

if len(itemset) == 0:
return

Expand Down Expand Up @@ -162,7 +164,7 @@ def print_status(self, count, colnames):


class FPNode(object):
def __init__(self, item, count=1, parent=None):
def __init__(self, item, count=0, parent=None):
self.item = item
self.count = count
self.parent = parent
Expand Down
5 changes: 5 additions & 0 deletions mlxtend/frequent_patterns/fpgrowth.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
"""
fpc.valid_input_check(df)

if min_support <= 0.:
raise ValueError('`min_support` must be a positive '
'number within the interval `(0, 1]`. '
'Got %s.' % min_support)

colname_map = None
if use_colnames:
colname_map = {idx: item for idx, item in enumerate(df.columns)}
Expand Down
9 changes: 8 additions & 1 deletion mlxtend/frequent_patterns/fpmax.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ def fpmax(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
"""
fpc.valid_input_check(df)

if min_support <= 0.:
raise ValueError('`min_support` must be a positive '
'number within the interval `(0, 1]`. '
'Got %s.' % min_support)

colname_map = None
if use_colnames:
colname_map = {idx: item for idx, item in enumerate(df.columns)}
Expand All @@ -78,14 +83,16 @@ def fpmax_step(tree, minsup, mfit, colnames, max_len, verbose):
count = 0
items = list(tree.nodes.keys())
largest_set = sorted(tree.cond_items+items, key=mfit.rank.get)
if len(largest_set) == 0:
return
if tree.is_path():
if not mfit.contains(largest_set):
count += 1
largest_set.reverse()
mfit.cache = largest_set
mfit.insert_itemset(largest_set)
if max_len is None or len(largest_set) <= max_len:
support = min([tree.nodes[i][0].count for i in items])
support = tree.root.count
yield support, largest_set

if verbose:
Expand Down
35 changes: 28 additions & 7 deletions mlxtend/frequent_patterns/tests/test_apriori.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,36 @@

import unittest
import numpy as np
from mlxtend.frequent_patterns.tests.test_fpbase import FPTestAll
from test_fpbase import FPTestEdgeCases, FPTestErrors, \
FPTestEx1All, FPTestEx2All, FPTestEx3All
from mlxtend.frequent_patterns import apriori


def apriori_wrapper_low_memory(*args, **kwargs):
return apriori(*args, **kwargs, low_memory=True)


class TestApriori(unittest.TestCase, FPTestAll):
class TestEdgeCases(unittest.TestCase, FPTestEdgeCases):
def setUp(self):
FPTestAll.setUp(self, apriori)
FPTestEdgeCases.setUp(self, apriori)


class TestAprioriLowMemory(unittest.TestCase, FPTestAll):
class TestErrors(unittest.TestCase, FPTestErrors):
def setUp(self):
FPTestAll.setUp(self, apriori_wrapper_low_memory)
FPTestErrors.setUp(self, apriori)


class TestAprioriBinaryInput(unittest.TestCase, FPTestAll):
class TestApriori(unittest.TestCase, FPTestEx1All):
def setUp(self):
FPTestEx1All.setUp(self, apriori)


class TestAprioriLowMemory(unittest.TestCase, FPTestEx1All):
def setUp(self):
FPTestEx1All.setUp(self, apriori_wrapper_low_memory)


class TestAprioriBoolInput(unittest.TestCase, FPTestEx1All):
def setUp(self):
one_ary = np.array(
[[False, False, False, True, False, True, True, True, True,
Expand All @@ -37,4 +48,14 @@ def setUp(self):
True, True],
[False, True, False, True, True, True, False, False, True,
False, False]])
FPTestAll.setUp(self, apriori, one_ary=one_ary)
FPTestEx1All.setUp(self, apriori, one_ary=one_ary)


class TestEx2(unittest.TestCase, FPTestEx2All):
def setUp(self):
FPTestEx2All.setUp(self, apriori)


class TestEx3(unittest.TestCase, FPTestEx3All):
def setUp(self):
FPTestEx3All.setUp(self, apriori)
132 changes: 100 additions & 32 deletions mlxtend/frequent_patterns/tests/test_fpbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
from numpy.testing import assert_array_equal
from mlxtend.utils import assert_raises
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
import sys
from contextlib import contextmanager
Expand All @@ -24,30 +25,37 @@ def captured_output():
sys.stdout, sys.stderr = old_out, old_err


class FPTestBase(object):
class FPTestEdgeCases(object):
"""
Base testing class for frequent pattern mining. This class should include
setup and tests common to all methods (e.g., error for improper input)
Base class for testing edge cases for pattern mining.
"""

def setUp(self, fpalgo, one_ary=None):
if one_ary is None:
self.one_ary = np.array(
[[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
[0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
[1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
[0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])
def setUp(self, fpalgo):
self.fpalgo = fpalgo

else:
self.one_ary = one_ary
def test_empty(self):
df = pd.DataFrame([[]])
res_df = self.fpalgo(df)
expect = pd.DataFrame([], columns=['support', 'itemsets'])
compare_dataframes(res_df, expect)


class FPTestErrors(object):
"""
Base class for testing expected errors for pattern mining.
"""

def setUp(self, fpalgo):
self.one_ary = np.array(
[[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
[0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
[1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
[0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])
self.cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream',
'Kidney Beans', 'Milk',
'Nutmeg', 'Onion', 'Unicorn', 'Yogurt']

self.df = pd.DataFrame(self.one_ary, columns=self.cols)

self.fpalgo = fpalgo

def test_itemsets_type(self):
Expand Down Expand Up @@ -84,6 +92,31 @@ def test_sparsedataframe_notzero_column(self):
'`df.columns = [str(i) for i in df.columns`].',
self.fpalgo, dfs)


class FPTestEx1(object):
"""
Base class for testing frequent pattern mining on a small example.
"""

def setUp(self, fpalgo, one_ary=None):
if one_ary is None:
self.one_ary = np.array(
[[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
[0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
[1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
[0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])
else:
self.one_ary = one_ary

self.cols = ['Apple', 'Corn', 'Dill', 'Eggs', 'Ice cream',
'Kidney Beans', 'Milk',
'Nutmeg', 'Onion', 'Unicorn', 'Yogurt']

self.df = pd.DataFrame(self.one_ary, columns=self.cols)

self.fpalgo = fpalgo

def test_frozenset_selection(self):
res_df = self.fpalgo(self.df, use_colnames=True)
assert res_df.values.shape == self.fpalgo(self.df).values.shape
Expand Down Expand Up @@ -117,9 +150,9 @@ def test_with_fill_values(fill_value):
test_with_fill_values(False)


class FPTestAll(FPTestBase):
class FPTestEx1All(FPTestEx1):
def setUp(self, fpalgo, one_ary=None):
FPTestBase.setUp(self, fpalgo, one_ary=one_ary)
FPTestEx1.setUp(self, fpalgo, one_ary=one_ary)

def test_default(self):
res_df = self.fpalgo(self.df)
Expand Down Expand Up @@ -162,27 +195,62 @@ def test_low_memory_flag(self):
assert True


class FPTestMaximal(FPTestBase):
def setUp(self, fpalgo, one_ary=None):
FPTestBase.setUp(self, fpalgo, one_ary=one_ary)
class FPTestEx2(object):
"""
Base class for testing frequent pattern mining on a small example.
"""

def test_default(self):
res_df = self.fpalgo(self.df)
expect = pd.DataFrame([[0.6, frozenset([5, 6])],
[0.6, frozenset([5, 10])],
[0.6, frozenset([3, 5, 8])]],
def setUp(self):
database = [['a'], ['b'], ['c', 'd'], ['e']]
te = TransactionEncoder()
te_ary = te.fit(database).transform(database)

self.df = pd.DataFrame(te_ary, columns=te.columns_)


class FPTestEx2All(FPTestEx2):
def setUp(self, fpalgo):
self.fpalgo = fpalgo
FPTestEx2.setUp(self)

def test_output(self):
res_df = self.fpalgo(self.df, min_support=0.001, use_colnames=True)
expect = pd.DataFrame([[0.25, frozenset(['a'])],
[0.25, frozenset(['b'])],
[0.25, frozenset(['c'])],
[0.25, frozenset(['d'])],
[0.25, frozenset(['e'])],
[0.25, frozenset(['c', 'd'])]],
columns=['support', 'itemsets'])

compare_dataframes(res_df, expect)

def test_max_len(self):
res_df1 = self.fpalgo(self.df)
max_len = np.max(res_df1['itemsets'].apply(len))
assert max_len == 3

res_df2 = self.fpalgo(self.df, max_len=2)
max_len = np.max(res_df2['itemsets'].apply(len))
assert max_len == 2
class FPTestEx3(object):
"""
Base class for testing frequent pattern mining on a small example.
"""

def setUp(self):
database = [['a'], ['b'], ['c', 'd'], ['e']]
te = TransactionEncoder()
te_ary = te.fit(database).transform(database)

self.df = pd.DataFrame(te_ary, columns=te.columns_)


class FPTestEx3All(FPTestEx3):
def setUp(self, fpalgo):
self.fpalgo = fpalgo
FPTestEx3.setUp(self)

def test_output3(self):
assert_raises(ValueError,
'`min_support` must be a positive '
'number within the interval `(0, 1]`. Got 0.0.',
self.fpalgo,
self.df,
min_support=0.)


def compare_dataframes(df1, df2):
Expand Down
31 changes: 26 additions & 5 deletions mlxtend/frequent_patterns/tests/test_fpgrowth.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
import unittest
import numpy as np
from mlxtend.frequent_patterns.tests.test_fpbase import FPTestAll
from test_fpbase import FPTestEdgeCases, FPTestErrors, \
FPTestEx1All, FPTestEx2All, FPTestEx3All
from mlxtend.frequent_patterns import fpgrowth


class TestFPGrowth(unittest.TestCase, FPTestAll):
class TestEdgeCases(unittest.TestCase, FPTestEdgeCases):
def setUp(self):
FPTestAll.setUp(self, fpgrowth)
FPTestEdgeCases.setUp(self, fpgrowth)


class TestFPGrowth2(unittest.TestCase, FPTestAll):
class TestErrors(unittest.TestCase, FPTestErrors):
def setUp(self):
FPTestErrors.setUp(self, fpgrowth)


class TestEx1(unittest.TestCase, FPTestEx1All):
def setUp(self):
FPTestEx1All.setUp(self, fpgrowth)


class TestEx1BoolInput(unittest.TestCase, FPTestEx1All):
def setUp(self):
one_ary = np.array(
[[False, False, False, True, False, True, True, True, True,
Expand All @@ -22,4 +33,14 @@ def setUp(self):
True, True],
[False, True, False, True, True, True, False, False, True,
False, False]])
FPTestAll.setUp(self, fpgrowth, one_ary=one_ary)
FPTestEx1All.setUp(self, fpgrowth, one_ary=one_ary)


class TestEx2(unittest.TestCase, FPTestEx2All):
def setUp(self):
FPTestEx2All.setUp(self, fpgrowth)


class TestEx3(unittest.TestCase, FPTestEx3All):
def setUp(self):
FPTestEx3All.setUp(self, fpgrowth)
Loading

0 comments on commit 115278b

Please sign in to comment.