Let apriori always use low_memory processing
Thanks to previous optimizations, processing with low_memory=True is
now almost as efficient as with low_memory=False, and makes it
possible to process much larger datasets.

Removing the low_memory=False code path makes the code simpler and
allows itemsets to be generated lazily by a generator, which saves
even more memory.

The downside is that we do not know the number of itemsets to process
in advance, so it is displayed after processing.  Note that commit
2f928cb introduced a bug: the reported number of processed
combinations was multiplied by the itemset length, which explains why
the verbose output differs now.
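
As a quick illustration of the public API this commit touches — a
minimal sketch, with toy transactions invented for the example; after
this change, low_memory=True and low_memory=False follow the same
generator-based code path and return identical results:

```
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# Toy transactions, made up for this example.
transactions = [['milk', 'bread', 'eggs'],
                ['milk', 'bread'],
                ['bread', 'eggs'],
                ['milk', 'eggs']]
te = TransactionEncoder()
df = pd.DataFrame(te.fit(transactions).transform(transactions),
                  columns=te.columns_)

# Both flags now take the same low-memory processing path.
frequent = apriori(df, min_support=0.5, use_colnames=True, low_memory=True)
print(frequent)
```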
dbarbier committed Dec 30, 2019
1 parent fe783b5 commit 7eb928e
Showing 2 changed files with 46 additions and 129 deletions.
173 changes: 45 additions & 128 deletions mlxtend/frequent_patterns/apriori.py
@@ -61,88 +61,36 @@ def generate_new_combinations(old_combinations):
                     # early exit from for-loop skips else clause just below
                     break
             else:
-                yield from candidate
+                yield candidate
             j = j + 1


-def generate_new_combinations_low_memory(old_combinations, X, min_support,
-                                         is_sparse):
-    """
-    Generator of all combinations based on the last state of Apriori algorithm
-    Parameters
-    -----------
-    old_combinations: np.array
-        All combinations with enough support in the last step
-        Combinations are represented by a matrix.
-        Number of columns is equal to the combination size
-        of the previous step.
-        Each row represents one combination
-        and contains item type ids in the ascending order
-        ```
-               0        1
-        0      15       20
-        1      15       22
-        2      17       19
-        ```
-    X: np.array or scipy sparse matrix
-      The allowed values are either 0/1 or True/False.
-      For example,
-    ```
-        0     True False  True  True False  True
-        1     True False  True False False  True
-        2     True False  True False False False
-        3     True  True False False False False
-        4    False False  True  True  True  True
-        5    False False  True False  True  True
-        6    False False  True False  True False
-        7     True  True False False False False
-    ```
-    min_support : float (default: 0.5)
-      A float between 0 and 1 for minumum support of the itemsets returned.
-      The support is computed as the fraction
-      `transactions_where_item(s)_occur / total_transactions`.
-    is_sparse : bool True if X is sparse
-    Returns
-    -----------
-    Generator of all combinations from the last step x items
-    from the previous step. Every combination contains the
-    number of transactions where this item occurs, followed
-    by item type ids in the ascending order.
-    No combination other than generated
-    do not have a chance to get enough support
-    Examples
-    -----------
-    For usage examples, please see
-    http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
-    """
-
-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    rows_count = X.shape[0]
-    threshold = min_support * rows_count
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        if is_sparse:
-            mask_rows = X[:, old_tuple].toarray().all(axis=1)
-            X_cols = X[:, valid_items].toarray()
-            supports = X_cols[mask_rows].sum(axis=0)
-        else:
-            mask_rows = X[:, old_tuple].all(axis=1)
-            supports = X[mask_rows][:, valid_items].sum(axis=0)
-        valid_indices = (supports >= threshold).nonzero()[0]
-        for index in valid_indices:
-            yield supports[index]
-            yield from old_tuple
-            yield valid_items[index]
+def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
+    counter = 0
+    if is_sparse:
+        count = np.empty(X.shape[0], dtype=int)
+        for itemset in combin:
+            counter += 1
+            count[:] = 0
+            for item in itemset:
+                # faster than X[:, item].toarray() or X.getcol(item).indices
+                count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
+            support = np.count_nonzero(count == len(itemset))
+            if support >= min_support:
+                yield support
+                yield from itemset
+    else:
+        for itemset in combin:
+            counter += 1
+            _bools = np.ones(X.shape[0], dtype=bool)
+            for item in itemset:
+                np.logical_and(_bools, X[:, item], out=_bools)
+            support = np.count_nonzero(_bools)
+            if support >= min_support:
+                yield support
+                yield from itemset
+    # return the total of processed itemsets as last element
+    yield counter


 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
@@ -254,13 +202,15 @@ def _support(_x, _n_rows, _is_sparse):
             X = df.values
         else:
             X = df.to_coo().tocsc()
+            X.eliminate_zeros()
         is_sparse = True
     elif hasattr(df, "sparse"):
         # DataFrame with SparseArray (pandas >= 0.24)
         if df.size == 0:
             X = df.values
         else:
             X = df.sparse.to_coo().tocsc()
+            X.eliminate_zeros()
         is_sparse = True
     else:
         # dense DataFrame
@@ -271,63 +221,30 @@ def _support(_x, _n_rows, _is_sparse):
     support_dict = {1: support[support >= min_support]}
     itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))
 
     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1
 
-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            combin = generate_new_combinations_low_memory(
-                itemset_dict[max_itemset], X, min_support, is_sparse)
-            # slightly faster than creating an array from a list of tuples
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset + 1)
-
-            if combin.size == 0:
-                break
+        combin = generate_new_combinations(itemset_dict[max_itemset])
+        min_rows = np.math.ceil(min_support * X.shape[0])
+        gen_itemsets = generate_supports_and_itemsets(
+            X, is_sparse, combin, min_rows)
 
+        support_valid_itemsets = np.fromiter(gen_itemsets, dtype=int)
+        processed_itemsets = support_valid_itemsets[-1]
+        support_valid_itemsets = support_valid_itemsets[:-1]
+        if support_valid_itemsets.size > 0:
             if verbose:
                 print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            itemset_dict[next_max_itemset] = combin[:, 1:]
-            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
-                / rows_count
+                    '\rProcessed %d combinations | Sampling itemset size %d' %
+                    (processed_itemsets, next_max_itemset), end="")
+            support_valid_itemsets.shape = (-1, 1 + next_max_itemset)
+            itemset_dict[next_max_itemset] = support_valid_itemsets[:, 1:]
+            support_dict[next_max_itemset] = support_valid_itemsets[:, 0] / X.shape[0]
             max_itemset = next_max_itemset
         else:
-            combin = generate_new_combinations(itemset_dict[max_itemset])
-            combin = np.fromiter(combin, dtype=int)
-            combin = combin.reshape(-1, next_max_itemset)
-
-            if combin.size == 0:
-                break
-            if verbose:
-                print(
-                    '\rProcessing %d combinations | Sampling itemset size %d' %
-                    (combin.size, next_max_itemset), end="")
-
-            if is_sparse:
-                _bools = X[:, combin[:, 0]] == all_ones
-                for n in range(1, combin.shape[1]):
-                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
-            else:
-                _bools = np.all(X[:, combin], axis=2)
-
-            support = _support(np.array(_bools), rows_count, is_sparse)
-            _mask = (support >= min_support).reshape(-1)
-            if any(_mask):
-                itemset_dict[next_max_itemset] = np.array(combin[_mask])
-                support_dict[next_max_itemset] = np.array(support[_mask])
-                max_itemset = next_max_itemset
-            else:
-                # Exit condition
-                break
+            # Exit condition
+            break
 
     all_res = []
     for k in sorted(itemset_dict):
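To make the new counting scheme concrete, here is a minimal standalone
sketch of the same stream protocol: each frequent itemset is emitted as
its support count followed by its items, and the total number of
processed candidates comes last, so the caller can report it only after
processing. The helper name `supports_and_itemsets` and the toy matrix
are invented for this example and are not part of mlxtend:

```
import numpy as np
from scipy.sparse import csc_matrix

def supports_and_itemsets(X, itemsets, min_rows):
    # Same stream protocol as generate_supports_and_itemsets above:
    # for each frequent itemset, yield its support count followed by
    # its items; yield the number of processed candidates last.
    counter = 0
    count = np.empty(X.shape[0], dtype=int)
    for itemset in itemsets:
        counter += 1
        count[:] = 0
        for item in itemset:
            # X.indptr delimits column `item`'s slice of X.indices,
            # giving its nonzero row ids without densifying the column.
            count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
        support = np.count_nonzero(count == len(itemset))
        if support >= min_rows:
            yield support
            yield from itemset
    yield counter

# Toy data: 4 transactions over 3 items.
X = csc_matrix(np.array([[1, 1, 0],
                         [1, 1, 1],
                         [0, 1, 1],
                         [1, 0, 1]]))
flat = np.fromiter(supports_and_itemsets(X, [(0, 1), (0, 2), (1, 2)], 2),
                   dtype=int)
processed = flat[-1]                # 3 candidates were examined
records = flat[:-1].reshape(-1, 3)  # rows of [support, item, item]
print(processed, records, sep="\n")
```

Reading `X.indptr`/`X.indices` directly yields one column's nonzero row
ids without materializing a dense column, which is what the in-diff
comment means by being faster than `X[:, item].toarray()`.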
2 changes: 1 addition & 1 deletion mlxtend/frequent_patterns/tests/test_fpbase.py
@@ -229,7 +229,7 @@ def test_low_memory_flag(self):
             _ = self.fpalgo(self.df, low_memory=True, verbose=1)
 
         # Only get the last value of the stream to reduce test noise
-        expect = 'Processing 4 combinations | Sampling itemset size 3\n'
+        expect = 'Processed 1 combinations | Sampling itemset size 3\n'
         out = out.getvalue().split('\r')[-1]
         assert out == expect
     else:
