From 4dfa8a542b5575825115d9d69dd5e74318a847ed Mon Sep 17 00:00:00 2001 From: JaredSchwartz <75581425+JaredSchwartz@users.noreply.github.com> Date: Sun, 27 Oct 2024 16:20:22 -0600 Subject: [PATCH] Optimize memory usage with bitmatrix operations --- src/RuleMiner.jl | 1 + src/association_rules/apriori.jl | 3 +- src/data_structures/txnutils.jl | 40 ++++++++++ src/itemsets/closed/carpenter.jl | 65 ++++++---------- src/itemsets/closed/charm.jl | 90 +++++++++++------------ src/itemsets/closed/fpclose.jl | 11 +-- src/itemsets/closed/lcm.jl | 80 +++++++++----------- src/itemsets/frequent/eclat.jl | 44 +++-------- src/itemsets/frequent/fpgrowth.jl | 12 +-- src/itemsets/itemset_utils.jl | 118 ++++++++++++++++++++++++++++++ src/itemsets/maximal/fpmax.jl | 12 +-- src/itemsets/maximal/genmax.jl | 98 ++++++++++++------------- 12 files changed, 325 insertions(+), 249 deletions(-) create mode 100644 src/itemsets/itemset_utils.jl diff --git a/src/RuleMiner.jl b/src/RuleMiner.jl index 144bdc9..bf2f6cc 100644 --- a/src/RuleMiner.jl +++ b/src/RuleMiner.jl @@ -62,6 +62,7 @@ export apriori include("itemsets/frequent/eclat.jl") include("itemsets/frequent/fpgrowth.jl") include("itemsets/frequent/recovery.jl") +include("itemsets/itemset_utils.jl") export eclat export fpgrowth diff --git a/src/association_rules/apriori.jl b/src/association_rules/apriori.jl index c1bd135..ccee7a9 100644 --- a/src/association_rules/apriori.jl +++ b/src/association_rules/apriori.jl @@ -92,8 +92,7 @@ function apriori(txns::Transactions, min_support::Union{Int,Float64}, min_confid basenum = vec(count(txns.matrix, dims=1)) min_support = min_support isa Float64 ? 
ceil(Int, min_support * n_transactions) : min_support - items = findall(basenum .>= min_support) - subtxns = BitMatrix(txns.matrix[:, items]) + subtxns, items = RuleMiner.prune_matrix(txns.matrix,min_support) rules = Vector{Arule}() initials = Vector{Arule}() diff --git a/src/data_structures/txnutils.jl b/src/data_structures/txnutils.jl index dc3aeab..dd99bca 100644 --- a/src/data_structures/txnutils.jl +++ b/src/data_structures/txnutils.jl @@ -299,4 +299,44 @@ function txns_to_df(txns::SeqTxns, index::Bool = true)::DataFrame end return df +end + +""" + prune_matrix(matrix::SparseMatrixCSC, min_support::Int) -> Tuple{BitMatrix, Vector{Int}} + +Filter and sort sparse matrix columns based on minimum support threshold. + +# Arguments +- `matrix::SparseMatrixCSC`: A sparse boolean matrix where rows represent transactions and columns + represent items. A true value at position (i,j) indicates item j is present in transaction i. +- `min_support::Int`: The minimum absolute support threshold. Columns with fewer than this number + of true values will be removed. + +# Returns +A tuple containing: +- `BitMatrix`: A pruned copy of the matrix (not a view) containing only the frequent columns in sorted order, converted to a BitMatrix; rows with no true value in any remaining column are dropped +- `Vector{Int}`: A vector of column indices corresponding to the frequent columns, sorted by their sums in ascending order + +# Description +This helper function performs two key preprocessing steps for frequent pattern mining: +1. Removes infrequent columns (pruning): Filters out columns whose sum is less than the minimum + support threshold +2. Sorts columns by frequency: Reorders the remaining columns based on their sums in ascending order + +The pruned matrix is returned as a BitMatrix for efficient boolean operations in pattern mining algorithms. Rows that contain none of the remaining items are removed from it, so it may have fewer rows than the input matrix. 
+ +# Example +```julia +txns = Txns(sparse([1 1 0; 1 0 1; 0 1 1]), ["A", "B", "C"], ["I1", "I2", "I3"]) +matrix, indices = prune_matrix(txns.matrix, 2) +``` +""" +function prune_matrix(matrix::SparseMatrixCSC, min_support::Int) + supports = sum(matrix, dims=1) + sorted_items = [i for i in axes(matrix,2) if supports[1,i] >= min_support] + sort!(sorted_items, by= x -> supports[1,x]) + + matrix = view(matrix,:, sorted_items) |> BitMatrix + + return matrix[vec(any(matrix, dims=2)), :], sorted_items end \ No newline at end of file diff --git a/src/itemsets/closed/carpenter.jl b/src/itemsets/closed/carpenter.jl index d7a9708..e96db19 100644 --- a/src/itemsets/closed/carpenter.jl +++ b/src/itemsets/closed/carpenter.jl @@ -66,14 +66,8 @@ function carpenter(txns::Transactions, min_support::Union{Int,Float64}) # Handle min_support as a float value min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support - # Create tidsets (transaction ID sets) for each item - tidsets = [BitSet(findall(txns.matrix[:,col])) for col in 1:n_items] - supports = vec(sum(txns.matrix, dims=1)) - - # Create vectors of all items and all frequent items for mining - allitems = collect(1:n_items) - frequent_items = findall(supports .>= min_support) - + matrix, sorted_items = RuleMiner.prune_matrix(txns.matrix, min_support) + # Initialize results dictionary and threading lock Results = Dict{Vector{Int}, Int}() ThreadLock = ReentrantLock() @@ -82,50 +76,39 @@ function carpenter(txns::Transactions, min_support::Union{Int,Float64}) # Pruning 3: Early return if itemset is already present in the output haskey(closed_itemsets, X) && return - # Find transactions with the itemset and calculate support - tidset_X = length(X) == 1 ? tidsets[X[1]] : intersect(tidsets[X]...) 
- support_X = length(tidset_X) + # Get closure of current itemset and map back to original indices + X_pos = Vector{Int}(findall(in(X), sorted_items)) + closed_pos = RuleMiner.closure(matrix, X_pos) + closed = sorted_items[closed_pos] - # Pruning 1: Early return if the itemset is not frequent - support_X < min_support && return - - # Pruning 2: Find items that can be added without changing support - Y = filter(i -> length(intersect(tidset_X, tidsets[i])) == support_X, R) - - # Add X to itemsets if it's closed (Y is empty) - if isempty(Y) - lock(Lock) do - closed_itemsets[X] = support_X - end - # If Y is not empty, add the itemset's closure (X ∪ Y) - else + # Calculate support + rows = vec(all(view(matrix, :, X_pos), dims=2)) + support = count(rows) + + # Pruning 1: Early return if not frequent + support < min_support && return + + # Pruning 2: Add closure to results if not empty + if !isempty(closed) lock(Lock) do - closed_itemsets[sort(vcat(X, Y))] = support_X + closed_itemsets[closed] = support end end # Recursive enumeration - for i in setdiff(R, Y) - carpenter!(closed_itemsets, sort(vcat(X, i)), setdiff(R, [i]), Lock) + remaining = filter(i -> i ∉ closed, R) + for i in remaining + carpenter!(closed_itemsets, sort(vcat(X, i)), filter(>(i), remaining), Lock) end end - + # Parallel Processing of initial itemsets @sync begin - for item in frequent_items - Threads.@spawn carpenter!(Results, [item], setdiff(allitems, [item]), ThreadLock) + for item in sorted_items + remaining_items = filter(x -> x > item, sorted_items) + Threads.@spawn carpenter!(Results, [item], remaining_items, ThreadLock) end end - # Create the result DataFrame - result_df = DataFrame( - Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(Results)], - Support = [support / n_transactions for support in values(Results)], - N = collect(values(Results)), - Length = [length(itemset) for itemset in keys(Results)] - ) - - # Sort results by support in descending order - sort!(result_df, :N, 
rev=true) - return result_df + return RuleMiner.make_itemset_df(Results, txns) end \ No newline at end of file diff --git a/src/itemsets/closed/charm.jl b/src/itemsets/closed/charm.jl index 31c692f..4377d00 100644 --- a/src/itemsets/closed/charm.jl +++ b/src/itemsets/closed/charm.jl @@ -58,70 +58,72 @@ Zaki, Mohammed, and Ching-Jui Hsiao. “CHARM: An Efficient Algorithm for Closed """ function charm(txns::Transactions, min_support::Union{Int,Float64})::DataFrame n_transactions, n_items = size(txns.matrix) - + # Handle min_support as a float value min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support - - # Create tidsets (transaction ID sets) for each item - tidsets = [BitSet(findall(txns.matrix[:,col])) for col in 1:n_items] - supports = vec(sum(txns.matrix,dims=1)) - # Sort items by support in ascending order, keeping only frequent items - item_order = sort(findall(s -> s >= min_support, supports), by=i -> supports[i]) + # Get pruned matrix and sorted items + matrix, sorted_items = RuleMiner.prune_matrix(txns.matrix, min_support) # Initialize results dictionary and threading lock Results = Dict{Vector{Int}, Int}() ThreadLock = ReentrantLock() - function charm!(closed_itemsets::Dict{Vector{Int}, Int}, prefix::Vector{Int}, eq_class::Vector{Int}) - for (i, item) in enumerate(eq_class) - + function charm!(closed_itemsets::Dict{Vector{Int}, Int}, prefix::Vector{Int}, eq_class::Vector{Int}, rows::BitVector) + for (i, pos) in enumerate(eq_class) # Create new itemset by adding current item to prefix - new_itemset = vcat(prefix, item) - new_tidset = intersect(tidsets[new_itemset]...) 
- support = length(new_tidset) + new_itemset = vcat(prefix, pos) + new_rows = rows .& matrix[:, pos] + support = count(new_rows) # Skip infrequent itemsets support < min_support && continue + # Initialize new equivalence class new_eq_class = Int[] + + # Process remaining items in current equivalence class for j in (i+1):length(eq_class) - - # Generate itemset, tidset, and support for new items in the next eq class - other_item = eq_class[j] - other_tidset = intersect(new_tidset, tidsets[other_item]) - other_support = length(other_tidset) + other_pos = eq_class[j] + + # Calculate intersection with the other item + other_rows = new_rows .& matrix[:, other_pos] + other_support = count(other_rows) # Skip infrequent items other_support < min_support && continue - + if support == other_support # If supports are equal, add item to current itemset - push!(new_itemset, other_item) + push!(new_itemset, other_pos) else - # Otherwise, add to new equivalence class for further processing - push!(new_eq_class, other_item) + # Otherwise, add to new equivalence class + push!(new_eq_class, other_pos) end end - # Update closed itemsets list, ensuring thread safety + # Map positions back to original item indices + orig_itemset = sorted_items[new_itemset] + + # Update closed itemsets list with thread safety lock(ThreadLock) do - update_closed_itemsets!(closed_itemsets, new_itemset, support) + update_closed_itemsets!(closed_itemsets, orig_itemset, support) end # Recursively process new equivalence class if non-empty - !isempty(new_eq_class) && charm!(closed_itemsets, new_itemset, new_eq_class) + !isempty(new_eq_class) && charm!(closed_itemsets, new_itemset, new_eq_class, new_rows) end end - + # Helper function to update closed itemsets - function update_closed_itemsets!(closed_itemsets, new_itemset, support) + function update_closed_itemsets!(closed_itemsets::Dict{Vector{Int}, Int}, new_itemset::Vector{Int}, support::Int) new_set = Set(new_itemset) + + # Check against existing closed 
itemsets for (existing_itemset, existing_support) in closed_itemsets - # Only compare itemsets with equal support support != existing_support && continue - + existing_set = Set(existing_itemset) # If new itemset is a subset of an existing one, it's not closed @@ -137,27 +139,25 @@ function charm(txns::Transactions, min_support::Union{Int,Float64})::DataFrame closed_itemsets[new_itemset] = support end - # Add single-item frequent itemsets - for item in item_order - Results[[item]] = supports[item] + # Process single items and add to results + for (pos, item) in enumerate(sorted_items) + Results[[item]] = count(matrix[:, pos]) end # Parallel processing of top-level equivalence classes @sync begin - for (i, item) in enumerate(item_order) - Threads.@spawn charm!(Results, [item], item_order[i+1:end]) + for (i, pos) in enumerate(1:length(sorted_items)) + Threads.@spawn begin + # Get initial rows for this item + initial_rows = matrix[:, pos] + + # Only process if it meets minimum support + if count(initial_rows) >= min_support + charm!(Results, [pos], collect((i+1):length(sorted_items)), initial_rows) + end + end end end - # Create the result DataFrame - result_df = DataFrame( - Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(Results)], - Support = [support / n_transactions for support in values(Results)], - N = collect(values(Results)), - Length = [length(itemset) for itemset in keys(Results)] - ) - - # Sort results by support in descending order - sort!(result_df, :N, rev=true) - return result_df + return RuleMiner.make_itemset_df(Results, txns) end \ No newline at end of file diff --git a/src/itemsets/closed/fpclose.jl b/src/itemsets/closed/fpclose.jl index c188a90..51fe44d 100644 --- a/src/itemsets/closed/fpclose.jl +++ b/src/itemsets/closed/fpclose.jl @@ -126,14 +126,5 @@ function fpclose(data::Union{Transactions,FPTree}, min_support::Union{Int,Float6 # Start the mining process fpclose!(Results, tree, Int[], min_support) - df = DataFrame( - Itemset = 
[data.colkeys[itemset] for itemset in keys(Results)], - Support = [support / n_transactions for support in values(Results)], - N = collect(values(Results)), - Length = [length(itemset) for itemset in keys(Results)] - ) - - sort!(df, [:Length, :N], rev=[false, true]) - - return df + return RuleMiner.make_itemset_df(Results, data) end \ No newline at end of file diff --git a/src/itemsets/closed/lcm.jl b/src/itemsets/closed/lcm.jl index 83774e0..2f36cb9 100644 --- a/src/itemsets/closed/lcm.jl +++ b/src/itemsets/closed/lcm.jl @@ -66,69 +66,57 @@ function LCM(txns::Transactions, min_support::Union{Int,Float64})::DataFrame # Handle min_support as a float value min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support - # Create tidsets (transaction ID sets) for each item - tidsets = [BitSet(findall(txns.matrix[:,col])) for col in 1:n_items] - supports = vec(sum(txns.matrix, dims=1)) - - # Sort items by support in descending order, keeping only frequent items - sorted_items = sort(findall(s -> s >= min_support, supports), by=i -> supports[i], rev=true) - + matrix, sorted_items = prune_matrix(txns.matrix, min_support) + # Dictionary to store closed itemsets and their supports - Results = Dict{Vector{Int}, Int}() - + results = Dict{Vector{Int}, Int}() ThreadLock = ReentrantLock() - - function lcm!(closed_itemsets::Dict{Vector{Int}, Int}, current::Vector{Int}, tidset::BitSet, dict_lock::ReentrantLock) - closure = findall(i -> length(intersect(tidset, tidsets[i])) == length(tidset), 1:n_items) - support = length(tidset) + + function lcm!(closed_itemsets::Dict{Vector{Int}, Int}, current::Vector{Int}, rows::BitVector, dict_lock::ReentrantLock) + # Get closure of current itemset + closed = sorted_items[closure(matrix, current)] # Map back to original indices + support = count(rows) lock(dict_lock) do # If we've seen this closure with equal or higher support, skip it - (haskey(closed_itemsets, closure) && closed_itemsets[closure] >= support) && 
return - + (haskey(closed_itemsets, closed) && closed_itemsets[closed] >= support) && return + # Add Closure to Dict - if !isempty(closure) - closed_itemsets[closure] = support + if !isempty(closed) + closed_itemsets[closed] = support end end + # Get current item's position in sorted_items for comparison + curr_pos = isempty(current) ? 0 : findfirst(==(current[end]), 1:size(matrix, 2)) + # Try extending the itemset with each frequent item - for item in sorted_items - + for new_pos in eachindex(sorted_items) + orig_item = sorted_items[new_pos] + # Skip if the item is already in the closure - item ∈ closure && continue - + orig_item ∈ closed && continue + # Skip if the item comes before the last item in the current itemset - item <= (isempty(current) ? 0 : current[end]) && continue + new_pos <= curr_pos && continue - # Compute the new tidset for the extended itemset - new_tidset = intersect(tidset, tidsets[item]) - - # Skip if the new tidset doesn't meet minimum support - length(new_tidset) < min_support && continue + # Compute the new rows that contain both the current itemset and the new item + new_rows = rows .& matrix[:, new_pos] - # Recurse with new tidset and itemset - lcm!(closed_itemsets, vcat(current, item), new_tidset, dict_lock) + # Skip if the new rows don't meet minimum support + count(new_rows) < min_support && continue + + # Recurse with new rows and itemset + lcm!(closed_itemsets, vcat(current, new_pos), new_rows, dict_lock) end end - + # Start the LCM process with size-1 itemsets @sync begin - for item in sorted_items - Threads.@spawn lcm!(Results, [item], tidsets[item], ThreadLock) + for pos in 1:length(sorted_items) + Threads.@spawn lcm!(results, [pos], matrix[:, pos], ThreadLock) end end - - # Convert results to a DataFrame - result = DataFrame( - Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(Results)], - Support = [support / n_transactions for support in values(Results)], - N = collect(values(Results)), - Length = 
[length(itemset) for itemset in keys(Results)] - ) - - # Sort results by support in descending order - sort!(result, :N, rev=true) - - return result -end + + return make_itemset_df(results, txns) +end \ No newline at end of file diff --git a/src/itemsets/frequent/eclat.jl b/src/itemsets/frequent/eclat.jl index 9a4bc53..95460c8 100644 --- a/src/itemsets/frequent/eclat.jl +++ b/src/itemsets/frequent/eclat.jl @@ -62,40 +62,30 @@ result = eclat(txns, 5_000) # References Zaki, Mohammed. “Scalable Algorithms for Association Mining.” Knowledge and Data Engineering, IEEE Transactions On 12 (June 1, 2000): 372–90. https://doi.org/10.1109/69.846291. """ -function eclat(txns::Transactions, min_support::Union{Int,Float64})::DataFrame +function eclat(txns::Transactions, min_support::Union{Int,Float64})#::DataFrame n_transactions = size(txns.matrix, 1) # Handle min_support as a float value min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support - # Calculate initial supports and sort the columns - item_index = collect(1:size(txns.matrix, 2)) - item_supports = Dict(zip(item_index, vec(sum(txns.matrix, dims=1)))) - - frequent_items = [item for item in item_index if item_supports[item] >= min_support] - sorted_items = sort(frequent_items, by= x -> item_supports[x]) + matrix, sorted_items = prune_matrix(txns.matrix, min_support) # Initialize results dictionary and threading lock - Results = Dict{Vector{Int}, Int}() - ThreadLock = ReentrantLock() - - # Add single-item frequent itemsets to results - for item in sorted_items - Results[[item]] = item_supports[item] - end + results = Dict(zip([[i] for i in sorted_items], vec(sum(matrix,dims=1)))) + thread_lock = ReentrantLock() # Define recursive eclat function and run it on the data - function eclat!(lineage::Vector{Int}, items::Vector{Int}, trans::Transactions, min_support::Int) + function eclat!(results::Dict{Vector{Int}, Int}, lineage::Vector{Int}, items::Vector{Int}, matrix::BitMatrix, 
min_support::Int) for (i, item) in enumerate(items) new_lineage = vcat(lineage, item) - support = sum(all(trans.matrix[:, new_lineage], dims=2)) + support = sum(all(view(matrix, :, new_lineage), dims=2)) # Skip this itemset if it does not meet minimum suppot support < min_support && continue # Add the Itemset to results - lock(ThreadLock) do - Results[new_lineage] = support + lock(thread_lock) do + results[sorted_items[new_lineage]] = support end # Generate new possible items @@ -105,25 +95,15 @@ function eclat(txns::Transactions, min_support::Union{Int,Float64})::DataFrame isempty(new_items) && continue # Recurse with new items - eclat!(new_lineage, new_items, trans, min_support) + eclat!(results, new_lineage, new_items, matrix, min_support) end end @sync begin - for (i, item) in enumerate(sorted_items) - Threads.@spawn eclat!([item], sorted_items[i+1:end], txns, min_support) + for item in eachindex(sorted_items) + Threads.@spawn eclat!(results, [item], collect(item+1:length(sorted_items)), matrix, min_support) end end - # Create the result DataFrame - result_df = DataFrame( - Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(Results)], - Support = [support / n_transactions for support in values(Results)], - N = collect(values(Results)), - Length = [length(itemset) for itemset in keys(Results)] - ) - - # Sort results by support in descending order - sort!(result_df, :N, rev=true) - return result_df + return make_itemset_df(results,txns) end \ No newline at end of file diff --git a/src/itemsets/frequent/fpgrowth.jl b/src/itemsets/frequent/fpgrowth.jl index 47f8c2e..c229e8b 100644 --- a/src/itemsets/frequent/fpgrowth.jl +++ b/src/itemsets/frequent/fpgrowth.jl @@ -108,15 +108,5 @@ function fpgrowth(data::Union{Transactions,FPTree}, min_support::Union{Int,Float # Mine frequent sets fpgrowth!(Results,tree, Int[], min_support) - # Create the result DataFrame - result_df = DataFrame( - Itemset = [data.colkeys[itemset] for itemset in keys(Results)], - 
Support = [support / n_transactions for support in values(Results)], - N = collect(values(Results)), - Length = [length(itemset) for itemset in keys(Results)] - ) - - # Sort results by support in descending order - sort!(result_df, :N, rev=true) - return result_df + return RuleMiner.make_itemset_df(Results, data) end \ No newline at end of file diff --git a/src/itemsets/itemset_utils.jl b/src/itemsets/itemset_utils.jl new file mode 100644 index 0000000..4204fec --- /dev/null +++ b/src/itemsets/itemset_utils.jl @@ -0,0 +1,118 @@ +# itemset_utils.jl +# Utilities for mining frequent itemsets +#= +Copyright (c) 2024 Jared Schwartz + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +=# + +""" + make_itemset_df(results::Dict{Vector{Int}, Int}, txns::Union{Transactions,FPTree})::DataFrame + +Convert a dictionary of frequent itemsets and their supports into a formatted DataFrame. 
+ +# Arguments +- `results::Dict{Vector{Int}, Int}`: Dictionary mapping itemsets (as vectors of integer indices) to + their absolute support counts. Keys are vectors representing itemsets and values are + the number of transactions containing that itemset. +- `txns::Union{Transactions,FPTree}`: The Transactions object or FPTree used for mining, containing item names and + the total number of transactions. + +# Returns +A DataFrame with the following columns: +- `Itemset`: Vector{String} - The items in each frequent itemset, with integer indices converted + to their original item names +- `Support`: Float64 - The relative support of each itemset (proportion of transactions containing it) +- `N`: Int - The absolute support count (number of transactions containing the itemset) +- `Length`: Int - The number of items in each itemset + +The DataFrame is sorted by absolute support (N) in descending order. + +# Example +```julia +# Assuming we have mined results and a transactions object +results = Dict( + [1, 2] => 50, # Itemset of items 1 and 2 appears in 50 transactions + [1] => 75 # Item 1 appears in 75 transactions +) +txns = Txns(...) # Transactions object with item names "A" and "B" + +df = make_itemset_df(results, txns) + +# Returns DataFrame: +# Itemset Support N Length +# ["A"] 0.75 75 1 +# ["A", "B"] 0.50 50 2 +``` +""" +function make_itemset_df(results::Dict{Vector{Int}, Int}, txns::Union{Transactions,FPTree})::DataFrame + result_df = DataFrame( + Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(results)], + Support = [support / txns.n_transactions for support in values(results)], + N = collect(values(results)), + Length = [length(itemset) for itemset in keys(results)] + ) + sort!(result_df, :N, rev=true) + return result_df +end + + +""" + closure(matrix::BitMatrix, itemset::Vector{Int}) -> Vector{Int} + +Calculate the closure of an itemset in a binary transaction matrix. 
+ +# Arguments +- `matrix::BitMatrix`: A binary matrix where rows represent transactions and columns represent items. + True values indicate item presence in a transaction. +- `itemset::Vector{Int}`: Vector of column indices representing the itemset whose closure + should be computed. + +# Returns +- `Vector{Int}`: Column indices of the closure - all items that appear in every transaction + containing the input itemset. + +# Description +The closure operation finds all items that are functionally implied by a given itemset +in the transaction data. It works by: +1. Finding all transactions that contain the input itemset +2. Identifying which items appear in all of these transactions + +An item is in the closure if it appears in every transaction that contains the input itemset. +The input itemset is always a subset of its closure. + +# Example +```julia +# Create a binary matrix with 3 transactions and 4 items +matrix = BitMatrix([ + 1 1 1 0; # Transaction 1 contains items 1, 2, and 3 + 1 1 1 0; # Transaction 2 contains items 1, 2, and 3 + 0 0 0 1 # Transaction 3 contains only item 4 +]) + +# Find closure of itemset [1] +closed = closure(matrix, [1]) # Returns [1, 2, 3] +# Items 2 and 3 are in the closure because they appear in +# all transactions containing item 1 +``` +""" +function closure(matrix::BitMatrix, itemset::Vector{Int}) + rows = vec(all(view(matrix,:, itemset), dims=2)) + return findall(vec(all(view(matrix, rows, :), dims=1))) +end \ No newline at end of file diff --git a/src/itemsets/maximal/fpmax.jl b/src/itemsets/maximal/fpmax.jl index f6cdef1..82ca63f 100644 --- a/src/itemsets/maximal/fpmax.jl +++ b/src/itemsets/maximal/fpmax.jl @@ -136,15 +136,5 @@ function fpmax(data::Union{Transactions,FPTree}, min_support::Union{Int,Float64} is_maximal && (Results[itemset] = support) end - # Create the result DataFrame - result_df = DataFrame( - Itemset = [data.colkeys[itemset] for itemset in keys(Results)], - Support = [support / n_transactions for support in 
values(Results)], - N = collect(values(Results)), - Length = [length(itemset) for itemset in keys(Results)] - ) - - # Sort results by length in descending order, then by support - sort!(result_df, [:Length, :Support], rev=true) - return result_df + return RuleMiner.make_itemset_df(Results, data) end \ No newline at end of file diff --git a/src/itemsets/maximal/genmax.jl b/src/itemsets/maximal/genmax.jl index c4ea47c..1d5b26b 100644 --- a/src/itemsets/maximal/genmax.jl +++ b/src/itemsets/maximal/genmax.jl @@ -63,84 +63,80 @@ result = genmax(txns, 5_000) Gouda, Karam, and Mohammed J. Zaki. “GenMax: An Efficient Algorithm for Mining Maximal Frequent Itemsets.” Data Mining and Knowledge Discovery 11, no. 3 (November 1, 2005): 223–42. https://doi.org/10.1007/s10618-005-0002-x. """ function genmax(txns::Transactions, min_support::Union{Int,Float64})::DataFrame - n_transactions, n_items = size(txns.matrix) # Handle min_support as a float value - min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support + min_support = min_support isa Float64 ? 
ceil(Int, min_support * txns.n_transactions) : min_support - # Calculate initial supports for each item - item_supports = Dict(i => sum(txns.matrix[:, i]) for i in 1:n_items) + # Get pruned matrix and sorted items + matrix, sorted_items = RuleMiner.prune_matrix(txns.matrix, min_support) - # Sort items by support in descending order and filter for frequent items - sorted_items = sort(collect(keys(item_supports)), by=i -> item_supports[i], rev=true) - frequent_items = filter(i -> item_supports[i] >= min_support, sorted_items) + # Initialize results dictionary and threading lock + results = Dict{Vector{Int}, Int}() + candidates = Dict{Vector{Int}, Int}() + thread_lock = ReentrantLock() - # Create BitSets for each frequent item's transactions - item_bitsets = [BitSet(findall(txns.matrix[:, i])) for i in frequent_items] - - # Initialize the Maximal Frequent Itemsets (MFI) list and threading lock - Results = Vector{Vector{Int}}() - ThreadLock = ReentrantLock() - - # Depth-First Search to find maximal frequent itemsets - function genmax!(itemset::Vector{Int}, start_idx::Int, tidset::BitSet) + function genmax!(itemset::Vector{Int}, start_idx::Int, rows::BitVector) local_maximal = true - for i in start_idx:length(frequent_items) - item = frequent_items[i] - new_tidset = intersect(tidset, item_bitsets[i]) + for i in start_idx:size(matrix, 2) + # Calculate new support with additional item + new_rows = rows .& matrix[:, i] + new_support = count(new_rows) # Skip if the new itemset is not frequent - length(new_tidset) < min_support && continue + new_support < min_support && continue local_maximal = false - new_itemset = push!(copy(itemset), item) - genmax!(new_itemset, i + 1, new_tidset) + new_itemset = push!(copy(itemset), i) + genmax!(new_itemset, i + 1, new_rows) end # If itemset is empty or not locally maximal, return (isempty(itemset) || !local_maximal) && return - lock(ThreadLock) do - push!(Results, itemset) + # Map positions back to original item indices + orig_itemset = 
sorted_items[itemset] + support = count(rows) + + lock(thread_lock) do + candidates[orig_itemset] = support end end # Start the depth-first search in parallel @sync begin - for (i, item) in enumerate(frequent_items) - Threads.@spawn genmax!([item], i + 1, item_bitsets[i]) + for i in 1:length(sorted_items) + Threads.@spawn begin + initial_rows = matrix[:, i] + initial_support = count(initial_rows) + + # Only process if it meets minimum support + if initial_support >= min_support + genmax!([i], i + 1, initial_rows) + end + end end end - # Filter candidates to get final maximal sets - sort!(Results, by=length, rev=true) - maximal = trues(length(Results)) - - for i in 1:length(Results) - #Skip if the item has already been marked as non-maximal - !maximal[i] && continue + # Filter candidates to get maximal itemsets + for (itemset, support) in candidates + is_maximal = true + itemset_set = Set(itemset) - for j in 1:length(Results) - # Skip if item is being compared to its self or if [j] has been marked as non-maximal - (i == j || !maximal[j]) && continue + for (other_itemset, other_support) in candidates + itemset === other_itemset && continue - # Check if Results[j] is a subset of Results[i] and mark it not maximal if it is - Results[j] ⊊ Results[i] && (maximal[j] = false) + if issubset(itemset_set, Set(other_itemset)) + is_maximal = false + break + end + end + + # Add to results if maximal + if is_maximal + results[itemset] = support end end - result = Results[maximal] - - # Create output DataFrame - df = DataFrame( - Itemset = [RuleMiner.getnames(itemset, txns) for itemset in result], - Support = [length(intersect([item_bitsets[findfirst(==(item), frequent_items)] for item in itemset]...)) / n_transactions for itemset in result], - N = [length(intersect([item_bitsets[findfirst(==(item), frequent_items)] for item in itemset]...)) for itemset in result], - Length = [length(itemset) for itemset in result] - ) - - # Sort by length (descending) and then by support 
(descending) - sort!(df, [:Length, :Support], rev=true) - return df + return RuleMiner.make_itemset_df(results, txns) end \ No newline at end of file