Merge pull request #49 from JaredSchwartz/memory-optimization
Optimize memory usage
JaredSchwartz authored Oct 27, 2024
2 parents d44b260 + 4dfa8a5 commit b8d10b1
Showing 12 changed files with 325 additions and 249 deletions.
1 change: 1 addition & 0 deletions src/RuleMiner.jl
@@ -62,6 +62,7 @@ export apriori
include("itemsets/frequent/eclat.jl")
include("itemsets/frequent/fpgrowth.jl")
include("itemsets/frequent/recovery.jl")
include("itemsets/itemset_utils.jl")

export eclat
export fpgrowth
3 changes: 1 addition & 2 deletions src/association_rules/apriori.jl
@@ -92,8 +92,7 @@ function apriori(txns::Transactions, min_support::Union{Int,Float64}, min_confid
basenum = vec(count(txns.matrix, dims=1))
min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support

-items = findall(basenum .>= min_support)
-subtxns = BitMatrix(txns.matrix[:, items])
+subtxns, items = RuleMiner.prune_matrix(txns.matrix, min_support)
rules = Vector{Arule}()

initials = Vector{Arule}()
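The two removed lines selected frequent columns and densified them inline; `prune_matrix` centralizes that work and also sorts the surviving columns by ascending support and drops transactions left with no frequent items. A rough sketch of the relationship, on a made-up toy matrix (the data and variable names here are illustrative, not from the PR):

```julia
using SparseArrays

X = sparse(Bool[1 1 0; 1 0 1; 0 1 1; 1 0 0])  # 4 transactions × 3 items
min_support = 2

# Old path: filter columns by support, then densify
items_old = findall(vec(count(X, dims=1)) .>= min_support)
subtxns_old = BitMatrix(X[:, items_old])

# New path, assuming the prune_matrix helper added below in txnutils.jl:
# subtxns, items = RuleMiner.prune_matrix(X, min_support)
# same columns, but `items` comes back sorted by ascending support,
# and all-zero rows are removed from `subtxns`
```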
40 changes: 40 additions & 0 deletions src/data_structures/txnutils.jl
@@ -299,4 +299,44 @@ function txns_to_df(txns::SeqTxns, index::Bool = true)::DataFrame
end

return df
end

"""
prune_matrix(matrix::SparseMatrixCSC, min_support::Int) -> Tuple{BitMatrix, Vector{Int}}
Filter and sort sparse matrix columns based on minimum support threshold.
# Arguments
- `matrix::SparseMatrixCSC`: A sparse boolean matrix where rows represent transactions and columns
represent items. A true value at position (i,j) indicates item j is present in transaction i.
- `min_support::Int`: The minimum absolute support threshold. Columns with fewer than this number
of true values will be removed.
# Returns
A tuple containing:
- `BitMatrix`: A pruned view of the matrix containing only frequent columns, converted to a BitMatrix
- `Vector{Int}`: A vector of column indices corresponding to the frequent columns, sorted by their sums
# Description
This helper function performs two key preprocessing steps for frequent pattern mining:
1. Removes infrequent columns (pruning): Filters out columns whose sum is less than the minimum
support threshold
2. Sorts columns by frequency: Reorders the remaining columns based on their sums in ascending order
The pruned matrix is returned as a BitMatrix for efficient boolean operations in pattern mining algorithms.
# Example
```julia
txns = Txns(sparse([1 1 0; 1 0 1; 0 1 1]), ["A", "B", "C"], ["I1", "I2", "I3"])
matrix, indices = prune_matrix(txns, 2)
```
"""
function prune_matrix(matrix::SparseMatrixCSC, min_support::Int)
supports = sum(matrix, dims=1)
sorted_items = [i for i in axes(matrix,2) if supports[1,i] >= min_support]
sort!(sorted_items, by= x -> supports[1,x])

matrix = view(matrix,:, sorted_items) |> BitMatrix

return matrix[vec(any(matrix, dims=2)), :], sorted_items
end
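A worked example of the pruning behavior, using toy data of my own rather than anything from the PR: with an absolute threshold of 3, only the third column survives, and the transaction containing no frequent item is dropped with it.

```julia
using SparseArrays

mat = sparse(Bool[1 1 0; 1 0 1; 0 1 1; 0 0 1])  # column sums: 2, 2, 3

pruned, items = prune_matrix(mat, 3)
# items  == [3]            only item 3 appears in at least 3 transactions
# pruned == trues(3, 1)    transaction 1 held no frequent item, so its row is gone
```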
65 changes: 24 additions & 41 deletions src/itemsets/closed/carpenter.jl
@@ -66,14 +66,8 @@ function carpenter(txns::Transactions, min_support::Union{Int,Float64})
# Handle min_support as a float value
min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support

-# Create tidsets (transaction ID sets) for each item
-tidsets = [BitSet(findall(txns.matrix[:,col])) for col in 1:n_items]
-supports = vec(sum(txns.matrix, dims=1))
-
-# Create vectors of all items and all frequent items for mining
-allitems = collect(1:n_items)
-frequent_items = findall(supports .>= min_support)
-
+matrix, sorted_items = RuleMiner.prune_matrix(txns.matrix, min_support)
+
# Initialize results dictionary and threading lock
Results = Dict{Vector{Int}, Int}()
ThreadLock = ReentrantLock()
@@ -82,50 +76,39 @@
# Pruning 3: Early return if itemset is already present in the output
haskey(closed_itemsets, X) && return

-# Find transactions with the itemset and calculate support
-tidset_X = length(X) == 1 ? tidsets[X[1]] : intersect(tidsets[X]...)
-support_X = length(tidset_X)
-
-# Pruning 1: Early return if the itemset is not frequent
-support_X < min_support && return
-
-# Pruning 2: Find items that can be added without changing support
-Y = filter(i -> length(intersect(tidset_X, tidsets[i])) == support_X, R)
-
-# Add X to itemsets if it's closed (Y is empty)
-if isempty(Y)
-lock(Lock) do
-closed_itemsets[X] = support_X
-end
-# If Y is not empty, add the itemset's closure (X ∪ Y)
-else
-lock(Lock) do
-closed_itemsets[sort(vcat(X, Y))] = support_X
-end
-end
+# Get closure of current itemset and map back to original indices
+X_pos = Vector{Int}(findall(in(X), sorted_items))
+closed_pos = RuleMiner.closure(matrix, X_pos)
+closed = sorted_items[closed_pos]
+
+# Calculate support
+rows = vec(all(view(matrix, :, X_pos), dims=2))
+support = count(rows)
+
+# Pruning 1: Early return if not frequent
+support < min_support && return
+
+# Pruning 2: Add closure to results if not empty
+if !isempty(closed)
+lock(Lock) do
+closed_itemsets[closed] = support
+end
+end

# Recursive enumeration
-for i in setdiff(R, Y)
-carpenter!(closed_itemsets, sort(vcat(X, i)), setdiff(R, [i]), Lock)
+remaining = filter(i -> i ∉ closed, R)
+for i in remaining
+carpenter!(closed_itemsets, sort(vcat(X, i)), filter(>(i), remaining), Lock)
end
end

# Parallel Processing of initial itemsets
@sync begin
-for item in frequent_items
-Threads.@spawn carpenter!(Results, [item], setdiff(allitems, [item]), ThreadLock)
+for item in sorted_items
+remaining_items = filter(x -> x > item, sorted_items)
+Threads.@spawn carpenter!(Results, [item], remaining_items, ThreadLock)
end
end

-# Create the result DataFrame
-result_df = DataFrame(
-Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(Results)],
-Support = [support / n_transactions for support in values(Results)],
-N = collect(values(Results)),
-Length = [length(itemset) for itemset in keys(Results)]
-)
-
-# Sort results by support in descending order
-sort!(result_df, :N, rev=true)
-return result_df
+return RuleMiner.make_itemset_df(Results, txns)
end
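The rewritten carpenter! leans on `RuleMiner.closure`, which lives in the `itemsets/itemset_utils.jl` file this PR adds but which is not expanded in this view. Assuming it implements the usual closure operator on a binary transaction matrix, a minimal sketch could look like this (an assumption for orientation, not the PR's actual definition):

```julia
# Closure of an itemset: every item present in all transactions that
# contain the itemset (positions refer to columns of the pruned matrix)
function closure(matrix::BitMatrix, itemset::Vector{Int})
    rows = vec(all(view(matrix, :, itemset), dims=2))        # transactions containing the itemset
    return findall(vec(all(view(matrix, rows, :), dims=1)))  # items common to all of them
end
```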
90 changes: 45 additions & 45 deletions src/itemsets/closed/charm.jl
@@ -58,70 +58,72 @@ Zaki, Mohammed, and Ching-Jui Hsiao. “CHARM: An Efficient Algorithm for Closed
"""
function charm(txns::Transactions, min_support::Union{Int,Float64})::DataFrame
n_transactions, n_items = size(txns.matrix)

# Handle min_support as a float value
min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support

-# Create tidsets (transaction ID sets) for each item
-tidsets = [BitSet(findall(txns.matrix[:,col])) for col in 1:n_items]
-supports = vec(sum(txns.matrix, dims=1))
-
-# Sort items by support in ascending order, keeping only frequent items
-item_order = sort(findall(s -> s >= min_support, supports), by=i -> supports[i])
+# Get pruned matrix and sorted items
+matrix, sorted_items = RuleMiner.prune_matrix(txns.matrix, min_support)

# Initialize results dictionary and threading lock
Results = Dict{Vector{Int}, Int}()
ThreadLock = ReentrantLock()

-function charm!(closed_itemsets::Dict{Vector{Int}, Int}, prefix::Vector{Int}, eq_class::Vector{Int})
-for (i, item) in enumerate(eq_class)
+function charm!(closed_itemsets::Dict{Vector{Int}, Int}, prefix::Vector{Int}, eq_class::Vector{Int}, rows::BitVector)
+for (i, pos) in enumerate(eq_class)
# Create new itemset by adding current item to prefix
-new_itemset = vcat(prefix, item)
-new_tidset = intersect(tidsets[new_itemset]...)
-support = length(new_tidset)
+new_itemset = vcat(prefix, pos)
+new_rows = rows .& matrix[:, pos]
+support = count(new_rows)

# Skip infrequent itemsets
support < min_support && continue

# Initialize new equivalence class
new_eq_class = Int[]

# Process remaining items in current equivalence class
for j in (i+1):length(eq_class)

-# Generate itemset, tidset, and support for new items in the next eq class
-other_item = eq_class[j]
-other_tidset = intersect(new_tidset, tidsets[other_item])
-other_support = length(other_tidset)
+other_pos = eq_class[j]
+
+# Calculate intersection with the other item
+other_rows = new_rows .& matrix[:, other_pos]
+other_support = count(other_rows)

# Skip infrequent items
other_support < min_support && continue

if support == other_support
# If supports are equal, add item to current itemset
-push!(new_itemset, other_item)
+push!(new_itemset, other_pos)
else
-# Otherwise, add to new equivalence class for further processing
-push!(new_eq_class, other_item)
+# Otherwise, add to new equivalence class
+push!(new_eq_class, other_pos)
end
end

-# Update closed itemsets list, ensuring thread safety
+# Map positions back to original item indices
+orig_itemset = sorted_items[new_itemset]
+
+# Update closed itemsets list with thread safety
lock(ThreadLock) do
-update_closed_itemsets!(closed_itemsets, new_itemset, support)
+update_closed_itemsets!(closed_itemsets, orig_itemset, support)
end

# Recursively process new equivalence class if non-empty
-!isempty(new_eq_class) && charm!(closed_itemsets, new_itemset, new_eq_class)
+!isempty(new_eq_class) && charm!(closed_itemsets, new_itemset, new_eq_class, new_rows)
end
end

# Helper function to update closed itemsets
-function update_closed_itemsets!(closed_itemsets, new_itemset, support)
+function update_closed_itemsets!(closed_itemsets::Dict{Vector{Int}, Int}, new_itemset::Vector{Int}, support::Int)
new_set = Set(new_itemset)

# Check against existing closed itemsets
for (existing_itemset, existing_support) in closed_itemsets

# Only compare itemsets with equal support
support != existing_support && continue

existing_set = Set(existing_itemset)

# If new itemset is a subset of an existing one, it's not closed
@@ -137,27 +139,25 @@ function charm(txns::Transactions, min_support::Union{Int,Float64})::DataFrame
closed_itemsets[new_itemset] = support
end

-# Add single-item frequent itemsets
-for item in item_order
-Results[[item]] = supports[item]
+# Process single items and add to results
+for (pos, item) in enumerate(sorted_items)
+Results[[item]] = count(matrix[:, pos])
end

# Parallel processing of top-level equivalence classes
@sync begin
-for (i, item) in enumerate(item_order)
-Threads.@spawn charm!(Results, [item], item_order[i+1:end])
+for (i, pos) in enumerate(1:length(sorted_items))
+Threads.@spawn begin
+# Get initial rows for this item
+initial_rows = matrix[:, pos]
+
+# Only process if it meets minimum support
+if count(initial_rows) >= min_support
+charm!(Results, [pos], collect((i+1):length(sorted_items)), initial_rows)
+end
+end
end
end

-# Create the result DataFrame
-result_df = DataFrame(
-Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(Results)],
-Support = [support / n_transactions for support in values(Results)],
-N = collect(values(Results)),
-Length = [length(itemset) for itemset in keys(Results)]
-)
-
-# Sort results by support in descending order
-sort!(result_df, :N, rev=true)
-return result_df
+return RuleMiner.make_itemset_df(Results, txns)
end
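The common thread in the carpenter and charm rewrites is the memory optimization the PR title names: per-item `BitSet` tidsets, which cost an allocation per item plus a fresh set for every `intersect`, give way to columns of one shared `BitMatrix` combined with word-level boolean operations. A side-by-side sketch of the two support computations (illustrative signatures, not code from the PR):

```julia
# Old style: materialize a tidset per item, allocate a new set per intersection
support_old(tidsets::Vector{BitSet}, items::Vector{Int}) =
    length(intersect(tidsets[items]...))

# New style: AND together BitMatrix columns; one BitVector, no per-item sets
support_new(matrix::BitMatrix, items::Vector{Int}) =
    count(vec(all(view(matrix, :, items), dims=2)))
```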
11 changes: 1 addition & 10 deletions src/itemsets/closed/fpclose.jl
@@ -126,14 +126,5 @@ function fpclose(data::Union{Transactions,FPTree}, min_support::Union{Int,Float6
# Start the mining process
fpclose!(Results, tree, Int[], min_support)

-df = DataFrame(
-Itemset = [data.colkeys[itemset] for itemset in keys(Results)],
-Support = [support / n_transactions for support in values(Results)],
-N = collect(values(Results)),
-Length = [length(itemset) for itemset in keys(Results)]
-)
-
-sort!(df, [:Length, :N], rev=[false, true])
-
-return df
+return RuleMiner.make_itemset_df(Results, data)
end
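`make_itemset_df` is another helper introduced by this PR (presumably also in `itemsets/itemset_utils.jl`; its definition is not expanded here). Judging from the three DataFrame blocks it replaces, it plausibly looks like the sketch below; note that the removed fpclose version sorted by `[:Length, :N]` while carpenter and charm sorted by `:N` alone, so the shared helper must settle on one convention. This is a reconstruction, not the committed code:

```julia
using DataFrames

# Sketch reconstructed from the removed result-building code above
function make_itemset_df(results::Dict{Vector{Int},Int}, txns)
    n_transactions = size(txns.matrix, 1)
    df = DataFrame(
        Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(results)],
        Support = [support / n_transactions for support in values(results)],
        N = collect(values(results)),
        Length = [length(itemset) for itemset in keys(results)]
    )
    sort!(df, :N, rev=true)  # descending support, as in carpenter/charm
    return df
end
```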