Merge pull request #49 from JaredSchwartz/memory-optimization
Optimize memory usage
JaredSchwartz authored Oct 27, 2024
2 parents d44b260 + 4dfa8a5 commit b8d10b1
Showing 12 changed files with 325 additions and 249 deletions.
1 change: 1 addition & 0 deletions src/RuleMiner.jl
@@ -62,6 +62,7 @@ export apriori
include("itemsets/frequent/eclat.jl")
include("itemsets/frequent/fpgrowth.jl")
include("itemsets/frequent/recovery.jl")
include("itemsets/itemset_utils.jl")

export eclat
export fpgrowth
3 changes: 1 addition & 2 deletions src/association_rules/apriori.jl
@@ -92,8 +92,7 @@ function apriori(txns::Transactions, min_support::Union{Int,Float64}, min_confid
basenum = vec(count(txns.matrix, dims=1))
min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support

-items = findall(basenum .>= min_support)
-subtxns = BitMatrix(txns.matrix[:, items])
+subtxns, items = RuleMiner.prune_matrix(txns.matrix, min_support)
rules = Vector{Arule}()

initials = Vector{Arule}()
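The two removed lines selected frequent columns and densified them inline; `prune_matrix` centralizes that work and also sorts the surviving columns by ascending support and drops transactions left with no frequent items. A rough sketch of the relationship, on a made-up toy matrix (the data and variable names here are illustrative, not from the PR):

```julia
using SparseArrays

X = sparse(Bool[1 1 0; 1 0 1; 0 1 1; 1 0 0])  # 4 transactions × 3 items
min_support = 2

# Old path: filter columns by support, then densify
items_old = findall(vec(count(X, dims=1)) .>= min_support)
subtxns_old = BitMatrix(X[:, items_old])

# New path, assuming the prune_matrix helper added below in txnutils.jl:
# subtxns, items = RuleMiner.prune_matrix(X, min_support)
# same columns, but `items` comes back sorted by ascending support,
# and all-zero rows are removed from `subtxns`
```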
40 changes: 40 additions & 0 deletions src/data_structures/txnutils.jl
@@ -299,4 +299,44 @@ function txns_to_df(txns::SeqTxns, index::Bool = true)::DataFrame
end

return df
end

"""
prune_matrix(matrix::SparseMatrixCSC, min_support::Int) -> Tuple{BitMatrix, Vector{Int}}
Filter and sort sparse matrix columns based on minimum support threshold.
# Arguments
- `matrix::SparseMatrixCSC`: A sparse boolean matrix where rows represent transactions and columns
represent items. A true value at position (i,j) indicates item j is present in transaction i.
- `min_support::Int`: The minimum absolute support threshold. Columns with fewer than this number
of true values will be removed.
# Returns
A tuple containing:
- `BitMatrix`: A pruned view of the matrix containing only frequent columns, converted to a BitMatrix
- `Vector{Int}`: A vector of column indices corresponding to the frequent columns, sorted by their sums
# Description
This helper function performs two key preprocessing steps for frequent pattern mining:
1. Removes infrequent columns (pruning): Filters out columns whose sum is less than the minimum
support threshold
2. Sorts columns by frequency: Reorders the remaining columns based on their sums in ascending order
The pruned matrix is returned as a BitMatrix for efficient boolean operations in pattern mining algorithms.
# Example
```julia
txns = Txns(sparse([1 1 0; 1 0 1; 0 1 1]), ["A", "B", "C"], ["I1", "I2", "I3"])
matrix, indices = prune_matrix(txns, 2)
```
"""
function prune_matrix(matrix::SparseMatrixCSC, min_support::Int)
supports = sum(matrix, dims=1)
sorted_items = [i for i in axes(matrix,2) if supports[1,i] >= min_support]
sort!(sorted_items, by= x -> supports[1,x])

matrix = view(matrix,:, sorted_items) |> BitMatrix

return matrix[vec(any(matrix, dims=2)), :], sorted_items
end
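A worked example of the pruning behavior, using toy data of my own rather than anything from the PR: with an absolute threshold of 3, only the third column survives, and the transaction containing no frequent item is dropped with it.

```julia
using SparseArrays

mat = sparse(Bool[1 1 0; 1 0 1; 0 1 1; 0 0 1])  # column sums: 2, 2, 3

pruned, items = prune_matrix(mat, 3)
# items  == [3]            only item 3 appears in at least 3 transactions
# pruned == trues(3, 1)    transaction 1 held no frequent item, so its row is gone
```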
65 changes: 24 additions & 41 deletions src/itemsets/closed/carpenter.jl
@@ -66,14 +66,8 @@ function carpenter(txns::Transactions, min_support::Union{Int,Float64})
# Handle min_support as a float value
min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support

-# Create tidsets (transaction ID sets) for each item
-tidsets = [BitSet(findall(txns.matrix[:,col])) for col in 1:n_items]
-supports = vec(sum(txns.matrix, dims=1))
-
-# Create vectors of all items and all frequent items for mining
-allitems = collect(1:n_items)
-frequent_items = findall(supports .>= min_support)
-
+matrix, sorted_items = RuleMiner.prune_matrix(txns.matrix, min_support)
+
# Initialize results dictionary and threading lock
Results = Dict{Vector{Int}, Int}()
ThreadLock = ReentrantLock()
@@ -82,50 +76,39 @@
# Pruning 3: Early return if itemset is already present in the output
haskey(closed_itemsets, X) && return

-# Find transactions with the itemset and calculate support
-tidset_X = length(X) == 1 ? tidsets[X[1]] : intersect(tidsets[X]...)
-support_X = length(tidset_X)
-
-# Pruning 1: Early return if the itemset is not frequent
-support_X < min_support && return
-
-# Pruning 2: Find items that can be added without changing support
-Y = filter(i -> length(intersect(tidset_X, tidsets[i])) == support_X, R)
-
-# Add X to itemsets if it's closed (Y is empty)
-if isempty(Y)
-lock(Lock) do
-closed_itemsets[X] = support_X
-end
-# If Y is not empty, add the itemset's closure (X ∪ Y)
-else
-lock(Lock) do
-closed_itemsets[sort(vcat(X, Y))] = support_X
-end
-end
+# Get closure of current itemset and map back to original indices
+X_pos = Vector{Int}(findall(in(X), sorted_items))
+closed_pos = RuleMiner.closure(matrix, X_pos)
+closed = sorted_items[closed_pos]
+
+# Calculate support
+rows = vec(all(view(matrix, :, X_pos), dims=2))
+support = count(rows)
+
+# Pruning 1: Early return if not frequent
+support < min_support && return
+
+# Pruning 2: Add closure to results if not empty
+if !isempty(closed)
+lock(Lock) do
+closed_itemsets[closed] = support
+end
+end

# Recursive enumeration
-for i in setdiff(R, Y)
-carpenter!(closed_itemsets, sort(vcat(X, i)), setdiff(R, [i]), Lock)
+remaining = filter(i -> i ∉ closed, R)
+for i in remaining
+carpenter!(closed_itemsets, sort(vcat(X, i)), filter(>(i), remaining), Lock)
end
end

# Parallel Processing of initial itemsets
@sync begin
-for item in frequent_items
-Threads.@spawn carpenter!(Results, [item], setdiff(allitems, [item]), ThreadLock)
+for item in sorted_items
+remaining_items = filter(x -> x > item, sorted_items)
+Threads.@spawn carpenter!(Results, [item], remaining_items, ThreadLock)
end
end

-# Create the result DataFrame
-result_df = DataFrame(
-Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(Results)],
-Support = [support / n_transactions for support in values(Results)],
-N = collect(values(Results)),
-Length = [length(itemset) for itemset in keys(Results)]
-)
-
-# Sort results by support in descending order
-sort!(result_df, :N, rev=true)
-return result_df
+return RuleMiner.make_itemset_df(Results, txns)
end
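The rewritten carpenter! leans on `RuleMiner.closure`, which lives in the `itemsets/itemset_utils.jl` file this PR adds but which is not expanded in this view. Assuming it implements the usual closure operator on a binary transaction matrix, a minimal sketch could look like this (an assumption for orientation, not the PR's actual definition):

```julia
# Closure of an itemset: every item present in all transactions that
# contain the itemset (positions refer to columns of the pruned matrix)
function closure(matrix::BitMatrix, itemset::Vector{Int})
    rows = vec(all(view(matrix, :, itemset), dims=2))        # transactions containing the itemset
    return findall(vec(all(view(matrix, rows, :), dims=1)))  # items common to all of them
end
```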
90 changes: 45 additions & 45 deletions src/itemsets/closed/charm.jl
@@ -58,70 +58,72 @@ Zaki, Mohammed, and Ching-Jui Hsiao. “CHARM: An Efficient Algorithm for Closed
"""
function charm(txns::Transactions, min_support::Union{Int,Float64})::DataFrame
n_transactions, n_items = size(txns.matrix)

# Handle min_support as a float value
min_support = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support

-# Create tidsets (transaction ID sets) for each item
-tidsets = [BitSet(findall(txns.matrix[:,col])) for col in 1:n_items]
-supports = vec(sum(txns.matrix, dims=1))
-
-# Sort items by support in ascending order, keeping only frequent items
-item_order = sort(findall(s -> s >= min_support, supports), by=i -> supports[i])
+# Get pruned matrix and sorted items
+matrix, sorted_items = RuleMiner.prune_matrix(txns.matrix, min_support)

# Initialize results dictionary and threading lock
Results = Dict{Vector{Int}, Int}()
ThreadLock = ReentrantLock()

-function charm!(closed_itemsets::Dict{Vector{Int}, Int}, prefix::Vector{Int}, eq_class::Vector{Int})
-for (i, item) in enumerate(eq_class)
+function charm!(closed_itemsets::Dict{Vector{Int}, Int}, prefix::Vector{Int}, eq_class::Vector{Int}, rows::BitVector)
+for (i, pos) in enumerate(eq_class)
# Create new itemset by adding current item to prefix
-new_itemset = vcat(prefix, item)
-new_tidset = intersect(tidsets[new_itemset]...)
-support = length(new_tidset)
+new_itemset = vcat(prefix, pos)
+new_rows = rows .& matrix[:, pos]
+support = count(new_rows)

# Skip infrequent itemsets
support < min_support && continue

# Initialize new equivalence class
new_eq_class = Int[]

# Process remaining items in current equivalence class
for j in (i+1):length(eq_class)

-# Generate itemset, tidset, and support for new items in the next eq class
-other_item = eq_class[j]
-other_tidset = intersect(new_tidset, tidsets[other_item])
-other_support = length(other_tidset)
+other_pos = eq_class[j]
+
+# Calculate intersection with the other item
+other_rows = new_rows .& matrix[:, other_pos]
+other_support = count(other_rows)

# Skip infrequent items
other_support < min_support && continue

if support == other_support
# If supports are equal, add item to current itemset
-push!(new_itemset, other_item)
+push!(new_itemset, other_pos)
else
-# Otherwise, add to new equivalence class for further processing
-push!(new_eq_class, other_item)
+# Otherwise, add to new equivalence class
+push!(new_eq_class, other_pos)
end
end

-# Update closed itemsets list, ensuring thread safety
+# Map positions back to original item indices
+orig_itemset = sorted_items[new_itemset]
+
+# Update closed itemsets list with thread safety
lock(ThreadLock) do
-update_closed_itemsets!(closed_itemsets, new_itemset, support)
+update_closed_itemsets!(closed_itemsets, orig_itemset, support)
end

# Recursively process new equivalence class if non-empty
-!isempty(new_eq_class) && charm!(closed_itemsets, new_itemset, new_eq_class)
+!isempty(new_eq_class) && charm!(closed_itemsets, new_itemset, new_eq_class, new_rows)
end
end

# Helper function to update closed itemsets
-function update_closed_itemsets!(closed_itemsets, new_itemset, support)
+function update_closed_itemsets!(closed_itemsets::Dict{Vector{Int}, Int}, new_itemset::Vector{Int}, support::Int)
new_set = Set(new_itemset)

# Check against existing closed itemsets
for (existing_itemset, existing_support) in closed_itemsets

# Only compare itemsets with equal support
support != existing_support && continue

existing_set = Set(existing_itemset)

# If new itemset is a subset of an existing one, it's not closed
@@ -137,27 +139,25 @@ function charm(txns::Transactions, min_support::Union{Int,Float64})::DataFrame
closed_itemsets[new_itemset] = support
end

-# Add single-item frequent itemsets
-for item in item_order
-Results[[item]] = supports[item]
+# Process single items and add to results
+for (pos, item) in enumerate(sorted_items)
+Results[[item]] = count(matrix[:, pos])
end

# Parallel processing of top-level equivalence classes
@sync begin
-for (i, item) in enumerate(item_order)
-Threads.@spawn charm!(Results, [item], item_order[i+1:end])
+for (i, pos) in enumerate(1:length(sorted_items))
+Threads.@spawn begin
+# Get initial rows for this item
+initial_rows = matrix[:, pos]
+
+# Only process if it meets minimum support
+if count(initial_rows) >= min_support
+charm!(Results, [pos], collect((i+1):length(sorted_items)), initial_rows)
+end
+end
end
end

-# Create the result DataFrame
-result_df = DataFrame(
-Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(Results)],
-Support = [support / n_transactions for support in values(Results)],
-N = collect(values(Results)),
-Length = [length(itemset) for itemset in keys(Results)]
-)
-
-# Sort results by support in descending order
-sort!(result_df, :N, rev=true)
-return result_df
+return RuleMiner.make_itemset_df(Results, txns)
end
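The common thread in the carpenter and charm rewrites is the memory optimization the PR title names: per-item `BitSet` tidsets, which cost an allocation per item plus a fresh set for every `intersect`, give way to columns of one shared `BitMatrix` combined with word-level boolean operations. A side-by-side sketch of the two support computations (illustrative signatures, not code from the PR):

```julia
# Old style: materialize a tidset per item, allocate a new set per intersection
support_old(tidsets::Vector{BitSet}, items::Vector{Int}) =
    length(intersect(tidsets[items]...))

# New style: AND together BitMatrix columns; one BitVector, no per-item sets
support_new(matrix::BitMatrix, items::Vector{Int}) =
    count(vec(all(view(matrix, :, items), dims=2)))
```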
11 changes: 1 addition & 10 deletions src/itemsets/closed/fpclose.jl
@@ -126,14 +126,5 @@ function fpclose(data::Union{Transactions,FPTree}, min_support::Union{Int,Float6
# Start the mining process
fpclose!(Results, tree, Int[], min_support)

-df = DataFrame(
-Itemset = [data.colkeys[itemset] for itemset in keys(Results)],
-Support = [support / n_transactions for support in values(Results)],
-N = collect(values(Results)),
-Length = [length(itemset) for itemset in keys(Results)]
-)
-
-sort!(df, [:Length, :N], rev=[false, true])
-
-return df
+return RuleMiner.make_itemset_df(Results, data)
end
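`make_itemset_df` is another helper introduced by this PR (presumably also in `itemsets/itemset_utils.jl`; its definition is not expanded here). Judging from the three DataFrame blocks it replaces, it plausibly looks like the sketch below; note that the removed fpclose version sorted by `[:Length, :N]` while carpenter and charm sorted by `:N` alone, so the shared helper must settle on one convention. This is a reconstruction, not the committed code:

```julia
using DataFrames

# Sketch reconstructed from the removed result-building code above
function make_itemset_df(results::Dict{Vector{Int},Int}, txns)
    n_transactions = size(txns.matrix, 1)
    df = DataFrame(
        Itemset = [RuleMiner.getnames(itemset, txns) for itemset in keys(results)],
        Support = [support / n_transactions for support in values(results)],
        N = collect(values(results)),
        Length = [length(itemset) for itemset in keys(results)]
    )
    sort!(df, :N, rev=true)  # descending support, as in carpenter/charm
    return df
end
```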