diff --git a/docs/make.jl b/docs/make.jl index a0464b4..86d2960 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -24,7 +24,8 @@ makedocs( "Levelwise" => "algorithms/levelwise.md" ], "Maximal Itemset Mining" => Any[ - "FPMax" => "algorithms/fpmax.md" + "FPMax" => "algorithms/fpmax.md", + "GenMax" => "algorithms/genmax.md" ], ], warnonly = [:missing_docs, :cross_references], diff --git a/docs/src/algorithms/genmax.md b/docs/src/algorithms/genmax.md new file mode 100644 index 0000000..061bb63 --- /dev/null +++ b/docs/src/algorithms/genmax.md @@ -0,0 +1,7 @@ +# GenMax + +The `genmax` function implements the GenMax algorithm for mining maximal frequent itemsets. This algorithm, proposed by Karam Gouda and Mohammed J. Zaki in 2005, utilizes a technique called progressive focusing to reduce the search space for maximal itemset mining. + +```@docs +genmax(txns::Transactions, min_support::Union{Int,Float64}) +``` \ No newline at end of file diff --git a/src/RuleMiner.jl b/src/RuleMiner.jl index 97a10ac..da21577 100644 --- a/src/RuleMiner.jl +++ b/src/RuleMiner.jl @@ -35,4 +35,6 @@ include("charm.jl") include("carpenter.jl") include("lcm.jl") include("levelwise.jl") +# Maximal Itemset Mining +include("genmax.jl") end \ No newline at end of file diff --git a/src/genmax.jl b/src/genmax.jl new file mode 100644 index 0000000..b04226e --- /dev/null +++ b/src/genmax.jl @@ -0,0 +1,141 @@ +# genmax.jl +# GenMax maximal itemset mining in Julia +# +# Copyright (c) 2024 Jared Schwartz +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be 
included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +export genmax + +""" + genmax(txns::Transactions, min_support::Union{Int,Float64})::DataFrame + +Implements the GenMax algorithm to find maximal frequent itemsets in a transactional dataset. + +# Arguments +- `txns::Transactions`: A `Transactions` object containing the dataset to mine. +- `min_support::Union{Int,Float64}`: The minimum support threshold. If an `Int`, it represents + the absolute support. If a `Float64`, it represents relative support, which is rounded up to an absolute count. + +# Returns +- `DataFrame`: A DataFrame containing the maximal frequent itemsets, with columns: + - `Itemset`: The items in the maximal frequent itemset. + - `Support`: The support of the itemset as a proportion of total transactions. + - `N`: The absolute support count of the itemset. + - `Length`: The number of items in the itemset. + +# Description +The GenMax algorithm finds maximal frequent itemsets, which are frequent itemsets that are not +proper subsets of any other frequent itemset. It uses a depth-first search strategy with +pruning techniques like progressive focusing to discover these itemsets. + +The algorithm proceeds in two main phases: +1. Candidate Generation: Uses a depth-first search to generate candidate maximal frequent itemsets. +2. Maximality Checking: Ensures that only truly maximal itemsets are retained in the final output. 
+ +# Example +```julia +txns = load_transactions("transactions.txt", ' ') + +# Find maximal frequent itemsets with 5% minimum support +result = genmax(txns, 0.05) +``` +""" +function genmax(txns::Transactions, min_support::Union{Int,Float64})::DataFrame + n_transactions, n_items = size(txns.matrix) + + # Resolve the threshold to an absolute count in a fresh binding; reassigning the captured argument would box it in the closures below + min_count = min_support isa Float64 ? ceil(Int, min_support * n_transactions) : min_support + + # Calculate initial supports for each item + item_supports = Dict(i => sum(txns.matrix[:, i]) for i in 1:n_items) + + # Sort items by support in descending order and filter for frequent items + sorted_items = sort(collect(keys(item_supports)), by=i -> item_supports[i], rev=true) + frequent_items = filter(i -> item_supports[i] >= min_count, sorted_items) + + # Create BitSets for each frequent item's transactions + item_bitsets = [BitSet(findall(txns.matrix[:, i])) for i in frequent_items] + + # Initialize the Maximal Frequent Itemsets (MFI) list and threading lock + Results = Vector{Vector{Int}}() + ThreadLock = ReentrantLock() + + # Depth-First Search to find maximal frequent itemsets + function genmax!(itemset::Vector{Int}, start_idx::Int, tidset::BitSet) + local_maximal = true + + for i in start_idx:length(frequent_items) + item = frequent_items[i] + new_tidset = intersect(tidset, item_bitsets[i]) + + # Skip if the new itemset is not frequent + length(new_tidset) < min_count && continue + + local_maximal = false + new_itemset = push!(copy(itemset), item) + genmax!(new_itemset, i + 1, new_tidset) + end + + # If itemset is empty or not locally maximal, return + (isempty(itemset) || !local_maximal) && return + + lock(ThreadLock) do + push!(Results, itemset) + end + end + + # Start the depth-first search in parallel + @sync begin + for (i, item) in enumerate(frequent_items) + Threads.@spawn genmax!([item], i + 1, item_bitsets[i]) + end + end + + # Filter candidates to get final maximal sets + sort!(Results, by=length, rev=true) + 
maximal = trues(length(Results)) + + for i in eachindex(Results) + # Skip itemsets already marked as non-maximal; their subsets are covered by the superset that marked them + !maximal[i] && continue + + for j in (i + 1):lastindex(Results) + # Proper subsets are strictly shorter, so in the length-sorted list they can only follow i + maximal[j] || continue + + # Check if Results[j] is a proper subset of Results[i] and mark it not maximal if it is + Results[j] ⊊ Results[i] && (maximal[j] = false) + end + end + + result = Results[maximal] + # Compute each itemset's support count once; the Support and N columns are both derived from it + item_index = Dict(item => idx for (idx, item) in enumerate(frequent_items)) + counts = Int[length(mapreduce(item -> item_bitsets[item_index[item]], intersect, itemset)) for itemset in result] + df = DataFrame( + Itemset = [getnames(itemset, txns) for itemset in result], + Support = counts ./ n_transactions, + N = counts, + Length = length.(result) + ) + # Sort by length (descending) and then by support (descending) + sort!(df, [:Length, :Support], rev=true) + return df +end \ No newline at end of file diff --git a/src/transactions.jl b/src/transactions.jl index 4b8a7ce..ab5e651 100644 --- a/src/transactions.jl +++ b/src/transactions.jl @@ -225,7 +225,7 @@ function load_transactions(file::String, delimiter::Char; id_col::Bool = false, end """ - convert_csc!(column_values::Vector{Int}, row_values::Vector{Int}, n_cols::Int) -> Tuple{Vector{Int}, Vector{Int}} + convert_csc!(column_values::Vector{Int}, row_values::Vector{Int}, n_cols::Int)::Tuple{Vector{Int}, Vector{Int}} Convert COO (Coordinate) format sparse matrix data to CSC (Compressed Sparse Column) format. 
diff --git a/test/runtests.jl b/test/runtests.jl index 0b1e795..a9d9e1f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -304,4 +304,24 @@ end @test remainder.Itemset == [["bacon"], ["hamburger"], ["ketchup"], ["sugar"], ["bacon", "eggs"], ["beer", "hamburger"], ["beer", "milk"], ["bread", "ham"], ["cheese", "ham"], ["eggs", "sugar"], ["milk", "sugar"], ["eggs", "milk", "sugar"]] @test remainder.N == [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] @test remainder.Length == [1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3] +end + +@testset "genmax.jl" begin + @testset "percentage support" begin + sets = genmax(data, max_perc_sup) + setsorter!(sets) + @test sets.Itemset == max_items + @test sets.Support ≈ max_supports + @test sets.N == max_N + @test sets.Length == max_length + end + + @testset "absolute support" begin + sets = genmax(data, max_abs_sup) + setsorter!(sets) + @test sets.Itemset == max_items + @test sets.Support ≈ max_supports + @test sets.N == max_N + @test sets.Length == max_length + end end \ No newline at end of file