From dc44ebf6ca30fa9345a84eb4e4d68e7e39854976 Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Mon, 16 Oct 2023 17:18:34 -0700 Subject: [PATCH 1/6] Merge pull request #37 from mkyl:kbd-coloring-experiments-2 This PR adjusts how cycles are accounted for. Previously, the shortest path between the closing nodes is found and used to calculate the probability of closure. This adds an option to consider all simple paths between the nodes, treat their closure as independent events, then calculate the probability that at least one of them closes. --- Experiments/Scripts/cycle_prob_exps.jl | 21 +++++ Experiments/graph_results.jl | 4 +- Experiments/run_estimators.jl | 3 +- Experiments/utils.jl | 8 +- Source/ColorSummary.jl | 4 +- Source/QuasiStableCardinalityEstimator.jl | 103 +++++++++++++++++----- 6 files changed, 117 insertions(+), 26 deletions(-) create mode 100644 Experiments/Scripts/cycle_prob_exps.jl diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl new file mode 100644 index 0000000..77d352b --- /dev/null +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -0,0 +1,21 @@ + +using Profile +include("../Experiments.jl") + +datasets = [aids, yeast, dblp] + +experiment_params = Vector{ExperimentParams}() +build_params = Vector{ExperimentParams}() +for dataset in datasets + push!(build_params, ExperimentParams(dataset=dataset)) + for only_shortest_path_cycle in [false, true] + push!(experiment_params, ExperimentParams(dataset=dataset, + only_shortest_path_cycle=only_shortest_path_cycle)) + end +end + +#build_experiments(build_params) + +run_estimation_experiments(experiment_params) + +graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps") diff --git a/Experiments/graph_results.jl b/Experiments/graph_results.jl index 963570e..7db9176 100644 --- a/Experiments/graph_results.jl +++ b/Experiments/graph_results.jl @@ -1,4 +1,4 @@ -@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type +@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats #todo: query type @enum VALUE estimate_error runtime @@ -63,6 +63,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR return experiment_param.inference_max_paths elseif value_type == sampling_type return experiment_param.sampling_strategy + elseif value_type == cycle_stats + return experiment_param.only_shortest_path_cycle else # default to grouping by technique return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds) diff --git a/Experiments/run_estimators.jl b/Experiments/run_estimators.jl index 4d3209f..4466f0a 100644 --- a/Experiments/run_estimators.jl +++ b/Experiments/run_estimators.jl @@ -15,7 +15,8 @@ function run_estimation_experiments(experiment_params_list::Vector{ExperimentPar results = @timed get_cardinality_bounds(query, summary; max_partial_paths = experiment_params.inference_max_paths, use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true, - sampling_strategy=experiment_params.sampling_strategy) + sampling_strategy=experiment_params.sampling_strategy, + only_shortest_path_cycle= experiment_params.only_shortest_path_cycle) upper_bound = results.value[3] estimate = max(1, results.value[2]) lower_bound = results.value[1] diff --git a/Experiments/utils.jl b/Experiments/utils.jl index 79a50cb..8eca011 100644 --- a/Experiments/utils.jl +++ b/Experiments/utils.jl @@ -7,10 +7,14 @@ struct 
ExperimentParams dataset::DATASET summary_params::ColorSummaryParams inference_max_paths::Int + # Whether to consider just the shortest path when calculating cycle probabilities or + # all simple paths. + only_shortest_path_cycle::Bool use_partial_sums::Bool sampling_strategy::SAMPLING_STRATEGY - function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6, summary_max_paths=1000, + function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6, + only_shortest_path_cycle=false, summary_max_paths=1000, partitioner::PARTITIONER = QuasiStable, weighting=true, inference_max_paths=500, use_partial_sums=true, sampling_strategy=redistributive, label_refining_rounds = 0) return new(dataset, ColorSummaryParams(num_colors=num_colors, @@ -20,6 +24,7 @@ struct ExperimentParams weighting=weighting, label_refining_rounds=label_refining_rounds), inference_max_paths, + only_shortest_path_cycle, use_partial_sums, sampling_strategy ) @@ -30,6 +35,7 @@ function params_to_results_filename(experiment_params::ExperimentParams) name = string(experiment_params.dataset) * "_" name *= params_to_string(experiment_params.summary_params) * "_" name *= string(experiment_params.inference_max_paths) * "_" + name *= string(experiment_params.only_shortest_path_cycle) * "_" name *= string(experiment_params.use_partial_sums) * "_" name *= string(experiment_params.sampling_strategy) * ".csv" return name diff --git a/Source/ColorSummary.jl b/Source/ColorSummary.jl index 841f0d9..40a07cb 100644 --- a/Source/ColorSummary.jl +++ b/Source/ColorSummary.jl @@ -27,6 +27,7 @@ struct ColorSummary color_filters::Dict{Int, BloomFilter} # color_filters[c] = filter cycle_probabilities::Dict{CyclePathAndColors, Float64} # cycle_probabilities[[c1, c2], path] = likelihood cycle_length_probabilities::Dict{Int, Float64} #cycle_probabilities[path_length] = likelihood + max_cycle_size::Int total_edges::Int total_nodes::Int # for outdegrees, c2 is the color of the outneighbor @@ -234,7 +235,8 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu println("Finished tracking statistics") end return ColorSummary(color_label_cardinality, edge_deg, color_filters, - cycle_probabilities, cycle_length_probabilities, ne(g.graph), nv(g.graph)) + cycle_probabilities, cycle_length_probabilities, params.max_cycle_size, + ne(g.graph), nv(g.graph)) end function color_hash_to_groups(color_hash, num_colors) diff --git a/Source/QuasiStableCardinalityEstimator.jl b/Source/QuasiStableCardinalityEstimator.jl index 77fa933..5ddb37f 100644 --- a/Source/QuasiStableCardinalityEstimator.jl +++ b/Source/QuasiStableCardinalityEstimator.jl @@ -94,6 +94,58 @@ function sample_paths(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}} return path_samples end +function get_simple_paths_dfs!(visited::Set{Int}, cur::Int, finish::Int, max_length::Int, + graph::SimpleGraph, current_path::Vector{Int}, + simple_paths::Vector{Vector{Int}}) + length(current_path) > max_length && return + cur in visited && return + push!(visited, cur) + push!(current_path, cur) + if cur == finish + push!(simple_paths, deepcopy(current_path)) + delete!(visited, cur) + pop!(current_path) + return + end + + for next in all_neighbors(graph, cur) + get_simple_paths_dfs!(visited, next, finish, max_length, graph, current_path, + simple_paths) + end + if length(current_path) > 0 + pop!(current_path) + end + delete!(visited, cur) +end + +# gets all directed, simple paths from the start to finish node +function 
get_all_simple_path_bools(start::Int, finish::Int, max_length::Int, query_graph::DiGraph) + # convert the graph to be undirected + graph_copy = Graph(copy(query_graph)) + rem_edge!(graph_copy, start, finish) + visited = Set{Int}() + current_path = Vector{Int}() + simple_paths = Vector{Vector{Int}}() + get_simple_paths_dfs!(visited, start, finish, max_length, graph_copy, + current_path, simple_paths) + path_bools = Vector{BoolPath}() + for path in simple_paths + bools::Vector{Bool} = [false for _ in 1:length(path)-1] + for i in 1 : length(path)-1 + src_node = path[i] + dst_node = path[i+1] + if dst_node in outneighbors(query_graph, src_node) + bools[i] = true # out edge + else + bools[i] = false # in edge + end + end + push!(path_bools, bools) + end + return path_bools +end + + # gets the directed path from the start to finish node function get_matching_graph(start::Int, finish::Int, query::QueryGraph) # convert the graph to be undirected @@ -119,7 +171,8 @@ function get_matching_graph(start::Int, finish::Int, query::QueryGraph) end function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, - current_query_nodes::Vector{Int}, visited_query_edges::Vector{Tuple{Int,Int}}, usingStoredStats::Bool) + current_query_nodes::Vector{Int}, visited_query_edges::Vector{Tuple{Int,Int}}, usingStoredStats::Bool, + only_shortest_path_cycle::Bool) # To account for cyclic queries, we check whether there are any remaining edges that have not # been processed. If so, we set the lower bound to 0, reduce the average estimate accordingly, and leave # the upper bound unchanged. @@ -138,11 +191,14 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p new_node_idx::Int = only(indexin(edge[2], current_query_nodes)) child_label::Int = only(query.vertex_labels[edge[2]]) edge_label::Int = only(query.edge_labels[(edge[1],edge[2])]) - path_graph = get_matching_graph(edge[2], edge[1], query) - path_bools = convert_path_graph_to_bools(path_graph) + all_path_bools = Vector{BoolPath}() + if only_shortest_path_cycle + all_path_bools = [convert_path_graph_to_bools(get_matching_graph(edge[2], edge[1], query))] + else + all_path_bools = get_all_simple_path_bools(edge[2], edge[1], summary.max_cycle_size, query.graph) + end + default_colors::StartEndColorPair = (-1, -1) - default_cycle_description = CyclePathAndColors(path_bools, default_colors) - path_length = length(path_bools) edge_deg::Dict{Int, Dict{Int, DegreeStats}} = Dict() if haskey(summary.edge_deg, edge_label) && haskey(summary.edge_deg[edge_label], child_label) edge_deg = summary.edge_deg[edge_label][child_label] @@ -155,27 +211,29 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p current_colors::StartEndColorPair = (child_color, parent_color) # We don't have to check data label because these nodes are already in the # partial path, so we have already ensured that the colors are appropriate - probability_of_edge = 0.0 + probability_no_edge = 1.0 if (haskey(edge_deg, parent_color) && haskey(edge_deg[parent_color], child_color)) - if usingStoredStats - # we flip this because the matching graph finds the path between two nodes, - # where the last node is the start of the closing edge - current_cycle_description = CyclePathAndColors(path_bools, current_colors) - if haskey(summary.cycle_probabilities, current_cycle_description) - probability_of_edge = summary.cycle_probabilities[current_cycle_description] - elseif 
haskey(summary.cycle_probabilities, default_cycle_description) - probability_of_edge = summary.cycle_probabilities[default_cycle_description] - elseif haskey(summary.cycle_length_probabilities, path_length) - probability_of_edge = summary.cycle_length_probabilities[path_length] - else - probability_of_edge = get_independent_cycle_likelihood(edge_label, child_label, parent_color, child_color, summary) + if usingStoredStats && length(all_path_bools) > 0 + for path_bools in all_path_bools + path_length = length(path_bools) + default_cycle_description = CyclePathAndColors(path_bools, default_colors) + current_cycle_description = CyclePathAndColors(path_bools, current_colors) + if haskey(summary.cycle_probabilities, current_cycle_description) + probability_no_edge *= 1.0 - summary.cycle_probabilities[current_cycle_description] + elseif haskey(summary.cycle_probabilities, default_cycle_description) + probability_no_edge *= 1.0 - summary.cycle_probabilities[default_cycle_description] + elseif haskey(summary.cycle_length_probabilities, path_length) + probability_no_edge *= 1.0 - summary.cycle_length_probabilities[path_length] + else + probability_no_edge *= 1.0 - get_independent_cycle_likelihood(edge_label, child_label, parent_color, child_color, summary) + end end else - probability_of_edge = get_independent_cycle_likelihood(edge_label, child_label, parent_color, child_color, summary) + probability_no_edge *= 1.0 - get_independent_cycle_likelihood(edge_label, child_label, parent_color, child_color, summary) end end partial_paths[i][2][1] = 0 - partial_paths[i][2][2] *= probability_of_edge + partial_paths[i][2][2] *= 1.0 - probability_no_edge end end end @@ -203,7 +261,8 @@ end function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_partial_paths = nothing, use_partial_sums = true, verbose = false, usingStoredStats = false, - include_cycles = true, sampling_strategy=weighted) + include_cycles = true, sampling_strategy=weighted, + only_shortest_path_cycle=false) node_order = get_min_width_node_order(query.graph) #spanning tree to cut out cycles if verbose println("Node Order:", node_order) @@ -354,7 +413,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_pa end if (include_cycles) - handle_extra_edges!(query, summary, partial_paths, current_query_nodes, visited_query_edges, usingStoredStats) + handle_extra_edges!(query, summary, partial_paths, current_query_nodes, visited_query_edges, usingStoredStats, only_shortest_path_cycle) end end From b35dcf63948de5b31fc6fbe750bfffabb6c2f9bb Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Wed, 18 Oct 2023 13:17:00 -0700 Subject: [PATCH 2/6] Only include cycles on visited edges --- Experiments/Scripts/cycle_prob_exps.jl | 4 ++-- Source/QuasiStableCardinalityEstimator.jl | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl index 77d352b..6933dc7 100644 --- a/Experiments/Scripts/cycle_prob_exps.jl +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -2,7 +2,7 @@ using Profile include("../Experiments.jl") -datasets = [aids, yeast, dblp] +datasets = [aids, yeast, hprd, dblp, youtube, wordnet] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}() @@ -16,6 +16,6 @@ end #build_experiments(build_params) -run_estimation_experiments(experiment_params) +#run_estimation_experiments(experiment_params) graph_grouped_box_plot(experiment_params; grouping=cycle_stats, 
filename="cycle_stats_exps") diff --git a/Source/QuasiStableCardinalityEstimator.jl b/Source/QuasiStableCardinalityEstimator.jl index 5ddb37f..273af3c 100644 --- a/Source/QuasiStableCardinalityEstimator.jl +++ b/Source/QuasiStableCardinalityEstimator.jl @@ -119,10 +119,15 @@ function get_simple_paths_dfs!(visited::Set{Int}, cur::Int, finish::Int, max_len end # gets all directed, simple paths from the start to finish node -function get_all_simple_path_bools(start::Int, finish::Int, max_length::Int, query_graph::DiGraph) - # convert the graph to be undirected - graph_copy = Graph(copy(query_graph)) +function get_all_simple_path_bools(start::Int, finish::Int, max_length::Int, + query_graph::DiGraph, visited_edges::Vector{Tuple{Int,Int}}) + # convert the graph to be undirected and only include the edges that have already been processed + graph_copy = Graph(nv(query_graph)) + for edge in visited_edges + add_edge!(graph_copy, edge[1], edge[2]) + end rem_edge!(graph_copy, start, finish) + visited = Set{Int}() current_path = Vector{Int}() simple_paths = Vector{Vector{Int}}() @@ -181,12 +186,12 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p if ! ((src(edge), dst(edge)) in visited_query_edges) && (src(edge) in current_query_nodes && dst(edge) in current_query_nodes) push!(remaining_edges, (src(edge), dst(edge))) - push!(visited_query_edges, (src(edge), dst(edge))) end end # scale down the average if there are remaining non-tree-edges for edge in remaining_edges + push!(visited_query_edges, edge) parent_node_idx::Int = only(indexin(edge[1], current_query_nodes)) new_node_idx::Int = only(indexin(edge[2], current_query_nodes)) child_label::Int = only(query.vertex_labels[edge[2]]) @@ -195,7 +200,7 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p if only_shortest_path_cycle all_path_bools = [convert_path_graph_to_bools(get_matching_graph(edge[2], edge[1], query))] else - all_path_bools = get_all_simple_path_bools(edge[2], edge[1], summary.max_cycle_size, query.graph) + all_path_bools = get_all_simple_path_bools(edge[2], edge[1], summary.max_cycle_size, query.graph, visited_query_edges) end default_colors::StartEndColorPair = (-1, -1) From bb62125f645e175a700f42b1caaac5cbb8b36a78 Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Wed, 18 Oct 2023 16:20:12 -0700 Subject: [PATCH 3/6] add memory exps --- Experiments/Scripts/cycle_prob_exps.jl | 14 +++-- Experiments/Scripts/memory_exps.jl | 20 +++++++ Experiments/build_color_summaries.jl | 10 +++- Experiments/graph_results.jl | 76 +++++++++++++++++++++++++- 4 files changed, 109 insertions(+), 11 deletions(-) create mode 100644 Experiments/Scripts/memory_exps.jl diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl index 6933dc7..7e99551 100644 --- a/Experiments/Scripts/cycle_prob_exps.jl +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -7,15 +7,19 @@ datasets = [aids, yeast, hprd, dblp, youtube, wordnet] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}() for dataset in datasets - push!(build_params, ExperimentParams(dataset=dataset)) + push!(build_params, ExperimentParams(dataset=dataset, + num_colors=16, + label_refining_rounds=2)) for only_shortest_path_cycle in [false, true] push!(experiment_params, ExperimentParams(dataset=dataset, - only_shortest_path_cycle=only_shortest_path_cycle)) + num_colors=16, + label_refining_rounds=2, + only_shortest_path_cycle=only_shortest_path_cycle)) end end 
-#build_experiments(build_params) +build_experiments(build_params) -#run_estimation_experiments(experiment_params) +run_estimation_experiments(experiment_params) -graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps") +graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps_w_refining") diff --git a/Experiments/Scripts/memory_exps.jl b/Experiments/Scripts/memory_exps.jl new file mode 100644 index 0000000..3a173b6 --- /dev/null +++ b/Experiments/Scripts/memory_exps.jl @@ -0,0 +1,20 @@ + +using Profile +include("../Experiments.jl") + +#datasets = [aids, yeast, hprd, dblp, youtube, wordnet] +datasets = [aids, yeast, hprd, dblp, youtube, wordnet] +num_colors = [4, 8, 16, 32, 64, 128] +experiment_params = Vector{ExperimentParams}() +build_params = Vector{ExperimentParams}() +for dataset in datasets + for n in num_colors + push!(build_params, ExperimentParams(dataset=dataset, num_colors=n)) + end +end +build_experiments(build_params) + +graph_grouped_bar_plot(build_params; grouping=number_of_colors, + y_type=memory_footprint, + y_lims=[1, 10000], + filename="memory_size_vs_colors") diff --git a/Experiments/build_color_summaries.jl b/Experiments/build_color_summaries.jl index c7615ca..00ae7c2 100644 --- a/Experiments/build_color_summaries.jl +++ b/Experiments/build_color_summaries.jl @@ -1,6 +1,6 @@ function build_experiments(experiment_params_list::Vector{ExperimentParams}) - build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime")] for experiment_params in experiment_params_list + build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime", "MemoryFootprint")] dataset = experiment_params.dataset summary_params = experiment_params.summary_params data = load_dataset(dataset) @@ -8,9 +8,13 @@ function build_experiments(experiment_params_list::Vector{ExperimentParams}) summary_file_location = "Experiments/SerializedSummaries/" * summary_name println("Building Color Summary: ", summary_name) results = @timed generate_color_summary(data, summary_params; verbose=1) + summary_size = Base.summarysize(results.value) serialize(summary_file_location, results.value) - push!(build_times, (string(dataset), string(summary_params.partitioner), - string(summary_params.num_colors), string(results.time))) + push!(build_times, (string(dataset), + string(summary_params.partitioner), + string(summary_params.num_colors), + string(results.time), + string(summary_size))) results_filename = params_to_results_filename(experiment_params) result_file_location = "Experiments/Results/Build_" * results_filename writedlm(result_file_location, build_times, ",") diff --git a/Experiments/graph_results.jl b/Experiments/graph_results.jl index 7db9176..6d8bf4d 100644 --- a/Experiments/graph_results.jl +++ b/Experiments/graph_results.jl @@ -1,7 +1,7 @@ -@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats +@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors #todo: query type -@enum VALUE estimate_error runtime +@enum VALUE estimate_error runtime memory_footprint function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams}; x_type::GROUP=dataset, y_type::VALUE=estimate_error, @@ -43,7 +43,7 @@ function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams} # See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15 ENV["GKSwstype"]="100" 
gbplot = groupedboxplot(x_values, y_values, group = groups, yscale =:log10, - ylims=[10^-13, 10^11], yticks=[10^-10, 10^-5, 1, 10^5, 10^10], + ylims=[10^-13, 10^11], yticks=[10^-10, 10^-5, 10^-2, 1, 10^2, 10^5, 10^10], legend = :outertopleft, size = (1000, 600)) x_label !== nothing && xlabel!(gbplot, x_label) y_label !== nothing && ylabel!(gbplot, y_label) @@ -51,6 +51,74 @@ function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams} savefig(gbplot, "Experiments/Results/Figures/" * plotname) end +function graph_grouped_bar_plot(experiment_params_list::Vector{ExperimentParams}; + x_type::GROUP=dataset, + y_type::VALUE=estimate_error, + grouping::GROUP=technique, + x_label=nothing, + y_label=nothing, + y_lims=[0, 10], + filename=nothing) + # for now let's just use the dataset as the x-values and the cycle size as the groups + x_values = [] + y_values = Float64[] + groups = [] + for experiment_params in experiment_params_list + # load the results + results_filename = params_to_results_filename(experiment_params) + prefix = "Experiments/Results/Estimation_" + if y_type == memory_footprint + prefix = "Experiments/Results/Build_" + end + results_path = prefix * results_filename + results_df = CSV.read(results_path, DataFrame; normalizenames=true) + + # get the x_value and grouping (same for all results in this experiment param) + println(results_df) + # keep track of the data points + for i in 1:nrow(results_df) + current_x = x_type == query_type ? results_df[i, :QueryType] : get_value_from_param(experiment_params, x_type) + current_group = grouping == query_type ? results_df[i, :QueryType] : get_value_from_param(experiment_params, grouping) + current_y = 0 + if y_type == estimate_error + current_y = results_df[i, :Estimate] / results_df[i, :TrueCard] + elseif y_type == memory_footprint + current_y = results_df[i, :MemoryFootprint]/(10^6) + else + # y_type == runtime + current_y = results_df[i, :EstimationTime] + end + # push the errors and their groupings into the correct vector + push!(x_values, current_x) + push!(y_values, current_y) + push!(groups, current_group) + end + end + results_filename = params_to_results_filename(experiment_params_list[1]) + println("starting graphs") + + # This seems to be necessary for using Plots.jl outside of the ipynb framework. + # See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15 + ENV["GKSwstype"]="100" + println(x_values) + println(y_values) + println(groups) + gbplot = StatsPlots.groupedbar(x_values, + y_values, + group = groups, +# yscale =:log10, + ylims=y_lims, + legend = :outertopleft, + size = (1000, 600)) + x_label !== nothing && xlabel!(gbplot, x_label) + y_label !== nothing && ylabel!(gbplot, y_label) + plotname = (isnothing(filename)) ? 
results_filename * ".png" : filename * ".png" + savefig(gbplot, "Experiments/Results/Figures/" * plotname) +end + + + + # default to grouping by dataset function get_value_from_param(experiment_param::ExperimentParams, value_type::GROUP) if value_type == dataset @@ -65,6 +133,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR return experiment_param.sampling_strategy elseif value_type == cycle_stats return experiment_param.only_shortest_path_cycle + elseif value_type == number_of_colors + return experiment_param.summary_params.num_colors else # default to grouping by technique return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds) From 3b16fb182d15ce135fe6449d247484ff10d70141 Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Wed, 18 Oct 2023 17:39:34 -0700 Subject: [PATCH 4/6] Change color & stats types to reduce memory --- Experiments/Scripts/cycle_prob_exps.jl | 4 +-- Experiments/Scripts/memory_exps.jl | 8 +++--- Source/CardinalityWithColors.jl | 4 ++- Source/ColorSummary.jl | 30 +++++++++++------------ Source/ColoringMethods.jl | 26 ++++++++++---------- Source/ExactSizeCalculator.jl | 14 +++++------ Source/QuasiStableCardinalityEstimator.jl | 22 ++++++++--------- Source/datasets.jl | 8 +++--- 8 files changed, 58 insertions(+), 58 deletions(-) diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl index 7e99551..85623d3 100644 --- a/Experiments/Scripts/cycle_prob_exps.jl +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -12,8 +12,6 @@ for dataset in datasets label_refining_rounds=2)) for only_shortest_path_cycle in [false, true] push!(experiment_params, ExperimentParams(dataset=dataset, - num_colors=16, - label_refining_rounds=2, only_shortest_path_cycle=only_shortest_path_cycle)) end end @@ -22,4 +20,4 @@ build_experiments(build_params) run_estimation_experiments(experiment_params) -graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps_w_refining") +graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps") diff --git a/Experiments/Scripts/memory_exps.jl b/Experiments/Scripts/memory_exps.jl index 3a173b6..1ec3c29 100644 --- a/Experiments/Scripts/memory_exps.jl +++ b/Experiments/Scripts/memory_exps.jl @@ -3,7 +3,7 @@ using Profile include("../Experiments.jl") #datasets = [aids, yeast, hprd, dblp, youtube, wordnet] -datasets = [aids, yeast, hprd, dblp, youtube, wordnet] +datasets = [aids, yeast, hprd, dblp, wordnet] num_colors = [4, 8, 16, 32, 64, 128] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}() @@ -12,9 +12,9 @@ for dataset in datasets push!(build_params, ExperimentParams(dataset=dataset, num_colors=n)) end end -build_experiments(build_params) +#build_experiments(build_params) graph_grouped_bar_plot(build_params; grouping=number_of_colors, y_type=memory_footprint, - y_lims=[1, 10000], - filename="memory_size_vs_colors") + y_lims=[0, 16], + filename="memory_size_vs_colors_fp32_int16") diff --git a/Source/CardinalityWithColors.jl b/Source/CardinalityWithColors.jl index 68777c9..5608241 100644 --- a/Source/CardinalityWithColors.jl +++ b/Source/CardinalityWithColors.jl @@ -10,7 +10,9 @@ using Graphs: SimpleDiGraphFromIterator, Edge, DiGraph, edges, nv, ne, add_edge! 
BoolPath = Vector{Bool} -StartEndColorPair = Tuple{Int, Int} +NodeId = Int +Color = Int16 +StartEndColorPair = Tuple{Color, Color} abstract type Comparable end import Base .== function ==(a::T, b::T) where T <: Comparable diff --git a/Source/ColorSummary.jl b/Source/ColorSummary.jl index 40a07cb..99470c8 100644 --- a/Source/ColorSummary.jl +++ b/Source/ColorSummary.jl @@ -1,12 +1,12 @@ using Graphs struct DegreeStats - min_out::Float64 - avg_out::Float64 - max_out::Float64 - min_in::Float64 - avg_in::Float64 - max_in::Float64 + min_out::Float32 + avg_out::Float32 + max_out::Float32 + min_in::Float32 + avg_in::Float32 + max_in::Float32 function DegreeStats(min_out, avg_out, max_out) return new(min_out, avg_out, max_out, 0, 0, 0) @@ -22,9 +22,9 @@ end # a particular color. Note that `-1` is used to represent a "wildcard" label. These do not appear in the data graph, # but they do occur in the query graph. struct ColorSummary - color_label_cardinality::Dict{Int, Dict{Int, Int}} # color_label_cardinality[c][v] = num_vertices - edge_deg::Dict{Int, Dict{Int, Dict{Int, Dict{Int, DegreeStats}}}} # edge_min_out_deg[e][v2][c1][c2] = min - color_filters::Dict{Int, BloomFilter} # color_filters[c] = filter + color_label_cardinality::Dict{Color, Dict{Int, Int}} # color_label_cardinality[c][v] = num_vertices + edge_deg::Dict{Int, Dict{Int, Dict{Color, Dict{Color, DegreeStats}}}} # edge_min_out_deg[e][v2][c1][c2] = min + color_filters::Dict{Color, BloomFilter} # color_filters[c] = filter cycle_probabilities::Dict{CyclePathAndColors, Float64} # cycle_probabilities[[c1, c2], path] = likelihood cycle_length_probabilities::Dict{Int, Float64} #cycle_probabilities[path_length] = likelihood max_cycle_size::Int @@ -40,9 +40,9 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu if (verbose > 0) println("Started coloring") end - color_filters = Dict() - color_label_cardinality = Dict() - color_hash::Dict{Int, Int32} = color_graph(g, params, params.num_colors) + color_filters::Dict{Color, BloomFilter} = Dict() + color_label_cardinality::Dict{Color, Int} = Dict() + color_hash::Dict{NodeId, Color} = color_graph(g, params, params.num_colors) color_sizes = [0 for _ in 1:maximum(values(color_hash))] for c in values(color_hash) color_sizes[c] += 1 @@ -97,7 +97,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu println("Started tracking statistics") end # We keep separate degree statistics for in-degree and out-degree. - color_to_color_out_counter::Dict{Int32, Dict{Int32, Any}} = Dict() + color_to_color_out_counter::Dict{Color, Dict{Color, Any}} = Dict() for x in vertices(g.graph) c1 = color_hash[x] for y in outneighbors(g.graph,x) @@ -132,7 +132,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu end end - edge_deg::Dict{Int, Dict{Int, Dict{Int, Dict{Int, DegreeStats}}}} = Dict() + edge_deg::Dict{Int, Dict{Int, Dict{Color, Dict{Color, DegreeStats}}}} = Dict() for edge_label in keys(color_to_color_out_counter) edge_deg[edge_label] = Dict() for vertex_label in keys(color_to_color_out_counter[edge_label]) @@ -160,7 +160,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu end # We keep separate degree statistics for in-degree and out-degree. 
- color_to_color_in_counter::Dict{Int32, Dict{Int32, Any}} = Dict() + color_to_color_in_counter::Dict{Color, Dict{Color, Any}} = Dict() for x in vertices(g.graph) c1 = color_hash[x] for y in inneighbors(g.graph,x) diff --git a/Source/ColoringMethods.jl b/Source/ColoringMethods.jl index fba1fd7..471fd6d 100644 --- a/Source/ColoringMethods.jl +++ b/Source/ColoringMethods.jl @@ -301,7 +301,7 @@ function _edge_ratio_color(g::DataGraph, num_colors::Int) return color_hash end -function _recursive_label_split(g::DataGraph, group::Vector{Int}, depth::Int, max_depth::Int) +function _recursive_label_split(g::DataGraph, group::Vector{NodeId}, depth::Int, max_depth::Int) if depth == max_depth return [group] end @@ -323,8 +323,8 @@ function _recursive_label_split(g::DataGraph, group::Vector{Int}, depth::Int, ma end end - left_group = Vector{Int}() - right_group = Vector{Int}() + left_group = Vector{NodeId}() + right_group = Vector{NodeId}() for v in group if most_even_label in g.vertex_labels[v] push!(right_group, v) @@ -339,9 +339,9 @@ end # It does this by recursively choosing a single label which most evenly breaks the color # into two sub-colors with up to `label_refining_rounds` depth. function _refine_by_vertex_labels(g::DataGraph, params::ColorSummaryParams, - color_hash::Dict{Int, Int}, label_refining_rounds::Int) - refined_color_hash::Dict{Int, Int} = Dict() - color_to_vertices::Dict{Int, Vector{Int}} = Dict() + color_hash::Dict{NodeId, Color}, label_refining_rounds::Int) + refined_color_hash::Dict{NodeId, Color} = Dict() + color_to_vertices::Dict{Color, Vector{NodeId}} = Dict() for v in keys(color_hash) color = color_hash[v] if haskey(color_to_vertices, color) @@ -373,7 +373,7 @@ end # It does this by recursively choosing a single label which has the greatest stddev w.r.t. # the edge count of vertices and splitting the nodes based on their edge count for that label # with up to `label_refining_rounds` depth. 
-function _recursive_neighbor_split(g::DataGraph, group::Vector{Int}, depth::Int, max_depth::Int) +function _recursive_neighbor_split(g::DataGraph, group::Vector{NodeId}, depth::Int, max_depth::Int) if depth == max_depth return [group] end @@ -409,8 +409,8 @@ function _recursive_neighbor_split(g::DataGraph, group::Vector{Int}, depth::Int, end end - left_group = Vector{Int}() - right_group = Vector{Int}() + left_group = Vector{NodeId}() + right_group = Vector{NodeId}() for v in group label_count = 0 for n in all_neighbors(g.graph, v) @@ -428,9 +428,9 @@ function _recursive_neighbor_split(g::DataGraph, group::Vector{Int}, depth::Int, end function _refine_by_neighbor_labels(g::DataGraph, params::ColorSummaryParams, - color_hash::Dict{Int, Int}, label_refining_rounds::Int) - refined_color_hash::Dict{Int, Int} = Dict() - color_to_vertices::Dict{Int, Vector{Int}} = Dict() + color_hash::Dict{NodeId, Color}, label_refining_rounds::Int) + refined_color_hash::Dict{NodeId, Color} = Dict() + color_to_vertices::Dict{Color, Vector{NodeId}} = Dict() for v in keys(color_hash) color = color_hash[v] if haskey(color_to_vertices, color) @@ -457,7 +457,7 @@ function _refine_by_neighbor_labels(g::DataGraph, params::ColorSummaryParams, end function color_graph(g::DataGraph, params::ColorSummaryParams, num_colors::Int) - color_hash::Dict{Int, Int} = if params.partitioner == QuasiStable + color_hash::Dict{NodeId, Color} = if params.partitioner == QuasiStable _quasi_stable_coloring(g, params, num_colors) elseif params.partitioner == Hash _hash_coloring(g, num_colors) diff --git a/Source/ExactSizeCalculator.jl b/Source/ExactSizeCalculator.jl index 946ec64..f623ef6 100644 --- a/Source/ExactSizeCalculator.jl +++ b/Source/ExactSizeCalculator.jl @@ -1,6 +1,6 @@ # This file contains a prototype implementation of exact sub-graph counting. -function sum_over_node_exact!(partial_paths::Vector{Tuple{Vector{Int}, Int}}, +function sum_over_node_exact!(partial_paths::Vector{Tuple{Vector{NodeId}, Int}}, current_query_nodes, node_to_remove, timeout, start_time) nodeIdx = 1 @@ -10,7 +10,7 @@ function sum_over_node_exact!(partial_paths::Vector{Tuple{Vector{Int}, Int}}, end nodeIdx += 1 end - new_partial_paths::Dict{Vector{Int}, Union{Vector{Float64}, Int}} = Dict() + new_partial_paths::Dict{Vector{NodeId}, Union{Vector{Float64}, Int}} = Dict() for path_and_bounds in partial_paths if timeout > 0 && time() - start_time > timeout println("Timeout Reached") @@ -33,7 +33,7 @@ function sum_over_node_exact!(partial_paths::Vector{Tuple{Vector{Int}, Int}}, end -function sum_over_finished_query_nodes_exact!(query::QueryGraph, partial_paths::Vector{Tuple{Vector{Int}, Int}}, +function sum_over_finished_query_nodes_exact!(query::QueryGraph, partial_paths::Vector{Tuple{Vector{NodeId}, Int}}, current_query_nodes, visited_query_edges, timeout, start_time; nodes_to_not_sum = []) prev_query_nodes = copy(current_query_nodes) @@ -56,10 +56,10 @@ function sum_over_finished_query_nodes_exact!(query::QueryGraph, partial_paths:: end function handle_extra_edges_exact!(query::QueryGraph, data::DataGraph, - partial_paths::Vector{Tuple{Vector{Int}, Int}}, + partial_paths::Vector{Tuple{Vector{NodeId}, Int}}, current_query_nodes, visited_query_edges, timeout, start_time) - remaining_edges::Vector{Tuple{Int, Int}} = [] + remaining_edges::Vector{Tuple{NodeId, NodeId}} = [] for edge in edges(query.graph) # since the edge's nodes are already processed, we don't have to check if ! 
((src(edge), dst(edge)) in visited_query_edges) && @@ -126,7 +126,7 @@ function sample_paths_exact(partial_paths, num_samples::Int) end # choose a sample of the paths - path_samples::Vector{Tuple{Vector{Int}, Int}} = sample(partial_paths, num_samples; replace=false) + path_samples::Vector{Tuple{Vector{NodeId}, Int}} = sample(partial_paths, num_samples; replace=false) # sum up the sampled bounds sampled_bounds_sum = 0 @@ -157,7 +157,7 @@ function get_subgraph_counts(query::QueryGraph, data::DataGraph; use_partial_sum start_time = time() node_order = get_min_width_node_order(query.graph) - partial_paths::Vector{Tuple{Vector{Int}, Int}} = [] + partial_paths::Vector{Tuple{Vector{NodeId}, Int}} = [] visited_query_edges = [] current_query_nodes = [] if verbose diff --git a/Source/QuasiStableCardinalityEstimator.jl b/Source/QuasiStableCardinalityEstimator.jl index 273af3c..98a1795 100644 --- a/Source/QuasiStableCardinalityEstimator.jl +++ b/Source/QuasiStableCardinalityEstimator.jl @@ -4,7 +4,7 @@ # Equivalently, they perform a groupby on all other nodes of the query graph. The goal of this is to prevent # an exponential growth in the number of paths through the lifted color graph. However, we can only remove query nodes whose # edges have already been processed. -function sum_over_node!(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, current_query_nodes, node_to_remove) +function sum_over_node!(partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}}, current_query_nodes, node_to_remove) nodeIdx = 1 for node in current_query_nodes if node == node_to_remove @@ -12,7 +12,7 @@ function sum_over_node!(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64} end nodeIdx += 1 end - new_partial_paths::Dict{Vector{Int}, Union{Vector{Float64}, Int}} = Dict() + new_partial_paths::Dict{Vector{Color}, Union{Vector{Float64}, Int}} = Dict() for path_and_bounds in partial_paths path = path_and_bounds[1] bounds = path_and_bounds[2] @@ -33,7 +33,7 @@ end @enum SAMPLING_STRATEGY uniform weighted redistributive -function sample_paths(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, num_samples::Int, sampling_strategy::SAMPLING_STRATEGY) +function sample_paths(partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}}, num_samples::Int, sampling_strategy::SAMPLING_STRATEGY) # partial_path[x] = (color path, bounds) partial_paths = [x for x in partial_paths if x[2][2] > 0] @@ -56,7 +56,7 @@ function sample_paths(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}} if sampling_strategy == uniform sample_weights = AnalyticWeights([1.0 for _ in eachindex(partial_paths)] ./ length(partial_paths)) end - path_samples::Vector{Tuple{Vector{Int}, Vector{Float64}}} = sample(partial_paths, sample_weights, num_samples; replace=false) + path_samples::Vector{Tuple{Vector{Color}, Vector{Float64}}} = sample(partial_paths, sample_weights, num_samples; replace=false) # sum up the sampled bounds sampled_bounds_sum::Vector{Float64} = [0,0,0] @@ -175,7 +175,7 @@ function get_matching_graph(start::Int, finish::Int, query::QueryGraph) return new_graph end -function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, +function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}}, current_query_nodes::Vector{Int}, visited_query_edges::Vector{Tuple{Int,Int}}, usingStoredStats::Bool, only_shortest_path_cycle::Bool) # To account for cyclic queries, we check whether there 
are any remaining edges that have not @@ -243,7 +243,7 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p end end -function sum_over_finished_query_nodes!(query::QueryGraph, partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, +function sum_over_finished_query_nodes!(query::QueryGraph, partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}}, current_query_nodes::Vector{Int}, visited_query_edges::Vector{Tuple{Int, Int}}) prev_query_nodes = copy(current_query_nodes) for node in prev_query_nodes @@ -274,7 +274,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_pa end # Because the label is implied by the color -> query_graph_vertex mapping stored in current_query_nodes, # we don't have to keep the label in the partial paths object. - partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}} = [] # each tuple contains a pairing of color paths -> bounds + partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}} = [] # each tuple contains a pairing of color paths -> bounds visited_query_edges::Vector{Tuple{Int,Int}} = [] current_query_nodes::Vector{Int} = [] @@ -352,17 +352,17 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_pa new_label = only(query.vertex_labels[new_node]) new_data_labels = get_data_label(query, new_node) - new_partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}} = [] + new_partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}} = [] # Update the partial paths using the parent-child combo that comes next from the query. - edge_deg::Dict{Int, Dict{Int, DegreeStats}} = Dict() + edge_deg::Dict{Color, Dict{Color, DegreeStats}} = Dict() if haskey(summary.edge_deg, edge_label) && haskey(summary.edge_deg[edge_label], new_label) edge_deg = summary.edge_deg[edge_label][new_label] end for path_and_bounds in partial_paths - path::Vector{Int} = path_and_bounds[1] + path::Vector{Color} = path_and_bounds[1] running_bounds::Vector{Float64} = path_and_bounds[2] old_color = path[parent_idx] @@ -385,7 +385,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_pa continue end degree_stats::DegreeStats = edge_deg[old_color][new_color] - new_path::Vector{Int} = [path..., new_color] + new_path::Vector{Color} = [path..., new_color] new_bounds::Vector{Float64} = [0, 0, 0] if out_edge new_bounds = [running_bounds[1]*degree_stats.min_out, diff --git a/Source/datasets.jl b/Source/datasets.jl index 3d4eb88..bac9e56 100644 --- a/Source/datasets.jl +++ b/Source/datasets.jl @@ -1,7 +1,7 @@ function load_dataset(path; subgraph_matching_data=false) n = 0 - edges::Array{Tuple{Tuple{Int, Int}, Int}} = [] - vertices::Array{Tuple{Int, Array{Int}}} = [] + edges::Array{Tuple{Tuple{NodeId, NodeId}, Int}} = [] + vertices::Array{Tuple{NodeId, Array{Int}}} = [] for line in eachline(path) if length(line) == 0 continue @@ -44,8 +44,8 @@ end function load_query(path; subgraph_matching_data=false) n = 0 - edges::Array{Tuple{Tuple{Int, Int}, Int}} = [] - vertices::Array{Tuple{Int, Int, Int}} = [] + edges::Array{Tuple{Tuple{NodeId, NodeId}, Int}} = [] + vertices::Array{Tuple{NodeId, Int, Int}} = [] for line in eachline(path) if length(line) == 0 continue From 25b3909b67c07215ad61e63e0e2bdb9886ae287b Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Wed, 18 Oct 2023 18:09:59 -0700 Subject: [PATCH 5/6] Fix small type issue --- Experiments/Scripts/accuracy_vs_runtime.jl | 17 ++++++++--------- Source/ColorSummary.jl | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff 
--git a/Experiments/Scripts/accuracy_vs_runtime.jl b/Experiments/Scripts/accuracy_vs_runtime.jl index ecfeeb1..39d1c23 100644 --- a/Experiments/Scripts/accuracy_vs_runtime.jl +++ b/Experiments/Scripts/accuracy_vs_runtime.jl @@ -24,8 +24,7 @@ function generate_num_colors_graph(dataset::DATASET) p99_runtimes = [] for colors in num_colors - experiment_params = ExperimentParams(dataset=dataset; - inference_max_paths=2*colors, num_colors=colors) + experiment_params = ExperimentParams(dataset=dataset; num_colors=colors) build_experiments([experiment_params]) println("Num Colors: ", colors) run_estimation_experiments([experiment_params]) @@ -113,11 +112,11 @@ function generate_partial_paths_graph(dataset::DATASET; num_colors = 64) savefig(fig, "Experiments/Results/Figures/PP_Accuracy_vs_Runtime_P99_" * string(dataset) * ".png") end -#generate_num_colors_graph(human) -#generate_num_colors_graph(aids) -#generate_num_colors_graph(hprd) -#generate_num_colors_graph(yeast) -#generate_partial_paths_graph(human) -#generate_partial_paths_graph(aids) -#generate_partial_paths_graph(hprd) +generate_num_colors_graph(human) +generate_num_colors_graph(aids) +generate_num_colors_graph(hprd) +generate_num_colors_graph(yeast) +generate_partial_paths_graph(human) +generate_partial_paths_graph(aids) +generate_partial_paths_graph(hprd) generate_partial_paths_graph(yeast) diff --git a/Source/ColorSummary.jl b/Source/ColorSummary.jl index 99470c8..b69dbbe 100644 --- a/Source/ColorSummary.jl +++ b/Source/ColorSummary.jl @@ -41,7 +41,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu println("Started coloring") end color_filters::Dict{Color, BloomFilter} = Dict() - color_label_cardinality::Dict{Color, Int} = Dict() + color_label_cardinality::Dict{Color, Any} = Dict() color_hash::Dict{NodeId, Color} = color_graph(g, params, params.num_colors) color_sizes = [0 for _ in 1:maximum(values(color_hash))] for c in values(color_hash) From 1be405b49b17b7bd3c28ffe1a95a98f2fe3c272e Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Thu, 19 Oct 2023 13:57:12 -0700 Subject: [PATCH 6/6] Script Changes --- Experiments/Scripts/accuracy_vs_runtime.jl | 8 ++++---- Experiments/Scripts/cycle_prob_exps.jl | 4 +--- Experiments/Scripts/sampling_strategies.jl | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Experiments/Scripts/accuracy_vs_runtime.jl b/Experiments/Scripts/accuracy_vs_runtime.jl index 39d1c23..d8d04b1 100644 --- a/Experiments/Scripts/accuracy_vs_runtime.jl +++ b/Experiments/Scripts/accuracy_vs_runtime.jl @@ -112,10 +112,10 @@ function generate_partial_paths_graph(dataset::DATASET; num_colors = 64) savefig(fig, "Experiments/Results/Figures/PP_Accuracy_vs_Runtime_P99_" * string(dataset) * ".png") end -generate_num_colors_graph(human) -generate_num_colors_graph(aids) -generate_num_colors_graph(hprd) -generate_num_colors_graph(yeast) +#generate_num_colors_graph(human) +#generate_num_colors_graph(aids) +#generate_num_colors_graph(hprd) +#generate_num_colors_graph(yeast) generate_partial_paths_graph(human) generate_partial_paths_graph(aids) generate_partial_paths_graph(hprd) diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl index 85623d3..69dfa04 100644 --- a/Experiments/Scripts/cycle_prob_exps.jl +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -7,9 +7,7 @@ datasets = [aids, yeast, hprd, dblp, youtube, wordnet] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}() for dataset in datasets - push!(build_params, 
ExperimentParams(dataset=dataset, - num_colors=16, - label_refining_rounds=2)) + push!(build_params, ExperimentParams(dataset=dataset)) for only_shortest_path_cycle in [false, true] push!(experiment_params, ExperimentParams(dataset=dataset, only_shortest_path_cycle=only_shortest_path_cycle)) diff --git a/Experiments/Scripts/sampling_strategies.jl b/Experiments/Scripts/sampling_strategies.jl index 24464a4..9593f4d 100644 --- a/Experiments/Scripts/sampling_strategies.jl +++ b/Experiments/Scripts/sampling_strategies.jl @@ -2,7 +2,7 @@ using Profile include("../Experiments.jl") -datasets = [human, aids, yeast, hprd, dblp] +datasets = [human, aids, yeast, hprd, dblp, wordnet] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}()
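
Notes on the two techniques this series introduces, with minimal Julia sketches.

The central change is how the probability of a closing (non-tree) edge is estimated once all simple paths between its endpoints are considered. The sketch below only illustrates the combination rule, assuming the per-path closure probabilities have already been looked up; the helper name combine_cycle_closure is illustrative and is not part of the estimator's API, which instead folds the same product into probability_no_edge inside handle_extra_edges! and falls back to get_independent_cycle_likelihood when no stored statistic matches.

    # Treat the closure of each simple path between the two endpoints as an
    # independent event. The closing edge exists if at least one path closes:
    #     P(closed) = 1 - prod_i (1 - p_i)
    # NOTE: combine_cycle_closure is a hypothetical helper, shown for clarity only.
    function combine_cycle_closure(path_probabilities::Vector{Float64})
        probability_no_edge = 1.0
        for p in path_probabilities
            probability_no_edge *= 1.0 - p
        end
        return 1.0 - probability_no_edge
    end

    # Example: two simple paths with closure probabilities 0.3 and 0.5 give
    # 1 - (0.7 * 0.5) = 0.65, higher than either single shortest-path estimate.
    combine_cycle_closure([0.3, 0.5])

The memory experiments measure summary size with Base.summarysize, which is what the MemoryFootprint column stores and what the plots show in MB (bytes / 10^6); this is also the motivation for narrowing Color to Int16 and the DegreeStats fields to Float32. A rough, standalone way to see the effect, using stand-in types rather than the real ColorSummary:

    # Stand-in structs only; the real summary nests Dicts of DegreeStats keyed by
    # labels and colors, but the width of the element types drives the footprint.
    struct WideStats;   avg_out::Float64; avg_in::Float64; end
    struct NarrowStats; avg_out::Float32; avg_in::Float32; end

    wide   = Dict{Int64, WideStats}(c => WideStats(1.5, 2.5) for c in 1:64)
    narrow = Dict{Int16, NarrowStats}(Int16(c) => NarrowStats(1.5f0, 2.5f0) for c in 1:64)

    # Base.summarysize reports the total in-memory footprint in bytes.
    println(Base.summarysize(wide), " vs ", Base.summarysize(narrow), " bytes")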