From dc44ebf6ca30fa9345a84eb4e4d68e7e39854976 Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Mon, 16 Oct 2023 17:18:34 -0700 Subject: [PATCH 1/6] Merge pull request #37 from mkyl:kbd-coloring-experiments-2 This PR adjusts how cycles are accounted for. Previously, the shortest path between the closing nodes is found and used to calculate the probability of closure. This adds an option to consider all simple paths between the nodes, treat their closure as independent events, then calculate the probability that at least one of them closes. --- Experiments/Scripts/cycle_prob_exps.jl | 21 +++++ Experiments/graph_results.jl | 4 +- Experiments/run_estimators.jl | 3 +- Experiments/utils.jl | 8 +- Source/ColorSummary.jl | 4 +- Source/QuasiStableCardinalityEstimator.jl | 103 +++++++++++++++++----- 6 files changed, 117 insertions(+), 26 deletions(-) create mode 100644 Experiments/Scripts/cycle_prob_exps.jl diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl new file mode 100644 index 0000000..77d352b --- /dev/null +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -0,0 +1,21 @@ + +using Profile +include("../Experiments.jl") + +datasets = [aids, yeast, dblp] + +experiment_params = Vector{ExperimentParams}() +build_params = Vector{ExperimentParams}() +for dataset in datasets + push!(build_params, ExperimentParams(dataset=dataset)) + for only_shortest_path_cycle in [false, true] + push!(experiment_params, ExperimentParams(dataset=dataset, + only_shortest_path_cycle=only_shortest_path_cycle)) + end +end + +#build_experiments(build_params) + +run_estimation_experiments(experiment_params) + +graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps") diff --git a/Experiments/graph_results.jl b/Experiments/graph_results.jl index 963570e..7db9176 100644 --- a/Experiments/graph_results.jl +++ b/Experiments/graph_results.jl @@ -1,4 +1,4 @@ -@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type +@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats #todo: query type @enum VALUE estimate_error runtime @@ -63,6 +63,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR return experiment_param.inference_max_paths elseif value_type == sampling_type return experiment_param.sampling_strategy + elseif value_type == cycle_stats + return experiment_param.only_shortest_path_cycle else # default to grouping by technique return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds) diff --git a/Experiments/run_estimators.jl b/Experiments/run_estimators.jl index 4d3209f..4466f0a 100644 --- a/Experiments/run_estimators.jl +++ b/Experiments/run_estimators.jl @@ -15,7 +15,8 @@ function run_estimation_experiments(experiment_params_list::Vector{ExperimentPar results = @timed get_cardinality_bounds(query, summary; max_partial_paths = experiment_params.inference_max_paths, use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true, - sampling_strategy=experiment_params.sampling_strategy) + sampling_strategy=experiment_params.sampling_strategy, + only_shortest_path_cycle= experiment_params.only_shortest_path_cycle) upper_bound = results.value[3] estimate = max(1, results.value[2]) lower_bound = results.value[1] diff --git a/Experiments/utils.jl b/Experiments/utils.jl index 79a50cb..8eca011 100644 --- a/Experiments/utils.jl +++ b/Experiments/utils.jl @@ -7,10 +7,14 @@ struct 
ExperimentParams dataset::DATASET summary_params::ColorSummaryParams inference_max_paths::Int + # Whether to consider just the shortest path when calculating cycle probabilities or + # all simple paths. + only_shortest_path_cycle::Bool use_partial_sums::Bool sampling_strategy::SAMPLING_STRATEGY - function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6, summary_max_paths=1000, + function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6, + only_shortest_path_cycle=false, summary_max_paths=1000, partitioner::PARTITIONER = QuasiStable, weighting=true, inference_max_paths=500, use_partial_sums=true, sampling_strategy=redistributive, label_refining_rounds = 0) return new(dataset, ColorSummaryParams(num_colors=num_colors, @@ -20,6 +24,7 @@ struct ExperimentParams weighting=weighting, label_refining_rounds=label_refining_rounds), inference_max_paths, + only_shortest_path_cycle, use_partial_sums, sampling_strategy ) @@ -30,6 +35,7 @@ function params_to_results_filename(experiment_params::ExperimentParams) name = string(experiment_params.dataset) * "_" name *= params_to_string(experiment_params.summary_params) * "_" name *= string(experiment_params.inference_max_paths) * "_" + name *= string(experiment_params.only_shortest_path_cycle) * "_" name *= string(experiment_params.use_partial_sums) * "_" name *= string(experiment_params.sampling_strategy) * ".csv" return name diff --git a/Source/ColorSummary.jl b/Source/ColorSummary.jl index 841f0d9..40a07cb 100644 --- a/Source/ColorSummary.jl +++ b/Source/ColorSummary.jl @@ -27,6 +27,7 @@ struct ColorSummary color_filters::Dict{Int, BloomFilter} # color_filters[c] = filter cycle_probabilities::Dict{CyclePathAndColors, Float64} # cycle_probabilities[[c1, c2], path] = likelihood cycle_length_probabilities::Dict{Int, Float64} #cycle_probabilities[path_length] = likelihood + max_cycle_size::Int total_edges::Int total_nodes::Int # for outdegrees, c2 is the color of the outneighbor @@ -234,7 +235,8 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu println("Finished tracking statistics") end return ColorSummary(color_label_cardinality, edge_deg, color_filters, - cycle_probabilities, cycle_length_probabilities, ne(g.graph), nv(g.graph)) + cycle_probabilities, cycle_length_probabilities, params.max_cycle_size, + ne(g.graph), nv(g.graph)) end function color_hash_to_groups(color_hash, num_colors) diff --git a/Source/QuasiStableCardinalityEstimator.jl b/Source/QuasiStableCardinalityEstimator.jl index 77fa933..5ddb37f 100644 --- a/Source/QuasiStableCardinalityEstimator.jl +++ b/Source/QuasiStableCardinalityEstimator.jl @@ -94,6 +94,58 @@ function sample_paths(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}} return path_samples end +function get_simple_paths_dfs!(visited::Set{Int}, cur::Int, finish::Int, max_length::Int, + graph::SimpleGraph, current_path::Vector{Int}, + simple_paths::Vector{Vector{Int}}) + length(current_path) > max_length && return + cur in visited && return + push!(visited, cur) + push!(current_path, cur) + if cur == finish + push!(simple_paths, deepcopy(current_path)) + delete!(visited, cur) + pop!(current_path) + return + end + + for next in all_neighbors(graph, cur) + get_simple_paths_dfs!(visited, next, finish, max_length, graph, current_path, + simple_paths) + end + if length(current_path) > 0 + pop!(current_path) + end + delete!(visited, cur) +end + +# gets all directed, simple paths from the start to finish node +function 
get_all_simple_path_bools(start::Int, finish::Int, max_length::Int, query_graph::DiGraph) + # convert the graph to be undirected + graph_copy = Graph(copy(query_graph)) + rem_edge!(graph_copy, start, finish) + visited = Set{Int}() + current_path = Vector{Int}() + simple_paths = Vector{Vector{Int}}() + get_simple_paths_dfs!(visited, start, finish, max_length, graph_copy, + current_path, simple_paths) + path_bools = Vector{BoolPath}() + for path in simple_paths + bools::Vector{Bool} = [false for _ in 1:length(path)-1] + for i in 1 : length(path)-1 + src_node = path[i] + dst_node = path[i+1] + if dst_node in outneighbors(query_graph, src_node) + bools[i] = true # out edge + else + bools[i] = false # in edge + end + end + push!(path_bools, bools) + end + return path_bools +end + + # gets the directed path from the start to finish node function get_matching_graph(start::Int, finish::Int, query::QueryGraph) # convert the graph to be undirected @@ -119,7 +171,8 @@ function get_matching_graph(start::Int, finish::Int, query::QueryGraph) end function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, - current_query_nodes::Vector{Int}, visited_query_edges::Vector{Tuple{Int,Int}}, usingStoredStats::Bool) + current_query_nodes::Vector{Int}, visited_query_edges::Vector{Tuple{Int,Int}}, usingStoredStats::Bool, + only_shortest_path_cycle::Bool) # To account for cyclic queries, we check whether there are any remaining edges that have not # been processed. If so, we set the lower bound to 0, reduce the average estimate accordingly, and leave # the upper bound unchanged. @@ -138,11 +191,14 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p new_node_idx::Int = only(indexin(edge[2], current_query_nodes)) child_label::Int = only(query.vertex_labels[edge[2]]) edge_label::Int = only(query.edge_labels[(edge[1],edge[2])]) - path_graph = get_matching_graph(edge[2], edge[1], query) - path_bools = convert_path_graph_to_bools(path_graph) + all_path_bools = Vector{BoolPath}() + if only_shortest_path_cycle + all_path_bools = [convert_path_graph_to_bools(get_matching_graph(edge[2], edge[1], query))] + else + all_path_bools = get_all_simple_path_bools(edge[2], edge[1], summary.max_cycle_size, query.graph) + end + default_colors::StartEndColorPair = (-1, -1) - default_cycle_description = CyclePathAndColors(path_bools, default_colors) - path_length = length(path_bools) edge_deg::Dict{Int, Dict{Int, DegreeStats}} = Dict() if haskey(summary.edge_deg, edge_label) && haskey(summary.edge_deg[edge_label], child_label) edge_deg = summary.edge_deg[edge_label][child_label] @@ -155,27 +211,29 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p current_colors::StartEndColorPair = (child_color, parent_color) # We don't have to check data label because these nodes are already in the # partial path, so we have already ensured that the colors are appropriate - probability_of_edge = 0.0 + probability_no_edge = 1.0 if (haskey(edge_deg, parent_color) && haskey(edge_deg[parent_color], child_color)) - if usingStoredStats - # we flip this because the matching graph finds the path between two nodes, - # where the last node is the start of the closing edge - current_cycle_description = CyclePathAndColors(path_bools, current_colors) - if haskey(summary.cycle_probabilities, current_cycle_description) - probability_of_edge = summary.cycle_probabilities[current_cycle_description] - elseif 
haskey(summary.cycle_probabilities, default_cycle_description) - probability_of_edge = summary.cycle_probabilities[default_cycle_description] - elseif haskey(summary.cycle_length_probabilities, path_length) - probability_of_edge = summary.cycle_length_probabilities[path_length] - else - probability_of_edge = get_independent_cycle_likelihood(edge_label, child_label, parent_color, child_color, summary) + if usingStoredStats && length(all_path_bools) > 0 + for path_bools in all_path_bools + path_length = length(path_bools) + default_cycle_description = CyclePathAndColors(path_bools, default_colors) + current_cycle_description = CyclePathAndColors(path_bools, current_colors) + if haskey(summary.cycle_probabilities, current_cycle_description) + probability_no_edge *= 1.0 - summary.cycle_probabilities[current_cycle_description] + elseif haskey(summary.cycle_probabilities, default_cycle_description) + probability_no_edge *= 1.0 - summary.cycle_probabilities[default_cycle_description] + elseif haskey(summary.cycle_length_probabilities, path_length) + probability_no_edge *= 1.0 - summary.cycle_length_probabilities[path_length] + else + probability_no_edge *= 1.0 - get_independent_cycle_likelihood(edge_label, child_label, parent_color, child_color, summary) + end end else - probability_of_edge = get_independent_cycle_likelihood(edge_label, child_label, parent_color, child_color, summary) + probability_no_edge *= 1.0 - get_independent_cycle_likelihood(edge_label, child_label, parent_color, child_color, summary) end end partial_paths[i][2][1] = 0 - partial_paths[i][2][2] *= probability_of_edge + partial_paths[i][2][2] *= 1.0 - probability_no_edge end end end @@ -203,7 +261,8 @@ end function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_partial_paths = nothing, use_partial_sums = true, verbose = false, usingStoredStats = false, - include_cycles = true, sampling_strategy=weighted) + include_cycles = true, sampling_strategy=weighted, + only_shortest_path_cycle=false) node_order = get_min_width_node_order(query.graph) #spanning tree to cut out cycles if verbose println("Node Order:", node_order) @@ -354,7 +413,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_pa end if (include_cycles) - handle_extra_edges!(query, summary, partial_paths, current_query_nodes, visited_query_edges, usingStoredStats) + handle_extra_edges!(query, summary, partial_paths, current_query_nodes, visited_query_edges, usingStoredStats, only_shortest_path_cycle) end end From b35dcf63948de5b31fc6fbe750bfffabb6c2f9bb Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Wed, 18 Oct 2023 13:17:00 -0700 Subject: [PATCH 2/6] Only include cycles on visited edges --- Experiments/Scripts/cycle_prob_exps.jl | 4 ++-- Source/QuasiStableCardinalityEstimator.jl | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl index 77d352b..6933dc7 100644 --- a/Experiments/Scripts/cycle_prob_exps.jl +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -2,7 +2,7 @@ using Profile include("../Experiments.jl") -datasets = [aids, yeast, dblp] +datasets = [aids, yeast, hprd, dblp, youtube, wordnet] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}() @@ -16,6 +16,6 @@ end #build_experiments(build_params) -run_estimation_experiments(experiment_params) +#run_estimation_experiments(experiment_params) graph_grouped_box_plot(experiment_params; grouping=cycle_stats, 
filename="cycle_stats_exps") diff --git a/Source/QuasiStableCardinalityEstimator.jl b/Source/QuasiStableCardinalityEstimator.jl index 5ddb37f..273af3c 100644 --- a/Source/QuasiStableCardinalityEstimator.jl +++ b/Source/QuasiStableCardinalityEstimator.jl @@ -119,10 +119,15 @@ function get_simple_paths_dfs!(visited::Set{Int}, cur::Int, finish::Int, max_len end # gets all directed, simple paths from the start to finish node -function get_all_simple_path_bools(start::Int, finish::Int, max_length::Int, query_graph::DiGraph) - # convert the graph to be undirected - graph_copy = Graph(copy(query_graph)) +function get_all_simple_path_bools(start::Int, finish::Int, max_length::Int, + query_graph::DiGraph, visited_edges::Vector{Tuple{Int,Int}}) + # convert the graph to be undirected and only include the edges that have already been processed + graph_copy = Graph(nv(query_graph)) + for edge in visited_edges + add_edge!(graph_copy, edge[1], edge[2]) + end rem_edge!(graph_copy, start, finish) + visited = Set{Int}() current_path = Vector{Int}() simple_paths = Vector{Vector{Int}}() @@ -181,12 +186,12 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p if ! ((src(edge), dst(edge)) in visited_query_edges) && (src(edge) in current_query_nodes && dst(edge) in current_query_nodes) push!(remaining_edges, (src(edge), dst(edge))) - push!(visited_query_edges, (src(edge), dst(edge))) end end # scale down the average if there are remaining non-tree-edges for edge in remaining_edges + push!(visited_query_edges, edge) parent_node_idx::Int = only(indexin(edge[1], current_query_nodes)) new_node_idx::Int = only(indexin(edge[2], current_query_nodes)) child_label::Int = only(query.vertex_labels[edge[2]]) @@ -195,7 +200,7 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p if only_shortest_path_cycle all_path_bools = [convert_path_graph_to_bools(get_matching_graph(edge[2], edge[1], query))] else - all_path_bools = get_all_simple_path_bools(edge[2], edge[1], summary.max_cycle_size, query.graph) + all_path_bools = get_all_simple_path_bools(edge[2], edge[1], summary.max_cycle_size, query.graph, visited_query_edges) end default_colors::StartEndColorPair = (-1, -1) From bb62125f645e175a700f42b1caaac5cbb8b36a78 Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Wed, 18 Oct 2023 16:20:12 -0700 Subject: [PATCH 3/6] add memory exps --- Experiments/Scripts/cycle_prob_exps.jl | 14 +++-- Experiments/Scripts/memory_exps.jl | 20 +++++++ Experiments/build_color_summaries.jl | 10 +++- Experiments/graph_results.jl | 76 +++++++++++++++++++++++++- 4 files changed, 109 insertions(+), 11 deletions(-) create mode 100644 Experiments/Scripts/memory_exps.jl diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl index 6933dc7..7e99551 100644 --- a/Experiments/Scripts/cycle_prob_exps.jl +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -7,15 +7,19 @@ datasets = [aids, yeast, hprd, dblp, youtube, wordnet] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}() for dataset in datasets - push!(build_params, ExperimentParams(dataset=dataset)) + push!(build_params, ExperimentParams(dataset=dataset, + num_colors=16, + label_refining_rounds=2)) for only_shortest_path_cycle in [false, true] push!(experiment_params, ExperimentParams(dataset=dataset, - only_shortest_path_cycle=only_shortest_path_cycle)) + num_colors=16, + label_refining_rounds=2, + only_shortest_path_cycle=only_shortest_path_cycle)) end end 
-#build_experiments(build_params) +build_experiments(build_params) -#run_estimation_experiments(experiment_params) +run_estimation_experiments(experiment_params) -graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps") +graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps_w_refining") diff --git a/Experiments/Scripts/memory_exps.jl b/Experiments/Scripts/memory_exps.jl new file mode 100644 index 0000000..3a173b6 --- /dev/null +++ b/Experiments/Scripts/memory_exps.jl @@ -0,0 +1,20 @@ + +using Profile +include("../Experiments.jl") + +#datasets = [aids, yeast, hprd, dblp, youtube, wordnet] +datasets = [aids, yeast, hprd, dblp, youtube, wordnet] +num_colors = [4, 8, 16, 32, 64, 128] +experiment_params = Vector{ExperimentParams}() +build_params = Vector{ExperimentParams}() +for dataset in datasets + for n in num_colors + push!(build_params, ExperimentParams(dataset=dataset, num_colors=n)) + end +end +build_experiments(build_params) + +graph_grouped_bar_plot(build_params; grouping=number_of_colors, + y_type=memory_footprint, + y_lims=[1, 10000], + filename="memory_size_vs_colors") diff --git a/Experiments/build_color_summaries.jl b/Experiments/build_color_summaries.jl index c7615ca..00ae7c2 100644 --- a/Experiments/build_color_summaries.jl +++ b/Experiments/build_color_summaries.jl @@ -1,6 +1,6 @@ function build_experiments(experiment_params_list::Vector{ExperimentParams}) - build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime")] for experiment_params in experiment_params_list + build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime", "MemoryFootprint")] dataset = experiment_params.dataset summary_params = experiment_params.summary_params data = load_dataset(dataset) @@ -8,9 +8,13 @@ function build_experiments(experiment_params_list::Vector{ExperimentParams}) summary_file_location = "Experiments/SerializedSummaries/" * summary_name println("Building Color Summary: ", summary_name) results = @timed generate_color_summary(data, summary_params; verbose=1) + summary_size = Base.summarysize(results.value) serialize(summary_file_location, results.value) - push!(build_times, (string(dataset), string(summary_params.partitioner), - string(summary_params.num_colors), string(results.time))) + push!(build_times, (string(dataset), + string(summary_params.partitioner), + string(summary_params.num_colors), + string(results.time), + string(summary_size))) results_filename = params_to_results_filename(experiment_params) result_file_location = "Experiments/Results/Build_" * results_filename writedlm(result_file_location, build_times, ",") diff --git a/Experiments/graph_results.jl b/Experiments/graph_results.jl index 7db9176..6d8bf4d 100644 --- a/Experiments/graph_results.jl +++ b/Experiments/graph_results.jl @@ -1,7 +1,7 @@ -@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats +@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors #todo: query type -@enum VALUE estimate_error runtime +@enum VALUE estimate_error runtime memory_footprint function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams}; x_type::GROUP=dataset, y_type::VALUE=estimate_error, @@ -43,7 +43,7 @@ function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams} # See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15 ENV["GKSwstype"]="100" 
gbplot = groupedboxplot(x_values, y_values, group = groups, yscale =:log10, - ylims=[10^-13, 10^11], yticks=[10^-10, 10^-5, 1, 10^5, 10^10], + ylims=[10^-13, 10^11], yticks=[10^-10, 10^-5, 10^-2, 1, 10^2, 10^5, 10^10], legend = :outertopleft, size = (1000, 600)) x_label !== nothing && xlabel!(gbplot, x_label) y_label !== nothing && ylabel!(gbplot, y_label) @@ -51,6 +51,74 @@ function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams} savefig(gbplot, "Experiments/Results/Figures/" * plotname) end +function graph_grouped_bar_plot(experiment_params_list::Vector{ExperimentParams}; + x_type::GROUP=dataset, + y_type::VALUE=estimate_error, + grouping::GROUP=technique, + x_label=nothing, + y_label=nothing, + y_lims=[0, 10], + filename=nothing) + # for now let's just use the dataset as the x-values and the cycle size as the groups + x_values = [] + y_values = Float64[] + groups = [] + for experiment_params in experiment_params_list + # load the results + results_filename = params_to_results_filename(experiment_params) + prefix = "Experiments/Results/Estimation_" + if y_type == memory_footprint + prefix = "Experiments/Results/Build_" + end + results_path = prefix * results_filename + results_df = CSV.read(results_path, DataFrame; normalizenames=true) + + # get the x_value and grouping (same for all results in this experiment param) + println(results_df) + # keep track of the data points + for i in 1:nrow(results_df) + current_x = x_type == query_type ? results_df[i, :QueryType] : get_value_from_param(experiment_params, x_type) + current_group = grouping == query_type ? results_df[i, :QueryType] : get_value_from_param(experiment_params, grouping) + current_y = 0 + if y_type == estimate_error + current_y = results_df[i, :Estimate] / results_df[i, :TrueCard] + elseif y_type == memory_footprint + current_y = results_df[i, :MemoryFootprint]/(10^6) + else + # y_type == runtime + current_y = results_df[i, :EstimationTime] + end + # push the errors and their groupings into the correct vector + push!(x_values, current_x) + push!(y_values, current_y) + push!(groups, current_group) + end + end + results_filename = params_to_results_filename(experiment_params_list[1]) + println("starting graphs") + + # This seems to be necessary for using Plots.jl outside of the ipynb framework. + # See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15 + ENV["GKSwstype"]="100" + println(x_values) + println(y_values) + println(groups) + gbplot = StatsPlots.groupedbar(x_values, + y_values, + group = groups, +# yscale =:log10, + ylims=y_lims, + legend = :outertopleft, + size = (1000, 600)) + x_label !== nothing && xlabel!(gbplot, x_label) + y_label !== nothing && ylabel!(gbplot, y_label) + plotname = (isnothing(filename)) ? 
results_filename * ".png" : filename * ".png" + savefig(gbplot, "Experiments/Results/Figures/" * plotname) +end + + + + # default to grouping by dataset function get_value_from_param(experiment_param::ExperimentParams, value_type::GROUP) if value_type == dataset @@ -65,6 +133,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR return experiment_param.sampling_strategy elseif value_type == cycle_stats return experiment_param.only_shortest_path_cycle + elseif value_type == number_of_colors + return experiment_param.summary_params.num_colors else # default to grouping by technique return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds) From 3b16fb182d15ce135fe6449d247484ff10d70141 Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Wed, 18 Oct 2023 17:39:34 -0700 Subject: [PATCH 4/6] Change color & stats types to reduce memory --- Experiments/Scripts/cycle_prob_exps.jl | 4 +-- Experiments/Scripts/memory_exps.jl | 8 +++--- Source/CardinalityWithColors.jl | 4 ++- Source/ColorSummary.jl | 30 +++++++++++------------ Source/ColoringMethods.jl | 26 ++++++++++---------- Source/ExactSizeCalculator.jl | 14 +++++------ Source/QuasiStableCardinalityEstimator.jl | 22 ++++++++--------- Source/datasets.jl | 8 +++--- 8 files changed, 58 insertions(+), 58 deletions(-) diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl index 7e99551..85623d3 100644 --- a/Experiments/Scripts/cycle_prob_exps.jl +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -12,8 +12,6 @@ for dataset in datasets label_refining_rounds=2)) for only_shortest_path_cycle in [false, true] push!(experiment_params, ExperimentParams(dataset=dataset, - num_colors=16, - label_refining_rounds=2, only_shortest_path_cycle=only_shortest_path_cycle)) end end @@ -22,4 +20,4 @@ build_experiments(build_params) run_estimation_experiments(experiment_params) -graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps_w_refining") +graph_grouped_box_plot(experiment_params; grouping=cycle_stats, filename="cycle_stats_exps") diff --git a/Experiments/Scripts/memory_exps.jl b/Experiments/Scripts/memory_exps.jl index 3a173b6..1ec3c29 100644 --- a/Experiments/Scripts/memory_exps.jl +++ b/Experiments/Scripts/memory_exps.jl @@ -3,7 +3,7 @@ using Profile include("../Experiments.jl") #datasets = [aids, yeast, hprd, dblp, youtube, wordnet] -datasets = [aids, yeast, hprd, dblp, youtube, wordnet] +datasets = [aids, yeast, hprd, dblp, wordnet] num_colors = [4, 8, 16, 32, 64, 128] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}() @@ -12,9 +12,9 @@ for dataset in datasets push!(build_params, ExperimentParams(dataset=dataset, num_colors=n)) end end -build_experiments(build_params) +#build_experiments(build_params) graph_grouped_bar_plot(build_params; grouping=number_of_colors, y_type=memory_footprint, - y_lims=[1, 10000], - filename="memory_size_vs_colors") + y_lims=[0, 16], + filename="memory_size_vs_colors_fp32_int16") diff --git a/Source/CardinalityWithColors.jl b/Source/CardinalityWithColors.jl index 68777c9..5608241 100644 --- a/Source/CardinalityWithColors.jl +++ b/Source/CardinalityWithColors.jl @@ -10,7 +10,9 @@ using Graphs: SimpleDiGraphFromIterator, Edge, DiGraph, edges, nv, ne, add_edge! 
BoolPath = Vector{Bool} -StartEndColorPair = Tuple{Int, Int} +NodeId = Int +Color = Int16 +StartEndColorPair = Tuple{Color, Color} abstract type Comparable end import Base .== function ==(a::T, b::T) where T <: Comparable diff --git a/Source/ColorSummary.jl b/Source/ColorSummary.jl index 40a07cb..99470c8 100644 --- a/Source/ColorSummary.jl +++ b/Source/ColorSummary.jl @@ -1,12 +1,12 @@ using Graphs struct DegreeStats - min_out::Float64 - avg_out::Float64 - max_out::Float64 - min_in::Float64 - avg_in::Float64 - max_in::Float64 + min_out::Float32 + avg_out::Float32 + max_out::Float32 + min_in::Float32 + avg_in::Float32 + max_in::Float32 function DegreeStats(min_out, avg_out, max_out) return new(min_out, avg_out, max_out, 0, 0, 0) @@ -22,9 +22,9 @@ end # a particular color. Note that `-1` is used to represent a "wildcard" label. These do not appear in the data graph, # but they do occur in the query graph. struct ColorSummary - color_label_cardinality::Dict{Int, Dict{Int, Int}} # color_label_cardinality[c][v] = num_vertices - edge_deg::Dict{Int, Dict{Int, Dict{Int, Dict{Int, DegreeStats}}}} # edge_min_out_deg[e][v2][c1][c2] = min - color_filters::Dict{Int, BloomFilter} # color_filters[c] = filter + color_label_cardinality::Dict{Color, Dict{Int, Int}} # color_label_cardinality[c][v] = num_vertices + edge_deg::Dict{Int, Dict{Int, Dict{Color, Dict{Color, DegreeStats}}}} # edge_min_out_deg[e][v2][c1][c2] = min + color_filters::Dict{Color, BloomFilter} # color_filters[c] = filter cycle_probabilities::Dict{CyclePathAndColors, Float64} # cycle_probabilities[[c1, c2], path] = likelihood cycle_length_probabilities::Dict{Int, Float64} #cycle_probabilities[path_length] = likelihood max_cycle_size::Int @@ -40,9 +40,9 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu if (verbose > 0) println("Started coloring") end - color_filters = Dict() - color_label_cardinality = Dict() - color_hash::Dict{Int, Int32} = color_graph(g, params, params.num_colors) + color_filters::Dict{Color, BloomFilter} = Dict() + color_label_cardinality::Dict{Color, Int} = Dict() + color_hash::Dict{NodeId, Color} = color_graph(g, params, params.num_colors) color_sizes = [0 for _ in 1:maximum(values(color_hash))] for c in values(color_hash) color_sizes[c] += 1 @@ -97,7 +97,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu println("Started tracking statistics") end # We keep separate degree statistics for in-degree and out-degree. - color_to_color_out_counter::Dict{Int32, Dict{Int32, Any}} = Dict() + color_to_color_out_counter::Dict{Color, Dict{Color, Any}} = Dict() for x in vertices(g.graph) c1 = color_hash[x] for y in outneighbors(g.graph,x) @@ -132,7 +132,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu end end - edge_deg::Dict{Int, Dict{Int, Dict{Int, Dict{Int, DegreeStats}}}} = Dict() + edge_deg::Dict{Int, Dict{Int, Dict{Color, Dict{Color, DegreeStats}}}} = Dict() for edge_label in keys(color_to_color_out_counter) edge_deg[edge_label] = Dict() for vertex_label in keys(color_to_color_out_counter[edge_label]) @@ -160,7 +160,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu end # We keep separate degree statistics for in-degree and out-degree. 
- color_to_color_in_counter::Dict{Int32, Dict{Int32, Any}} = Dict() + color_to_color_in_counter::Dict{Color, Dict{Color, Any}} = Dict() for x in vertices(g.graph) c1 = color_hash[x] for y in inneighbors(g.graph,x) diff --git a/Source/ColoringMethods.jl b/Source/ColoringMethods.jl index fba1fd7..471fd6d 100644 --- a/Source/ColoringMethods.jl +++ b/Source/ColoringMethods.jl @@ -301,7 +301,7 @@ function _edge_ratio_color(g::DataGraph, num_colors::Int) return color_hash end -function _recursive_label_split(g::DataGraph, group::Vector{Int}, depth::Int, max_depth::Int) +function _recursive_label_split(g::DataGraph, group::Vector{NodeId}, depth::Int, max_depth::Int) if depth == max_depth return [group] end @@ -323,8 +323,8 @@ function _recursive_label_split(g::DataGraph, group::Vector{Int}, depth::Int, ma end end - left_group = Vector{Int}() - right_group = Vector{Int}() + left_group = Vector{NodeId}() + right_group = Vector{NodeId}() for v in group if most_even_label in g.vertex_labels[v] push!(right_group, v) @@ -339,9 +339,9 @@ end # It does this by recursively choosing a single label which most evenly breaks the color # into two sub-colors with up to `label_refining_rounds` depth. function _refine_by_vertex_labels(g::DataGraph, params::ColorSummaryParams, - color_hash::Dict{Int, Int}, label_refining_rounds::Int) - refined_color_hash::Dict{Int, Int} = Dict() - color_to_vertices::Dict{Int, Vector{Int}} = Dict() + color_hash::Dict{NodeId, Color}, label_refining_rounds::Int) + refined_color_hash::Dict{NodeId, Color} = Dict() + color_to_vertices::Dict{Color, Vector{NodeId}} = Dict() for v in keys(color_hash) color = color_hash[v] if haskey(color_to_vertices, color) @@ -373,7 +373,7 @@ end # It does this by recursively choosing a single label which has the greatest stddev w.r.t. # the edge count of vertices and splitting the nodes based on their edge count for that label # with up to `label_refining_rounds` depth. 
-function _recursive_neighbor_split(g::DataGraph, group::Vector{Int}, depth::Int, max_depth::Int) +function _recursive_neighbor_split(g::DataGraph, group::Vector{NodeId}, depth::Int, max_depth::Int) if depth == max_depth return [group] end @@ -409,8 +409,8 @@ function _recursive_neighbor_split(g::DataGraph, group::Vector{Int}, depth::Int, end end - left_group = Vector{Int}() - right_group = Vector{Int}() + left_group = Vector{NodeId}() + right_group = Vector{NodeId}() for v in group label_count = 0 for n in all_neighbors(g.graph, v) @@ -428,9 +428,9 @@ function _recursive_neighbor_split(g::DataGraph, group::Vector{Int}, depth::Int, end function _refine_by_neighbor_labels(g::DataGraph, params::ColorSummaryParams, - color_hash::Dict{Int, Int}, label_refining_rounds::Int) - refined_color_hash::Dict{Int, Int} = Dict() - color_to_vertices::Dict{Int, Vector{Int}} = Dict() + color_hash::Dict{NodeId, Color}, label_refining_rounds::Int) + refined_color_hash::Dict{NodeId, Color} = Dict() + color_to_vertices::Dict{Color, Vector{NodeId}} = Dict() for v in keys(color_hash) color = color_hash[v] if haskey(color_to_vertices, color) @@ -457,7 +457,7 @@ function _refine_by_neighbor_labels(g::DataGraph, params::ColorSummaryParams, end function color_graph(g::DataGraph, params::ColorSummaryParams, num_colors::Int) - color_hash::Dict{Int, Int} = if params.partitioner == QuasiStable + color_hash::Dict{NodeId, Color} = if params.partitioner == QuasiStable _quasi_stable_coloring(g, params, num_colors) elseif params.partitioner == Hash _hash_coloring(g, num_colors) diff --git a/Source/ExactSizeCalculator.jl b/Source/ExactSizeCalculator.jl index 946ec64..f623ef6 100644 --- a/Source/ExactSizeCalculator.jl +++ b/Source/ExactSizeCalculator.jl @@ -1,6 +1,6 @@ # This file contains a prototype implementation of exact sub-graph counting. -function sum_over_node_exact!(partial_paths::Vector{Tuple{Vector{Int}, Int}}, +function sum_over_node_exact!(partial_paths::Vector{Tuple{Vector{NodeId}, Int}}, current_query_nodes, node_to_remove, timeout, start_time) nodeIdx = 1 @@ -10,7 +10,7 @@ function sum_over_node_exact!(partial_paths::Vector{Tuple{Vector{Int}, Int}}, end nodeIdx += 1 end - new_partial_paths::Dict{Vector{Int}, Union{Vector{Float64}, Int}} = Dict() + new_partial_paths::Dict{Vector{NodeId}, Union{Vector{Float64}, Int}} = Dict() for path_and_bounds in partial_paths if timeout > 0 && time() - start_time > timeout println("Timeout Reached") @@ -33,7 +33,7 @@ function sum_over_node_exact!(partial_paths::Vector{Tuple{Vector{Int}, Int}}, end -function sum_over_finished_query_nodes_exact!(query::QueryGraph, partial_paths::Vector{Tuple{Vector{Int}, Int}}, +function sum_over_finished_query_nodes_exact!(query::QueryGraph, partial_paths::Vector{Tuple{Vector{NodeId}, Int}}, current_query_nodes, visited_query_edges, timeout, start_time; nodes_to_not_sum = []) prev_query_nodes = copy(current_query_nodes) @@ -56,10 +56,10 @@ function sum_over_finished_query_nodes_exact!(query::QueryGraph, partial_paths:: end function handle_extra_edges_exact!(query::QueryGraph, data::DataGraph, - partial_paths::Vector{Tuple{Vector{Int}, Int}}, + partial_paths::Vector{Tuple{Vector{NodeId}, Int}}, current_query_nodes, visited_query_edges, timeout, start_time) - remaining_edges::Vector{Tuple{Int, Int}} = [] + remaining_edges::Vector{Tuple{NodeId, NodeId}} = [] for edge in edges(query.graph) # since the edge's nodes are already processed, we don't have to check if ! 
((src(edge), dst(edge)) in visited_query_edges) && @@ -126,7 +126,7 @@ function sample_paths_exact(partial_paths, num_samples::Int) end # choose a sample of the paths - path_samples::Vector{Tuple{Vector{Int}, Int}} = sample(partial_paths, num_samples; replace=false) + path_samples::Vector{Tuple{Vector{NodeId}, Int}} = sample(partial_paths, num_samples; replace=false) # sum up the sampled bounds sampled_bounds_sum = 0 @@ -157,7 +157,7 @@ function get_subgraph_counts(query::QueryGraph, data::DataGraph; use_partial_sum start_time = time() node_order = get_min_width_node_order(query.graph) - partial_paths::Vector{Tuple{Vector{Int}, Int}} = [] + partial_paths::Vector{Tuple{Vector{NodeId}, Int}} = [] visited_query_edges = [] current_query_nodes = [] if verbose diff --git a/Source/QuasiStableCardinalityEstimator.jl b/Source/QuasiStableCardinalityEstimator.jl index 273af3c..98a1795 100644 --- a/Source/QuasiStableCardinalityEstimator.jl +++ b/Source/QuasiStableCardinalityEstimator.jl @@ -4,7 +4,7 @@ # Equivalently, they perform a groupby on all other nodes of the query graph. The goal of this is to prevent # an exponential growth in the number of paths through the lifted color graph. However, we can only remove query nodes whose # edges have already been processed. -function sum_over_node!(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, current_query_nodes, node_to_remove) +function sum_over_node!(partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}}, current_query_nodes, node_to_remove) nodeIdx = 1 for node in current_query_nodes if node == node_to_remove @@ -12,7 +12,7 @@ function sum_over_node!(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64} end nodeIdx += 1 end - new_partial_paths::Dict{Vector{Int}, Union{Vector{Float64}, Int}} = Dict() + new_partial_paths::Dict{Vector{Color}, Union{Vector{Float64}, Int}} = Dict() for path_and_bounds in partial_paths path = path_and_bounds[1] bounds = path_and_bounds[2] @@ -33,7 +33,7 @@ end @enum SAMPLING_STRATEGY uniform weighted redistributive -function sample_paths(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, num_samples::Int, sampling_strategy::SAMPLING_STRATEGY) +function sample_paths(partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}}, num_samples::Int, sampling_strategy::SAMPLING_STRATEGY) # partial_path[x] = (color path, bounds) partial_paths = [x for x in partial_paths if x[2][2] > 0] @@ -56,7 +56,7 @@ function sample_paths(partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}} if sampling_strategy == uniform sample_weights = AnalyticWeights([1.0 for _ in eachindex(partial_paths)] ./ length(partial_paths)) end - path_samples::Vector{Tuple{Vector{Int}, Vector{Float64}}} = sample(partial_paths, sample_weights, num_samples; replace=false) + path_samples::Vector{Tuple{Vector{Color}, Vector{Float64}}} = sample(partial_paths, sample_weights, num_samples; replace=false) # sum up the sampled bounds sampled_bounds_sum::Vector{Float64} = [0,0,0] @@ -175,7 +175,7 @@ function get_matching_graph(start::Int, finish::Int, query::QueryGraph) return new_graph end -function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, +function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}}, current_query_nodes::Vector{Int}, visited_query_edges::Vector{Tuple{Int,Int}}, usingStoredStats::Bool, only_shortest_path_cycle::Bool) # To account for cyclic queries, we check whether there 
are any remaining edges that have not @@ -243,7 +243,7 @@ function handle_extra_edges!(query::QueryGraph, summary::ColorSummary, partial_p end end -function sum_over_finished_query_nodes!(query::QueryGraph, partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}}, +function sum_over_finished_query_nodes!(query::QueryGraph, partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}}, current_query_nodes::Vector{Int}, visited_query_edges::Vector{Tuple{Int, Int}}) prev_query_nodes = copy(current_query_nodes) for node in prev_query_nodes @@ -274,7 +274,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_pa end # Because the label is implied by the color -> query_graph_vertex mapping stored in current_query_nodes, # we don't have to keep the label in the partial paths object. - partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}} = [] # each tuple contains a pairing of color paths -> bounds + partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}} = [] # each tuple contains a pairing of color paths -> bounds visited_query_edges::Vector{Tuple{Int,Int}} = [] current_query_nodes::Vector{Int} = [] @@ -352,17 +352,17 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_pa new_label = only(query.vertex_labels[new_node]) new_data_labels = get_data_label(query, new_node) - new_partial_paths::Vector{Tuple{Vector{Int}, Vector{Float64}}} = [] + new_partial_paths::Vector{Tuple{Vector{Color}, Vector{Float64}}} = [] # Update the partial paths using the parent-child combo that comes next from the query. - edge_deg::Dict{Int, Dict{Int, DegreeStats}} = Dict() + edge_deg::Dict{Color, Dict{Color, DegreeStats}} = Dict() if haskey(summary.edge_deg, edge_label) && haskey(summary.edge_deg[edge_label], new_label) edge_deg = summary.edge_deg[edge_label][new_label] end for path_and_bounds in partial_paths - path::Vector{Int} = path_and_bounds[1] + path::Vector{Color} = path_and_bounds[1] running_bounds::Vector{Float64} = path_and_bounds[2] old_color = path[parent_idx] @@ -385,7 +385,7 @@ function get_cardinality_bounds(query::QueryGraph, summary::ColorSummary; max_pa continue end degree_stats::DegreeStats = edge_deg[old_color][new_color] - new_path::Vector{Int} = [path..., new_color] + new_path::Vector{Color} = [path..., new_color] new_bounds::Vector{Float64} = [0, 0, 0] if out_edge new_bounds = [running_bounds[1]*degree_stats.min_out, diff --git a/Source/datasets.jl b/Source/datasets.jl index 3d4eb88..bac9e56 100644 --- a/Source/datasets.jl +++ b/Source/datasets.jl @@ -1,7 +1,7 @@ function load_dataset(path; subgraph_matching_data=false) n = 0 - edges::Array{Tuple{Tuple{Int, Int}, Int}} = [] - vertices::Array{Tuple{Int, Array{Int}}} = [] + edges::Array{Tuple{Tuple{NodeId, NodeId}, Int}} = [] + vertices::Array{Tuple{NodeId, Array{Int}}} = [] for line in eachline(path) if length(line) == 0 continue @@ -44,8 +44,8 @@ end function load_query(path; subgraph_matching_data=false) n = 0 - edges::Array{Tuple{Tuple{Int, Int}, Int}} = [] - vertices::Array{Tuple{Int, Int, Int}} = [] + edges::Array{Tuple{Tuple{NodeId, NodeId}, Int}} = [] + vertices::Array{Tuple{NodeId, Int, Int}} = [] for line in eachline(path) if length(line) == 0 continue From 25b3909b67c07215ad61e63e0e2bdb9886ae287b Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Wed, 18 Oct 2023 18:09:59 -0700 Subject: [PATCH 5/6] Fix small type issue --- Experiments/Scripts/accuracy_vs_runtime.jl | 17 ++++++++--------- Source/ColorSummary.jl | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff 
--git a/Experiments/Scripts/accuracy_vs_runtime.jl b/Experiments/Scripts/accuracy_vs_runtime.jl index ecfeeb1..39d1c23 100644 --- a/Experiments/Scripts/accuracy_vs_runtime.jl +++ b/Experiments/Scripts/accuracy_vs_runtime.jl @@ -24,8 +24,7 @@ function generate_num_colors_graph(dataset::DATASET) p99_runtimes = [] for colors in num_colors - experiment_params = ExperimentParams(dataset=dataset; - inference_max_paths=2*colors, num_colors=colors) + experiment_params = ExperimentParams(dataset=dataset; num_colors=colors) build_experiments([experiment_params]) println("Num Colors: ", colors) run_estimation_experiments([experiment_params]) @@ -113,11 +112,11 @@ function generate_partial_paths_graph(dataset::DATASET; num_colors = 64) savefig(fig, "Experiments/Results/Figures/PP_Accuracy_vs_Runtime_P99_" * string(dataset) * ".png") end -#generate_num_colors_graph(human) -#generate_num_colors_graph(aids) -#generate_num_colors_graph(hprd) -#generate_num_colors_graph(yeast) -#generate_partial_paths_graph(human) -#generate_partial_paths_graph(aids) -#generate_partial_paths_graph(hprd) +generate_num_colors_graph(human) +generate_num_colors_graph(aids) +generate_num_colors_graph(hprd) +generate_num_colors_graph(yeast) +generate_partial_paths_graph(human) +generate_partial_paths_graph(aids) +generate_partial_paths_graph(hprd) generate_partial_paths_graph(yeast) diff --git a/Source/ColorSummary.jl b/Source/ColorSummary.jl index 99470c8..b69dbbe 100644 --- a/Source/ColorSummary.jl +++ b/Source/ColorSummary.jl @@ -41,7 +41,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu println("Started coloring") end color_filters::Dict{Color, BloomFilter} = Dict() - color_label_cardinality::Dict{Color, Int} = Dict() + color_label_cardinality::Dict{Color, Any} = Dict() color_hash::Dict{NodeId, Color} = color_graph(g, params, params.num_colors) color_sizes = [0 for _ in 1:maximum(values(color_hash))] for c in values(color_hash) From 1be405b49b17b7bd3c28ffe1a95a98f2fe3c272e Mon Sep 17 00:00:00 2001 From: kylebd99 Date: Thu, 19 Oct 2023 13:57:12 -0700 Subject: [PATCH 6/6] Script Changes --- Experiments/Scripts/accuracy_vs_runtime.jl | 8 ++++---- Experiments/Scripts/cycle_prob_exps.jl | 4 +--- Experiments/Scripts/sampling_strategies.jl | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Experiments/Scripts/accuracy_vs_runtime.jl b/Experiments/Scripts/accuracy_vs_runtime.jl index 39d1c23..d8d04b1 100644 --- a/Experiments/Scripts/accuracy_vs_runtime.jl +++ b/Experiments/Scripts/accuracy_vs_runtime.jl @@ -112,10 +112,10 @@ function generate_partial_paths_graph(dataset::DATASET; num_colors = 64) savefig(fig, "Experiments/Results/Figures/PP_Accuracy_vs_Runtime_P99_" * string(dataset) * ".png") end -generate_num_colors_graph(human) -generate_num_colors_graph(aids) -generate_num_colors_graph(hprd) -generate_num_colors_graph(yeast) +#generate_num_colors_graph(human) +#generate_num_colors_graph(aids) +#generate_num_colors_graph(hprd) +#generate_num_colors_graph(yeast) generate_partial_paths_graph(human) generate_partial_paths_graph(aids) generate_partial_paths_graph(hprd) diff --git a/Experiments/Scripts/cycle_prob_exps.jl b/Experiments/Scripts/cycle_prob_exps.jl index 85623d3..69dfa04 100644 --- a/Experiments/Scripts/cycle_prob_exps.jl +++ b/Experiments/Scripts/cycle_prob_exps.jl @@ -7,9 +7,7 @@ datasets = [aids, yeast, hprd, dblp, youtube, wordnet] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}() for dataset in datasets - push!(build_params, 
ExperimentParams(dataset=dataset, - num_colors=16, - label_refining_rounds=2)) + push!(build_params, ExperimentParams(dataset=dataset)) for only_shortest_path_cycle in [false, true] push!(experiment_params, ExperimentParams(dataset=dataset, only_shortest_path_cycle=only_shortest_path_cycle)) diff --git a/Experiments/Scripts/sampling_strategies.jl b/Experiments/Scripts/sampling_strategies.jl index 24464a4..9593f4d 100644 --- a/Experiments/Scripts/sampling_strategies.jl +++ b/Experiments/Scripts/sampling_strategies.jl @@ -2,7 +2,7 @@ using Profile include("../Experiments.jl") -datasets = [human, aids, yeast, hprd, dblp] +datasets = [human, aids, yeast, hprd, dblp, wordnet] experiment_params = Vector{ExperimentParams}() build_params = Vector{ExperimentParams}()
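
Notes on the two techniques this series introduces, with minimal Julia sketches.

The central change is how the probability of a closing (non-tree) edge is estimated once all simple paths between its endpoints are considered. The sketch below only illustrates the combination rule, assuming the per-path closure probabilities have already been looked up; the helper name combine_cycle_closure is illustrative and is not part of the estimator's API, which instead folds the same product into probability_no_edge inside handle_extra_edges! and falls back to get_independent_cycle_likelihood when no stored statistic matches.

    # Treat the closure of each simple path between the two endpoints as an
    # independent event. The closing edge exists if at least one path closes:
    #     P(closed) = 1 - prod_i (1 - p_i)
    # NOTE: combine_cycle_closure is a hypothetical helper, shown for clarity only.
    function combine_cycle_closure(path_probabilities::Vector{Float64})
        probability_no_edge = 1.0
        for p in path_probabilities
            probability_no_edge *= 1.0 - p
        end
        return 1.0 - probability_no_edge
    end

    # Example: two simple paths with closure probabilities 0.3 and 0.5 give
    # 1 - (0.7 * 0.5) = 0.65, higher than either single shortest-path estimate.
    combine_cycle_closure([0.3, 0.5])

The memory experiments measure summary size with Base.summarysize, which is what the MemoryFootprint column stores and what the plots show in MB (bytes / 10^6); this is also the motivation for narrowing Color to Int16 and the DegreeStats fields to Float32. A rough, standalone way to see the effect, using stand-in types rather than the real ColorSummary:

    # Stand-in structs only; the real summary nests Dicts of DegreeStats keyed by
    # labels and colors, but the width of the element types drives the footprint.
    struct WideStats;   avg_out::Float64; avg_in::Float64; end
    struct NarrowStats; avg_out::Float32; avg_in::Float32; end

    wide   = Dict{Int64, WideStats}(c => WideStats(1.5, 2.5) for c in 1:64)
    narrow = Dict{Int16, NarrowStats}(Int16(c) => NarrowStats(1.5f0, 2.5f0) for c in 1:64)

    # Base.summarysize reports the total in-memory footprint in bytes.
    println(Base.summarysize(wide), " vs ", Base.summarysize(narrow), " bytes")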