Skip to content

Commit

Permalink
Merge pull request #38 from mkyl/kbd-cycles-all-paths
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebd99 authored Oct 19, 2023
2 parents b877fe8 + 1be405b commit a6e387f
Show file tree
Hide file tree
Showing 14 changed files with 280 additions and 89 deletions.
9 changes: 4 additions & 5 deletions Experiments/Scripts/accuracy_vs_runtime.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ function generate_num_colors_graph(dataset::DATASET)
p99_runtimes = []

for colors in num_colors
experiment_params = ExperimentParams(dataset=dataset;
inference_max_paths=2*colors, num_colors=colors)
experiment_params = ExperimentParams(dataset=dataset; num_colors=colors)
build_experiments([experiment_params])
println("Num Colors: ", colors)
run_estimation_experiments([experiment_params])
Expand Down Expand Up @@ -117,7 +116,7 @@ end
#generate_num_colors_graph(aids)
#generate_num_colors_graph(hprd)
#generate_num_colors_graph(yeast)
#generate_partial_paths_graph(human)
#generate_partial_paths_graph(aids)
#generate_partial_paths_graph(hprd)
generate_partial_paths_graph(human)
generate_partial_paths_graph(aids)
generate_partial_paths_graph(hprd)
generate_partial_paths_graph(yeast)
21 changes: 21 additions & 0 deletions Experiments/Scripts/cycle_prob_exps.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@

using Profile
include("../Experiments.jl")

# Datasets covered by the cycle-probability comparison.
datasets = [aids, yeast, hprd, dblp, youtube, wordnet]

# One default-parameter configuration per dataset; these are handed to build_experiments.
build_params = ExperimentParams[ExperimentParams(dataset=d) for d in datasets]

# Two estimation configurations per dataset, in this order:
# only_shortest_path_cycle=false first, then only_shortest_path_cycle=true.
experiment_params = ExperimentParams[
    ExperimentParams(dataset=d, only_shortest_path_cycle=shortest_only)
    for d in datasets for shortest_only in (false, true)
]

build_experiments(build_params)

run_estimation_experiments(experiment_params)

# Box plot of the estimation results, grouped by the only_shortest_path_cycle setting.
graph_grouped_box_plot(experiment_params;
                       grouping=cycle_stats,
                       filename="cycle_stats_exps")
20 changes: 20 additions & 0 deletions Experiments/Scripts/memory_exps.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

using Profile
include("../Experiments.jl")

# youtube is left out of this run; the full dataset list would be
# [aids, yeast, hprd, dblp, youtube, wordnet].
datasets = [aids, yeast, hprd, dblp, wordnet]
num_colors = [4, 8, 16, 32, 64, 128]

# Declared but not used by this script.
experiment_params = Vector{ExperimentParams}()

# One configuration per (dataset, color count) pair: datasets vary slowest, color counts
# fastest.
build_params = ExperimentParams[
    ExperimentParams(dataset=d, num_colors=n) for d in datasets for n in num_colors
]

# Building is disabled here. With y_type=memory_footprint the plot below reads the
# "Build_" result files, so they must already exist; uncomment to regenerate them:
# build_experiments(build_params)

# Memory footprint per dataset, with one bar per color count.
graph_grouped_bar_plot(build_params;
                       grouping=number_of_colors,
                       y_type=memory_footprint,
                       y_lims=[0, 16],
                       filename="memory_size_vs_colors_fp32_int16")
2 changes: 1 addition & 1 deletion Experiments/Scripts/sampling_strategies.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
using Profile
include("../Experiments.jl")

datasets = [human, aids, yeast, hprd, dblp]
datasets = [human, aids, yeast, hprd, dblp, wordnet]

experiment_params = Vector{ExperimentParams}()
build_params = Vector{ExperimentParams}()
Expand Down
10 changes: 7 additions & 3 deletions Experiments/build_color_summaries.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
function build_experiments(experiment_params_list::Vector{ExperimentParams})
build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime")]
for experiment_params in experiment_params_list
build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime", "MemoryFootprint")]
dataset = experiment_params.dataset
summary_params = experiment_params.summary_params
data = load_dataset(dataset)
summary_name = params_to_summary_filename(experiment_params)
summary_file_location = "Experiments/SerializedSummaries/" * summary_name
println("Building Color Summary: ", summary_name)
results = @timed generate_color_summary(data, summary_params; verbose=1)
summary_size = Base.summarysize(results.value)
serialize(summary_file_location, results.value)
push!(build_times, (string(dataset), string(summary_params.partitioner),
string(summary_params.num_colors), string(results.time)))
push!(build_times, (string(dataset),
string(summary_params.partitioner),
string(summary_params.num_colors),
string(results.time),
string(summary_size)))
results_filename = params_to_results_filename(experiment_params)
result_file_location = "Experiments/Results/Build_" * results_filename
writedlm(result_file_location, build_times, ",")
Expand Down
78 changes: 75 additions & 3 deletions Experiments/graph_results.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors
#todo: query type

@enum VALUE estimate_error runtime
@enum VALUE estimate_error runtime memory_footprint

function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams};
x_type::GROUP=dataset, y_type::VALUE=estimate_error,
Expand Down Expand Up @@ -43,14 +43,82 @@ function graph_grouped_box_plot(experiment_params_list::Vector{ExperimentParams}
# See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15
ENV["GKSwstype"]="100"
gbplot = groupedboxplot(x_values, y_values, group = groups, yscale =:log10,
ylims=[10^-13, 10^11], yticks=[10^-10, 10^-5, 1, 10^5, 10^10],
ylims=[10^-13, 10^11], yticks=[10^-10, 10^-5, 10^-2, 1, 10^2, 10^5, 10^10],
legend = :outertopleft, size = (1000, 600))
x_label !== nothing && xlabel!(gbplot, x_label)
y_label !== nothing && ylabel!(gbplot, y_label)
plotname = (isnothing(filename)) ? results_filename * ".png" : filename * ".png"
savefig(gbplot, "Experiments/Results/Figures/" * plotname)
end

# Draw a grouped bar plot from the stored result CSVs of `experiment_params_list` and save
# it as a PNG under "Experiments/Results/Figures/".
#
# Every row of every result file contributes one (x, y, group) data point.
#
# Keyword arguments:
#   x_type   - GROUP that supplies the x-axis category. For `query_type` the category is
#              read per row from the QueryType column; otherwise it comes from the
#              experiment parameters via `get_value_from_param`.
#   y_type   - VALUE plotted on the y-axis:
#                estimate_error   -> Estimate / TrueCard        ("Estimation_" results)
#                memory_footprint -> MemoryFootprint / 10^6     ("Build_" results)
#                anything else    -> EstimationTime             ("Estimation_" results)
#   grouping - GROUP that splits the bars inside one x category (resolved like `x_type`).
#   x_label, y_label - optional axis labels; `nothing` skips setting the label.
#   y_lims   - y-axis limits passed to the plot.
#   filename - output file name without extension; when `nothing`, the results filename
#              of the first experiment is used instead.
#
# Throws an ArgumentError when `experiment_params_list` is empty.
function graph_grouped_bar_plot(experiment_params_list::Vector{ExperimentParams};
                                x_type::GROUP=dataset,
                                y_type::VALUE=estimate_error,
                                grouping::GROUP=technique,
                                x_label=nothing,
                                y_label=nothing,
                                y_lims=[0, 10],
                                filename=nothing)
    # Fail early with a clear message instead of a BoundsError further down.
    isempty(experiment_params_list) &&
        throw(ArgumentError("experiment_params_list must not be empty"))

    # Memory footprints are recorded in the build results, everything else in the
    # estimation results.
    prefix = if y_type == memory_footprint
        "Experiments/Results/Build_"
    else
        "Experiments/Results/Estimation_"
    end

    x_values = []
    y_values = Float64[]
    groups = []
    for experiment_params in experiment_params_list
        # load the results
        results_path = prefix * params_to_results_filename(experiment_params)
        results_df = CSV.read(results_path, DataFrame; normalizenames=true)
        @debug "Loaded results" results_path results_df

        # Parameter-derived values are identical for every row of this experiment, so
        # they are looked up once; query_type is the only per-row value.
        param_x = x_type == query_type ? nothing :
                  get_value_from_param(experiment_params, x_type)
        param_group = grouping == query_type ? nothing :
                      get_value_from_param(experiment_params, grouping)

        # keep track of the data points
        for i in 1:nrow(results_df)
            current_x = x_type == query_type ? results_df[i, :QueryType] : param_x
            current_group = grouping == query_type ? results_df[i, :QueryType] : param_group
            current_y = if y_type == estimate_error
                results_df[i, :Estimate] / results_df[i, :TrueCard]
            elseif y_type == memory_footprint
                results_df[i, :MemoryFootprint] / 10^6
            else
                # y_type == runtime
                results_df[i, :EstimationTime]
            end
            push!(x_values, current_x)
            push!(y_values, current_y)
            push!(groups, current_group)
        end
    end
    results_filename = params_to_results_filename(experiment_params_list[1])
    println("starting graphs")

    # This seems to be necessary for using Plots.jl outside of the ipynb framework.
    # See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15
    ENV["GKSwstype"]="100"
    @debug "Grouped bar plot inputs" x_values y_values groups
    gbplot = StatsPlots.groupedbar(x_values,
                                   y_values,
                                   group = groups,
                                   ylims=y_lims,
                                   legend = :outertopleft,
                                   size = (1000, 600))
    x_label !== nothing && xlabel!(gbplot, x_label)
    y_label !== nothing && ylabel!(gbplot, y_label)
    plotname = (isnothing(filename)) ? results_filename * ".png" : filename * ".png"
    savefig(gbplot, "Experiments/Results/Figures/" * plotname)
end




# default to grouping by dataset
function get_value_from_param(experiment_param::ExperimentParams, value_type::GROUP)
if value_type == dataset
Expand All @@ -63,6 +131,10 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR
return experiment_param.inference_max_paths
elseif value_type == sampling_type
return experiment_param.sampling_strategy
elseif value_type == cycle_stats
return experiment_param.only_shortest_path_cycle
elseif value_type == number_of_colors
return experiment_param.summary_params.num_colors
else
# default to grouping by technique
return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds)
Expand Down
3 changes: 2 additions & 1 deletion Experiments/run_estimators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ function run_estimation_experiments(experiment_params_list::Vector{ExperimentPar
results = @timed get_cardinality_bounds(query, summary;
max_partial_paths = experiment_params.inference_max_paths,
use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true,
sampling_strategy=experiment_params.sampling_strategy)
sampling_strategy=experiment_params.sampling_strategy,
only_shortest_path_cycle= experiment_params.only_shortest_path_cycle)
upper_bound = results.value[3]
estimate = max(1, results.value[2])
lower_bound = results.value[1]
Expand Down
8 changes: 7 additions & 1 deletion Experiments/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@ struct ExperimentParams
dataset::DATASET
summary_params::ColorSummaryParams
inference_max_paths::Int
# Whether to consider just the shortest path when calculating cycle probabilities or
# all simple paths.
only_shortest_path_cycle::Bool
use_partial_sums::Bool
sampling_strategy::SAMPLING_STRATEGY

function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6, summary_max_paths=1000,
function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6,
only_shortest_path_cycle=false, summary_max_paths=1000,
partitioner::PARTITIONER = QuasiStable, weighting=true, inference_max_paths=500, use_partial_sums=true,
sampling_strategy=redistributive, label_refining_rounds = 0)
return new(dataset, ColorSummaryParams(num_colors=num_colors,
Expand All @@ -20,6 +24,7 @@ struct ExperimentParams
weighting=weighting,
label_refining_rounds=label_refining_rounds),
inference_max_paths,
only_shortest_path_cycle,
use_partial_sums,
sampling_strategy
)
Expand All @@ -30,6 +35,7 @@ function params_to_results_filename(experiment_params::ExperimentParams)
name = string(experiment_params.dataset) * "_"
name *= params_to_string(experiment_params.summary_params) * "_"
name *= string(experiment_params.inference_max_paths) * "_"
name *= string(experiment_params.only_shortest_path_cycle) * "_"
name *= string(experiment_params.use_partial_sums) * "_"
name *= string(experiment_params.sampling_strategy) * ".csv"
return name
Expand Down
4 changes: 3 additions & 1 deletion Source/CardinalityWithColors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ using Graphs: SimpleDiGraphFromIterator, Edge, DiGraph, edges, nv, ne, add_edge!


BoolPath = Vector{Bool}
StartEndColorPair = Tuple{Int, Int}
NodeId = Int
Color = Int16
StartEndColorPair = Tuple{Color, Color}
abstract type Comparable end
import Base .==
function ==(a::T, b::T) where T <: Comparable
Expand Down
34 changes: 18 additions & 16 deletions Source/ColorSummary.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
using Graphs

struct DegreeStats
min_out::Float64
avg_out::Float64
max_out::Float64
min_in::Float64
avg_in::Float64
max_in::Float64
min_out::Float32
avg_out::Float32
max_out::Float32
min_in::Float32
avg_in::Float32
max_in::Float32

function DegreeStats(min_out, avg_out, max_out)
return new(min_out, avg_out, max_out, 0, 0, 0)
Expand All @@ -22,11 +22,12 @@ end
# a particular color. Note that `-1` is used to represent a "wildcard" label. These do not appear in the data graph,
# but they do occur in the query graph.
struct ColorSummary
color_label_cardinality::Dict{Int, Dict{Int, Int}} # color_label_cardinality[c][v] = num_vertices
edge_deg::Dict{Int, Dict{Int, Dict{Int, Dict{Int, DegreeStats}}}} # edge_min_out_deg[e][v2][c1][c2] = min
color_filters::Dict{Int, BloomFilter} # color_filters[c] = filter
color_label_cardinality::Dict{Color, Dict{Int, Int}} # color_label_cardinality[c][v] = num_vertices
edge_deg::Dict{Int, Dict{Int, Dict{Color, Dict{Color, DegreeStats}}}} # edge_min_out_deg[e][v2][c1][c2] = min
color_filters::Dict{Color, BloomFilter} # color_filters[c] = filter
cycle_probabilities::Dict{CyclePathAndColors, Float64} # cycle_probabilities[[c1, c2], path] = likelihood
cycle_length_probabilities::Dict{Int, Float64} #cycle_probabilities[path_length] = likelihood
max_cycle_size::Int
total_edges::Int
total_nodes::Int
# for outdegrees, c2 is the color of the outneighbor
Expand All @@ -39,9 +40,9 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu
if (verbose > 0)
println("Started coloring")
end
color_filters = Dict()
color_label_cardinality = Dict()
color_hash::Dict{Int, Int32} = color_graph(g, params, params.num_colors)
color_filters::Dict{Color, BloomFilter} = Dict()
color_label_cardinality::Dict{Color, Any} = Dict()
color_hash::Dict{NodeId, Color} = color_graph(g, params, params.num_colors)
color_sizes = [0 for _ in 1:maximum(values(color_hash))]
for c in values(color_hash)
color_sizes[c] += 1
Expand Down Expand Up @@ -96,7 +97,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu
println("Started tracking statistics")
end
# We keep separate degree statistics for in-degree and out-degree.
color_to_color_out_counter::Dict{Int32, Dict{Int32, Any}} = Dict()
color_to_color_out_counter::Dict{Color, Dict{Color, Any}} = Dict()
for x in vertices(g.graph)
c1 = color_hash[x]
for y in outneighbors(g.graph,x)
Expand Down Expand Up @@ -131,7 +132,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu
end
end

edge_deg::Dict{Int, Dict{Int, Dict{Int, Dict{Int, DegreeStats}}}} = Dict()
edge_deg::Dict{Int, Dict{Int, Dict{Color, Dict{Color, DegreeStats}}}} = Dict()
for edge_label in keys(color_to_color_out_counter)
edge_deg[edge_label] = Dict()
for vertex_label in keys(color_to_color_out_counter[edge_label])
Expand Down Expand Up @@ -159,7 +160,7 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu
end

# We keep separate degree statistics for in-degree and out-degree.
color_to_color_in_counter::Dict{Int32, Dict{Int32, Any}} = Dict()
color_to_color_in_counter::Dict{Color, Dict{Color, Any}} = Dict()
for x in vertices(g.graph)
c1 = color_hash[x]
for y in inneighbors(g.graph,x)
Expand Down Expand Up @@ -234,7 +235,8 @@ function generate_color_summary(g::DataGraph, params::ColorSummaryParams=ColorSu
println("Finished tracking statistics")
end
return ColorSummary(color_label_cardinality, edge_deg, color_filters,
cycle_probabilities, cycle_length_probabilities, ne(g.graph), nv(g.graph))
cycle_probabilities, cycle_length_probabilities, params.max_cycle_size,
ne(g.graph), nv(g.graph))
end

function color_hash_to_groups(color_hash, num_colors)
Expand Down
Loading

0 comments on commit a6e387f

Please sign in to comment.