Skip to content

Commit 24ca006

Browse files
author
Erdem Sariyuce
committed
Implementing %degreeDistribution magic command
(cherry picked from commit 37e84b4bb9c21c9b19ab259df6a0a93009e20add)
1 parent a927c4c commit 24ca006

File tree

4 files changed

+117
-27
lines changed

4 files changed

+117
-27
lines changed

ChangeLog.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Starting with v1.31.6, this file will contain a record of major features and updates made in each release of graph-notebook.
44

55
## Upcoming
6-
- Added %degreeDistribution magic command ([PR](https://github.com/aws/graph-notebook/pull/749))
6+
- Added %degreeDistribution magic command ([PR](https://github.com/aws/graph-notebook/pull/749)) TODO: add to the specific release below when it's released
77
- Locked numba dependency to 0.60.0 to avoid numpy conflict ([Link to PR](https://github.com/aws/graph-notebook/pull/735))
88
- Fixed library target for nbclassic nbextension for graph_notebook_widget ([Link to PR](https://github.com/aws/graph-notebook/pull/739))
99

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ dependencies = [
4646
'networkx==2.4',
4747
'numpy>=1.23.5,<1.24.0',
4848
'pandas>=2.1.0,<=2.2.2',
49+
'matplotlib>=3.9.4',
4950

5051
# Graph databases and query languages
5152
'gremlinpython>=3.5.1,<=3.7.2',

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ itables>=2.0.0,<=2.1.0
1818
networkx==2.4
1919
numpy>=1.23.5,<1.24.0
2020
pandas>=2.1.0,<=2.2.2
21+
matplotlib>=3.9.4
2122

2223
# Graph databases and query languages
2324
gremlinpython>=3.5.1,<=3.7.2

src/graph_notebook/magics/graph_magic.py

Lines changed: 114 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
import numpy as np
2020
import matplotlib.pyplot as plt
2121

22+
import numpy as np
23+
import matplotlib.pyplot as plt
24+
2225
from ipyfilechooser import FileChooser
2326
from enum import Enum
2427
from copy import copy
@@ -57,6 +60,7 @@
5760
SPARQL_EXPLAIN_MODES, OPENCYPHER_EXPLAIN_MODES, GREMLIN_EXPLAIN_MODES, \
5861
OPENCYPHER_PLAN_CACHE_MODES, OPENCYPHER_DEFAULT_TIMEOUT, OPENCYPHER_STATUS_STATE_MODES, \
5962
normalize_service_name, NEPTUNE_DB_SERVICE_NAME, NEPTUNE_ANALYTICS_SERVICE_NAME, GRAPH_PG_INFO_METRICS, TRAVERSAL_DIRECTIONS, \
63+
normalize_service_name, NEPTUNE_DB_SERVICE_NAME, NEPTUNE_ANALYTICS_SERVICE_NAME, GRAPH_PG_INFO_METRICS, TRAVERSAL_DIRECTIONS, \
6064
GREMLIN_PROTOCOL_FORMATS, DEFAULT_HTTP_PROTOCOL, DEFAULT_WS_PROTOCOL, GRAPHSONV4_UNTYPED, \
6165
GREMLIN_SERIALIZERS_WS, get_gremlin_serializer_mime, normalize_protocol_name, generate_snapshot_name)
6266
from graph_notebook.network import SPARQLNetwork
@@ -3928,15 +3932,29 @@ def handle_opencypher_status(self, line, local_ns):
39283932

39293933

39303934

3931-
# %degreeDistribution. Takes traversalDirection, vertexLabels, edgeLabels parameters, and visualizes
3932-
# the degree distribution.
3935+
# %degreeDistribution magic command.
3936+
# It obtains the degree distribution of a graph in the form of a visual histogram in the notebook. The histogram simply
3937+
# shows the number of vertices with a given degree, where degree is shown on the x-axis and the count on y-axis.
3938+
# It takes traversalDirection [both (default), inbound, outbound], vertexLabels [default is empty list],
3939+
# edgeLabels parameters [default is empty list], and then gives the histogram for the specified degree
3940+
# (both/in/out) distribution of the vertices in the graph filtered by the specified vertex labels and edge
3941+
# labels. Parameters can be defined as command-line arguments and/or through the dropdown widgets.
3942+
# Example usages:
3943+
# > %degreeDistribution
3944+
# > %degreeDistribution --traversalDirection inbound
3945+
# > %degreeDistribution --traversalDirection inbound --vertexLabels airport country
3946+
39333947
# TODO: Error handling
39343948

39353949
@line_magic
39363950
@needs_local_scope
39373951
@display_exceptions
39383952
@neptune_graph_only
39393953
def degreeDistribution(self, line, local_ns: dict = None):
3954+
if not self.client.is_analytics_domain():
3955+
print("This command is only supported for Neptune Analytics domains.")
3956+
return
3957+
39403958
parser = argparse.ArgumentParser()
39413959

39423960
# Get the vertexLabels and edgeLabels from graph summary, to be shown in the widgets for selection.
@@ -3950,18 +3968,23 @@ def degreeDistribution(self, line, local_ns: dict = None):
39503968
print(f"Error retrieving graph summary: {e}")
39513969
return
39523970

3953-
# traversalDirection parameter
3971+
# traversalDirection: Type of the degree computed:
3972+
# - inbound: Counts only the incoming edges for each vertex
3973+
# - outbound: Counts only the outgoing edges for each vertex
3974+
# - both [default]: Counts both the incoming and outgoing edges for each vertex.
39543975
parser.add_argument('--traversalDirection', nargs='?', type=str.lower, default='both',
39553976
help=f'Type of the degree for which the distribution is shown. Valid inputs: {TRAVERSAL_DIRECTIONS}. '
39563977
f'Default: both.',
39573978
choices=TRAVERSAL_DIRECTIONS)
39583979

3959-
# vertexLabels parameter
3980+
# vertexLabels: List of the vertex labels, space separated, for which the degrees are computed:
3981+
# - default value is empty list, which means the degrees are computed for any vertex label.
39603982
parser.add_argument('--vertexLabels', nargs='*', default=[],
39613983
help="The vertex labels for which the induced graph is considered and the degree distribution is shown. "
39623984
"If not supplied, we will default to using all the vertex labels.")
39633985

3964-
# edgeLabels parameter
3986+
# edgeLabels: List of the edge labels, space separated, for which the degrees are computed:
3987+
# - default value is empty list, which means the degrees are computed for any edge label.
39653988
parser.add_argument('--edgeLabels', nargs='*', default=[],
39663989
help="The edge labels for which the degree distribution is shown. If not supplied, "
39673990
"we will default to using all the edge labels.")
@@ -3973,7 +3996,8 @@ def degreeDistribution(self, line, local_ns: dict = None):
39733996

39743997
args = parser.parse_args(line.split())
39753998

3976-
# Put the selection specified on the command line, if any; o.w. default is 'both'
3999+
# If the traversalDirection parameter selection is specified on the command line, it is shown as the default
4000+
# in the dropdown menu. Otherwise, the default in the dropdown is 'both'
39774001
td_val = args.traversalDirection
39784002
td_val = td_val.lower() if td_val else 'both'
39794003

@@ -3985,7 +4009,9 @@ def degreeDistribution(self, line, local_ns: dict = None):
39854009
value = td_val
39864010
)
39874011

3988-
# Put the vertex label(s) specified on the command line, if any; o.w. default is all the vertex labels (denoted by [])
4012+
# Existing vertex labels in the graph are shown in the dropdown menu. If any vertex label is specified on
4013+
# the command line, they are shown to be selected in the dropdown menu. Otherwise, no label is selected
4014+
# in the dropdown menu, which means any label and all the labels are considered in the computation.
39894015
available_vertex_labels = sorted(available_vertex_labels)
39904016
selected_vlabels = args.vertexLabels if args.vertexLabels else []
39914017
vertex_labels_select = widgets.SelectMultiple(
@@ -3996,7 +4022,9 @@ def degreeDistribution(self, line, local_ns: dict = None):
39964022
value = selected_vlabels
39974023
)
39984024

3999-
# Put the edge label(s) specified on the command line, if any; o.w. default is all the edge labels (denoted by [])
4025+
# Existing edge labels in the graph are shown in the dropdown menu. If any edge label is specified on
4026+
# the command line, they are shown to be selected in the dropdown menu. Otherwise, no label is selected
4027+
# in the dropdown menu, which means any label and all the labels are considered in the computation.
40004028
available_edge_labels = sorted(available_edge_labels)
40014029
selected_elabels = args.edgeLabels if args.edgeLabels else []
40024030
edge_labels_select = widgets.SelectMultiple(
@@ -4024,7 +4052,7 @@ def on_button_clicked(b):
40244052

40254053
# Call the function with the selected parameters
40264054
with output:
4027-
res = self.callDD(td, vlabels, elabels, local_ns)
4055+
res = self.execute_degree_distribution_query(td, vlabels, elabels, local_ns)
40284056

40294057
# Retrieve the distribution
40304058
pairs = np.array(res['results'][0]['output']['distribution'])
@@ -4041,7 +4069,7 @@ def on_button_clicked(b):
40414069

40424070
submit_button.on_click(on_button_clicked)
40434071

4044-
def callDD (self, td, vlabels, elabels, local_ns):
4072+
def execute_degree_distribution_query (self, td, vlabels, elabels, local_ns):
40454073
query_parts = [f'traversalDirection: "{td}"']
40464074

40474075
if vlabels:
@@ -4055,8 +4083,7 @@ def callDD (self, td, vlabels, elabels, local_ns):
40554083
# Construct the query
40564084
line = "CALL neptune.algo.degreeDistribution({" + ", ".join(query_parts) + "}) YIELD output RETURN output"
40574085

4058-
# oc_rebuild_args = (f"{f'--store-to js --silent'}")
4059-
oc_rebuild_args = (f"{f'--store-to js'}")
4086+
oc_rebuild_args = (f"{f'--store-to js --silent'}")
40604087

40614088
self.handle_opencypher_query(oc_rebuild_args, line, local_ns)
40624089

@@ -4068,14 +4095,17 @@ def plot_interactive_degree_distribution(self, unique_degrees, counts, max_deg,
40684095
min_deg = 0
40694096

40704097
def update_plot(scale_type, bin_type, bin_width, y_max, x_range, show_mindeg, show_maxdeg):
4071-
marker_size = 50
4072-
alpha = 0.6
4098+
# Start timing
4099+
start_time = time.time()
4100+
4101+
alpha = 1
40734102
plt.clf()
40744103

40754104
# Get zero degree count
40764105
zero_idx = np.where(unique_degrees == 0)[0]
40774106
zero_degree_count = counts[zero_idx[0]] if len(zero_idx) > 0 else 0
40784107

4108+
isolateds_exist = zero_degree_count > 0
40794109
# Get non-zero degrees and counts
40804110
mask = unique_degrees > 0
40814111
filtered_degrees = unique_degrees[mask]
@@ -4085,8 +4115,8 @@ def update_plot(scale_type, bin_type, bin_width, y_max, x_range, show_mindeg, sh
40854115
if len(filtered_degrees) == 0:
40864116
min_deg = 0
40874117
else:
4088-
min_deg = np.min(filtered_degrees)
4089-
4118+
min_deg = np.min(filtered_degrees)
4119+
40904120
n_bins = 1
40914121
# Create histogram only if there are non-zero degree nodes
40924122
if len(filtered_degrees) > 0:
@@ -4111,20 +4141,32 @@ def update_plot(scale_type, bin_type, bin_width, y_max, x_range, show_mindeg, sh
41114141
label='Raw', color='#000000')
41124142

41134143
# Plot zero degree node count separately
4114-
if zero_degree_count > 0:
4115-
plt.bar(0, zero_degree_count, color='red',
4116-
label='Isolated', alpha=alpha, width=0.2)
4144+
if isolateds_exist:
4145+
# Use a special x position for zero degree nodes in log scale
4146+
zero_x_pos = 0.1 if scale_type in ['Log-Log', 'Log(x)-Linear(y)'] else 0
4147+
plt.bar(zero_x_pos, zero_degree_count, color='red',
4148+
label='Isolated', alpha=alpha, width=0.1 if scale_type in ['Log-Log', 'Log(x)-Linear(y)'] else 2)
41174149

41184150
plt.xlim(x_range[0], x_range[1])
41194151

4152+
if isolateds_exist:
4153+
plt.xlim(x_range[0], x_range[1])
4154+
41204155
# Set scales based on selection
41214156
if scale_type == 'Log-Log':
41224157
plt.xscale('log')
41234158
plt.yscale('log')
4124-
plt.xlim(x_range[0]+1, x_range[1])
4159+
if isolateds_exist:
4160+
plt.xlim(0.05, x_range[1])
4161+
else:
4162+
plt.xlim(x_range[0]+0.05, x_range[1])
4163+
41254164
elif scale_type == 'Log(x)-Linear(y)':
41264165
plt.xscale('log')
4127-
plt.xlim(x_range[0]+1, x_range[1])
4166+
if isolateds_exist:
4167+
plt.xlim(0.05, x_range[1])
4168+
else:
4169+
plt.xlim(x_range[0]+0.05, x_range[1])
41284170
elif scale_type == 'Linear(x)-Log(y)':
41294171
plt.yscale('log')
41304172

@@ -4143,13 +4185,21 @@ def update_plot(scale_type, bin_type, bin_width, y_max, x_range, show_mindeg, sh
41434185
plt.legend()
41444186

41454187
plt.title(f'Degree Distribution')
4146-
4188+
4189+
# End timing and display
4190+
end_time = time.time()
4191+
runtime = end_time - start_time
4192+
41474193
# Update statistics
41484194
with stats_output:
41494195
stats_output.clear_output(wait=True)
41504196
total_nodes = sum(counts)
41514197
total_edges = sum(d * c for d, c in zip(unique_degrees, counts)) // 2
41524198
avg_degree = sum(d * c for d, c in zip(unique_degrees, counts)) / total_nodes
4199+
4200+
print(f"Render time: {runtime:.3f} seconds")
4201+
print(f"--------------------")
4202+
41534203
print(f"Number of nodes: {total_nodes}")
41544204
print(f"Number of edges: {total_edges}")
41554205
print(f"Number of isolated nodes: {zero_degree_count}")
@@ -4178,18 +4228,56 @@ def update_plot(scale_type, bin_type, bin_width, y_max, x_range, show_mindeg, sh
41784228
description='Binning:'
41794229
)
41804230

4231+
# Define a function to update bin_width_widget based on bin_type
4232+
def update_bin_width_widget(change):
4233+
if change['new'] == 'Logarithmic':
4234+
# For logarithmic binning, use a FloatSlider with smaller values
4235+
bin_width_widget.min = 1.00
4236+
bin_width_widget.max = 10.00
4237+
bin_width_widget.step = 0.01
4238+
bin_width_widget.value = 1.00
4239+
bin_width_widget.readout_format = '.2f'
4240+
bin_width_widget.disabled = False
4241+
elif change['new'] == 'Raw':
4242+
# For raw binning, disable the widget
4243+
bin_width_widget.value = 1
4244+
bin_width_widget.disabled = True
4245+
else:
4246+
# For linear binning, use integer values
4247+
bin_width_widget.min = 1
4248+
bin_width_widget.max = (max_deg+2)/10
4249+
bin_width_widget.step = 1
4250+
bin_width_widget.value = 1
4251+
bin_width_widget.readout_format = 'd'
4252+
bin_width_widget.disabled = False
4253+
4254+
def update_y_max_widget(change):
4255+
if bin_widget.value == 'Raw':
4256+
# For raw data, use the original max count
4257+
y_max_widget.max = max_count * 1.1
4258+
y_max_widget.value = max_count * 1.1
4259+
elif bin_widget.value == 'Linear':
4260+
y_max_widget.max = max_count * bin_width_widget.value * 0.5
4261+
y_max_widget.value = max_count * bin_width_widget.value * 0.5
4262+
else: # 'Logarithmic'
4263+
y_max_widget.max = max_count * (10 ** bin_width_widget.value) * 0.5
4264+
y_max_widget.value = max_count * (10 ** bin_width_widget.value) * 0.5
4265+
41814266
# Bin width widget, values in the [1, (max_deg+2)/10] interval
4182-
# TODO: if logarithmic binning, a much smaller range makes more sense
4183-
bin_width_widget = widgets.IntSlider(
4267+
bin_width_widget = widgets.FloatSlider(
41844268
value=1,
41854269
min=1,
4186-
max=(max_deg+2)/2,
4270+
max=(max_deg+2)/10,
41874271
step=1,
41884272
description='Bin width:',
41894273
tooltip=('For linear binning: actual width\n'
41904274
'For log binning: multiplicative factor')
41914275
)
41924276

4277+
# Observe changes to bin_width_widget and bin_widget
4278+
bin_width_widget.observe(update_y_max_widget, names='value')
4279+
bin_widget.observe(update_y_max_widget, names='value')
4280+
41934281
# Upper limit for y-axis range, enables zooming (lower limit is always zero)
41944282
y_max_widget = widgets.IntSlider(
41954283
value=max_count * 1.1,
@@ -4203,7 +4291,7 @@ def update_plot(scale_type, bin_type, bin_width, y_max, x_range, show_mindeg, sh
42034291
x_range_widget = widgets.FloatRangeSlider(
42044292
min=0,
42054293
max=max_deg * 1.1 + 5,
4206-
value=[min, max],
4294+
value=[0, max_deg * 1.1 + 5],
42074295
step=1,
42084296
description='x-axis range:',
42094297
disabled=False,

0 commit comments

Comments
 (0)