Significantly improve graph mapping on uncompressible networks (~8x)
Jake-Moss committed Sep 20, 2024
1 parent 140b293 commit 9d46053
Showing 1 changed file with 77 additions and 46 deletions.
123 changes: 77 additions & 46 deletions aequilibrae/paths/cython/graph_building.pyx
@@ -4,6 +4,7 @@ cimport numpy as np
cimport cython

from libcpp.queue cimport queue
import time

@cython.wraparound(False)
@cython.embedsignature(True)
@@ -387,50 +388,76 @@ def build_compressed_graph(graph):
@cython.boundscheck(False)
@cython.initializedcheck(False)
def create_compressed_link_network_mapping(graph):
# Cache the result, this isn't a huge computation but isn't worth doing twice
if (
graph.compressed_link_network_mapping_idx is not None
and graph.compressed_link_network_mapping_data is not None
and graph.network_compressed_node_mapping is not None
):
return (
graph.compressed_link_network_mapping_idx,
graph.compressed_link_network_mapping_data,
graph.network_compressed_node_mapping,
)

cdef:
long long i, j, a_node, x, b_node, tmp, compressed_id
long long[:] b
long long[:] values
np.uint32_t[:] idx
np.uint32_t[:] data
np.int32_t[:] node_mapping

    # This method requires that graph.graph is sorted on the a_node IDs; since that's already done we don't
    # bother re-sorting it.
# Cache the result, this isn't a huge computation but isn't worth doing twice
if (
graph.compressed_link_network_mapping_idx is not None
and graph.compressed_link_network_mapping_data is not None
and graph.network_compressed_node_mapping is not None
):
return (
graph.compressed_link_network_mapping_idx,
graph.compressed_link_network_mapping_data,
graph.network_compressed_node_mapping,
)

# Some links are completely removed from the network, they are assigned ID `graph.compact_graph.id.max() + 1`,
# we skip them.
filtered = graph.graph[graph.graph.__compressed_id__ != graph.compact_graph.id.max() + 1]
gb = filtered.groupby(by="__compressed_id__", sort=True)
idx = np.zeros(graph.compact_num_links + 1, dtype=np.uint32)
data = np.zeros(len(filtered), dtype=np.uint32)

node_mapping = np.full(graph.num_nodes, -1, dtype=np.int32)

i = 0
for compressed_id, df in gb:
cdef:
long long i, j, a_node, x, b_node, tmp, compressed_id, dup_idx
long long[:] b
long long[:] values
np.uint32_t[:] idx
np.uint32_t[:] data
np.int32_t[:] node_mapping
np.int64_t[:, :] dups

    start_time = time.time()
    print("starting compressed link network mapping", start_time)

    # This method requires that graph.graph is sorted on the a_node IDs; since that's already done we don't
    # bother re-sorting it.

# Some links are completely removed from the network, they are assigned ID `graph.compact_graph.id.max() + 1`,
# we skip them.
filtered = graph.graph[graph.graph.__compressed_id__ != graph.compact_graph.id.max() + 1]
filtered = filtered[["__compressed_id__", "a_node", "b_node", "link_id"]]
duplicated = filtered.__compressed_id__.duplicated(keep=False)
gb = filtered[duplicated].groupby(by="__compressed_id__", sort=True)

idx = np.zeros(graph.compact_num_links + 1, dtype=np.uint32)
data = np.zeros(len(filtered), dtype=np.uint32)
node_mapping = np.full(graph.num_nodes, -1, dtype=np.int32)
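    # Layout: data[idx[c]:idx[c + 1]] will hold the network link IDs that make up compressed
    # link c (in walk order), and node_mapping maps a network node ID to its compact node ID,
    # staying -1 for nodes that do not appear as an endpoint of any compact link.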

compact_a_nodes = graph.compact_graph["a_node"].to_numpy()
compact_b_nodes = graph.compact_graph["b_node"].to_numpy()
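    # Pulling the compact node columns out as plain NumPy arrays once avoids the per-row
    # pandas .iat lookups the previous version performed inside the loop.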

dup_idx = 0
dups = filtered[~duplicated].sort_values(by="__compressed_id__").to_numpy()
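    # `duplicated` marks compressed IDs that appear more than once (genuine compressed chains);
    # the remaining rows are 1:1 links, pre-sorted by compressed ID so they can be consumed in
    # order via dup_idx while the bincount loop below walks compressed IDs in ascending order.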

i = 0
    # This should be possible to parallelise: each thread gets a segment of the bincount below and
    # computes its respective idx and data; the end i value of the first segment is then added to the
    # idx of the segment after it, and so on. Finally the idx and data values are concatenated.
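    # np.bincount yields, for each compressed ID from 0 upward, how many network links collapse into
    # it, so `count` below is the number of network links behind compressed link `compressed_id`.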
for compressed_id, count in enumerate(np.bincount(filtered["__compressed_id__"].to_numpy())):
        values = df.link_id.values
        a = df.a_node.values
        b = df.b_node.values
        # We separate the easy un-compressible link path from the compressible link path
        # gb.get_group and those sorted searches are rather slow
        if count == 1:
            compressed_id, a_node, b_node, link_id = dups[dup_idx]
            dup_idx += 1
            idx[compressed_id] = i
            data[i] = link_id
            node_mapping[a_node] = compact_a_nodes[compressed_id]
            node_mapping[b_node] = compact_b_nodes[compressed_id]
else:
df = gb.get_group(compressed_id)
idx[compressed_id] = i
values = df.link_id.to_numpy()
a = df.a_node.to_numpy()
b = df.b_node.to_numpy()

# In order to ensure that the link IDs come out in the correct order we must walk the links
# we do this assuming the `a` array is sorted.
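            # Illustrative example (not taken from the data): for links 1->2, 2->3, 3->4 we get
            # a = [1, 2, 3] and b = [2, 3, 4]; node 1 appears in `a` but not `b`, so the walk starts
            # there and searchsorted follows 1 -> 2 -> 3 -> 4, emitting the link IDs in chain order.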
j = 0
        # Find the missing a_node; this is the start of the chain. We cannot rely on the node ordering to do a simple lookup

            # Find the missing a_node; this is the start of the chain. We cannot rely on the node ordering to do a simple lookup
a_node = x = a[np.isin(a, b, invert=True, assume_unique=True)][0]
while True:
tmp = a.searchsorted(x)
@@ -442,15 +469,19 @@ def create_compressed_link_network_mapping(graph):
j += 1

b_node = x
node_mapping[a_node] = graph.compact_graph["a_node"].iat[compressed_id]
node_mapping[b_node] = graph.compact_graph["b_node"].iat[compressed_id]

i += len(values)
node_mapping[a_node] = compact_a_nodes[compressed_id]
node_mapping[b_node] = compact_b_nodes[compressed_id]
i += count

idx[-1] = i
idx[-1] = i

graph.compressed_link_network_mapping_idx = np.array(idx)
graph.compressed_link_network_mapping_data = np.array(data)
graph.network_compressed_node_mapping = np.array(node_mapping)
graph.compressed_link_network_mapping_idx = np.array(idx)
graph.compressed_link_network_mapping_data = np.array(data)
graph.network_compressed_node_mapping = np.array(node_mapping)

return idx, data, node_mapping
print("end compressed link network mapping", time.time() - start_tiem)
return (
graph.compressed_link_network_mapping_idx,
graph.compressed_link_network_mapping_data,
graph.network_compressed_node_mapping
)
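
For orientation, the three cached arrays returned above form a CSR-style mapping from compact (compressed) links back to the original network. The sketch below shows how a caller could read it; it assumes idx, data and node_mapping are the arrays returned by create_compressed_link_network_mapping(graph) for an already-built graph, and the helper name network_links_for is purely illustrative rather than part of AequilibraE's API.

import numpy as np

idx, data, node_mapping = create_compressed_link_network_mapping(graph)

def network_links_for(compressed_id):
    # data[idx[c]:idx[c + 1]] holds the network link IDs merged into compressed link c,
    # in walk order from its a_node to its b_node.
    return np.asarray(data[idx[compressed_id]:idx[compressed_id + 1]])

# node_mapping[n] is the compact node ID for network node n, or -1 when the node is not
# an endpoint of any compact link.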
