Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Significantly improve graph mapping on uncompressible networks (~8x) #573

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 77 additions & 46 deletions aequilibrae/paths/cython/graph_building.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ cimport numpy as np
cimport cython

from libcpp.queue cimport queue
import time

@cython.wraparound(False)
@cython.embedsignature(True)
Expand Down Expand Up @@ -387,50 +388,76 @@ def build_compressed_graph(graph):
@cython.boundscheck(False)
@cython.initializedcheck(False)
def create_compressed_link_network_mapping(graph):
# Cache the result, this isn't a huge computation but isn't worth doing twice
if (
graph.compressed_link_network_mapping_idx is not None
and graph.compressed_link_network_mapping_data is not None
and graph.network_compressed_node_mapping is not None
):
return (
graph.compressed_link_network_mapping_idx,
graph.compressed_link_network_mapping_data,
graph.network_compressed_node_mapping,
)

cdef:
long long i, j, a_node, x, b_node, tmp, compressed_id
long long[:] b
long long[:] values
np.uint32_t[:] idx
np.uint32_t[:] data
np.int32_t[:] node_mapping

# This method requires that graph.graph is sorted on the a_node IDs. Since that is already done, we don't
# bother re-sorting it.
# Cache the result, this isn't a huge computation but isn't worth doing twice
if (
graph.compressed_link_network_mapping_idx is not None
and graph.compressed_link_network_mapping_data is not None
and graph.network_compressed_node_mapping is not None
):
return (
graph.compressed_link_network_mapping_idx,
graph.compressed_link_network_mapping_data,
graph.network_compressed_node_mapping,
)

# Some links are completely removed from the network, they are assigned ID `graph.compact_graph.id.max() + 1`,
# we skip them.
filtered = graph.graph[graph.graph.__compressed_id__ != graph.compact_graph.id.max() + 1]
gb = filtered.groupby(by="__compressed_id__", sort=True)
idx = np.zeros(graph.compact_num_links + 1, dtype=np.uint32)
data = np.zeros(len(filtered), dtype=np.uint32)

node_mapping = np.full(graph.num_nodes, -1, dtype=np.int32)

i = 0
for compressed_id, df in gb:
cdef:
long long i, j, a_node, x, b_node, tmp, compressed_id, dup_idx
long long[:] b
long long[:] values
np.uint32_t[:] idx
np.uint32_t[:] data
np.int32_t[:] node_mapping
np.int64_t[:, :] dups

start_tiem = time.time()
print("starting compressed link network mapping", start_tiem)

# This method requires that graph.graph is sorted on the a_node IDs. Since that is already done, we don't
# bother re-sorting it.

# Some links are completely removed from the network, they are assigned ID `graph.compact_graph.id.max() + 1`,
# we skip them.
filtered = graph.graph[graph.graph.__compressed_id__ != graph.compact_graph.id.max() + 1]
filtered = filtered[["__compressed_id__", "a_node", "b_node", "link_id"]]
duplicated = filtered.__compressed_id__.duplicated(keep=False)
gb = filtered[duplicated].groupby(by="__compressed_id__", sort=True)

idx = np.zeros(graph.compact_num_links + 1, dtype=np.uint32)
data = np.zeros(len(filtered), dtype=np.uint32)
node_mapping = np.full(graph.num_nodes, -1, dtype=np.int32)

compact_a_nodes = graph.compact_graph["a_node"].to_numpy()
compact_b_nodes = graph.compact_graph["b_node"].to_numpy()

dup_idx = 0
dups = filtered[~duplicated].sort_values(by="__compressed_id__").to_numpy()

i = 0
# This should be possible to parallelise: each thread gets a segment of the bincount below and computes its own
# idx and data. The end i value of each segment is then added to the idx of the following segment, and so on,
# after which the idx and data values are concatenated.
for compressed_id, count in enumerate(np.bincount(filtered["__compressed_id__"].to_numpy())):
# We separate the easy un-compressible link path from the compressible link path,
# because gb.get_group and the sorted searches are rather slow.
if count == 1:
compressed_id, a_node, b_node, link_id = dups[dup_idx]
dup_idx += 1
idx[compressed_id] = i
values = df.link_id.values
a = df.a_node.values
b = df.b_node.values
data[i] = link_id
node_mapping[a_node] = compact_a_nodes[compressed_id]
node_mapping[b_node] = compact_b_nodes[compressed_id]
else:
df = gb.get_group(compressed_id)
idx[compressed_id] = i
values = df.link_id.to_numpy()
a = df.a_node.to_numpy()
b = df.b_node.to_numpy()

# In order to ensure that the link IDs come out in the correct order we must walk the links
# we do this assuming the `a` array is sorted.
j = 0
# Find the a_node that does not appear among the b_nodes; this is the start of the chain. We cannot rely on the node ordering to do a simple lookup.

# Find the a_node that does not appear among the b_nodes; this is the start of the chain. We cannot rely on the node ordering to do a simple lookup.
a_node = x = a[np.isin(a, b, invert=True, assume_unique=True)][0]
while True:
tmp = a.searchsorted(x)
Expand All @@ -442,15 +469,19 @@ def create_compressed_link_network_mapping(graph):
j += 1

b_node = x
node_mapping[a_node] = graph.compact_graph["a_node"].iat[compressed_id]
node_mapping[b_node] = graph.compact_graph["b_node"].iat[compressed_id]

i += len(values)
node_mapping[a_node] = compact_a_nodes[compressed_id]
node_mapping[b_node] = compact_b_nodes[compressed_id]
i += count

idx[-1] = i
idx[-1] = i

graph.compressed_link_network_mapping_idx = np.array(idx)
graph.compressed_link_network_mapping_data = np.array(data)
graph.network_compressed_node_mapping = np.array(node_mapping)
graph.compressed_link_network_mapping_idx = np.array(idx)
graph.compressed_link_network_mapping_data = np.array(data)
graph.network_compressed_node_mapping = np.array(node_mapping)

return idx, data, node_mapping
print("end compressed link network mapping", time.time() - start_tiem)
return (
graph.compressed_link_network_mapping_idx,
graph.compressed_link_network_mapping_data,
graph.network_compressed_node_mapping
)