From 3525ff17efd475654577568c3bf196990962a4c5 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 6 Aug 2023 16:36:26 +0000 Subject: [PATCH 001/116] fix: cleanup ingest code --- .../graph/connectivity/cross_edges.py | 24 ++++--------------- pychunkedgraph/graph/misc.py | 1 - .../ingest/create/abstract_layers.py | 21 ++-------------- pychunkedgraph/ingest/create/atomic_layer.py | 4 ++-- 4 files changed, 9 insertions(+), 41 deletions(-) diff --git a/pychunkedgraph/graph/connectivity/cross_edges.py b/pychunkedgraph/graph/connectivity/cross_edges.py index 8aa52a9f1..d69759bbf 100644 --- a/pychunkedgraph/graph/connectivity/cross_edges.py +++ b/pychunkedgraph/graph/connectivity/cross_edges.py @@ -1,10 +1,9 @@ -import time +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel + import math import multiprocessing as mp from collections import defaultdict -from typing import Optional from typing import Sequence -from typing import List from typing import Dict import numpy as np @@ -13,9 +12,7 @@ from .. import attributes from ..types import empty_2d from ..utils import basetypes -from ..utils import serializers from ..chunkedgraph import ChunkedGraph -from ..utils.generic import get_valid_timestamp from ..utils.generic import filter_failed_node_ids from ..chunks.atomic import get_touching_atomic_chunks from ..chunks.atomic import get_bounding_atomic_chunks @@ -30,14 +27,12 @@ def get_children_chunk_cross_edges( The edges are between node IDs in the given layer (not atomic). 
""" atomic_chunks = get_touching_atomic_chunks(cg.meta, layer, chunk_coord) - if not len(atomic_chunks): + if len(atomic_chunks) == 0: return [] - print(f"touching atomic chunk count {len(atomic_chunks)}") if not use_threads: return _get_children_chunk_cross_edges(cg, atomic_chunks, layer - 1) - print("get_children_chunk_cross_edges, atomic chunks", len(atomic_chunks)) with mp.Manager() as manager: edge_ids_shared = manager.list() edge_ids_shared.append(empty_2d) @@ -69,9 +64,6 @@ def _get_children_chunk_cross_edges_helper(args) -> None: def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: - print( - f"_get_children_chunk_cross_edges {layer} atomic_chunks count {len(atomic_chunks)}" - ) cross_edges = [empty_2d] for layer2_chunk in atomic_chunks: edges = _read_atomic_chunk_cross_edges(cg, layer2_chunk, layer) @@ -80,11 +72,10 @@ def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: cross_edges = np.concatenate(cross_edges) if not cross_edges.size: return empty_2d - print(f"getting roots at stop_layer {layer} {cross_edges.shape}") + cross_edges[:, 0] = cg.get_roots(cross_edges[:, 0], stop_layer=layer, ceil=False) cross_edges[:, 1] = cg.get_roots(cross_edges[:, 1], stop_layer=layer, ceil=False) result = np.unique(cross_edges, axis=0) if cross_edges.size else empty_2d - print(f"_get_children_chunk_cross_edges done {result.shape}") return result @@ -118,16 +109,13 @@ def get_chunk_nodes_cross_edge_layer( return_type dict {node_id: layer} the lowest layer (>= current layer) at which a node_id is part of a cross edge """ - print("get_bounding_atomic_chunks") atomic_chunks = get_bounding_atomic_chunks(cg.meta, layer, chunk_coord) - print("get_bounding_atomic_chunks complete") - if not len(atomic_chunks): + if len(atomic_chunks) == 0: return {} if not use_threads: return _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer) - print("divide tasks") cg_info = cg.get_serialized_info() manager = mp.Manager() ids_l_shared = manager.list() 
@@ -139,7 +127,6 @@ def get_chunk_nodes_cross_edge_layer( multi_args.append( (ids_l_shared, layers_l_shared, cg_info, atomic_chunks, layer) ) - print("divide tasks complete") multiprocess_func( _get_chunk_nodes_cross_edge_layer_helper, @@ -149,7 +136,6 @@ def get_chunk_nodes_cross_edge_layer( node_layer_d_shared = manager.dict() _find_min_layer(node_layer_d_shared, ids_l_shared, layers_l_shared) - print("_find_min_layer complete") return node_layer_d_shared diff --git a/pychunkedgraph/graph/misc.py b/pychunkedgraph/graph/misc.py index b33e8a6fd..873422db1 100644 --- a/pychunkedgraph/graph/misc.py +++ b/pychunkedgraph/graph/misc.py @@ -202,7 +202,6 @@ def get_contact_sites( # Load edges of these cs_svs edges_cs_svs_rows = cg.client.read_nodes( node_ids=u_cs_svs, - # columns=[attributes.Connectivity.Partner, attributes.Connectivity.Connected], ) pre_cs_edges = [] for ri in edges_cs_svs_rows.items(): diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 529a6846f..1973daacc 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -1,15 +1,14 @@ +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel + """ Functions for creating parents in level 3 and above """ -import time import math import datetime import multiprocessing as mp -from collections import defaultdict from typing import Optional from typing import Sequence -from typing import List import numpy as np from multiwrapper import multiprocessing_utils as mu @@ -44,11 +43,6 @@ def add_layer( cg, layer_id, parent_coords, use_threads=n_threads > 1 ) - print("children_coords", children_coords.size, layer_id, parent_coords) - print( - "n e", len(children_ids), len(edge_ids), layer_id, parent_coords, - ) - node_layers = cg.get_chunk_layers(children_ids) edge_layers = cg.get_chunk_layers(np.unique(edge_ids)) assert np.all(node_layers < layer_id), "invalid node layers" @@ 
-62,7 +56,6 @@ def add_layer( edge_ids.extend(add_edge_ids) graph, _, _, graph_ids = flatgraph.build_gt_graph(edge_ids, make_directed=True) ccs = flatgraph.connected_components(graph) - print("ccs", len(ccs)) _write_connected_components( cg, layer_id, @@ -84,7 +77,6 @@ def _read_children_chunks( children_ids.append(_read_chunk([], cg, layer_id - 1, child_coord)) return np.concatenate(children_ids) - print("_read_children_chunks") with mp.Manager() as manager: children_ids_shared = manager.list() multi_args = [] @@ -102,7 +94,6 @@ def _read_children_chunks( multi_args, n_threads=min(len(multi_args), mp.cpu_count()), ) - print("_read_children_chunks done") return np.concatenate(children_ids_shared) @@ -113,7 +104,6 @@ def _read_chunk_helper(args): def _read_chunk(children_ids_shared, cg: ChunkedGraph, layer_id: int, chunk_coord): - print(f"_read_chunk {layer_id}, {chunk_coord}") x, y, z = chunk_coord range_read = cg.range_read_chunk( cg.get_chunk_id(layer=layer_id, x=x, y=y, z=z), @@ -129,7 +119,6 @@ def _read_chunk(children_ids_shared, cg: ChunkedGraph, layer_id: int, chunk_coor row_ids = filter_failed_node_ids(row_ids, segment_ids, max_children_ids) children_ids_shared.append(row_ids) - print(f"_read_chunk {layer_id}, {chunk_coord} done {len(row_ids)}") return row_ids @@ -147,13 +136,10 @@ def _write_connected_components( node_layer_d_shared = {} if layer_id < cg.meta.layer_count: - print("getting node_layer_d_shared") node_layer_d_shared = get_chunk_nodes_cross_edge_layer( cg, layer_id, parent_coords, use_threads=use_threads ) - print("node_layer_d_shared", len(node_layer_d_shared)) - ccs_with_node_ids = [] for cc in ccs: ccs_with_node_ids.append(graph_ids[cc]) @@ -186,7 +172,6 @@ def _write_connected_components( def _write_components_helper(args): - print("running _write_components_helper") cg_info, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp = args cg = ChunkedGraph(**cg_info) _write(cg, layer_id, parent_coords, ccs, node_layer_d_shared, 
time_stamp) @@ -241,7 +226,5 @@ def _write( if len(rows) > 100000: cg.client.write(rows) - print("wrote rows", len(rows), layer_id, parent_coords) rows = [] cg.client.write(rows) - print("wrote rows", len(rows), layer_id, parent_coords) diff --git a/pychunkedgraph/ingest/create/atomic_layer.py b/pychunkedgraph/ingest/create/atomic_layer.py index 4fa1f1688..d87638b26 100644 --- a/pychunkedgraph/ingest/create/atomic_layer.py +++ b/pychunkedgraph/ingest/create/atomic_layer.py @@ -1,14 +1,14 @@ +# pylint: disable=invalid-name, missing-function-docstring, import-outside-toplevel + """ Functions for creating atomic nodes and their level 2 abstract parents """ import datetime from typing import Dict -from typing import List from typing import Optional from typing import Sequence -import pytz import numpy as np from ...graph import attributes From 411a922fdf8d6cb1b742d519909523a53378bc22 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 6 Aug 2023 16:38:22 +0000 Subject: [PATCH 002/116] add ttl column family --- pychunkedgraph/graph/client/bigtable/client.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 5b86826bd..486cbdd73 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ b/pychunkedgraph/graph/client/bigtable/client.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel, line-too-long, protected-access, arguments-differ, arguments-renamed, logging-fstring-interpolation +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel, line-too-long, protected-access, arguments-differ, arguments-renamed, logging-fstring-interpolation, too-many-arguments import sys import time @@ -15,11 +15,12 @@ from google.api_core.exceptions import Aborted from google.api_core.exceptions import DeadlineExceeded from google.api_core.exceptions import ServiceUnavailable +from 
google.cloud.bigtable.column_family import MaxAgeGCRule +from google.cloud.bigtable.column_family import MaxVersionsGCRule from google.cloud.bigtable.table import Table from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.row_data import PartialRowData from google.cloud.bigtable.row_filters import RowFilter -from google.cloud.bigtable.column_family import MaxVersionsGCRule from . import utils from . import BigTableConfig @@ -637,6 +638,8 @@ def _create_column_families(self): f.create() f = self._table.column_family("3") f.create() + f = self._table.column_family("4", gc_rule=MaxAgeGCRule(datetime.timedelta(days=1))) + f.create() def _get_ids_range(self, key: bytes, size: int) -> typing.Tuple: """Returns a range (min, max) of IDs for a given `key`.""" From c0632e87162d900600a6dad3314d1bd526d0b7aa Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 6 Aug 2023 16:40:55 +0000 Subject: [PATCH 003/116] fix: new l2 cx edge attribute --- pychunkedgraph/graph/attributes.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index 3e48d204a..ea03d2216 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -1,6 +1,9 @@ +# pylint: disable=invalid-name, missing-docstring, protected-access, raise-missing-from + # TODO design to use these attributes across different clients # `family_id` is specific to bigtable +from enum import Enum from typing import NamedTuple from .utils import serializers @@ -101,8 +104,8 @@ class Connectivity: serializer=serializers.NumPyArray(dtype=basetypes.EDGE_AREA), ) - CrossChunkEdge = _AttributeArray( - pattern=b"atomic_cross_edges_%d", + L2CrossChunkEdge = _AttributeArray( + pattern=b"l2_cross_edge_%d", family_id="3", serializer=serializers.NumPyArray( dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 @@ -115,6 +118,14 @@ class Connectivity: 
serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), ) + CrossChunkEdge = _AttributeArray( + pattern=b"atomic_cross_edges_%d", + family_id="4", + serializer=serializers.NumPyArray( + dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 + ), + ) + class Hierarchy: Child = _Attribute( @@ -157,8 +168,6 @@ class GraphVersion: class OperationLogs: key = b"ioperations" - from enum import Enum - class StatusCodes(Enum): SUCCESS = 0 # all is well, new changes persisted CREATED = 1 # log record created in storage From 08bd83a14322925f8c6b92ab9d83fb0c70b588ee Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 6 Aug 2023 20:02:44 +0000 Subject: [PATCH 004/116] feat: post process sv cross edges --- pychunkedgraph/graph/attributes.py | 6 +-- .../graph/client/bigtable/client.py | 4 +- pychunkedgraph/ingest/create/atomic_layer.py | 54 ++++++++++++++++++- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index ea03d2216..b58a6f0f8 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -106,7 +106,7 @@ class Connectivity: L2CrossChunkEdge = _AttributeArray( pattern=b"l2_cross_edge_%d", - family_id="3", + family_id="4", serializer=serializers.NumPyArray( dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 ), @@ -114,13 +114,13 @@ class Connectivity: FakeEdges = _Attribute( key=b"fake_edges", - family_id="3", + family_id="4", serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), ) CrossChunkEdge = _AttributeArray( pattern=b"atomic_cross_edges_%d", - family_id="4", + family_id="3", serializer=serializers.NumPyArray( dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 ), diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 486cbdd73..19a08b9a8 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ 
b/pychunkedgraph/graph/client/bigtable/client.py @@ -636,9 +636,9 @@ def _create_column_families(self): f.create() f = self._table.column_family("2") f.create() - f = self._table.column_family("3") + f = self._table.column_family("3", gc_rule=MaxAgeGCRule(datetime.timedelta(days=1))) f.create() - f = self._table.column_family("4", gc_rule=MaxAgeGCRule(datetime.timedelta(days=1))) + f = self._table.column_family("4") f.create() def _get_ids_range(self, key: bytes, size: int) -> typing.Tuple: diff --git a/pychunkedgraph/ingest/create/atomic_layer.py b/pychunkedgraph/ingest/create/atomic_layer.py index d87638b26..a59bc9f20 100644 --- a/pychunkedgraph/ingest/create/atomic_layer.py +++ b/pychunkedgraph/ingest/create/atomic_layer.py @@ -101,7 +101,13 @@ def _get_remapping(chunk_edges_d: dict): def _process_component( - cg, chunk_edges_d, parent_id, node_ids, sparse_indices, remapping, time_stamp, + cg, + chunk_edges_d, + parent_id, + node_ids, + sparse_indices, + remapping, + time_stamp, ): nodes = [] chunk_out_edges = [] # out = between + cross @@ -145,3 +151,49 @@ def _get_outgoing_edges(node_id, chunk_edges_d, sparse_indices, remapping): # edges that this node is part of chunk_out_edges = np.concatenate([chunk_out_edges, edges[row_ids]]) return chunk_out_edges + + +def postprocess_atomic_chunk( + cg: ChunkedGraph, + chunk_coord: np.ndarray, + time_stamp: Optional[datetime.datetime] = None, +): + time_stamp = get_valid_timestamp(time_stamp) + + chunk_id = cg.get_chunk_id( + layer=2, x=chunk_coord[0], y=chunk_coord[1], z=chunk_coord[2] + ) + + properties = [ + attributes.Connectivity.CrossChunkEdge[l] for l in range(2, cg.meta.layer_count) + ] + + chunk_rr = cg.range_read_chunk( + chunk_id, properties=properties, time_stamp=time_stamp + ) + + result = {} + for l2id, raw_cx_edges in chunk_rr.items(): + try: + cx_edges = { + prop.index: val[0].value.copy() for prop, val in raw_cx_edges.items() + } + result[l2id] = cx_edges + except KeyError: + continue + + nodes = [] + 
val_dicts = [] + for l2id, cx_edges in result.items(): + val_dict = {} + for layer, edges in cx_edges.items(): + l2_edges = np.zeros_like(edges) + l2_edges[:, 0] = l2id + l2_edges[:, 1] = cg.get_parents(edges[:, 1]) + col = attributes.Connectivity.L2CrossChunkEdge[layer] + val_dict[col] = np.unique(l2_edges, axis=0) + val_dicts.append(val_dict) + + r_key = serializers.serialize_uint64(l2id) + nodes.append(cg.client.mutate_row(r_key, val_dict, time_stamp=time_stamp)) + cg.client.write(nodes) From b8a44251f805338a66139ef7a9034e4b71325cae Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 11 Aug 2023 15:11:02 +0000 Subject: [PATCH 005/116] fix: use longer expiry for debugging --- pychunkedgraph/graph/attributes.py | 12 ++++++------ pychunkedgraph/graph/client/bigtable/client.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index b58a6f0f8..a3cf4a99c 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -104,6 +104,12 @@ class Connectivity: serializer=serializers.NumPyArray(dtype=basetypes.EDGE_AREA), ) + FakeEdges = _Attribute( + key=b"fake_edges", + family_id="4", + serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), + ) + L2CrossChunkEdge = _AttributeArray( pattern=b"l2_cross_edge_%d", family_id="4", @@ -112,12 +118,6 @@ class Connectivity: ), ) - FakeEdges = _Attribute( - key=b"fake_edges", - family_id="4", - serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), - ) - CrossChunkEdge = _AttributeArray( pattern=b"atomic_cross_edges_%d", family_id="3", diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 19a08b9a8..135ad9d07 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ b/pychunkedgraph/graph/client/bigtable/client.py @@ -636,7 +636,7 @@ def _create_column_families(self): f.create() f = 
self._table.column_family("2") f.create() - f = self._table.column_family("3", gc_rule=MaxAgeGCRule(datetime.timedelta(days=1))) + f = self._table.column_family("3", gc_rule=MaxAgeGCRule(datetime.timedelta(days=365))) f.create() f = self._table.column_family("4") f.create() From 8986fac9ed99ffaf3d53e3426334ea0264b738d2 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 11 Aug 2023 15:55:37 +0000 Subject: [PATCH 006/116] feat(ingest): read l2 cross edges --- pychunkedgraph/graph/connectivity/cross_edges.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/graph/connectivity/cross_edges.py b/pychunkedgraph/graph/connectivity/cross_edges.py index d69759bbf..99dc8df7f 100644 --- a/pychunkedgraph/graph/connectivity/cross_edges.py +++ b/pychunkedgraph/graph/connectivity/cross_edges.py @@ -82,7 +82,7 @@ def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: def _read_atomic_chunk_cross_edges( cg, chunk_coord: Sequence[int], cross_edge_layer: int ) -> np.ndarray: - cross_edge_col = attributes.Connectivity.CrossChunkEdge[cross_edge_layer] + cross_edge_col = attributes.Connectivity.L2CrossChunkEdge[cross_edge_layer] range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, [cross_edge_layer]) parent_neighboring_chunk_supervoxels_d = defaultdict(list) @@ -170,7 +170,7 @@ def _read_atomic_chunk_cross_edge_nodes(cg, chunk_coord, cross_edge_layers): range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, cross_edge_layers) for l2id in l2ids: for layer in cross_edge_layers: - if attributes.Connectivity.CrossChunkEdge[layer] in range_read[l2id]: + if attributes.Connectivity.L2CrossChunkEdge[layer] in range_read[l2id]: node_layer_d[l2id] = layer break return node_layer_d @@ -190,7 +190,7 @@ def _read_atomic_chunk(cg, chunk_coord, layers): range_read = cg.range_read_chunk( cg.get_chunk_id(layer=2, x=x, y=y, z=z), properties=[child_col] - + [attributes.Connectivity.CrossChunkEdge[l] for l in layers], + + 
[attributes.Connectivity.L2CrossChunkEdge[l] for l in layers], ) row_ids = [] From 24798bd5f17a293458a2d1ed823e9bc6b7cc84cb Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 12 Aug 2023 16:41:25 +0000 Subject: [PATCH 007/116] feat(ingest): postprocess job handling --- pychunkedgraph/ingest/cli.py | 25 ++++++++-- pychunkedgraph/ingest/cluster.py | 79 +++++++++++++------------------- 2 files changed, 54 insertions(+), 50 deletions(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 7668e8f24..ed0c3a3d6 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-function-docstring, import-outside-toplevel + """ cli for running ingest """ @@ -10,6 +12,7 @@ from flask.cli import AppGroup from rq import Queue +from .cluster import enqueue_atomic_tasks from .manager import IngestionManager from .utils import bootstrap from .cluster import randomize_grid_points @@ -45,8 +48,6 @@ def ingest_graph( Main ingest command. Takes ingest config from a yaml file and queues atomic tasks. """ - from .cluster import enqueue_atomic_tasks - with open(dataset, "r") as stream: config = yaml.safe_load(stream) @@ -62,6 +63,16 @@ def ingest_graph( enqueue_atomic_tasks(IngestionManager(ingest_config, meta)) +@ingest_cli.command("postprocess") +def postprocess(): + """ + Run postprocessing step on level 2 chunks. 
+ """ + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + enqueue_atomic_tasks(imanager, postprocess=True) + + @ingest_cli.command("imanager") @click.argument("graph_id", type=str) @click.argument("dataset", type=click.Path(exists=True)) @@ -143,7 +154,15 @@ def ingest_status(): """Print ingest status to console by layer.""" redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - layers = range(2, imanager.cg_meta.layer_count + 1) + + layer = 2 + completed = redis.scard(f"{layer}c") + print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_count}") + + completed = redis.scard(f"pp{layer}c") + print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_count} [postprocess]") + + layers = range(3, imanager.cg_meta.layer_count + 1) for layer, layer_count in zip(layers, imanager.cg_meta.layer_chunk_counts): completed = redis.scard(f"{layer}c") print(f"{layer}\t: {completed} / {layer_count}") diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index cf9417024..768c474ce 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-function-docstring, import-outside-toplevel + """ Ingest / create chunkedgraph with workers. 
""" @@ -11,6 +13,7 @@ from .common import get_atomic_chunk_data from .ran_agglomeration import get_active_edges from .create.atomic_layer import add_atomic_edges +from .create.atomic_layer import postprocess_atomic_chunk from .create.abstract_layers import add_layer from ..graph.meta import ChunkedGraphMeta from ..graph.chunks.hierarchy import get_children_chunk_coords @@ -18,44 +21,16 @@ from ..utils.redis import get_redis_connection -def _post_task_completion(imanager: IngestionManager, layer: int, coords: np.ndarray): - from os import environ - +def _post_task_completion( + imanager: IngestionManager, + layer: int, + coords: np.ndarray, + postprocess: bool = False, +): chunk_str = "_".join(map(str, coords)) # mark chunk as completed - "c" - imanager.redis.sadd(f"{layer}c", chunk_str) - - if environ.get("DO_NOT_AUTOQUEUE_PARENT_CHUNKS", None) is not None: - return - - parent_layer = layer + 1 - if parent_layer > imanager.cg_meta.layer_count: - return - - parent_coords = np.array(coords, int) // imanager.cg_meta.graph_config.FANOUT - parent_id_str = chunk_id_str(parent_layer, parent_coords) - imanager.redis.sadd(parent_id_str, chunk_str) - - parent_chunk_str = "_".join(map(str, parent_coords)) - if not imanager.redis.hget(parent_layer, parent_chunk_str): - # cache children chunk count - # checked by tracker worker to enqueue parent chunk - children_count = len( - get_children_chunk_coords(imanager.cg_meta, parent_layer, parent_coords) - ) - imanager.redis.hset(parent_layer, parent_chunk_str, children_count) - - tracker_queue = imanager.get_task_queue(f"t{layer}") - tracker_queue.enqueue( - enqueue_parent_task, - job_id=f"t{layer}_{chunk_str}", - job_timeout=f"30s", - result_ttl=0, - args=( - parent_layer, - parent_coords, - ), - ) + pprocess = "_pprocess" if postprocess else "" + imanager.redis.sadd(f"{layer}c{pprocess}", chunk_str) def enqueue_parent_task( @@ -127,7 +102,7 @@ def randomize_grid_points(X: int, Y: int, Z: int) -> Tuple[int, int, int]: yield 
np.unravel_index(index, (X, Y, Z)) -def enqueue_atomic_tasks(imanager: IngestionManager): +def enqueue_atomic_tasks(imanager: IngestionManager, postprocess: bool = False): from os import environ from time import sleep from rq import Queue as RQueue @@ -138,13 +113,18 @@ def enqueue_atomic_tasks(imanager: IngestionManager): atomic_chunk_bounds = imanager.cg_meta.layer_chunk_bounds[2] chunk_coords = randomize_grid_points(*atomic_chunk_bounds) chunk_count = imanager.cg_meta.layer_chunk_counts[0] - print(f"total chunk count: {chunk_count}, queuing...") - batch_size = int(environ.get("L2JOB_BATCH_SIZE", 1000)) + pprocess = "" + if postprocess: + pprocess = "_pprocess" + print("postprocessing l2 chunks") + + queue_name = f"{imanager.config.CLUSTER.ATOMIC_Q_NAME}{pprocess}" + q = imanager.get_task_queue(queue_name) job_datas = [] + batch_size = int(environ.get("L2JOB_BATCH_SIZE", 1000)) for chunk_coord in chunk_coords: - q = imanager.get_task_queue(imanager.config.CLUSTER.ATOMIC_Q_NAME) # buffer for optimal use of redis memory if len(q) > imanager.config.CLUSTER.ATOMIC_Q_LIMIT: print(f"Sleeping {imanager.config.CLUSTER.ATOMIC_Q_INTERVAL}s...") @@ -152,13 +132,13 @@ def enqueue_atomic_tasks(imanager: IngestionManager): x, y, z = chunk_coord chunk_str = f"{x}_{y}_{z}" - if imanager.redis.sismember("2c", chunk_str): + if imanager.redis.sismember(f"2c{pprocess}", chunk_str): # already done, skip continue job_datas.append( RQueue.prepare_data( _create_atomic_chunk, - args=(chunk_coord,), + args=(chunk_coord, postprocess), timeout=environ.get("L2JOB_TIMEOUT", "3m"), result_ttl=0, job_id=chunk_id_str(2, chunk_coord), @@ -170,21 +150,26 @@ def enqueue_atomic_tasks(imanager: IngestionManager): q.enqueue_many(job_datas) -def _create_atomic_chunk(coords: Sequence[int]): +def _create_atomic_chunk(coords: Sequence[int], postprocess: bool = False): """Creates single atomic chunk""" redis = get_redis_connection() imanager = 
IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) coords = np.array(list(coords), dtype=int) - chunk_edges_all, mapping = get_atomic_chunk_data(imanager, coords) - chunk_edges_active, isolated_ids = get_active_edges(chunk_edges_all, mapping) - add_atomic_edges(imanager.cg, coords, chunk_edges_active, isolated=isolated_ids) + + if postprocess: + postprocess_atomic_chunk(imanager.cg, coords) + else: + chunk_edges_all, mapping = get_atomic_chunk_data(imanager, coords) + chunk_edges_active, isolated_ids = get_active_edges(chunk_edges_all, mapping) + add_atomic_edges(imanager.cg, coords, chunk_edges_active, isolated=isolated_ids) + if imanager.config.TEST_RUN: # print for debugging for k, v in chunk_edges_all.items(): print(k, len(v)) for k, v in chunk_edges_active.items(): print(f"active_{k}", len(v)) - _post_task_completion(imanager, 2, coords) + _post_task_completion(imanager, 2, coords, postprocess=postprocess) def _get_test_chunks(meta: ChunkedGraphMeta): From bb5512d367e5d3d6618d4ac0d029e5ed6839f797 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 12 Aug 2023 17:14:37 +0000 Subject: [PATCH 008/116] fix(ingest): status --- pychunkedgraph/ingest/cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index ed0c3a3d6..8cf081952 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -157,13 +157,13 @@ def ingest_status(): layer = 2 completed = redis.scard(f"{layer}c") - print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_count}") + print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_chunk_counts[0]}") completed = redis.scard(f"pp{layer}c") - print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_count} [postprocess]") + print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_chunk_counts[0]} [postprocess]") - layers = range(3, imanager.cg_meta.layer_count + 1) - for layer, layer_count in zip(layers, 
imanager.cg_meta.layer_chunk_counts): + layers = range(3, imanager.cg_meta.layer_count) + for layer, layer_count in zip(layers, imanager.cg_meta.layer_chunk_counts[1:]): completed = redis.scard(f"{layer}c") print(f"{layer}\t: {completed} / {layer_count}") From 5985b28d1c1cfc3e92c19383f386d645030bbeae Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 12 Aug 2023 17:20:15 +0000 Subject: [PATCH 009/116] fix: timedelta import --- pychunkedgraph/graph/client/bigtable/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 135ad9d07..788c76a8e 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ b/pychunkedgraph/graph/client/bigtable/client.py @@ -4,8 +4,8 @@ import time import typing import logging -import datetime from datetime import datetime +from datetime import timedelta import numpy as np from multiwrapper import multiprocessing_utils as mu @@ -636,7 +636,7 @@ def _create_column_families(self): f.create() f = self._table.column_family("2") f.create() - f = self._table.column_family("3", gc_rule=MaxAgeGCRule(datetime.timedelta(days=365))) + f = self._table.column_family("3", gc_rule=MaxAgeGCRule(timedelta(days=365))) f.create() f = self._table.column_family("4") f.create() From 335f701114c1c97231214db2afea105036f90f82 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 12 Aug 2023 19:31:05 +0000 Subject: [PATCH 010/116] fix(ingest): status --- pychunkedgraph/ingest/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 8cf081952..aedcb6d97 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -159,10 +159,10 @@ def ingest_status(): completed = redis.scard(f"{layer}c") print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_chunk_counts[0]}") - completed = redis.scard(f"pp{layer}c") + completed = 
redis.scard(f"{layer}c_pprocess") print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_chunk_counts[0]} [postprocess]") - layers = range(3, imanager.cg_meta.layer_count) + layers = range(3, imanager.cg_meta.layer_count + 1) for layer, layer_count in zip(layers, imanager.cg_meta.layer_chunk_counts[1:]): completed = redis.scard(f"{layer}c") print(f"{layer}\t: {completed} / {layer_count}") From 00aa97f384c1a9ae51c1cc99297ef805e317514d Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 12 Aug 2023 19:35:53 +0000 Subject: [PATCH 011/116] fix(ingest): use hypenated names for valid dns --- pychunkedgraph/ingest/cli.py | 2 +- pychunkedgraph/ingest/cluster.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index aedcb6d97..145c9bea6 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -159,7 +159,7 @@ def ingest_status(): completed = redis.scard(f"{layer}c") print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_chunk_counts[0]}") - completed = redis.scard(f"{layer}c_pprocess") + completed = redis.scard(f"{layer}c-postprocess") print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_chunk_counts[0]} [postprocess]") layers = range(3, imanager.cg_meta.layer_count + 1) diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index 768c474ce..2b7927869 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -29,7 +29,7 @@ def _post_task_completion( ): chunk_str = "_".join(map(str, coords)) # mark chunk as completed - "c" - pprocess = "_pprocess" if postprocess else "" + pprocess = "-postprocess" if postprocess else "" imanager.redis.sadd(f"{layer}c{pprocess}", chunk_str) @@ -117,7 +117,7 @@ def enqueue_atomic_tasks(imanager: IngestionManager, postprocess: bool = False): pprocess = "" if postprocess: - pprocess = "_pprocess" + pprocess = "-postprocess" print("postprocessing l2 chunks") queue_name = 
f"{imanager.config.CLUSTER.ATOMIC_Q_NAME}{pprocess}" From d5a665451eddb9fb9079d6ed6688a265d94e87fd Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 19:39:35 +0000 Subject: [PATCH 012/116] fix: rename attr; better var names --- pychunkedgraph/graph/attributes.py | 6 ++-- .../ingest/create/abstract_layers.py | 32 +++++++++++-------- pychunkedgraph/ingest/create/atomic_layer.py | 6 ++-- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index a3cf4a99c..b0f18c2ec 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -110,15 +110,15 @@ class Connectivity: serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), ) - L2CrossChunkEdge = _AttributeArray( - pattern=b"l2_cross_edge_%d", + CrossChunkEdge = _AttributeArray( + pattern=b"cross_edge_%d", family_id="4", serializer=serializers.NumPyArray( dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 ), ) - CrossChunkEdge = _AttributeArray( + AtomicCrossChunkEdge = _AttributeArray( pattern=b"atomic_cross_edges_%d", family_id="3", serializer=serializers.NumPyArray( diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 1973daacc..215929c41 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -56,12 +56,15 @@ def add_layer( edge_ids.extend(add_edge_ids) graph, _, _, graph_ids = flatgraph.build_gt_graph(edge_ids, make_directed=True) ccs = flatgraph.connected_components(graph) + connected_components = [] + for cc in ccs: + connected_components.append(graph_ids[cc]) + _write_connected_components( cg, layer_id, parent_coords, - ccs, - graph_ids, + connected_components, get_valid_timestamp(time_stamp), n_threads > 1, ) @@ -126,12 +129,11 @@ def _write_connected_components( cg: ChunkedGraph, layer_id: int, parent_coords, - ccs, - 
graph_ids, + connected_components: list, time_stamp, use_threads=True, ) -> None: - if not ccs: + if len(connected_components) == 0: return node_layer_d_shared = {} @@ -140,24 +142,20 @@ def _write_connected_components( cg, layer_id, parent_coords, use_threads=use_threads ) - ccs_with_node_ids = [] - for cc in ccs: - ccs_with_node_ids.append(graph_ids[cc]) - if not use_threads: _write( cg, layer_id, parent_coords, - ccs_with_node_ids, + connected_components, node_layer_d_shared, time_stamp, use_threads=use_threads, ) return - task_size = int(math.ceil(len(ccs_with_node_ids) / mp.cpu_count() / 10)) - chunked_ccs = chunked(ccs_with_node_ids, task_size) + task_size = int(math.ceil(len(connected_components) / mp.cpu_count() / 10)) + chunked_ccs = chunked(connected_components, task_size) cg_info = cg.get_serialized_info() multi_args = [] for ccs in chunked_ccs: @@ -178,11 +176,17 @@ def _write_components_helper(args): def _write( - cg, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp, use_threads=True + cg, + layer_id, + parent_coords, + connected_components, + node_layer_d_shared, + time_stamp, + use_threads=True, ): parent_layer_ids = range(layer_id, cg.meta.layer_count + 1) cc_connections = {l: [] for l in parent_layer_ids} - for node_ids in ccs: + for node_ids in connected_components: layer = layer_id if len(node_ids) == 1: layer = node_layer_d_shared.get(node_ids[0], cg.meta.layer_count) diff --git a/pychunkedgraph/ingest/create/atomic_layer.py b/pychunkedgraph/ingest/create/atomic_layer.py index a59bc9f20..42b6a01b5 100644 --- a/pychunkedgraph/ingest/create/atomic_layer.py +++ b/pychunkedgraph/ingest/create/atomic_layer.py @@ -126,7 +126,7 @@ def _process_component( for cc_layer in u_cce_layers: layer_out_edges = chunk_out_edges[cce_layers == cc_layer] if layer_out_edges.size: - col = attributes.Connectivity.CrossChunkEdge[cc_layer] + col = attributes.Connectivity.AtomicCrossChunkEdge[cc_layer] val_dict[col] = layer_out_edges r_key = 
serializers.serialize_uint64(parent_id) @@ -165,7 +165,7 @@ def postprocess_atomic_chunk( ) properties = [ - attributes.Connectivity.CrossChunkEdge[l] for l in range(2, cg.meta.layer_count) + attributes.Connectivity.AtomicCrossChunkEdge[l] for l in range(2, cg.meta.layer_count) ] chunk_rr = cg.range_read_chunk( @@ -190,7 +190,7 @@ def postprocess_atomic_chunk( l2_edges = np.zeros_like(edges) l2_edges[:, 0] = l2id l2_edges[:, 1] = cg.get_parents(edges[:, 1]) - col = attributes.Connectivity.L2CrossChunkEdge[layer] + col = attributes.Connectivity.CrossChunkEdge[layer] val_dict[col] = np.unique(l2_edges, axis=0) val_dicts.append(val_dict) From b17038b4051b2ba362016ee36adac0b28dda07e0 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 19:40:48 +0000 Subject: [PATCH 013/116] fix: rename attr; better var names --- pychunkedgraph/graph/edits.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index be2eee1c6..4cb536ea7 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -565,7 +565,7 @@ def _get_atomic_cross_edges_val_dict(self): for id_ in new_ids: val_dict = {} for layer, edges in atomic_cross_edges_d[id_].items(): - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + val_dict[attributes.Connectivity.AtomicCrossChunkEdge[layer]] = edges val_dicts[id_] = val_dict return val_dicts From 5e4aee77e4333a40278d7f0042cd36d64febd644 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 20:15:31 +0000 Subject: [PATCH 014/116] fix: add more docs; better var names --- .../graph/connectivity/cross_edges.py | 79 ++++++++++++++----- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/pychunkedgraph/graph/connectivity/cross_edges.py b/pychunkedgraph/graph/connectivity/cross_edges.py index 99dc8df7f..d2dbcbb8c 100644 --- a/pychunkedgraph/graph/connectivity/cross_edges.py +++ 
b/pychunkedgraph/graph/connectivity/cross_edges.py @@ -64,6 +64,11 @@ def _get_children_chunk_cross_edges_helper(args) -> None: def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: + """ + Non parallelized version + Cross edges that connect children chunks. + The edges are between node IDs in the given layer (not atomic). + """ cross_edges = [empty_2d] for layer2_chunk in atomic_chunks: edges = _read_atomic_chunk_cross_edges(cg, layer2_chunk, layer) @@ -82,7 +87,11 @@ def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: def _read_atomic_chunk_cross_edges( cg, chunk_coord: Sequence[int], cross_edge_layer: int ) -> np.ndarray: - cross_edge_col = attributes.Connectivity.L2CrossChunkEdge[cross_edge_layer] + """ + Returns cross edges between l2 nodes in current chunk and + l1 supervoxels from neighbor chunks. + """ + cross_edge_col = attributes.Connectivity.CrossChunkEdge[cross_edge_layer] range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, [cross_edge_layer]) parent_neighboring_chunk_supervoxels_d = defaultdict(list) @@ -93,8 +102,7 @@ def _read_atomic_chunk_cross_edges( parent_neighboring_chunk_supervoxels_d[l2id] = edges[:, 1] cross_edges = [empty_2d] - for l2id in parent_neighboring_chunk_supervoxels_d: - nebor_svs = parent_neighboring_chunk_supervoxels_d[l2id] + for l2id, nebor_svs in parent_neighboring_chunk_supervoxels_d.items(): chunk_parent_ids = np.array([l2id] * len(nebor_svs), dtype=basetypes.NODE_ID) cross_edges.append(np.vstack([chunk_parent_ids, nebor_svs]).T) cross_edges = np.concatenate(cross_edges) @@ -118,14 +126,14 @@ def get_chunk_nodes_cross_edge_layer( cg_info = cg.get_serialized_info() manager = mp.Manager() - ids_l_shared = manager.list() - layers_l_shared = manager.list() + node_ids_shared = manager.list() + node_layers_shared = manager.list() task_size = int(math.ceil(len(atomic_chunks) / mp.cpu_count() / 10)) chunked_l2chunk_list = chunked(atomic_chunks, task_size) multi_args = [] for atomic_chunks in 
chunked_l2chunk_list: multi_args.append( - (ids_l_shared, layers_l_shared, cg_info, atomic_chunks, layer) + (node_ids_shared, node_layers_shared, cg_info, atomic_chunks, layer) ) multiprocess_func( @@ -135,24 +143,28 @@ def get_chunk_nodes_cross_edge_layer( ) node_layer_d_shared = manager.dict() - _find_min_layer(node_layer_d_shared, ids_l_shared, layers_l_shared) + _find_min_layer(node_layer_d_shared, node_ids_shared, node_layers_shared) return node_layer_d_shared def _get_chunk_nodes_cross_edge_layer_helper(args): - ids_l_shared, layers_l_shared, cg_info, atomic_chunks, layer = args + node_ids_shared, node_layers_shared, cg_info, atomic_chunks, layer = args cg = ChunkedGraph(**cg_info) node_layer_d = _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer) - ids_l_shared.append(np.fromiter(node_layer_d.keys(), dtype=basetypes.NODE_ID)) - layers_l_shared.append(np.fromiter(node_layer_d.values(), dtype=np.uint8)) + node_ids_shared.append(np.fromiter(node_layer_d.keys(), dtype=basetypes.NODE_ID)) + node_layers_shared.append(np.fromiter(node_layer_d.values(), dtype=np.uint8)) def _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer): + """ + Non parallelized version + gets nodes in a chunk that are part of cross chunk edges + return_type dict {node_id: layer} + the lowest layer (>= current layer) at which a node_id is part of a cross edge + """ atomic_node_layer_d = {} for atomic_chunk in atomic_chunks: - chunk_node_layer_d = _read_atomic_chunk_cross_edge_nodes( - cg, atomic_chunk, range(layer, cg.meta.layer_count + 1) - ) + chunk_node_layer_d = _read_atomic_chunk_cross_edge_nodes(cg, atomic_chunk, layer) atomic_node_layer_d.update(chunk_node_layer_d) l2ids = np.fromiter(atomic_node_layer_d.keys(), dtype=basetypes.NODE_ID) @@ -165,32 +177,57 @@ def _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer): return node_layer_d -def _read_atomic_chunk_cross_edge_nodes(cg, chunk_coord, cross_edge_layers): +def _read_atomic_chunk_cross_edge_nodes(cg, 
chunk_coord, layer): + """ + the lowest layer at which an l2 node is part of a cross edge + """ node_layer_d = {} - range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, cross_edge_layers) + relevant_layers = range(layer, cg.meta.layer_count + 1) + range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, relevant_layers) for l2id in l2ids: - for layer in cross_edge_layers: - if attributes.Connectivity.L2CrossChunkEdge[layer] in range_read[l2id]: + for layer in relevant_layers: + if attributes.Connectivity.CrossChunkEdge[layer] in range_read[l2id]: node_layer_d[l2id] = layer break return node_layer_d -def _find_min_layer(node_layer_d_shared, ids_l_shared, layers_l_shared): - node_ids = np.concatenate(ids_l_shared) - layers = np.concatenate(layers_l_shared) +def _find_min_layer(node_layer_d_shared, node_ids_shared, node_layers_shared): + """ + `node_layer_d_shared`: DictProxy + + `node_ids_shared`: ListProxy + + `node_layers_shared`: ListProxy + + Due to parallelization, there will be multiple values for min_layer of a node. + We need to find the global min_layer after all multiprocesses return. + For eg: + At some indices p and q, there will be a node_id x + i.e. 
`node_ids_shared[p] == node_ids_shared[q]` + + and node_layers_shared[p] != node_layers_shared[q] + so we need: + `node_layer_d_shared[x] = min(node_layers_shared[p], node_layers_shared[q])` + """ + node_ids = np.concatenate(node_ids_shared) + layers = np.concatenate(node_layers_shared) for i, node_id in enumerate(node_ids): layer = node_layer_d_shared.get(node_id, layers[i]) node_layer_d_shared[node_id] = min(layer, layers[i]) def _read_atomic_chunk(cg, chunk_coord, layers): + """ + read entire atomic chunk; all nodes and their relevant cross edges + filter out invalid nodes generated by failed tasks + """ x, y, z = chunk_coord child_col = attributes.Hierarchy.Child range_read = cg.range_read_chunk( cg.get_chunk_id(layer=2, x=x, y=y, z=z), properties=[child_col] - + [attributes.Connectivity.L2CrossChunkEdge[l] for l in layers], + + [attributes.Connectivity.CrossChunkEdge[l] for l in layers], ) row_ids = [] From febb9f235f620508866a816093c34ec62f6ba21f Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 20:22:40 +0000 Subject: [PATCH 015/116] fix: move cross_edges module to ingest module; only used in ingest --- pychunkedgraph/graph/chunks/atomic.py | 6 +++--- pychunkedgraph/ingest/create/abstract_layers.py | 4 ++-- .../create}/cross_edges.py | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) rename pychunkedgraph/{graph/connectivity => ingest/create}/cross_edges.py (95%) diff --git a/pychunkedgraph/graph/chunks/atomic.py b/pychunkedgraph/graph/chunks/atomic.py index e3de065ff..b609f4cfb 100644 --- a/pychunkedgraph/graph/chunks/atomic.py +++ b/pychunkedgraph/graph/chunks/atomic.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-docstring + from typing import List from typing import Sequence from itertools import product @@ -6,8 +8,6 @@ from .utils import get_bounding_children_chunks from ..meta import ChunkedGraphMeta -from ..utils.generic import get_valid_timestamp -from ..utils import basetypes def 
get_touching_atomic_chunks( @@ -27,7 +27,7 @@ def get_touching_atomic_chunks( chunk_offset = chunk_coords * atomic_chunk_count mid = (atomic_chunk_count // 2) - 1 - # TODO (akhileshh) convert this for loop to numpy + # TODO (akhileshh) convert this for loop to numpy; # relevant chunks along touching planes at center for axis_1, axis_2 in product(*[range(atomic_chunk_count)] * 2): # x-y plane diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 215929c41..c5a78d2ca 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -23,8 +23,8 @@ from ...graph.utils.generic import get_valid_timestamp from ...graph.utils.generic import filter_failed_node_ids from ...graph.chunks.hierarchy import get_children_chunk_coords -from ...graph.connectivity.cross_edges import get_children_chunk_cross_edges -from ...graph.connectivity.cross_edges import get_chunk_nodes_cross_edge_layer +from .cross_edges import get_children_chunk_cross_edges +from .cross_edges import get_chunk_nodes_cross_edge_layer def add_layer( diff --git a/pychunkedgraph/graph/connectivity/cross_edges.py b/pychunkedgraph/ingest/create/cross_edges.py similarity index 95% rename from pychunkedgraph/graph/connectivity/cross_edges.py rename to pychunkedgraph/ingest/create/cross_edges.py index d2dbcbb8c..481a5b6e5 100644 --- a/pychunkedgraph/graph/connectivity/cross_edges.py +++ b/pychunkedgraph/ingest/create/cross_edges.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel +# pylint: disable=invalid-name, missing-docstring import math import multiprocessing as mp @@ -9,13 +9,13 @@ import numpy as np from multiwrapper.multiprocessing_utils import multiprocess_func -from .. 
import attributes -from ..types import empty_2d -from ..utils import basetypes -from ..chunkedgraph import ChunkedGraph -from ..utils.generic import filter_failed_node_ids -from ..chunks.atomic import get_touching_atomic_chunks -from ..chunks.atomic import get_bounding_atomic_chunks +from ...graph import attributes +from ...graph.types import empty_2d +from ...graph.utils import basetypes +from ...graph.chunkedgraph import ChunkedGraph +from ...graph.utils.generic import filter_failed_node_ids +from ...graph.chunks.atomic import get_touching_atomic_chunks +from ...graph.chunks.atomic import get_bounding_atomic_chunks from ...utils.general import chunked From 7ac3b7a1a7a7cdb7c06210eb037e809392295dde Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 20:49:13 +0000 Subject: [PATCH 016/116] fix: reduce mem use; var names; remove unused code --- pychunkedgraph/ingest/cli.py | 74 ++++++------------- pychunkedgraph/ingest/cluster.py | 47 +----------- .../ingest/create/abstract_layers.py | 23 ++---- pychunkedgraph/ingest/manager.py | 6 +- tracker.py | 22 ------ 5 files changed, 38 insertions(+), 134 deletions(-) delete mode 100644 tracker.py diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 145c9bea6..486224cec 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-function-docstring, import-outside-toplevel +# pylint: disable=invalid-name, missing-function-docstring, unspecified-encoding """ cli for running ingest @@ -12,10 +12,14 @@ from flask.cli import AppGroup from rq import Queue +from .cluster import create_atomic_chunk +from .cluster import create_parent_chunk from .cluster import enqueue_atomic_tasks +from .cluster import randomize_grid_points from .manager import IngestionManager from .utils import bootstrap -from .cluster import randomize_grid_points +from .utils import chunk_id_str +from .create.abstract_layers import add_layer from 
..graph.chunkedgraph import ChunkedGraph from ..utils.redis import get_redis_connection from ..utils.redis import keys as r_keys @@ -90,7 +94,7 @@ def pickle_imanager(graph_id: str, dataset: click.Path, raw: bool): meta, ingest_config, _ = bootstrap(graph_id, config=config, raw=raw) imanager = IngestionManager(ingest_config, meta) - imanager.redis + imanager.redis # pylint: disable=pointless-statement @ingest_cli.command("layer") @@ -100,11 +104,6 @@ def queue_layer(parent_layer): Queue all chunk tasks at a given layer. Must be used when all the chunks at `parent_layer - 1` have completed. """ - from itertools import product - import numpy as np - from .cluster import create_parent_chunk - from .utils import chunk_id_str - assert parent_layer > 2, "This command is for layers 3 and above." redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) @@ -115,38 +114,15 @@ def queue_layer(parent_layer): bounds = imanager.cg_meta.layer_chunk_bounds[parent_layer] chunk_coords = randomize_grid_points(*bounds) - def get_chunks_not_done(coords: list) -> list: - """check for set membership in redis in batches""" - coords_strs = ["_".join(map(str, coord)) for coord in coords] - try: - completed = imanager.redis.smismember(f"{parent_layer}c", coords_strs) - except Exception: - return coords - return [coord for coord, c in zip(coords, completed) if not c] - - batch_size = int(environ.get("JOB_BATCH_SIZE", 10000)) - batches = chunked(chunk_coords, batch_size) - q = imanager.get_task_queue(f"l{parent_layer}") - - for batch in batches: - _coords = get_chunks_not_done(batch) - # buffer for optimal use of redis memory - if len(q) > int(environ.get("QUEUE_SIZE", 100000)): - interval = int(environ.get("QUEUE_INTERVAL", 300)) - sleep(interval) - - job_datas = [] - for chunk_coord in _coords: - job_datas.append( - Queue.prepare_data( - create_parent_chunk, - args=(parent_layer, chunk_coord), - result_ttl=0, - 
job_id=chunk_id_str(parent_layer, chunk_coord), - timeout=f"{int(parent_layer * parent_layer)}m", - ) - ) - q.enqueue_many(job_datas) + for coords in chunk_coords: + task_q = imanager.get_task_queue(f"l{parent_layer}") + task_q.enqueue( + create_parent_chunk, + job_id=chunk_id_str(parent_layer, coords), + job_timeout=f"{int(parent_layer * parent_layer)}m", + result_ttl=0, + args=(parent_layer, coords), + ) @ingest_cli.command("status") @@ -156,16 +132,16 @@ def ingest_status(): imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) layer = 2 - completed = redis.scard(f"{layer}c") - print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_chunk_counts[0]}") + done = redis.scard(f"{layer}c") + print(f"{layer}\t: {done} / {imanager.cg_meta.layer_chunk_counts[0]}") - completed = redis.scard(f"{layer}c-postprocess") - print(f"{layer}\t: {completed} / {imanager.cg_meta.layer_chunk_counts[0]} [postprocess]") + done = redis.scard(f"{layer}c-postprocess") + print(f"{layer}\t: {done} / {imanager.cg_meta.layer_chunk_counts[0]} [postprocess]") layers = range(3, imanager.cg_meta.layer_count + 1) for layer, layer_count in zip(layers, imanager.cg_meta.layer_chunk_counts[1:]): - completed = redis.scard(f"{layer}c") - print(f"{layer}\t: {completed} / {layer_count}") + done = redis.scard(f"{layer}c") + print(f"{layer}\t: {done} / {layer_count}") @ingest_cli.command("chunk") @@ -173,17 +149,13 @@ def ingest_status(): @click.argument("chunk_info", nargs=4, type=int) def ingest_chunk(queue: str, chunk_info): """Manually queue chunk when a job is stuck for whatever reason.""" - from .cluster import _create_atomic_chunk - from .cluster import create_parent_chunk - from .utils import chunk_id_str - redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) layer = chunk_info[0] coords = chunk_info[1:] queue = imanager.get_task_queue(queue) if layer == 2: - func = _create_atomic_chunk + func = create_atomic_chunk args = 
(coords,) else: func = create_parent_chunk diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index 2b7927869..9394c4e26 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -33,49 +33,6 @@ def _post_task_completion( imanager.redis.sadd(f"{layer}c{pprocess}", chunk_str) -def enqueue_parent_task( - parent_layer: int, - parent_coords: Sequence[int], -): - redis = get_redis_connection() - imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - parent_id_str = chunk_id_str(parent_layer, parent_coords) - parent_chunk_str = "_".join(map(str, parent_coords)) - - children_done = redis.scard(parent_id_str) - # if zero then this key was deleted and parent already queued. - if children_done == 0: - print("parent already queued.") - return - - # if the previous layer is complete - # no need to check children progress for each parent chunk - child_layer = parent_layer - 1 - child_layer_done = redis.scard(f"{child_layer}c") - child_layer_count = imanager.cg_meta.layer_chunk_counts[child_layer - 2] - child_layer_finished = child_layer_done == child_layer_count - - if not child_layer_finished: - children_count = int(redis.hget(parent_layer, parent_chunk_str).decode("utf-8")) - if children_done != children_count: - print("children not done.") - return - - queue = imanager.get_task_queue(f"l{parent_layer}") - queue.enqueue( - create_parent_chunk, - job_id=parent_id_str, - job_timeout=f"{int(parent_layer * parent_layer)}m", - result_ttl=0, - args=( - parent_layer, - parent_coords, - ), - ) - redis.hdel(parent_layer, parent_chunk_str) - redis.delete(parent_id_str) - - def create_parent_chunk( parent_layer: int, parent_coords: Sequence[int], @@ -137,7 +94,7 @@ def enqueue_atomic_tasks(imanager: IngestionManager, postprocess: bool = False): continue job_datas.append( RQueue.prepare_data( - _create_atomic_chunk, + create_atomic_chunk, args=(chunk_coord, postprocess), 
timeout=environ.get("L2JOB_TIMEOUT", "3m"), result_ttl=0, @@ -150,7 +107,7 @@ def enqueue_atomic_tasks(imanager: IngestionManager, postprocess: bool = False): q.enqueue_many(job_datas) -def _create_atomic_chunk(coords: Sequence[int], postprocess: bool = False): +def create_atomic_chunk(coords: Sequence[int], postprocess: bool = False): """Creates single atomic chunk""" redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index c5a78d2ca..8912a2d53 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -39,26 +39,20 @@ def add_layer( if not children_coords.size: children_coords = get_children_chunk_coords(cg.meta, layer_id, parent_coords) children_ids = _read_children_chunks(cg, layer_id, children_coords, n_threads > 1) - edge_ids = get_children_chunk_cross_edges( + cross_edges = get_children_chunk_cross_edges( cg, layer_id, parent_coords, use_threads=n_threads > 1 ) node_layers = cg.get_chunk_layers(children_ids) - edge_layers = cg.get_chunk_layers(np.unique(edge_ids)) + edge_layers = cg.get_chunk_layers(np.unique(cross_edges)) assert np.all(node_layers < layer_id), "invalid node layers" assert np.all(edge_layers < layer_id), "invalid edge layers" - # Extract connected components - # isolated_node_mask = ~np.in1d(children_ids, np.unique(edge_ids)) - # add_node_ids = children_ids[isolated_node_mask].squeeze() - add_edge_ids = np.vstack([children_ids, children_ids]).T - - edge_ids = list(edge_ids) - edge_ids.extend(add_edge_ids) - graph, _, _, graph_ids = flatgraph.build_gt_graph(edge_ids, make_directed=True) - ccs = flatgraph.connected_components(graph) - connected_components = [] - for cc in ccs: - connected_components.append(graph_ids[cc]) + + cross_edges = list(cross_edges) + cross_edges.extend(np.vstack([children_ids, children_ids]).T) # 
add self-edges + graph, _, _, graph_ids = flatgraph.build_gt_graph(cross_edges, make_directed=True) + raw_ccs = flatgraph.connected_components(graph) # connected components with indices + connected_components = [graph_ids[cc] for cc in raw_ccs] _write_connected_components( cg, @@ -68,7 +62,6 @@ def add_layer( get_valid_timestamp(time_stamp), n_threads > 1, ) - return f"{layer_id}_{'_'.join(map(str, parent_coords))}" def _read_children_chunks( diff --git a/pychunkedgraph/ingest/manager.py b/pychunkedgraph/ingest/manager.py index f5f870810..55e7d253f 100644 --- a/pychunkedgraph/ingest/manager.py +++ b/pychunkedgraph/ingest/manager.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-docstring + import pickle from . import IngestConfig @@ -15,7 +17,9 @@ def __init__(self, config: IngestConfig, chunkedgraph_meta: ChunkedGraphMeta): self._cg = None self._redis = None self._task_queues = {} - self.redis # initiate and cache info + + # initiate redis and cache info + self.redis # pylint: disable=pointless-statement @property def config(self): diff --git a/tracker.py b/tracker.py deleted file mode 100644 index d2ae63cb3..000000000 --- a/tracker.py +++ /dev/null @@ -1,22 +0,0 @@ -import sys -from rq import Connection, Worker - -# Preload libraries from pychunkedgraph.ingest.cluster -from typing import Sequence, Tuple - -import numpy as np - -from pychunkedgraph.ingest.utils import chunk_id_str -from pychunkedgraph.ingest.manager import IngestionManager -from pychunkedgraph.ingest.common import get_atomic_chunk_data -from pychunkedgraph.ingest.ran_agglomeration import get_active_edges -from pychunkedgraph.ingest.create.atomic_layer import add_atomic_edges -from pychunkedgraph.ingest.create.abstract_layers import add_layer -from pychunkedgraph.graph.meta import ChunkedGraphMeta -from pychunkedgraph.graph.chunks.hierarchy import get_children_chunk_coords -from pychunkedgraph.utils.redis import keys as r_keys -from pychunkedgraph.utils.redis import get_redis_connection - 
-qs = sys.argv[1:] -w = Worker(qs, connection=get_redis_connection()) -w.work() \ No newline at end of file From 1fe1e20e24bc0e4f98e515818896862e0b0698ce Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 21:06:45 +0000 Subject: [PATCH 017/116] fix: adds cg typehint --- pychunkedgraph/ingest/create/cross_edges.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pychunkedgraph/ingest/create/cross_edges.py b/pychunkedgraph/ingest/create/cross_edges.py index 481a5b6e5..78b7309fe 100644 --- a/pychunkedgraph/ingest/create/cross_edges.py +++ b/pychunkedgraph/ingest/create/cross_edges.py @@ -20,7 +20,7 @@ def get_children_chunk_cross_edges( - cg, layer, chunk_coord, *, use_threads=True + cg: ChunkedGraph, layer, chunk_coord, *, use_threads=True ) -> np.ndarray: """ Cross edges that connect children chunks. @@ -63,7 +63,7 @@ def _get_children_chunk_cross_edges_helper(args) -> None: edge_ids_shared.append(_get_children_chunk_cross_edges(cg, atomic_chunks, layer)) -def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: +def _get_children_chunk_cross_edges(cg: ChunkedGraph, atomic_chunks, layer) -> None: """ Non parallelized version Cross edges that connect children chunks. 
@@ -85,7 +85,7 @@ def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: def _read_atomic_chunk_cross_edges( - cg, chunk_coord: Sequence[int], cross_edge_layer: int + cg: ChunkedGraph, chunk_coord: Sequence[int], cross_edge_layer: int ) -> np.ndarray: """ Returns cross edges between l2 nodes in current chunk and @@ -110,7 +110,7 @@ def _read_atomic_chunk_cross_edges( def get_chunk_nodes_cross_edge_layer( - cg, layer: int, chunk_coord: Sequence[int], use_threads=True + cg: ChunkedGraph, layer: int, chunk_coord: Sequence[int], use_threads=True ) -> Dict: """ gets nodes in a chunk that are part of cross chunk edges @@ -155,7 +155,7 @@ def _get_chunk_nodes_cross_edge_layer_helper(args): node_layers_shared.append(np.fromiter(node_layer_d.values(), dtype=np.uint8)) -def _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer): +def _get_chunk_nodes_cross_edge_layer(cg: ChunkedGraph, atomic_chunks, layer): """ Non parallelized version gets nodes in a chunk that are part of cross chunk edges @@ -164,7 +164,9 @@ def _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer): """ atomic_node_layer_d = {} for atomic_chunk in atomic_chunks: - chunk_node_layer_d = _read_atomic_chunk_cross_edge_nodes(cg, atomic_chunk, layer) + chunk_node_layer_d = _read_atomic_chunk_cross_edge_nodes( + cg, atomic_chunk, layer + ) atomic_node_layer_d.update(chunk_node_layer_d) l2ids = np.fromiter(atomic_node_layer_d.keys(), dtype=basetypes.NODE_ID) @@ -177,7 +179,7 @@ def _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer): return node_layer_d -def _read_atomic_chunk_cross_edge_nodes(cg, chunk_coord, layer): +def _read_atomic_chunk_cross_edge_nodes(cg: ChunkedGraph, chunk_coord, layer): """ the lowest layer at which an l2 node is part of a cross edge """ @@ -217,7 +219,7 @@ def _find_min_layer(node_layer_d_shared, node_ids_shared, node_layers_shared): node_layer_d_shared[node_id] = min(layer, layers[i]) -def _read_atomic_chunk(cg, chunk_coord, layers): +def 
_read_atomic_chunk(cg: ChunkedGraph, chunk_coord, layers): """ read entire atomic chunk; all nodes and their relevant cross edges filter out invalid nodes generated by failed tasks From fd43a1a13ea7cd402fe09dfa097490b10c3a9dca Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 21:32:58 +0000 Subject: [PATCH 018/116] fix: reduce loc --- .../ingest/create/abstract_layers.py | 60 ++++++------------- pychunkedgraph/ingest/create/cross_edges.py | 2 +- 2 files changed, 20 insertions(+), 42 deletions(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 8912a2d53..31610aeab 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -49,9 +49,9 @@ def add_layer( assert np.all(edge_layers < layer_id), "invalid edge layers" cross_edges = list(cross_edges) - cross_edges.extend(np.vstack([children_ids, children_ids]).T) # add self-edges + cross_edges.extend(np.vstack([children_ids, children_ids]).T) # add self-edges graph, _, _, graph_ids = flatgraph.build_gt_graph(cross_edges, make_directed=True) - raw_ccs = flatgraph.connected_components(graph) # connected components with indices + raw_ccs = flatgraph.connected_components(graph) # connected components with indices connected_components = [graph_ids[cc] for cc in raw_ccs] _write_connected_components( @@ -119,42 +119,26 @@ def _read_chunk(children_ids_shared, cg: ChunkedGraph, layer_id: int, chunk_coor def _write_connected_components( - cg: ChunkedGraph, - layer_id: int, - parent_coords, - connected_components: list, - time_stamp, - use_threads=True, -) -> None: - if len(connected_components) == 0: + cg, layer, pcoords, components, cross_edges, time_stamp, use_threads=True +): + if len(components) == 0: return - node_layer_d_shared = {} - if layer_id < cg.meta.layer_count: - node_layer_d_shared = get_chunk_nodes_cross_edge_layer( - cg, layer_id, parent_coords, use_threads=use_threads 
- ) + node_layer_d = {} + if layer < cg.meta.layer_count: + node_layer_d = get_chunk_nodes_cross_edge_layer(cg, layer, pcoords, use_threads) if not use_threads: - _write( - cg, - layer_id, - parent_coords, - connected_components, - node_layer_d_shared, - time_stamp, - use_threads=use_threads, - ) + _write(cg, layer, pcoords, components, cross_edges, node_layer_d, time_stamp) return - task_size = int(math.ceil(len(connected_components) / mp.cpu_count() / 10)) - chunked_ccs = chunked(connected_components, task_size) + task_size = int(math.ceil(len(components) / mp.cpu_count() / 10)) + chunked_ccs = chunked(components, task_size) cg_info = cg.get_serialized_info() multi_args = [] for ccs in chunked_ccs: - multi_args.append( - (cg_info, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp) - ) + args = (cg_info, layer, pcoords, ccs, cross_edges, node_layer_d, time_stamp) + multi_args.append(args) mu.multiprocess_func( _write_components_helper, multi_args, @@ -163,26 +147,20 @@ def _write_connected_components( def _write_components_helper(args): - cg_info, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp = args + cg_info, layer, pcoords, ccs, cross_edges, node_layer_d, time_stamp = args cg = ChunkedGraph(**cg_info) - _write(cg, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp) + _write(cg, layer, pcoords, ccs, cross_edges, node_layer_d, time_stamp) def _write( - cg, - layer_id, - parent_coords, - connected_components, - node_layer_d_shared, - time_stamp, - use_threads=True, + cg, layer_id, parent_coords, components, cross_edges, node_layer_d, time_stamp ): parent_layer_ids = range(layer_id, cg.meta.layer_count + 1) cc_connections = {l: [] for l in parent_layer_ids} - for node_ids in connected_components: + for node_ids in components: layer = layer_id if len(node_ids) == 1: - layer = node_layer_d_shared.get(node_ids[0], cg.meta.layer_count) + layer = node_layer_d.get(node_ids[0], cg.meta.layer_count) 
cc_connections[layer].append(node_ids) rows = [] @@ -199,7 +177,7 @@ def _write( reserved_parent_ids = cg.id_client.create_node_ids( parent_chunk_id, size=len(cc_connections[parent_layer_id]), - root_chunk=parent_layer_id == cg.meta.layer_count and use_threads, + root_chunk=parent_layer_id == cg.meta.layer_count, ) for i_cc, node_ids in enumerate(cc_connections[parent_layer_id]): diff --git a/pychunkedgraph/ingest/create/cross_edges.py b/pychunkedgraph/ingest/create/cross_edges.py index 78b7309fe..b7a888b27 100644 --- a/pychunkedgraph/ingest/create/cross_edges.py +++ b/pychunkedgraph/ingest/create/cross_edges.py @@ -24,7 +24,7 @@ def get_children_chunk_cross_edges( ) -> np.ndarray: """ Cross edges that connect children chunks. - The edges are between node IDs in the given layer (not atomic). + The edges are between node IDs in the given layer. """ atomic_chunks = get_touching_atomic_chunks(cg.meta, layer, chunk_coord) if len(atomic_chunks) == 0: From d7bfc89558a169433884c565b608df7729569aa0 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 21:36:46 +0000 Subject: [PATCH 019/116] fix: use shorter name --- .../ingest/create/abstract_layers.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 31610aeab..107ac5714 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -39,18 +39,18 @@ def add_layer( if not children_coords.size: children_coords = get_children_chunk_coords(cg.meta, layer_id, parent_coords) children_ids = _read_children_chunks(cg, layer_id, children_coords, n_threads > 1) - cross_edges = get_children_chunk_cross_edges( + cx_edges = get_children_chunk_cross_edges( cg, layer_id, parent_coords, use_threads=n_threads > 1 ) node_layers = cg.get_chunk_layers(children_ids) - edge_layers = cg.get_chunk_layers(np.unique(cross_edges)) + 
edge_layers = cg.get_chunk_layers(np.unique(cx_edges)) assert np.all(node_layers < layer_id), "invalid node layers" assert np.all(edge_layers < layer_id), "invalid edge layers" - cross_edges = list(cross_edges) - cross_edges.extend(np.vstack([children_ids, children_ids]).T) # add self-edges - graph, _, _, graph_ids = flatgraph.build_gt_graph(cross_edges, make_directed=True) + cx_edges = list(cx_edges) + cx_edges.extend(np.vstack([children_ids, children_ids]).T) # add self-edges + graph, _, _, graph_ids = flatgraph.build_gt_graph(cx_edges, make_directed=True) raw_ccs = flatgraph.connected_components(graph) # connected components with indices connected_components = [graph_ids[cc] for cc in raw_ccs] @@ -59,6 +59,7 @@ def add_layer( layer_id, parent_coords, connected_components, + cx_edges, get_valid_timestamp(time_stamp), n_threads > 1, ) @@ -119,7 +120,7 @@ def _read_chunk(children_ids_shared, cg: ChunkedGraph, layer_id: int, chunk_coor def _write_connected_components( - cg, layer, pcoords, components, cross_edges, time_stamp, use_threads=True + cg, layer, pcoords, components, cx_edges, time_stamp, use_threads=True ): if len(components) == 0: return @@ -129,7 +130,7 @@ def _write_connected_components( node_layer_d = get_chunk_nodes_cross_edge_layer(cg, layer, pcoords, use_threads) if not use_threads: - _write(cg, layer, pcoords, components, cross_edges, node_layer_d, time_stamp) + _write(cg, layer, pcoords, components, cx_edges, node_layer_d, time_stamp) return task_size = int(math.ceil(len(components) / mp.cpu_count() / 10)) @@ -137,7 +138,7 @@ def _write_connected_components( cg_info = cg.get_serialized_info() multi_args = [] for ccs in chunked_ccs: - args = (cg_info, layer, pcoords, ccs, cross_edges, node_layer_d, time_stamp) + args = (cg_info, layer, pcoords, ccs, cx_edges, node_layer_d, time_stamp) multi_args.append(args) mu.multiprocess_func( _write_components_helper, @@ -147,14 +148,12 @@ def _write_connected_components( def _write_components_helper(args): - 
cg_info, layer, pcoords, ccs, cross_edges, node_layer_d, time_stamp = args + cg_info, layer, pcoords, ccs, cx_edges, node_layer_d, time_stamp = args cg = ChunkedGraph(**cg_info) - _write(cg, layer, pcoords, ccs, cross_edges, node_layer_d, time_stamp) + _write(cg, layer, pcoords, ccs, cx_edges, node_layer_d, time_stamp) -def _write( - cg, layer_id, parent_coords, components, cross_edges, node_layer_d, time_stamp -): +def _write(cg, layer_id, parent_coords, components, cx_edges, node_layer_d, time_stamp): parent_layer_ids = range(layer_id, cg.meta.layer_count + 1) cc_connections = {l: [] for l in parent_layer_ids} for node_ids in components: From 92951560a37499e4928387acf8086073d5afe003 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 22:13:44 +0000 Subject: [PATCH 020/116] feat: cache cx edges at each layer --- .../ingest/create/abstract_layers.py | 49 ++++++++++++------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 107ac5714..148a370ba 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -9,6 +9,7 @@ import multiprocessing as mp from typing import Optional from typing import Sequence +from collections import defaultdict import numpy as np from multiwrapper import multiprocessing_utils as mu @@ -153,7 +154,15 @@ def _write_components_helper(args): _write(cg, layer, pcoords, ccs, cx_edges, node_layer_d, time_stamp) -def _write(cg, layer_id, parent_coords, components, cx_edges, node_layer_d, time_stamp): +def _write( + cg: ChunkedGraph, + layer_id, + parent_coords, + components, + cx_edges, + node_layer_d, + time_stamp, +): parent_layer_ids = range(layer_id, cg.meta.layer_count + 1) cc_connections = {l: [] for l in parent_layer_ids} for node_ids in components: @@ -180,24 +189,28 @@ def _write(cg, layer_id, parent_coords, components, cx_edges, node_layer_d, time 
) for i_cc, node_ids in enumerate(cc_connections[parent_layer_id]): - parent_id = reserved_parent_ids[i_cc] - for node_id in node_ids: - rows.append( - cg.client.mutate_row( - serializers.serialize_uint64(node_id), - {attributes.Hierarchy.Parent: parent_id}, - time_stamp=time_stamp, - ) - ) - - rows.append( - cg.client.mutate_row( - serializers.serialize_uint64(parent_id), - {attributes.Hierarchy.Child: node_ids}, - time_stamp=time_stamp, - ) - ) + node_cx_edges_d = defaultdict(lambda: types.empty_2d) + for node in node_ids: + mask0 = cx_edges[:, 0] == node + mask1 = cx_edges[:, 1] == node + node_cx_edges_d[node] = cx_edges[mask0 | mask1] + parent_id = reserved_parent_ids[i_cc] + for node in node_ids: + row_id = serializers.serialize_uint64(node) + val_dict = {attributes.Hierarchy.Parent: parent_id} + + node_cx_edges = node_cx_edges_d[node] + cx_layers = cg.get_cross_chunk_edges_layer(node_cx_edges) + for layer in set(cx_layers): + layer_mask = cx_layers == layer + col = attributes.Connectivity.CrossChunkEdge[layer] + val_dict[col] = node_cx_edges[layer_mask] + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) + + row_id = serializers.serialize_uint64(parent_id) + val_dict = {attributes.Hierarchy.Child: node_ids} + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) if len(rows) > 100000: cg.client.write(rows) rows = [] From 4e5b682520a967a913ea4963927b23a577594092 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 22:24:49 +0000 Subject: [PATCH 021/116] fix: convert array type --- pychunkedgraph/ingest/create/abstract_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 148a370ba..f1341419d 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -176,7 +176,7 @@ def _write( parent_chunk_id = cg.get_chunk_id(layer=layer_id, x=x, y=y, 
z=z) parent_chunk_id_dict = cg.get_parent_chunk_id_dict(parent_chunk_id) - # Iterate through layers + cx_edges = np.array(cx_edges, dtype=basetypes.NODE_ID) for parent_layer_id in parent_layer_ids: if len(cc_connections[parent_layer_id]) == 0: continue From ad107eef0bbdbc7e323bb24c9bc8a82edc2ebc25 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 22:50:51 +0000 Subject: [PATCH 022/116] fix: use atomic edges during ingest --- pychunkedgraph/graph/cache.py | 1 + pychunkedgraph/ingest/create/cross_edges.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/graph/cache.py b/pychunkedgraph/graph/cache.py index f60b6ca92..8c824c732 100644 --- a/pychunkedgraph/graph/cache.py +++ b/pychunkedgraph/graph/cache.py @@ -1,3 +1,4 @@ +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel """ Cache nodes, parents, children and cross edges. """ diff --git a/pychunkedgraph/ingest/create/cross_edges.py b/pychunkedgraph/ingest/create/cross_edges.py index b7a888b27..c7f45e9eb 100644 --- a/pychunkedgraph/ingest/create/cross_edges.py +++ b/pychunkedgraph/ingest/create/cross_edges.py @@ -91,7 +91,7 @@ def _read_atomic_chunk_cross_edges( Returns cross edges between l2 nodes in current chunk and l1 supervoxels from neighbor chunks. 
""" - cross_edge_col = attributes.Connectivity.CrossChunkEdge[cross_edge_layer] + cross_edge_col = attributes.Connectivity.AtomicCrossChunkEdge[cross_edge_layer] range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, [cross_edge_layer]) parent_neighboring_chunk_supervoxels_d = defaultdict(list) @@ -188,7 +188,7 @@ def _read_atomic_chunk_cross_edge_nodes(cg: ChunkedGraph, chunk_coord, layer): range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, relevant_layers) for l2id in l2ids: for layer in relevant_layers: - if attributes.Connectivity.CrossChunkEdge[layer] in range_read[l2id]: + if attributes.Connectivity.AtomicCrossChunkEdge[layer] in range_read[l2id]: node_layer_d[l2id] = layer break return node_layer_d @@ -229,7 +229,7 @@ def _read_atomic_chunk(cg: ChunkedGraph, chunk_coord, layers): range_read = cg.range_read_chunk( cg.get_chunk_id(layer=2, x=x, y=y, z=z), properties=[child_col] - + [attributes.Connectivity.CrossChunkEdge[l] for l in layers], + + [attributes.Connectivity.AtomicCrossChunkEdge[l] for l in layers], ) row_ids = [] From 1811f36f5e24de54f81fdf59506672901bdd6b3e Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 23:23:18 +0000 Subject: [PATCH 023/116] fix: tests --- pychunkedgraph/ingest/create/abstract_layers.py | 5 +++-- pychunkedgraph/ingest/create/cross_edges.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index f1341419d..63b613ae6 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -131,7 +131,7 @@ def _write_connected_components( node_layer_d = get_chunk_nodes_cross_edge_layer(cg, layer, pcoords, use_threads) if not use_threads: - _write(cg, layer, pcoords, components, cx_edges, node_layer_d, time_stamp) + _write(cg, layer, pcoords, components, cx_edges, node_layer_d, time_stamp, use_threads) return task_size = 
int(math.ceil(len(components) / mp.cpu_count() / 10)) @@ -162,6 +162,7 @@ def _write( cx_edges, node_layer_d, time_stamp, + use_threads=True, ): parent_layer_ids = range(layer_id, cg.meta.layer_count + 1) cc_connections = {l: [] for l in parent_layer_ids} @@ -185,7 +186,7 @@ def _write( reserved_parent_ids = cg.id_client.create_node_ids( parent_chunk_id, size=len(cc_connections[parent_layer_id]), - root_chunk=parent_layer_id == cg.meta.layer_count, + root_chunk=parent_layer_id == cg.meta.layer_count and use_threads, ) for i_cc, node_ids in enumerate(cc_connections[parent_layer_id]): diff --git a/pychunkedgraph/ingest/create/cross_edges.py b/pychunkedgraph/ingest/create/cross_edges.py index c7f45e9eb..5f0ebf8df 100644 --- a/pychunkedgraph/ingest/create/cross_edges.py +++ b/pychunkedgraph/ingest/create/cross_edges.py @@ -184,7 +184,7 @@ def _read_atomic_chunk_cross_edge_nodes(cg: ChunkedGraph, chunk_coord, layer): the lowest layer at which an l2 node is part of a cross edge """ node_layer_d = {} - relevant_layers = range(layer, cg.meta.layer_count + 1) + relevant_layers = range(layer, cg.meta.layer_count) range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, relevant_layers) for l2id in l2ids: for layer in relevant_layers: From 176ea2f2e374d17c6ded0d750f468cd281427cbc Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 23:55:02 +0000 Subject: [PATCH 024/116] fix: remove postprocess step --- pychunkedgraph/ingest/cli.py | 22 +--------- pychunkedgraph/ingest/cluster.py | 31 +++++-------- pychunkedgraph/ingest/create/atomic_layer.py | 46 -------------------- 3 files changed, 12 insertions(+), 87 deletions(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 486224cec..2ad51ca18 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -67,16 +67,6 @@ def ingest_graph( enqueue_atomic_tasks(IngestionManager(ingest_config, meta)) -@ingest_cli.command("postprocess") -def postprocess(): - """ - Run 
postprocessing step on level 2 chunks. - """ - redis = get_redis_connection() - imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - enqueue_atomic_tasks(imanager, postprocess=True) - - @ingest_cli.command("imanager") @click.argument("graph_id", type=str) @click.argument("dataset", type=click.Path(exists=True)) @@ -130,16 +120,8 @@ def ingest_status(): """Print ingest status to console by layer.""" redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - - layer = 2 - done = redis.scard(f"{layer}c") - print(f"{layer}\t: {done} / {imanager.cg_meta.layer_chunk_counts[0]}") - - done = redis.scard(f"{layer}c-postprocess") - print(f"{layer}\t: {done} / {imanager.cg_meta.layer_chunk_counts[0]} [postprocess]") - - layers = range(3, imanager.cg_meta.layer_count + 1) - for layer, layer_count in zip(layers, imanager.cg_meta.layer_chunk_counts[1:]): + layers = range(2, imanager.cg_meta.layer_count + 1) + for layer, layer_count in zip(layers, imanager.cg_meta.layer_chunk_counts): done = redis.scard(f"{layer}c") print(f"{layer}\t: {done} / {layer_count}") diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index 9394c4e26..b952ae0ba 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -13,7 +13,6 @@ from .common import get_atomic_chunk_data from .ran_agglomeration import get_active_edges from .create.atomic_layer import add_atomic_edges -from .create.atomic_layer import postprocess_atomic_chunk from .create.abstract_layers import add_layer from ..graph.meta import ChunkedGraphMeta from ..graph.chunks.hierarchy import get_children_chunk_coords @@ -25,12 +24,10 @@ def _post_task_completion( imanager: IngestionManager, layer: int, coords: np.ndarray, - postprocess: bool = False, ): chunk_str = "_".join(map(str, coords)) # mark chunk as completed - "c" - pprocess = "-postprocess" if postprocess else "" - 
imanager.redis.sadd(f"{layer}c{pprocess}", chunk_str) + imanager.redis.sadd(f"{layer}c", chunk_str) def create_parent_chunk( @@ -59,7 +56,7 @@ def randomize_grid_points(X: int, Y: int, Z: int) -> Tuple[int, int, int]: yield np.unravel_index(index, (X, Y, Z)) -def enqueue_atomic_tasks(imanager: IngestionManager, postprocess: bool = False): +def enqueue_atomic_tasks(imanager: IngestionManager): from os import environ from time import sleep from rq import Queue as RQueue @@ -72,12 +69,7 @@ def enqueue_atomic_tasks(imanager: IngestionManager, postprocess: bool = False): chunk_count = imanager.cg_meta.layer_chunk_counts[0] print(f"total chunk count: {chunk_count}, queuing...") - pprocess = "" - if postprocess: - pprocess = "-postprocess" - print("postprocessing l2 chunks") - - queue_name = f"{imanager.config.CLUSTER.ATOMIC_Q_NAME}{pprocess}" + queue_name = f"{imanager.config.CLUSTER.ATOMIC_Q_NAME}" q = imanager.get_task_queue(queue_name) job_datas = [] batch_size = int(environ.get("L2JOB_BATCH_SIZE", 1000)) @@ -89,13 +81,13 @@ def enqueue_atomic_tasks(imanager: IngestionManager, postprocess: bool = False): x, y, z = chunk_coord chunk_str = f"{x}_{y}_{z}" - if imanager.redis.sismember(f"2c{pprocess}", chunk_str): + if imanager.redis.sismember(f"2c", chunk_str): # already done, skip continue job_datas.append( RQueue.prepare_data( create_atomic_chunk, - args=(chunk_coord, postprocess), + args=(chunk_coord,), timeout=environ.get("L2JOB_TIMEOUT", "3m"), result_ttl=0, job_id=chunk_id_str(2, chunk_coord), @@ -107,18 +99,15 @@ def enqueue_atomic_tasks(imanager: IngestionManager, postprocess: bool = False): q.enqueue_many(job_datas) -def create_atomic_chunk(coords: Sequence[int], postprocess: bool = False): +def create_atomic_chunk(coords: Sequence[int]): """Creates single atomic chunk""" redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) coords = np.array(list(coords), dtype=int) - if postprocess: - 
postprocess_atomic_chunk(imanager.cg, coords) - else: - chunk_edges_all, mapping = get_atomic_chunk_data(imanager, coords) - chunk_edges_active, isolated_ids = get_active_edges(chunk_edges_all, mapping) - add_atomic_edges(imanager.cg, coords, chunk_edges_active, isolated=isolated_ids) + chunk_edges_all, mapping = get_atomic_chunk_data(imanager, coords) + chunk_edges_active, isolated_ids = get_active_edges(chunk_edges_all, mapping) + add_atomic_edges(imanager.cg, coords, chunk_edges_active, isolated=isolated_ids) if imanager.config.TEST_RUN: # print for debugging @@ -126,7 +115,7 @@ def create_atomic_chunk(coords: Sequence[int], postprocess: bool = False): print(k, len(v)) for k, v in chunk_edges_active.items(): print(f"active_{k}", len(v)) - _post_task_completion(imanager, 2, coords, postprocess=postprocess) + _post_task_completion(imanager, 2, coords) def _get_test_chunks(meta: ChunkedGraphMeta): diff --git a/pychunkedgraph/ingest/create/atomic_layer.py b/pychunkedgraph/ingest/create/atomic_layer.py index 42b6a01b5..054a82840 100644 --- a/pychunkedgraph/ingest/create/atomic_layer.py +++ b/pychunkedgraph/ingest/create/atomic_layer.py @@ -151,49 +151,3 @@ def _get_outgoing_edges(node_id, chunk_edges_d, sparse_indices, remapping): # edges that this node is part of chunk_out_edges = np.concatenate([chunk_out_edges, edges[row_ids]]) return chunk_out_edges - - -def postprocess_atomic_chunk( - cg: ChunkedGraph, - chunk_coord: np.ndarray, - time_stamp: Optional[datetime.datetime] = None, -): - time_stamp = get_valid_timestamp(time_stamp) - - chunk_id = cg.get_chunk_id( - layer=2, x=chunk_coord[0], y=chunk_coord[1], z=chunk_coord[2] - ) - - properties = [ - attributes.Connectivity.AtomicCrossChunkEdge[l] for l in range(2, cg.meta.layer_count) - ] - - chunk_rr = cg.range_read_chunk( - chunk_id, properties=properties, time_stamp=time_stamp - ) - - result = {} - for l2id, raw_cx_edges in chunk_rr.items(): - try: - cx_edges = { - prop.index: val[0].value.copy() for prop, val 
in raw_cx_edges.items() - } - result[l2id] = cx_edges - except KeyError: - continue - - nodes = [] - val_dicts = [] - for l2id, cx_edges in result.items(): - val_dict = {} - for layer, edges in cx_edges.items(): - l2_edges = np.zeros_like(edges) - l2_edges[:, 0] = l2id - l2_edges[:, 1] = cg.get_parents(edges[:, 1]) - col = attributes.Connectivity.CrossChunkEdge[layer] - val_dict[col] = np.unique(l2_edges, axis=0) - val_dicts.append(val_dict) - - r_key = serializers.serialize_uint64(l2id) - nodes.append(cg.client.mutate_row(r_key, val_dict, time_stamp=time_stamp)) - cg.client.write(nodes) From 49d0b955fb6df8d156f010e50bacdc5f881f0f2a Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 20 Aug 2023 23:57:00 +0000 Subject: [PATCH 025/116] fix: raises specific error --- pychunkedgraph/ingest/cluster.py | 2 +- pychunkedgraph/ingest/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index b952ae0ba..a5c6a9861 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -81,7 +81,7 @@ def enqueue_atomic_tasks(imanager: IngestionManager): x, y, z = chunk_coord chunk_str = f"{x}_{y}_{z}" - if imanager.redis.sismember(f"2c", chunk_str): + if imanager.redis.sismember("2c", chunk_str): # already done, skip continue job_datas.append( diff --git a/pychunkedgraph/ingest/utils.py b/pychunkedgraph/ingest/utils.py index fa7ef7a3c..1c3236561 100644 --- a/pychunkedgraph/ingest/utils.py +++ b/pychunkedgraph/ingest/utils.py @@ -1,6 +1,6 @@ +# pylint: disable=invalid-name, missing-docstring from typing import Tuple - from . import ClusterIngestConfig from . 
import IngestConfig from ..graph.meta import ChunkedGraphMeta @@ -72,4 +72,4 @@ def postprocess_edge_data(im, edge_dict): return new_edge_dict else: - raise Exception(f"Unknown data_version: {data_version}") + raise ValueError(f"Unknown data_version: {data_version}") From d59428958bc9b93b738aa5bd90bb11a94cc83811 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 21 Aug 2023 00:17:51 +0000 Subject: [PATCH 026/116] fix: removes dangerous default value --- pychunkedgraph/graph/chunkedgraph.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 210bff50b..2630d8250 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -551,7 +551,7 @@ def get_subgraph( node_id_or_ids: typing.Union[np.uint64, typing.Iterable], bbox: typing.Optional[typing.Sequence[typing.Sequence[int]]] = None, bbox_is_coordinate: bool = False, - return_layers: typing.List = [2], + return_layers: typing.List = None, nodes_only: bool = False, edges_only: bool = False, leaves_only: bool = False, @@ -563,6 +563,9 @@ def get_subgraph( from .subgraph import get_subgraph_nodes from .subgraph import get_subgraph_edges_and_leaves + if return_layers is None: + return_layers = [2] + if nodes_only: return get_subgraph_nodes( self, @@ -581,7 +584,7 @@ def get_subgraph_nodes( node_id_or_ids: typing.Union[np.uint64, typing.Iterable], bbox: typing.Optional[typing.Sequence[typing.Sequence[int]]] = None, bbox_is_coordinate: bool = False, - return_layers: typing.List = [2], + return_layers: typing.List = None, serializable: bool = False, return_flattened: bool = False, ) -> typing.Tuple[typing.Dict, typing.Dict, Edges]: @@ -591,6 +594,9 @@ def get_subgraph_nodes( """ from .subgraph import get_subgraph_nodes + if return_layers is None: + return_layers = [2] + return get_subgraph_nodes( self, node_id_or_ids, From 32ee750c18373efdc589d1f7e1111bbded7dbb30 Mon Sep 17 
00:00:00 2001 From: Akhilesh Halageri Date: Mon, 21 Aug 2023 01:02:21 +0000 Subject: [PATCH 027/116] wip: read from cached edges --- pychunkedgraph/graph/attributes.py | 2 +- pychunkedgraph/graph/chunkedgraph.py | 87 ++++++++-------------------- pychunkedgraph/graph/edges/utils.py | 39 +------------ pychunkedgraph/graph/edits.py | 11 +--- pychunkedgraph/graph/operation.py | 4 +- pychunkedgraph/graph/subgraph.py | 36 +++++------- 6 files changed, 49 insertions(+), 130 deletions(-) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index b0f18c2ec..958913119 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -111,7 +111,7 @@ class Connectivity: ) CrossChunkEdge = _AttributeArray( - pattern=b"cross_edge_%d", + pattern=b"cross_edges_%d", family_id="4", serializer=serializers.NumPyArray( dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 2630d8250..83c543b6e 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-docstring, too-many-lines, import-outside-toplevel +# pylint: disable=invalid-name, missing-docstring, too-many-lines, import-outside-toplevel, unsupported-binary-operation import time import typing @@ -112,13 +112,15 @@ def range_read_chunk( """Read all nodes in a chunk.""" layer = self.get_chunk_layer(chunk_id) root_chunk = layer == self.meta.layer_count - max_node_id = self.id_client.get_max_node_id(chunk_id=chunk_id, root_chunk=root_chunk) + max_id = self.id_client.get_max_node_id( + chunk_id=chunk_id, root_chunk=root_chunk + ) if layer == 1: - max_node_id = chunk_id | self.get_segment_id_limit(chunk_id) # pylint: disable=unsupported-binary-operation + max_id = chunk_id | self.get_segment_id_limit(chunk_id) return self.client.read_nodes( start_id=self.get_node_id(np.uint64(0), 
chunk_id=chunk_id), - end_id=max_node_id, + end_id=max_id, end_id_inclusive=True, properties=properties, end_time=time_stamp, @@ -293,7 +295,7 @@ def _get_children_multiple( def get_atomic_cross_edges( self, l2_ids: typing.Iterable, *, raw_only=False ) -> typing.Dict[np.uint64, typing.Dict[int, typing.Iterable]]: - """Returns cross edges for level 2 IDs.""" + """Returns atomic cross edges for level 2 IDs.""" if raw_only or not self.cache: node_edges_d_d = self.client.read_nodes( node_ids=l2_ids, @@ -314,67 +316,30 @@ def get_atomic_cross_edges( return result return self.cache.atomic_cross_edges_multiple(l2_ids) - def get_cross_chunk_edges( - self, node_ids: typing.Iterable, uplift=True, all_layers=False - ) -> typing.Dict[np.uint64, typing.Dict[int, typing.Iterable]]: + def get_cross_chunk_edges(self, node_ids: typing.Iterable) -> typing.Dict: """ - Cross chunk edges for `node_id` at `node_layer`. - The edges are between node IDs at the `node_layer`, not atomic cross edges. - Returns dict {layer_id: cross_edges} - The first layer (>= `node_layer`) with atleast one cross chunk edge. - For current use-cases, other layers are not relevant. - - For performance, only children that lie along chunk boundary are considered. - Cross edges that belong to inner level 2 IDs are subsumed within the chunk. - This is because cross edges are stored only in level 2 IDs. + Returns cross edges for `node_ids`. 
+ A dict of the form `{node_id: {layer: cross_edges}}` """ result = {} node_ids = np.array(node_ids, dtype=basetypes.NODE_ID) - if not node_ids.size: + if node_ids.size == 0: return result - - node_l2ids_d = {} - layers_ = self.get_chunk_layers(node_ids) - for l in set(layers_): - node_l2ids_d.update(self._get_bounding_l2_children(node_ids[layers_ == l])) - l2_edges_d_d = self.get_atomic_cross_edges( - np.concatenate(list(node_l2ids_d.values())) - ) - for node_id in node_ids: - l2_edges_ds = [l2_edges_d_d[l2_id] for l2_id in node_l2ids_d[node_id]] - if all_layers: - result[node_id] = edge_utils.concatenate_cross_edge_dicts(l2_edges_ds) - else: - result[node_id] = self._get_min_layer_cross_edges( - node_id, l2_edges_ds, uplift=uplift - ) + attrs = [ + attributes.Connectivity.CrossChunkEdge[l] + for l in range(2, self.meta.layer_count) + ] + node_edges_d_d = self.client.read_nodes(node_ids=node_ids, properties=attrs) + for id_ in node_ids: + try: + result[id_] = { + prop.index: val[0].value.copy() + for prop, val in node_edges_d_d[id_].items() + } + except KeyError: + result[id_] = {} return result - def _get_min_layer_cross_edges( - self, - node_id: basetypes.NODE_ID, - l2id_atomic_cross_edges_ds: typing.Iterable, - uplift=True, - ) -> typing.Dict[int, typing.Iterable]: - """ - Find edges at relevant min_layer >= node_layer. - `l2id_atomic_cross_edges_ds` is a list of atomic cross edges of - level 2 IDs that are descendants of `node_id`. 
- """ - min_layer, edges = edge_utils.filter_min_layer_cross_edges_multiple( - self.meta, l2id_atomic_cross_edges_ds, self.get_chunk_layer(node_id) - ) - if self.get_chunk_layer(node_id) < min_layer: - # cross edges irrelevant - return {self.get_chunk_layer(node_id): types.empty_2d} - if not uplift: - return {min_layer: edges} - node_root_id = node_id - node_root_id = self.get_root(node_id, stop_layer=min_layer, ceil=False) - edges[:, 0] = node_root_id - edges[:, 1] = self.get_roots(edges[:, 1], stop_layer=min_layer, ceil=False) - return {min_layer: np.unique(edges, axis=0) if edges.size else types.empty_2d} - def get_roots( self, node_ids: typing.Sequence[np.uint64], @@ -698,9 +663,7 @@ def get_l2_agglomerations( sv_parent_d.update(dict(zip(svs.tolist(), [l2id] * len(svs)))) in_edges, out_edges, cross_edges = edge_utils.categorize_edges_v2( - self.meta, - all_chunk_edges, - sv_parent_d + self.meta, all_chunk_edges, sv_parent_d ) agglomeration_d = get_agglomerations( diff --git a/pychunkedgraph/graph/edges/utils.py b/pychunkedgraph/graph/edges/utils.py index 034ca6ebc..94641343a 100644 --- a/pychunkedgraph/graph/edges/utils.py +++ b/pychunkedgraph/graph/edges/utils.py @@ -8,16 +8,17 @@ from typing import Tuple from typing import Iterable from typing import Optional +from collections import defaultdict import fastremap import numpy as np from . import Edges from . 
import EDGE_TYPES -from ..types import empty_2d from ..utils import basetypes from ..chunks import utils as chunk_utils from ..meta import ChunkedGraphMeta +from ...utils.general import in2d def concatenate_chunk_edges(chunk_edge_dicts: Iterable) -> Dict: @@ -47,10 +48,7 @@ def concatenate_chunk_edges(chunk_edge_dicts: Iterable) -> Dict: def concatenate_cross_edge_dicts(edges_ds: Iterable[Dict]) -> Dict: """Combines cross chunk edge dicts of form {layer id : edge list}.""" - from collections import defaultdict - result_d = defaultdict(list) - for edges_d in edges_ds: for layer, edges in edges_d.items(): result_d[layer].append(edges) @@ -152,40 +150,7 @@ def get_cross_chunk_edges_layer(meta: ChunkedGraphMeta, cross_edges: Iterable): return cross_chunk_edge_layers -def filter_min_layer_cross_edges( - meta: ChunkedGraphMeta, cross_edges_d: Dict, node_layer: int = 2 -) -> Tuple[int, Iterable]: - """ - Given a dict of cross chunk edges {layer: edges} - Return the first layer with cross edges. - """ - for layer in range(node_layer, meta.layer_count): - edges_ = cross_edges_d.get(layer, empty_2d) - if edges_.size: - return (layer, edges_) - return (meta.layer_count, edges_) - - -def filter_min_layer_cross_edges_multiple( - meta: ChunkedGraphMeta, l2id_atomic_cross_edges_ds: Iterable, node_layer: int = 2 -) -> Tuple[int, Iterable]: - """ - Given a list of dicts of cross chunk edges [{layer: edges}] - Return the first layer with cross edges. 
- """ - min_layer = meta.layer_count - for edges_d in l2id_atomic_cross_edges_ds: - layer_, _ = filter_min_layer_cross_edges(meta, edges_d, node_layer=node_layer) - min_layer = min(min_layer, layer_) - edges = [empty_2d] - for edges_d in l2id_atomic_cross_edges_ds: - edges.append(edges_d.get(min_layer, empty_2d)) - return min_layer, np.concatenate(edges) - - def get_edges_status(cg, edges: Iterable, time_stamp: Optional[float] = None): - from ...utils.general import in2d - coords0 = chunk_utils.get_chunk_coordinates_multiple(cg.meta, edges[:, 0]) coords1 = chunk_utils.get_chunk_coordinates_multiple(cg.meta, edges[:, 1]) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 4cb536ea7..6d823e720 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -142,7 +142,6 @@ def check_fake_edges( ) ) assert len(roots) == 2, "edges must be from 2 roots" - print("found inactive", len(inactive_edges)) return inactive_edges, [] rows = [] @@ -177,7 +176,6 @@ def check_fake_edges( time_stamp=time_stamp, ) ) - print("no inactive", len(atomic_edges)) return atomic_edges, rows @@ -249,8 +247,7 @@ def _process_l2_agglomeration( atomic_cross_edges_d: Dict[int, np.ndarray], ): """ - For a given L2 id, remove given edges - and calculate new connected components. + For a given L2 id, remove given edges; calculate new connected components. 
""" chunk_edges = agg.in_edges.get_pairs() cross_edges = np.concatenate([types.empty_2d, *atomic_cross_edges_d.values()]) @@ -312,7 +309,7 @@ def remove_edges( ccs, graph_ids, cross_edges = _process_l2_agglomeration( l2_agg, removed_edges, atomic_cross_edges_d[id_] ) - # calculated here to avoid repeat computation in loop + # done here to avoid repeat computation in loop cross_edge_layers = cg.get_cross_chunk_edges_layer(cross_edges) new_parent_ids = cg.id_client.create_node_ids( l2id_chunk_id_d[l2_agg.node_id], len(ccs) @@ -413,9 +410,7 @@ def _get_connected_components( self.cg.graph_id, self._operation_id, ): - self._cross_edges_d.update( - self.cg.get_cross_chunk_edges(not_cached, all_layers=True) - ) + self._cross_edges_d.update(self.cg.get_cross_chunk_edges(not_cached)) sv_parent_d, sv_cross_edges = self._map_sv_to_parent(node_ids, layer) get_sv_parents = np.vectorize(sv_parent_d.get, otypes=[np.uint64]) diff --git a/pychunkedgraph/graph/operation.py b/pychunkedgraph/graph/operation.py index d0d0e172a..b864a2d0d 100644 --- a/pychunkedgraph/graph/operation.py +++ b/pychunkedgraph/graph/operation.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-docstring, too-many-lines, protected-access +# pylint: disable=invalid-name, missing-docstring, too-many-lines, protected-access, broad_exception_raised from abc import ABC, abstractmethod from collections import namedtuple @@ -469,7 +469,7 @@ def execute( exception=repr(err), ) self.cg.client.write([log_record_error]) - raise Exception(err) + raise Exception(err) from err with TimeIt(f"{op_type}.write", self.cg.graph_id, lock.operation_id): result = self._write( diff --git a/pychunkedgraph/graph/subgraph.py b/pychunkedgraph/graph/subgraph.py index ab2593175..5b50b7c43 100644 --- a/pychunkedgraph/graph/subgraph.py +++ b/pychunkedgraph/graph/subgraph.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-docstring + from typing import List from typing import Dict from typing import Tuple @@ -30,9 +32,7 @@ 
def __init__(self, meta, node_ids, return_layers, serializable): # "Frontier" of nodes that cg.get_children will be called on self.cur_nodes = np.array(list(node_ids), dtype=np.uint64) # Mapping of current frontier to self.node_ids - self.cur_nodes_to_original_nodes = dict( - zip(self.cur_nodes, self.cur_nodes) - ) + self.cur_nodes_to_original_nodes = dict(zip(self.cur_nodes, self.cur_nodes)) self.stop_layer = max(1, min(return_layers)) self.create_initial_node_to_subgraph() @@ -107,13 +107,11 @@ def flatten_subgraph(self): for node_id in self.node_ids: for return_layer in self.return_layers: node_key = self.get_dict_key(node_id) - children_at_layer = self.node_to_subgraph[node_key][ - return_layer - ] + children_at_layer = self.node_to_subgraph[node_key][return_layer] if len(children_at_layer) > 0: - self.node_to_subgraph[node_key][ - return_layer - ] = np.concatenate(children_at_layer) + self.node_to_subgraph[node_key][return_layer] = np.concatenate( + children_at_layer + ) else: self.node_to_subgraph[node_key][return_layer] = empty_1d @@ -123,10 +121,12 @@ def get_subgraph_nodes( node_id_or_ids: Union[np.uint64, Iterable], bbox: Optional[Sequence[Sequence[int]]] = None, bbox_is_coordinate: bool = False, - return_layers: List = [2], + return_layers: List = None, serializable: bool = False, - return_flattened: bool = False + return_flattened: bool = False, ) -> Tuple[Dict, Dict, Edges]: + if return_layers is None: + return_layers = [2] single = False node_ids = node_id_or_ids bbox = normalize_bounding_box(cg.meta, bbox, bbox_is_coordinate) @@ -139,7 +139,7 @@ def get_subgraph_nodes( bounding_box=bbox, return_layers=return_layers, serializable=serializable, - return_flattened=return_flattened + return_flattened=return_flattened, ) if single: if serializable: @@ -183,7 +183,7 @@ def _get_subgraph_multiple_nodes( bounding_box: Optional[Sequence[Sequence[int]]], return_layers: Sequence[int], serializable: bool = False, - return_flattened: bool = False + 
return_flattened: bool = False, ): from collections import ChainMap from multiwrapper.multiprocessing_utils import n_cpus @@ -223,9 +223,7 @@ def _get_subgraph_multiple_nodes_threaded( subgraph = SubgraphProgress(cg.meta, node_ids, return_layers, serializable) while not subgraph.done_processing(): - this_n_threads = min( - [int(len(subgraph.cur_nodes) // 50000) + 1, n_cpus] - ) + this_n_threads = min([int(len(subgraph.cur_nodes) // 50000) + 1, n_cpus]) cur_nodes_child_maps = multithread_func( _get_subgraph_multiple_nodes_threaded, np.array_split(subgraph.cur_nodes, this_n_threads), @@ -239,8 +237,6 @@ def _get_subgraph_multiple_nodes_threaded( for node_id in node_ids: subgraph.node_to_subgraph[ _get_dict_key(node_id) - ] = subgraph.node_to_subgraph[_get_dict_key(node_id)][ - return_layers[0] - ] + ] = subgraph.node_to_subgraph[_get_dict_key(node_id)][return_layers[0]] - return subgraph.node_to_subgraph \ No newline at end of file + return subgraph.node_to_subgraph From 36526d1341f38481ebf265bc44d15cc8e26b4698 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 21 Aug 2023 03:24:03 +0000 Subject: [PATCH 028/116] wip: edits refactor --- pychunkedgraph/graph/cache.py | 30 ++++--- pychunkedgraph/graph/chunkedgraph.py | 75 +++++++++-------- pychunkedgraph/graph/edits.py | 119 +++++++++------------------ 3 files changed, 92 insertions(+), 132 deletions(-) diff --git a/pychunkedgraph/graph/cache.py b/pychunkedgraph/graph/cache.py index 8c824c732..4e5ed17c1 100644 --- a/pychunkedgraph/graph/cache.py +++ b/pychunkedgraph/graph/cache.py @@ -31,26 +31,24 @@ def __init__(self, cg): self._parent_vec = np.vectorize(self.parent, otypes=[np.uint64]) self._children_vec = np.vectorize(self.children, otypes=[np.ndarray]) - self._atomic_cross_edges_vec = np.vectorize( - self.atomic_cross_edges, otypes=[dict] - ) + self._cross_chunk_edges_vec = np.vectorize(self.cross_chunk_edges, otypes=[dict]) # no limit because we don't want to lose new IDs self.parents_cache = 
LRUCache(maxsize=maxsize) self.children_cache = LRUCache(maxsize=maxsize) - self.atomic_cx_edges_cache = LRUCache(maxsize=maxsize) + self.cross_chunk_edges_cache = LRUCache(maxsize=maxsize) def __len__(self): return ( len(self.parents_cache) + len(self.children_cache) - + len(self.atomic_cx_edges_cache) + + len(self.cross_chunk_edges_cache) ) def clear(self): self.parents_cache.clear() self.children_cache.clear() - self.atomic_cx_edges_cache.clear() + self.cross_chunk_edges_cache.clear() def parent(self, node_id: np.uint64, *, time_stamp: datetime = None): @cached(cache=self.parents_cache, key=lambda node_id: node_id) @@ -68,15 +66,15 @@ def children_decorated(node_id): return children_decorated(node_id) - def atomic_cross_edges(self, node_id): - @cached(cache=self.atomic_cx_edges_cache, key=lambda node_id: node_id) - def atomic_cross_edges_decorated(node_id): - edges = self._cg.get_atomic_cross_edges( + def cross_chunk_edges(self, node_id): + @cached(cache=self.cross_chunk_edges_cache, key=lambda node_id: node_id) + def cross_edges_decorated(node_id): + edges = self._cg.get_cross_chunk_edges( np.array([node_id], dtype=NODE_ID), raw_only=True ) return edges[node_id] - return atomic_cross_edges_decorated(node_id) + return cross_edges_decorated(node_id) def parents_multiple(self, node_ids: np.ndarray, *, time_stamp: datetime = None): if not node_ids.size: @@ -105,20 +103,20 @@ def children_multiple(self, node_ids: np.ndarray, *, flatten=False): return np.concatenate([*result.values()]) return result - def atomic_cross_edges_multiple(self, node_ids: np.ndarray): + def cross_chunk_edges_multiple(self, node_ids: np.ndarray): result = {} if not node_ids.size: return result mask = np.in1d( - node_ids, np.fromiter(self.atomic_cx_edges_cache.keys(), dtype=NODE_ID) + node_ids, np.fromiter(self.cross_chunk_edges_cache.keys(), dtype=NODE_ID) ) - cached_edges_ = self._atomic_cross_edges_vec(node_ids[mask]) + cached_edges_ = self._cross_chunk_edges_vec(node_ids[mask]) 
result.update( {id_: edges_ for id_, edges_ in zip(node_ids[mask], cached_edges_)} ) - result.update(self._cg.get_atomic_cross_edges(node_ids[~mask], raw_only=True)) + result.update(self._cg.get_cross_chunk_edges(node_ids[~mask], raw_only=True)) update( - self.atomic_cx_edges_cache, + self.cross_chunk_edges_cache, node_ids[~mask], [result[k] for k in node_ids[~mask]], ) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 83c543b6e..1cdecd77a 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -292,45 +292,20 @@ def _get_children_multiple( } return self.cache.children_multiple(node_ids) - def get_atomic_cross_edges( - self, l2_ids: typing.Iterable, *, raw_only=False - ) -> typing.Dict[np.uint64, typing.Dict[int, typing.Iterable]]: - """Returns atomic cross edges for level 2 IDs.""" - if raw_only or not self.cache: - node_edges_d_d = self.client.read_nodes( - node_ids=l2_ids, - properties=[ - attributes.Connectivity.CrossChunkEdge[l] - for l in range(2, max(3, self.meta.layer_count)) - ], - ) - result = {} - for id_ in l2_ids: - try: - result[id_] = { - prop.index: val[0].value.copy() - for prop, val in node_edges_d_d[id_].items() - } - except KeyError: - result[id_] = {} - return result - return self.cache.atomic_cross_edges_multiple(l2_ids) - - def get_cross_chunk_edges(self, node_ids: typing.Iterable) -> typing.Dict: + def get_atomic_cross_edges(self, l2_ids: typing.Iterable) -> typing.Dict: """ - Returns cross edges for `node_ids`. - A dict of the form `{node_id: {layer: cross_edges}}` + Returns atomic cross edges for level 2 IDs. + A dict of the form `{l2id: {layer: atomic_cross_edges}}`. 
""" + node_edges_d_d = self.client.read_nodes( + node_ids=l2_ids, + properties=[ + attributes.Connectivity.AtomicCrossChunkEdge[l] + for l in range(2, self.meta.layer_count) + ], + ) result = {} - node_ids = np.array(node_ids, dtype=basetypes.NODE_ID) - if node_ids.size == 0: - return result - attrs = [ - attributes.Connectivity.CrossChunkEdge[l] - for l in range(2, self.meta.layer_count) - ] - node_edges_d_d = self.client.read_nodes(node_ids=node_ids, properties=attrs) - for id_ in node_ids: + for id_ in l2_ids: try: result[id_] = { prop.index: val[0].value.copy() @@ -340,6 +315,34 @@ def get_cross_chunk_edges(self, node_ids: typing.Iterable) -> typing.Dict: result[id_] = {} return result + def get_cross_chunk_edges( + self, node_ids: typing.Iterable, *, raw_only=False + ) -> typing.Dict: + """ + Returns cross edges for `node_ids`. + A dict of the form `{node_id: {layer: cross_edges}}`. + """ + if raw_only or not self.cache: + result = {} + node_ids = np.array(node_ids, dtype=basetypes.NODE_ID) + if node_ids.size == 0: + return result + attrs = [ + attributes.Connectivity.CrossChunkEdge[l] + for l in range(2, self.meta.layer_count) + ] + node_edges_d_d = self.client.read_nodes(node_ids=node_ids, properties=attrs) + for id_ in node_ids: + try: + result[id_] = { + prop.index: val[0].value.copy() + for prop, val in node_edges_d_d[id_].items() + } + except KeyError: + result[id_] = {} + return result + return self.cache.cross_chunk_edges_multiple(node_ids) + def get_roots( self, node_ids: typing.Sequence[np.uint64], diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 6d823e720..68a8c9b3b 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -38,9 +38,9 @@ def _analyze_affected_edges( cg, atomic_edges: Iterable[np.ndarray], parent_ts: datetime.datetime = None ) -> Tuple[Iterable, Dict]: """ - Determine if atomic edges are within the chunk. - If not, they are cross edges between two L2 IDs in adjacent chunks. 
- Returns edges between L2 IDs and atomic cross edges. + Returns l2 edges within chunk and adds self edges for nodes in cross chunk edges. + + Also returns new cross edges dicts for nodes crossing chunk boundary. """ supervoxels = np.unique(atomic_edges) parents = cg.get_parents(supervoxels, time_stamp=parent_ts) @@ -51,19 +51,18 @@ def _analyze_affected_edges( for edge_ in atomic_edges[edge_layers == 1] ] - # cross chunk edges - atomic_cross_edges_d = defaultdict(lambda: defaultdict(list)) + cross_edges_d = defaultdict(lambda: defaultdict(list)) for layer in range(2, cg.meta.layer_count): layer_edges = atomic_edges[edge_layers == layer] if not layer_edges.size: continue for edge in layer_edges: - parent_1 = sv_parent_d[edge[0]] - parent_2 = sv_parent_d[edge[1]] - atomic_cross_edges_d[parent_1][layer].append(edge) - atomic_cross_edges_d[parent_2][layer].append(edge[::-1]) - parent_edges.extend([[parent_1, parent_1], [parent_2, parent_2]]) - return (parent_edges, atomic_cross_edges_d) + parent0 = sv_parent_d[edge[0]] + parent1 = sv_parent_d[edge[1]] + cross_edges_d[parent0][layer].append([parent0, parent1]) + cross_edges_d[parent1][layer].append([parent1, parent0]) + parent_edges.extend([[parent0, parent0], [parent1, parent1]]) + return parent_edges, cross_edges_d def _get_relevant_components(edges: np.ndarray, supervoxels: np.ndarray) -> Tuple: @@ -89,9 +88,7 @@ def merge_preprocess( parent_ts: datetime.datetime = None, ) -> np.ndarray: """ - Determine if a fake edge needs to be added. - Get subgraph within the bounding box - Add fake edge if there are no inactive edges between two components. + Check and return inactive edges in the subgraph. 
""" edge_layers = cg.get_cross_chunk_edges_layer(subgraph_edges) active_edges = [types.empty_2d] @@ -146,6 +143,7 @@ def check_fake_edges( rows = [] supervoxels = atomic_edges.ravel() + # fake edges are stored with l2 chunks chunk_ids = cg.get_chunk_ids_from_node_ids( cg.get_parents(supervoxels, time_stamp=parent_ts) ) @@ -188,21 +186,19 @@ def add_edges( parent_ts: datetime.datetime = None, allow_same_segment_merge=False, ): - edges, l2_atomic_cross_edges_d = _analyze_affected_edges( + edges, l2_cross_edges_d = _analyze_affected_edges( cg, atomic_edges, parent_ts=parent_ts ) l2ids = np.unique(edges) if not allow_same_segment_merge: - assert ( - np.unique(cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts)).size - == 2 - ), "L2 IDs must belong to different roots." + roots = cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts) + assert np.unique(roots).size == 2, "L2 IDs must belong to different roots." new_old_id_d, old_new_id_d, old_hierarchy_d = _init_old_hierarchy( cg, l2ids, parent_ts=parent_ts ) atomic_children_d = cg.get_children(l2ids) - atomic_cross_edges_d = merge_cross_edge_dicts( - cg.get_atomic_cross_edges(l2ids), l2_atomic_cross_edges_d + cross_edges_d = merge_cross_edge_dicts( + cg.get_cross_chunk_edges(l2ids), l2_cross_edges_d ) graph, _, _, graph_ids = flatgraph.build_gt_graph(edges, make_directed=True) @@ -214,8 +210,8 @@ def add_edges( cg.cache.children_cache[new_id] = np.concatenate( [atomic_children_d[l2id] for l2id in l2ids_] ) - cg.cache.atomic_cx_edges_cache[new_id] = concatenate_cross_edge_dicts( - [atomic_cross_edges_d[l2id] for l2id in l2ids_] + cg.cache.cross_chunk_edges_cache[new_id] = concatenate_cross_edge_dicts( + [cross_edges_d[l2id] for l2id in l2ids_] ) cache_utils.update( cg.cache.parents_cache, cg.cache.children_cache[new_id], new_id @@ -300,14 +296,14 @@ def remove_edges( cg, l2ids, parent_ts=parent_ts ) l2id_chunk_id_d = dict(zip(l2ids.tolist(), cg.get_chunk_ids_from_node_ids(l2ids))) - atomic_cross_edges_d = 
cg.get_atomic_cross_edges(l2ids) + cross_edges_d = cg.get_cross_chunk_edges(l2ids) removed_edges = np.concatenate([atomic_edges, atomic_edges[:, ::-1]], axis=0) new_l2_ids = [] for id_ in l2ids: l2_agg = l2id_agglomeration_d[id_] ccs, graph_ids, cross_edges = _process_l2_agglomeration( - l2_agg, removed_edges, atomic_cross_edges_d[id_] + l2_agg, removed_edges, cross_edges_d[id_] ) # done here to avoid repeat computation in loop cross_edge_layers = cg.get_cross_chunk_edges_layer(cross_edges) @@ -386,60 +382,27 @@ def _get_old_ids(self, new_ids): ] return np.concatenate(old_ids) - def _map_sv_to_parent(self, node_ids, layer, node_map=None): - sv_parent_d = {} - sv_cross_edges = [types.empty_2d] - if node_map is None: - node_map = {} - for id_ in node_ids: - id_eff = node_map.get(id_, id_) - edges_ = self._cross_edges_d[id_].get(layer, types.empty_2d) - sv_parent_d.update(dict(zip(edges_[:, 0], [id_eff] * len(edges_)))) - sv_cross_edges.append(edges_) - return sv_parent_d, np.concatenate(sv_cross_edges) - - def _get_connected_components( - self, node_ids: np.ndarray, layer: int, lower_layer_ids: np.ndarray - ): - _node_ids = np.concatenate([node_ids, lower_layer_ids]) - cached = np.fromiter(self._cross_edges_d.keys(), dtype=basetypes.NODE_ID) - not_cached = _node_ids[~np.in1d(_node_ids, cached)] - + def _get_connected_components(self, node_ids: np.ndarray, layer: int): with TimeIt( f"get_cross_chunk_edges.{layer}", self.cg.graph_id, self._operation_id, ): - self._cross_edges_d.update(self.cg.get_cross_chunk_edges(not_cached)) - - sv_parent_d, sv_cross_edges = self._map_sv_to_parent(node_ids, layer) - get_sv_parents = np.vectorize(sv_parent_d.get, otypes=[np.uint64]) - try: - cross_edges = get_sv_parents(sv_cross_edges) - except TypeError: # NoneType error - # if there is a missing parent, try including lower layer ids - # this can happen due to skip connections - - # we want to map all these lower IDs to the current layer - lower_layer_to_layer = self.cg.get_roots( - 
lower_layer_ids, stop_layer=layer, ceil=False - ) - node_map = {k: v for k, v in zip(lower_layer_ids, lower_layer_to_layer)} - sv_parent_d, sv_cross_edges = self._map_sv_to_parent( - _node_ids, layer, node_map=node_map - ) - get_sv_parents = np.vectorize(sv_parent_d.get, otypes=[np.uint64]) - cross_edges = get_sv_parents(sv_cross_edges) + cross_edges_d = self.cg.get_cross_chunk_edges(node_ids) + self._cross_edges_d.update(cross_edges_d) + + cross_edges = [types.empty_2d] + for id_ in node_ids: + edges_ = self._cross_edges_d[id_].get(layer, types.empty_2d) + cross_edges.append(edges_) - cross_edges = np.concatenate([cross_edges, np.vstack([node_ids, node_ids]).T]) + cross_edges = np.concatenate([*cross_edges, np.vstack([node_ids, node_ids]).T]) graph, _, _, graph_ids = flatgraph.build_gt_graph( cross_edges, make_directed=True ) return flatgraph.connected_components(graph), graph_ids - def _get_layer_node_ids( - self, new_ids: np.ndarray, layer: int - ) -> Tuple[np.ndarray, np.ndarray]: + def _get_layer_node_ids(self, new_ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: # get old identities of new IDs old_ids = self._get_old_ids(new_ids) # get their parents, then children of those parents @@ -458,9 +421,7 @@ def _get_layer_node_ids( ] + [node_ids[~mask], new_ids] ) - node_ids = np.unique(node_ids) - layer_mask = self.cg.get_chunk_layers(node_ids) == layer - return node_ids[layer_mask], node_ids[~layer_mask] + return np.unique(node_ids) def _create_new_parents(self, layer: int): """ @@ -473,10 +434,8 @@ def _create_new_parents(self, layer: int): update parent old IDs """ new_ids = self._new_ids_d[layer] - layer_node_ids, lower_layer_ids = self._get_layer_node_ids(new_ids, layer) - components, graph_ids = self._get_connected_components( - layer_node_ids, layer, lower_layer_ids - ) + layer_node_ids = self._get_layer_node_ids(new_ids) + components, graph_ids = self._get_connected_components(layer_node_ids, layer) for cc_indices in components: parent_layer = layer + 1 
cc_ids = graph_ids[cc_indices] @@ -553,20 +512,20 @@ def _update_root_id_lineage(self): ) return rows - def _get_atomic_cross_edges_val_dict(self): + def _get_cross_edges_val_dict(self): new_ids = np.array(self._new_ids_d[2], dtype=basetypes.NODE_ID) val_dicts = {} - atomic_cross_edges_d = self.cg.get_atomic_cross_edges(new_ids) + cross_edges_d = self.cg.get_cross_chunk_edges(new_ids) for id_ in new_ids: val_dict = {} - for layer, edges in atomic_cross_edges_d[id_].items(): - val_dict[attributes.Connectivity.AtomicCrossChunkEdge[layer]] = edges + for layer, edges in cross_edges_d[id_].items(): + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges val_dicts[id_] = val_dict return val_dicts def create_new_entries(self) -> List: rows = [] - val_dicts = self._get_atomic_cross_edges_val_dict() + val_dicts = self._get_cross_edges_val_dict() for layer in range(2, self.cg.meta.layer_count + 1): new_ids = self._new_ids_d[layer] for id_ in new_ids: From 6c6375e93208d9731d985df3f31d94b057a2bf67 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 21 Aug 2023 17:31:47 +0000 Subject: [PATCH 029/116] wip: edits refactor --- pychunkedgraph/graph/edits.py | 44 ++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 68a8c9b3b..ae7c25b4c 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -7,6 +7,7 @@ from typing import Iterable from collections import defaultdict +import fastremap import numpy as np import fastremap @@ -233,6 +234,8 @@ def add_edges( ) new_roots = create_parents.run() + print("new_roots", new_roots, cg.meta.layer_count) + print(cg.get_children(np.array(new_roots, dtype=np.uint64))) new_entries = create_parents.create_new_entries() return new_roots, new_l2_ids, new_entries @@ -397,21 +400,22 @@ def _get_connected_components(self, node_ids: np.ndarray, layer: int): cross_edges.append(edges_) cross_edges = 
np.concatenate([*cross_edges, np.vstack([node_ids, node_ids]).T]) + temp_d = {k: next(iter(v)) for k, v in self._old_new_id_d.items()} + cross_edges = fastremap.remap(cross_edges, temp_d, preserve_missing_labels=True) + graph, _, _, graph_ids = flatgraph.build_gt_graph( cross_edges, make_directed=True ) return flatgraph.connected_components(graph), graph_ids - def _get_layer_node_ids(self, new_ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + def _get_layer_node_ids( + self, new_ids: np.ndarray, layer: int + ) -> Tuple[np.ndarray, np.ndarray]: # get old identities of new IDs old_ids = self._get_old_ids(new_ids) # get their parents, then children of those parents - node_ids = self.cg.get_children( - np.unique( - self.cg.get_parents(old_ids, time_stamp=self._last_successful_ts) - ), - flatten=True, - ) + parents = self.cg.get_parents(old_ids, time_stamp=self._last_successful_ts) + node_ids = self.cg.get_children(np.unique(parents), flatten=True) # replace old identities with new IDs mask = np.in1d(node_ids, old_ids) node_ids = np.concatenate( @@ -421,7 +425,9 @@ def _get_layer_node_ids(self, new_ids: np.ndarray) -> Tuple[np.ndarray, np.ndarr ] + [node_ids[~mask], new_ids] ) - return np.unique(node_ids) + node_ids = np.unique(node_ids) + layer_mask = self.cg.get_chunk_layers(node_ids) == layer + return node_ids[layer_mask] def _create_new_parents(self, layer: int): """ @@ -434,7 +440,7 @@ def _create_new_parents(self, layer: int): update parent old IDs """ new_ids = self._new_ids_d[layer] - layer_node_ids = self._get_layer_node_ids(new_ids) + layer_node_ids = self._get_layer_node_ids(new_ids, layer) components, graph_ids = self._get_connected_components(layer_node_ids, layer) for cc_indices in components: parent_layer = layer + 1 @@ -458,6 +464,11 @@ def _create_new_parents(self, layer: int): cc_ids, parent_id, ) + + children_cx_edges = [self._cross_edges_d[child] for child in cc_ids] + cx_edges = concatenate_cross_edge_dicts(children_cx_edges) + 
self.cg.cache.cross_chunk_edges_cache[parent_id] = cx_edges + self._update_id_lineage(parent_id, cc_ids, layer, parent_layer) def run(self) -> Iterable: @@ -513,14 +524,15 @@ def _update_root_id_lineage(self): return rows def _get_cross_edges_val_dict(self): - new_ids = np.array(self._new_ids_d[2], dtype=basetypes.NODE_ID) val_dicts = {} - cross_edges_d = self.cg.get_cross_chunk_edges(new_ids) - for id_ in new_ids: - val_dict = {} - for layer, edges in cross_edges_d[id_].items(): - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges - val_dicts[id_] = val_dict + for layer in range(2, self.cg.meta.layer_count): + new_ids = np.array(self._new_ids_d[layer], dtype=basetypes.NODE_ID) + cross_edges_d = self.cg.get_cross_chunk_edges(new_ids) + for id_ in new_ids: + val_dict = {} + for layer, edges in cross_edges_d[id_].items(): + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + val_dicts[id_] = val_dict return val_dicts def create_new_entries(self) -> List: From 88d3a5d551cc390f786cd711ac28ff1599ded71b Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 22 Aug 2023 16:59:25 +0000 Subject: [PATCH 030/116] fix(ingest): cache cross chunk edges from children --- .../ingest/create/abstract_layers.py | 84 ++++++++++++------- pychunkedgraph/ingest/create/cross_edges.py | 2 +- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 63b613ae6..9a339443f 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel, c-extension-no-member """ Functions for creating parents in level 3 and above @@ -9,8 +9,8 @@ import multiprocessing as mp from typing import Optional from typing import Sequence -from collections import 
defaultdict +import fastremap import numpy as np from multiwrapper import multiprocessing_utils as mu @@ -21,6 +21,7 @@ from ...graph.utils import basetypes from ...graph.utils import serializers from ...graph.chunkedgraph import ChunkedGraph +from ...graph.edges.utils import concatenate_cross_edge_dicts from ...graph.utils.generic import get_valid_timestamp from ...graph.utils.generic import filter_failed_node_ids from ...graph.chunks.hierarchy import get_children_chunk_coords @@ -60,7 +61,6 @@ def add_layer( layer_id, parent_coords, connected_components, - cx_edges, get_valid_timestamp(time_stamp), n_threads > 1, ) @@ -121,7 +121,7 @@ def _read_chunk(children_ids_shared, cg: ChunkedGraph, layer_id: int, chunk_coor def _write_connected_components( - cg, layer, pcoords, components, cx_edges, time_stamp, use_threads=True + cg, layer, pcoords, components, time_stamp, use_threads=True ): if len(components) == 0: return @@ -131,7 +131,7 @@ def _write_connected_components( node_layer_d = get_chunk_nodes_cross_edge_layer(cg, layer, pcoords, use_threads) if not use_threads: - _write(cg, layer, pcoords, components, cx_edges, node_layer_d, time_stamp, use_threads) + _write(cg, layer, pcoords, components, node_layer_d, time_stamp, use_threads) return task_size = int(math.ceil(len(components) / mp.cpu_count() / 10)) @@ -139,7 +139,7 @@ def _write_connected_components( cg_info = cg.get_serialized_info() multi_args = [] for ccs in chunked_ccs: - args = (cg_info, layer, pcoords, ccs, cx_edges, node_layer_d, time_stamp) + args = (cg_info, layer, pcoords, ccs, node_layer_d, time_stamp) multi_args.append(args) mu.multiprocess_func( _write_components_helper, @@ -149,9 +149,9 @@ def _write_connected_components( def _write_components_helper(args): - cg_info, layer, pcoords, ccs, cx_edges, node_layer_d, time_stamp = args + cg_info, layer, pcoords, ccs, node_layer_d, time_stamp = args cg = ChunkedGraph(**cg_info) - _write(cg, layer, pcoords, ccs, cx_edges, node_layer_d, time_stamp) + 
_write(cg, layer, pcoords, ccs, node_layer_d, time_stamp) def _write( @@ -159,13 +159,12 @@ def _write( layer_id, parent_coords, components, - cx_edges, node_layer_d, time_stamp, use_threads=True, ): - parent_layer_ids = range(layer_id, cg.meta.layer_count + 1) - cc_connections = {l: [] for l in parent_layer_ids} + parent_layers = range(layer_id, cg.meta.layer_count + 1) + cc_connections = {l: [] for l in parent_layers} for node_ids in components: layer = layer_id if len(node_ids) == 1: @@ -177,40 +176,67 @@ def _write( parent_chunk_id = cg.get_chunk_id(layer=layer_id, x=x, y=y, z=z) parent_chunk_id_dict = cg.get_parent_chunk_id_dict(parent_chunk_id) - cx_edges = np.array(cx_edges, dtype=basetypes.NODE_ID) - for parent_layer_id in parent_layer_ids: - if len(cc_connections[parent_layer_id]) == 0: + for parent_layer in parent_layers: + if len(cc_connections[parent_layer]) == 0: continue - parent_chunk_id = parent_chunk_id_dict[parent_layer_id] + parent_chunk_id = parent_chunk_id_dict[parent_layer] reserved_parent_ids = cg.id_client.create_node_ids( parent_chunk_id, - size=len(cc_connections[parent_layer_id]), - root_chunk=parent_layer_id == cg.meta.layer_count and use_threads, + size=len(cc_connections[parent_layer]), + root_chunk=parent_layer == cg.meta.layer_count and use_threads, ) - for i_cc, node_ids in enumerate(cc_connections[parent_layer_id]): - node_cx_edges_d = defaultdict(lambda: types.empty_2d) - for node in node_ids: - mask0 = cx_edges[:, 0] == node - mask1 = cx_edges[:, 1] == node - node_cx_edges_d[node] = cx_edges[mask0 | mask1] - + for i_cc, node_ids in enumerate(cc_connections[parent_layer]): parent_id = reserved_parent_ids[i_cc] + + if parent_layer == 3: + # children are from atomic chunks + cx_edges_d = cg.get_atomic_cross_edges(node_ids) + else: + # children are from abstract chunks + cx_edges_d = cg.get_cross_chunk_edges(node_ids, raw_only=True) + + children_cx_edges = [] for node in node_ids: + node_layer = cg.get_chunk_layer(node) row_id = 
serializers.serialize_uint64(node) val_dict = {attributes.Hierarchy.Parent: parent_id} - node_cx_edges = node_cx_edges_d[node] - cx_layers = cg.get_cross_chunk_edges_layer(node_cx_edges) - for layer in set(cx_layers): - layer_mask = cx_layers == layer + node_cx_edges_d = cx_edges_d.get(node, {}) + if not node_cx_edges_d: + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) + continue + + for layer in range(node_layer, cg.meta.layer_count): + if not layer in node_cx_edges_d: + continue + + layer_edges = node_cx_edges_d[layer] + edges_nodes = np.unique(layer_edges) + edges_nodes_parents = cg.get_parents(edges_nodes) + temp_map = dict(zip(edges_nodes, edges_nodes_parents)) + + layer_edges = fastremap.remap( + layer_edges, temp_map, preserve_missing_labels=True + ) + layer_edges = np.unique(layer_edges, axis=0) + col = attributes.Connectivity.CrossChunkEdge[layer] - val_dict[col] = node_cx_edges[layer_mask] + val_dict[col] = layer_edges + node_cx_edges_d[layer] = layer_edges + children_cx_edges.append(node_cx_edges_d) rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) row_id = serializers.serialize_uint64(parent_id) val_dict = {attributes.Hierarchy.Child: node_ids} + parent_cx_edges_d = concatenate_cross_edge_dicts(children_cx_edges, unique=True) + for layer in range(parent_layer, cg.meta.layer_count): + if not layer in parent_cx_edges_d: + continue + col = attributes.Connectivity.CrossChunkEdge[layer] + val_dict[col] = parent_cx_edges_d[layer] + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) if len(rows) > 100000: cg.client.write(rows) diff --git a/pychunkedgraph/ingest/create/cross_edges.py b/pychunkedgraph/ingest/create/cross_edges.py index 5f0ebf8df..9581838af 100644 --- a/pychunkedgraph/ingest/create/cross_edges.py +++ b/pychunkedgraph/ingest/create/cross_edges.py @@ -63,7 +63,7 @@ def _get_children_chunk_cross_edges_helper(args) -> None: edge_ids_shared.append(_get_children_chunk_cross_edges(cg, atomic_chunks, layer)) 
-def _get_children_chunk_cross_edges(cg: ChunkedGraph, atomic_chunks, layer) -> None: +def _get_children_chunk_cross_edges(cg: ChunkedGraph, atomic_chunks, layer) -> np.ndarray: """ Non parallelized version Cross edges that connect children chunks. From a34f476fd82dc78d2be5ce320352405b07188a8a Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 22 Aug 2023 17:21:13 +0000 Subject: [PATCH 031/116] feat: add unique flag --- pychunkedgraph/graph/edges/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/edges/utils.py b/pychunkedgraph/graph/edges/utils.py index 94641343a..cd0e85fe8 100644 --- a/pychunkedgraph/graph/edges/utils.py +++ b/pychunkedgraph/graph/edges/utils.py @@ -46,7 +46,7 @@ def concatenate_chunk_edges(chunk_edge_dicts: Iterable) -> Dict: return edges_dict -def concatenate_cross_edge_dicts(edges_ds: Iterable[Dict]) -> Dict: +def concatenate_cross_edge_dicts(edges_ds: Iterable[Dict], unique: bool = False) -> Dict: """Combines cross chunk edge dicts of form {layer id : edge list}.""" result_d = defaultdict(list) for edges_d in edges_ds: @@ -54,7 +54,10 @@ def concatenate_cross_edge_dicts(edges_ds: Iterable[Dict]) -> Dict: result_d[layer].append(edges) for layer, edge_lists in result_d.items(): - result_d[layer] = np.concatenate(edge_lists) + edges = np.concatenate(edge_lists) + if unique: + edges = np.unique(edges, axis=0) + result_d[layer] = edges return result_d From 59bac668b1c63b0670b05c67e2037c981b7f4cbf Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 22 Aug 2023 17:21:53 +0000 Subject: [PATCH 032/116] feat: cross edges column family gcversionrule --- pychunkedgraph/graph/attributes.py | 20 +++++++++---------- .../graph/client/bigtable/client.py | 4 +++- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index 958913119..84283161d 100644 --- a/pychunkedgraph/graph/attributes.py +++ 
b/pychunkedgraph/graph/attributes.py @@ -104,10 +104,12 @@ class Connectivity: serializer=serializers.NumPyArray(dtype=basetypes.EDGE_AREA), ) - FakeEdges = _Attribute( - key=b"fake_edges", - family_id="4", - serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), + AtomicCrossChunkEdge = _AttributeArray( + pattern=b"atomic_cross_edges_%d", + family_id="3", + serializer=serializers.NumPyArray( + dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 + ), ) CrossChunkEdge = _AttributeArray( @@ -118,12 +120,10 @@ class Connectivity: ), ) - AtomicCrossChunkEdge = _AttributeArray( - pattern=b"atomic_cross_edges_%d", - family_id="3", - serializer=serializers.NumPyArray( - dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 - ), + FakeEdges = _Attribute( + key=b"fake_edges", + family_id="5", + serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), ) diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 788c76a8e..1bd027255 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ b/pychunkedgraph/graph/client/bigtable/client.py @@ -638,7 +638,9 @@ def _create_column_families(self): f.create() f = self._table.column_family("3", gc_rule=MaxAgeGCRule(timedelta(days=365))) f.create() - f = self._table.column_family("4") + f = self._table.column_family("4", gc_rule=MaxVersionsGCRule(1)) + f.create() + f = self._table.column_family("5") f.create() def _get_ids_range(self, key: bytes, size: int) -> typing.Tuple: From 48139b7c43e3df462939c7addb813cf5ac0e9124 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 22 Aug 2023 17:22:10 +0000 Subject: [PATCH 033/116] fix: convert input to np arrays --- pychunkedgraph/graph/cache.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/cache.py b/pychunkedgraph/graph/cache.py index 4e5ed17c1..52fdfd022 100644 --- a/pychunkedgraph/graph/cache.py +++ 
b/pychunkedgraph/graph/cache.py @@ -31,7 +31,9 @@ def __init__(self, cg): self._parent_vec = np.vectorize(self.parent, otypes=[np.uint64]) self._children_vec = np.vectorize(self.children, otypes=[np.ndarray]) - self._cross_chunk_edges_vec = np.vectorize(self.cross_chunk_edges, otypes=[dict]) + self._cross_chunk_edges_vec = np.vectorize( + self.cross_chunk_edges, otypes=[dict] + ) # no limit because we don't want to lose new IDs self.parents_cache = LRUCache(maxsize=maxsize) @@ -77,6 +79,7 @@ def cross_edges_decorated(node_id): return cross_edges_decorated(node_id) def parents_multiple(self, node_ids: np.ndarray, *, time_stamp: datetime = None): + node_ids = np.array(node_ids, dtype=NODE_ID) if not node_ids.size: return node_ids mask = np.in1d(node_ids, np.fromiter(self.parents_cache.keys(), dtype=NODE_ID)) @@ -90,6 +93,7 @@ def parents_multiple(self, node_ids: np.ndarray, *, time_stamp: datetime = None) def children_multiple(self, node_ids: np.ndarray, *, flatten=False): result = {} + node_ids = np.array(node_ids, dtype=NODE_ID) if not node_ids.size: return result mask = np.in1d(node_ids, np.fromiter(self.children_cache.keys(), dtype=NODE_ID)) @@ -105,6 +109,7 @@ def children_multiple(self, node_ids: np.ndarray, *, flatten=False): def cross_chunk_edges_multiple(self, node_ids: np.ndarray): result = {} + node_ids = np.array(node_ids, dtype=NODE_ID) if not node_ids.size: return result mask = np.in1d( From aadfe82475678d59ba94078ff03a981a5c7b1520 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 22 Aug 2023 17:23:47 +0000 Subject: [PATCH 034/116] fix: linting issues --- pychunkedgraph/graph/chunkedgraph.py | 14 ++++---------- pychunkedgraph/graph/operation.py | 6 +++--- pychunkedgraph/graph/subgraph.py | 4 ++-- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 1cdecd77a..f4e87290c 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ 
b/pychunkedgraph/graph/chunkedgraph.py @@ -24,6 +24,8 @@ from .edges import utils as edge_utils from .chunks import utils as chunk_utils from .chunks import hierarchy as chunk_hierarchy +from .subgraph import get_subgraph_nodes +from .subgraph import get_subgraph_edges_and_leaves class ChunkedGraph: @@ -524,12 +526,10 @@ def get_subgraph( edges_only: bool = False, leaves_only: bool = False, return_flattened: bool = False, - ) -> typing.Tuple[typing.Dict, typing.Dict, Edges]: + ) -> typing.Tuple[typing.Dict, typing.Tuple[Edges]]: """ Generic subgraph method. """ - from .subgraph import get_subgraph_nodes - from .subgraph import get_subgraph_edges_and_leaves if return_layers is None: return_layers = [2] @@ -560,8 +560,6 @@ def get_subgraph_nodes( Get the children of `node_ids` that are at each of return_layers within the specified bounding box. """ - from .subgraph import get_subgraph_nodes - if return_layers is None: return_layers = [2] @@ -584,8 +582,6 @@ def get_subgraph_edges( """ Get the atomic edges of the `node_ids` within the specified bounding box. """ - from .subgraph import get_subgraph_edges_and_leaves - return get_subgraph_edges_and_leaves( self, node_id_or_ids, bbox, bbox_is_coordinate, True, False ) @@ -599,8 +595,6 @@ def get_subgraph_leaves( """ Get the supervoxels of the `node_ids` within the specified bounding box. """ - from .subgraph import get_subgraph_edges_and_leaves - return get_subgraph_edges_and_leaves( self, node_id_or_ids, bbox, bbox_is_coordinate, False, True ) @@ -625,7 +619,7 @@ def get_fake_edges( def get_l2_agglomerations( self, level2_ids: np.ndarray, edges_only: bool = False - ) -> typing.Tuple[typing.Dict[int, types.Agglomeration], np.ndarray]: + ) -> typing.Tuple[typing.Dict[int, types.Agglomeration], typing.Tuple[Edges]]: """ Children of Level 2 Node IDs and edges. Edges are read from cloud storage. 
diff --git a/pychunkedgraph/graph/operation.py b/pychunkedgraph/graph/operation.py index b864a2d0d..39668565f 100644 --- a/pychunkedgraph/graph/operation.py +++ b/pychunkedgraph/graph/operation.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-docstring, too-many-lines, protected-access, broad_exception_raised +# pylint: disable=invalid-name, missing-docstring, too-many-lines, protected-access, broad-exception-raised from abc import ABC, abstractmethod from collections import namedtuple @@ -892,11 +892,11 @@ def _apply( self.cg.meta.split_bounding_offset, ) with TimeIt("get_subgraph", self.cg.graph_id, operation_id): - l2id_agglomeration_d, edges = self.cg.get_subgraph( + l2id_agglomeration_d, edges_tuple = self.cg.get_subgraph( root_ids.pop(), bbox=bbox, bbox_is_coordinate=True ) - edges = reduce(lambda x, y: x + y, edges, Edges([], [])) + edges = reduce(lambda x, y: x + y, edges_tuple, Edges([], [])) supervoxels = np.concatenate( [agg.supervoxels for agg in l2id_agglomeration_d.values()] ) diff --git a/pychunkedgraph/graph/subgraph.py b/pychunkedgraph/graph/subgraph.py index 5b50b7c43..1538b3cc2 100644 --- a/pychunkedgraph/graph/subgraph.py +++ b/pychunkedgraph/graph/subgraph.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-docstring +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel from typing import List from typing import Dict @@ -155,7 +155,7 @@ def get_subgraph_edges_and_leaves( bbox_is_coordinate: bool = False, edges_only: bool = False, leaves_only: bool = False, -) -> Tuple[Dict, Dict, Edges]: +) -> Tuple[Dict, Tuple[Edges]]: """Get the edges and/or leaves of the specified node_ids within the specified bounding box.""" from .types import empty_1d From 9f935d1e3d25d985a36c38f19e3dcc84ee0d84cc Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 22 Aug 2023 17:24:05 +0000 Subject: [PATCH 035/116] wip: edits refactor --- pychunkedgraph/graph/edits.py | 169 ++++++++++++++++++---------------- 1 file changed, 
92 insertions(+), 77 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index ae7c25b4c..0086f00cd 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -39,7 +39,7 @@ def _analyze_affected_edges( cg, atomic_edges: Iterable[np.ndarray], parent_ts: datetime.datetime = None ) -> Tuple[Iterable, Dict]: """ - Returns l2 edges within chunk and adds self edges for nodes in cross chunk edges. + Returns l2 edges within chunk and self edges for nodes in cross chunk edges. Also returns new cross edges dicts for nodes crossing chunk boundary. """ @@ -208,20 +208,30 @@ def add_edges( for cc_indices in components: l2ids_ = graph_ids[cc_indices] new_id = cg.id_client.create_node_id(cg.get_chunk_id(l2ids_[0])) - cg.cache.children_cache[new_id] = np.concatenate( - [atomic_children_d[l2id] for l2id in l2ids_] - ) - cg.cache.cross_chunk_edges_cache[new_id] = concatenate_cross_edge_dicts( - [cross_edges_d[l2id] for l2id in l2ids_] - ) - cache_utils.update( - cg.cache.parents_cache, cg.cache.children_cache[new_id], new_id - ) new_l2_ids.append(new_id) new_old_id_d[new_id].update(l2ids_) for id_ in l2ids_: old_new_id_d[id_].add(new_id) + # update cache + # map parent to new merged children and vice versa + merged_children = np.concatenate([atomic_children_d[l2id] for l2id in l2ids_]) + cg.cache.children_cache[new_id] = merged_children + cache_utils.update(cg.cache.parents_cache, merged_children, new_id) + + # update cross chunk edges by replacing old_ids with new + # this can be done only after all new IDs have been created + for new_id, cc_indices in zip(new_l2_ids, components): + l2ids_ = graph_ids[cc_indices] + new_cx_edges_d = {} + cx_edges = [cross_edges_d[l2id] for l2id in l2ids_] + cx_edges_d = concatenate_cross_edge_dicts(cx_edges, unique=True) + temp_map = {k: next(iter(v)) for k, v in old_new_id_d.items()} + for layer, edges in cx_edges_d.items(): + edges = fastremap.remap(edges, temp_map, 
preserve_missing_labels=True) + new_cx_edges_d[layer] = edges + cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d + create_parents = CreateParentNodes( cg, new_l2_ids=new_l2_ids, @@ -234,50 +244,25 @@ def add_edges( ) new_roots = create_parents.run() - print("new_roots", new_roots, cg.meta.layer_count) - print(cg.get_children(np.array(new_roots, dtype=np.uint64))) + print() + print("layers", cg.meta.layer_count, "new_roots", new_roots) new_entries = create_parents.create_new_entries() return new_roots, new_l2_ids, new_entries -def _process_l2_agglomeration( - agg: types.Agglomeration, - removed_edges: np.ndarray, - atomic_cross_edges_d: Dict[int, np.ndarray], -): +def _process_l2_agglomeration(agg: types.Agglomeration, removed_edges: np.ndarray): """ For a given L2 id, remove given edges; calculate new connected components. """ chunk_edges = agg.in_edges.get_pairs() - cross_edges = np.concatenate([types.empty_2d, *atomic_cross_edges_d.values()]) chunk_edges = chunk_edges[~in2d(chunk_edges, removed_edges)] - cross_edges = cross_edges[~in2d(cross_edges, removed_edges)] isolated_ids = agg.supervoxels[~np.in1d(agg.supervoxels, chunk_edges)] isolated_edges = np.column_stack((isolated_ids, isolated_ids)) graph, _, _, graph_ids = flatgraph.build_gt_graph( np.concatenate([chunk_edges, isolated_edges]), make_directed=True ) - return flatgraph.connected_components(graph), graph_ids, cross_edges - - -def _filter_component_cross_edges( - cc_ids: np.ndarray, cross_edges: np.ndarray, cross_edge_layers: np.ndarray -) -> Dict[int, np.ndarray]: - """ - Filters cross edges for a connected component `cc_ids` - from `cross_edges` of the complete chunk. 
- """ - mask = np.in1d(cross_edges[:, 0], cc_ids) - cross_edges_ = cross_edges[mask] - cross_edge_layers_ = cross_edge_layers[mask] - edges_d = {} - for layer in np.unique(cross_edge_layers_): - edge_m = cross_edge_layers_ == layer - _cross_edges = cross_edges_[edge_m] - if _cross_edges.size: - edges_d[layer] = _cross_edges - return edges_d + return flatgraph.connected_components(graph), graph_ids def remove_edges( @@ -291,10 +276,9 @@ def remove_edges( ): edges, _ = _analyze_affected_edges(cg, atomic_edges, parent_ts=parent_ts) l2ids = np.unique(edges) - assert ( - np.unique(cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts)).size - == 1 - ), "L2 IDs must belong to same root." + roots = cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts) + assert np.unique(roots).size == 1, "L2 IDs must belong to same root." + new_old_id_d, old_new_id_d, old_hierarchy_d = _init_old_hierarchy( cg, l2ids, parent_ts=parent_ts ) @@ -305,20 +289,14 @@ def remove_edges( new_l2_ids = [] for id_ in l2ids: l2_agg = l2id_agglomeration_d[id_] - ccs, graph_ids, cross_edges = _process_l2_agglomeration( - l2_agg, removed_edges, cross_edges_d[id_] - ) - # done here to avoid repeat computation in loop - cross_edge_layers = cg.get_cross_chunk_edges_layer(cross_edges) + ccs, graph_ids = _process_l2_agglomeration(l2_agg, removed_edges) new_parent_ids = cg.id_client.create_node_ids( l2id_chunk_id_d[l2_agg.node_id], len(ccs) ) for i_cc, cc in enumerate(ccs): new_id = new_parent_ids[i_cc] cg.cache.children_cache[new_id] = graph_ids[cc] - cg.cache.atomic_cx_edges_cache[new_id] = _filter_component_cross_edges( - graph_ids[cc], cross_edges, cross_edge_layers - ) + cg.cache.atomic_cx_edges_cache[new_id] = None cache_utils.update(cg.cache.parents_cache, graph_ids[cc], new_id) new_l2_ids.append(new_id) new_old_id_d[new_id].add(id_) @@ -358,7 +336,6 @@ def __init__( self._new_old_id_d = new_old_id_d self._old_new_id_d = old_new_id_d self._new_ids_d = defaultdict(list) # new IDs in each 
layer - self._cross_edges_d = {} self._operation_id = operation_id self._time_stamp = time_stamp self._last_successful_ts = parent_ts @@ -385,6 +362,13 @@ def _get_old_ids(self, new_ids): ] return np.concatenate(old_ids) + def _get_new_ids(self, old_ids): + old_ids = [ + np.array(list(self._old_new_id_d[id_]), dtype=basetypes.NODE_ID) + for id_ in old_ids + ] + return np.concatenate(old_ids) + def _get_connected_components(self, node_ids: np.ndarray, layer: int): with TimeIt( f"get_cross_chunk_edges.{layer}", @@ -392,20 +376,16 @@ def _get_connected_components(self, node_ids: np.ndarray, layer: int): self._operation_id, ): cross_edges_d = self.cg.get_cross_chunk_edges(node_ids) - self._cross_edges_d.update(cross_edges_d) - cross_edges = [types.empty_2d] + cx_edges = [types.empty_2d] for id_ in node_ids: - edges_ = self._cross_edges_d[id_].get(layer, types.empty_2d) - cross_edges.append(edges_) - - cross_edges = np.concatenate([*cross_edges, np.vstack([node_ids, node_ids]).T]) - temp_d = {k: next(iter(v)) for k, v in self._old_new_id_d.items()} - cross_edges = fastremap.remap(cross_edges, temp_d, preserve_missing_labels=True) + edges_ = cross_edges_d[id_].get(layer, types.empty_2d) + cx_edges.append(edges_) - graph, _, _, graph_ids = flatgraph.build_gt_graph( - cross_edges, make_directed=True - ) + cx_edges = np.concatenate([*cx_edges, np.vstack([node_ids, node_ids]).T]) + temp_map = {k: next(iter(v)) for k, v in self._old_new_id_d.items()} + cx_edges = fastremap.remap(cx_edges, temp_map, preserve_missing_labels=True) + graph, _, _, graph_ids = flatgraph.build_gt_graph(cx_edges, make_directed=True) return flatgraph.connected_components(graph), graph_ids def _get_layer_node_ids( @@ -419,15 +399,37 @@ def _get_layer_node_ids( # replace old identities with new IDs mask = np.in1d(node_ids, old_ids) node_ids = np.concatenate( - [ - np.array(list(self._old_new_id_d[id_]), dtype=basetypes.NODE_ID) - for id_ in node_ids[mask] - ] - + [node_ids[~mask], new_ids] + 
[self._get_new_ids(node_ids[mask]), node_ids[~mask], new_ids] ) node_ids = np.unique(node_ids) layer_mask = self.cg.get_chunk_layers(node_ids) == layer return node_ids[layer_mask] + # return node_ids + + def _update_cross_edge_cache(self, parent, children): + """ + updates cross chunk edges in cache; + this can only be done after all new components at a layer have IDs + """ + cx_edges_d = self.cg.get_cross_chunk_edges(children) + cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values(), unique=True) + + parent_layer = self.cg.get_chunk_layer(parent) + edge_nodes = np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) + edge_parents = self.cg.get_roots( + edge_nodes, stop_layer=parent_layer, ceil=False + ) + edge_parents_d = dict(zip(edge_nodes, edge_parents)) + + new_cx_edges_d = {} + for layer in range(parent_layer, self.cg.meta.layer_count): + layer_edges = cx_edges_d.get(layer, types.empty_2d) + if len(layer_edges) == 0: + continue + new_cx_edges_d[layer] = fastremap.remap( + layer_edges, edge_parents_d, preserve_missing_labels=True + ) + self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d def _create_new_parents(self, layer: int): """ @@ -439,25 +441,30 @@ def _create_new_parents(self, layer: int): get cross edges of all, find connected components update parent old IDs """ + parent_layer = layer + 1 new_ids = self._new_ids_d[layer] layer_node_ids = self._get_layer_node_ids(new_ids, layer) + print(layer, layer_node_ids) components, graph_ids = self._get_connected_components(layer_node_ids, layer) + new_parent_ids = [] for cc_indices in components: - parent_layer = layer + 1 cc_ids = graph_ids[cc_indices] if len(cc_ids) == 1: # skip connection parent_layer = self.cg.meta.layer_count for l in range(layer + 1, self.cg.meta.layer_count): - if len(self._cross_edges_d[cc_ids[0]].get(l, types.empty_2d)) > 0: + cx_edges_d = self.cg.get_cross_chunk_edges([cc_ids[0]]) + if len(cx_edges_d[cc_ids[0]].get(l, types.empty_2d)) > 0: parent_layer = l 
break - parent_id = self.cg.id_client.create_node_id( self.cg.get_parent_chunk_id(cc_ids[0], parent_layer), root_chunk=parent_layer == self.cg.meta.layer_count, ) self._new_ids_d[parent_layer].append(parent_id) + self._update_id_lineage(parent_id, cc_ids, layer, parent_layer) + new_parent_ids.append(parent_id) + self.cg.cache.children_cache[parent_id] = cc_ids cache_utils.update( self.cg.cache.parents_cache, @@ -465,11 +472,9 @@ def _create_new_parents(self, layer: int): parent_id, ) - children_cx_edges = [self._cross_edges_d[child] for child in cc_ids] - cx_edges = concatenate_cross_edge_dicts(children_cx_edges) - self.cg.cache.cross_chunk_edges_cache[parent_id] = cx_edges - - self._update_id_lineage(parent_id, cc_ids, layer, parent_layer) + for new_id in new_parent_ids: + children = self.cg.get_children(new_id) + self._update_cross_edge_cache(new_id, children) def run(self) -> Iterable: """ @@ -492,9 +497,14 @@ def _update_root_id_lineage(self): new_root_ids = self._new_ids_d[self.cg.meta.layer_count] former_root_ids = self._get_old_ids(new_root_ids) former_root_ids = np.unique(former_root_ids) + + print() + print(former_root_ids, "->", new_root_ids) + print(self.cg.get_children(former_root_ids)) + print(self.cg.get_children(np.array(new_root_ids, dtype=np.uint64))) assert ( len(former_root_ids) < 2 or len(new_root_ids) < 2 - ), "Something went wrong." + ), "Result inconsistent with either split or merge effects." 
rows = [] for new_root_id in new_root_ids: val_dict = { @@ -524,10 +534,15 @@ def _update_root_id_lineage(self): return rows def _get_cross_edges_val_dict(self): + print("haha", self.cg.get_cross_chunk_edges([216172782113783809])) val_dicts = {} for layer in range(2, self.cg.meta.layer_count): new_ids = np.array(self._new_ids_d[layer], dtype=basetypes.NODE_ID) cross_edges_d = self.cg.get_cross_chunk_edges(new_ids) + print() + print(layer, new_ids) + print("cx", cross_edges_d) + print("ch", self.cg.get_children(new_ids)) for id_ in new_ids: val_dict = {} for layer, edges in cross_edges_d[id_].items(): From 2edccc998bea195393bcd83b5f457695668355e6 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 23 Aug 2023 02:59:22 +0000 Subject: [PATCH 036/116] fix: undo gcrule changes --- pychunkedgraph/graph/attributes.py | 2 +- .../graph/client/bigtable/client.py | 26 +++++++++---------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index 84283161d..33f675dc8 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -122,7 +122,7 @@ class Connectivity: FakeEdges = _Attribute( key=b"fake_edges", - family_id="5", + family_id="4", serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), ) diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 1bd027255..6601b654e 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ b/pychunkedgraph/graph/client/bigtable/client.py @@ -72,6 +72,18 @@ def __init__( self._version = None self._max_row_key_count = config.MAX_ROW_KEY_COUNT + def _create_column_families(self): + f = self._table.column_family("0") + f.create() + f = self._table.column_family("1", gc_rule=MaxVersionsGCRule(1)) + f.create() + f = self._table.column_family("2") + f.create() + f = self._table.column_family("3", gc_rule=MaxAgeGCRule(timedelta(days=365))) + 
f.create() + f = self._table.column_family("4") + f.create() + @property def graph_meta(self): return self._graph_meta @@ -629,20 +641,6 @@ def get_compatible_timestamp( return utils.get_google_compatible_time_stamp(time_stamp, round_up=round_up) # PRIVATE METHODS - def _create_column_families(self): - f = self._table.column_family("0") - f.create() - f = self._table.column_family("1", gc_rule=MaxVersionsGCRule(1)) - f.create() - f = self._table.column_family("2") - f.create() - f = self._table.column_family("3", gc_rule=MaxAgeGCRule(timedelta(days=365))) - f.create() - f = self._table.column_family("4", gc_rule=MaxVersionsGCRule(1)) - f.create() - f = self._table.column_family("5") - f.create() - def _get_ids_range(self, key: bytes, size: int) -> typing.Tuple: """Returns a range (min, max) of IDs for a given `key`.""" column = attributes.Concurrency.Counter From 8af6407960a677ffa226a3c89d4c8040dd9bcb7d Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 23 Aug 2023 03:01:18 +0000 Subject: [PATCH 037/116] fix: add mock_edges; linting issues --- pychunkedgraph/debug/utils.py | 4 +++- pychunkedgraph/graph/chunkedgraph.py | 2 ++ pychunkedgraph/graph/types.py | 3 +-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/debug/utils.py b/pychunkedgraph/debug/utils.py index 179f50aef..e194f4ee1 100644 --- a/pychunkedgraph/debug/utils.py +++ b/pychunkedgraph/debug/utils.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-docstring, bare-except, unidiomatic-typecheck + import numpy as np from ..graph import ChunkedGraph @@ -27,7 +29,7 @@ def print_node( if cg.get_chunk_layer(node) <= stop_layer: return for child in children: - print_node(cg, child, indent=indent + 1, stop_layer=stop_layer) + print_node(cg, child, indent=indent + 4, stop_layer=stop_layer) def get_l2children(cg: ChunkedGraph, node: NODE_ID) -> np.ndarray: diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index f4e87290c..a118d4c82 
100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -642,6 +642,8 @@ def get_l2_agglomerations( chain(edges_d.values(), fake_edges.values()), Edges([], []), ) + if self.mock_edges is not None: + all_chunk_edges += self.mock_edges if edges_only: if self.mock_edges is not None: diff --git a/pychunkedgraph/graph/types.py b/pychunkedgraph/graph/types.py index 9a551f35c..1f35e5f6b 100644 --- a/pychunkedgraph/graph/types.py +++ b/pychunkedgraph/graph/types.py @@ -1,5 +1,4 @@ -from typing import Dict -from typing import Iterable +# pylint: disable=invalid-name, missing-docstring from collections import namedtuple import numpy as np From 0a23d8483dcf848e5ada45aed5c08e3486a31604 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 23 Aug 2023 22:55:50 +0000 Subject: [PATCH 038/116] feat: edits using cached cross edges --- pychunkedgraph/graph/edits.py | 248 +++++++++++++++++++++++----------- 1 file changed, 172 insertions(+), 76 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 0086f00cd..ba9481139 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -221,6 +221,7 @@ def add_edges( # update cross chunk edges by replacing old_ids with new # this can be done only after all new IDs have been created + updated_entries = [] for new_id, cc_indices in zip(new_l2_ids, components): l2ids_ = graph_ids[cc_indices] new_cx_edges_d = {} @@ -230,8 +231,36 @@ def add_edges( for layer, edges in cx_edges_d.items(): edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) new_cx_edges_d[layer] = edges + assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d + # must also update cross chunk edges in reverse (counterparts) + layer_edges = new_cx_edges_d.get(2, types.empty_2d) + counterparts = layer_edges[:, 1] + counterpart_cx_edges_d = cg.get_cross_chunk_edges(counterparts) + temp_map = { + old_id: new_id for old_id in 
_get_flipped_ids(new_old_id_d, [new_id]) + } + for counterpart, edges_d in counterpart_cx_edges_d.items(): + val_dict = {} + for layer in range(2, cg.meta.layer_count): + edges = edges_d.get(layer, types.empty_2d) + if edges.size == 0: + continue + assert np.all(edges[:, 0] == counterpart) + edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + edges_d[layer] = edges + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + if not val_dict: + continue + cg.cache.cross_chunk_edges_cache[counterpart] = edges_d + row = cg.client.mutate_row( + serialize_uint64(counterpart), + val_dict, + time_stamp=time_stamp, + ) + updated_entries.append(row) + create_parents = CreateParentNodes( cg, new_l2_ids=new_l2_ids, @@ -244,10 +273,8 @@ def add_edges( ) new_roots = create_parents.run() - print() - print("layers", cg.meta.layer_count, "new_roots", new_roots) - new_entries = create_parents.create_new_entries() - return new_roots, new_l2_ids, new_entries + create_parents.create_new_entries() + return new_roots, new_l2_ids, updated_entries + create_parents.new_entries def _process_l2_agglomeration(agg: types.Agglomeration, removed_edges: np.ndarray): @@ -257,12 +284,36 @@ def _process_l2_agglomeration(agg: types.Agglomeration, removed_edges: np.ndarra chunk_edges = agg.in_edges.get_pairs() chunk_edges = chunk_edges[~in2d(chunk_edges, removed_edges)] + # cross during edits refers to all edges crossing chunk boundary + cross_edges = [agg.out_edges.get_pairs(), agg.cross_edges.get_pairs()] + cross_edges = np.concatenate(cross_edges) + cross_edges = cross_edges[~in2d(cross_edges, removed_edges)] + isolated_ids = agg.supervoxels[~np.in1d(agg.supervoxels, chunk_edges)] isolated_edges = np.column_stack((isolated_ids, isolated_ids)) graph, _, _, graph_ids = flatgraph.build_gt_graph( np.concatenate([chunk_edges, isolated_edges]), make_directed=True ) - return flatgraph.connected_components(graph), graph_ids + return flatgraph.connected_components(graph), 
graph_ids, cross_edges + + +def _filter_component_cross_edges( + component_ids: np.ndarray, cross_edges: np.ndarray, cross_edge_layers: np.ndarray +) -> Dict[int, np.ndarray]: + """ + Filters cross edges for a connected component `cc_ids` + from `cross_edges` of the complete chunk. + """ + mask = np.in1d(cross_edges[:, 0], component_ids) + cross_edges_ = cross_edges[mask] + cross_edge_layers_ = cross_edge_layers[mask] + edges_d = {} + for layer in np.unique(cross_edge_layers_): + edge_m = cross_edge_layers_ == layer + _cross_edges = cross_edges_[edge_m] + if _cross_edges.size: + edges_d[layer] = _cross_edges + return edges_d def remove_edges( @@ -282,25 +333,67 @@ def remove_edges( new_old_id_d, old_new_id_d, old_hierarchy_d = _init_old_hierarchy( cg, l2ids, parent_ts=parent_ts ) - l2id_chunk_id_d = dict(zip(l2ids.tolist(), cg.get_chunk_ids_from_node_ids(l2ids))) - cross_edges_d = cg.get_cross_chunk_edges(l2ids) + chunk_id_map = dict(zip(l2ids.tolist(), cg.get_chunk_ids_from_node_ids(l2ids))) removed_edges = np.concatenate([atomic_edges, atomic_edges[:, ::-1]], axis=0) new_l2_ids = [] for id_ in l2ids: - l2_agg = l2id_agglomeration_d[id_] - ccs, graph_ids = _process_l2_agglomeration(l2_agg, removed_edges) - new_parent_ids = cg.id_client.create_node_ids( - l2id_chunk_id_d[l2_agg.node_id], len(ccs) - ) + agg = l2id_agglomeration_d[id_] + ccs, graph_ids, cross_edges = _process_l2_agglomeration(agg, removed_edges) + new_parents = cg.id_client.create_node_ids(chunk_id_map[agg.node_id], len(ccs)) + + cross_edge_layers = cg.get_cross_chunk_edges_layer(cross_edges) for i_cc, cc in enumerate(ccs): - new_id = new_parent_ids[i_cc] - cg.cache.children_cache[new_id] = graph_ids[cc] - cg.cache.atomic_cx_edges_cache[new_id] = None - cache_utils.update(cg.cache.parents_cache, graph_ids[cc], new_id) + new_id = new_parents[i_cc] new_l2_ids.append(new_id) new_old_id_d[new_id].add(id_) old_new_id_d[id_].add(new_id) + cg.cache.children_cache[new_id] = graph_ids[cc] + 
cache_utils.update(cg.cache.parents_cache, graph_ids[cc], new_id) + cg.cache.cross_chunk_edges_cache[new_id] = _filter_component_cross_edges( + graph_ids[cc], cross_edges, cross_edge_layers + ) + + updated_entries = [] + new_cx_edges_d = cg.get_cross_chunk_edges(new_l2_ids) + for new_id in new_l2_ids: + cx_edges_d = new_cx_edges_d.get(new_id, {}) + for layer, edges in cx_edges_d.items(): + svs = np.unique(edges) + parents = cg.get_parents(svs) + temp_map = dict(zip(svs, parents)) + + edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + edges = np.unique(edges, axis=0) + cx_edges_d[layer] = edges + assert np.all(edges[:, 0] == new_id) + cg.cache.cross_chunk_edges_cache[new_id] = cx_edges_d + + layer_edges = cx_edges_d.get(2, types.empty_2d) + counterparts = layer_edges[:, 1] + counterpart_cx_edges_d = cg.get_cross_chunk_edges(counterparts) + temp_map = { + old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) + } + for counterpart, edges_d in counterpart_cx_edges_d.items(): + val_dict = {} + for layer in range(2, cg.meta.layer_count): + edges = edges_d.get(layer, types.empty_2d) + if edges.size == 0: + continue + assert np.all(edges[:, 0] == counterpart) + edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + edges_d[layer] = edges + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + if not val_dict: + continue + cg.cache.cross_chunk_edges_cache[counterpart] = edges_d + row = cg.client.mutate_row( + serialize_uint64(counterpart), + val_dict, + time_stamp=time_stamp, + ) + updated_entries.append(row) create_parents = CreateParentNodes( cg, @@ -313,8 +406,16 @@ def remove_edges( parent_ts=parent_ts, ) new_roots = create_parents.run() - new_entries = create_parents.create_new_entries() - return new_roots, new_l2_ids, new_entries + create_parents.create_new_entries() + return new_roots, new_l2_ids, updated_entries + create_parents.new_entries + + +def _get_flipped_ids(id_map, node_ids): + """ + 
returns old or new ids according to the map + """ + ids = [np.array(list(id_map[id_]), dtype=basetypes.NODE_ID) for id_ in node_ids] + return np.concatenate(ids) class CreateParentNodes: @@ -331,6 +432,7 @@ def __init__( parent_ts: datetime.datetime = None, ): self.cg = cg + self.new_entries = [] self._new_l2_ids = new_l2_ids self._old_hierarchy_d = old_hierarchy_d self._new_old_id_d = new_old_id_d @@ -355,20 +457,6 @@ def _update_id_lineage( self._new_old_id_d[parent].add(old_id) self._old_new_id_d[old_id].add(parent) - def _get_old_ids(self, new_ids): - old_ids = [ - np.array(list(self._new_old_id_d[id_]), dtype=basetypes.NODE_ID) - for id_ in new_ids - ] - return np.concatenate(old_ids) - - def _get_new_ids(self, old_ids): - old_ids = [ - np.array(list(self._old_new_id_d[id_]), dtype=basetypes.NODE_ID) - for id_ in old_ids - ] - return np.concatenate(old_ids) - def _get_connected_components(self, node_ids: np.ndarray, layer: int): with TimeIt( f"get_cross_chunk_edges.{layer}", @@ -381,10 +469,7 @@ def _get_connected_components(self, node_ids: np.ndarray, layer: int): for id_ in node_ids: edges_ = cross_edges_d[id_].get(layer, types.empty_2d) cx_edges.append(edges_) - cx_edges = np.concatenate([*cx_edges, np.vstack([node_ids, node_ids]).T]) - temp_map = {k: next(iter(v)) for k, v in self._old_new_id_d.items()} - cx_edges = fastremap.remap(cx_edges, temp_map, preserve_missing_labels=True) graph, _, _, graph_ids = flatgraph.build_gt_graph(cx_edges, make_directed=True) return flatgraph.connected_components(graph), graph_ids @@ -392,14 +477,14 @@ def _get_layer_node_ids( self, new_ids: np.ndarray, layer: int ) -> Tuple[np.ndarray, np.ndarray]: # get old identities of new IDs - old_ids = self._get_old_ids(new_ids) + old_ids = _get_flipped_ids(self._new_old_id_d, new_ids) # get their parents, then children of those parents - parents = self.cg.get_parents(old_ids, time_stamp=self._last_successful_ts) - node_ids = self.cg.get_children(np.unique(parents), flatten=True) + 
old_parents = self.cg.get_parents(old_ids, time_stamp=self._last_successful_ts) + siblings = self.cg.get_children(np.unique(old_parents), flatten=True) # replace old identities with new IDs - mask = np.in1d(node_ids, old_ids) + mask = np.in1d(siblings, old_ids) node_ids = np.concatenate( - [self._get_new_ids(node_ids[mask]), node_ids[~mask], new_ids] + [_get_flipped_ids(self._old_new_id_d, old_ids), siblings[~mask], new_ids] ) node_ids = np.unique(node_ids) layer_mask = self.cg.get_chunk_layers(node_ids) == layer @@ -423,14 +508,40 @@ def _update_cross_edge_cache(self, parent, children): new_cx_edges_d = {} for layer in range(parent_layer, self.cg.meta.layer_count): - layer_edges = cx_edges_d.get(layer, types.empty_2d) - if len(layer_edges) == 0: + edges = cx_edges_d.get(layer, types.empty_2d) + if len(edges) == 0: continue - new_cx_edges_d[layer] = fastremap.remap( - layer_edges, edge_parents_d, preserve_missing_labels=True - ) + edges = fastremap.remap(edges, edge_parents_d, preserve_missing_labels=True) + new_cx_edges_d[layer] = np.unique(edges, axis=0) + assert np.all(edges[:, 0] == parent) self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d + layer_edges = new_cx_edges_d.get(parent_layer, types.empty_2d) + counterparts = layer_edges[:, 1] + counterpart_cx_edges_d = self.cg.get_cross_chunk_edges(counterparts) + temp_map = { + old_id: parent for old_id in _get_flipped_ids(self._new_old_id_d, [parent]) + } + for counterpart, edges_d in counterpart_cx_edges_d.items(): + val_dict = {} + for layer in range(parent_layer, self.cg.meta.layer_count): + edges = edges_d.get(layer, types.empty_2d) + if edges.size == 0: + continue + assert np.all(edges[:, 0] == counterpart) + edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + edges_d[layer] = edges + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + if not val_dict: + continue + self.cg.cache.cross_chunk_edges_cache[counterpart] = edges_d + row = self.cg.client.mutate_row( + 
serialize_uint64(counterpart), + val_dict, + time_stamp=self._time_stamp, + ) + self.new_entries.append(row) + def _create_new_parents(self, layer: int): """ keep track of old IDs @@ -444,7 +555,6 @@ def _create_new_parents(self, layer: int): parent_layer = layer + 1 new_ids = self._new_ids_d[layer] layer_node_ids = self._get_layer_node_ids(new_ids, layer) - print(layer, layer_node_ids) components, graph_ids = self._get_connected_components(layer_node_ids, layer) new_parent_ids = [] for cc_indices in components: @@ -494,24 +604,17 @@ def run(self) -> Iterable: return self._new_ids_d[self.cg.meta.layer_count] def _update_root_id_lineage(self): - new_root_ids = self._new_ids_d[self.cg.meta.layer_count] - former_root_ids = self._get_old_ids(new_root_ids) - former_root_ids = np.unique(former_root_ids) - - print() - print(former_root_ids, "->", new_root_ids) - print(self.cg.get_children(former_root_ids)) - print(self.cg.get_children(np.array(new_root_ids, dtype=np.uint64))) - assert ( - len(former_root_ids) < 2 or len(new_root_ids) < 2 - ), "Result inconsistent with either split or merge effects." 
- rows = [] - for new_root_id in new_root_ids: + new_roots = self._new_ids_d[self.cg.meta.layer_count] + former_roots = _get_flipped_ids(self._new_old_id_d, new_roots) + former_roots = np.unique(former_roots) + + assert len(former_roots) < 2 or len(new_roots) < 2, "new roots are inconsistent" + for new_root_id in new_roots: val_dict = { - attributes.Hierarchy.FormerParent: np.array(former_root_ids), + attributes.Hierarchy.FormerParent: np.array(former_roots), attributes.OperationLogs.OperationID: self._operation_id, } - rows.append( + self.new_entries.append( self.cg.client.mutate_row( serialize_uint64(new_root_id), val_dict, @@ -519,30 +622,24 @@ def _update_root_id_lineage(self): ) ) - for former_root_id in former_root_ids: + for former_root_id in former_roots: val_dict = { - attributes.Hierarchy.NewParent: np.array(new_root_ids), + attributes.Hierarchy.NewParent: np.array(new_roots), attributes.OperationLogs.OperationID: self._operation_id, } - rows.append( + self.new_entries.append( self.cg.client.mutate_row( serialize_uint64(former_root_id), val_dict, time_stamp=self._time_stamp, ) ) - return rows - def _get_cross_edges_val_dict(self): - print("haha", self.cg.get_cross_chunk_edges([216172782113783809])) + def _get_cross_edges_val_dicts(self): val_dicts = {} for layer in range(2, self.cg.meta.layer_count): new_ids = np.array(self._new_ids_d[layer], dtype=basetypes.NODE_ID) cross_edges_d = self.cg.get_cross_chunk_edges(new_ids) - print() - print(layer, new_ids) - print("cx", cross_edges_d) - print("ch", self.cg.get_children(new_ids)) for id_ in new_ids: val_dict = {} for layer, edges in cross_edges_d[id_].items(): @@ -551,8 +648,7 @@ def _get_cross_edges_val_dict(self): return val_dicts def create_new_entries(self) -> List: - rows = [] - val_dicts = self._get_cross_edges_val_dict() + val_dicts = self._get_cross_edges_val_dicts() for layer in range(2, self.cg.meta.layer_count + 1): new_ids = self._new_ids_d[layer] for id_ in new_ids: @@ -562,7 +658,7 @@ def 
create_new_entries(self) -> List: self.cg.get_chunk_layers(children) ) < self.cg.get_chunk_layer(id_), "Parent layer less than children." val_dict[attributes.Hierarchy.Child] = children - rows.append( + self.new_entries.append( self.cg.client.mutate_row( serialize_uint64(id_), val_dict, @@ -570,11 +666,11 @@ def create_new_entries(self) -> List: ) ) for child_id in children: - rows.append( + self.new_entries.append( self.cg.client.mutate_row( serialize_uint64(child_id), {attributes.Hierarchy.Parent: id_}, time_stamp=self._time_stamp, ) ) - return rows + self._update_root_id_lineage() + self._update_root_id_lineage() From 8f49c30aa46d51139b46a9f1364c999877950e77 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 24 Aug 2023 00:41:22 +0000 Subject: [PATCH 039/116] fix: use function for dry code --- pychunkedgraph/graph/edits.py | 135 +++++++++++++--------------------- 1 file changed, 51 insertions(+), 84 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index ba9481139..7a2a03408 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -178,6 +178,40 @@ def check_fake_edges( return atomic_edges, rows +def _update_neighbor_cross_edges( + cg, new_id: int, cx_edges_d: dict, new_old_id_d: dict, time_stamp +) -> list: + updated_entries = [] + node_layer = cg.get_chunk_layer(new_id) + for cx_layer in range(node_layer, cg.meta.layer_count): + layer_edges = cx_edges_d.get(cx_layer, types.empty_2d) + counterparts = layer_edges[:, 1] + counterpart_cx_edges_d = cg.get_cross_chunk_edges(counterparts) + temp_map = { + old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) + } + for counterpart, edges_d in counterpart_cx_edges_d.items(): + val_dict = {} + for layer in range(2, cg.meta.layer_count): + edges = edges_d.get(layer, types.empty_2d) + if edges.size == 0: + continue + assert np.all(edges[:, 0] == counterpart) + edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + 
edges_d[layer] = edges + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + if not val_dict: + continue + cg.cache.cross_chunk_edges_cache[counterpart] = edges_d + row = cg.client.mutate_row( + serialize_uint64(counterpart), + val_dict, + time_stamp=time_stamp, + ) + updated_entries.append(row) + return updated_entries + + def add_edges( cg, *, @@ -233,33 +267,10 @@ def add_edges( new_cx_edges_d[layer] = edges assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d - - # must also update cross chunk edges in reverse (counterparts) - layer_edges = new_cx_edges_d.get(2, types.empty_2d) - counterparts = layer_edges[:, 1] - counterpart_cx_edges_d = cg.get_cross_chunk_edges(counterparts) - temp_map = { - old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) - } - for counterpart, edges_d in counterpart_cx_edges_d.items(): - val_dict = {} - for layer in range(2, cg.meta.layer_count): - edges = edges_d.get(layer, types.empty_2d) - if edges.size == 0: - continue - assert np.all(edges[:, 0] == counterpart) - edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) - edges_d[layer] = edges - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges - if not val_dict: - continue - cg.cache.cross_chunk_edges_cache[counterpart] = edges_d - row = cg.client.mutate_row( - serialize_uint64(counterpart), - val_dict, - time_stamp=time_stamp, - ) - updated_entries.append(row) + entries = _update_neighbor_cross_edges( + cg, new_id, new_cx_edges_d, new_old_id_d, time_stamp + ) + updated_entries.extend(entries) create_parents = CreateParentNodes( cg, @@ -355,45 +366,23 @@ def remove_edges( ) updated_entries = [] - new_cx_edges_d = cg.get_cross_chunk_edges(new_l2_ids) + cx_edges_d = cg.get_cross_chunk_edges(new_l2_ids) for new_id in new_l2_ids: - cx_edges_d = new_cx_edges_d.get(new_id, {}) - for layer, edges in cx_edges_d.items(): + new_cx_edges_d = cx_edges_d.get(new_id, {}) + for layer, edges in 
new_cx_edges_d.items(): svs = np.unique(edges) parents = cg.get_parents(svs) temp_map = dict(zip(svs, parents)) edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) edges = np.unique(edges, axis=0) - cx_edges_d[layer] = edges + new_cx_edges_d[layer] = edges assert np.all(edges[:, 0] == new_id) - cg.cache.cross_chunk_edges_cache[new_id] = cx_edges_d - - layer_edges = cx_edges_d.get(2, types.empty_2d) - counterparts = layer_edges[:, 1] - counterpart_cx_edges_d = cg.get_cross_chunk_edges(counterparts) - temp_map = { - old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) - } - for counterpart, edges_d in counterpart_cx_edges_d.items(): - val_dict = {} - for layer in range(2, cg.meta.layer_count): - edges = edges_d.get(layer, types.empty_2d) - if edges.size == 0: - continue - assert np.all(edges[:, 0] == counterpart) - edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) - edges_d[layer] = edges - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges - if not val_dict: - continue - cg.cache.cross_chunk_edges_cache[counterpart] = edges_d - row = cg.client.mutate_row( - serialize_uint64(counterpart), - val_dict, - time_stamp=time_stamp, - ) - updated_entries.append(row) + cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d + entries = _update_neighbor_cross_edges( + cg, new_id, new_cx_edges_d, new_old_id_d, time_stamp + ) + updated_entries.extend(entries) create_parents = CreateParentNodes( cg, @@ -515,32 +504,10 @@ def _update_cross_edge_cache(self, parent, children): new_cx_edges_d[layer] = np.unique(edges, axis=0) assert np.all(edges[:, 0] == parent) self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d - - layer_edges = new_cx_edges_d.get(parent_layer, types.empty_2d) - counterparts = layer_edges[:, 1] - counterpart_cx_edges_d = self.cg.get_cross_chunk_edges(counterparts) - temp_map = { - old_id: parent for old_id in _get_flipped_ids(self._new_old_id_d, [parent]) - } - for counterpart, 
edges_d in counterpart_cx_edges_d.items(): - val_dict = {} - for layer in range(parent_layer, self.cg.meta.layer_count): - edges = edges_d.get(layer, types.empty_2d) - if edges.size == 0: - continue - assert np.all(edges[:, 0] == counterpart) - edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) - edges_d[layer] = edges - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges - if not val_dict: - continue - self.cg.cache.cross_chunk_edges_cache[counterpart] = edges_d - row = self.cg.client.mutate_row( - serialize_uint64(counterpart), - val_dict, - time_stamp=self._time_stamp, - ) - self.new_entries.append(row) + entries = _update_neighbor_cross_edges( + self.cg, parent, new_cx_edges_d, self._new_old_id_d, self._time_stamp + ) + self.new_entries.extend(entries) def _create_new_parents(self, layer: int): """ From fa901b2b48850baa974f66ece8639ca89a68dd02 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 28 Aug 2023 21:02:26 +0000 Subject: [PATCH 040/116] fix: mask skipped nodes --- pychunkedgraph/ingest/create/abstract_layers.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 9a339443f..df6375c5f 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -214,8 +214,10 @@ def _write( layer_edges = node_cx_edges_d[layer] edges_nodes = np.unique(layer_edges) - edges_nodes_parents = cg.get_parents(edges_nodes) - temp_map = dict(zip(edges_nodes, edges_nodes_parents)) + edges_nodes_layers = cg.get_chunk_layers(edges_nodes) + mask = edges_nodes_layers < layer_id - 1 + edges_nodes_parents = cg.get_parents(edges_nodes[mask]) + temp_map = dict(zip(edges_nodes[mask], edges_nodes_parents)) layer_edges = fastremap.remap( layer_edges, temp_map, preserve_missing_labels=True @@ -230,7 +232,9 @@ def _write( row_id = serializers.serialize_uint64(parent_id) 
val_dict = {attributes.Hierarchy.Child: node_ids} - parent_cx_edges_d = concatenate_cross_edge_dicts(children_cx_edges, unique=True) + parent_cx_edges_d = concatenate_cross_edge_dicts( + children_cx_edges, unique=True + ) for layer in range(parent_layer, cg.meta.layer_count): if not layer in parent_cx_edges_d: continue From 3f2759c6b945f7269d400fe1b4526f4cfa118551 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 28 Aug 2023 21:03:46 +0000 Subject: [PATCH 041/116] fix: use the correct layer variable --- pychunkedgraph/ingest/create/abstract_layers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index df6375c5f..d65e225a3 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -190,8 +190,9 @@ def _write( for i_cc, node_ids in enumerate(cc_connections[parent_layer]): parent_id = reserved_parent_ids[i_cc] - if parent_layer == 3: - # children are from atomic chunks + if layer_id == 3: + # when layer 3 is being processed, children chunks are at layer 2 + # layer 2 chunks at this time will only have atomic cross edges cx_edges_d = cg.get_atomic_cross_edges(node_ids) else: # children are from abstract chunks From d53e0fd6f29190689cd632e25b3ebf48546cd3e9 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 29 Aug 2023 14:58:10 +0000 Subject: [PATCH 042/116] fix: redis pipeline for lower latency --- pychunkedgraph/ingest/cli.py | 29 ++++++++++++++++++++++++++--- pychunkedgraph/ingest/rq_cli.py | 4 ++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 2ad51ca18..997bf768a 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -121,9 +121,32 @@ def ingest_status(): redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) 
layers = range(2, imanager.cg_meta.layer_count + 1) - for layer, layer_count in zip(layers, imanager.cg_meta.layer_chunk_counts): - done = redis.scard(f"{layer}c") - print(f"{layer}\t: {done} / {layer_count}") + layer_counts = imanager.cg_meta.layer_chunk_counts + + pipeline = redis.pipeline() + for layer in layers: + pipeline.scard(f"{layer}c") + queue = Queue(f"l{layer}") + pipeline.llen(queue.key) + pipeline.zcard(queue.failed_job_registry.key) + + results = pipeline.execute() + completed = [] + queued = [] + failed = [] + for i in range(0, len(results), 3): + result = results[i : i + 3] + completed.append(result[0]) + queued.append(result[1]) + failed.append(result[2]) + + print("layer status:") + for layer, done, count in zip(layers, completed, layer_counts): + print(f"{layer}\t: {done} / {count}") + + print("\n\nqueue status:") + for layer, q, f in zip(layers, queued, failed): + print(f"l{layer}\t: queued {q}, failed {f}") @ingest_cli.command("chunk") diff --git a/pychunkedgraph/ingest/rq_cli.py b/pychunkedgraph/ingest/rq_cli.py index 27b9c865d..c9b21ae36 100644 --- a/pychunkedgraph/ingest/rq_cli.py +++ b/pychunkedgraph/ingest/rq_cli.py @@ -1,7 +1,8 @@ +# pylint: disable=invalid-name, missing-function-docstring + """ cli for redis jobs """ -import os import sys import click @@ -14,7 +15,6 @@ from rq.exceptions import NoSuchJobError from rq.registry import StartedJobRegistry from rq.registry import FailedJobRegistry -from flask import current_app from flask.cli import AppGroup from ..utils.redis import REDIS_HOST From cd76cad2ced1297aecbb5f73aae0b29345524cac Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 29 Aug 2023 15:15:24 +0000 Subject: [PATCH 043/116] fix: pass redis connection --- pychunkedgraph/ingest/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 997bf768a..93bb328c1 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ 
-126,7 +126,7 @@ def ingest_status(): pipeline = redis.pipeline() for layer in layers: pipeline.scard(f"{layer}c") - queue = Queue(f"l{layer}") + queue = Queue(f"l{layer}", connection=redis) pipeline.llen(queue.key) pipeline.zcard(queue.failed_job_registry.key) @@ -146,7 +146,7 @@ def ingest_status(): print("\n\nqueue status:") for layer, q, f in zip(layers, queued, failed): - print(f"l{layer}\t: queued {q}, failed {f}") + print(f"l{layer}\t: queued\t {q}, failed\t {f}") @ingest_cli.command("chunk") From 4a0d1a28c0055d3ae353007e9da1efef81533232 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 29 Aug 2023 17:51:17 +0000 Subject: [PATCH 044/116] fix: version update for deployment --- pychunkedgraph/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index e615ea2b7..528787cfc 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "2.21.1" +__version__ = "3.0.0" From ce27bba28ebf863ea1825af810068ce144c50320 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 29 Aug 2023 17:52:17 +0000 Subject: [PATCH 045/116] fix: status print padding --- pychunkedgraph/ingest/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 93bb328c1..0fe925d78 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -146,7 +146,7 @@ def ingest_status(): print("\n\nqueue status:") for layer, q, f in zip(layers, queued, failed): - print(f"l{layer}\t: queued\t {q}, failed\t {f}") + print(f"l{layer}\t: queued\t {q}\t, failed\t {f}") @ingest_cli.command("chunk") From 23f645894f0e4511145e86ade415eef2f9d4e691 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 30 Aug 2023 02:37:55 +0000 Subject: [PATCH 046/116] fix: filter active edges for split, add timestamp for reading cross chunk edges --- pychunkedgraph/graph/cache.py | 14 ++++-- 
pychunkedgraph/graph/chunkedgraph.py | 14 +++++- pychunkedgraph/graph/edits.py | 67 ++++++++++++++++++++++------ 3 files changed, 76 insertions(+), 19 deletions(-) diff --git a/pychunkedgraph/graph/cache.py b/pychunkedgraph/graph/cache.py index 52fdfd022..d381baa7d 100644 --- a/pychunkedgraph/graph/cache.py +++ b/pychunkedgraph/graph/cache.py @@ -68,11 +68,11 @@ def children_decorated(node_id): return children_decorated(node_id) - def cross_chunk_edges(self, node_id): + def cross_chunk_edges(self, node_id, *, time_stamp: datetime = None): @cached(cache=self.cross_chunk_edges_cache, key=lambda node_id: node_id) def cross_edges_decorated(node_id): edges = self._cg.get_cross_chunk_edges( - np.array([node_id], dtype=NODE_ID), raw_only=True + np.array([node_id], dtype=NODE_ID), raw_only=True, time_stamp=time_stamp ) return edges[node_id] @@ -107,7 +107,9 @@ def children_multiple(self, node_ids: np.ndarray, *, flatten=False): return np.concatenate([*result.values()]) return result - def cross_chunk_edges_multiple(self, node_ids: np.ndarray): + def cross_chunk_edges_multiple( + self, node_ids: np.ndarray, *, time_stamp: datetime = None + ): result = {} node_ids = np.array(node_ids, dtype=NODE_ID) if not node_ids.size: @@ -119,7 +121,11 @@ def cross_chunk_edges_multiple(self, node_ids: np.ndarray): result.update( {id_: edges_ for id_, edges_ in zip(node_ids[mask], cached_edges_)} ) - result.update(self._cg.get_cross_chunk_edges(node_ids[~mask], raw_only=True)) + result.update( + self._cg.get_cross_chunk_edges( + node_ids[~mask], raw_only=True, time_stamp=time_stamp + ) + ) update( self.cross_chunk_edges_cache, node_ids[~mask], diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index a118d4c82..049c7f683 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -318,12 +318,17 @@ def get_atomic_cross_edges(self, l2_ids: typing.Iterable) -> typing.Dict: return result def get_cross_chunk_edges( - 
self, node_ids: typing.Iterable, *, raw_only=False + self, + node_ids: typing.Iterable, + *, + raw_only=False, + time_stamp: typing.Optional[datetime.datetime] = None, ) -> typing.Dict: """ Returns cross edges for `node_ids`. A dict of the form `{node_id: {layer: cross_edges}}`. """ + time_stamp = misc_utils.get_valid_timestamp(time_stamp) if raw_only or not self.cache: result = {} node_ids = np.array(node_ids, dtype=basetypes.NODE_ID) @@ -333,7 +338,12 @@ def get_cross_chunk_edges( attributes.Connectivity.CrossChunkEdge[l] for l in range(2, self.meta.layer_count) ] - node_edges_d_d = self.client.read_nodes(node_ids=node_ids, properties=attrs) + node_edges_d_d = self.client.read_nodes( + node_ids=node_ids, + properties=attrs, + end_time=time_stamp, + end_time_inclusive=True, + ) for id_ in node_ids: try: result[id_] = { diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 7a2a03408..c7485a26e 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -179,14 +179,16 @@ def check_fake_edges( def _update_neighbor_cross_edges( - cg, new_id: int, cx_edges_d: dict, new_old_id_d: dict, time_stamp + cg, new_id: int, cx_edges_d: dict, new_old_id_d: dict, *, time_stamp, parent_ts ) -> list: updated_entries = [] node_layer = cg.get_chunk_layer(new_id) for cx_layer in range(node_layer, cg.meta.layer_count): layer_edges = cx_edges_d.get(cx_layer, types.empty_2d) counterparts = layer_edges[:, 1] - counterpart_cx_edges_d = cg.get_cross_chunk_edges(counterparts) + counterpart_cx_edges_d = cg.get_cross_chunk_edges( + counterparts, time_stamp=parent_ts + ) temp_map = { old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) } @@ -233,7 +235,7 @@ def add_edges( ) atomic_children_d = cg.get_children(l2ids) cross_edges_d = merge_cross_edge_dicts( - cg.get_cross_chunk_edges(l2ids), l2_cross_edges_d + cg.get_cross_chunk_edges(l2ids, time_stamp=parent_ts), l2_cross_edges_d ) graph, _, _, graph_ids = 
flatgraph.build_gt_graph(edges, make_directed=True) @@ -268,7 +270,12 @@ def add_edges( assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d entries = _update_neighbor_cross_edges( - cg, new_id, new_cx_edges_d, new_old_id_d, time_stamp + cg, + new_id, + new_cx_edges_d, + new_old_id_d, + time_stamp=time_stamp, + parent_ts=parent_ts, ) updated_entries.extend(entries) @@ -288,7 +295,12 @@ def add_edges( return new_roots, new_l2_ids, updated_entries + create_parents.new_entries -def _process_l2_agglomeration(agg: types.Agglomeration, removed_edges: np.ndarray): +def _process_l2_agglomeration( + cg, + agg: types.Agglomeration, + removed_edges: np.ndarray, + parent_ts: datetime.datetime = None, +): """ For a given L2 id, remove given edges; calculate new connected components. """ @@ -298,6 +310,15 @@ def _process_l2_agglomeration(agg: types.Agglomeration, removed_edges: np.ndarra # cross during edits refers to all edges crossing chunk boundary cross_edges = [agg.out_edges.get_pairs(), agg.cross_edges.get_pairs()] cross_edges = np.concatenate(cross_edges) + + parents = cg.get_parents(cross_edges[:, 0], time_stamp=parent_ts) + assert np.unique(parents).size == 1, "got cross edges from more than one l2 node" + root = cg.get_root(parents[0], time_stamp=parent_ts) + + # inactive edges must be filtered out + neighbor_roots = cg.get_roots(cross_edges[:, 1], time_stamp=parent_ts) + active_mask = neighbor_roots == root + cross_edges = cross_edges[active_mask] cross_edges = cross_edges[~in2d(cross_edges, removed_edges)] isolated_ids = agg.supervoxels[~np.in1d(agg.supervoxels, chunk_edges)] @@ -350,7 +371,9 @@ def remove_edges( new_l2_ids = [] for id_ in l2ids: agg = l2id_agglomeration_d[id_] - ccs, graph_ids, cross_edges = _process_l2_agglomeration(agg, removed_edges) + ccs, graph_ids, cross_edges = _process_l2_agglomeration( + cg, agg, removed_edges, parent_ts + ) new_parents = cg.id_client.create_node_ids(chunk_id_map[agg.node_id], 
len(ccs)) cross_edge_layers = cg.get_cross_chunk_edges_layer(cross_edges) @@ -366,7 +389,7 @@ def remove_edges( ) updated_entries = [] - cx_edges_d = cg.get_cross_chunk_edges(new_l2_ids) + cx_edges_d = cg.get_cross_chunk_edges(new_l2_ids, time_stamp=parent_ts) for new_id in new_l2_ids: new_cx_edges_d = cx_edges_d.get(new_id, {}) for layer, edges in new_cx_edges_d.items(): @@ -380,7 +403,12 @@ def remove_edges( assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d entries = _update_neighbor_cross_edges( - cg, new_id, new_cx_edges_d, new_old_id_d, time_stamp + cg, + new_id, + new_cx_edges_d, + new_old_id_d, + time_stamp=time_stamp, + parent_ts=parent_ts, ) updated_entries.extend(entries) @@ -452,7 +480,9 @@ def _get_connected_components(self, node_ids: np.ndarray, layer: int): self.cg.graph_id, self._operation_id, ): - cross_edges_d = self.cg.get_cross_chunk_edges(node_ids) + cross_edges_d = self.cg.get_cross_chunk_edges( + node_ids, time_stamp=self._last_successful_ts + ) cx_edges = [types.empty_2d] for id_ in node_ids: @@ -485,7 +515,9 @@ def _update_cross_edge_cache(self, parent, children): updates cross chunk edges in cache; this can only be done after all new components at a layer have IDs """ - cx_edges_d = self.cg.get_cross_chunk_edges(children) + cx_edges_d = self.cg.get_cross_chunk_edges( + children, time_stamp=self._last_successful_ts + ) cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values(), unique=True) parent_layer = self.cg.get_chunk_layer(parent) @@ -505,7 +537,12 @@ def _update_cross_edge_cache(self, parent, children): assert np.all(edges[:, 0] == parent) self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d entries = _update_neighbor_cross_edges( - self.cg, parent, new_cx_edges_d, self._new_old_id_d, self._time_stamp + self.cg, + parent, + new_cx_edges_d, + self._new_old_id_d, + time_stamp=self._time_stamp, + parent_ts=self._last_successful_ts, ) self.new_entries.extend(entries) @@ -530,7 
+567,9 @@ def _create_new_parents(self, layer: int): # skip connection parent_layer = self.cg.meta.layer_count for l in range(layer + 1, self.cg.meta.layer_count): - cx_edges_d = self.cg.get_cross_chunk_edges([cc_ids[0]]) + cx_edges_d = self.cg.get_cross_chunk_edges( + [cc_ids[0]], time_stamp=self._last_successful_ts + ) if len(cx_edges_d[cc_ids[0]].get(l, types.empty_2d)) > 0: parent_layer = l break @@ -606,7 +645,9 @@ def _get_cross_edges_val_dicts(self): val_dicts = {} for layer in range(2, self.cg.meta.layer_count): new_ids = np.array(self._new_ids_d[layer], dtype=basetypes.NODE_ID) - cross_edges_d = self.cg.get_cross_chunk_edges(new_ids) + cross_edges_d = self.cg.get_cross_chunk_edges( + new_ids, time_stamp=self._last_successful_ts + ) for id_ in new_ids: val_dict = {} for layer, edges in cross_edges_d[id_].items(): From 64a678bb5799567b18f90be1b7b84562061f2b8f Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 30 Aug 2023 14:06:29 +0000 Subject: [PATCH 047/116] fix: get roots no cache flag --- pychunkedgraph/graph/chunkedgraph.py | 11 +++++++++-- pychunkedgraph/graph/edits.py | 5 ++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 049c7f683..a3c9aafc3 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -364,6 +364,7 @@ def get_roots( stop_layer: int = None, ceil: bool = True, fail_to_zero: bool = False, + raw_only=False, n_tries: int = 1, ) -> typing.Union[np.ndarray, typing.Dict[int, np.ndarray]]: """ @@ -387,7 +388,10 @@ def get_roots( filtered_ids = parent_ids[layer_mask] unique_ids, inverse = np.unique(filtered_ids, return_inverse=True) temp_ids = self.get_parents( - unique_ids, time_stamp=time_stamp, fail_to_zero=fail_to_zero + unique_ids, + time_stamp=time_stamp, + fail_to_zero=fail_to_zero, + raw_only=raw_only, ) if not temp_ids.size: break @@ -442,6 +446,7 @@ def get_root( get_all_parents: bool = 
False, stop_layer: int = None, ceil: bool = True, + raw_only: bool = False, n_tries: int = 1, ) -> typing.Union[typing.List[np.uint64], np.uint64]: """Takes a node id and returns the associated agglomeration ids.""" @@ -459,7 +464,9 @@ def get_root( for _ in range(n_tries): parent_id = node_id for _ in range(self.get_chunk_layer(node_id), int(stop_layer + 1)): - temp_parent_id = self.get_parent(parent_id, time_stamp=time_stamp) + temp_parent_id = self.get_parent( + parent_id, time_stamp=time_stamp, raw_only=raw_only + ) if temp_parent_id is None: break else: diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index c7485a26e..709c2dadc 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -316,7 +316,10 @@ def _process_l2_agglomeration( root = cg.get_root(parents[0], time_stamp=parent_ts) # inactive edges must be filtered out - neighbor_roots = cg.get_roots(cross_edges[:, 1], time_stamp=parent_ts) + # we must avoid the cache to read roots to get segment state before edit began + neighbor_roots = cg.get_roots( + cross_edges[:, 1], raw_only=True, time_stamp=parent_ts + ) active_mask = neighbor_roots == root cross_edges = cross_edges[active_mask] cross_edges = cross_edges[~in2d(cross_edges, removed_edges)] From aee03d27304bca3411256c027242b09d40b108bb Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 31 Aug 2023 12:51:51 +0000 Subject: [PATCH 048/116] fix: parent and roots no cache --- pychunkedgraph/graph/edits.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 709c2dadc..dd53354d8 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -311,9 +311,9 @@ def _process_l2_agglomeration( cross_edges = [agg.out_edges.get_pairs(), agg.cross_edges.get_pairs()] cross_edges = np.concatenate(cross_edges) - parents = cg.get_parents(cross_edges[:, 0], time_stamp=parent_ts) + parents = 
cg.get_parents(cross_edges[:, 0], time_stamp=parent_ts, raw_only=True) assert np.unique(parents).size == 1, "got cross edges from more than one l2 node" - root = cg.get_root(parents[0], time_stamp=parent_ts) + root = cg.get_root(parents[0], time_stamp=parent_ts, raw_only=True) # inactive edges must be filtered out # we must avoid the cache to read roots to get segment state before edit began From 8b60346771da257b23f32cdf7e5a432a2f71f672 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 31 Aug 2023 15:52:12 +0000 Subject: [PATCH 049/116] fix: out edges here dont refer to edges crossing chunk --- pychunkedgraph/graph/edits.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index dd53354d8..fd397d5a8 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -307,10 +307,7 @@ def _process_l2_agglomeration( chunk_edges = agg.in_edges.get_pairs() chunk_edges = chunk_edges[~in2d(chunk_edges, removed_edges)] - # cross during edits refers to all edges crossing chunk boundary - cross_edges = [agg.out_edges.get_pairs(), agg.cross_edges.get_pairs()] - cross_edges = np.concatenate(cross_edges) - + cross_edges = agg.cross_edges.get_pairs() parents = cg.get_parents(cross_edges[:, 0], time_stamp=parent_ts, raw_only=True) assert np.unique(parents).size == 1, "got cross edges from more than one l2 node" root = cg.get_root(parents[0], time_stamp=parent_ts, raw_only=True) From ecba1cc5b160a4bf4ca815332a7b8c86ee510a52 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 2 Sep 2023 19:42:49 +0000 Subject: [PATCH 050/116] fix: missing timestamps --- pychunkedgraph/graph/edits.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index fd397d5a8..76c708a38 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -394,7 +394,7 @@ def remove_edges( 
new_cx_edges_d = cx_edges_d.get(new_id, {}) for layer, edges in new_cx_edges_d.items(): svs = np.unique(edges) - parents = cg.get_parents(svs) + parents = cg.get_parents(svs, time_stamp=parent_ts) temp_map = dict(zip(svs, parents)) edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) @@ -523,7 +523,10 @@ def _update_cross_edge_cache(self, parent, children): parent_layer = self.cg.get_chunk_layer(parent) edge_nodes = np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) edge_parents = self.cg.get_roots( - edge_nodes, stop_layer=parent_layer, ceil=False + edge_nodes, + stop_layer=parent_layer, + ceil=False, + time_stamp=self._last_successful_ts, ) edge_parents_d = dict(zip(edge_nodes, edge_parents)) From c18bd99cec3a393f1668851e734da4413bb9b474 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 8 Sep 2023 15:53:34 +0000 Subject: [PATCH 051/116] fix: consolidate neighbor nodes cx edge updates --- pychunkedgraph/graph/edits.py | 124 +++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 56 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 76c708a38..f08a5310d 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -179,38 +179,53 @@ def check_fake_edges( def _update_neighbor_cross_edges( - cg, new_id: int, cx_edges_d: dict, new_old_id_d: dict, *, time_stamp, parent_ts -) -> list: - updated_entries = [] - node_layer = cg.get_chunk_layer(new_id) - for cx_layer in range(node_layer, cg.meta.layer_count): - layer_edges = cx_edges_d.get(cx_layer, types.empty_2d) - counterparts = layer_edges[:, 1] - counterpart_cx_edges_d = cg.get_cross_chunk_edges( - counterparts, time_stamp=parent_ts - ) - temp_map = { + cg, new_ids: List[int], new_old_id_d: dict, *, time_stamp, parent_ts +) -> List: + temp_map = {} + for new_id in new_ids: + old_new_d = { old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) } - for counterpart, edges_d in 
counterpart_cx_edges_d.items(): - val_dict = {} - for layer in range(2, cg.meta.layer_count): - edges = edges_d.get(layer, types.empty_2d) - if edges.size == 0: - continue - assert np.all(edges[:, 0] == counterpart) - edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) - edges_d[layer] = edges - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges - if not val_dict: + temp_map.update(old_new_d) + newid_cx_edges_d = cg.get_cross_chunk_edges(new_ids) + + def _get_counterparts(layer) -> set: + result = set() + for new_id in new_ids: + cx_edges_d = newid_cx_edges_d[new_id] + layer_edges = cx_edges_d.get(layer, types.empty_2d) + result.update(layer_edges[:, 1]) + return result + + start_layer = min(cg.get_chunk_layers(new_ids)) + counterparts = set() + for cx_layer in range(start_layer, cg.meta.layer_count): + counterparts.update(_get_counterparts(cx_layer)) + + counterpart_cx_edges_d = cg.get_cross_chunk_edges( + counterparts, time_stamp=parent_ts + ) + + updated_entries = [] + for counterpart, edges_d in counterpart_cx_edges_d.items(): + val_dict = {} + for layer in range(2, cg.meta.layer_count): + edges = edges_d.get(layer, types.empty_2d) + if edges.size == 0: continue - cg.cache.cross_chunk_edges_cache[counterpart] = edges_d - row = cg.client.mutate_row( - serialize_uint64(counterpart), - val_dict, - time_stamp=time_stamp, - ) - updated_entries.append(row) + assert np.all(edges[:, 0] == counterpart) + edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + edges_d[layer] = edges + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + if not val_dict: + continue + cg.cache.cross_chunk_edges_cache[counterpart] = edges_d + row = cg.client.mutate_row( + serialize_uint64(counterpart), + val_dict, + time_stamp=time_stamp, + ) + updated_entries.append(row) return updated_entries @@ -269,15 +284,14 @@ def add_edges( new_cx_edges_d[layer] = edges assert np.all(edges[:, 0] == new_id) 
cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d - entries = _update_neighbor_cross_edges( - cg, - new_id, - new_cx_edges_d, - new_old_id_d, - time_stamp=time_stamp, - parent_ts=parent_ts, - ) - updated_entries.extend(entries) + entries = _update_neighbor_cross_edges( + cg, + new_l2_ids, + new_old_id_d, + time_stamp=time_stamp, + parent_ts=parent_ts, + ) + updated_entries.extend(entries) create_parents = CreateParentNodes( cg, @@ -402,15 +416,14 @@ def remove_edges( new_cx_edges_d[layer] = edges assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d - entries = _update_neighbor_cross_edges( - cg, - new_id, - new_cx_edges_d, - new_old_id_d, - time_stamp=time_stamp, - parent_ts=parent_ts, - ) - updated_entries.extend(entries) + entries = _update_neighbor_cross_edges( + cg, + new_l2_ids, + new_old_id_d, + time_stamp=time_stamp, + parent_ts=parent_ts, + ) + updated_entries.extend(entries) create_parents = CreateParentNodes( cg, @@ -539,15 +552,6 @@ def _update_cross_edge_cache(self, parent, children): new_cx_edges_d[layer] = np.unique(edges, axis=0) assert np.all(edges[:, 0] == parent) self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d - entries = _update_neighbor_cross_edges( - self.cg, - parent, - new_cx_edges_d, - self._new_old_id_d, - time_stamp=self._time_stamp, - parent_ts=self._last_successful_ts, - ) - self.new_entries.extend(entries) def _create_new_parents(self, layer: int): """ @@ -594,6 +598,14 @@ def _create_new_parents(self, layer: int): for new_id in new_parent_ids: children = self.cg.get_children(new_id) self._update_cross_edge_cache(new_id, children) + entries = _update_neighbor_cross_edges( + self.cg, + new_parent_ids, + self._new_old_id_d, + time_stamp=self._time_stamp, + parent_ts=self._last_successful_ts, + ) + self.new_entries.extend(entries) def run(self) -> Iterable: """ From 51554a3a36295d04f554d3f59f70cc94ec7f6017 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 8 Sep 2023 
15:57:59 +0000 Subject: [PATCH 052/116] fix: set to list for np.array --- pychunkedgraph/graph/edits.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index f08a5310d..3797e2082 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -202,12 +202,9 @@ def _get_counterparts(layer) -> set: for cx_layer in range(start_layer, cg.meta.layer_count): counterparts.update(_get_counterparts(cx_layer)) - counterpart_cx_edges_d = cg.get_cross_chunk_edges( - counterparts, time_stamp=parent_ts - ) - + cx_edges_d = cg.get_cross_chunk_edges(list(counterparts), time_stamp=parent_ts) updated_entries = [] - for counterpart, edges_d in counterpart_cx_edges_d.items(): + for counterpart, edges_d in cx_edges_d.items(): val_dict = {} for layer in range(2, cg.meta.layer_count): edges = edges_d.get(layer, types.empty_2d) From 43dc7f9d10b61be4d9231be08473a29d4271eb58 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 8 Sep 2023 16:52:42 +0000 Subject: [PATCH 053/116] fix: use copy=False where possible; some cleanup --- pychunkedgraph/graph/cache.py | 6 +- pychunkedgraph/graph/chunkedgraph.py | 81 +-------------------- pychunkedgraph/graph/chunks/utils.py | 26 ++++--- pychunkedgraph/graph/connectivity/search.py | 47 ------------ pychunkedgraph/graph/edits.py | 6 +- pychunkedgraph/graph/utils/flatgraph.py | 15 +++- 6 files changed, 35 insertions(+), 146 deletions(-) delete mode 100644 pychunkedgraph/graph/connectivity/search.py diff --git a/pychunkedgraph/graph/cache.py b/pychunkedgraph/graph/cache.py index d381baa7d..13fa962ae 100644 --- a/pychunkedgraph/graph/cache.py +++ b/pychunkedgraph/graph/cache.py @@ -79,7 +79,7 @@ def cross_edges_decorated(node_id): return cross_edges_decorated(node_id) def parents_multiple(self, node_ids: np.ndarray, *, time_stamp: datetime = None): - node_ids = np.array(node_ids, dtype=NODE_ID) + node_ids = np.array(node_ids, dtype=NODE_ID, 
copy=False) if not node_ids.size: return node_ids mask = np.in1d(node_ids, np.fromiter(self.parents_cache.keys(), dtype=NODE_ID)) @@ -93,7 +93,7 @@ def parents_multiple(self, node_ids: np.ndarray, *, time_stamp: datetime = None) def children_multiple(self, node_ids: np.ndarray, *, flatten=False): result = {} - node_ids = np.array(node_ids, dtype=NODE_ID) + node_ids = np.array(node_ids, dtype=NODE_ID, copy=False) if not node_ids.size: return result mask = np.in1d(node_ids, np.fromiter(self.children_cache.keys(), dtype=NODE_ID)) @@ -111,7 +111,7 @@ def cross_chunk_edges_multiple( self, node_ids: np.ndarray, *, time_stamp: datetime = None ): result = {} - node_ids = np.array(node_ids, dtype=NODE_ID) + node_ids = np.array(node_ids, dtype=NODE_ID, copy=False) if not node_ids.size: return result mask = np.in1d( diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index a3c9aafc3..472257d1e 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -629,7 +629,7 @@ def get_fake_edges( ) for id_, val in fake_edges_d.items(): edges = np.concatenate( - [np.array(e.value, dtype=basetypes.NODE_ID) for e in val] + [np.array(e.value, dtype=basetypes.NODE_ID, copy=False) for e in val] ) result[id_] = Edges(edges[:, 0], edges[:, 1], fake_edges=True) return result @@ -827,82 +827,7 @@ def redo_operation( multicut_as_split=True, ).execute() - # PRIVATE - - def _get_bounding_chunk_ids( - self, - parent_chunk_ids: typing.Iterable, - unique: bool = False, - ) -> typing.Dict: - """ - Returns bounding chunk IDs at layers < parent_layer for all chunk IDs. 
- Dict[parent_chunk_id] = np.array(bounding_chunk_ids) - """ - parent_chunk_coords = self.get_chunk_coordinates_multiple(parent_chunk_ids) - parents_layer = self.get_chunk_layer(parent_chunk_ids[0]) - chunk_id_bchunk_ids_d = {} - for i, chunk_id in enumerate(parent_chunk_ids): - if chunk_id in chunk_id_bchunk_ids_d: - # `parent_chunk_ids` can have duplicates - # avoid redundant calculations - continue - parent_coord = parent_chunk_coords[i] - chunk_ids = [types.empty_1d] - for child_layer in range(2, parents_layer): - bcoords = chunk_utils.get_bounding_children_chunks( - self.meta, - parents_layer, - parent_coord, - child_layer, - return_unique=False, - ) - bchunks_ids = chunk_utils.get_chunk_ids_from_coords( - self.meta, child_layer, bcoords - ) - chunk_ids.append(bchunks_ids) - chunk_ids = np.concatenate(chunk_ids) - if unique: - chunk_ids = np.unique(chunk_ids) - chunk_id_bchunk_ids_d[chunk_id] = chunk_ids - return chunk_id_bchunk_ids_d - - def _get_bounding_l2_children(self, parents: typing.Iterable) -> typing.Dict: - parent_chunk_ids = self.get_chunk_ids_from_node_ids(parents) - chunk_id_bchunk_ids_d = self._get_bounding_chunk_ids( - parent_chunk_ids, unique=len(parents) >= 200 - ) - - parent_descendants_d = { - _id: np.array([_id], dtype=basetypes.NODE_ID) for _id in parents - } - descendants_all = np.concatenate(list(parent_descendants_d.values())) - descendants_layers = self.get_chunk_layers(descendants_all) - layer_mask = descendants_layers > 2 - descendants_all = descendants_all[layer_mask] - - while descendants_all.size: - descendant_children_d = self.get_children(descendants_all) - for i, parent_id in enumerate(parents): - _descendants = parent_descendants_d[parent_id] - _layers = self.get_chunk_layers(_descendants) - _l2mask = _layers == 2 - descendants = [_descendants[_l2mask]] - for child in _descendants[~_l2mask]: - descendants.append(descendant_children_d[child]) - descendants = np.concatenate(descendants) - chunk_ids = 
self.get_chunk_ids_from_node_ids(descendants) - bchunk_ids = chunk_id_bchunk_ids_d[parent_chunk_ids[i]] - bounding_descendants = descendants[np.in1d(chunk_ids, bchunk_ids)] - parent_descendants_d[parent_id] = bounding_descendants - - descendants_all = np.concatenate(list(parent_descendants_d.values())) - descendants_layers = self.get_chunk_layers(descendants_all) - layer_mask = descendants_layers > 2 - descendants_all = descendants_all[layer_mask] - return parent_descendants_d - # HELPERS / WRAPPERS - def is_root(self, node_id: basetypes.NODE_ID) -> bool: return self.get_chunk_layer(node_id) == self.meta.layer_count @@ -940,7 +865,9 @@ def get_chunk_coordinates(self, node_or_chunk_id: basetypes.NODE_ID): return chunk_utils.get_chunk_coordinates(self.meta, node_or_chunk_id) def get_chunk_coordinates_multiple(self, node_or_chunk_ids: typing.Sequence): - node_or_chunk_ids = np.array(node_or_chunk_ids, dtype=basetypes.NODE_ID) + node_or_chunk_ids = np.array( + node_or_chunk_ids, dtype=basetypes.NODE_ID, copy=False + ) layers = self.get_chunk_layers(node_or_chunk_ids) assert np.all(layers == layers[0]), "All IDs must have the same layer." 
return chunk_utils.get_chunk_coordinates_multiple(self.meta, node_or_chunk_ids) diff --git a/pychunkedgraph/graph/chunks/utils.py b/pychunkedgraph/graph/chunks/utils.py index dc895bde4..4d01258bd 100644 --- a/pychunkedgraph/graph/chunks/utils.py +++ b/pychunkedgraph/graph/chunks/utils.py @@ -8,6 +8,7 @@ import numpy as np + def get_chunks_boundary(voxel_boundary, chunk_size) -> np.ndarray: """returns number of chunks in each dimension""" return np.ceil((voxel_boundary / chunk_size)).astype(int) @@ -43,7 +44,7 @@ def normalize_bounding_box( def get_chunk_layer(meta, node_or_chunk_id: np.uint64) -> int: - """ Extract Layer from Node ID or Chunk ID """ + """Extract Layer from Node ID or Chunk ID""" return int(int(node_or_chunk_id) >> 64 - meta.graph_config.LAYER_ID_BITS) @@ -75,9 +76,9 @@ def get_chunk_coordinates(meta, node_or_chunk_id: np.uint64) -> np.ndarray: y_offset = x_offset - bits_per_dim z_offset = y_offset - bits_per_dim - x = int(node_or_chunk_id) >> x_offset & 2 ** bits_per_dim - 1 - y = int(node_or_chunk_id) >> y_offset & 2 ** bits_per_dim - 1 - z = int(node_or_chunk_id) >> z_offset & 2 ** bits_per_dim - 1 + x = int(node_or_chunk_id) >> x_offset & 2**bits_per_dim - 1 + y = int(node_or_chunk_id) >> y_offset & 2**bits_per_dim - 1 + z = int(node_or_chunk_id) >> z_offset & 2**bits_per_dim - 1 return np.array([x, y, z]) @@ -86,7 +87,7 @@ def get_chunk_coordinates_multiple(meta, ids: np.ndarray) -> np.ndarray: Array version of get_chunk_coordinates. Assumes all given IDs are in same layer. 
""" - if not len(ids): + if len(ids) == 0: return np.array([]) layer = get_chunk_layer(meta, ids[0]) bits_per_dim = meta.bitmasks[layer] @@ -95,10 +96,10 @@ def get_chunk_coordinates_multiple(meta, ids: np.ndarray) -> np.ndarray: y_offset = x_offset - bits_per_dim z_offset = y_offset - bits_per_dim - ids = np.array(ids, dtype=int) - X = ids >> x_offset & 2 ** bits_per_dim - 1 - Y = ids >> y_offset & 2 ** bits_per_dim - 1 - Z = ids >> z_offset & 2 ** bits_per_dim - 1 + ids = np.array(ids, dtype=int, copy=False) + X = ids >> x_offset & 2**bits_per_dim - 1 + Y = ids >> y_offset & 2**bits_per_dim - 1 + Z = ids >> z_offset & 2**bits_per_dim - 1 return np.column_stack((X, Y, Z)) @@ -142,14 +143,15 @@ def get_chunk_ids_from_coords(meta, layer: int, coords: np.ndarray): def get_chunk_ids_from_node_ids(meta, ids: Iterable[np.uint64]) -> np.ndarray: - """ Extract Chunk IDs from Node IDs""" + """Extract Chunk IDs from Node IDs""" if len(ids) == 0: return np.array([], dtype=np.uint64) bits_per_dims = np.array([meta.bitmasks[l] for l in get_chunk_layers(meta, ids)]) offsets = 64 - meta.graph_config.LAYER_ID_BITS - 3 * bits_per_dims - cids1 = np.array((np.array(ids, dtype=int) >> offsets) << offsets, dtype=np.uint64) + ids = np.array(ids, dtype=int, copy=False) + cids1 = np.array((ids >> offsets) << offsets, dtype=np.uint64) # cids2 = np.vectorize(get_chunk_id)(meta, ids) # assert np.all(cids1 == cids2) return cids1 @@ -164,7 +166,7 @@ def _compute_chunk_id( ) -> np.uint64: s_bits_per_dim = meta.bitmasks[layer] if not ( - x < 2 ** s_bits_per_dim and y < 2 ** s_bits_per_dim and z < 2 ** s_bits_per_dim + x < 2**s_bits_per_dim and y < 2**s_bits_per_dim and z < 2**s_bits_per_dim ): raise ValueError( f"Coordinate is out of range \ diff --git a/pychunkedgraph/graph/connectivity/search.py b/pychunkedgraph/graph/connectivity/search.py deleted file mode 100644 index bd3faf227..000000000 --- a/pychunkedgraph/graph/connectivity/search.py +++ /dev/null @@ -1,47 +0,0 @@ -import random -from 
typing import List - -import numpy as np -from graph_tool.search import bfs_search -from graph_tool.search import BFSVisitor -from graph_tool.search import StopSearch - -from ..utils.basetypes import NODE_ID - - -class TargetVisitor(BFSVisitor): - def __init__(self, target, reachable): - self.target = target - self.reachable = reachable - - def discover_vertex(self, u): - if u == self.target: - self.reachable[u] = 1 - raise StopSearch - - -def check_reachability(g, sv1s: np.ndarray, sv2s: np.ndarray, original_ids: np.ndarray) -> np.ndarray: - """ - g: graph tool Graph instance with ids 0 to N-1 where N = vertex count - original_ids: sorted ChunkedGraph supervoxel ids - (to identify corresponding ids in graph tool) - for each pair (sv1, sv2) check if a path exists (BFS) - """ - # mapping from original ids to graph tool ids - original_ids_d = { - sv_id: index for sv_id, index in zip(original_ids, range(len(original_ids))) - } - reachable = g.new_vertex_property("int", val=0) - - def _check_reachability(source, target): - bfs_search(g, source, TargetVisitor(target, reachable)) - return reachable[target] - - return np.array( - [ - _check_reachability(original_ids_d[source], original_ids_d[target]) - for source, target in zip(sv1s, sv2s) - ], - dtype=bool, - ) - diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 3797e2082..6792f2f7d 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -441,7 +441,7 @@ def _get_flipped_ids(id_map, node_ids): """ returns old or new ids according to the map """ - ids = [np.array(list(id_map[id_]), dtype=basetypes.NODE_ID) for id_ in node_ids] + ids = [np.array(list(id_map[id_]), dtype=basetypes.NODE_ID, copy=False) for id_ in node_ids] return np.concatenate(ids) @@ -629,7 +629,7 @@ def _update_root_id_lineage(self): assert len(former_roots) < 2 or len(new_roots) < 2, "new roots are inconsistent" for new_root_id in new_roots: val_dict = { - attributes.Hierarchy.FormerParent: 
np.array(former_roots), + attributes.Hierarchy.FormerParent: former_roots, attributes.OperationLogs.OperationID: self._operation_id, } self.new_entries.append( @@ -642,7 +642,7 @@ def _update_root_id_lineage(self): for former_root_id in former_roots: val_dict = { - attributes.Hierarchy.NewParent: np.array(new_roots), + attributes.Hierarchy.NewParent: new_roots, attributes.OperationLogs.OperationID: self._operation_id, } self.new_entries.append( diff --git a/pychunkedgraph/graph/utils/flatgraph.py b/pychunkedgraph/graph/utils/flatgraph.py index df469d728..03cb6e2d2 100644 --- a/pychunkedgraph/graph/utils/flatgraph.py +++ b/pychunkedgraph/graph/utils/flatgraph.py @@ -1,8 +1,11 @@ +# pylint: disable=invalid-name, missing-docstring, c-extension-no-member + +from itertools import combinations, chain + import fastremap import numpy as np -from itertools import combinations, chain from graph_tool import Graph, GraphView -from graph_tool import topology, search +from graph_tool import topology def build_gt_graph( @@ -88,7 +91,10 @@ def team_paths_all_to_all(graph, capacity, team_vertex_ids): def neighboring_edges(graph, vertex_id): - """Returns vertex and edge lists of a seed vertex, in the same format as team_paths_all_to_all.""" + """ + Returns vertex and edge lists of a seed vertex, + in the same format as team_paths_all_to_all. 
+ """ add_v = [] add_e = [] v0 = graph.vertex(vertex_id) @@ -124,7 +130,8 @@ def compute_filtered_paths( gfilt, capacity, team_vertex_ids ) - # graph-tool will invalidate the vertex and edge properties if I don't rebase them on the main graph + # graph-tool will invalidate the vertex and + # edge properties if I don't rebase them on the main graph # before tearing down the GraphView new_paths_e = [] for pth in paths_e: From d4bf0226c944be15f1e9ba434832f609faadd9b6 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 8 Sep 2023 16:56:09 +0000 Subject: [PATCH 054/116] fix: attribute type must be np.array --- pychunkedgraph/graph/edits.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 6792f2f7d..278cb92db 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -441,7 +441,10 @@ def _get_flipped_ids(id_map, node_ids): """ returns old or new ids according to the map """ - ids = [np.array(list(id_map[id_]), dtype=basetypes.NODE_ID, copy=False) for id_ in node_ids] + ids = [ + np.array(list(id_map[id_]), dtype=basetypes.NODE_ID, copy=False) + for id_ in node_ids + ] return np.concatenate(ids) @@ -642,7 +645,9 @@ def _update_root_id_lineage(self): for former_root_id in former_roots: val_dict = { - attributes.Hierarchy.NewParent: new_roots, + attributes.Hierarchy.NewParent: np.array( + new_roots, dtype=basetypes.NODE_ID + ), attributes.OperationLogs.OperationID: self._operation_id, } self.new_entries.append( From e19440d207674afac1cca9005ec7f61c52761884 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 9 Sep 2023 19:00:17 +0000 Subject: [PATCH 055/116] fix(ingest): worker details in status --- pychunkedgraph/ingest/cli.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 0fe925d78..89106a097 100644 --- a/pychunkedgraph/ingest/cli.py +++ 
b/pychunkedgraph/ingest/cli.py @@ -11,6 +11,8 @@ import yaml from flask.cli import AppGroup from rq import Queue +from rq import Worker +from rq.worker import WorkerStatus from .cluster import create_atomic_chunk from .cluster import create_parent_chunk @@ -124,11 +126,14 @@ def ingest_status(): layer_counts = imanager.cg_meta.layer_chunk_counts pipeline = redis.pipeline() + worker_busy = [] for layer in layers: pipeline.scard(f"{layer}c") queue = Queue(f"l{layer}", connection=redis) pipeline.llen(queue.key) pipeline.zcard(queue.failed_job_registry.key) + workers = Worker.all(queue=queue) + worker_busy.append(sum([w.get_state() == WorkerStatus.BUSY for w in workers])) results = pipeline.execute() completed = [] @@ -140,13 +145,16 @@ def ingest_status(): queued.append(result[1]) failed.append(result[2]) - print("layer status:") + print(f"version: \t{imanager.cg.version}") + print(f"graph_id: \t{imanager.cg.graph_id}") + print(f"chunk_size: \t{imanager.cg.meta.graph_config.CHUNK_SIZE}") + print("\nlayer status:") for layer, done, count in zip(layers, completed, layer_counts): print(f"{layer}\t: {done} / {count}") print("\n\nqueue status:") - for layer, q, f in zip(layers, queued, failed): - print(f"l{layer}\t: queued\t {q}\t, failed\t {f}") + for layer, q, f, wb in zip(layers, queued, failed, worker_busy): + print(f"l{layer}\t: queued: {q}\t\t failed: {f}\t\t busy: {wb}") @ingest_cli.command("chunk") From 93a2f279c33adf4102734dfe85f7868eb90d7b6d Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 9 Sep 2023 19:44:07 +0000 Subject: [PATCH 056/116] fix: handle empty input --- pychunkedgraph/graph/edits.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 278cb92db..6c7176924 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -441,6 +441,8 @@ def _get_flipped_ids(id_map, node_ids): """ returns old or new ids according to the map """ + if len(node_ids) == 0: + 
return types.empty_1d ids = [ np.array(list(id_map[id_]), dtype=basetypes.NODE_ID, copy=False) for id_ in node_ids From b2e0b94ca902659f4cf68b45859ba814362ede07 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 9 Sep 2023 20:00:52 +0000 Subject: [PATCH 057/116] fix: use empty array instead --- pychunkedgraph/graph/edits.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 6c7176924..17502ddda 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -441,12 +441,11 @@ def _get_flipped_ids(id_map, node_ids): """ returns old or new ids according to the map """ - if len(node_ids) == 0: - return types.empty_1d ids = [ np.array(list(id_map[id_]), dtype=basetypes.NODE_ID, copy=False) for id_ in node_ids ] + ids.append(types.empty_1d) # concatenate needs at least one array return np.concatenate(ids) From b75fbde2ddd92cf8e174c2e082f8bf2b522b5c76 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 10 Sep 2023 18:59:55 +0000 Subject: [PATCH 058/116] fix: missed time_stamp --- pychunkedgraph/graph/chunkedgraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 472257d1e..988dd5d89 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -353,7 +353,7 @@ def get_cross_chunk_edges( except KeyError: result[id_] = {} return result - return self.cache.cross_chunk_edges_multiple(node_ids) + return self.cache.cross_chunk_edges_multiple(node_ids, time_stamp=time_stamp) def get_roots( self, From cadede598d7bb0afe5c637aaf9f501f850b4e70a Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 10 Sep 2023 19:15:28 +0000 Subject: [PATCH 059/116] fix: only consolidate cx_edge writes; update per new_id --- pychunkedgraph/graph/edits.py | 76 +++++++++++++++++------------------ 1 file changed, 37 insertions(+), 39 deletions(-) 
diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 17502ddda..f835577e0 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -178,50 +178,53 @@ def check_fake_edges( return atomic_edges, rows -def _update_neighbor_cross_edges( - cg, new_ids: List[int], new_old_id_d: dict, *, time_stamp, parent_ts -) -> List: - temp_map = {} - for new_id in new_ids: - old_new_d = { - old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) - } - temp_map.update(old_new_d) - newid_cx_edges_d = cg.get_cross_chunk_edges(new_ids) - - def _get_counterparts(layer) -> set: - result = set() - for new_id in new_ids: - cx_edges_d = newid_cx_edges_d[new_id] - layer_edges = cx_edges_d.get(layer, types.empty_2d) - result.update(layer_edges[:, 1]) - return result - - start_layer = min(cg.get_chunk_layers(new_ids)) - counterparts = set() - for cx_layer in range(start_layer, cg.meta.layer_count): - counterparts.update(_get_counterparts(cx_layer)) - - cx_edges_d = cg.get_cross_chunk_edges(list(counterparts), time_stamp=parent_ts) - updated_entries = [] - for counterpart, edges_d in cx_edges_d.items(): +def _update_neighbor_cross_edges_single( + cg, new_id: int, cx_edges_d: dict, node_map: dict, *, parent_ts +) -> dict: + node_layer = cg.get_chunk_layer(new_id) + counterparts = [] + for layer in range(node_layer, cg.meta.layer_count): + layer_edges = cx_edges_d.get(layer, types.empty_2d) + counterparts.extend(layer_edges[:, 1]) + + cp_cx_edges_d = cg.get_cross_chunk_edges(counterparts, time_stamp=parent_ts) + updated_counterparts = {} + for counterpart, edges_d in cp_cx_edges_d.items(): val_dict = {} for layer in range(2, cg.meta.layer_count): edges = edges_d.get(layer, types.empty_2d) if edges.size == 0: continue assert np.all(edges[:, 0] == counterpart) - edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + edges = fastremap.remap(edges, node_map, preserve_missing_labels=True) edges_d[layer] = edges 
val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges if not val_dict: continue cg.cache.cross_chunk_edges_cache[counterpart] = edges_d - row = cg.client.mutate_row( - serialize_uint64(counterpart), - val_dict, - time_stamp=time_stamp, + updated_counterparts[counterpart] = val_dict + return updated_counterparts + + +def _update_neighbor_cross_edges( + cg, new_ids: List[int], new_old_id_d: dict, *, time_stamp, parent_ts +) -> List: + newid_cx_edges_d = cg.get_cross_chunk_edges(new_ids, time_stamp=parent_ts) + updated_counterparts = {} + for new_id in new_ids: + cx_edges_d = newid_cx_edges_d[new_id] + temp_map = { + old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) + } + result = _update_neighbor_cross_edges_single( + cg, new_id, cx_edges_d, temp_map, parent_ts=parent_ts ) + updated_counterparts.update(result) + + updated_entries = [] + for node, val_dict in updated_counterparts.items(): + rowkey = serialize_uint64(node) + row = cg.client.mutate_row(rowkey, val_dict, time_stamp=time_stamp) updated_entries.append(row) return updated_entries @@ -269,7 +272,6 @@ def add_edges( # update cross chunk edges by replacing old_ids with new # this can be done only after all new IDs have been created - updated_entries = [] for new_id, cc_indices in zip(new_l2_ids, components): l2ids_ = graph_ids[cc_indices] new_cx_edges_d = {} @@ -281,14 +283,13 @@ def add_edges( new_cx_edges_d[layer] = edges assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d - entries = _update_neighbor_cross_edges( + updated_entries = _update_neighbor_cross_edges( cg, new_l2_ids, new_old_id_d, time_stamp=time_stamp, parent_ts=parent_ts, ) - updated_entries.extend(entries) create_parents = CreateParentNodes( cg, @@ -399,7 +400,6 @@ def remove_edges( graph_ids[cc], cross_edges, cross_edge_layers ) - updated_entries = [] cx_edges_d = cg.get_cross_chunk_edges(new_l2_ids, time_stamp=parent_ts) for new_id in new_l2_ids: new_cx_edges_d = 
cx_edges_d.get(new_id, {}) @@ -413,14 +413,13 @@ def remove_edges( new_cx_edges_d[layer] = edges assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d - entries = _update_neighbor_cross_edges( + updated_entries = _update_neighbor_cross_edges( cg, new_l2_ids, new_old_id_d, time_stamp=time_stamp, parent_ts=parent_ts, ) - updated_entries.extend(entries) create_parents = CreateParentNodes( cg, @@ -595,7 +594,6 @@ def _create_new_parents(self, layer: int): cc_ids, parent_id, ) - for new_id in new_parent_ids: children = self.cg.get_children(new_id) self._update_cross_edge_cache(new_id, children) From f95d8926ed2e86294746e959bfb530a8f862c896 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 11 Sep 2023 15:05:21 +0000 Subject: [PATCH 060/116] fix: reset parent layer in loop --- pychunkedgraph/graph/edits.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index f835577e0..9e186c274 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -563,12 +563,12 @@ def _create_new_parents(self, layer: int): get cross edges of all, find connected components update parent old IDs """ - parent_layer = layer + 1 new_ids = self._new_ids_d[layer] layer_node_ids = self._get_layer_node_ids(new_ids, layer) components, graph_ids = self._get_connected_components(layer_node_ids, layer) new_parent_ids = [] for cc_indices in components: + parent_layer = layer + 1 # must be reset for each connected component cc_ids = graph_ids[cc_indices] if len(cc_ids) == 1: # skip connection From 6637066bfadf6ec7054cbb1b159e1aa589230622 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 11 Sep 2023 15:26:32 +0000 Subject: [PATCH 061/116] fix(ingest): use get_roots with ceil=False instead of get_parents --- pychunkedgraph/ingest/create/abstract_layers.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git 
a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index d65e225a3..718ec74b7 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -212,16 +212,13 @@ def _write( for layer in range(node_layer, cg.meta.layer_count): if not layer in node_cx_edges_d: continue - layer_edges = node_cx_edges_d[layer] - edges_nodes = np.unique(layer_edges) - edges_nodes_layers = cg.get_chunk_layers(edges_nodes) - mask = edges_nodes_layers < layer_id - 1 - edges_nodes_parents = cg.get_parents(edges_nodes[mask]) - temp_map = dict(zip(edges_nodes[mask], edges_nodes_parents)) + nodes = np.unique(layer_edges) + parents = cg.get_roots(nodes, stop_layer=parent_layer, ceil=False) + edge_parents_d = dict(zip(nodes, parents)) layer_edges = fastremap.remap( - layer_edges, temp_map, preserve_missing_labels=True + layer_edges, edge_parents_d, preserve_missing_labels=True ) layer_edges = np.unique(layer_edges, axis=0) From 0393f335e8496847cb62bddf0ec6d1161553c118 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 11 Sep 2023 15:43:44 +0000 Subject: [PATCH 062/116] fix(ingest): incorrect stop_layer --- pychunkedgraph/ingest/create/abstract_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py index 718ec74b7..adbe4a5ab 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/abstract_layers.py @@ -214,7 +214,7 @@ def _write( continue layer_edges = node_cx_edges_d[layer] nodes = np.unique(layer_edges) - parents = cg.get_roots(nodes, stop_layer=parent_layer, ceil=False) + parents = cg.get_roots(nodes, stop_layer=node_layer, ceil=False) edge_parents_d = dict(zip(nodes, parents)) layer_edges = fastremap.remap( From cfa4ba0764fcbad1bed3aa76f34fe4e95140c979 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 12 Sep 2023 
14:07:54 +0000 Subject: [PATCH 063/116] fix: add safeguard to against data corruption --- pychunkedgraph/graph/chunkedgraph.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 988dd5d89..8c3e14166 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -676,6 +676,9 @@ def get_l2_agglomerations( sv_parent_d = {} for l2id in l2id_children_d: svs = l2id_children_d[l2id] + for sv in svs: + if sv in sv_parent_d: + raise ValueError("Found conflicting parents.") sv_parent_d.update(dict(zip(svs.tolist(), [l2id] * len(svs)))) in_edges, out_edges, cross_edges = edge_utils.categorize_edges_v2( From f095925c782fe1e9db57eb66bc1d148e1bd30b1e Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 12 Sep 2023 14:55:09 +0000 Subject: [PATCH 064/116] add another safeguard --- pychunkedgraph/graph/edits.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 9e186c274..5087f503d 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -567,9 +567,18 @@ def _create_new_parents(self, layer: int): layer_node_ids = self._get_layer_node_ids(new_ids, layer) components, graph_ids = self._get_connected_components(layer_node_ids, layer) new_parent_ids = [] + all_old_ids = [] + for v in self._new_old_id_d.values(): + all_old_ids.extend(v) + all_old_ids = np.array(all_old_ids, dtype=basetypes.NODE_ID) + for cc_indices in components: parent_layer = layer + 1 # must be reset for each connected component cc_ids = graph_ids[cc_indices] + mask = np.isin(cc_ids, all_old_ids) + old_ids = cc_ids[mask] + new_ids = _get_flipped_ids(self._old_new_id_d, cc_ids[mask]) + assert np.all(~mask), f"got old ids {old_ids} -> {new_ids}" if len(cc_ids) == 1: # skip connection parent_layer = self.cg.meta.layer_count From 6b0bf9d60651e0c6db3dc54e23a3884924dc6e7d Mon Sep 17 00:00:00 2001 
From: Akhilesh Halageri Date: Tue, 12 Sep 2023 15:02:05 +0000 Subject: [PATCH 065/116] feat: log operation_id in errors --- pychunkedgraph/graph/edits.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 5087f503d..08792108e 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -309,6 +309,7 @@ def add_edges( def _process_l2_agglomeration( cg, + operation_id: int, agg: types.Agglomeration, removed_edges: np.ndarray, parent_ts: datetime.datetime = None, @@ -321,7 +322,8 @@ def _process_l2_agglomeration( cross_edges = agg.cross_edges.get_pairs() parents = cg.get_parents(cross_edges[:, 0], time_stamp=parent_ts, raw_only=True) - assert np.unique(parents).size == 1, "got cross edges from more than one l2 node" + err = f"got cross edges from more than one l2 node; op {operation_id}" + assert np.unique(parents).size == 1, err root = cg.get_root(parents[0], time_stamp=parent_ts, raw_only=True) # inactive edges must be filtered out @@ -384,7 +386,7 @@ def remove_edges( for id_ in l2ids: agg = l2id_agglomeration_d[id_] ccs, graph_ids, cross_edges = _process_l2_agglomeration( - cg, agg, removed_edges, parent_ts + cg, operation_id, agg, removed_edges, parent_ts ) new_parents = cg.id_client.create_node_ids(chunk_id_map[agg.node_id], len(ccs)) @@ -432,6 +434,7 @@ def remove_edges( parent_ts=parent_ts, ) new_roots = create_parents.run() + raise RuntimeError("haha") create_parents.create_new_entries() return new_roots, new_l2_ids, updated_entries + create_parents.new_entries @@ -578,7 +581,8 @@ def _create_new_parents(self, layer: int): mask = np.isin(cc_ids, all_old_ids) old_ids = cc_ids[mask] new_ids = _get_flipped_ids(self._old_new_id_d, cc_ids[mask]) - assert np.all(~mask), f"got old ids {old_ids} -> {new_ids}" + err = f"got old ids {old_ids} -> {new_ids}; op {self._operation_id}" + assert np.all(~mask), err if len(cc_ids) == 1: # skip connection 
parent_layer = self.cg.meta.layer_count @@ -637,7 +641,8 @@ def _update_root_id_lineage(self): former_roots = _get_flipped_ids(self._new_old_id_d, new_roots) former_roots = np.unique(former_roots) - assert len(former_roots) < 2 or len(new_roots) < 2, "new roots are inconsistent" + err = f"new roots are inconsistent; op {self._operation_id}" + assert len(former_roots) < 2 or len(new_roots) < 2, err for new_root_id in new_roots: val_dict = { attributes.Hierarchy.FormerParent: former_roots, @@ -687,9 +692,10 @@ def create_new_entries(self) -> List: for id_ in new_ids: val_dict = val_dicts.get(id_, {}) children = self.cg.get_children(id_) + err = f"parent layer less than children; op {self._operation_id}" assert np.max( self.cg.get_chunk_layers(children) - ) < self.cg.get_chunk_layer(id_), "Parent layer less than children." + ) < self.cg.get_chunk_layer(id_), err val_dict[attributes.Hierarchy.Child] = children self.new_entries.append( self.cg.client.mutate_row( From c86ba6d05e2e9d9c2d978c7cc48b82c875ddbdf0 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 12 Sep 2023 16:24:52 +0000 Subject: [PATCH 066/116] fix: remove temp error --- pychunkedgraph/graph/edits.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 08792108e..dd53f8538 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -434,7 +434,6 @@ def remove_edges( parent_ts=parent_ts, ) new_roots = create_parents.run() - raise RuntimeError("haha") create_parents.create_new_entries() return new_roots, new_l2_ids, updated_entries + create_parents.new_entries From 5337e44cd35d68b399c34a8882c379033fabcc7e Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 12 Sep 2023 20:08:37 +0000 Subject: [PATCH 067/116] add more safeguards --- pychunkedgraph/graph/edits.py | 40 +++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/pychunkedgraph/graph/edits.py 
b/pychunkedgraph/graph/edits.py index dd53f8538..da574db14 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -21,6 +21,7 @@ from .utils.serializers import serialize_uint64 from ..logging.log_db import TimeIt from ..utils.general import in2d +from ..debug.utils import get_l2children def _init_old_hierarchy(cg, l2ids: np.ndarray, parent_ts: datetime.datetime = None): @@ -187,7 +188,9 @@ def _update_neighbor_cross_edges_single( layer_edges = cx_edges_d.get(layer, types.empty_2d) counterparts.extend(layer_edges[:, 1]) - cp_cx_edges_d = cg.get_cross_chunk_edges(counterparts, time_stamp=parent_ts) + cp_cx_edges_d = cg.get_cross_chunk_edges( + counterparts, time_stamp=parent_ts, raw_only=True + ) updated_counterparts = {} for counterpart, edges_d in cp_cx_edges_d.items(): val_dict = {} @@ -207,17 +210,22 @@ def _update_neighbor_cross_edges_single( def _update_neighbor_cross_edges( - cg, new_ids: List[int], new_old_id_d: dict, *, time_stamp, parent_ts + cg, new_ids: List[int], new_old_id_d: dict, old_new_id_d, *, time_stamp, parent_ts ) -> List: - newid_cx_edges_d = cg.get_cross_chunk_edges(new_ids, time_stamp=parent_ts) + node_map = {} + for k, v in old_new_id_d.items(): + node_map[k] = next(iter(v)) + updated_counterparts = {} + newid_cx_edges_d = cg.get_cross_chunk_edges(new_ids, time_stamp=parent_ts) for new_id in new_ids: cx_edges_d = newid_cx_edges_d[new_id] temp_map = { old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) } + node_map.update(temp_map) result = _update_neighbor_cross_edges_single( - cg, new_id, cx_edges_d, temp_map, parent_ts=parent_ts + cg, new_id, cx_edges_d, node_map, parent_ts=parent_ts ) updated_counterparts.update(result) @@ -287,6 +295,7 @@ def add_edges( cg, new_l2_ids, new_old_id_d, + old_new_id_d, time_stamp=time_stamp, parent_ts=parent_ts, ) @@ -303,6 +312,9 @@ def add_edges( ) new_roots = create_parents.run() + for new_root in new_roots: + l2c = get_l2children(cg, new_root) + assert 
len(l2c) == np.unique(l2c).size, f"inconsistent result op {operation_id}" create_parents.create_new_entries() return new_roots, new_l2_ids, updated_entries + create_parents.new_entries @@ -321,13 +333,13 @@ def _process_l2_agglomeration( chunk_edges = chunk_edges[~in2d(chunk_edges, removed_edges)] cross_edges = agg.cross_edges.get_pairs() + # we must avoid the cache to read roots to get segment state before edit began parents = cg.get_parents(cross_edges[:, 0], time_stamp=parent_ts, raw_only=True) err = f"got cross edges from more than one l2 node; op {operation_id}" assert np.unique(parents).size == 1, err root = cg.get_root(parents[0], time_stamp=parent_ts, raw_only=True) # inactive edges must be filtered out - # we must avoid the cache to read roots to get segment state before edit began neighbor_roots = cg.get_roots( cross_edges[:, 1], raw_only=True, time_stamp=parent_ts ) @@ -419,6 +431,7 @@ def remove_edges( cg, new_l2_ids, new_old_id_d, + old_new_id_d, time_stamp=time_stamp, parent_ts=parent_ts, ) @@ -434,6 +447,9 @@ def remove_edges( parent_ts=parent_ts, ) new_roots = create_parents.run() + for new_root in new_roots: + l2c = get_l2children(cg, new_root) + assert len(l2c) == np.unique(l2c).size, f"inconsistent result op {operation_id}" create_parents.create_new_entries() return new_roots, new_l2_ids, updated_entries + create_parents.new_entries @@ -481,6 +497,7 @@ def _update_id_lineage( layer: int, parent_layer: int, ): + # update newly created children; mask others mask = np.in1d(children, self._new_ids_d[layer]) for child_id in children[mask]: child_old_ids = self._new_old_id_d[child_id] @@ -533,7 +550,7 @@ def _update_cross_edge_cache(self, parent, children): cx_edges_d = self.cg.get_cross_chunk_edges( children, time_stamp=self._last_successful_ts ) - cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values(), unique=True) + cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values()) parent_layer = self.cg.get_chunk_layer(parent) edge_nodes = 
np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) @@ -569,19 +586,9 @@ def _create_new_parents(self, layer: int): layer_node_ids = self._get_layer_node_ids(new_ids, layer) components, graph_ids = self._get_connected_components(layer_node_ids, layer) new_parent_ids = [] - all_old_ids = [] - for v in self._new_old_id_d.values(): - all_old_ids.extend(v) - all_old_ids = np.array(all_old_ids, dtype=basetypes.NODE_ID) - for cc_indices in components: parent_layer = layer + 1 # must be reset for each connected component cc_ids = graph_ids[cc_indices] - mask = np.isin(cc_ids, all_old_ids) - old_ids = cc_ids[mask] - new_ids = _get_flipped_ids(self._old_new_id_d, cc_ids[mask]) - err = f"got old ids {old_ids} -> {new_ids}; op {self._operation_id}" - assert np.all(~mask), err if len(cc_ids) == 1: # skip connection parent_layer = self.cg.meta.layer_count @@ -613,6 +620,7 @@ def _create_new_parents(self, layer: int): self.cg, new_parent_ids, self._new_old_id_d, + self._old_new_id_d, time_stamp=self._time_stamp, parent_ts=self._last_successful_ts, ) From 9adb8a7c299d94ad4e89ad939a14b832b47b7a24 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 12 Sep 2023 20:12:06 +0000 Subject: [PATCH 068/116] fix: circular import --- pychunkedgraph/debug/utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/pychunkedgraph/debug/utils.py b/pychunkedgraph/debug/utils.py index e194f4ee1..53152ec6f 100644 --- a/pychunkedgraph/debug/utils.py +++ b/pychunkedgraph/debug/utils.py @@ -2,9 +2,6 @@ import numpy as np -from ..graph import ChunkedGraph -from ..graph.utils.basetypes import NODE_ID - def print_attrs(d): for k, v in d.items(): @@ -18,12 +15,7 @@ def print_attrs(d): print(v) -def print_node( - cg: ChunkedGraph, - node: NODE_ID, - indent: int = 0, - stop_layer: int = 2, -) -> None: +def print_node(cg, node: np.uint64, indent: int = 0, stop_layer: int = 2) -> None: children = cg.get_children(node) print(f"{' ' * 
indent}{node}[{len(children)}]") if cg.get_chunk_layer(node) <= stop_layer: @@ -32,8 +24,8 @@ def print_node( print_node(cg, child, indent=indent + 4, stop_layer=stop_layer) -def get_l2children(cg: ChunkedGraph, node: NODE_ID) -> np.ndarray: - nodes = np.array([node], dtype=NODE_ID) +def get_l2children(cg, node: np.uint64) -> np.ndarray: + nodes = np.array([node], dtype=np.uint64) layers = cg.get_chunk_layers(nodes) assert np.all(layers > 2), "nodes must be at layers > 2" l2children = [] From e9977e1db8f9c203bbb9dbbafaa1f58746606aa4 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 12 Sep 2023 20:28:02 +0000 Subject: [PATCH 069/116] fix: consider layer 2 as well --- pychunkedgraph/debug/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/debug/utils.py b/pychunkedgraph/debug/utils.py index 53152ec6f..43562afd2 100644 --- a/pychunkedgraph/debug/utils.py +++ b/pychunkedgraph/debug/utils.py @@ -27,7 +27,7 @@ def print_node(cg, node: np.uint64, indent: int = 0, stop_layer: int = 2) -> Non def get_l2children(cg, node: np.uint64) -> np.ndarray: nodes = np.array([node], dtype=np.uint64) layers = cg.get_chunk_layers(nodes) - assert np.all(layers > 2), "nodes must be at layers > 2" + assert np.all(layers >= 2), "nodes must be at layers >= 2" l2children = [] while nodes.size: children = cg.get_children(nodes, flatten=True) From 99d7b07096549c0fa3c5c1364b379440389857e4 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 13 Sep 2023 16:48:49 +0000 Subject: [PATCH 070/116] fix(edits): incorrect order of opeartions; documentation --- pychunkedgraph/graph/edits.py | 210 +++++++++++++++++----------------- 1 file changed, 108 insertions(+), 102 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index da574db14..b9a07493a 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -5,6 +5,7 @@ from typing import List from typing import Tuple from typing import Iterable 
+from typing import Set from collections import defaultdict import fastremap @@ -25,15 +26,13 @@ def _init_old_hierarchy(cg, l2ids: np.ndarray, parent_ts: datetime.datetime = None): - new_old_id_d = defaultdict(set) - old_new_id_d = defaultdict(set) old_hierarchy_d = {id_: {2: id_} for id_ in l2ids} for id_ in l2ids: layer_parent_d = cg.get_all_parents_dict(id_, time_stamp=parent_ts) old_hierarchy_d[id_].update(layer_parent_d) for parent in layer_parent_d.values(): old_hierarchy_d[parent] = old_hierarchy_d[id_] - return new_old_id_d, old_new_id_d, old_hierarchy_d + return old_hierarchy_d def _analyze_affected_edges( @@ -179,64 +178,6 @@ def check_fake_edges( return atomic_edges, rows -def _update_neighbor_cross_edges_single( - cg, new_id: int, cx_edges_d: dict, node_map: dict, *, parent_ts -) -> dict: - node_layer = cg.get_chunk_layer(new_id) - counterparts = [] - for layer in range(node_layer, cg.meta.layer_count): - layer_edges = cx_edges_d.get(layer, types.empty_2d) - counterparts.extend(layer_edges[:, 1]) - - cp_cx_edges_d = cg.get_cross_chunk_edges( - counterparts, time_stamp=parent_ts, raw_only=True - ) - updated_counterparts = {} - for counterpart, edges_d in cp_cx_edges_d.items(): - val_dict = {} - for layer in range(2, cg.meta.layer_count): - edges = edges_d.get(layer, types.empty_2d) - if edges.size == 0: - continue - assert np.all(edges[:, 0] == counterpart) - edges = fastremap.remap(edges, node_map, preserve_missing_labels=True) - edges_d[layer] = edges - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges - if not val_dict: - continue - cg.cache.cross_chunk_edges_cache[counterpart] = edges_d - updated_counterparts[counterpart] = val_dict - return updated_counterparts - - -def _update_neighbor_cross_edges( - cg, new_ids: List[int], new_old_id_d: dict, old_new_id_d, *, time_stamp, parent_ts -) -> List: - node_map = {} - for k, v in old_new_id_d.items(): - node_map[k] = next(iter(v)) - - updated_counterparts = {} - newid_cx_edges_d = 
cg.get_cross_chunk_edges(new_ids, time_stamp=parent_ts) - for new_id in new_ids: - cx_edges_d = newid_cx_edges_d[new_id] - temp_map = { - old_id: new_id for old_id in _get_flipped_ids(new_old_id_d, [new_id]) - } - node_map.update(temp_map) - result = _update_neighbor_cross_edges_single( - cg, new_id, cx_edges_d, node_map, parent_ts=parent_ts - ) - updated_counterparts.update(result) - - updated_entries = [] - for node, val_dict in updated_counterparts.items(): - rowkey = serialize_uint64(node) - row = cg.client.mutate_row(rowkey, val_dict, time_stamp=time_stamp) - updated_entries.append(row) - return updated_entries - - def add_edges( cg, *, @@ -253,9 +194,10 @@ def add_edges( if not allow_same_segment_merge: roots = cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts) assert np.unique(roots).size == 2, "L2 IDs must belong to different roots." - new_old_id_d, old_new_id_d, old_hierarchy_d = _init_old_hierarchy( - cg, l2ids, parent_ts=parent_ts - ) + + new_old_id_d = defaultdict(set) + old_new_id_d = defaultdict(set) + old_hierarchy_d = _init_old_hierarchy(cg, l2ids, parent_ts=parent_ts) atomic_children_d = cg.get_children(l2ids) cross_edges_d = merge_cross_edge_dicts( cg.get_cross_chunk_edges(l2ids, time_stamp=parent_ts), l2_cross_edges_d @@ -291,14 +233,6 @@ def add_edges( new_cx_edges_d[layer] = edges assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d - updated_entries = _update_neighbor_cross_edges( - cg, - new_l2_ids, - new_old_id_d, - old_new_id_d, - time_stamp=time_stamp, - parent_ts=parent_ts, - ) create_parents = CreateParentNodes( cg, @@ -316,7 +250,7 @@ def add_edges( l2c = get_l2children(cg, new_root) assert len(l2c) == np.unique(l2c).size, f"inconsistent result op {operation_id}" create_parents.create_new_entries() - return new_roots, new_l2_ids, updated_entries + create_parents.new_entries + return new_roots, new_l2_ids, create_parents.new_entries def _process_l2_agglomeration( @@ -388,9 +322,9 @@ def 
remove_edges( roots = cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts) assert np.unique(roots).size == 1, "L2 IDs must belong to same root." - new_old_id_d, old_new_id_d, old_hierarchy_d = _init_old_hierarchy( - cg, l2ids, parent_ts=parent_ts - ) + new_old_id_d = defaultdict(set) + old_new_id_d = defaultdict(set) + old_hierarchy_d = _init_old_hierarchy(cg, l2ids, parent_ts=parent_ts) chunk_id_map = dict(zip(l2ids.tolist(), cg.get_chunk_ids_from_node_ids(l2ids))) removed_edges = np.concatenate([atomic_edges, atomic_edges[:, ::-1]], axis=0) @@ -427,14 +361,6 @@ def remove_edges( new_cx_edges_d[layer] = edges assert np.all(edges[:, 0] == new_id) cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d - updated_entries = _update_neighbor_cross_edges( - cg, - new_l2_ids, - new_old_id_d, - old_new_id_d, - time_stamp=time_stamp, - parent_ts=parent_ts, - ) create_parents = CreateParentNodes( cg, @@ -451,7 +377,7 @@ def remove_edges( l2c = get_l2children(cg, new_root) assert len(l2c) == np.unique(l2c).size, f"inconsistent result op {operation_id}" create_parents.create_new_entries() - return new_roots, new_l2_ids, updated_entries + create_parents.new_entries + return new_roots, new_l2_ids, create_parents.new_entries def _get_flipped_ids(id_map, node_ids): @@ -466,6 +392,82 @@ def _get_flipped_ids(id_map, node_ids): return np.concatenate(ids) +def _update_neighbor_cross_edges_single( + cg, new_id: int, cx_edges_d: dict, node_map: dict, *, parent_ts +) -> dict: + """ + For each new_id, get counterparts and update its cross chunk edges. + Some of them maybe updated multiple times so we need to collect them first + and then write to storage to consolidate the mutations. + Returns updated counterparts. 
+ """ + node_layer = cg.get_chunk_layer(new_id) + counterparts = [] + for layer in range(node_layer, cg.meta.layer_count): + layer_edges = cx_edges_d.get(layer, types.empty_2d) + counterparts.extend(layer_edges[:, 1]) + + cp_cx_edges_d = cg.get_cross_chunk_edges( + counterparts, time_stamp=parent_ts, raw_only=True + ) + updated_counterparts = {} + for counterpart, edges_d in cp_cx_edges_d.items(): + val_dict = {} + for layer in range(2, cg.meta.layer_count): + edges = edges_d.get(layer, types.empty_2d) + if edges.size == 0: + continue + assert np.all(edges[:, 0] == counterpart) + edges = fastremap.remap(edges, node_map, preserve_missing_labels=True) + edges_d[layer] = edges + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + if not val_dict: + continue + cg.cache.cross_chunk_edges_cache[counterpart] = edges_d + updated_counterparts[counterpart] = val_dict + return updated_counterparts + + +def _update_neighbor_cross_edges( + cg, + new_ids: List[int], + new_old_id: dict, + old_new_id, + *, + time_stamp, + parent_ts, +) -> List: + """ + For each new_id, get counterparts and update its cross chunk edges. + Some of them maybe updated multiple times so we need to collect them first + and then write to storage to consolidate the mutations. + Returns mutations to updated counterparts/partner nodes. 
+ """ + updated_counterparts = {} + newid_cx_edges_d = cg.get_cross_chunk_edges(new_ids, time_stamp=parent_ts) + + node_map = {} + for k, v in old_new_id.items(): + if len(v) == 1: + node_map[k] = next(iter(v)) + + for new_id in new_ids: + cx_edges_d = newid_cx_edges_d[new_id] + m = {old_id: new_id for old_id in _get_flipped_ids(new_old_id, [new_id])} + node_map.update(m) + result = _update_neighbor_cross_edges_single( + cg, new_id, cx_edges_d, node_map, parent_ts=parent_ts + ) + updated_counterparts.update(result) + + updated_entries = [] + for node, val_dict in updated_counterparts.items(): + rowkey = serialize_uint64(node) + row = cg.client.mutate_row(rowkey, val_dict, time_stamp=time_stamp) + updated_entries.append(row) + return updated_entries + + class CreateParentNodes: def __init__( self, @@ -474,8 +476,8 @@ def __init__( new_l2_ids: Iterable, operation_id: basetypes.OPERATION_ID, time_stamp: datetime.datetime, - new_old_id_d: Dict[np.uint64, Iterable[np.uint64]] = None, - old_new_id_d: Dict[np.uint64, Iterable[np.uint64]] = None, + new_old_id_d: Dict[np.uint64, Set[np.uint64]] = None, + old_new_id_d: Dict[np.uint64, Set[np.uint64]] = None, old_hierarchy_d: Dict[np.uint64, Dict[int, np.uint64]] = None, parent_ts: datetime.datetime = None, ): @@ -547,12 +549,15 @@ def _update_cross_edge_cache(self, parent, children): updates cross chunk edges in cache; this can only be done after all new components at a layer have IDs """ + parent_layer = self.cg.get_chunk_layer(parent) + if parent_layer == 2: + # l2 cross edges have already been updated by this point + return cx_edges_d = self.cg.get_cross_chunk_edges( children, time_stamp=self._last_successful_ts ) cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values()) - parent_layer = self.cg.get_chunk_layer(parent) edge_nodes = np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) edge_parents = self.cg.get_roots( edge_nodes, @@ -603,28 +608,15 @@ def _create_new_parents(self, layer: int): 
self.cg.get_parent_chunk_id(cc_ids[0], parent_layer), root_chunk=parent_layer == self.cg.meta.layer_count, ) + new_parent_ids.append(parent_id) self._new_ids_d[parent_layer].append(parent_id) self._update_id_lineage(parent_id, cc_ids, layer, parent_layer) - new_parent_ids.append(parent_id) - self.cg.cache.children_cache[parent_id] = cc_ids cache_utils.update( self.cg.cache.parents_cache, cc_ids, parent_id, ) - for new_id in new_parent_ids: - children = self.cg.get_children(new_id) - self._update_cross_edge_cache(new_id, children) - entries = _update_neighbor_cross_edges( - self.cg, - new_parent_ids, - self._new_old_id_d, - self._old_new_id_d, - time_stamp=self._time_stamp, - parent_ts=self._last_successful_ts, - ) - self.new_entries.extend(entries) def run(self) -> Iterable: """ @@ -640,6 +632,20 @@ def run(self) -> Iterable: self.cg.graph_id, self._operation_id, ): + # all new IDs in this layer have been created + # update their cross chunk edges and their neighbors' + for new_id in self._new_ids_d[layer]: + children = self.cg.get_children(new_id) + self._update_cross_edge_cache(new_id, children) + entries = _update_neighbor_cross_edges( + self.cg, + self._new_ids_d[layer], + self._new_old_id_d, + self._old_new_id_d, + time_stamp=self._time_stamp, + parent_ts=self._last_successful_ts, + ) + self.new_entries.extend(entries) self._create_new_parents(layer) return self._new_ids_d[self.cg.meta.layer_count] From 9dd30e5b2c56801622486f9bb8d39407abc3deb4 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 15 Sep 2023 14:51:57 +0000 Subject: [PATCH 071/116] feat(ingest): add tests command --- pychunkedgraph/debug/cross_edge_test.py | 60 -------- pychunkedgraph/debug/existence_test.py | 78 ----------- pychunkedgraph/debug/family_test.py | 54 ------- pychunkedgraph/ingest/cli.py | 9 ++ pychunkedgraph/ingest/simple_tests.py | 178 ++++++++++++++++++++++++ 5 files changed, 187 insertions(+), 192 deletions(-) delete mode 100644 pychunkedgraph/debug/cross_edge_test.py 
delete mode 100644 pychunkedgraph/debug/existence_test.py delete mode 100644 pychunkedgraph/debug/family_test.py create mode 100644 pychunkedgraph/ingest/simple_tests.py diff --git a/pychunkedgraph/debug/cross_edge_test.py b/pychunkedgraph/debug/cross_edge_test.py deleted file mode 100644 index 25bacfa0b..000000000 --- a/pychunkedgraph/debug/cross_edge_test.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -from datetime import datetime -import numpy as np - -from pychunkedgraph.graph import chunkedgraph -from pychunkedgraph.graph import attributes - -#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/svenmd/.cloudvolume/secrets/google-secret.json" - -layer = 2 -n_chunks = 1000 -n_segments_per_chunk = 200 -# timestamp = datetime.datetime.fromtimestamp(1588875769) -timestamp = datetime.utcnow() - -cg = chunkedgraph.ChunkedGraph(graph_id="pinky_nf_v2") - -np.random.seed(42) - -node_ids = [] -for _ in range(n_chunks): - c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) - c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) - c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) - - chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) - - max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) - - if max_segment_id < 10: - continue - - segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) - - for segment_id in segment_ids: - node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) - -rows = cg.client.read_nodes(node_ids=node_ids, end_time=timestamp, - properties=attributes.Hierarchy.Parent) -valid_node_ids = [] -non_valid_node_ids = [] -for k in rows.keys(): - if len(rows[k]) > 0: - valid_node_ids.append(k) - else: - non_valid_node_ids.append(k) - -cc_edges = cg.get_atomic_cross_edges(valid_node_ids) -cc_ids = np.unique(np.concatenate([np.concatenate(list(v.values())) for v in list(cc_edges.values()) if len(v.values())])) - -roots = cg.get_roots(cc_ids) -root_dict = 
dict(zip(cc_ids, roots)) -root_dict_vec = np.vectorize(root_dict.get) - -for k in cc_edges: - if len(cc_edges[k]) == 0: - continue - local_ids = np.unique(np.concatenate(list(cc_edges[k].values()))) - - assert len(np.unique(root_dict_vec(local_ids))) \ No newline at end of file diff --git a/pychunkedgraph/debug/existence_test.py b/pychunkedgraph/debug/existence_test.py deleted file mode 100644 index 757d3d542..000000000 --- a/pychunkedgraph/debug/existence_test.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -from datetime import datetime -import numpy as np - -from pychunkedgraph.graph import chunkedgraph -from pychunkedgraph.graph import attributes - -#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/svenmd/.cloudvolume/secrets/google-secret.json" - -layer = 2 -n_chunks = 100 -n_segments_per_chunk = 200 -# timestamp = datetime.datetime.fromtimestamp(1588875769) -timestamp = datetime.utcnow() - -cg = chunkedgraph.ChunkedGraph(graph_id="pinky_nf_v2") - -np.random.seed(42) - -node_ids = [] -for _ in range(n_chunks): - c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) - c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) - c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) - - chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) - - max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) - - if max_segment_id < 10: - continue - - segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) - - for segment_id in segment_ids: - node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) - -rows = cg.client.read_nodes(node_ids=node_ids, end_time=timestamp, - properties=attributes.Hierarchy.Parent) -valid_node_ids = [] -non_valid_node_ids = [] -for k in rows.keys(): - if len(rows[k]) > 0: - valid_node_ids.append(k) - else: - non_valid_node_ids.append(k) - -roots = cg.get_roots(valid_node_ids, time_stamp=timestamp) - -roots = [] -try: - roots = cg.get_roots(valid_node_ids) - 
assert len(roots) == len(valid_node_ids) - print(f"ALL {len(roots)} have been successful!") -except: - print("At least one node failed. Checking nodes one by one now") - -if len(roots) != len(valid_node_ids): - log_dict = {} - success_dict = {} - for node_id in valid_node_ids: - try: - root = cg.get_root(node_id, time_stamp=timestamp) - print(f"Success: {node_id} from chunk {cg.get_chunk_id(node_id)}") - success_dict[node_id] = True - except Exception as e: - print(f"{node_id} from chunk {cg.get_chunk_id(node_id)} failed with {e}") - success_dict[node_id] = False - - t_id = node_id - - while t_id is not None: - last_working_chunk = cg.get_chunk_id(t_id) - t_id = cg.get_parent(t_id) - - print(f"Failed on layer {cg.get_chunk_layer(last_working_chunk)} in chunk {last_working_chunk}") - log_dict[node_id] = last_working_chunk - diff --git a/pychunkedgraph/debug/family_test.py b/pychunkedgraph/debug/family_test.py deleted file mode 100644 index 198351e74..000000000 --- a/pychunkedgraph/debug/family_test.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from datetime import datetime -import numpy as np - -from pychunkedgraph.graph import chunkedgraph -from pychunkedgraph.graph import attributes - -# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/svenmd/.cloudvolume/secrets/google-secret.json" - -layers = [2, 3, 4, 5, 6, 7] -n_chunks = 10 -n_segments_per_chunk = 200 -# timestamp = datetime.datetime.fromtimestamp(1588875769) -timestamp = datetime.utcnow() - -cg = chunkedgraph.ChunkedGraph(graph_id="pinky_nf_v2") - -np.random.seed(42) - -node_ids = [] - -for layer in layers: - for _ in range(n_chunks): - c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) - c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) - c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) - - chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) - - max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) - - if max_segment_id < 10: - 
continue - - segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) - - for segment_id in segment_ids: - node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) - -rows = cg.client.read_nodes(node_ids=node_ids, end_time=timestamp, - properties=attributes.Hierarchy.Parent) -valid_node_ids = [] -non_valid_node_ids = [] -for k in rows.keys(): - if len(rows[k]) > 0: - valid_node_ids.append(k) - else: - non_valid_node_ids.append(k) - -parents = cg.get_parents(valid_node_ids, time_stamp=timestamp) -children_dict = cg.get_children(parents) - -for child, parent in zip(valid_node_ids, parents): - assert child in children_dict[parent] \ No newline at end of file diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 89106a097..67182fc81 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -21,6 +21,7 @@ from .manager import IngestionManager from .utils import bootstrap from .utils import chunk_id_str +from .simple_tests import run_all from .create.abstract_layers import add_layer from ..graph.chunkedgraph import ChunkedGraph from ..utils.redis import get_redis_connection @@ -196,3 +197,11 @@ def ingest_chunk_local(graph_id: str, chunk_info, n_threads: int): else: cg = ChunkedGraph(graph_id=graph_id) add_layer(cg, chunk_info[0], chunk_info[1:], n_threads=n_threads) + cg = ChunkedGraph(graph_id=graph_id) + add_layer(cg, chunk_info[0], chunk_info[1:], n_threads=n_threads) + + +@ingest_cli.command("run_tests") +@click.argument("graph_id", type=str) +def run_tests(graph_id): + run_all(ChunkedGraph(graph_id=graph_id)) diff --git a/pychunkedgraph/ingest/simple_tests.py b/pychunkedgraph/ingest/simple_tests.py new file mode 100644 index 000000000..33946bcec --- /dev/null +++ b/pychunkedgraph/ingest/simple_tests.py @@ -0,0 +1,178 @@ +# pylint: disable=invalid-name, missing-function-docstring, broad-exception-caught + +""" +Some sanity tests to ensure chunkedgraph was created properly. 
+""" + +from datetime import datetime +import numpy as np + +from pychunkedgraph.graph import ChunkedGraph +from pychunkedgraph.graph import attributes + + +def family(cg: ChunkedGraph): + np.random.seed(42) + n_chunks = 100 + n_segments_per_chunk = 200 + timestamp = datetime.utcnow() + + node_ids = [] + for layer in range(2, cg.meta.layer_count - 1): + for _ in range(n_chunks): + c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) + c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) + c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) + chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) + max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) + if max_segment_id < 10: + continue + + segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) + for segment_id in segment_ids: + node_ids.append( + cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id)) + ) + + rows = cg.client.read_nodes( + node_ids=node_ids, end_time=timestamp, properties=attributes.Hierarchy.Parent + ) + valid_node_ids = [] + non_valid_node_ids = [] + for k in rows.keys(): + if len(rows[k]) > 0: + valid_node_ids.append(k) + else: + non_valid_node_ids.append(k) + + parents = cg.get_parents(valid_node_ids, time_stamp=timestamp) + children_dict = cg.get_children(parents) + for child, parent in zip(valid_node_ids, parents): + assert child in children_dict[parent] + print("success") + + +def existence(cg: ChunkedGraph): + np.random.seed(42) + layer = 2 + n_chunks = 100 + n_segments_per_chunk = 200 + timestamp = datetime.utcnow() + node_ids = [] + for _ in range(n_chunks): + c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) + c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) + c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) + chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) + max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) + if 
max_segment_id < 10: + continue + + segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) + for segment_id in segment_ids: + node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) + + rows = cg.client.read_nodes( + node_ids=node_ids, end_time=timestamp, properties=attributes.Hierarchy.Parent + ) + valid_node_ids = [] + non_valid_node_ids = [] + for k in rows.keys(): + if len(rows[k]) > 0: + valid_node_ids.append(k) + else: + non_valid_node_ids.append(k) + + roots = [] + try: + roots = cg.get_roots(valid_node_ids) + assert len(roots) == len(valid_node_ids) + print("success") + except Exception as e: + print(f"Something went wrong: {e}") + print("At least one node failed. Checking nodes one by one:") + + if len(roots) != len(valid_node_ids): + log_dict = {} + success_dict = {} + for node_id in valid_node_ids: + try: + _ = cg.get_root(node_id, time_stamp=timestamp) + print(f"Success: {node_id} from chunk {cg.get_chunk_id(node_id)}") + success_dict[node_id] = True + except Exception as e: + print(f"{node_id} - chunk {cg.get_chunk_id(node_id)} failed: {e}") + success_dict[node_id] = False + t_id = node_id + while t_id is not None: + last_working_chunk = cg.get_chunk_id(t_id) + t_id = cg.get_parent(t_id) + + layer = cg.get_chunk_layer(last_working_chunk) + print(f"Failed on layer {layer} in chunk {last_working_chunk}") + log_dict[node_id] = last_working_chunk + + +def cross_edges(cg: ChunkedGraph): + np.random.seed(42) + layer = 2 + n_chunks = 10 + n_segments_per_chunk = 200 + timestamp = datetime.utcnow() + node_ids = [] + for _ in range(n_chunks): + c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) + c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) + c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) + chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) + max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) + if max_segment_id < 10: + continue + + 
segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) + for segment_id in segment_ids: + node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) + + rows = cg.client.read_nodes( + node_ids=node_ids, end_time=timestamp, properties=attributes.Hierarchy.Parent + ) + valid_node_ids = [] + non_valid_node_ids = [] + for k in rows.keys(): + if len(rows[k]) > 0: + valid_node_ids.append(k) + else: + non_valid_node_ids.append(k) + + cc_edges = cg.get_atomic_cross_edges(valid_node_ids) + cc_ids = np.unique( + np.concatenate( + [ + np.concatenate(list(v.values())) + for v in list(cc_edges.values()) + if len(v.values()) + ] + ) + ) + + roots = cg.get_roots(cc_ids) + root_dict = dict(zip(cc_ids, roots)) + root_dict_vec = np.vectorize(root_dict.get) + + for k in cc_edges: + if len(cc_edges[k]) == 0: + continue + local_ids = np.unique(np.concatenate(list(cc_edges[k].values()))) + assert len(np.unique(root_dict_vec(local_ids))) + print("success") + + +def run_all(cg: ChunkedGraph): + print("Running family tests:") + family(cg) + + print("\nRunning existence tests:") + existence(cg) + + print("\nRunning cross_edges tests:") + cross_edges(cg) From 21d5f32a7d6aa4d9ef8c474d2efb2d3f8c9a23ab Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 26 Sep 2023 15:46:54 +0000 Subject: [PATCH 072/116] fix(edits): make sure to add reverse edges --- pychunkedgraph/graph/edits.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index b9a07493a..f4b6fc0ce 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -403,9 +403,12 @@ def _update_neighbor_cross_edges_single( """ node_layer = cg.get_chunk_layer(new_id) counterparts = [] + counterpart_layers = {} for layer in range(node_layer, cg.meta.layer_count): layer_edges = cx_edges_d.get(layer, types.empty_2d) counterparts.extend(layer_edges[:, 1]) + layers_d = dict(zip(layer_edges[:, 1], 
[layer] * len(layer_edges[:, 1]))) + counterpart_layers.update(layers_d) cp_cx_edges_d = cg.get_cross_chunk_edges( counterparts, time_stamp=parent_ts, raw_only=True @@ -413,12 +416,18 @@ def _update_neighbor_cross_edges_single( updated_counterparts = {} for counterpart, edges_d in cp_cx_edges_d.items(): val_dict = {} + counterpart_layer = counterpart_layers[counterpart] for layer in range(2, cg.meta.layer_count): edges = edges_d.get(layer, types.empty_2d) if edges.size == 0: continue assert np.all(edges[:, 0] == counterpart) edges = fastremap.remap(edges, node_map, preserve_missing_labels=True) + if layer == counterpart_layer: + reverse_edge = np.array([counterpart, new_id], dtype=basetypes.NODE_ID) + edges = np.concatenate([edges, [reverse_edge]]) + edges = np.unique(edges, axis=0) + edges_d[layer] = edges val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges if not val_dict: @@ -445,7 +454,6 @@ def _update_neighbor_cross_edges( """ updated_counterparts = {} newid_cx_edges_d = cg.get_cross_chunk_edges(new_ids, time_stamp=parent_ts) - node_map = {} for k, v in old_new_id.items(): if len(v) == 1: @@ -459,7 +467,6 @@ def _update_neighbor_cross_edges( cg, new_id, cx_edges_d, node_map, parent_ts=parent_ts ) updated_counterparts.update(result) - updated_entries = [] for node, val_dict in updated_counterparts.items(): rowkey = serialize_uint64(node) @@ -557,7 +564,6 @@ def _update_cross_edge_cache(self, parent, children): children, time_stamp=self._last_successful_ts ) cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values()) - edge_nodes = np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) edge_parents = self.cg.get_roots( edge_nodes, @@ -590,7 +596,6 @@ def _create_new_parents(self, layer: int): new_ids = self._new_ids_d[layer] layer_node_ids = self._get_layer_node_ids(new_ids, layer) components, graph_ids = self._get_connected_components(layer_node_ids, layer) - new_parent_ids = [] for cc_indices in components: parent_layer = layer + 1 # 
must be reset for each connected component cc_ids = graph_ids[cc_indices] @@ -608,7 +613,6 @@ def _create_new_parents(self, layer: int): self.cg.get_parent_chunk_id(cc_ids[0], parent_layer), root_chunk=parent_layer == self.cg.meta.layer_count, ) - new_parent_ids.append(parent_id) self._new_ids_d[parent_layer].append(parent_id) self._update_id_lineage(parent_id, cc_ids, layer, parent_layer) self.cg.cache.children_cache[parent_id] = cc_ids From dd64aa4f9524fcb73c6459ccf544499a43e1bd02 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 26 Sep 2023 19:47:24 +0000 Subject: [PATCH 073/116] fix(edits): read neighbor cx edges from cache --- pychunkedgraph/graph/edits.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index f4b6fc0ce..36188a03e 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -410,9 +410,7 @@ def _update_neighbor_cross_edges_single( layers_d = dict(zip(layer_edges[:, 1], [layer] * len(layer_edges[:, 1]))) counterpart_layers.update(layers_d) - cp_cx_edges_d = cg.get_cross_chunk_edges( - counterparts, time_stamp=parent_ts, raw_only=True - ) + cp_cx_edges_d = cg.get_cross_chunk_edges(counterparts, time_stamp=parent_ts) updated_counterparts = {} for counterpart, edges_d in cp_cx_edges_d.items(): val_dict = {} From 8cd2aa78588718e452790ebdd914019f992baa91 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 27 Sep 2023 16:17:45 +0000 Subject: [PATCH 074/116] fix(edits): check for no cx edges; comments --- pychunkedgraph/graph/edits.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 36188a03e..c348b4fcc 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -269,8 +269,11 @@ def _process_l2_agglomeration( cross_edges = agg.cross_edges.get_pairs() # we must avoid the cache to read roots to get segment state before edit 
began parents = cg.get_parents(cross_edges[:, 0], time_stamp=parent_ts, raw_only=True) + + # if there are cross edges, there must be a single parent. + # if there aren't any, there must be no parents. XOR these 2 conditions. err = f"got cross edges from more than one l2 node; op {operation_id}" - assert np.unique(parents).size == 1, err + assert (np.unique(parents).size == 1) != (cross_edges.size == 0), err root = cg.get_root(parents[0], time_stamp=parent_ts, raw_only=True) # inactive edges must be filtered out From 17d0d462d3e19c42e48a243e707500f0de32d865 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 3 Oct 2023 21:13:11 +0000 Subject: [PATCH 075/116] fix(edits): update neighbor cx edges in a skipped layer --- pychunkedgraph/graph/edits.py | 54 ++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index c348b4fcc..9f96db786 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -623,6 +623,45 @@ def _create_new_parents(self, layer: int): parent_id, ) + def _update_skipped_neighbors(self, current_layer): + """ + Update neighbor nodes in a skipped layer to reflect changes in their descendants. + Get neighbors of new ids at `current_layer - 1`. + Get their parents and update their cx edges. 
+ """ + neighbors = [] + lower_new_ids = self._new_ids_d[current_layer - 1] + newid_cx_edges_d = self.cg.get_cross_chunk_edges( + lower_new_ids, time_stamp=self._last_successful_ts + ) + for cx_edges_d in newid_cx_edges_d.values(): + for edges in cx_edges_d.values(): + neighbors.extend(edges[:, 1]) + + neighbor_parents = self.cg.get_parents( + neighbors, time_stamp=self._last_successful_ts + ) + parents_layers = self.cg.get_chunk_layers(neighbor_parents) + neighbor_parents = neighbor_parents[parents_layers == current_layer] + + updated_entries = [] + children_d = self.cg.get_children(neighbor_parents) + for parent, children in children_d.items(): + self._update_cross_edge_cache(parent, children) + edges_d = self.cg.cache.cross_chunk_edges_cache[parent] + val_dict = {} + for layer in range(2, self.cg.meta.layer_count): + edges = edges_d.get(layer, types.empty_2d) + if edges.size == 0: + continue + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + rowkey = serialize_uint64(parent) + row = self.cg.client.mutate_row( + rowkey, val_dict, time_stamp=self._time_stamp + ) + updated_entries.append(row) + return updated_entries + def run(self) -> Iterable: """ After new level 2 IDs are created, create parents in higher layers. 
@@ -631,14 +670,15 @@ def run(self) -> Iterable: self._new_ids_d[2] = self._new_l2_ids for layer in range(2, self.cg.meta.layer_count): if len(self._new_ids_d[layer]) == 0: + # if there are no new ids in a layer due to a skipped connection + # ensure updates to cx edges of parents of neighbors from previous layer + entries = self._update_skipped_neighbors(layer) + self.new_entries.extend(entries) continue - with TimeIt( - f"create_new_parents_layer.{layer}", - self.cg.graph_id, - self._operation_id, - ): - # all new IDs in this layer have been created - # update their cross chunk edges and their neighbors' + # all new IDs in this layer have been created + # update their cross chunk edges and their neighbors' + m = f"create_new_parents_layer.{layer}" + with TimeIt(m, self.cg.graph_id, self._operation_id): for new_id in self._new_ids_d[layer]: children = self.cg.get_children(new_id) self._update_cross_edge_cache(new_id, children) From c828449ba6c570526a00f95fb3482c2378e7aa97 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 11 Oct 2023 21:03:11 +0000 Subject: [PATCH 076/116] fix(edits): make sure to update all skipped neighbors --- pychunkedgraph/graph/edits.py | 96 ++++++++++++++++------------------- 1 file changed, 43 insertions(+), 53 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 9f96db786..2edfd3137 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -428,7 +428,6 @@ def _update_neighbor_cross_edges_single( reverse_edge = np.array([counterpart, new_id], dtype=basetypes.NODE_ID) edges = np.concatenate([edges, [reverse_edge]]) edges = np.unique(edges, axis=0) - edges_d[layer] = edges val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges if not val_dict: @@ -584,6 +583,39 @@ def _update_cross_edge_cache(self, parent, children): assert np.all(edges[:, 0] == parent) self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d + def _update_neighbor_parents(self, 
neighbor, ceil_layer, updated) -> list: + updated_parents = [] + while True: + parent = self.cg.get_parent(neighbor, time_stamp=self._last_successful_ts) + parent_layer = self.cg.get_chunk_layer(parent) + if parent_layer >= ceil_layer or parent in updated: + break + children = self.cg.get_children(parent) + self._update_cross_edge_cache(parent, children) + updated_parents.append(parent) + neighbor = parent + return updated_parents + + def _update_skipped_neighbors(self, node, layer, parent_layer): + updated_parents = set() + cx_edges_d = self.cg.cache.cross_chunk_edges_cache[node] + for l in range(layer, parent_layer + 1): + layer_edges = cx_edges_d.get(l, types.empty_2d) + neighbors = layer_edges[:, 1] + for n in neighbors: + res = self._update_neighbor_parents(n, parent_layer, updated_parents) + updated_parents.update(res) + + updated_entries = [] + for parent in updated_parents: + val_dict = {} + for layer, edges in self.cg.cache.cross_chunk_edges_cache[parent].items(): + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + rkey = serialize_uint64(parent) + row = self.cg.client.mutate_row(rkey, val_dict, time_stamp=self._time_stamp) + updated_entries.append(row) + return updated_entries + def _create_new_parents(self, layer: int): """ keep track of old IDs @@ -598,6 +630,7 @@ def _create_new_parents(self, layer: int): layer_node_ids = self._get_layer_node_ids(new_ids, layer) components, graph_ids = self._get_connected_components(layer_node_ids, layer) for cc_indices in components: + update_skipped_neighbors = False parent_layer = layer + 1 # must be reset for each connected component cc_ids = graph_ids[cc_indices] if len(cc_ids) == 1: @@ -610,57 +643,18 @@ def _create_new_parents(self, layer: int): if len(cx_edges_d[cc_ids[0]].get(l, types.empty_2d)) > 0: parent_layer = l break - parent_id = self.cg.id_client.create_node_id( + update_skipped_neighbors = cc_ids[0] in self._new_old_id_d + parent = self.cg.id_client.create_node_id( 
self.cg.get_parent_chunk_id(cc_ids[0], parent_layer), root_chunk=parent_layer == self.cg.meta.layer_count, ) - self._new_ids_d[parent_layer].append(parent_id) - self._update_id_lineage(parent_id, cc_ids, layer, parent_layer) - self.cg.cache.children_cache[parent_id] = cc_ids - cache_utils.update( - self.cg.cache.parents_cache, - cc_ids, - parent_id, - ) - - def _update_skipped_neighbors(self, current_layer): - """ - Update neighbor nodes in a skipped layer to reflect changes in their descendants. - Get neighbors of new ids at `current_layer - 1`. - Get their parents and update their cx edges. - """ - neighbors = [] - lower_new_ids = self._new_ids_d[current_layer - 1] - newid_cx_edges_d = self.cg.get_cross_chunk_edges( - lower_new_ids, time_stamp=self._last_successful_ts - ) - for cx_edges_d in newid_cx_edges_d.values(): - for edges in cx_edges_d.values(): - neighbors.extend(edges[:, 1]) - - neighbor_parents = self.cg.get_parents( - neighbors, time_stamp=self._last_successful_ts - ) - parents_layers = self.cg.get_chunk_layers(neighbor_parents) - neighbor_parents = neighbor_parents[parents_layers == current_layer] - - updated_entries = [] - children_d = self.cg.get_children(neighbor_parents) - for parent, children in children_d.items(): - self._update_cross_edge_cache(parent, children) - edges_d = self.cg.cache.cross_chunk_edges_cache[parent] - val_dict = {} - for layer in range(2, self.cg.meta.layer_count): - edges = edges_d.get(layer, types.empty_2d) - if edges.size == 0: - continue - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges - rowkey = serialize_uint64(parent) - row = self.cg.client.mutate_row( - rowkey, val_dict, time_stamp=self._time_stamp - ) - updated_entries.append(row) - return updated_entries + self._new_ids_d[parent_layer].append(parent) + self._update_id_lineage(parent, cc_ids, layer, parent_layer) + self.cg.cache.children_cache[parent] = cc_ids + cache_utils.update(self.cg.cache.parents_cache, cc_ids, parent) + if 
update_skipped_neighbors: + res = self._update_skipped_neighbors(cc_ids[0], layer, parent_layer) + self.new_entries.extend(res) def run(self) -> Iterable: """ @@ -670,10 +664,6 @@ def run(self) -> Iterable: self._new_ids_d[2] = self._new_l2_ids for layer in range(2, self.cg.meta.layer_count): if len(self._new_ids_d[layer]) == 0: - # if there are no new ids in a layer due to a skipped connection - # ensure updates to cx edges of parents of neighbors from previous layer - entries = self._update_skipped_neighbors(layer) - self.new_entries.extend(entries) continue # all new IDs in this layer have been created # update their cross chunk edges and their neighbors' From d6e0e6352b19f0fb689c0817b6aaeb1a27a3d046 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 11 Oct 2023 23:02:01 +0000 Subject: [PATCH 077/116] fix(edits): ignore new ids in neighbor update --- pychunkedgraph/graph/edits.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 2edfd3137..d2523715b 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -603,6 +603,9 @@ def _update_skipped_neighbors(self, node, layer, parent_layer): layer_edges = cx_edges_d.get(l, types.empty_2d) neighbors = layer_edges[:, 1] for n in neighbors: + if n in self._new_old_id_d: + # ignore new ids + continue res = self._update_neighbor_parents(n, parent_layer, updated_parents) updated_parents.update(res) From fce92a6105e3ccf371cd43340cf4dfce45aa3529 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 12 Oct 2023 17:17:44 +0000 Subject: [PATCH 078/116] add docs --- pychunkedgraph/graph/edits.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index d2523715b..839db48b9 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -494,7 +494,7 @@ def __init__( self._old_hierarchy_d = 
old_hierarchy_d self._new_old_id_d = new_old_id_d self._old_new_id_d = old_new_id_d - self._new_ids_d = defaultdict(list) # new IDs in each layer + self._new_ids_d = defaultdict(list) self._operation_id = operation_id self._time_stamp = time_stamp self._last_successful_ts = parent_ts @@ -572,7 +572,6 @@ def _update_cross_edge_cache(self, parent, children): time_stamp=self._last_successful_ts, ) edge_parents_d = dict(zip(edge_nodes, edge_parents)) - new_cx_edges_d = {} for layer in range(parent_layer, self.cg.meta.layer_count): edges = cx_edges_d.get(layer, types.empty_2d) @@ -583,8 +582,9 @@ def _update_cross_edge_cache(self, parent, children): assert np.all(edges[:, 0] == parent) self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d - def _update_neighbor_parents(self, neighbor, ceil_layer, updated) -> list: - updated_parents = [] + def _update_neighbor_parents(self, neighbor, ceil_layer: int, updated: set) -> list: + """helper for `_update_skipped_neighbors`""" + parents = [] while True: parent = self.cg.get_parent(neighbor, time_stamp=self._last_successful_ts) parent_layer = self.cg.get_chunk_layer(parent) @@ -592,15 +592,22 @@ def _update_neighbor_parents(self, neighbor, ceil_layer, updated) -> list: break children = self.cg.get_children(parent) self._update_cross_edge_cache(parent, children) - updated_parents.append(parent) + parents.append(parent) neighbor = parent - return updated_parents + return parents def _update_skipped_neighbors(self, node, layer, parent_layer): + """ + Updates cross edges of neighbors of a skip connection node. + Neighbors of such nodes can have parents at contiguous layers. + + This method updates cross edges of all such parents + from `layer` through `parent_layer`. 
+ """ updated_parents = set() cx_edges_d = self.cg.cache.cross_chunk_edges_cache[node] - for l in range(layer, parent_layer + 1): - layer_edges = cx_edges_d.get(l, types.empty_2d) + for _layer in range(layer, parent_layer + 1): + layer_edges = cx_edges_d.get(_layer, types.empty_2d) neighbors = layer_edges[:, 1] for n in neighbors: if n in self._new_old_id_d: @@ -608,12 +615,11 @@ def _update_skipped_neighbors(self, node, layer, parent_layer): continue res = self._update_neighbor_parents(n, parent_layer, updated_parents) updated_parents.update(res) - updated_entries = [] for parent in updated_parents: val_dict = {} - for layer, edges in self.cg.cache.cross_chunk_edges_cache[parent].items(): - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + for _layer, edges in self.cg.cache.cross_chunk_edges_cache[parent].items(): + val_dict[attributes.Connectivity.CrossChunkEdge[_layer]] = edges rkey = serialize_uint64(parent) row = self.cg.client.mutate_row(rkey, val_dict, time_stamp=self._time_stamp) updated_entries.append(row) From 0f7399da01aa0714bdd9bb6c76ea7a0a516264f5 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 14 Jan 2024 16:53:06 +0000 Subject: [PATCH 079/116] fix: resolve column filter ambiguity --- pychunkedgraph/graph/chunkedgraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 8c3e14166..7b3c5d8f4 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -303,7 +303,7 @@ def get_atomic_cross_edges(self, l2_ids: typing.Iterable) -> typing.Dict: node_ids=l2_ids, properties=[ attributes.Connectivity.AtomicCrossChunkEdge[l] - for l in range(2, self.meta.layer_count) + for l in range(2, max(3, self.meta.layer_count)) ], ) result = {} From 7536d90db1e3ecfb5cb570a3324326841c8e0a72 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 14 Jan 2024 20:20:18 +0000 Subject: [PATCH 080/116] fix: resolve 
column filter ambiguity(2) --- pychunkedgraph/graph/chunkedgraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 7b3c5d8f4..7edc538df 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -336,7 +336,7 @@ def get_cross_chunk_edges( return result attrs = [ attributes.Connectivity.CrossChunkEdge[l] - for l in range(2, self.meta.layer_count) + for l in range(2, max(3, self.meta.layer_count)) ] node_edges_d_d = self.client.read_nodes( node_ids=node_ids, From c17c41b520d4fe8507c4bd04ed2905e6500510ab Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 12 May 2024 10:35:48 -0500 Subject: [PATCH 081/116] V3 migration (#484) * feat: convert edges to ocdbt * feat: worker function to convert edges to ocdbt * feat: ocdbt option, consolidate ingest cli * fix(ingest): move fn to utils * fix(ingest): move ocdbt setup to a fn * add tensorstore req, fix build kaniko cache * feat: copy fake_edges to column family 4 * feat: upgrade atomic chunks * fix: rename abstract module to parent * feat: upgrade higher layers, docs * feat: upgrade cli, move common fns to utils * add copy_fake_edges in upgrade fn * handle earliest_timestamp, add test flag to upgrade * fix: fake_edges serialize np.uint64 * add get_operation method, fix timestamp in repair, check for parent * check for l2 ids invalidated by edit retries * remove unnecessary parent assert * remove unused vars * ignore invalid ids, assert parent after earliest_ts * check for ids invalidated by retries in higher layers * parallelize update_cross_edges * overwrite graph version, create col family 4 * improve status print formatting * remove unused code, consolidate small common module * efficient check for chunks not done * check for empty chunks, use get_parents * efficient get_edit_ts call by batching all children * reduce earliest_ts calls * combine bigtable calls, use numpy unique
* add completion rate command * fix: ignore children without cross edges * add span option to rate calculation * reduce mem usage with global vars * optimize cross edge reading * use existing layer var * limit cx edge reading above given layer * fix: read for earliest_ts check only if true * filter cross edges fn with timestamps * remove git from dockerignore, print stats * shuffle for better distribution of ids * fix: use different var name for layer * increase bigtable read timeout * add message with assert * fix: make span option int * handle skipped connections * fix: read cross edges at layer >= node_layer * handle another case of skipped nodes * check for unique parent count * update graph_id in meta * uncomment line * make repair easier to use * add sanity check for edits * add sanity check for each layer * add layers flag for cx edges * use better names for functions and vars, update types, fix docs --- pychunkedgraph/app/__init__.py | 2 + pychunkedgraph/debug/utils.py | 23 ++ pychunkedgraph/graph/attributes.py | 6 + pychunkedgraph/graph/chunkedgraph.py | 78 ++++-- pychunkedgraph/graph/client/base.py | 2 +- .../graph/client/bigtable/client.py | 37 ++- pychunkedgraph/graph/edges/__init__.py | 94 ++++++- pychunkedgraph/graph/edits.py | 13 +- pychunkedgraph/ingest/__init__.py | 22 +- pychunkedgraph/ingest/cli.py | 131 ++++------ pychunkedgraph/ingest/cli_upgrade.py | 143 +++++++++++ pychunkedgraph/ingest/cluster.py | 243 +++++++++++++----- pychunkedgraph/ingest/common.py | 61 ----- pychunkedgraph/ingest/create/atomic_layer.py | 8 +- .../{abstract_layers.py => parent_layer.py} | 10 +- pychunkedgraph/ingest/ran_agglomeration.py | 8 +- pychunkedgraph/ingest/rq_cli.py | 28 +- pychunkedgraph/ingest/simple_tests.py | 3 +- pychunkedgraph/ingest/upgrade/__init__.py | 0 pychunkedgraph/ingest/upgrade/atomic_layer.py | 119 +++++++++ pychunkedgraph/ingest/upgrade/parent_layer.py | 170 ++++++++++++ pychunkedgraph/ingest/upgrade/utils.py | 13 + 
pychunkedgraph/ingest/utils.py | 135 +++++++++- pychunkedgraph/repair/edits.py | 6 +- pychunkedgraph/tests/helpers.py | 45 ++-- pychunkedgraph/tests/test_uncategorized.py | 84 +++--- pychunkedgraph/utils/general.py | 9 + requirements.in | 1 + requirements.txt | 6 + 29 files changed, 1100 insertions(+), 400 deletions(-) create mode 100644 pychunkedgraph/ingest/cli_upgrade.py delete mode 100644 pychunkedgraph/ingest/common.py rename pychunkedgraph/ingest/create/{abstract_layers.py => parent_layer.py} (98%) create mode 100644 pychunkedgraph/ingest/upgrade/__init__.py create mode 100644 pychunkedgraph/ingest/upgrade/atomic_layer.py create mode 100644 pychunkedgraph/ingest/upgrade/parent_layer.py create mode 100644 pychunkedgraph/ingest/upgrade/utils.py diff --git a/pychunkedgraph/app/__init__.py b/pychunkedgraph/app/__init__.py index 3e938628b..262849258 100644 --- a/pychunkedgraph/app/__init__.py +++ b/pychunkedgraph/app/__init__.py @@ -105,6 +105,8 @@ def configure_app(app): with app.app_context(): from ..ingest.rq_cli import init_rq_cmds from ..ingest.cli import init_ingest_cmds + from ..ingest.cli_upgrade import init_upgrade_cmds init_rq_cmds(app) init_ingest_cmds(app) + init_upgrade_cmds(app) diff --git a/pychunkedgraph/debug/utils.py b/pychunkedgraph/debug/utils.py index 43562afd2..b1bdbc2be 100644 --- a/pychunkedgraph/debug/utils.py +++ b/pychunkedgraph/debug/utils.py @@ -35,3 +35,26 @@ def get_l2children(cg, node: np.uint64) -> np.ndarray: l2children.append(children[layers == 2]) nodes = children[layers > 2] return np.concatenate(l2children) + + +def sanity_check(cg, new_roots, operation_id): + """ + Check for duplicates in hierarchy, useful for debugging. 
+ """ + print(f"{len(new_roots)} new ids from {operation_id}") + l2c_d = {} + for new_root in new_roots: + l2c_d[new_root] = get_l2children(cg, new_root) + success = True + for k, v in l2c_d.items(): + success = success and (len(v) == np.unique(v).size) + print(f"{k}: {np.unique(v).size}, {len(v)}") + if not success: + raise RuntimeError("Some ids are not valid.") + + +def sanity_check_single(cg, node, operation_id): + v = get_l2children(cg, node) + msg = f"invalid node {node}:" + msg += f" found {len(v)} l2 ids, must be {np.unique(v).size}" + assert np.unique(v).size == len(v), f"{msg}, from {operation_id}." diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index 33f675dc8..b431a159b 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -120,6 +120,12 @@ class Connectivity: ), ) + FakeEdgesCF3 = _Attribute( + key=b"fake_edges", + family_id="3", + serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), + ) + FakeEdges = _Attribute( key=b"fake_edges", family_id="4", diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 7edc538df..7d1a24cc3 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -19,6 +19,7 @@ from .meta import ChunkedGraphMeta from .utils import basetypes from .utils import id_helpers +from .utils import serializers from .utils import generic as misc_utils from .edges import Edges from .edges import utils as edge_utils @@ -76,7 +77,7 @@ def version(self) -> str: return self.client.read_graph_version() @property - def client(self) -> base.SimpleClient: + def client(self) -> BigTableClient: return self._client @property @@ -287,9 +288,11 @@ def _get_children_multiple( node_ids=node_ids, properties=attributes.Hierarchy.Child ) return { - x: node_children_d[x][0].value - if x in node_children_d - else types.empty_1d.copy() + x: ( + node_children_d[x][0].value + if x in 
node_children_d + else types.empty_1d.copy() + ) for x in node_ids } return self.cache.children_multiple(node_ids) @@ -322,6 +325,7 @@ def get_cross_chunk_edges( node_ids: typing.Iterable, *, raw_only=False, + all_layers=True, time_stamp: typing.Optional[datetime.datetime] = None, ) -> typing.Dict: """ @@ -334,21 +338,24 @@ def get_cross_chunk_edges( node_ids = np.array(node_ids, dtype=basetypes.NODE_ID) if node_ids.size == 0: return result - attrs = [ - attributes.Connectivity.CrossChunkEdge[l] - for l in range(2, max(3, self.meta.layer_count)) - ] + layers = range(2, max(3, self.meta.layer_count)) + attrs = [attributes.Connectivity.CrossChunkEdge[l] for l in layers] node_edges_d_d = self.client.read_nodes( node_ids=node_ids, properties=attrs, end_time=time_stamp, end_time_inclusive=True, ) - for id_ in node_ids: + layers = self.get_chunk_layers(node_ids) + valid_layer = lambda x, y: x >= y + if not all_layers: + valid_layer = lambda x, y: x == y + for layer, id_ in zip(layers, node_ids): try: result[id_] = { prop.index: val[0].value.copy() for prop, val in node_edges_d_d[id_].items() + if valid_layer(prop.index, layer) } except KeyError: result[id_] = {} @@ -631,9 +638,24 @@ def get_fake_edges( edges = np.concatenate( [np.array(e.value, dtype=basetypes.NODE_ID, copy=False) for e in val] ) - result[id_] = Edges(edges[:, 0], edges[:, 1], fake_edges=True) + result[id_] = Edges(edges[:, 0], edges[:, 1]) return result + def copy_fake_edges(self, chunk_id: np.uint64) -> None: + _edges = self.client.read_node( + node_id=chunk_id, + properties=attributes.Connectivity.FakeEdgesCF3, + end_time_inclusive=True, + fake_edges=True, + ) + mutations = [] + _id = serializers.serialize_uint64(chunk_id, fake_edges=True) + for e in _edges: + val_dict = {attributes.Connectivity.FakeEdges: e.value} + row = self.client.mutate_row(_id, val_dict, time_stamp=e.timestamp) + mutations.append(row) + self.client.write(mutations) + def get_l2_agglomerations( self, level2_ids: np.ndarray, 
edges_only: bool = False ) -> typing.Tuple[typing.Dict[int, types.Agglomeration], typing.Tuple[Edges]]: @@ -690,13 +712,15 @@ def get_l2_agglomerations( ) return ( agglomeration_d, - (self.mock_edges,) - if self.mock_edges is not None - else (in_edges, out_edges, cross_edges), + ( + (self.mock_edges,) + if self.mock_edges is not None + else (in_edges, out_edges, cross_edges) + ), ) def get_node_timestamps( - self, node_ids: typing.Sequence[np.uint64], return_numpy=True + self, node_ids: typing.Sequence[np.uint64], return_numpy=True, normalize=False ) -> typing.Iterable: """ The timestamp of the children column can be assumed @@ -710,17 +734,22 @@ def get_node_timestamps( if return_numpy: return np.array([], dtype=np.datetime64) return [] + result = [] + earliest_ts = self.get_earliest_timestamp() + for n in node_ids: + ts = children[n][0].timestamp + if normalize: + ts = earliest_ts if ts < earliest_ts else ts + result.append(ts) if return_numpy: - return np.array( - [children[x][0].timestamp for x in node_ids], dtype=np.datetime64 - ) - return [children[x][0].timestamp for x in node_ids] + return np.array(result, dtype=np.datetime64) + return result # OPERATIONS def add_edges( self, user_id: str, - atomic_edges: typing.Sequence[np.uint64], + atomic_edges: typing.Sequence[typing.Sequence[np.uint64]], *, affinities: typing.Sequence[np.float32] = None, source_coords: typing.Sequence[int] = None, @@ -935,3 +964,14 @@ def get_earliest_timestamp(self): _, timestamp = self.client.read_log_entry(op_id) if timestamp is not None: return timestamp - timedelta(milliseconds=500) + + def get_operation_ids(self, node_ids: typing.Sequence): + response = self.client.read_nodes(node_ids=node_ids) + result = {} + for node in node_ids: + try: + operations = response[node][attributes.OperationLogs.OperationID] + result[node] = [(x.value, x.timestamp) for x in operations] + except KeyError: + ... 
+ return result diff --git a/pychunkedgraph/graph/client/base.py b/pychunkedgraph/graph/client/base.py index a66602a6a..953734670 100644 --- a/pychunkedgraph/graph/client/base.py +++ b/pychunkedgraph/graph/client/base.py @@ -13,7 +13,7 @@ def create_graph(self) -> None: """Initialize the graph and store associated meta.""" @abstractmethod - def add_graph_version(self, version): + def add_graph_version(self, version: str, overwrite: bool = False): """Add a version to the graph.""" @abstractmethod diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 6601b654e..52ec9a856 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ b/pychunkedgraph/graph/client/bigtable/client.py @@ -19,7 +19,7 @@ from google.cloud.bigtable.column_family import MaxVersionsGCRule from google.cloud.bigtable.table import Table from google.cloud.bigtable.row_set import RowSet -from google.cloud.bigtable.row_data import PartialRowData +from google.cloud.bigtable.row_data import DEFAULT_RETRY_READ_ROWS, PartialRowData from google.cloud.bigtable.row_filters import RowFilter from . import utils @@ -97,8 +97,9 @@ def create_graph(self, meta: ChunkedGraphMeta, version: str) -> None: self.add_graph_version(version) self.update_graph_meta(meta) - def add_graph_version(self, version: str): - assert self.read_graph_version() is None, "Graph has already been versioned." 
+ def add_graph_version(self, version: str, overwrite: bool = False): + if not overwrite: + assert self.read_graph_version() is None, self.read_graph_version() self._version = version row = self.mutate_row( attributes.GraphVersion.key, @@ -160,18 +161,25 @@ def read_nodes( # when all IDs in a block are within a range node_ids = np.sort(node_ids) rows = self._read_byte_rows( - start_key=serialize_uint64(start_id, fake_edges=fake_edges) - if start_id is not None - else None, - end_key=serialize_uint64(end_id, fake_edges=fake_edges) - if end_id is not None - else None, + start_key=( + serialize_uint64(start_id, fake_edges=fake_edges) + if start_id is not None + else None + ), + end_key=( + serialize_uint64(end_id, fake_edges=fake_edges) + if end_id is not None + else None + ), end_key_inclusive=end_id_inclusive, row_keys=( - serialize_uint64(node_id, fake_edges=fake_edges) for node_id in node_ids - ) - if node_ids is not None - else None, + ( + serialize_uint64(node_id, fake_edges=fake_edges) + for node_id in node_ids + ) + if node_ids is not None + else None + ), columns=properties, start_time=start_time, end_time=end_time, @@ -819,7 +827,8 @@ def _execute_read_thread(self, args: typing.Tuple[Table, RowSet, RowFilter]): # Check for everything falsy, because Bigtable considers even empty # lists of row_keys as no upper/lower bound! 
return {} - range_read = table.read_rows(row_set=row_set, filter_=row_filter) + retry = DEFAULT_RETRY_READ_ROWS.with_timeout(180) + range_read = table.read_rows(row_set=row_set, filter_=row_filter, retry=retry) res = {v.row_key: utils.partial_row_data_to_column_dict(v) for v in range_read} return res diff --git a/pychunkedgraph/graph/edges/__init__.py b/pychunkedgraph/graph/edges/__init__.py index b0e488d05..430ab9fa7 100644 --- a/pychunkedgraph/graph/edges/__init__.py +++ b/pychunkedgraph/graph/edges/__init__.py @@ -2,10 +2,14 @@ Classes and types for edges """ -from typing import Optional from collections import namedtuple +from os import environ +from typing import Optional import numpy as np +import tensorstore as ts +import zstandard as zstd +from graph_tool import Graph from ..utils import basetypes @@ -18,6 +22,14 @@ DEFAULT_AFFINITY = np.finfo(np.float32).tiny DEFAULT_AREA = np.finfo(np.float32).tiny +ADJACENCY_DTYPE = np.dtype( + [ + ("node", basetypes.NODE_ID), + ("aff", basetypes.EDGE_AFFINITY), + ("area", basetypes.EDGE_AREA), + ] +) +ZSTD_EDGE_COMPRESSION = 17 class Edges: @@ -28,17 +40,17 @@ def __init__( *, affinities: Optional[np.ndarray] = None, areas: Optional[np.ndarray] = None, - fake_edges=False, ): self.node_ids1 = np.array(node_ids1, dtype=basetypes.NODE_ID, copy=False) self.node_ids2 = np.array(node_ids2, dtype=basetypes.NODE_ID, copy=False) assert self.node_ids1.size == self.node_ids2.size self._as_pairs = None - self._fake_edges = fake_edges if affinities is not None and len(affinities) > 0: - self._affinities = np.array(affinities, dtype=basetypes.EDGE_AFFINITY, copy=False) + self._affinities = np.array( + affinities, dtype=basetypes.EDGE_AFFINITY, copy=False + ) assert self.node_ids1.size == self._affinities.size else: self._affinities = np.full(len(self.node_ids1), DEFAULT_AFFINITY) @@ -103,3 +115,77 @@ def get_pairs(self) -> np.ndarray: return self._as_pairs self._as_pairs = np.column_stack((self.node_ids1, self.node_ids2)) return 
self._as_pairs + + +def put_edges(destination: str, nodes: np.ndarray, edges: Edges) -> None: + graph_ids, _edges = np.unique(edges.get_pairs(), return_inverse=True) + graph_ids_reverse = {n: i for i, n in enumerate(graph_ids)} + _edges = _edges.reshape(-1, 2) + + graph = Graph(directed=False) + graph.add_edge_list(_edges) + e_aff = graph.new_edge_property("double", vals=edges.affinities) + e_area = graph.new_edge_property("int", vals=edges.areas) + cctx = zstd.ZstdCompressor(level=ZSTD_EDGE_COMPRESSION) + ocdbt_host = environ["OCDBT_COORDINATOR_HOST"] + ocdbt_port = environ["OCDBT_COORDINATOR_PORT"] + + spec = { + "driver": "ocdbt", + "base": destination, + "coordinator": {"address": f"{ocdbt_host}:{ocdbt_port}"}, + } + dataset = ts.KvStore.open(spec).result() + with ts.Transaction() as txn: + for _node in nodes: + node = graph_ids_reverse[_node] + neighbors = graph.get_all_neighbors(node) + adjacency_list = np.zeros(neighbors.size, dtype=ADJACENCY_DTYPE) + adjacency_list["node"] = graph_ids[neighbors] + adjacency_list["aff"] = [e_aff[(node, neighbor)] for neighbor in neighbors] + adjacency_list["area"] = [ + e_area[(node, neighbor)] for neighbor in neighbors + ] + dataset.with_transaction(txn)[str(graph_ids[node])] = cctx.compress( + adjacency_list.tobytes() + ) + + +def get_edges(source: str, nodes: np.ndarray) -> Edges: + spec = {"driver": "ocdbt", "base": source} + dataset = ts.KvStore.open(spec).result() + zdc = zstd.ZstdDecompressor() + + read_futures = [dataset.read(str(n)) for n in nodes] + read_results = [rf.result() for rf in read_futures] + compressed = [rr.value for rr in read_results] + + try: + n_threads = int(environ.get("ZSTD_THREADS", 1)) + except ValueError: + n_threads = 1 + + decompressed = [] + try: + decompressed = zdc.multi_decompress_to_buffer(compressed, threads=n_threads) + except ValueError: + for content in compressed: + decompressed.append(zdc.decompressobj().decompress(content)) + + node_ids1 = [np.empty(0, dtype=basetypes.NODE_ID)] + 
node_ids2 = [np.empty(0, dtype=basetypes.NODE_ID)] + affinities = [np.empty(0, dtype=basetypes.EDGE_AFFINITY)] + areas = [np.empty(0, dtype=basetypes.EDGE_AREA)] + for n, content in zip(nodes, compressed): + adjacency_list = np.frombuffer(content, dtype=ADJACENCY_DTYPE) + node_ids1.append([n] * adjacency_list.size) + node_ids2.append(adjacency_list["node"]) + affinities.append(adjacency_list["aff"]) + areas.append(adjacency_list["area"]) + + return Edges( + np.concatenate(node_ids1), + np.concatenate(node_ids2), + affinities=np.concatenate(affinities), + areas=np.concatenate(areas), + ) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 839db48b9..ee7e643c3 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -22,7 +22,7 @@ from .utils.serializers import serialize_uint64 from ..logging.log_db import TimeIt from ..utils.general import in2d -from ..debug.utils import get_l2children +from ..debug.utils import sanity_check, sanity_check_single def _init_old_hierarchy(cg, l2ids: np.ndarray, parent_ts: datetime.datetime = None): @@ -246,9 +246,7 @@ def add_edges( ) new_roots = create_parents.run() - for new_root in new_roots: - l2c = get_l2children(cg, new_root) - assert len(l2c) == np.unique(l2c).size, f"inconsistent result op {operation_id}" + sanity_check(cg, new_roots, operation_id) create_parents.create_new_entries() return new_roots, new_l2_ids, create_parents.new_entries @@ -376,9 +374,7 @@ def remove_edges( parent_ts=parent_ts, ) new_roots = create_parents.run() - for new_root in new_roots: - l2c = get_l2children(cg, new_root) - assert len(l2c) == np.unique(l2c).size, f"inconsistent result op {operation_id}" + sanity_check(cg, new_roots, operation_id) create_parents.create_new_entries() return new_roots, new_l2_ids, create_parents.new_entries @@ -579,7 +575,7 @@ def _update_cross_edge_cache(self, parent, children): continue edges = fastremap.remap(edges, edge_parents_d, preserve_missing_labels=True) 
new_cx_edges_d[layer] = np.unique(edges, axis=0) - assert np.all(edges[:, 0] == parent) + assert np.all(edges[:, 0] == parent), f"{parent}, {np.unique(edges[:, 0])}" self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d def _update_neighbor_parents(self, neighbor, ceil_layer: int, updated: set) -> list: @@ -661,6 +657,7 @@ def _create_new_parents(self, layer: int): self._update_id_lineage(parent, cc_ids, layer, parent_layer) self.cg.cache.children_cache[parent] = cc_ids cache_utils.update(self.cg.cache.parents_cache, cc_ids, parent) + sanity_check_single(self.cg, parent, self._operation_id) if update_skipped_neighbors: res = self._update_skipped_neighbors(cc_ids[0], layer, parent_layer) self.new_entries.extend(res) diff --git a/pychunkedgraph/ingest/__init__.py b/pychunkedgraph/ingest/__init__.py index b3d832d5e..55c10ca5f 100644 --- a/pychunkedgraph/ingest/__init__.py +++ b/pychunkedgraph/ingest/__init__.py @@ -1,32 +1,16 @@ +import logging from collections import namedtuple - -_cluster_ingest_config_fields = ( - "ATOMIC_Q_NAME", - "ATOMIC_Q_LIMIT", - "ATOMIC_Q_INTERVAL", -) -_cluster_ingest_defaults = ( - "l2", - 100000, - 120, -) -ClusterIngestConfig = namedtuple( - "ClusterIngestConfig", - _cluster_ingest_config_fields, - defaults=_cluster_ingest_defaults, -) - +logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO) _ingestconfig_fields = ( - "CLUSTER", # cluster config "AGGLOMERATION", "WATERSHED", "USE_RAW_EDGES", "USE_RAW_COMPONENTS", "TEST_RUN", ) -_ingestconfig_defaults = (None, None, None, False, False, False) +_ingestconfig_defaults = (None, None, False, False, False) IngestConfig = namedtuple( "IngestConfig", _ingestconfig_fields, defaults=_ingestconfig_defaults ) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 67182fc81..928e1852f 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -4,29 +4,25 @@ cli for running ingest """ -from os import environ -from time import 
sleep +import logging import click import yaml from flask.cli import AppGroup -from rq import Queue -from rq import Worker -from rq.worker import WorkerStatus - -from .cluster import create_atomic_chunk -from .cluster import create_parent_chunk -from .cluster import enqueue_atomic_tasks -from .cluster import randomize_grid_points + +from .cluster import create_atomic_chunk, create_parent_chunk, enqueue_l2_tasks from .manager import IngestionManager -from .utils import bootstrap -from .utils import chunk_id_str +from .utils import ( + bootstrap, + chunk_id_str, + print_completion_rate, + print_ingest_status, + queue_layer_helper, +) from .simple_tests import run_all -from .create.abstract_layers import add_layer +from .create.parent_layer import add_parent_chunk from ..graph.chunkedgraph import ChunkedGraph -from ..utils.redis import get_redis_connection -from ..utils.redis import keys as r_keys -from ..utils.general import chunked +from ..utils.redis import get_redis_connection, keys as r_keys ingest_cli = AppGroup("ingest") @@ -45,9 +41,9 @@ def flush_redis(): @ingest_cli.command("graph") @click.argument("graph_id", type=str) @click.argument("dataset", type=click.Path(exists=True)) -@click.option("--raw", is_flag=True) -@click.option("--test", is_flag=True) -@click.option("--retry", is_flag=True) +@click.option("--raw", is_flag=True, help="Read edges from agglomeration output.") +@click.option("--test", is_flag=True, help="Test 8 chunks at the center of dataset.") +@click.option("--retry", is_flag=True, help="Rerun without creating a new table.") def ingest_graph( graph_id: str, dataset: click.Path, raw: bool, test: bool, retry: bool ): @@ -58,16 +54,16 @@ def ingest_graph( with open(dataset, "r") as stream: config = yaml.safe_load(stream) - meta, ingest_config, client_info = bootstrap( - graph_id, - config=config, - raw=raw, - test_run=test, - ) + if test: + logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG) + + meta, ingest_config, 
client_info = bootstrap(graph_id, config, raw, test) cg = ChunkedGraph(meta=meta, client_info=client_info) if not retry: cg.create() - enqueue_atomic_tasks(IngestionManager(ingest_config, meta)) + + imanager = IngestionManager(ingest_config, meta) + enqueue_l2_tasks(imanager, create_atomic_chunk) @ingest_cli.command("imanager") @@ -100,22 +96,7 @@ def queue_layer(parent_layer): assert parent_layer > 2, "This command is for layers 3 and above." redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - - if parent_layer == imanager.cg_meta.layer_count: - chunk_coords = [(0, 0, 0)] - else: - bounds = imanager.cg_meta.layer_chunk_bounds[parent_layer] - chunk_coords = randomize_grid_points(*bounds) - - for coords in chunk_coords: - task_q = imanager.get_task_queue(f"l{parent_layer}") - task_q.enqueue( - create_parent_chunk, - job_id=chunk_id_str(parent_layer, coords), - job_timeout=f"{int(parent_layer * parent_layer)}m", - result_ttl=0, - args=(parent_layer, coords), - ) + queue_layer_helper(parent_layer, imanager, create_parent_chunk) @ingest_cli.command("status") @@ -123,39 +104,7 @@ def ingest_status(): """Print ingest status to console by layer.""" redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - layers = range(2, imanager.cg_meta.layer_count + 1) - layer_counts = imanager.cg_meta.layer_chunk_counts - - pipeline = redis.pipeline() - worker_busy = [] - for layer in layers: - pipeline.scard(f"{layer}c") - queue = Queue(f"l{layer}", connection=redis) - pipeline.llen(queue.key) - pipeline.zcard(queue.failed_job_registry.key) - workers = Worker.all(queue=queue) - worker_busy.append(sum([w.get_state() == WorkerStatus.BUSY for w in workers])) - - results = pipeline.execute() - completed = [] - queued = [] - failed = [] - for i in range(0, len(results), 3): - result = results[i : i + 3] - completed.append(result[0]) - queued.append(result[1]) - 
failed.append(result[2]) - - print(f"version: \t{imanager.cg.version}") - print(f"graph_id: \t{imanager.cg.graph_id}") - print(f"chunk_size: \t{imanager.cg.meta.graph_config.CHUNK_SIZE}") - print("\nlayer status:") - for layer, done, count in zip(layers, completed, layer_counts): - print(f"{layer}\t: {done} / {count}") - - print("\n\nqueue status:") - for layer, q, f, wb in zip(layers, queued, failed, worker_busy): - print(f"l{layer}\t: queued: {q}\t\t failed: {f}\t\t busy: {wb}") + print_ingest_status(imanager, redis) @ingest_cli.command("chunk") @@ -165,15 +114,14 @@ def ingest_chunk(queue: str, chunk_info): """Manually queue chunk when a job is stuck for whatever reason.""" redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - layer = chunk_info[0] - coords = chunk_info[1:] - queue = imanager.get_task_queue(queue) + layer, coords = chunk_info[0], chunk_info[1:] + + func = create_parent_chunk + args = (layer, coords) if layer == 2: func = create_atomic_chunk args = (coords,) - else: - func = create_parent_chunk - args = (layer, coords) + queue = imanager.get_task_queue(queue) queue.enqueue( func, job_id=chunk_id_str(layer, coords), @@ -189,16 +137,23 @@ def ingest_chunk(queue: str, chunk_info): @click.option("--n_threads", type=int, default=1) def ingest_chunk_local(graph_id: str, chunk_info, n_threads: int): """Manually ingest a chunk on a local machine.""" - from .create.abstract_layers import add_layer - from .cluster import _create_atomic_chunk - - if chunk_info[0] == 2: - _create_atomic_chunk(chunk_info[1:]) + layer, coords = chunk_info[0], chunk_info[1:] + if layer == 2: + create_atomic_chunk(coords) else: cg = ChunkedGraph(graph_id=graph_id) - add_layer(cg, chunk_info[0], chunk_info[1:], n_threads=n_threads) + add_parent_chunk(cg, layer, coords, n_threads=n_threads) cg = ChunkedGraph(graph_id=graph_id) - add_layer(cg, chunk_info[0], chunk_info[1:], n_threads=n_threads) + add_parent_chunk(cg, layer, 
coords, n_threads=n_threads) + + +@ingest_cli.command("rate") +@click.argument("layer", type=int) +@click.option("--span", default=10, help="Time span to calculate rate.") +def rate(layer: int, span: int): + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_completion_rate(imanager, layer, span=span) @ingest_cli.command("run_tests") diff --git a/pychunkedgraph/ingest/cli_upgrade.py b/pychunkedgraph/ingest/cli_upgrade.py new file mode 100644 index 000000000..c77c0be64 --- /dev/null +++ b/pychunkedgraph/ingest/cli_upgrade.py @@ -0,0 +1,143 @@ +# pylint: disable=invalid-name, missing-function-docstring, unspecified-encoding + +""" +cli for running upgrade +""" + +import logging +from time import sleep + +import click +import tensorstore as ts +from flask.cli import AppGroup +from pychunkedgraph import __version__ +from pychunkedgraph.graph.meta import GraphConfig + +from . import IngestConfig +from .cluster import ( + convert_to_ocdbt, + enqueue_l2_tasks, + upgrade_atomic_chunk, + upgrade_parent_chunk, +) +from .manager import IngestionManager +from .utils import ( + chunk_id_str, + print_completion_rate, + print_ingest_status, + queue_layer_helper, + start_ocdbt_server, +) +from ..graph.chunkedgraph import ChunkedGraph, ChunkedGraphMeta +from ..utils.redis import get_redis_connection +from ..utils.redis import keys as r_keys + +upgrade_cli = AppGroup("upgrade") + + +def init_upgrade_cmds(app): + app.cli.add_command(upgrade_cli) + + +@upgrade_cli.command("flush_redis") +def flush_redis(): + """Flush redis db.""" + redis = get_redis_connection() + redis.flushdb() + + +@upgrade_cli.command("graph") +@click.argument("graph_id", type=str) +@click.option("--test", is_flag=True, help="Test 8 chunks at the center of dataset.") +@click.option("--ocdbt", is_flag=True, help="Store edges using ts ocdbt kv store.") +def upgrade_graph(graph_id: str, test: bool, ocdbt: bool): + """ + Main upgrade command.
+ Opens the existing graph `graph_id` and queues atomic tasks. + """ + ingest_config = IngestConfig(TEST_RUN=test) + cg = ChunkedGraph(graph_id=graph_id) + cg.client.add_graph_version(__version__, overwrite=True) + + if graph_id != cg.graph_id: + gc = cg.meta.graph_config._asdict() + gc["ID"] = graph_id + new_meta = ChunkedGraphMeta( + GraphConfig(**gc), cg.meta.data_source, cg.meta.custom_data + ) + cg.update_meta(new_meta, overwrite=True) + cg = ChunkedGraph(graph_id=graph_id) + + try: + # create new column family for cross chunk edges + f = cg.client._table.column_family("4") + f.create() + except Exception: + ... + + imanager = IngestionManager(ingest_config, cg.meta) + server = ts.ocdbt.DistributedCoordinatorServer() + if ocdbt: + start_ocdbt_server(imanager, server) + + fn = convert_to_ocdbt if ocdbt else upgrade_atomic_chunk + enqueue_l2_tasks(imanager, fn) + + if ocdbt: + logging.info("All tasks queued. Keep this alive for ocdbt coordinator server.") + while True: + sleep(60) + + +@upgrade_cli.command("layer") +@click.argument("parent_layer", type=int) +def queue_layer(parent_layer): + """ + Queue all chunk tasks at a given layer. + Must be used when all the chunks at `parent_layer - 1` have completed. + """ + assert parent_layer > 2, "This command is for layers 3 and above."
+ redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + queue_layer_helper(parent_layer, imanager, upgrade_parent_chunk) + + +@upgrade_cli.command("status") +def ingest_status(): + """Print upgrade status to console.""" + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_ingest_status(imanager, redis, upgrade=True) + + +@upgrade_cli.command("chunk") +@click.argument("queue", type=str) +@click.argument("chunk_info", nargs=4, type=int) +def ingest_chunk(queue: str, chunk_info): + """Manually queue chunk when a job is stuck for whatever reason.""" + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + layer, coords = chunk_info[0], chunk_info[1:] + + func = upgrade_parent_chunk + args = (layer, coords) + if layer == 2: + func = upgrade_atomic_chunk + args = (coords,) + queue = imanager.get_task_queue(queue) + queue.enqueue( + func, + job_id=chunk_id_str(layer, coords), + job_timeout=f"{int(layer * layer)}m", + result_ttl=0, + args=args, + ) + + +@upgrade_cli.command("rate") +@click.argument("layer", type=int) +@click.option("--span", default=10, help="Time span to calculate rate.") +def rate(layer: int, span: int): + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_completion_rate(imanager, layer, span=span) diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index a5c6a9861..485251568 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -1,23 +1,37 @@ # pylint: disable=invalid-name, missing-function-docstring, import-outside-toplevel """ -Ingest / create chunkedgraph with workers. +Ingest / create chunkedgraph with workers on a cluster. 
""" -from typing import Sequence, Tuple +import logging +from os import environ +from time import sleep +from typing import Callable, Dict, Iterable, Tuple, Sequence import numpy as np +from rq import Queue as RQueue -from .utils import chunk_id_str + +from .utils import chunk_id_str, get_chunks_not_done, randomize_grid_points from .manager import IngestionManager -from .common import get_atomic_chunk_data -from .ran_agglomeration import get_active_edges -from .create.atomic_layer import add_atomic_edges -from .create.abstract_layers import add_layer -from ..graph.meta import ChunkedGraphMeta +from .ran_agglomeration import ( + get_active_edges, + read_raw_edge_data, + read_raw_agglomeration_data, +) +from .create.atomic_layer import add_atomic_chunk +from .create.parent_layer import add_parent_chunk +from .upgrade.atomic_layer import update_chunk as update_atomic_chunk +from .upgrade.parent_layer import update_chunk as update_parent_chunk +from ..graph.edges import EDGE_TYPES, Edges, put_edges +from ..graph import ChunkedGraph, ChunkedGraphMeta from ..graph.chunks.hierarchy import get_children_chunk_coords -from ..utils.redis import keys as r_keys -from ..utils.redis import get_redis_connection +from ..graph.utils.basetypes import NODE_ID +from ..io.edges import get_chunk_edges +from ..io.components import get_chunk_components +from ..utils.redis import keys as r_keys, get_redis_connection +from ..utils.general import chunked def _post_task_completion( @@ -36,7 +50,7 @@ def create_parent_chunk( ) -> None: redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - add_layer( + add_parent_chunk( imanager.cg, parent_layer, parent_coords, @@ -49,54 +63,61 @@ def create_parent_chunk( _post_task_completion(imanager, parent_layer, parent_coords) -def randomize_grid_points(X: int, Y: int, Z: int) -> Tuple[int, int, int]: - indices = np.arange(X * Y * Z) - np.random.shuffle(indices) - for index in indices: - yield 
np.unravel_index(index, (X, Y, Z)) +def upgrade_parent_chunk( + parent_layer: int, + parent_coords: Sequence[int], +) -> None: + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + update_parent_chunk(imanager.cg, parent_coords, layer=parent_layer) + _post_task_completion(imanager, parent_layer, parent_coords) -def enqueue_atomic_tasks(imanager: IngestionManager): - from os import environ - from time import sleep - from rq import Queue as RQueue +def _get_atomic_chunk_data( + imanager: IngestionManager, coord: Sequence[int] +) -> Tuple[Dict, Dict]: + """ + Helper to read either raw data or processed data + If reading from raw data, save it as processed data + """ + chunk_edges = ( + read_raw_edge_data(imanager, coord) + if imanager.config.USE_RAW_EDGES + else get_chunk_edges(imanager.cg_meta.data_source.EDGES, [coord]) + ) - chunk_coords = _get_test_chunks(imanager.cg.meta) - chunk_count = len(chunk_coords) - if not imanager.config.TEST_RUN: - atomic_chunk_bounds = imanager.cg_meta.layer_chunk_bounds[2] - chunk_coords = randomize_grid_points(*atomic_chunk_bounds) - chunk_count = imanager.cg_meta.layer_chunk_counts[0] - print(f"total chunk count: {chunk_count}, queuing...") + _check_edges_direction(chunk_edges, imanager.cg, coord) + + mapping = ( + read_raw_agglomeration_data(imanager, coord) + if imanager.config.USE_RAW_COMPONENTS + else get_chunk_components(imanager.cg_meta.data_source.COMPONENTS, coord) + ) + return chunk_edges, mapping - queue_name = f"{imanager.config.CLUSTER.ATOMIC_Q_NAME}" - q = imanager.get_task_queue(queue_name) - job_datas = [] - batch_size = int(environ.get("L2JOB_BATCH_SIZE", 1000)) - for chunk_coord in chunk_coords: - # buffer for optimal use of redis memory - if len(q) > imanager.config.CLUSTER.ATOMIC_Q_LIMIT: - print(f"Sleeping {imanager.config.CLUSTER.ATOMIC_Q_INTERVAL}s...") - sleep(imanager.config.CLUSTER.ATOMIC_Q_INTERVAL) - - x, y, z = chunk_coord - chunk_str = 
f"{x}_{y}_{z}" - if imanager.redis.sismember("2c", chunk_str): - # already done, skip - continue - job_datas.append( - RQueue.prepare_data( - create_atomic_chunk, - args=(chunk_coord,), - timeout=environ.get("L2JOB_TIMEOUT", "3m"), - result_ttl=0, - job_id=chunk_id_str(2, chunk_coord), - ) - ) - if len(job_datas) % batch_size == 0: - q.enqueue_many(job_datas) - job_datas = [] - q.enqueue_many(job_datas) + +def _check_edges_direction( + chunk_edges: dict, cg: ChunkedGraph, coord: Sequence[int] +) -> None: + """ + For between and cross chunk edges: + Checks and flips edges such that nodes1 are always within a chunk and nodes2 outside the chunk. + Where nodes1 = edges[:,0] and nodes2 = edges[:,1]. + """ + x, y, z = coord + chunk_id = cg.get_chunk_id(layer=1, x=x, y=y, z=z) + for edge_type in [EDGE_TYPES.between_chunk, EDGE_TYPES.cross_chunk]: + edges = chunk_edges[edge_type] + e1 = edges.node_ids1 + e2 = edges.node_ids2 + + e2_chunk_ids = cg.get_chunk_ids_from_node_ids(e2) + mask = e2_chunk_ids == chunk_id + e1[mask], e2[mask] = e2[mask], e1[mask] + + e1_chunk_ids = cg.get_chunk_ids_from_node_ids(e1) + mask = e1_chunk_ids == chunk_id + assert np.all(mask), "all IDs must belong to same chunk" def create_atomic_chunk(coords: Sequence[int]): @@ -105,22 +126,110 @@ def create_atomic_chunk(coords: Sequence[int]): imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) coords = np.array(list(coords), dtype=int) - chunk_edges_all, mapping = get_atomic_chunk_data(imanager, coords) + chunk_edges_all, mapping = _get_atomic_chunk_data(imanager, coords) chunk_edges_active, isolated_ids = get_active_edges(chunk_edges_all, mapping) - add_atomic_edges(imanager.cg, coords, chunk_edges_active, isolated=isolated_ids) - - if imanager.config.TEST_RUN: - # print for debugging - for k, v in chunk_edges_all.items(): - print(k, len(v)) - for k, v in chunk_edges_active.items(): - print(f"active_{k}", len(v)) + add_atomic_chunk(imanager.cg, coords, chunk_edges_active, 
isolated=isolated_ids) + + for k, v in chunk_edges_all.items(): + logging.debug(f"{k}: {len(v)}") + for k, v in chunk_edges_active.items(): + logging.debug(f"active_{k}: {len(v)}") + _post_task_completion(imanager, 2, coords) + + +def upgrade_atomic_chunk(coords: Sequence[int]): + """Upgrades single atomic chunk""" + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + coords = np.array(list(coords), dtype=int) + update_atomic_chunk(imanager.cg, coords, layer=2) + _post_task_completion(imanager, 2, coords) + + +def convert_to_ocdbt(coords: Sequence[int]): + """ + Convert edges stored per chunk to ajacency list in the tensorstore ocdbt kv store. + """ + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + coords = np.array(list(coords), dtype=int) + chunk_edges_all, mapping = _get_atomic_chunk_data(imanager, coords) + + node_ids1 = [] + node_ids2 = [] + affinities = [] + areas = [] + for edges in chunk_edges_all.values(): + node_ids1.extend(edges.node_ids1) + node_ids2.extend(edges.node_ids2) + affinities.extend(edges.affinities) + areas.extend(edges.areas) + + edges = Edges(node_ids1, node_ids2, affinities=affinities, areas=areas) + nodes = np.concatenate( + [edges.node_ids1, edges.node_ids2, np.fromiter(mapping.keys(), dtype=NODE_ID)] + ) + nodes = np.unique(nodes) + + chunk_id = imanager.cg.get_chunk_id(layer=1, x=coords[0], y=coords[1], z=coords[2]) + chunk_ids = imanager.cg.get_chunk_ids_from_node_ids(nodes) + + host = imanager.redis.get("OCDBT_COORDINATOR_HOST").decode() + port = imanager.redis.get("OCDBT_COORDINATOR_PORT").decode() + environ["OCDBT_COORDINATOR_HOST"] = host + environ["OCDBT_COORDINATOR_PORT"] = port + logging.info(f"OCDBT Coordinator address {host}:{port}") + + put_edges( + f"{imanager.cg.meta.data_source.EDGES}/ocdbt", + nodes[chunk_ids == chunk_id], + edges, + ) _post_task_completion(imanager, 2, coords) def 
_get_test_chunks(meta: ChunkedGraphMeta): - """Chunks at center of the dataset most likely not to be empty""" + """Chunks at the center most likely not to be empty""" parent_coords = np.array(meta.layer_chunk_bounds[3]) // 2 return get_children_chunk_coords(meta, 3, parent_coords) - # f = lambda r1, r2, r3: np.array(np.meshgrid(r1, r2, r3), dtype=int).T.reshape(-1, 3) - # return f((x, x + 1), (y, y + 1), (z, z + 1)) + + +def _queue_tasks(imanager: IngestionManager, chunk_fn: Callable, coords: Iterable): + queue_name = "l2" + q = imanager.get_task_queue(queue_name) + batch_size = int(environ.get("JOB_BATCH_SIZE", 100000)) + batches = chunked(coords, batch_size) + for batch in batches: + _coords = get_chunks_not_done(imanager, 2, batch) + # buffer for optimal use of redis memory + if len(q) > int(environ.get("QUEUE_SIZE", 100000)): + interval = int(environ.get("QUEUE_INTERVAL", 300)) + logging.info(f"Queue full; sleeping {interval}s...") + sleep(interval) + + job_datas = [] + for chunk_coord in _coords: + job_datas.append( + RQueue.prepare_data( + chunk_fn, + args=(chunk_coord,), + timeout=environ.get("L2JOB_TIMEOUT", "3m"), + result_ttl=0, + job_id=chunk_id_str(2, chunk_coord), + ) + ) + q.enqueue_many(job_datas) + + +def enqueue_l2_tasks(imanager: IngestionManager, chunk_fn: Callable): + """ + `chunk_fn`: function to process a given layer 2 chunk. 
+ """ + chunk_coords = _get_test_chunks(imanager.cg.meta) + chunk_count = len(chunk_coords) + if not imanager.config.TEST_RUN: + atomic_chunk_bounds = imanager.cg_meta.layer_chunk_bounds[2] + chunk_coords = randomize_grid_points(*atomic_chunk_bounds) + chunk_count = imanager.cg_meta.layer_chunk_counts[0] + logging.info(f"Chunk count: {chunk_count}, queuing...") + _queue_tasks(imanager, chunk_fn, chunk_coords) diff --git a/pychunkedgraph/ingest/common.py b/pychunkedgraph/ingest/common.py deleted file mode 100644 index dccf58602..000000000 --- a/pychunkedgraph/ingest/common.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Dict -from typing import Tuple -from typing import Sequence - -from .manager import IngestionManager -from .ran_agglomeration import read_raw_edge_data -from .ran_agglomeration import read_raw_agglomeration_data -from ..graph import ChunkedGraph -from ..io.edges import get_chunk_edges -from ..io.components import get_chunk_components - - -def get_atomic_chunk_data( - imanager: IngestionManager, coord: Sequence[int] -) -> Tuple[Dict, Dict]: - """ - Helper to read either raw data or processed data - If reading from raw data, save it as processed data - """ - chunk_edges = ( - read_raw_edge_data(imanager, coord) - if imanager.config.USE_RAW_EDGES - else get_chunk_edges(imanager.cg_meta.data_source.EDGES, [coord]) - ) - - _check_edges_direction(chunk_edges, imanager.cg, coord) - - mapping = ( - read_raw_agglomeration_data(imanager, coord) - if imanager.config.USE_RAW_COMPONENTS - else get_chunk_components(imanager.cg_meta.data_source.COMPONENTS, coord) - ) - return chunk_edges, mapping - - -def _check_edges_direction( - chunk_edges: dict, cg: ChunkedGraph, coord: Sequence[int] -) -> None: - """ - For between and cross chunk edges: - Checks and flips edges such that nodes1 are always within a chunk and nodes2 outside the chunk. - Where nodes1 = edges[:,0] and nodes2 = edges[:,1]. 
- """ - import numpy as np - from ..graph.edges import Edges - from ..graph.edges import EDGE_TYPES - - x, y, z = coord - chunk_id = cg.get_chunk_id(layer=1, x=x, y=y, z=z) - for edge_type in [EDGE_TYPES.between_chunk, EDGE_TYPES.cross_chunk]: - edges = chunk_edges[edge_type] - e1 = edges.node_ids1 - e2 = edges.node_ids2 - - e2_chunk_ids = cg.get_chunk_ids_from_node_ids(e2) - mask = e2_chunk_ids == chunk_id - e1[mask], e2[mask] = e2[mask], e1[mask] - - e1_chunk_ids = cg.get_chunk_ids_from_node_ids(e1) - mask = e1_chunk_ids == chunk_id - assert np.all(mask), "all IDs must belong to same chunk" diff --git a/pychunkedgraph/ingest/create/atomic_layer.py b/pychunkedgraph/ingest/create/atomic_layer.py index 054a82840..0a7aae728 100644 --- a/pychunkedgraph/ingest/create/atomic_layer.py +++ b/pychunkedgraph/ingest/create/atomic_layer.py @@ -23,9 +23,9 @@ from ...graph.utils.flatgraph import connected_components -def add_atomic_edges( +def add_atomic_chunk( cg: ChunkedGraph, - chunk_coord: np.ndarray, + coords: Sequence[int], chunk_edges_d: Dict[str, Edges], isolated: Sequence[int], time_stamp: Optional[datetime.datetime] = None, @@ -40,9 +40,7 @@ def add_atomic_edges( graph, _, _, unique_ids = build_gt_graph(chunk_edge_ids, make_directed=True) ccs = connected_components(graph) - parent_chunk_id = cg.get_chunk_id( - layer=2, x=chunk_coord[0], y=chunk_coord[1], z=chunk_coord[2] - ) + parent_chunk_id = cg.get_chunk_id(layer=2, x=coords[0], y=coords[1], z=coords[2]) parent_ids = cg.id_client.create_node_ids(parent_chunk_id, size=len(ccs)) sparse_indices, remapping = _get_remapping(chunk_edges_d) diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/parent_layer.py similarity index 98% rename from pychunkedgraph/ingest/create/abstract_layers.py rename to pychunkedgraph/ingest/create/parent_layer.py index adbe4a5ab..09be61407 100644 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ b/pychunkedgraph/ingest/create/parent_layer.py @@ 
-29,20 +29,20 @@ from .cross_edges import get_chunk_nodes_cross_edge_layer -def add_layer( +def add_parent_chunk( cg: ChunkedGraph, layer_id: int, - parent_coords: Sequence[int], + coords: Sequence[int], children_coords: Sequence[Sequence[int]] = np.array([]), *, time_stamp: Optional[datetime.datetime] = None, n_threads: int = 4, ) -> None: if not children_coords.size: - children_coords = get_children_chunk_coords(cg.meta, layer_id, parent_coords) + children_coords = get_children_chunk_coords(cg.meta, layer_id, coords) children_ids = _read_children_chunks(cg, layer_id, children_coords, n_threads > 1) cx_edges = get_children_chunk_cross_edges( - cg, layer_id, parent_coords, use_threads=n_threads > 1 + cg, layer_id, coords, use_threads=n_threads > 1 ) node_layers = cg.get_chunk_layers(children_ids) @@ -59,7 +59,7 @@ def add_layer( _write_connected_components( cg, layer_id, - parent_coords, + coords, connected_components, get_valid_timestamp(time_stamp), n_threads > 1, diff --git a/pychunkedgraph/ingest/ran_agglomeration.py b/pychunkedgraph/ingest/ran_agglomeration.py index 7c4af51f7..a0ca42d54 100644 --- a/pychunkedgraph/ingest/ran_agglomeration.py +++ b/pychunkedgraph/ingest/ran_agglomeration.py @@ -5,10 +5,7 @@ from collections import defaultdict from itertools import product -from typing import Dict -from typing import Iterable -from typing import Tuple -from typing import Union +from typing import Dict, Iterable, Tuple, Union from binascii import crc32 @@ -23,8 +20,7 @@ from ..io.edges import put_chunk_edges from ..io.components import put_chunk_components from ..graph.utils import basetypes -from ..graph.edges import Edges -from ..graph.edges import EDGE_TYPES +from ..graph.edges import EDGE_TYPES, Edges from ..graph.types import empty_2d from ..graph.chunks.utils import get_chunk_id diff --git a/pychunkedgraph/ingest/rq_cli.py b/pychunkedgraph/ingest/rq_cli.py index c9b21ae36..6a1a4882d 100644 --- a/pychunkedgraph/ingest/rq_cli.py +++ 
b/pychunkedgraph/ingest/rq_cli.py @@ -8,8 +8,6 @@ import click from redis import Redis from rq import Queue -from rq import Worker -from rq.worker import WorkerStatus from rq.job import Job from rq.exceptions import InvalidJobOperationError from rq.exceptions import NoSuchJobError @@ -27,23 +25,6 @@ connection = Redis(host=REDIS_HOST, port=REDIS_PORT, db=0, password=REDIS_PASSWORD) -@rq_cli.command("status") -@click.argument("queues", nargs=-1, type=str) -@click.option("--show-busy", is_flag=True) -def get_status(queues, show_busy): - print("NOTE: Use --show-busy to display count of non idle workers\n") - for queue in queues: - q = Queue(queue, connection=connection) - print(f"Queue name \t: {queue}") - print(f"Jobs queued \t: {len(q)}") - print(f"Workers total \t: {Worker.count(queue=q)}") - if show_busy: - workers = Worker.all(queue=q) - count = sum([worker.get_state() == WorkerStatus.BUSY for worker in workers]) - print(f"Workers busy \t: {count}") - print(f"Jobs failed \t: {q.failed_job_registry.count}\n") - - @rq_cli.command("failed") @click.argument("queue", type=str) @click.argument("job_ids", nargs=-1) @@ -129,9 +110,14 @@ def clean_start_registry(queue): def clear_failed_registry(queue): failed_job_registry = FailedJobRegistry(queue, connection=connection) job_ids = failed_job_registry.get_job_ids() + count = 0 for job_id in job_ids: - failed_job_registry.remove(job_id, delete_job=True) - print(f"Deleted {len(job_ids)} jobs from the failed job registry.") + try: + failed_job_registry.remove(job_id, delete_job=True) + count += 1 + except Exception: + ... 
+ print(f"Deleted {count} jobs from the failed job registry.") def init_rq_cmds(app): diff --git a/pychunkedgraph/ingest/simple_tests.py b/pychunkedgraph/ingest/simple_tests.py index 33946bcec..07a60f5f3 100644 --- a/pychunkedgraph/ingest/simple_tests.py +++ b/pychunkedgraph/ingest/simple_tests.py @@ -7,8 +7,7 @@ from datetime import datetime import numpy as np -from pychunkedgraph.graph import ChunkedGraph -from pychunkedgraph.graph import attributes +from pychunkedgraph.graph import attributes, ChunkedGraph def family(cg: ChunkedGraph): diff --git a/pychunkedgraph/ingest/upgrade/__init__.py b/pychunkedgraph/ingest/upgrade/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pychunkedgraph/ingest/upgrade/atomic_layer.py b/pychunkedgraph/ingest/upgrade/atomic_layer.py new file mode 100644 index 000000000..96f7f71bd --- /dev/null +++ b/pychunkedgraph/ingest/upgrade/atomic_layer.py @@ -0,0 +1,119 @@ +# pylint: disable=invalid-name, missing-docstring, c-extension-no-member +from datetime import timedelta + +import fastremap +import numpy as np +from pychunkedgraph.graph import ChunkedGraph +from pychunkedgraph.graph.attributes import Connectivity +from pychunkedgraph.graph.attributes import Hierarchy +from pychunkedgraph.graph.utils import serializers + +from .utils import exists_as_parent + + +def get_parent_timestamps(cg, supervoxels, start_time=None, end_time=None) -> set: + """ + Timestamps of when the given supervoxels were edited, in the given time range. 
+ """ + response = cg.client.read_nodes( + node_ids=supervoxels, + start_time=start_time, + end_time=end_time, + end_time_inclusive=False, + ) + result = set() + for v in response.values(): + for cell in v[Hierarchy.Parent]: + valid = cell.timestamp >= start_time or cell.timestamp < end_time + assert valid, f"{cell.timestamp}, {start_time}" + result.add(cell.timestamp) + return result + + +def get_edit_timestamps(cg: ChunkedGraph, edges_d, start_ts, end_ts) -> list: + """ + Timestamps of when post-side supervoxels were involved in an edit. + Post-side - supervoxels in the neighbor chunk. + This is required because we need to update edges from both sides. + """ + atomic_cx_edges = np.concatenate(list(edges_d.values())) + timestamps = get_parent_timestamps( + cg, atomic_cx_edges[:, 1], start_time=start_ts, end_time=end_ts + ) + timestamps.add(start_ts) + return sorted(timestamps) + + +def update_cross_edges(cg: ChunkedGraph, node, cx_edges_d, node_ts, end_ts) -> list: + """ + Helper function to update a single L2 ID. + Returns a list of mutations with given timestamps. 
+ """ + rows = [] + edges = np.concatenate(list(cx_edges_d.values())) + uparents = np.unique(cg.get_parents(edges[:, 0], time_stamp=node_ts)) + assert uparents.size <= 1, f"{node}, {node_ts}, {uparents}" + if uparents.size == 0 or node != uparents[0]: + # if node is not the parent at this ts, it must be invalid + assert not exists_as_parent(cg, node, edges[:, 0]) + return rows + + timestamps = [node_ts] + if node_ts != end_ts: + timestamps = get_edit_timestamps(cg, cx_edges_d, node_ts, end_ts) + for ts in timestamps: + val_dict = {} + svs = edges[:, 1] + parents = cg.get_parents(svs, time_stamp=ts) + edge_parents_d = dict(zip(svs, parents)) + for layer, layer_edges in cx_edges_d.items(): + layer_edges = fastremap.remap( + layer_edges, edge_parents_d, preserve_missing_labels=True + ) + layer_edges[:, 0] = node + layer_edges = np.unique(layer_edges, axis=0) + col = Connectivity.CrossChunkEdge[layer] + val_dict[col] = layer_edges + row_id = serializers.serialize_uint64(node) + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp=ts)) + return rows + + +def update_chunk(cg: ChunkedGraph, chunk_coords: list[int], layer: int = 2): + """ + Iterate over all L2 IDs in a chunk and update their cross chunk edges, + within the periods they were valid/active. 
+ """ + x, y, z = chunk_coords + chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) + cg.copy_fake_edges(chunk_id) + rr = cg.range_read_chunk(chunk_id) + nodes = list(rr.keys()) + + # get start_ts when node becomes valid + nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) + cx_edges_d = cg.get_atomic_cross_edges(nodes) + children_d = cg.get_children(nodes) + + rows = [] + for node, start_ts in zip(nodes, nodes_ts): + if cg.get_parent(node) is None: + # invalid id caused by failed ingest task + continue + node_cx_edges_d = cx_edges_d.get(node, {}) + if not node_cx_edges_d: + continue + + # get end_ts when node becomes invalid (bigtable resolution is in ms) + start = start_ts + timedelta(milliseconds=1) + _timestamps = get_parent_timestamps(cg, children_d[node], start_time=start) + try: + end_ts = sorted(_timestamps)[0] + except IndexError: + # start_ts == end_ts means there has been no edit involving this node + # meaning only one timestamp to update cross edges, start_ts + end_ts = start_ts + # for each timestamp until end_ts, update cross chunk edges of node + _rows = update_cross_edges(cg, node, node_cx_edges_d, start_ts, end_ts) + rows.extend(_rows) + cg.client.write(rows) diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py new file mode 100644 index 000000000..8674e45b7 --- /dev/null +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -0,0 +1,170 @@ +# pylint: disable=invalid-name, missing-docstring, c-extension-no-member + +import math, random, time +import multiprocessing as mp +from collections import defaultdict + +import fastremap +import numpy as np +from multiwrapper import multiprocessing_utils as mu + +from pychunkedgraph.graph import ChunkedGraph +from pychunkedgraph.graph.attributes import Connectivity, Hierarchy +from pychunkedgraph.graph.utils import serializers +from pychunkedgraph.graph.types import empty_2d +from pychunkedgraph.utils.general import chunked + 
+from .utils import exists_as_parent + + +CHILDREN = {} +CX_EDGES = {} + + +def _populate_nodes_and_children( + cg: ChunkedGraph, chunk_id: np.uint64, nodes: list = None +) -> dict: + global CHILDREN + if nodes: + CHILDREN = cg.get_children(nodes) + return + response = cg.range_read_chunk(chunk_id, properties=Hierarchy.Child) + for k, v in response.items(): + CHILDREN[k] = v[0].value + + +def _get_cx_edges_at_timestamp(node, response, ts): + result = defaultdict(list) + for child in CHILDREN[node]: + if child not in response: + continue + for key, cells in response[child].items(): + for cell in cells: + # cells are sorted in descending order of timestamps + if ts >= cell.timestamp: + result[key.index].append(cell.value) + break + for layer, edges in result.items(): + result[layer] = np.concatenate(edges) + return result + + +def _populate_cx_edges_with_timestamps( + cg: ChunkedGraph, layer: int, nodes: list, nodes_ts: list +): + """ + Collect timestamps of edits from children, since we use the same timestamp + for all IDs involved in an edit, we can use the timestamps of + when cross edges of children were updated. + """ + global CX_EDGES + attrs = [Connectivity.CrossChunkEdge[l] for l in range(layer, cg.meta.layer_count)] + all_children = np.concatenate(list(CHILDREN.values())) + response = cg.client.read_nodes(node_ids=all_children, properties=attrs) + for node, node_ts in zip(nodes, nodes_ts): + timestamps = set([node_ts]) + for child in CHILDREN[node]: + if child not in response: + continue + for cells in response[child].values(): + timestamps.update([c.timestamp for c in cells if c.timestamp > node_ts]) + CX_EDGES[node] = {} + for ts in sorted(timestamps): + CX_EDGES[node][ts] = _get_cx_edges_at_timestamp(node, response, ts) + + +def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> list: + """ + Helper function to update a single ID. + Returns a list of mutations with timestamps. 
+ """ + rows = [] + if node_ts > earliest_ts: + try: + cx_edges_d = CX_EDGES[node][node_ts] + except KeyError: + raise KeyError(f"{node}:{node_ts}") + edges = np.concatenate([empty_2d] + list(cx_edges_d.values())) + if edges.size: + parents = cg.get_roots( + edges[:, 0], time_stamp=node_ts, stop_layer=layer, ceil=False + ) + uparents = np.unique(parents) + layers = cg.get_chunk_layers(uparents) + uparents = uparents[layers == layer] + assert uparents.size <= 1, f"{node}, {node_ts}, {uparents}" + if uparents.size == 0 or node != uparents[0]: + # if node is not the parent at this ts, it must be invalid + assert not exists_as_parent(cg, node, edges[:, 0]), f"{node}, {node_ts}" + return rows + + for ts, cx_edges_d in CX_EDGES[node].items(): + edges = np.concatenate([empty_2d] + list(cx_edges_d.values())) + if edges.size == 0: + continue + nodes = np.unique(edges[:, 1]) + parents = cg.get_roots(nodes, time_stamp=ts, stop_layer=layer, ceil=False) + edge_parents_d = dict(zip(nodes, parents)) + val_dict = {} + for _layer, layer_edges in cx_edges_d.items(): + layer_edges = fastremap.remap( + layer_edges, edge_parents_d, preserve_missing_labels=True + ) + layer_edges[:, 0] = node + layer_edges = np.unique(layer_edges, axis=0) + col = Connectivity.CrossChunkEdge[_layer] + val_dict[col] = layer_edges + row_id = serializers.serialize_uint64(node) + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp=ts)) + return rows + + +def _update_cross_edges_helper(args): + cg_info, layer, nodes, nodes_ts, earliest_ts = args + rows = [] + cg = ChunkedGraph(**cg_info) + parents = cg.get_parents(nodes, fail_to_zero=True) + for node, parent, node_ts in zip(nodes, parents, nodes_ts): + if parent == 0: + # invalid id caused by failed ingest task + continue + _rows = update_cross_edges(cg, layer, node, node_ts, earliest_ts) + rows.extend(_rows) + cg.client.write(rows) + + +def update_chunk( + cg: ChunkedGraph, chunk_coords: list[int], layer: int, nodes: list = None +): + """ + Iterate 
over all layer IDs in a chunk and update their cross chunk edges. + """ + start = time.time() + x, y, z = chunk_coords + chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) + _populate_nodes_and_children(cg, chunk_id, nodes=nodes) + if not CHILDREN: + return + nodes = list(CHILDREN.keys()) + random.shuffle(nodes) + nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) + _populate_cx_edges_with_timestamps(cg, layer, nodes, nodes_ts) + + task_size = int(math.ceil(len(nodes) / mp.cpu_count() / 2)) + chunked_nodes = chunked(nodes, task_size) + chunked_nodes_ts = chunked(nodes_ts, task_size) + cg_info = cg.get_serialized_info() + earliest_ts = cg.get_earliest_timestamp() + + multi_args = [] + for chunk, ts_chunk in zip(chunked_nodes, chunked_nodes_ts): + args = (cg_info, layer, chunk, ts_chunk, earliest_ts) + multi_args.append(args) + + print(f"nodes: {len(nodes)}, tasks: {len(multi_args)}, size: {task_size}") + mu.multiprocess_func( + _update_cross_edges_helper, + multi_args, + n_threads=min(len(multi_args), mp.cpu_count()), + ) + print(f"total elaspsed time: {time.time() - start}") diff --git a/pychunkedgraph/ingest/upgrade/utils.py b/pychunkedgraph/ingest/upgrade/utils.py new file mode 100644 index 000000000..43c9a3034 --- /dev/null +++ b/pychunkedgraph/ingest/upgrade/utils.py @@ -0,0 +1,13 @@ +from pychunkedgraph.graph import ChunkedGraph +from pychunkedgraph.graph.attributes import Hierarchy + + +def exists_as_parent(cg: ChunkedGraph, parent, nodes) -> bool: + """ + Check if a given l2 parent is in the history of given nodes. 
+ """ + response = cg.client.read_nodes(node_ids=nodes, properties=Hierarchy.Parent) + parents = set() + for cells in response.values(): + parents.update([cell.value for cell in cells]) + return parent in parents diff --git a/pychunkedgraph/ingest/utils.py b/pychunkedgraph/ingest/utils.py index 1c3236561..3d573ce37 100644 --- a/pychunkedgraph/ingest/utils.py +++ b/pychunkedgraph/ingest/utils.py @@ -1,14 +1,21 @@ # pylint: disable=invalid-name, missing-docstring -from typing import Tuple -from . import ClusterIngestConfig -from . import IngestConfig -from ..graph.meta import ChunkedGraphMeta -from ..graph.meta import DataSource -from ..graph.meta import GraphConfig +import logging +from os import environ +from time import sleep +from typing import Any, Generator, Tuple + +import numpy as np +import tensorstore as ts +from rq import Queue, Worker +from rq.worker import WorkerStatus +from . import IngestConfig +from .manager import IngestionManager +from ..graph.meta import ChunkedGraphMeta, DataSource, GraphConfig from ..graph.client import BackendClientInfo from ..graph.client.bigtable import BigTableConfig +from ..utils.general import chunked chunk_id_str = lambda layer, coords: f"{layer}_{'_'.join(map(str, coords))}" @@ -16,14 +23,12 @@ def bootstrap( graph_id: str, config: dict, - overwrite: bool = False, raw: bool = False, test_run: bool = False, ) -> Tuple[ChunkedGraphMeta, IngestConfig, BackendClientInfo]: """Parse config loaded from a yaml file.""" ingest_config = IngestConfig( **config.get("ingest_config", {}), - CLUSTER=ClusterIngestConfig(), USE_RAW_EDGES=raw, USE_RAW_COMPONENTS=raw, TEST_RUN=test_run, @@ -33,7 +38,7 @@ def bootstrap( graph_config = GraphConfig( ID=f"{graph_id}", - OVERWRITE=overwrite, + OVERWRITE=False, **config["graph_config"], ) data_source = DataSource(**config["data_source"]) @@ -73,3 +78,115 @@ def postprocess_edge_data(im, edge_dict): return new_edge_dict else: raise ValueError(f"Unknown data_version: {data_version}") + + +def 
start_ocdbt_server(imanager: IngestionManager, server: Any): + spec = {"driver": "ocdbt", "base": f"{imanager.cg.meta.data_source.EDGES}/ocdbt"} + spec["coordinator"] = {"address": f"localhost:{server.port}"} + ts.KvStore.open(spec).result() + imanager.redis.set("OCDBT_COORDINATOR_PORT", str(server.port)) + ocdbt_host = environ.get("MY_POD_IP", "localhost") + imanager.redis.set("OCDBT_COORDINATOR_HOST", ocdbt_host) + logging.info(f"OCDBT Coordinator address {ocdbt_host}:{server.port}") + + +def randomize_grid_points(X: int, Y: int, Z: int) -> Generator[int, int, int]: + indices = np.arange(X * Y * Z) + np.random.shuffle(indices) + for index in indices: + yield np.unravel_index(index, (X, Y, Z)) + + +def get_chunks_not_done(imanager: IngestionManager, layer: int, coords: list) -> list: + """check for set membership in redis in batches""" + coords_strs = ["_".join(map(str, coord)) for coord in coords] + try: + completed = imanager.redis.smismember(f"{layer}c", coords_strs) + except Exception: + return coords + return [coord for coord, c in zip(coords, completed) if not c] + + +def print_completion_rate(imanager: IngestionManager, layer: int, span: int = 10): + counts = [] + for _ in range(span + 1): + counts.append(imanager.redis.scard(f"{layer}c")) + sleep(1) + rate = np.diff(counts).sum() / span + print(f"{rate} chunks per second.") + + +def print_ingest_status(imanager: IngestionManager, redis, upgrade: bool = False): + """ + Helper to print status to console. + If `upgrade=True`, status does not include the root layer, + since there is no need to update cross edges for root ids. 
+ """ + layers = range(2, imanager.cg_meta.layer_count + 1) + if upgrade: + layers = range(2, imanager.cg_meta.layer_count) + layer_counts = imanager.cg_meta.layer_chunk_counts + + pipeline = redis.pipeline() + worker_busy = [] + for layer in layers: + pipeline.scard(f"{layer}c") + queue = Queue(f"l{layer}", connection=redis) + pipeline.llen(queue.key) + pipeline.zcard(queue.failed_job_registry.key) + workers = Worker.all(queue=queue) + worker_busy.append(sum([w.get_state() == WorkerStatus.BUSY for w in workers])) + + results = pipeline.execute() + completed = [] + queued = [] + failed = [] + for i in range(0, len(results), 3): + result = results[i : i + 3] + completed.append(result[0]) + queued.append(result[1]) + failed.append(result[2]) + + print(f"version: \t{imanager.cg.version}") + print(f"graph_id: \t{imanager.cg.graph_id}") + print(f"chunk_size: \t{imanager.cg.meta.graph_config.CHUNK_SIZE}") + print("\nlayer status:") + for layer, done, count in zip(layers, completed, layer_counts): + print(f"{layer}\t: {done:<9} / {count}") + + print("\n\nqueue status:") + for layer, q, f, wb in zip(layers, queued, failed, worker_busy): + print(f"l{layer}\t: queued: {q:<10} failed: {f:<10} busy: {wb}") + + +def queue_layer_helper(parent_layer: int, imanager: IngestionManager, fn): + if parent_layer == imanager.cg_meta.layer_count: + chunk_coords = [(0, 0, 0)] + else: + bounds = imanager.cg_meta.layer_chunk_bounds[parent_layer] + chunk_coords = randomize_grid_points(*bounds) + + q = imanager.get_task_queue(f"l{parent_layer}") + batch_size = int(environ.get("JOB_BATCH_SIZE", 10000)) + timeout_scale = int(environ.get("TIMEOUT_SCALE_FACTOR", 1)) + batches = chunked(chunk_coords, batch_size) + for batch in batches: + _coords = get_chunks_not_done(imanager, parent_layer, batch) + # buffer for optimal use of redis memory + if len(q) > int(environ.get("QUEUE_SIZE", 100000)): + interval = int(environ.get("QUEUE_INTERVAL", 300)) + logging.info(f"Queue full; sleeping {interval}s...") 
+ sleep(interval) + + job_datas = [] + for chunk_coord in _coords: + job_datas.append( + Queue.prepare_data( + fn, + args=(parent_layer, chunk_coord), + result_ttl=0, + job_id=chunk_id_str(parent_layer, chunk_coord), + timeout=f"{timeout_scale * int(parent_layer * parent_layer)}m", + ) + ) + q.enqueue_many(job_datas) diff --git a/pychunkedgraph/repair/edits.py b/pychunkedgraph/repair/edits.py index cb403a380..849b17e08 100644 --- a/pychunkedgraph/repair/edits.py +++ b/pychunkedgraph/repair/edits.py @@ -56,8 +56,6 @@ def repair_operation( op_ids_to_retry.append(locked_op) print(f"{node_id} indefinitely locked by op {locked_op}") print(f"total to retry: {len(op_ids_to_retry)}") - - logs = cg.client.read_log_entries(op_ids_to_retry) - for op_id, log in logs.items(): + for op_id in op_ids_to_retry: print(f"repairing {op_id}") - repair_operation(cg, log, op_id) + repair_operation(cg, op_id) diff --git a/pychunkedgraph/tests/helpers.py b/pychunkedgraph/tests/helpers.py index de5314422..b9c689ad6 100644 --- a/pychunkedgraph/tests/helpers.py +++ b/pychunkedgraph/tests/helpers.py @@ -14,12 +14,12 @@ from google.cloud import bigtable from ..ingest.utils import bootstrap -from ..ingest.create.atomic_layer import add_atomic_edges +from ..ingest.create.atomic_layer import add_atomic_chunk from ..graph.edges import Edges from ..graph.edges import EDGE_TYPES from ..graph.utils import basetypes from ..graph.chunkedgraph import ChunkedGraph -from ..ingest.create.abstract_layers import add_layer +from ..ingest.create.parent_layer import add_parent_chunk class CloudVolumeBounds(object): @@ -120,7 +120,7 @@ def _cgraph(request, n_layers=10, atomic_chunk_bounds: np.ndarray = np.array([]) "FANOUT": 2, "SPATIAL_BITS": 10, "ID_PREFIX": "", - "ROOT_LOCK_EXPIRY": timedelta(seconds=5) + "ROOT_LOCK_EXPIRY": timedelta(seconds=5), }, "backend_client": { "TYPE": "bigtable", @@ -130,15 +130,14 @@ def _cgraph(request, n_layers=10, atomic_chunk_bounds: np.ndarray = np.array([]) "PROJECT": 
"IGNORE_ENVIRONMENT_PROJECT", "INSTANCE": "emulated_instance", "CREDENTIALS": credentials.AnonymousCredentials(), - "MAX_ROW_KEY_COUNT": 1000 + "MAX_ROW_KEY_COUNT": 1000, }, }, "ingest_config": {}, } meta, _, client_info = bootstrap("test", config=config) - graph = ChunkedGraph(graph_id="test", meta=meta, - client_info=client_info) + graph = ChunkedGraph(graph_id="test", meta=meta, client_info=client_info) graph.mock_edges = Edges([], []) graph.meta._ws_cv = CloudVolumeMock() graph.meta.layer_count = n_layers @@ -176,8 +175,7 @@ def gen_graph_simplequerytest(request, gen_graph): # Chunk B create_chunk( graph, - vertices=[to_label(graph, 1, 1, 0, 0, 0), - to_label(graph, 1, 1, 0, 0, 1)], + vertices=[to_label(graph, 1, 1, 0, 0, 0), to_label(graph, 1, 1, 0, 0, 1)], edges=[ (to_label(graph, 1, 1, 0, 0, 0), to_label(graph, 1, 1, 0, 0, 1), 0.5), (to_label(graph, 1, 1, 0, 0, 0), to_label(graph, 1, 2, 0, 0, 0), inf), @@ -188,13 +186,12 @@ def gen_graph_simplequerytest(request, gen_graph): create_chunk( graph, vertices=[to_label(graph, 1, 2, 0, 0, 0)], - edges=[(to_label(graph, 1, 2, 0, 0, 0), - to_label(graph, 1, 1, 0, 0, 0), inf)], + edges=[(to_label(graph, 1, 2, 0, 0, 0), to_label(graph, 1, 1, 0, 0, 0), inf)], ) - add_layer(graph, 3, [0, 0, 0], n_threads=1) - add_layer(graph, 3, [1, 0, 0], n_threads=1) - add_layer(graph, 4, [0, 0, 0], n_threads=1) + add_parent_chunk(graph, 3, [0, 0, 0], n_threads=1) + add_parent_chunk(graph, 3, [1, 0, 0], n_threads=1) + add_parent_chunk(graph, 4, [0, 0, 0], n_threads=1) return graph @@ -206,8 +203,7 @@ def create_chunk(cg, vertices=None, edges=None, timestamp=None): edges = edges if edges else [] vertices = vertices if vertices else [] vertices = np.unique(np.array(vertices, dtype=np.uint64)) - edges = [(np.uint64(v1), np.uint64(v2), np.float32(aff)) - for v1, v2, aff in edges] + edges = [(np.uint64(v1), np.uint64(v2), np.float32(aff)) for v1, v2, aff in edges] isolated_ids = [ x for x in vertices @@ -230,8 +226,7 @@ def create_chunk(cg, 
vertices=None, edges=None, timestamp=None): chunk_id = None if len(chunk_edges_active[EDGE_TYPES.in_chunk]): - chunk_id = cg.get_chunk_id( - chunk_edges_active[EDGE_TYPES.in_chunk].node_ids1[0]) + chunk_id = cg.get_chunk_id(chunk_edges_active[EDGE_TYPES.in_chunk].node_ids1[0]) elif len(vertices): chunk_id = cg.get_chunk_id(vertices[0]) @@ -257,7 +252,7 @@ def create_chunk(cg, vertices=None, edges=None, timestamp=None): cg.mock_edges += all_edges isolated_ids = np.array(isolated_ids, dtype=np.uint64) - add_atomic_edges( + add_atomic_chunk( cg, cg.get_chunk_coordinates(chunk_id), chunk_edges_active, @@ -282,21 +277,21 @@ def get_layer_chunk_bounds( return layer_bounds_d -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def sv_data(): - test_data_dir = 'pychunkedgraph/tests/data' - edges_file = f'{test_data_dir}/sv_edges.npy' + test_data_dir = "pychunkedgraph/tests/data" + edges_file = f"{test_data_dir}/sv_edges.npy" sv_edges = np.load(edges_file) - source_file = f'{test_data_dir}/sv_sources.npy' + source_file = f"{test_data_dir}/sv_sources.npy" sv_sources = np.load(source_file) - sinks_file = f'{test_data_dir}/sv_sinks.npy' + sinks_file = f"{test_data_dir}/sv_sinks.npy" sv_sinks = np.load(sinks_file) - affinity_file = f'{test_data_dir}/sv_affinity.npy' + affinity_file = f"{test_data_dir}/sv_affinity.npy" sv_affinity = np.load(affinity_file) - area_file = f'{test_data_dir}/sv_area.npy' + area_file = f"{test_data_dir}/sv_area.npy" sv_area = np.load(area_file) yield (sv_edges, sv_sources, sv_sinks, sv_affinity, sv_area) diff --git a/pychunkedgraph/tests/test_uncategorized.py b/pychunkedgraph/tests/test_uncategorized.py index 93c41158d..8b26f5c5e 100644 --- a/pychunkedgraph/tests/test_uncategorized.py +++ b/pychunkedgraph/tests/test_uncategorized.py @@ -36,7 +36,7 @@ from ..graph.lineage import get_future_root_ids from ..graph.utils.serializers import serialize_uint64 from ..graph.utils.serializers import deserialize_uint64 -from 
..ingest.create.abstract_layers import add_layer +from ..ingest.create.parent_layer import add_parent_chunk class TestGraphNodeConversion: @@ -68,9 +68,9 @@ def test_node_id_adjacency(self, gen_graph): ) == cg.get_node_id(np.uint64(1), layer=2, x=3, y=1, z=0) assert cg.get_node_id( - np.uint64(2 ** 53 - 2), layer=10, x=0, y=0, z=0 + np.uint64(2**53 - 2), layer=10, x=0, y=0, z=0 ) + np.uint64(1) == cg.get_node_id( - np.uint64(2 ** 53 - 1), layer=10, x=0, y=0, z=0 + np.uint64(2**53 - 1), layer=10, x=0, y=0, z=0 ) @pytest.mark.timeout(30) @@ -82,9 +82,9 @@ def test_serialize_node_id(self, gen_graph): ) < serialize_uint64(cg.get_node_id(np.uint64(1), layer=2, x=3, y=1, z=0)) assert serialize_uint64( - cg.get_node_id(np.uint64(2 ** 53 - 2), layer=10, x=0, y=0, z=0) + cg.get_node_id(np.uint64(2**53 - 2), layer=10, x=0, y=0, z=0) ) < serialize_uint64( - cg.get_node_id(np.uint64(2 ** 53 - 1), layer=10, x=0, y=0, z=0) + cg.get_node_id(np.uint64(2**53 - 1), layer=10, x=0, y=0, z=0) ) @pytest.mark.timeout(30) @@ -222,7 +222,7 @@ def test_build_single_across_edge(self, gen_graph): edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf)], ) - add_layer(cg, 3, [0, 0, 0], n_threads=1) + add_parent_chunk(cg, 3, [0, 0, 0], n_threads=1) res = cg.client._table.read_rows() res.consume_all() @@ -327,7 +327,7 @@ def test_build_single_edge_and_single_across_edge(self, gen_graph): edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf)], ) - add_layer(cg, 3, np.array([0, 0, 0]), n_threads=1) + add_parent_chunk(cg, 3, np.array([0, 0, 0]), n_threads=1) res = cg.client._table.read_rows() res.consume_all() @@ -424,10 +424,10 @@ def test_build_big_graph(self, gen_graph): # Preparation: Build Chunk Z create_chunk(cg, vertices=[to_label(cg, 1, 7, 7, 7, 0)], edges=[]) - add_layer(cg, 3, [0, 0, 0], n_threads=1) - add_layer(cg, 3, [3, 3, 3], n_threads=1) - add_layer(cg, 4, [0, 0, 0], n_threads=1) - add_layer(cg, 5, [0, 0, 0], n_threads=1) + add_parent_chunk(cg, 3, [0, 
0, 0], n_threads=1) + add_parent_chunk(cg, 3, [3, 3, 3], n_threads=1) + add_parent_chunk(cg, 4, [0, 0, 0], n_threads=1) + add_parent_chunk(cg, 5, [0, 0, 0], n_threads=1) res = cg.client._table.read_rows() res.consume_all() @@ -468,21 +468,21 @@ def test_double_chunk_creation(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [0, 0, 0], @@ -831,7 +831,7 @@ def test_merge_pair_neighboring_chunks(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -887,28 +887,28 @@ def test_merge_pair_disconnected_chunks(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 3, [3, 3, 3], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 5, [0, 0, 0], @@ -1052,7 +1052,7 @@ def test_merge_triple_chain_to_full_circle_neighboring_chunks(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1111,35 +1111,35 @@ def test_merge_triple_chain_to_full_circle_disconnected_chunks(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 3, [3, 3, 3], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [1, 1, 1], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 5, [0, 0, 0], @@ -1239,7 +1239,7 @@ def test_merge_pair_abstract_nodes(self, gen_graph): timestamp=fake_timestamp, ) - 
add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1314,7 +1314,7 @@ def test_diagonal_connections(self, gen_graph): edges=[(to_label(cg, 1, 1, 1, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf)], ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1405,28 +1405,28 @@ def test_cross_edges(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 3, [1, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 5, [0, 0, 0], @@ -1591,7 +1591,7 @@ def test_cut_regular_link(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1662,7 +1662,7 @@ def test_cut_no_link(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1723,7 +1723,7 @@ def test_cut_old_link(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1791,7 +1791,7 @@ def test_cut_indivisible_link(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1922,7 +1922,7 @@ def test_cut_merge_history(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2063,7 +2063,7 @@ def test_lock_unlock(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2129,7 +2129,7 @@ def test_lock_expiration(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2197,7 +2197,7 @@ def test_lock_renew(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2249,7 +2249,7 @@ def test_lock_merge_lock_old_id(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2315,7 +2315,7 @@ def 
test_indefinite_lock(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2388,7 +2388,7 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], diff --git a/pychunkedgraph/utils/general.py b/pychunkedgraph/utils/general.py index 71e24eab0..c299d3b9b 100644 --- a/pychunkedgraph/utils/general.py +++ b/pychunkedgraph/utils/general.py @@ -1,9 +1,11 @@ """ generic helper funtions """ + from typing import Sequence from itertools import islice + import numpy as np @@ -24,6 +26,10 @@ def reverse_dictionary(dictionary): def chunked(l: Sequence, n: int): + """ + Yield successive n-sized chunks from l. + NOTE: Use itertools.batched from python 3.12 + """ """ Yield successive n-sized chunks from l. NOTE: Use itertools.batched from python 3.12 @@ -33,6 +39,9 @@ def chunked(l: Sequence, n: int): it = iter(l) while batch := tuple(islice(it, n)): yield batch + it = iter(l) + while batch := tuple(islice(it, n)): + yield batch def in2d(arr1: np.ndarray, arr2: np.ndarray) -> np.ndarray: diff --git a/requirements.in b/requirements.in index 63e0b3472..1ec536a5c 100644 --- a/requirements.in +++ b/requirements.in @@ -15,6 +15,7 @@ rq<2 pyyaml cachetools werkzeug +tensorstore # PyPI only: cloud-files>=4.21.1 diff --git a/requirements.txt b/requirements.txt index 5a2f18adc..059b8fd91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -192,6 +192,8 @@ messagingclient==0.1.3 # via -r requirements.in middle-auth-client==3.16.1 # via -r requirements.in +ml-dtypes==0.3.2 + # via tensorstore multiprocess==0.70.15 # via pathos multiwrapper==0.1.1 @@ -210,11 +212,13 @@ numpy==1.26.0 # fastremap # fpzip # messagingclient + # ml-dtypes # multiwrapper # pandas # pyspng-seunglab # simplejpeg # task-queue + # tensorstore # zfpc # zmesh orderedmultidict==1.0.1 @@ -337,6 +341,8 @@ tenacity==8.2.3 # cloud-files # cloud-volume # task-queue 
+tensorstore==0.1.53 + # via -r requirements.in tqdm==4.66.1 # via # cloud-files From c4ac266caf87db9785b519422ad32650d5771c2d Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 12 May 2024 16:10:12 +0000 Subject: [PATCH 082/116] reset version v3 --- .bumpversion.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index b6b4de269..5583246c5 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.21.1 +current_version = 3.0.0 commit = True tag = True From 03179b4dbb97fcea20e3684260af0a12e45f7df8 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 12 May 2024 18:18:05 +0000 Subject: [PATCH 083/116] breakup long fn --- pychunkedgraph/ingest/create/parent_layer.py | 96 ++++++++++---------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/pychunkedgraph/ingest/create/parent_layer.py b/pychunkedgraph/ingest/create/parent_layer.py index 09be61407..a777d9efc 100644 --- a/pychunkedgraph/ingest/create/parent_layer.py +++ b/pychunkedgraph/ingest/create/parent_layer.py @@ -154,13 +154,50 @@ def _write_components_helper(args): _write(cg, layer, pcoords, ccs, node_layer_d, time_stamp) +def _children_rows( + cg: ChunkedGraph, parent_id, children: Sequence, cx_edges_d: dict, time_stamp +): + """ + Update children rows to point to the parent_id, collect cached children + cross chunk edges to lift and update parent cross chunk edges. + Returns list of mutations to children and list of children cross edges. 
+ """ + rows = [] + children_cx_edges = [] + for child in children: + node_layer = cg.get_chunk_layer(child) + row_id = serializers.serialize_uint64(child) + val_dict = {attributes.Hierarchy.Parent: parent_id} + node_cx_edges_d = cx_edges_d.get(child, {}) + if not node_cx_edges_d: + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) + continue + for layer in range(node_layer, cg.meta.layer_count): + if not layer in node_cx_edges_d: + continue + layer_edges = node_cx_edges_d[layer] + nodes = np.unique(layer_edges) + parents = cg.get_roots(nodes, stop_layer=node_layer, ceil=False) + edge_parents_d = dict(zip(nodes, parents)) + layer_edges = fastremap.remap( + layer_edges, edge_parents_d, preserve_missing_labels=True + ) + layer_edges = np.unique(layer_edges, axis=0) + col = attributes.Connectivity.CrossChunkEdge[layer] + val_dict[col] = layer_edges + node_cx_edges_d[layer] = layer_edges + children_cx_edges.append(node_cx_edges_d) + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) + return rows, children_cx_edges + + def _write( cg: ChunkedGraph, layer_id, parent_coords, components, node_layer_d, - time_stamp, + ts, use_threads=True, ): parent_layers = range(layer_id, cg.meta.layer_count + 1) @@ -175,71 +212,34 @@ def _write( x, y, z = parent_coords parent_chunk_id = cg.get_chunk_id(layer=layer_id, x=x, y=y, z=z) parent_chunk_id_dict = cg.get_parent_chunk_id_dict(parent_chunk_id) - for parent_layer in parent_layers: if len(cc_connections[parent_layer]) == 0: continue - parent_chunk_id = parent_chunk_id_dict[parent_layer] reserved_parent_ids = cg.id_client.create_node_ids( parent_chunk_id, size=len(cc_connections[parent_layer]), root_chunk=parent_layer == cg.meta.layer_count and use_threads, ) - - for i_cc, node_ids in enumerate(cc_connections[parent_layer]): - parent_id = reserved_parent_ids[i_cc] - + for i_cc, children in enumerate(cc_connections[parent_layer]): + parent = reserved_parent_ids[i_cc] if layer_id == 3: # when layer 3 is being 
processed, children chunks are at layer 2 # layer 2 chunks at this time will only have atomic cross edges - cx_edges_d = cg.get_atomic_cross_edges(node_ids) + cx_edges_d = cg.get_atomic_cross_edges(children) else: - # children are from abstract chunks - cx_edges_d = cg.get_cross_chunk_edges(node_ids, raw_only=True) - - children_cx_edges = [] - for node in node_ids: - node_layer = cg.get_chunk_layer(node) - row_id = serializers.serialize_uint64(node) - val_dict = {attributes.Hierarchy.Parent: parent_id} - - node_cx_edges_d = cx_edges_d.get(node, {}) - if not node_cx_edges_d: - rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) - continue - - for layer in range(node_layer, cg.meta.layer_count): - if not layer in node_cx_edges_d: - continue - layer_edges = node_cx_edges_d[layer] - nodes = np.unique(layer_edges) - parents = cg.get_roots(nodes, stop_layer=node_layer, ceil=False) - - edge_parents_d = dict(zip(nodes, parents)) - layer_edges = fastremap.remap( - layer_edges, edge_parents_d, preserve_missing_labels=True - ) - layer_edges = np.unique(layer_edges, axis=0) - - col = attributes.Connectivity.CrossChunkEdge[layer] - val_dict[col] = layer_edges - node_cx_edges_d[layer] = layer_edges - children_cx_edges.append(node_cx_edges_d) - rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) - - row_id = serializers.serialize_uint64(parent_id) - val_dict = {attributes.Hierarchy.Child: node_ids} - parent_cx_edges_d = concatenate_cross_edge_dicts( - children_cx_edges, unique=True - ) + cx_edges_d = cg.get_cross_chunk_edges(children, raw_only=True) + _rows, cx_edges = _children_rows(cg, parent, children, cx_edges_d, ts) + rows.extend(_rows) + row_id = serializers.serialize_uint64(parent) + val_dict = {attributes.Hierarchy.Child: children} + parent_cx_edges_d = concatenate_cross_edge_dicts(cx_edges, unique=True) for layer in range(parent_layer, cg.meta.layer_count): if not layer in parent_cx_edges_d: continue col = 
attributes.Connectivity.CrossChunkEdge[layer] val_dict[col] = parent_cx_edges_d[layer] - - rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) + rows.append(cg.client.mutate_row(row_id, val_dict, ts)) if len(rows) > 100000: cg.client.write(rows) rows = [] From 3eba2a8a85d47ceb8282ab3e3b715cd36c4cd14f Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Wed, 15 May 2024 00:11:42 +0000 Subject: [PATCH 084/116] gh actions for pcgv3 --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 899f0431f..fd20bf4b7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -4,9 +4,11 @@ on: push: branches: - "main" + - "pcgv3" pull_request: branches: - "main" + - "pcgv3" jobs: unit-tests: From 5e0162208f6dd483c58df4c9a85d36db57f8eb97 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 24 May 2024 21:31:39 -0500 Subject: [PATCH 085/116] update split tests (#497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(ingest): use temporarily cached cross chunk edges * fix: switch to using partners vector instead of 2d edges array * fix(edits): l2 - use and store cx edges that become relevant only at l2 * chore: rename counterpart to partner * fix: update partner cx edges * feat(edits): use layer relevant partners * fix tests * persist cross chunk layers with each node * fix: update cross chunk layers in edits * fix: update cross layer from old ids in l2 * update deprecated utcnoww * fix split tests * Bump version: 3.0.0 → 3.0.1 * fix: missed timestamp arg * update docs, remove unnecessary methods * revert structural changes * fix new tests; revert bumpversion.cfg --- pychunkedgraph/graph/edits.py | 22 +- pychunkedgraph/graph/misc.py | 58 +- pychunkedgraph/graph/utils/basetypes.py | 22 +- pychunkedgraph/ingest/create/parent_layer.py | 3 +- pychunkedgraph/tests/helpers.py | 1 + 
pychunkedgraph/tests/test_uncategorized.py | 2141 ++++++++---------- 6 files changed, 1036 insertions(+), 1211 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index ee7e643c3..807fff257 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -251,7 +251,7 @@ def add_edges( return new_roots, new_l2_ids, create_parents.new_entries -def _process_l2_agglomeration( +def _split_l2_agglomeration( cg, operation_id: int, agg: types.Agglomeration, @@ -272,16 +272,16 @@ def _process_l2_agglomeration( # if there aren't any, there must be no parents. XOR these 2 conditions. err = f"got cross edges from more than one l2 node; op {operation_id}" assert (np.unique(parents).size == 1) != (cross_edges.size == 0), err - root = cg.get_root(parents[0], time_stamp=parent_ts, raw_only=True) - - # inactive edges must be filtered out - neighbor_roots = cg.get_roots( - cross_edges[:, 1], raw_only=True, time_stamp=parent_ts - ) - active_mask = neighbor_roots == root - cross_edges = cross_edges[active_mask] - cross_edges = cross_edges[~in2d(cross_edges, removed_edges)] + if cross_edges.size: + # inactive edges must be filtered out + root = cg.get_root(parents[0], time_stamp=parent_ts, raw_only=True) + neighbor_roots = cg.get_roots( + cross_edges[:, 1], raw_only=True, time_stamp=parent_ts + ) + active_mask = neighbor_roots == root + cross_edges = cross_edges[active_mask] + cross_edges = cross_edges[~in2d(cross_edges, removed_edges)] isolated_ids = agg.supervoxels[~np.in1d(agg.supervoxels, chunk_edges)] isolated_edges = np.column_stack((isolated_ids, isolated_ids)) graph, _, _, graph_ids = flatgraph.build_gt_graph( @@ -332,7 +332,7 @@ def remove_edges( new_l2_ids = [] for id_ in l2ids: agg = l2id_agglomeration_d[id_] - ccs, graph_ids, cross_edges = _process_l2_agglomeration( + ccs, graph_ids, cross_edges = _split_l2_agglomeration( cg, operation_id, agg, removed_edges, parent_ts ) new_parents = 
cg.id_client.create_node_ids(chunk_id_map[agg.node_id], len(ccs)) diff --git a/pychunkedgraph/graph/misc.py b/pychunkedgraph/graph/misc.py index 873422db1..0f53c71c3 100644 --- a/pychunkedgraph/graph/misc.py +++ b/pychunkedgraph/graph/misc.py @@ -8,7 +8,6 @@ import fastremap import numpy as np -from multiwrapper import multiprocessing_utils as mu from . import ChunkedGraph from . import attributes @@ -51,22 +50,6 @@ def _read_delta_root_rows( return new_root_ids, expired_root_ids -def _read_root_rows_thread(args) -> list: - start_seg_id, end_seg_id, serialized_cg_info, time_stamp = args - cg = ChunkedGraph(**serialized_cg_info) - start_id = cg.get_node_id(segment_id=start_seg_id, chunk_id=cg.root_chunk_id) - end_id = cg.get_node_id(segment_id=end_seg_id, chunk_id=cg.root_chunk_id) - rows = cg.client.read_nodes( - start_id=start_id, - end_id=end_id, - end_id_inclusive=False, - end_time=time_stamp, - end_time_inclusive=True, - ) - root_ids = [k for (k, v) in rows.items() if attributes.Hierarchy.NewParent not in v] - return root_ids - - def get_proofread_root_ids( cg: ChunkedGraph, start_time: Optional[datetime.datetime] = None, @@ -94,43 +77,12 @@ def get_proofread_root_ids( def get_latest_roots( - cg, time_stamp: Optional[datetime.datetime] = None, n_threads: int = 1 + cg: ChunkedGraph, time_stamp: Optional[datetime.datetime] = None, n_threads: int = 1 ) -> Sequence[np.uint64]: - # Create filters: time and id range - max_seg_id = cg.get_max_seg_id(cg.root_chunk_id) + 1 - n_blocks = 1 if n_threads == 1 else int(np.min([n_threads * 3 + 1, max_seg_id])) - seg_id_blocks = np.linspace(1, max_seg_id, n_blocks + 1, dtype=np.uint64) - cg_serialized_info = cg.get_serialized_info() - if n_threads > 1: - del cg_serialized_info["credentials"] - - multi_args = [] - for i_id_block in range(0, len(seg_id_blocks) - 1): - multi_args.append( - [ - seg_id_blocks[i_id_block], - seg_id_blocks[i_id_block + 1], - cg_serialized_info, - time_stamp, - ] - ) - - if n_threads == 1: - results = 
mu.multiprocess_func( - _read_root_rows_thread, - multi_args, - n_threads=n_threads, - verbose=False, - debug=n_threads == 1, - ) - else: - results = mu.multisubprocess_func( - _read_root_rows_thread, multi_args, n_threads=n_threads - ) - root_ids = [] - for result in results: - root_ids.extend(result) - return np.array(root_ids, dtype=np.uint64) + root_chunk = cg.get_chunk_id(layer=cg.meta.layer_count, x=0, y=0, z=0) + rr = cg.range_read_chunk(root_chunk, time_stamp=time_stamp) + roots = [k for k, v in rr.items() if attributes.Hierarchy.NewParent not in v] + return np.array(roots, dtype=np.uint64) def get_delta_roots( diff --git a/pychunkedgraph/graph/utils/basetypes.py b/pychunkedgraph/graph/utils/basetypes.py index e55324e6a..c6b0b1974 100644 --- a/pychunkedgraph/graph/utils/basetypes.py +++ b/pychunkedgraph/graph/utils/basetypes.py @@ -1,16 +1,16 @@ import numpy as np -CHUNK_ID = SEGMENT_ID = NODE_ID = OPERATION_ID = np.dtype('uint64').newbyteorder('L') -EDGE_AFFINITY = np.dtype('float32').newbyteorder('L') -EDGE_AREA = np.dtype('uint64').newbyteorder('L') +CHUNK_ID = SEGMENT_ID = NODE_ID = OPERATION_ID = np.dtype("uint64").newbyteorder("L") +EDGE_AFFINITY = np.dtype("float32").newbyteorder("L") +EDGE_AREA = np.dtype("uint64").newbyteorder("L") -COUNTER = np.dtype('int64').newbyteorder('B') +COUNTER = np.dtype("int64").newbyteorder("B") -COORDINATES = np.dtype('int64').newbyteorder('L') -CHUNKSIZE = np.dtype('uint64').newbyteorder('L') -FANOUT = np.dtype('uint64').newbyteorder('L') -LAYERCOUNT = np.dtype('uint64').newbyteorder('L') -SPATIALBITS = np.dtype('uint64').newbyteorder('L') -ROOTCOUNTERBITS = np.dtype('uint64').newbyteorder('L') -SKIPCONNECTIONS = np.dtype('uint64').newbyteorder('L') \ No newline at end of file +COORDINATES = np.dtype("int64").newbyteorder("L") +CHUNKSIZE = np.dtype("uint64").newbyteorder("L") +FANOUT = np.dtype("uint64").newbyteorder("L") +LAYERCOUNT = np.dtype("uint64").newbyteorder("L") +SPATIALBITS = 
np.dtype("uint64").newbyteorder("L") +ROOTCOUNTERBITS = np.dtype("uint64").newbyteorder("L") +SKIPCONNECTIONS = np.dtype("uint64").newbyteorder("L") diff --git a/pychunkedgraph/ingest/create/parent_layer.py b/pychunkedgraph/ingest/create/parent_layer.py index a777d9efc..90b24d26a 100644 --- a/pychunkedgraph/ingest/create/parent_layer.py +++ b/pychunkedgraph/ingest/create/parent_layer.py @@ -164,7 +164,8 @@ def _children_rows( """ rows = [] children_cx_edges = [] - for child in children: + children_layers = cg.get_chunk_layers(children) + for child, node_layer in zip(children, children_layers): node_layer = cg.get_chunk_layer(child) row_id = serializers.serialize_uint64(child) val_dict = {attributes.Hierarchy.Parent: parent_id} diff --git a/pychunkedgraph/tests/helpers.py b/pychunkedgraph/tests/helpers.py index b9c689ad6..551c596bf 100644 --- a/pychunkedgraph/tests/helpers.py +++ b/pychunkedgraph/tests/helpers.py @@ -257,6 +257,7 @@ def create_chunk(cg, vertices=None, edges=None, timestamp=None): cg.get_chunk_coordinates(chunk_id), chunk_edges_active, isolated=isolated_ids, + time_stamp=timestamp, ) diff --git a/pychunkedgraph/tests/test_uncategorized.py b/pychunkedgraph/tests/test_uncategorized.py index 8b26f5c5e..5c2de29d4 100644 --- a/pychunkedgraph/tests/test_uncategorized.py +++ b/pychunkedgraph/tests/test_uncategorized.py @@ -1,20 +1,10 @@ -import collections -import os -import subprocess -import sys from time import sleep -from datetime import datetime, timedelta -from functools import partial +from datetime import datetime, timedelta, UTC from math import inf -from signal import SIGTERM -from unittest import mock from warnings import warn import numpy as np import pytest -from google.auth import credentials -from google.cloud import bigtable -from grpc._channel import _Rendezvous from .helpers import ( bigtable_emulator, @@ -24,13 +14,14 @@ to_label, sv_data, ) +from ..graph import ChunkedGraph from ..graph import types from ..graph import attributes from 
..graph import exceptions -from ..graph import chunkedgraph from ..graph.edges import Edges from ..graph.utils import basetypes -from ..graph.misc import get_delta_roots +from ..graph.lineage import lineage_graph +from ..graph.misc import get_delta_roots, get_latest_roots from ..graph.cutting import run_multicut from ..graph.lineage import get_root_id_history from ..graph.lineage import get_future_root_ids @@ -452,7 +443,7 @@ def test_double_chunk_creation(self, gen_graph): cg = gen_graph(n_layers=4, atomic_chunk_bounds=atomic_chunk_bounds) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -775,7 +766,7 @@ def test_merge_pair_same_chunk(self, gen_graph): cg = gen_graph(n_layers=2, atomic_chunk_bounds=atomic_chunk_bounds) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], @@ -815,7 +806,7 @@ def test_merge_pair_neighboring_chunks(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -871,7 +862,7 @@ def test_merge_pair_disconnected_chunks(self, gen_graph): cg = gen_graph(n_layers=5) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -955,7 +946,7 @@ def test_merge_pair_already_connected(self, gen_graph): cg = gen_graph(n_layers=2) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - 
timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], @@ -996,7 +987,7 @@ def test_merge_triple_chain_to_full_circle_same_chunk(self, gen_graph): cg = gen_graph(n_layers=2) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[ @@ -1033,7 +1024,7 @@ def test_merge_triple_chain_to_full_circle_neighboring_chunks(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], @@ -1082,7 +1073,7 @@ def test_merge_triple_chain_to_full_circle_disconnected_chunks(self, gen_graph): cg = gen_graph(n_layers=5) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], @@ -1181,7 +1172,7 @@ def test_merge_same_node(self, gen_graph): cg = gen_graph(n_layers=2) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1223,7 +1214,7 @@ def test_merge_pair_abstract_nodes(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1352,7 +1343,7 @@ def test_cross_edges(self, gen_graph): cg = gen_graph(n_layers=5) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[ @@ -1466,81 
+1457,72 @@ def test_multiple_cuts_and_splits(self, gen_graph_simplequerytest): child_ids = np.concatenate(child_ids) for i in range(10): - - print(f"\n\nITERATION {i}/10") - print("\n\nMERGE 1 & 3\n\n") + print(f"\n\nITERATION {i}/10 - MERGE 1 & 3") new_roots = cg.add_edges( "Jane Doe", [to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1)], affinities=0.9, ).new_root_ids - assert len(new_roots) == 1 + assert len(new_roots) == 1, new_roots assert len(cg.get_subgraph([new_roots[0]], leaves_only=True)) == 4 - root_ids = [] - for child_id in child_ids: - root_ids.append(cg.get_root(child_id)) - + root_ids = cg.get_roots(child_ids, assert_roots=True) + print(child_ids) + print(list(root_ids)) u_root_ids = np.unique(root_ids) - assert len(u_root_ids) == 1 + assert len(u_root_ids) == 1, u_root_ids # ------------------------------------------------------------------ + print(f"\n\nITERATION {i}/10 - SPLIT 2 & 3") new_roots = cg.remove_edges( "John Doe", source_ids=to_label(cg, 1, 1, 0, 0, 0), sink_ids=to_label(cg, 1, 1, 0, 0, 1), mincut=False, ).new_root_ids + assert len(new_roots) == 2, new_roots - assert len(np.unique(new_roots)) == 2 - - root_ids = [] - for child_id in child_ids: - root_ids.append(cg.get_root(child_id)) - + root_ids = cg.get_roots(child_ids, assert_roots=True) + print(child_ids) + print(list(root_ids)) u_root_ids = np.unique(root_ids) these_child_ids = [] for root_id in u_root_ids: these_child_ids.extend(cg.get_subgraph([root_id], leaves_only=True)) assert len(these_child_ids) == 4 - assert len(u_root_ids) == 2 + assert len(u_root_ids) == 2, u_root_ids # ------------------------------------------------------------------ - + print(f"\n\nITERATION {i}/10 - SPLIT 1 & 3") new_roots = cg.remove_edges( "Jane Doe", source_ids=to_label(cg, 1, 0, 0, 0, 0), sink_ids=to_label(cg, 1, 1, 0, 0, 1), mincut=False, ).new_root_ids - assert len(new_roots) == 2 - - root_ids = [] - for child_id in child_ids: - root_ids.append(cg.get_root(child_id)) + assert 
len(new_roots) == 2, new_roots + root_ids = cg.get_roots(child_ids, assert_roots=True) + print(child_ids) + print(list(root_ids)) u_root_ids = np.unique(root_ids) - assert len(u_root_ids) == 3 + assert len(u_root_ids) == 3, u_root_ids # ------------------------------------------------------------------ - - print(f"\n\nITERATION {i}/10") - print("\n\nMERGE 2 & 3\n\n") - + print(f"\n\nITERATION {i}/10 - MERGE 2 & 3") new_roots = cg.add_edges( "Jane Doe", [to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1)], affinities=0.9, ).new_root_ids - assert len(new_roots) == 1 - - root_ids = [] - for child_id in child_ids: - root_ids.append(cg.get_root(child_id)) + assert len(new_roots) == 1, new_roots + root_ids = cg.get_roots(child_ids, assert_roots=True) + print(child_ids) + print(list(root_ids)) u_root_ids = np.unique(root_ids) - assert len(u_root_ids) == 2 + assert len(u_root_ids) == 2, u_root_ids # for root_id in root_ids: # cross_edge_dict_layers = graph_tests.root_cross_edge_test( @@ -1575,7 +1557,7 @@ def test_cut_regular_link(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1614,7 +1596,7 @@ def test_cut_regular_link(self, gen_graph): disallow_isolating_cut=True, ).new_root_ids - # Check New State + # verify new state assert len(new_root_ids) == 2 assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( to_label(cg, 1, 1, 0, 0, 0) @@ -1646,7 +1628,7 @@ def test_cut_no_link(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1707,7 +1689,7 @@ def test_cut_old_link(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = 
datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1775,7 +1757,7 @@ def test_cut_indivisible_link(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1837,7 +1819,7 @@ def test_mincut_disrespects_sources_or_sinks(self, gen_graph): """ cg = gen_graph(n_layers=2) - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[ @@ -1877,13 +1859,11 @@ def test_path_augmented_multicut(self, sv_data): edges = Edges( sv_edges[:, 0], sv_edges[:, 1], affinities=sv_affinity, areas=sv_area ) - cut_edges_aug = run_multicut(edges, sv_sources, sv_sinks, path_augment=True) assert cut_edges_aug.shape[0] == 350 with pytest.raises(exceptions.PreconditionError): run_multicut(edges, sv_sources, sv_sinks, path_augment=False) - pass class TestGraphHistory: @@ -1901,20 +1881,14 @@ def test_cut_merge_history(self, gen_graph): (1) Split 1 and 2 (2) Merge 1 and 2 """ - from ..graph.lineage import lineage_graph - - cg = gen_graph(n_layers=3) - - # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + cg: ChunkedGraph = gen_graph(n_layers=3) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 0.5)], timestamp=fake_timestamp, ) - - # Preparation: Build Chunk B create_chunk( cg, vertices=[to_label(cg, 1, 1, 0, 0, 0)], @@ -1932,7 +1906,7 @@ def test_cut_merge_history(self, gen_graph): first_root = cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) assert first_root == cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) - timestamp_before_split = datetime.utcnow() + 
timestamp_before_split = datetime.now(UTC) split_roots = cg.remove_edges( "Jane Doe", source_ids=to_label(cg, 1, 0, 0, 0, 0), @@ -1945,7 +1919,7 @@ def test_cut_merge_history(self, gen_graph): g = lineage_graph(cg, split_roots) assert g.size() == 2 - timestamp_after_split = datetime.utcnow() + timestamp_after_split = datetime.now(UTC) merge_roots = cg.add_edges( "Jane Doe", [to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0)], @@ -1953,7 +1927,7 @@ def test_cut_merge_history(self, gen_graph): ).new_root_ids assert len(merge_roots) == 1 merge_root = merge_roots[0] - timestamp_after_merge = datetime.utcnow() + timestamp_after_merge = datetime.now(UTC) g = lineage_graph(cg, merge_roots) assert g.size() == 4 @@ -2047,7 +2021,7 @@ def test_lock_unlock(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2113,7 +2087,7 @@ def test_lock_expiration(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2181,7 +2155,7 @@ def test_lock_renew(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2233,7 +2207,7 @@ def test_lock_merge_lock_old_id(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 
0, 0, 0, 2)], @@ -2299,7 +2273,7 @@ def test_indefinite_lock(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2372,7 +2346,7 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2451,7 +2425,7 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): # cg = gen_graph(n_layers=3) # # Preparation: Build Chunk A - # fake_timestamp = datetime.utcnow() - timedelta(days=10) + # fake_timestamp = datetime.now(UTC) - timedelta(days=10) # create_chunk( # cg, # vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2467,7 +2441,7 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): # timestamp=fake_timestamp, # ) - # add_layer( + # add_parent_chunk( # cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, # ) @@ -2491,1054 +2465,951 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): # )[0] -# class MockChunkedGraph: -# """ -# Dummy class to mock partial functionality of the ChunkedGraph for use in unit tests. -# Feel free to add more functions as need be. Can pass in alternative member functions into constructor. 
-# """ - -# def __init__( -# self, get_chunk_coordinates=None, get_chunk_layer=None, get_chunk_id=None -# ): -# if get_chunk_coordinates is not None: -# self.get_chunk_coordinates = get_chunk_coordinates -# if get_chunk_layer is not None: -# self.get_chunk_layer = get_chunk_layer -# if get_chunk_id is not None: -# self.get_chunk_id = get_chunk_id - -# def get_chunk_coordinates(self, chunk_id): # pylint: disable=method-hidden -# return np.array([0, 0, 0]) - -# def get_chunk_layer(self, chunk_id): # pylint: disable=method-hidden -# return 2 - -# def get_chunk_id(self, *args): # pylint: disable=method-hidden -# return 0 - - -# class TestGraphSplit: -# @pytest.mark.timeout(30) -# def test_split_pair_same_chunk(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (same chunk) -# Expected: Different (new) parents for RG 1 and 2 on Layer two -# ┌─────┐ ┌─────┐ -# │ A¹ │ │ A¹ │ -# │ 1━2 │ => │ 1 2 │ -# │ │ │ │ -# └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5)], -# timestamp=fake_timestamp, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 1), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_root_ids) == 2 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( -# to_label(cg, 1, 0, 0, 0, 1) -# ) -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 0, 0, 0, 1))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 1) in leaves - 
-# # Check Old State still accessible -# assert cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) == cg.get_root(to_label(cg, 1, 0, 0, 0, 1), time_stamp=fake_timestamp) -# leaves = np.unique( -# cg.get_subgraph( -# [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], -# leaves_only=True, -# ) -# ) -# assert len(leaves) == 2 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 0, 0, 0, 1) in leaves - -# # assert len(cg.get_latest_roots()) == 2 -# # assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# def test_split_nonexisting_edge(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (same chunk) -# Expected: Different (new) parents for RG 1 and 2 on Layer two -# ┌─────┐ ┌─────┐ -# │ A¹ │ │ A¹ │ -# │ 1━2 │ => │ 1━2 │ -# │ | │ │ | │ -# │ 3 │ │ 3 │ -# └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 0, 0, 0, 2), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# ], -# timestamp=fake_timestamp, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 2), -# mincut=False, -# ).new_root_ids - -# assert len(new_root_ids) == 1 - -# @pytest.mark.timeout(30) -# def test_split_pair_neighboring_chunks(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) -# ┌─────┬─────┐ ┌─────┬─────┐ -# │ A¹ │ B¹ │ │ A¹ │ B¹ │ -# │ 1━━┿━━2 │ => │ 1 │ 2 │ -# │ │ │ │ │ │ -# └─────┴─────┘ └─────┴─────┘ -# """ - -# cg = gen_graph(n_layers=3) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 
0, 0)], -# edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 1.0)], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0)], -# edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 1.0)], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_root_ids) == 2 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( -# to_label(cg, 1, 1, 0, 0, 0) -# ) -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 1, 0, 0, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 1, 0, 0, 0) in leaves - -# # Check Old State still accessible -# assert cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) == cg.get_root(to_label(cg, 1, 1, 0, 0, 0), time_stamp=fake_timestamp) -# leaves = np.unique( -# cg.get_subgraph( -# [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], -# leaves_only=True, -# ) -# ) -# assert len(leaves) == 2 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 1, 0, 0, 0) in leaves - -# assert len(cg.get_latest_roots()) == 2 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_verify_cross_chunk_edges(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) -# ┌─────┬─────┬─────┐ ┌─────┬─────┬─────┐ -# | │ A¹ │ B¹ │ | │ A¹ │ B¹ │ -# | │ 1━━┿━━3 │ => | │ 1━━┿━━3 │ -# | │ | │ │ | │ │ │ -# | 
│ 2 │ │ | │ 2 │ │ -# └─────┴─────┴─────┘ └─────┴─────┴─────┘ -# """ - -# cg = gen_graph(n_layers=4) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 0), inf), -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1), 0.5), -# ], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 2, 0, 0, 0)], -# edges=[(to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf)], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 4, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 1, 0, 0, 1) -# ) -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 2, 0, 0, 0) -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 1, 0, 0, 1), -# mincut=False, -# ).new_root_ids - -# assert len(new_root_ids) == 2 - -# svs2 = cg.get_subgraph([new_root_ids[0]], leaves_only=True) -# svs1 = cg.get_subgraph([new_root_ids[1]], leaves_only=True) -# len_set = {1, 2} -# assert len(svs1) in len_set -# len_set.remove(len(svs1)) -# assert len(svs2) in len_set - -# # Check New State -# assert len(new_root_ids) == 2 -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) != cg.get_root( -# to_label(cg, 1, 1, 0, 0, 1) -# ) -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 2, 0, 0, 0) -# ) - -# cc_dict = cg.get_atomic_cross_edges( -# cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) -# ) -# 
assert len(cc_dict[3]) == 1 -# assert cc_dict[3][0][0] == to_label(cg, 1, 1, 0, 0, 0) -# assert cc_dict[3][0][1] == to_label(cg, 1, 2, 0, 0, 0) - -# assert len(cg.get_latest_roots()) == 2 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_verify_loop(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) -# ┌─────┬────────┬─────┐ ┌─────┬────────┬─────┐ -# | │ A¹ │ B¹ │ | │ A¹ │ B¹ │ -# | │ 4━━1━━┿━━5 │ => | │ 4 1━━┿━━5 │ -# | │ / │ | │ | │ │ | │ -# | │ 3 2━━┿━━6 │ | │ 3 2━━┿━━6 │ -# └─────┴────────┴─────┘ └─────┴────────┴─────┘ -# """ - -# cg = gen_graph(n_layers=4) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[ -# to_label(cg, 1, 1, 0, 0, 0), -# to_label(cg, 1, 1, 0, 0, 1), -# to_label(cg, 1, 1, 0, 0, 2), -# to_label(cg, 1, 1, 0, 0, 3), -# ], -# edges=[ -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 0), inf), -# (to_label(cg, 1, 1, 0, 0, 1), to_label(cg, 1, 2, 0, 0, 1), inf), -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 2), 0.5), -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 3), 0.5), -# ], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf), -# (to_label(cg, 1, 2, 0, 0, 1), to_label(cg, 1, 1, 0, 0, 1), inf), -# (to_label(cg, 1, 2, 0, 0, 1), to_label(cg, 1, 2, 0, 0, 0), 0.5), -# ], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 4, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == 
cg.get_root( -# to_label(cg, 1, 1, 0, 0, 1) -# ) -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 2, 0, 0, 0) -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 1, 0, 0, 2), -# mincut=False, -# ).new_root_ids - -# assert len(new_root_ids) == 2 - -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 1, 0, 0, 3), -# mincut=False, -# ).new_root_ids - -# assert len(new_root_ids) == 2 - -# cc_dict = cg.get_atomic_cross_edges( -# cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) -# ) -# assert len(cc_dict[3]) == 1 -# cc_dict = cg.get_atomic_cross_edges( -# cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) -# ) -# assert len(cc_dict[3]) == 1 - -# assert len(cg.get_latest_roots()) == 3 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_pair_disconnected_chunks(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (disconnected chunks) -# ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ -# │ A¹ │ ... │ Z¹ │ │ A¹ │ ... 
│ Z¹ │ -# │ 1━━┿━━━━━┿━━2 │ => │ 1 │ │ 2 │ -# │ │ │ │ │ │ │ │ -# └─────┘ └─────┘ └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=9) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0)], -# edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 7, 7, 7, 0), 1.0,)], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk Z -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 7, 7, 7, 0)], -# edges=[(to_label(cg, 1, 7, 7, 7, 0), to_label(cg, 1, 0, 0, 0, 0), 1.0,)], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 4, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 4, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 5, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 5, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 6, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 6, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 7, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 7, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 8, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 8, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 9, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# # Split -# new_roots = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 7, 7, 7, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), 
-# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_roots) == 2 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( -# to_label(cg, 1, 7, 7, 7, 0) -# ) -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 7, 7, 7, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 7, 7, 7, 0) in leaves - -# # Check Old State still accessible -# assert cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) == cg.get_root(to_label(cg, 1, 7, 7, 7, 0), time_stamp=fake_timestamp) -# leaves = np.unique( -# cg.get_subgraph( -# [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], -# leaves_only=True, -# ) -# ) -# assert len(leaves) == 2 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 7, 7, 7, 0) in leaves - -# @pytest.mark.timeout(30) -# def test_split_pair_already_disconnected(self, gen_graph): -# """ -# Try to remove edge between already disconnected RG supervoxels 1 and 2 (same chunk). 
-# Expected: No change, no error -# ┌─────┐ ┌─────┐ -# │ A¹ │ │ A¹ │ -# │ 1 2 │ => │ 1 2 │ -# │ │ │ │ -# └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[], -# timestamp=fake_timestamp, -# ) - -# res_old = cg.client._table.read_rows() -# res_old.consume_all() - -# # Split -# with pytest.raises(exceptions.PreconditionError): -# cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 1), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ) - -# res_new = cg.client._table.read_rows() -# res_new.consume_all() - -# # Check -# if res_old.rows != res_new.rows: -# warn( -# "Rows were modified when splitting a pair of already disconnected supervoxels. " -# "While probably not an error, it is an unnecessary operation." -# ) - -# @pytest.mark.timeout(30) -# def test_split_full_circle_to_triple_chain_same_chunk(self, gen_graph): -# """ -# Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (same chunk) -# ┌─────┐ ┌─────┐ -# │ A¹ │ │ A¹ │ -# │ 1━2 │ => │ 1 2 │ -# │ ┗3┛ │ │ ┗3┛ │ -# └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[ -# to_label(cg, 1, 0, 0, 0, 0), -# to_label(cg, 1, 0, 0, 0, 1), -# to_label(cg, 1, 0, 0, 0, 2), -# ], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 2), 0.5), -# (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2), 0.5), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.3), -# ], -# timestamp=fake_timestamp, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 1), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# 
assert len(new_root_ids) == 1 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 2)) == new_root_ids[0] -# leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) -# assert len(leaves) == 3 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 0, 0, 0, 1) in leaves -# assert to_label(cg, 1, 0, 0, 0, 2) in leaves - -# # Check Old State still accessible -# old_root_id = cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) -# assert new_root_ids[0] != old_root_id - -# # assert len(cg.get_latest_roots()) == 1 -# # assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_full_circle_to_triple_chain_neighboring_chunks(self, gen_graph): -# """ -# Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (neighboring chunks) -# ┌─────┬─────┐ ┌─────┬─────┐ -# │ A¹ │ B¹ │ │ A¹ │ B¹ │ -# │ 1━━┿━━2 │ => │ 1 │ 2 │ -# │ ┗3━┿━━┛ │ │ ┗3━┿━━┛ │ -# └─────┴─────┘ └─────┴─────┘ -# """ - -# cg = gen_graph(n_layers=3) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 1, 0, 0, 0), 0.5), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 0.3), -# ], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0)], -# edges=[ -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 0.3), -# ], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# 
) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_root_ids) == 1 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == new_root_ids[0] -# leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) -# assert len(leaves) == 3 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 0, 0, 0, 1) in leaves -# assert to_label(cg, 1, 1, 0, 0, 0) in leaves - -# # Check Old State still accessible -# old_root_id = cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) -# assert new_root_ids[0] != old_root_id - -# assert len(cg.get_latest_roots()) == 1 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_full_circle_to_triple_chain_disconnected_chunks(self, gen_graph): -# """ -# Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (disconnected chunks) -# ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ -# │ A¹ │ ... │ Z¹ │ │ A¹ │ ... 
│ Z¹ │ -# │ 1━━┿━━━━━┿━━2 │ => │ 1 │ │ 2 │ -# │ ┗3━┿━━━━━┿━━┛ │ │ ┗3━┿━━━━━┿━━┛ │ -# └─────┘ └─────┘ └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=9) - -# loc = 2 - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, loc, loc, loc, 0), 0.5,), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, loc, loc, loc, 0), 0.3,), -# ], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk Z -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, loc, loc, loc, 0)], -# edges=[ -# (to_label(cg, 1, loc, loc, loc, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5,), -# (to_label(cg, 1, loc, loc, loc, 0), to_label(cg, 1, 0, 0, 0, 0), 0.3,), -# ], -# timestamp=fake_timestamp, -# ) - -# for i_layer in range(3, 10): -# if loc // 2 ** (i_layer - 3) == 1: -# add_layer( -# cg, -# i_layer, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# elif loc // 2 ** (i_layer - 3) == 0: -# add_layer( -# cg, -# i_layer, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# else: -# add_layer( -# cg, -# i_layer, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# i_layer, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# assert ( -# cg.get_root(to_label(cg, 1, loc, loc, loc, 0)) -# == cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) -# == cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, loc, loc, loc, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_root_ids) == 1 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == 
new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, loc, loc, loc, 0)) == new_root_ids[0] -# leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) -# assert len(leaves) == 3 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 0, 0, 0, 1) in leaves -# assert to_label(cg, 1, loc, loc, loc, 0) in leaves - -# # Check Old State still accessible -# old_root_id = cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) -# assert new_root_ids[0] != old_root_id - -# assert len(cg.get_latest_roots()) == 1 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_same_node(self, gen_graph): -# """ -# Try to remove (non-existing) edge between RG supervoxel 1 and itself -# ┌─────┐ -# │ A¹ │ -# │ 1 │ => Reject -# │ │ -# └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0)], -# edges=[], -# timestamp=fake_timestamp, -# ) - -# res_old = cg.client._table.read_rows() -# res_old.consume_all() - -# # Split -# with pytest.raises(exceptions.PreconditionError): -# cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ) - -# res_new = cg.client._table.read_rows() -# res_new.consume_all() - -# assert res_new.rows == res_old.rows - -# @pytest.mark.timeout(30) -# def test_split_pair_abstract_nodes(self, gen_graph): -# """ -# Try to remove (non-existing) edge between RG supervoxel 1 and abstract node "2" -# ┌─────┐ -# │ B² │ -# │ "2" │ -# │ │ -# └─────┘ -# ┌─────┐ => Reject -# │ A¹ │ -# │ 1 │ -# │ │ -# └─────┘ -# """ - -# cg = gen_graph(n_layers=3) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0)], -# edges=[], -# timestamp=fake_timestamp, -# ) - -# 
# Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0)], -# edges=[], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# res_old = cg.client._table.read_rows() -# res_old.consume_all() - -# # Split -# with pytest.raises(exceptions.PreconditionError): -# cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 0), -# sink_ids=to_label(cg, 2, 1, 0, 0, 1), -# mincut=False, -# ) - -# res_new = cg.client._table.read_rows() -# res_new.consume_all() - -# assert res_new.rows == res_old.rows - -# @pytest.mark.timeout(30) -# def test_diagonal_connections(self, gen_graph): -# """ -# Create graph with edge between RG supervoxels 1 and 2 (same chunk) -# and edge between RG supervoxels 1 and 3 (neighboring chunks) -# ┌─────┬─────┐ -# │ A¹ │ B¹ │ -# │ 2━1━┿━━3 │ -# │ / │ │ -# ┌─────┬─────┐ -# │ | │ │ -# │ 4━━┿━━5 │ -# │ C¹ │ D¹ │ -# └─────┴─────┘ -# """ - -# cg = gen_graph(n_layers=3) - -# # Chunk A -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf), -# ], -# ) - -# # Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0)], -# edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf)], -# ) - -# # Chunk C -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 1, 0, 0)], -# edges=[ -# (to_label(cg, 1, 0, 1, 0, 0), to_label(cg, 1, 1, 1, 0, 0), inf), -# (to_label(cg, 1, 0, 1, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf), -# ], -# ) - -# # Chunk D -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 1, 0, 0)], -# edges=[(to_label(cg, 1, 1, 1, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf)], -# ) - -# add_layer( -# cg, 3, [0, 0, 0], n_threads=1, -# ) - -# rr = 
cg.range_read_chunk(chunk_id=cg.get_chunk_id(layer=3, x=0, y=0, z=0)) -# root_ids_t0 = list(rr.keys()) - -# assert len(root_ids_t0) == 1 - -# child_ids = [] -# for root_id in root_ids_t0: -# child_ids.extend([cg.get_subgraph([root_id])], leaves_only=True) - -# new_roots = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 1), -# mincut=False, -# ).new_root_ids - -# assert len(new_roots) == 2 -# assert cg.get_root(to_label(cg, 1, 1, 1, 0, 0)) == cg.get_root( -# to_label(cg, 1, 0, 1, 0, 0) -# ) -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0) -# ) +class TestGraphSplit: + @pytest.mark.timeout(30) + def test_split_pair_same_chunk(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (same chunk) + Expected: Different (new) parents for RG 1 and 2 on Layer two + ┌─────┐ ┌─────┐ + │ A¹ │ │ A¹ │ + │ 1━2 │ => │ 1 2 │ + │ │ │ │ + └─────┘ └─────┘ + """ + + cg: ChunkedGraph = gen_graph(n_layers=2) + + # Preparation: Build Chunk A + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5)], + timestamp=fake_timestamp, + ) + + # Split + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 1), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ).new_root_ids + + # verify new state + assert len(new_root_ids) == 2 + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( + to_label(cg, 1, 0, 0, 0, 1) + ) + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True + ) + ) + assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 1))], leaves_only=True + ) + ) + assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 
0, 1) in leaves + + # verify old state + cg.cache = None + assert cg.get_root( + to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + ) == cg.get_root(to_label(cg, 1, 0, 0, 0, 1), time_stamp=fake_timestamp) + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], + leaves_only=True, + ) + ) + assert len(leaves) == 2 + assert to_label(cg, 1, 0, 0, 0, 0) in leaves + assert to_label(cg, 1, 0, 0, 0, 1) in leaves + + assert len(get_latest_roots(cg)) == 2 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + def test_split_nonexisting_edge(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (same chunk) + Expected: Different (new) parents for RG 1 and 2 on Layer two + ┌─────┐ ┌─────┐ + │ A¹ │ │ A¹ │ + │ 1━2 │ => │ 1━2 │ + │ | │ │ | │ + │ 3 │ │ 3 │ + └─────┘ └─────┘ + """ + cg = gen_graph(n_layers=2) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + (to_label(cg, 1, 0, 0, 0, 2), to_label(cg, 1, 0, 0, 0, 1), 0.5), + ], + timestamp=fake_timestamp, + ) + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 2), + mincut=False, + ).new_root_ids + assert len(new_root_ids) == 1 + + @pytest.mark.timeout(30) + def test_split_pair_neighboring_chunks(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) + ┌─────┬─────┐ ┌─────┬─────┐ + │ A¹ │ B¹ │ │ A¹ │ B¹ │ + │ 1━━┿━━2 │ => │ 1 │ 2 │ + │ │ │ │ │ │ + └─────┴─────┘ └─────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=3) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0)], + edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 1.0)], + timestamp=fake_timestamp, + ) 
+ create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0)], + edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 1.0)], + timestamp=fake_timestamp, + ) + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ).new_root_ids + + # verify new state + assert len(new_root_ids) == 2 + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( + to_label(cg, 1, 1, 0, 0, 0) + ) + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True + ) + ) + assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 1, 0, 0, 0))], leaves_only=True + ) + ) + assert len(leaves) == 1 and to_label(cg, 1, 1, 0, 0, 0) in leaves + + # verify old state + assert cg.get_root( + to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + ) == cg.get_root(to_label(cg, 1, 1, 0, 0, 0), time_stamp=fake_timestamp) + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], + leaves_only=True, + ) + ) + assert len(leaves) == 2 + assert to_label(cg, 1, 0, 0, 0, 0) in leaves + assert to_label(cg, 1, 1, 0, 0, 0) in leaves + assert len(get_latest_roots(cg)) == 2 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + @pytest.mark.timeout(30) + def test_split_verify_cross_chunk_edges(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) + ┌─────┬─────┬─────┐ ┌─────┬─────┬─────┐ + | │ A¹ │ B¹ │ | │ A¹ │ B¹ │ + | │ 1━━┿━━3 │ => | │ 1━━┿━━3 │ + | │ | │ │ | │ │ │ + | │ 2 │ │ | │ 2 │ │ + └─────┴─────┴─────┘ └─────┴─────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=4) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 
0, 0), to_label(cg, 1, 1, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 0), inf), + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1), 0.5), + ], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 2, 0, 0, 0)], + edges=[(to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf)], + timestamp=fake_timestamp, + ) + + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + add_parent_chunk( + cg, + 3, + [1, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + add_parent_chunk( + cg, + 4, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 1, 0, 0, 1) + ) + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 2, 0, 0, 0) + ) + + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 1, 0, 0, 1), + mincut=False, + ).new_root_ids + + assert len(new_root_ids) == 2 + + svs2 = cg.get_subgraph([new_root_ids[0]], leaves_only=True) + svs1 = cg.get_subgraph([new_root_ids[1]], leaves_only=True) + len_set = {1, 2} + assert len(svs1) in len_set + len_set.remove(len(svs1)) + assert len(svs2) in len_set + + # verify new state + assert len(new_root_ids) == 2 + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) != cg.get_root( + to_label(cg, 1, 1, 0, 0, 1) + ) + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 2, 0, 0, 0) + ) + + # l2id = cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) + # cce = cg.get_atomic_cross_edges([l2id])[l2id] + # assert len(cce[3]) == 1 + # assert cce[3][0][0] == to_label(cg, 1, 1, 0, 0, 0) + # assert cce[3][0][1] == to_label(cg, 1, 2, 0, 0, 0) + + assert len(get_latest_roots(cg)) == 2 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + @pytest.mark.timeout(30) + def test_split_verify_loop(self, gen_graph): + """ + Remove edge 
between existing RG supervoxels 1 and 2 (neighboring chunks) + ┌─────┬────────┬─────┐ ┌─────┬────────┬─────┐ + | │ A¹ │ B¹ │ | │ A¹ │ B¹ │ + | │ 4━━1━━┿━━5 │ => | │ 4 1━━┿━━5 │ + | │ / │ | │ | │ │ | │ + | │ 3 2━━┿━━6 │ | │ 3 2━━┿━━6 │ + └─────┴────────┴─────┘ └─────┴────────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=4) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[ + to_label(cg, 1, 1, 0, 0, 0), + to_label(cg, 1, 1, 0, 0, 1), + to_label(cg, 1, 1, 0, 0, 2), + to_label(cg, 1, 1, 0, 0, 3), + ], + edges=[ + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 0), inf), + (to_label(cg, 1, 1, 0, 0, 1), to_label(cg, 1, 2, 0, 0, 1), inf), + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 2), 0.5), + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 3), 0.5), + ], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf), + (to_label(cg, 1, 2, 0, 0, 1), to_label(cg, 1, 1, 0, 0, 1), inf), + (to_label(cg, 1, 2, 0, 0, 1), to_label(cg, 1, 2, 0, 0, 0), 0.5), + ], + timestamp=fake_timestamp, + ) + + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + add_parent_chunk( + cg, + 3, + [1, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + add_parent_chunk( + cg, + 4, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 1, 0, 0, 1) + ) + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 2, 0, 0, 0) + ) + + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 1, 0, 0, 2), + mincut=False, + ).new_root_ids + assert len(new_root_ids) == 2 + + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + 
sink_ids=to_label(cg, 1, 1, 0, 0, 3), + mincut=False, + ).new_root_ids + assert len(new_root_ids) == 2 + + # l2id = cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) + # cce = cg.get_atomic_cross_edges([l2id]) + # assert len(cce[3]) == 1 + + assert len(get_latest_roots(cg)) == 3 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + # @pytest.mark.timeout(30) + # def test_split_pair_disconnected_chunks(self, gen_graph): + # """ + # Remove edge between existing RG supervoxels 1 and 2 (disconnected chunks) + # ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ + # │ A¹ │ ... │ Z¹ │ │ A¹ │ ... │ Z¹ │ + # │ 1━━┿━━━━━┿━━2 │ => │ 1 │ │ 2 │ + # │ │ │ │ │ │ │ │ + # └─────┘ └─────┘ └─────┘ └─────┘ + # """ + # cg: ChunkedGraph = gen_graph(n_layers=9) + # fake_timestamp = datetime.now(UTC) - timedelta(days=10) + # create_chunk( + # cg, + # vertices=[to_label(cg, 1, 0, 0, 0, 0)], + # edges=[ + # ( + # to_label(cg, 1, 0, 0, 0, 0), + # to_label(cg, 1, 7, 7, 7, 0), + # 1.0, + # ) + # ], + # timestamp=fake_timestamp, + # ) + # create_chunk( + # cg, + # vertices=[to_label(cg, 1, 7, 7, 7, 0)], + # edges=[ + # ( + # to_label(cg, 1, 7, 7, 7, 0), + # to_label(cg, 1, 0, 0, 0, 0), + # 1.0, + # ) + # ], + # timestamp=fake_timestamp, + # ) + + # add_parent_chunk( + # cg, + # 3, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 3, + # [1, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 4, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 4, + # [1, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 5, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 5, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 6, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 6, + # 
[0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 7, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 7, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 8, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 8, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 9, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + + # new_roots = cg.remove_edges( + # "Jane Doe", + # source_ids=to_label(cg, 1, 7, 7, 7, 0), + # sink_ids=to_label(cg, 1, 0, 0, 0, 0), + # mincut=False, + # ).new_root_ids + + # # verify new state + # assert len(new_roots) == 2 + # assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( + # to_label(cg, 1, 7, 7, 7, 0) + # ) + # leaves = np.unique( + # cg.get_subgraph( + # [cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True + # ) + # ) + # assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves + # leaves = np.unique( + # cg.get_subgraph( + # [cg.get_root(to_label(cg, 1, 7, 7, 7, 0))], leaves_only=True + # ) + # ) + # assert len(leaves) == 1 and to_label(cg, 1, 7, 7, 7, 0) in leaves + + # # verify old state + # assert cg.get_root( + # to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + # ) == cg.get_root(to_label(cg, 1, 7, 7, 7, 0), time_stamp=fake_timestamp) + # leaves = np.unique( + # cg.get_subgraph( + # [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], + # leaves_only=True, + # ) + # ) + # assert len(leaves) == 2 + # assert to_label(cg, 1, 0, 0, 0, 0) in leaves + # assert to_label(cg, 1, 7, 7, 7, 0) in leaves + + @pytest.mark.timeout(30) + def test_split_pair_already_disconnected(self, gen_graph): + """ + Try to remove edge between already disconnected RG supervoxels 1 and 2 (same chunk). 
+ Expected: No change, no error + ┌─────┐ ┌─────┐ + │ A¹ │ │ A¹ │ + │ 1 2 │ => │ 1 2 │ + │ │ │ │ + └─────┘ └─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=2) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[], + timestamp=fake_timestamp, + ) + res_old = cg.client._table.read_rows() + res_old.consume_all() + + with pytest.raises(exceptions.PreconditionError): + cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 1), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ) + + res_new = cg.client._table.read_rows() + res_new.consume_all() + + if res_old.rows != res_new.rows: + warn( + "Rows were modified when splitting a pair of already disconnected supervoxels." + "While probably not an error, it is an unnecessary operation." + ) + + @pytest.mark.timeout(30) + def test_split_full_circle_to_triple_chain_same_chunk(self, gen_graph): + """ + Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (same chunk) + ┌─────┐ ┌─────┐ + │ A¹ │ │ A¹ │ + │ 1━2 │ => │ 1 2 │ + │ ┗3┛ │ │ ┗3┛ │ + └─────┘ └─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=2) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[ + to_label(cg, 1, 0, 0, 0, 0), + to_label(cg, 1, 0, 0, 0, 1), + to_label(cg, 1, 0, 0, 0, 2), + ], + edges=[ + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 2), 0.5), + (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2), 0.5), + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.3), + ], + timestamp=fake_timestamp, + ) + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 1), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ).new_root_ids + + # verify new state + assert len(new_root_ids) == 1 + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == 
new_root_ids[0] + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 2)) == new_root_ids[0] + leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) + assert len(leaves) == 3 + assert to_label(cg, 1, 0, 0, 0, 0) in leaves + assert to_label(cg, 1, 0, 0, 0, 1) in leaves + assert to_label(cg, 1, 0, 0, 0, 2) in leaves + + # verify old state + old_root_id = cg.get_root( + to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + ) + assert new_root_ids[0] != old_root_id + assert len(get_latest_roots(cg)) == 1 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + @pytest.mark.timeout(30) + def test_split_full_circle_to_triple_chain_neighboring_chunks(self, gen_graph): + """ + Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (neighboring chunks) + ┌─────┬─────┐ ┌─────┬─────┐ + │ A¹ │ B¹ │ │ A¹ │ B¹ │ + │ 1━━┿━━2 │ => │ 1 │ 2 │ + │ ┗3━┿━━┛ │ │ ┗3━┿━━┛ │ + └─────┴─────┘ └─────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=3) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 1, 0, 0, 0), 0.5), + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 0.3), + ], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0)], + edges=[ + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 0.3), + ], + timestamp=fake_timestamp, + ) + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ).new_root_ids + + # verify new state + assert len(new_root_ids) == 1 + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == 
new_root_ids[0] + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == new_root_ids[0] + leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) + assert len(leaves) == 3 + assert to_label(cg, 1, 0, 0, 0, 0) in leaves + assert to_label(cg, 1, 0, 0, 0, 1) in leaves + assert to_label(cg, 1, 1, 0, 0, 0) in leaves + + # verify old state + old_root_id = cg.get_root( + to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + ) + assert new_root_ids[0] != old_root_id + assert len(get_latest_roots(cg)) == 1 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + # @pytest.mark.timeout(30) + # def test_split_full_circle_to_triple_chain_disconnected_chunks(self, gen_graph): + # """ + # Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (disconnected chunks) + # ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ + # │ A¹ │ ... │ Z¹ │ │ A¹ │ ... │ Z¹ │ + # │ 1━━┿━━━━━┿━━2 │ => │ 1 │ │ 2 │ + # │ ┗3━┿━━━━━┿━━┛ │ │ ┗3━┿━━━━━┿━━┛ │ + # └─────┘ └─────┘ └─────┘ └─────┘ + # """ + # cg: ChunkedGraph = gen_graph(n_layers=9) + # loc = 2 + # fake_timestamp = datetime.now(UTC) - timedelta(days=10) + # create_chunk( + # cg, + # vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + # edges=[ + # (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + # ( + # to_label(cg, 1, 0, 0, 0, 1), + # to_label(cg, 1, loc, loc, loc, 0), + # 0.5, + # ), + # ( + # to_label(cg, 1, 0, 0, 0, 0), + # to_label(cg, 1, loc, loc, loc, 0), + # 0.3, + # ), + # ], + # timestamp=fake_timestamp, + # ) + # create_chunk( + # cg, + # vertices=[to_label(cg, 1, loc, loc, loc, 0)], + # edges=[ + # ( + # to_label(cg, 1, loc, loc, loc, 0), + # to_label(cg, 1, 0, 0, 0, 1), + # 0.5, + # ), + # ( + # to_label(cg, 1, loc, loc, loc, 0), + # to_label(cg, 1, 0, 0, 0, 0), + # 0.3, + # ), + # ], + # timestamp=fake_timestamp, + # ) + # for i_layer in range(3, 10): + # if loc // 2 ** (i_layer - 3) == 1: + # 
add_parent_chunk( + # cg, + # i_layer, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # elif loc // 2 ** (i_layer - 3) == 0: + # add_parent_chunk( + # cg, + # i_layer, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # else: + # add_parent_chunk( + # cg, + # i_layer, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # i_layer, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + + # assert ( + # cg.get_root(to_label(cg, 1, loc, loc, loc, 0)) + # == cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) + # == cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) + # ) + # new_root_ids = cg.remove_edges( + # "Jane Doe", + # source_ids=to_label(cg, 1, loc, loc, loc, 0), + # sink_ids=to_label(cg, 1, 0, 0, 0, 0), + # mincut=False, + # ).new_root_ids + + # # verify new state + # assert len(new_root_ids) == 1 + # assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] + # assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] + # assert cg.get_root(to_label(cg, 1, loc, loc, loc, 0)) == new_root_ids[0] + # leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) + # assert len(leaves) == 3 + # assert to_label(cg, 1, 0, 0, 0, 0) in leaves + # assert to_label(cg, 1, 0, 0, 0, 1) in leaves + # assert to_label(cg, 1, loc, loc, loc, 0) in leaves + + # # verify old state + # old_root_id = cg.get_root( + # to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + # ) + # assert new_root_ids[0] != old_root_id + + # assert len(get_latest_roots(cg)) == 1 + # assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + @pytest.mark.timeout(30) + def test_split_same_node(self, gen_graph): + """ + Try to remove (non-existing) edge between RG supervoxel 1 and itself + ┌─────┐ + │ A¹ │ + │ 1 │ => Reject + │ │ + └─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=2) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + 
vertices=[to_label(cg, 1, 0, 0, 0, 0)], + edges=[], + timestamp=fake_timestamp, + ) + + res_old = cg.client._table.read_rows() + res_old.consume_all() + with pytest.raises(exceptions.PreconditionError): + cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ) + + res_new = cg.client._table.read_rows() + res_new.consume_all() + assert res_new.rows == res_old.rows + + @pytest.mark.timeout(30) + def test_split_pair_abstract_nodes(self, gen_graph): + """ + Try to remove (non-existing) edge between RG supervoxel 1 and abstract node "2" + ┌─────┐ + │ B² │ + │ "2" │ + │ │ + └─────┘ + ┌─────┐ => Reject + │ A¹ │ + │ 1 │ + │ │ + └─────┘ + """ + + cg: ChunkedGraph = gen_graph(n_layers=3) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0)], + edges=[], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0)], + edges=[], + timestamp=fake_timestamp, + ) + + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + res_old = cg.client._table.read_rows() + res_old.consume_all() + with pytest.raises((exceptions.PreconditionError, AssertionError)): + cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 0), + sink_ids=to_label(cg, 2, 1, 0, 0, 1), + mincut=False, + ) + + res_new = cg.client._table.read_rows() + res_new.consume_all() + assert res_new.rows == res_old.rows + + @pytest.mark.timeout(30) + def test_diagonal_connections(self, gen_graph): + """ + Create graph with edge between RG supervoxels 1 and 2 (same chunk) + and edge between RG supervoxels 1 and 3 (neighboring chunks) + ┌─────┬─────┐ + │ A¹ │ B¹ │ + │ 2━1━┿━━3 │ + │ / │ │ + ┌─────┬─────┐ + │ | │ │ + │ 4━━┿━━5 │ + │ C¹ │ D¹ │ + └─────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=3) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + 
edges=[ + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf), + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf), + ], + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0)], + edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf)], + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 1, 0, 0)], + edges=[ + (to_label(cg, 1, 0, 1, 0, 0), to_label(cg, 1, 1, 1, 0, 0), inf), + (to_label(cg, 1, 0, 1, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf), + ], + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 1, 0, 0)], + edges=[(to_label(cg, 1, 1, 1, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf)], + ) + add_parent_chunk( + cg, + 3, + [0, 0, 0], + n_threads=1, + ) + + rr = cg.range_read_chunk(chunk_id=cg.get_chunk_id(layer=3, x=0, y=0, z=0)) + root_ids_t0 = list(rr.keys()) + assert len(root_ids_t0) == 1 + + child_ids = [] + for root_id in root_ids_t0: + child_ids.extend([cg.get_subgraph([root_id], leaves_only=True)]) + + new_roots = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 1), + mincut=False, + ).new_root_ids + + assert len(new_roots) == 2 + assert cg.get_root(to_label(cg, 1, 1, 1, 0, 0)) == cg.get_root( + to_label(cg, 1, 0, 1, 0, 0) + ) + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 0, 0, 0, 0) + ) From db227e5ea3b78082a798b3293e69f80cad37eefc Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 10 Jun 2024 15:13:15 +0000 Subject: [PATCH 086/116] segregate update nodes logic --- .../graph/client/bigtable/client.py | 8 +++++- pychunkedgraph/ingest/upgrade/atomic_layer.py | 26 +++++++++++-------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 52ec9a856..9195fb397 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ 
b/pychunkedgraph/graph/client/bigtable/client.py @@ -151,6 +151,7 @@ def read_nodes( end_time=None, end_time_inclusive: bool = False, fake_edges: bool = False, + attr_keys: bool = True, ): """ Read nodes and their properties. @@ -186,8 +187,13 @@ def read_nodes( end_time_inclusive=end_time_inclusive, user_id=user_id, ) + if attr_keys: + return { + deserialize_uint64(row_key, fake_edges=fake_edges): data + for (row_key, data) in rows.items() + } return { - deserialize_uint64(row_key, fake_edges=fake_edges): data + deserialize_uint64(row_key, fake_edges=fake_edges): {k.key:v for k,v in data.items()} for (row_key, data) in rows.items() } diff --git a/pychunkedgraph/ingest/upgrade/atomic_layer.py b/pychunkedgraph/ingest/upgrade/atomic_layer.py index 96f7f71bd..6c4244968 100644 --- a/pychunkedgraph/ingest/upgrade/atomic_layer.py +++ b/pychunkedgraph/ingest/upgrade/atomic_layer.py @@ -79,17 +79,7 @@ def update_cross_edges(cg: ChunkedGraph, node, cx_edges_d, node_ts, end_ts) -> l return rows -def update_chunk(cg: ChunkedGraph, chunk_coords: list[int], layer: int = 2): - """ - Iterate over all L2 IDs in a chunk and update their cross chunk edges, - within the periods they were valid/active. 
- """ - x, y, z = chunk_coords - chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) - cg.copy_fake_edges(chunk_id) - rr = cg.range_read_chunk(chunk_id) - nodes = list(rr.keys()) - +def update_nodes(cg: ChunkedGraph, nodes) -> list: # get start_ts when node becomes valid nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) cx_edges_d = cg.get_atomic_cross_edges(nodes) @@ -116,4 +106,18 @@ def update_chunk(cg: ChunkedGraph, chunk_coords: list[int], layer: int = 2): # for each timestamp until end_ts, update cross chunk edges of node _rows = update_cross_edges(cg, node, node_cx_edges_d, start_ts, end_ts) rows.extend(_rows) + return rows + + +def update_chunk(cg: ChunkedGraph, chunk_coords: list[int], layer: int = 2): + """ + Iterate over all L2 IDs in a chunk and update their cross chunk edges, + within the periods they were valid/active. + """ + x, y, z = chunk_coords + chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) + cg.copy_fake_edges(chunk_id) + rr = cg.range_read_chunk(chunk_id) + nodes = list(rr.keys()) + rows = update_nodes(cg, nodes) cg.client.write(rows) From e96c1e5b5227346e00f78b33aa496191a0ef1d9f Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 28 Jun 2024 16:43:45 +0000 Subject: [PATCH 087/116] fix(edits): overwrite children partners when superseded by parents --- pychunkedgraph/debug/utils.py | 15 +++++++-- pychunkedgraph/graph/edits.py | 56 ++++--------------------------- pychunkedgraph/graph/operation.py | 3 ++ 3 files changed, 23 insertions(+), 51 deletions(-) diff --git a/pychunkedgraph/debug/utils.py b/pychunkedgraph/debug/utils.py index b1bdbc2be..130d85500 100644 --- a/pychunkedgraph/debug/utils.py +++ b/pychunkedgraph/debug/utils.py @@ -2,6 +2,8 @@ import numpy as np +from pychunkedgraph.graph.meta import ChunkedGraphMeta, GraphConfig + def print_attrs(d): for k, v in d.items(): @@ -41,14 +43,14 @@ def sanity_check(cg, new_roots, operation_id): """ Check for duplicates in hierarchy, useful for 
debugging. """ - print(f"{len(new_roots)} new ids from {operation_id}") + # print(f"{len(new_roots)} new ids from {operation_id}") l2c_d = {} for new_root in new_roots: l2c_d[new_root] = get_l2children(cg, new_root) success = True for k, v in l2c_d.items(): success = success and (len(v) == np.unique(v).size) - print(f"{k}: {np.unique(v).size}, {len(v)}") + # print(f"{k}: {np.unique(v).size}, {len(v)}") if not success: raise RuntimeError("Some ids are not valid.") @@ -58,3 +60,12 @@ def sanity_check_single(cg, node, operation_id): msg = f"invalid node {node}:" msg += f" found {len(v)} l2 ids, must be {np.unique(v).size}" assert np.unique(v).size == len(v), f"{msg}, from {operation_id}." + return v + + +def update_graph_id(cg, new_graph_id:str): + old_gc = cg.meta.graph_config._asdict() + old_gc["ID"] = new_graph_id + new_gc = GraphConfig(**old_gc) + new_meta = ChunkedGraphMeta(new_gc, cg.meta.data_source, cg.meta.custom_data) + cg.update_meta(new_meta, overwrite=True) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 807fff257..0778a1f82 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -420,9 +420,15 @@ def _update_neighbor_cross_edges_single( continue assert np.all(edges[:, 0] == counterpart) edges = fastremap.remap(edges, node_map, preserve_missing_labels=True) - if layer == counterpart_layer: + if layer == counterpart_layer and layer >= node_layer: reverse_edge = np.array([counterpart, new_id], dtype=basetypes.NODE_ID) edges = np.concatenate([edges, [reverse_edge]]) + children = cg.get_children(new_id) + mask = np.isin(edges[:, 1], children) + if np.any(mask): + masked_edges = edges[mask] + masked_edges[:, 1] = new_id + edges[mask] = masked_edges edges = np.unique(edges, axis=0) edges_d[layer] = edges val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges @@ -578,49 +584,6 @@ def _update_cross_edge_cache(self, parent, children): assert np.all(edges[:, 0] == parent), f"{parent}, 
{np.unique(edges[:, 0])}" self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d - def _update_neighbor_parents(self, neighbor, ceil_layer: int, updated: set) -> list: - """helper for `_update_skipped_neighbors`""" - parents = [] - while True: - parent = self.cg.get_parent(neighbor, time_stamp=self._last_successful_ts) - parent_layer = self.cg.get_chunk_layer(parent) - if parent_layer >= ceil_layer or parent in updated: - break - children = self.cg.get_children(parent) - self._update_cross_edge_cache(parent, children) - parents.append(parent) - neighbor = parent - return parents - - def _update_skipped_neighbors(self, node, layer, parent_layer): - """ - Updates cross edges of neighbors of a skip connection node. - Neighbors of such nodes can have parents at contiguous layers. - - This method updates cross edges of all such parents - from `layer` through `parent_layer`. - """ - updated_parents = set() - cx_edges_d = self.cg.cache.cross_chunk_edges_cache[node] - for _layer in range(layer, parent_layer + 1): - layer_edges = cx_edges_d.get(_layer, types.empty_2d) - neighbors = layer_edges[:, 1] - for n in neighbors: - if n in self._new_old_id_d: - # ignore new ids - continue - res = self._update_neighbor_parents(n, parent_layer, updated_parents) - updated_parents.update(res) - updated_entries = [] - for parent in updated_parents: - val_dict = {} - for _layer, edges in self.cg.cache.cross_chunk_edges_cache[parent].items(): - val_dict[attributes.Connectivity.CrossChunkEdge[_layer]] = edges - rkey = serialize_uint64(parent) - row = self.cg.client.mutate_row(rkey, val_dict, time_stamp=self._time_stamp) - updated_entries.append(row) - return updated_entries - def _create_new_parents(self, layer: int): """ keep track of old IDs @@ -635,7 +598,6 @@ def _create_new_parents(self, layer: int): layer_node_ids = self._get_layer_node_ids(new_ids, layer) components, graph_ids = self._get_connected_components(layer_node_ids, layer) for cc_indices in components: - 
update_skipped_neighbors = False parent_layer = layer + 1 # must be reset for each connected component cc_ids = graph_ids[cc_indices] if len(cc_ids) == 1: @@ -648,7 +610,6 @@ def _create_new_parents(self, layer: int): if len(cx_edges_d[cc_ids[0]].get(l, types.empty_2d)) > 0: parent_layer = l break - update_skipped_neighbors = cc_ids[0] in self._new_old_id_d parent = self.cg.id_client.create_node_id( self.cg.get_parent_chunk_id(cc_ids[0], parent_layer), root_chunk=parent_layer == self.cg.meta.layer_count, @@ -658,9 +619,6 @@ def _create_new_parents(self, layer: int): self.cg.cache.children_cache[parent] = cc_ids cache_utils.update(self.cg.cache.parents_cache, cc_ids, parent) sanity_check_single(self.cg, parent, self._operation_id) - if update_skipped_neighbors: - res = self._update_skipped_neighbors(cc_ids[0], layer, parent_layer) - self.new_entries.extend(res) def run(self) -> Iterable: """ diff --git a/pychunkedgraph/graph/operation.py b/pychunkedgraph/graph/operation.py index 39668565f..6d2f3c0bb 100644 --- a/pychunkedgraph/graph/operation.py +++ b/pychunkedgraph/graph/operation.py @@ -457,6 +457,9 @@ def execute( except PostconditionError as err: self.cg.cache = None raise PostconditionError(err) from err + except (AssertionError, RuntimeError) as err: + self.cg.cache = None + raise RuntimeError(err) from err except Exception as err: # unknown exception, update log record with error self.cg.cache = None From 55df0d6e8db51765b1fca98cc18c6ea89b153379 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 4 Jul 2024 15:54:48 +0000 Subject: [PATCH 088/116] fix: unique edges always, predecing edit ts, allow same segment merge --- pychunkedgraph/graph/edits.py | 6 +++--- pychunkedgraph/graph/operation.py | 18 +++++++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 0778a1f82..735ae65f8 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ 
-414,13 +414,13 @@ def _update_neighbor_cross_edges_single( for counterpart, edges_d in cp_cx_edges_d.items(): val_dict = {} counterpart_layer = counterpart_layers[counterpart] - for layer in range(2, cg.meta.layer_count): + for layer in range(node_layer, cg.meta.layer_count): edges = edges_d.get(layer, types.empty_2d) if edges.size == 0: continue assert np.all(edges[:, 0] == counterpart) edges = fastremap.remap(edges, node_map, preserve_missing_labels=True) - if layer == counterpart_layer and layer >= node_layer: + if layer == counterpart_layer: reverse_edge = np.array([counterpart, new_id], dtype=basetypes.NODE_ID) edges = np.concatenate([edges, [reverse_edge]]) children = cg.get_children(new_id) @@ -429,7 +429,7 @@ def _update_neighbor_cross_edges_single( masked_edges = edges[mask] masked_edges[:, 1] = new_id edges[mask] = masked_edges - edges = np.unique(edges, axis=0) + edges = np.unique(edges, axis=0) edges_d[layer] = edges val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges if not val_dict: diff --git a/pychunkedgraph/graph/operation.py b/pychunkedgraph/graph/operation.py index 6d2f3c0bb..1a221d236 100644 --- a/pychunkedgraph/graph/operation.py +++ b/pychunkedgraph/graph/operation.py @@ -615,13 +615,16 @@ def _apply( edges_only=True, ) - with TimeIt("preprocess", self.cg.graph_id, operation_id): - inactive_edges = edits.merge_preprocess( - self.cg, - subgraph_edges=edges, - supervoxels=self.added_edges.ravel(), - parent_ts=self.parent_ts, - ) + if self.allow_same_segment_merge: + inactive_edges = types.empty_2d + else: + with TimeIt("preprocess", self.cg.graph_id, operation_id): + inactive_edges = edits.merge_preprocess( + self.cg, + subgraph_edges=edges, + supervoxels=self.added_edges.ravel(), + parent_ts=self.parent_ts, + ) atomic_edges, fake_edge_rows = edits.check_fake_edges( self.cg, @@ -637,6 +640,7 @@ def _apply( operation_id=operation_id, time_stamp=timestamp, parent_ts=self.parent_ts, + 
allow_same_segment_merge=self.allow_same_segment_merge ) return new_roots, new_l2_ids, fake_edge_rows + new_entries From 8febe810e38f4365bfbc324381d183856e8ff1d6 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 4 Jul 2024 16:22:25 +0000 Subject: [PATCH 089/116] =?UTF-8?q?Bump=20version:=203.0.0=20=E2=86=92=203?= =?UTF-8?q?.0.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5583246c5..6526fbc66 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.0 +current_version = 3.0.1 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 528787cfc..055276878 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.0" +__version__ = "3.0.1" From 3df48be5781bf46ba5756256c70ec89b708c9b10 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 6 Jul 2024 17:38:44 +0000 Subject: [PATCH 090/116] fix(edits): mask all descendants when updating cx edges --- pychunkedgraph/graph/edits.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 735ae65f8..add0c9d0c 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -391,6 +391,25 @@ def _get_flipped_ids(id_map, node_ids): return np.concatenate(ids) +def _get_descendants(cg, new_id): + """get all descendants at layers >= 2""" + result = [] + children = cg.get_children(new_id) + while True: + mask = cg.get_chunk_layers(children) >= 2 + children = children[mask] + result.extend(children) + + mask = cg.get_chunk_layers(children) > 2 + children = children[mask] + if children.size == 0: + break + + children = cg.get_children(children, flatten=True) 
+ return result + + + def _update_neighbor_cross_edges_single( cg, new_id: int, cx_edges_d: dict, node_map: dict, *, parent_ts ) -> dict: @@ -423,8 +442,8 @@ def _update_neighbor_cross_edges_single( if layer == counterpart_layer: reverse_edge = np.array([counterpart, new_id], dtype=basetypes.NODE_ID) edges = np.concatenate([edges, [reverse_edge]]) - children = cg.get_children(new_id) - mask = np.isin(edges[:, 1], children) + descendants = _get_descendants(cg, new_id) + mask = np.isin(edges[:, 1], descendants) if np.any(mask): masked_edges = edges[mask] masked_edges[:, 1] = new_id From 05afaad6c87acaf0cc94fd09f909bd2cf42b2f61 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sat, 6 Jul 2024 17:39:16 +0000 Subject: [PATCH 091/116] =?UTF-8?q?Bump=20version:=203.0.1=20=E2=86=92=203?= =?UTF-8?q?.0.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 6526fbc66..62209053d 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.1 +current_version = 3.0.2 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 055276878..131942e76 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.0.2" From 50c94c4c9fed2240e77774214436b95c409e63cc Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 7 Jul 2024 01:25:34 +0000 Subject: [PATCH 092/116] fix(edits): use supervoxels to get the correct cross edge parents --- pychunkedgraph/graph/edits.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index add0c9d0c..4efead0c9 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -496,6 
+496,25 @@ def _update_neighbor_cross_edges( return updated_entries +def _get_supervoxels(cg, node_ids): + """Returns the first supervoxel found for each node_id.""" + result = {} + node_ids_copy = np.copy(node_ids) + children = np.copy(node_ids) + children_d = cg.get_children(node_ids) + while True: + children = [children_d[k][0] for k in children] + children = np.array(children, dtype=basetypes.NODE_ID) + mask = cg.get_chunk_layers(children) == 1 + result.update([(node, sv) for node, sv in zip(node_ids[mask], children[mask])]) + node_ids = node_ids[~mask] + children = children[~mask] + if children.size == 0: + break + children_d = cg.get_children(children) + return np.array([result[k] for k in node_ids_copy], dtype=basetypes.NODE_ID) + + class CreateParentNodes: def __init__( self, @@ -586,8 +605,9 @@ def _update_cross_edge_cache(self, parent, children): ) cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values()) edge_nodes = np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) + edge_supervoxels = _get_supervoxels(self.cg, edge_nodes) edge_parents = self.cg.get_roots( - edge_nodes, + edge_supervoxels, stop_layer=parent_layer, ceil=False, time_stamp=self._last_successful_ts, From a128a3a16b0c29581f957c8280fdcba8c1ab3b6d Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 7 Jul 2024 01:25:52 +0000 Subject: [PATCH 093/116] =?UTF-8?q?Bump=20version:=203.0.2=20=E2=86=92=203?= =?UTF-8?q?.0.3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 62209053d..f98e5ee64 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.2 +current_version = 3.0.3 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 131942e76..8d1c8625f 100644 --- 
a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.2" +__version__ = "3.0.3" From 1d5270ddccb6b97a4d27c2284fcb04065b31aa8e Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 16 Jul 2024 19:24:15 +0000 Subject: [PATCH 094/116] fix(edits/split): filter out inactive cross edges --- pychunkedgraph/graph/chunkedgraph.py | 15 ++++++++++++++- pychunkedgraph/graph/edits.py | 7 ++++--- pychunkedgraph/graph/operation.py | 10 +--------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 7d1a24cc3..1836094f0 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -657,7 +657,11 @@ def copy_fake_edges(self, chunk_id: np.uint64) -> None: self.client.write(mutations) def get_l2_agglomerations( - self, level2_ids: np.ndarray, edges_only: bool = False + self, + level2_ids: np.ndarray, + edges_only: bool = False, + active: bool = False, + time_stamp: typing.Optional[datetime.datetime] = None, ) -> typing.Tuple[typing.Dict[int, types.Agglomeration], typing.Tuple[Edges]]: """ Children of Level 2 Node IDs and edges. 
@@ -703,6 +707,15 @@ def get_l2_agglomerations( raise ValueError("Found conflicting parents.") sv_parent_d.update(dict(zip(svs.tolist(), [l2id] * len(svs)))) + if active: + n1, n2 = all_chunk_edges.node_ids1, all_chunk_edges.node_ids2 + layers = self.get_cross_chunk_edges_layer(all_chunk_edges.get_pairs()) + max_layer = np.max(layers) + 1 + parents1 = self.get_roots(n1, stop_layer=max_layer, time_stamp=time_stamp) + parents2 = self.get_roots(n2, stop_layer=max_layer, time_stamp=time_stamp) + mask = parents1 == parents2 + all_chunk_edges = all_chunk_edges[mask] + in_edges, out_edges, cross_edges = edge_utils.categorize_edges_v2( self.meta, all_chunk_edges, sv_parent_d ) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 4efead0c9..30e86951a 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -313,7 +313,6 @@ def remove_edges( cg, *, atomic_edges: Iterable[np.ndarray], - l2id_agglomeration_d: Dict, operation_id: basetypes.OPERATION_ID = None, time_stamp: datetime.datetime = None, parent_ts: datetime.datetime = None, @@ -323,6 +322,9 @@ def remove_edges( roots = cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts) assert np.unique(roots).size == 1, "L2 IDs must belong to same root." 
+ l2id_agglomeration_d, _ = cg.get_l2_agglomerations( + l2ids, active=True, time_stamp=parent_ts + ) new_old_id_d = defaultdict(set) old_new_id_d = defaultdict(set) old_hierarchy_d = _init_old_hierarchy(cg, l2ids, parent_ts=parent_ts) @@ -409,7 +411,6 @@ def _get_descendants(cg, new_id): return result - def _update_neighbor_cross_edges_single( cg, new_id: int, cx_edges_d: dict, node_map: dict, *, parent_ts ) -> dict: @@ -498,7 +499,7 @@ def _update_neighbor_cross_edges( def _get_supervoxels(cg, node_ids): """Returns the first supervoxel found for each node_id.""" - result = {} + result = {} node_ids_copy = np.copy(node_ids) children = np.copy(node_ids) children_d = cg.get_children(node_ids) diff --git a/pychunkedgraph/graph/operation.py b/pychunkedgraph/graph/operation.py index 1a221d236..8c5d4484e 100644 --- a/pychunkedgraph/graph/operation.py +++ b/pychunkedgraph/graph/operation.py @@ -640,7 +640,7 @@ def _apply( operation_id=operation_id, time_stamp=timestamp, parent_ts=self.parent_ts, - allow_same_segment_merge=self.allow_same_segment_merge + allow_same_segment_merge=self.allow_same_segment_merge, ) return new_roots, new_l2_ids, fake_edge_rows + new_entries @@ -751,18 +751,11 @@ def _apply( ): raise PreconditionError("Supervoxels must belong to the same object.") - with TimeIt("subgraph", self.cg.graph_id, operation_id): - l2id_agglomeration_d, _ = self.cg.get_l2_agglomerations( - self.cg.get_parents( - self.removed_edges.ravel(), time_stamp=self.parent_ts - ), - ) with TimeIt("remove_edges", self.cg.graph_id, operation_id): return edits.remove_edges( self.cg, operation_id=operation_id, atomic_edges=self.removed_edges, - l2id_agglomeration_d=l2id_agglomeration_d, time_stamp=timestamp, parent_ts=self.parent_ts, ) @@ -929,7 +922,6 @@ def _apply( self.cg, operation_id=operation_id, atomic_edges=self.removed_edges, - l2id_agglomeration_d=l2id_agglomeration_d, time_stamp=timestamp, parent_ts=self.parent_ts, ) From d35c44034753fe4c55bf3d9ebe11aa757d011f2c Mon Sep 17 
00:00:00 2001 From: Akhilesh Halageri Date: Wed, 17 Jul 2024 16:16:21 +0000 Subject: [PATCH 095/116] fix(edits/split): filter out inactive cross edges AT EACH LAYER --- pychunkedgraph/__init__.py | 2 +- pychunkedgraph/graph/chunkedgraph.py | 14 +++++--------- pychunkedgraph/graph/edges/utils.py | 22 +++++++++++++++++++++- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 8d1c8625f..528787cfc 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.3" +__version__ = "3.0.0" diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 1836094f0..7823695db 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -3,6 +3,8 @@ import time import typing import datetime +from itertools import chain +from functools import reduce import numpy as np from pychunkedgraph import __version__ @@ -667,8 +669,6 @@ def get_l2_agglomerations( Children of Level 2 Node IDs and edges. Edges are read from cloud storage. 
""" - from itertools import chain - from functools import reduce from .misc import get_agglomerations chunk_ids = np.unique(self.get_chunk_ids_from_node_ids(level2_ids)) @@ -708,13 +708,9 @@ def get_l2_agglomerations( sv_parent_d.update(dict(zip(svs.tolist(), [l2id] * len(svs)))) if active: - n1, n2 = all_chunk_edges.node_ids1, all_chunk_edges.node_ids2 - layers = self.get_cross_chunk_edges_layer(all_chunk_edges.get_pairs()) - max_layer = np.max(layers) + 1 - parents1 = self.get_roots(n1, stop_layer=max_layer, time_stamp=time_stamp) - parents2 = self.get_roots(n2, stop_layer=max_layer, time_stamp=time_stamp) - mask = parents1 == parents2 - all_chunk_edges = all_chunk_edges[mask] + all_chunk_edges = edge_utils.filter_inactive_cross_edges( + self, all_chunk_edges, time_stamp=time_stamp + ) in_edges, out_edges, cross_edges = edge_utils.categorize_edges_v2( self.meta, all_chunk_edges, sv_parent_d diff --git a/pychunkedgraph/graph/edges/utils.py b/pychunkedgraph/graph/edges/utils.py index cd0e85fe8..76f8ea1d8 100644 --- a/pychunkedgraph/graph/edges/utils.py +++ b/pychunkedgraph/graph/edges/utils.py @@ -9,6 +9,7 @@ from typing import Iterable from typing import Optional from collections import defaultdict +from functools import reduce import fastremap import numpy as np @@ -46,7 +47,9 @@ def concatenate_chunk_edges(chunk_edge_dicts: Iterable) -> Dict: return edges_dict -def concatenate_cross_edge_dicts(edges_ds: Iterable[Dict], unique: bool = False) -> Dict: +def concatenate_cross_edge_dicts( + edges_ds: Iterable[Dict], unique: bool = False +) -> Dict: """Combines cross chunk edge dicts of form {layer id : edge list}.""" result_d = defaultdict(list) for edges_d in edges_ds: @@ -182,3 +185,20 @@ def get_edges_status(cg, edges: Iterable, time_stamp: Optional[float] = None): active_status.extend(mask) active_status = np.array(active_status, dtype=bool) return existence_status, active_status + + +def filter_inactive_cross_edges( + cg, all_chunk_edges: Edges, time_stamp: 
Optional[float] = None +): + result = [] + layers = cg.get_cross_chunk_edges_layer(all_chunk_edges.get_pairs()) + for layer in np.unique(layers): + layer_mask = layers == layer + parent_layer = layer + 1 + layer_edges = all_chunk_edges[layer_mask] + n1, n2 = layer_edges.node_ids1, layer_edges.node_ids2 + parents1 = cg.get_roots(n1, stop_layer=parent_layer, time_stamp=time_stamp) + parents2 = cg.get_roots(n2, stop_layer=parent_layer, time_stamp=time_stamp) + mask = parents1 == parents2 + result.append(layer_edges[mask]) + return reduce(lambda x, y: x + y, result, Edges([], [])) From fb5178c55ed44ebead2bc4785186c0257493affd Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 30 Aug 2024 15:38:53 +0000 Subject: [PATCH 096/116] migration debug code --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- pychunkedgraph/graph/edits.py | 22 ++++++++++++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index f98e5ee64..6526fbc66 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.3 +current_version = 3.0.1 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 528787cfc..055276878 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.0" +__version__ = "3.0.1" diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 30e86951a..340cefadd 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -313,7 +313,7 @@ def remove_edges( cg, *, atomic_edges: Iterable[np.ndarray], - operation_id: basetypes.OPERATION_ID = None, + operation_id: basetypes.OPERATION_ID = None, # type: ignore time_stamp: datetime.datetime = None, parent_ts: datetime.datetime = None, ): @@ -522,7 +522,7 @@ def __init__( cg, *, new_l2_ids: Iterable, - operation_id: basetypes.OPERATION_ID, + operation_id: basetypes.OPERATION_ID, # type: 
ignore time_stamp: datetime.datetime, new_old_id_d: Dict[np.uint64, Set[np.uint64]] = None, old_new_id_d: Dict[np.uint64, Set[np.uint64]] = None, @@ -542,7 +542,7 @@ def __init__( def _update_id_lineage( self, - parent: basetypes.NODE_ID, + parent: basetypes.NODE_ID, # type: ignore children: np.ndarray, layer: int, parent_layer: int, @@ -658,7 +658,21 @@ def _create_new_parents(self, layer: int): self._update_id_lineage(parent, cc_ids, layer, parent_layer) self.cg.cache.children_cache[parent] = cc_ids cache_utils.update(self.cg.cache.parents_cache, cc_ids, parent) - sanity_check_single(self.cg, parent, self._operation_id) + + try: + sanity_check_single(self.cg, parent, self._operation_id) + except AssertionError: + from pychunkedgraph.debug.utils import get_l2children + + pairs = [ + (a, b) for idx, a in enumerate(cc_ids) for b in cc_ids[idx + 1 :] + ] + for c1, c2 in pairs: + l2c1 = get_l2children(self.cg, c1) + l2c2 = get_l2children(self.cg, c2) + if np.intersect1d(l2c1, l2c2).size: + msg = f"{self._operation_id}:{c1} {c2} have common children." 
+ raise ValueError(msg) def run(self) -> Iterable: """ From b0f2ff195a22c2c47dfb5adc6948d12d5c6e19e8 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 22 Sep 2024 15:30:45 +0000 Subject: [PATCH 097/116] use parent timestamps to lift cx edges --- pychunkedgraph/ingest/upgrade/atomic_layer.py | 72 ++++--------------- pychunkedgraph/ingest/upgrade/parent_layer.py | 20 +++--- pychunkedgraph/ingest/upgrade/utils.py | 50 +++++++++++++ 3 files changed, 74 insertions(+), 68 deletions(-) diff --git a/pychunkedgraph/ingest/upgrade/atomic_layer.py b/pychunkedgraph/ingest/upgrade/atomic_layer.py index 6c4244968..a975146de 100644 --- a/pychunkedgraph/ingest/upgrade/atomic_layer.py +++ b/pychunkedgraph/ingest/upgrade/atomic_layer.py @@ -1,50 +1,19 @@ # pylint: disable=invalid-name, missing-docstring, c-extension-no-member + from datetime import timedelta import fastremap import numpy as np from pychunkedgraph.graph import ChunkedGraph from pychunkedgraph.graph.attributes import Connectivity -from pychunkedgraph.graph.attributes import Hierarchy from pychunkedgraph.graph.utils import serializers -from .utils import exists_as_parent - - -def get_parent_timestamps(cg, supervoxels, start_time=None, end_time=None) -> set: - """ - Timestamps of when the given supervoxels were edited, in the given time range. - """ - response = cg.client.read_nodes( - node_ids=supervoxels, - start_time=start_time, - end_time=end_time, - end_time_inclusive=False, - ) - result = set() - for v in response.values(): - for cell in v[Hierarchy.Parent]: - valid = cell.timestamp >= start_time or cell.timestamp < end_time - assert valid, f"{cell.timestamp}, {start_time}" - result.add(cell.timestamp) - return result +from .utils import exists_as_parent, get_parent_timestamps -def get_edit_timestamps(cg: ChunkedGraph, edges_d, start_ts, end_ts) -> list: - """ - Timestamps of when post-side supervoxels were involved in an edit. - Post-side - supervoxels in the neighbor chunk. 
- This is required because we need to update edges from both sides. - """ - atomic_cx_edges = np.concatenate(list(edges_d.values())) - timestamps = get_parent_timestamps( - cg, atomic_cx_edges[:, 1], start_time=start_ts, end_time=end_ts - ) - timestamps.add(start_ts) - return sorted(timestamps) - - -def update_cross_edges(cg: ChunkedGraph, node, cx_edges_d, node_ts, end_ts) -> list: +def update_cross_edges( + cg: ChunkedGraph, node, cx_edges_d, node_ts, timestamps, earliest_ts +) -> list: """ Helper function to update a single L2 ID. Returns a list of mutations with given timestamps. @@ -58,10 +27,9 @@ def update_cross_edges(cg: ChunkedGraph, node, cx_edges_d, node_ts, end_ts) -> l assert not exists_as_parent(cg, node, edges[:, 0]) return rows - timestamps = [node_ts] - if node_ts != end_ts: - timestamps = get_edit_timestamps(cg, cx_edges_d, node_ts, end_ts) for ts in timestamps: + if ts < earliest_ts: + ts = earliest_ts val_dict = {} svs = edges[:, 1] parents = cg.get_parents(svs, time_stamp=ts) @@ -80,31 +48,21 @@ def update_cross_edges(cg: ChunkedGraph, node, cx_edges_d, node_ts, end_ts) -> l def update_nodes(cg: ChunkedGraph, nodes) -> list: - # get start_ts when node becomes valid nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) + earliest_ts = cg.get_earliest_timestamp() + timestamps_d = get_parent_timestamps(cg, nodes) cx_edges_d = cg.get_atomic_cross_edges(nodes) - children_d = cg.get_children(nodes) - rows = [] - for node, start_ts in zip(nodes, nodes_ts): + for node, node_ts in zip(nodes, nodes_ts): if cg.get_parent(node) is None: # invalid id caused by failed ingest task continue - node_cx_edges_d = cx_edges_d.get(node, {}) - if not node_cx_edges_d: + _cx_edges_d = cx_edges_d.get(node, {}) + if not _cx_edges_d: continue - - # get end_ts when node becomes invalid (bigtable resolution is in ms) - start = start_ts + timedelta(milliseconds=1) - _timestamps = get_parent_timestamps(cg, children_d[node], start_time=start) - try: - 
end_ts = sorted(_timestamps)[0] - except IndexError: - # start_ts == end_ts means there has been no edit involving this node - # meaning only one timestamp to update cross edges, start_ts - end_ts = start_ts - # for each timestamp until end_ts, update cross chunk edges of node - _rows = update_cross_edges(cg, node, node_cx_edges_d, start_ts, end_ts) + _rows = update_cross_edges( + cg, node, _cx_edges_d, node_ts, timestamps_d[node], earliest_ts + ) rows.extend(_rows) return rows diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index 8674e45b7..0606ff674 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -14,7 +14,7 @@ from pychunkedgraph.graph.types import empty_2d from pychunkedgraph.utils.general import chunked -from .utils import exists_as_parent +from .utils import exists_as_parent, get_parent_timestamps CHILDREN = {} @@ -50,7 +50,7 @@ def _get_cx_edges_at_timestamp(node, response, ts): def _populate_cx_edges_with_timestamps( - cg: ChunkedGraph, layer: int, nodes: list, nodes_ts: list + cg: ChunkedGraph, layer: int, nodes: list, earliest_ts ): """ Collect timestamps of edits from children, since we use the same timestamp @@ -61,15 +61,13 @@ def _populate_cx_edges_with_timestamps( attrs = [Connectivity.CrossChunkEdge[l] for l in range(layer, cg.meta.layer_count)] all_children = np.concatenate(list(CHILDREN.values())) response = cg.client.read_nodes(node_ids=all_children, properties=attrs) - for node, node_ts in zip(nodes, nodes_ts): - timestamps = set([node_ts]) - for child in CHILDREN[node]: - if child not in response: - continue - for cells in response[child].values(): - timestamps.update([c.timestamp for c in cells if c.timestamp > node_ts]) + timestamps_d = get_parent_timestamps(cg, nodes) + for node in nodes: CX_EDGES[node] = {} + timestamps = timestamps_d[node] for ts in sorted(timestamps): + if ts < earliest_ts: + ts = earliest_ts 
CX_EDGES[node][ts] = _get_cx_edges_at_timestamp(node, response, ts) @@ -142,19 +140,19 @@ def update_chunk( start = time.time() x, y, z = chunk_coords chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) + earliest_ts = cg.get_earliest_timestamp() _populate_nodes_and_children(cg, chunk_id, nodes=nodes) if not CHILDREN: return nodes = list(CHILDREN.keys()) random.shuffle(nodes) nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) - _populate_cx_edges_with_timestamps(cg, layer, nodes, nodes_ts) + _populate_cx_edges_with_timestamps(cg, layer, nodes, earliest_ts) task_size = int(math.ceil(len(nodes) / mp.cpu_count() / 2)) chunked_nodes = chunked(nodes, task_size) chunked_nodes_ts = chunked(nodes_ts, task_size) cg_info = cg.get_serialized_info() - earliest_ts = cg.get_earliest_timestamp() multi_args = [] for chunk, ts_chunk in zip(chunked_nodes, chunked_nodes_ts): diff --git a/pychunkedgraph/ingest/upgrade/utils.py b/pychunkedgraph/ingest/upgrade/utils.py index 43c9a3034..cc43b561a 100644 --- a/pychunkedgraph/ingest/upgrade/utils.py +++ b/pychunkedgraph/ingest/upgrade/utils.py @@ -1,3 +1,9 @@ +# pylint: disable=invalid-name, missing-docstring + +from collections import defaultdict +from datetime import timedelta + +import numpy as np from pychunkedgraph.graph import ChunkedGraph from pychunkedgraph.graph.attributes import Hierarchy @@ -11,3 +17,47 @@ def exists_as_parent(cg: ChunkedGraph, parent, nodes) -> bool: for cells in response.values(): parents.update([cell.value for cell in cells]) return parent in parents + + +def get_edit_timestamps(cg: ChunkedGraph, edges_d, start_ts, end_ts) -> list: + """ + Timestamps of when post-side nodes were involved in an edit. + Post-side - nodes in the neighbor chunk. + This is required because we need to update edges from both sides. 
+ """ + cx_edges = np.concatenate(list(edges_d.values())) + timestamps = get_parent_timestamps( + cg, cx_edges[:, 1], start_time=start_ts, end_time=end_ts + ) + timestamps.add(start_ts) + return sorted(timestamps) + + +def get_end_ts(cg: ChunkedGraph, children, start_ts): + # get end_ts when node becomes invalid (bigtable resolution is in ms) + start = start_ts + timedelta(milliseconds=1) + _timestamps = get_parent_timestamps(cg, children, start_time=start) + try: + end_ts = sorted(_timestamps)[0] + except IndexError: + # start_ts == end_ts means there has been no edit involving this node + # meaning only one timestamp to update cross edges, start_ts + end_ts = start_ts + return end_ts + + +def get_parent_timestamps(cg: ChunkedGraph, nodes) -> dict[int, set]: + """ + Timestamps of when the given nodes were edited. + """ + response = cg.client.read_nodes( + node_ids=nodes, + properties=[Hierarchy.Parent], + end_time_inclusive=False, + ) + + result = defaultdict(set) + for k, v in response.items(): + for cell in v[Hierarchy.Parent]: + result[k].add(cell.timestamp) + return result From d05a55a3f33566248b85a3f17c28cda5468dded5 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 23 Sep 2024 14:48:14 +0000 Subject: [PATCH 098/116] make dynamic mesh dir graph specific --- pychunkedgraph/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 055276878..8e10cb462 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.0.4" From 3a4fe64866de6159f78d9070e3c107818f579a8c Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 26 Sep 2024 14:45:14 +0000 Subject: [PATCH 099/116] fix(upgrade): use hierarchy from supervoxels --- pychunkedgraph/__init__.py | 2 +- pychunkedgraph/graph/edits.py | 7 ++++--- pychunkedgraph/ingest/upgrade/parent_layer.py | 4 +++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git 
a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 8e10cb462..e94f36fe8 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.4" +__version__ = "3.0.5" diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 340cefadd..afe1b3abf 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -497,7 +497,7 @@ def _update_neighbor_cross_edges( return updated_entries -def _get_supervoxels(cg, node_ids): +def get_supervoxels(cg, node_ids): """Returns the first supervoxel found for each node_id.""" result = {} node_ids_copy = np.copy(node_ids) @@ -606,7 +606,7 @@ def _update_cross_edge_cache(self, parent, children): ) cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values()) edge_nodes = np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) - edge_supervoxels = _get_supervoxels(self.cg, edge_nodes) + edge_supervoxels = get_supervoxels(self.cg, edge_nodes) edge_parents = self.cg.get_roots( edge_supervoxels, stop_layer=parent_layer, @@ -671,7 +671,8 @@ def _create_new_parents(self, layer: int): l2c1 = get_l2children(self.cg, c1) l2c2 = get_l2children(self.cg, c2) if np.intersect1d(l2c1, l2c2).size: - msg = f"{self._operation_id}:{c1} {c2} have common children." 
+ c = np.intersect1d(l2c1, l2c2) + msg = f"{self._operation_id}: {layer} {c1} {c2} have common children {c}" raise ValueError(msg) def run(self) -> Iterable: diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index 0606ff674..2869fcf85 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -10,6 +10,7 @@ from pychunkedgraph.graph import ChunkedGraph from pychunkedgraph.graph.attributes import Connectivity, Hierarchy +from pychunkedgraph.graph.edits import get_supervoxels from pychunkedgraph.graph.utils import serializers from pychunkedgraph.graph.types import empty_2d from pychunkedgraph.utils.general import chunked @@ -101,7 +102,8 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l if edges.size == 0: continue nodes = np.unique(edges[:, 1]) - parents = cg.get_roots(nodes, time_stamp=ts, stop_layer=layer, ceil=False) + svs = get_supervoxels(cg, nodes) + parents = cg.get_roots(svs, time_stamp=ts, stop_layer=layer, ceil=False) edge_parents_d = dict(zip(nodes, parents)) val_dict = {} for _layer, layer_edges in cx_edges_d.items(): From fa24c99edbab62bf381e3dc099f4aecd173ebd68 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 26 Sep 2024 16:59:39 +0000 Subject: [PATCH 100/116] fix(upgrade): include cx edges at node_ts explicitly --- pychunkedgraph/ingest/upgrade/parent_layer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index 2869fcf85..a7e79b8f0 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -51,7 +51,7 @@ def _get_cx_edges_at_timestamp(node, response, ts): def _populate_cx_edges_with_timestamps( - cg: ChunkedGraph, layer: int, nodes: list, earliest_ts + cg: ChunkedGraph, layer: int, nodes: list, nodes_ts:list, earliest_ts ): 
""" Collect timestamps of edits from children, since we use the same timestamp @@ -63,9 +63,10 @@ def _populate_cx_edges_with_timestamps( all_children = np.concatenate(list(CHILDREN.values())) response = cg.client.read_nodes(node_ids=all_children, properties=attrs) timestamps_d = get_parent_timestamps(cg, nodes) - for node in nodes: + for node, node_ts in zip(nodes, nodes_ts): CX_EDGES[node] = {} timestamps = timestamps_d[node] + timestamps.add(node_ts) for ts in sorted(timestamps): if ts < earliest_ts: ts = earliest_ts @@ -82,6 +83,7 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l try: cx_edges_d = CX_EDGES[node][node_ts] except KeyError: + print(CX_EDGES) raise KeyError(f"{node}:{node_ts}") edges = np.concatenate([empty_2d] + list(cx_edges_d.values())) if edges.size: @@ -149,7 +151,7 @@ def update_chunk( nodes = list(CHILDREN.keys()) random.shuffle(nodes) nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) - _populate_cx_edges_with_timestamps(cg, layer, nodes, earliest_ts) + _populate_cx_edges_with_timestamps(cg, layer, nodes, nodes_ts, earliest_ts) task_size = int(math.ceil(len(nodes) / mp.cpu_count() / 2)) chunked_nodes = chunked(nodes, task_size) From 3ad09734b118f5052aea702fc59de0fd416c6ed8 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 29 Sep 2024 19:42:14 +0000 Subject: [PATCH 101/116] adds job type guard, flush_redis prompts, improved status output --- pychunkedgraph/__init__.py | 2 +- pychunkedgraph/ingest/cli.py | 27 ++++++++-- pychunkedgraph/ingest/cli_upgrade.py | 29 ++++++++--- pychunkedgraph/ingest/upgrade/parent_layer.py | 22 ++++----- pychunkedgraph/ingest/utils.py | 49 ++++++++++++++++--- pychunkedgraph/utils/redis.py | 4 +- 6 files changed, 97 insertions(+), 36 deletions(-) diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index e94f36fe8..6ed01825f 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.5" 
+__version__ = "3.0.6" diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 928e1852f..c50525ec6 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -16,15 +16,17 @@ bootstrap, chunk_id_str, print_completion_rate, - print_ingest_status, + print_status, queue_layer_helper, + job_type_guard, ) from .simple_tests import run_all from .create.parent_layer import add_parent_chunk from ..graph.chunkedgraph import ChunkedGraph from ..utils.redis import get_redis_connection, keys as r_keys -ingest_cli = AppGroup("ingest") +group_name = "ingest" +ingest_cli = AppGroup(group_name) def init_ingest_cmds(app): @@ -32,6 +34,8 @@ def init_ingest_cmds(app): @ingest_cli.command("flush_redis") +@click.confirmation_option(prompt="Are you sure you want to flush redis?") +@job_type_guard(group_name) def flush_redis(): """FLush redis db.""" redis = get_redis_connection() @@ -44,6 +48,7 @@ def flush_redis(): @click.option("--raw", is_flag=True, help="Read edges from agglomeration output.") @click.option("--test", is_flag=True, help="Test 8 chunks at the center of dataset.") @click.option("--retry", is_flag=True, help="Rerun without creating a new table.") +@job_type_guard(group_name) def ingest_graph( graph_id: str, dataset: click.Path, raw: bool, test: bool, retry: bool ): @@ -51,6 +56,8 @@ def ingest_graph( Main ingest command. Takes ingest config from a yaml file and queues atomic tasks. """ + redis = get_redis_connection() + redis.set(r_keys.JOB_TYPE, group_name) with open(dataset, "r") as stream: config = yaml.safe_load(stream) @@ -70,6 +77,7 @@ def ingest_graph( @click.argument("graph_id", type=str) @click.argument("dataset", type=click.Path(exists=True)) @click.option("--raw", is_flag=True) +@job_type_guard(group_name) def pickle_imanager(graph_id: str, dataset: click.Path, raw: bool): """ Load ingest config into redis server. 
@@ -83,11 +91,12 @@ def pickle_imanager(graph_id: str, dataset: click.Path, raw: bool): meta, ingest_config, _ = bootstrap(graph_id, config=config, raw=raw) imanager = IngestionManager(ingest_config, meta) - imanager.redis # pylint: disable=pointless-statement + imanager.redis.set(r_keys.JOB_TYPE, group_name) @ingest_cli.command("layer") @click.argument("parent_layer", type=int) +@job_type_guard(group_name) def queue_layer(parent_layer): """ Queue all chunk tasks at a given layer. @@ -100,16 +109,21 @@ def queue_layer(parent_layer): @ingest_cli.command("status") +@job_type_guard(group_name) def ingest_status(): """Print ingest status to console by layer.""" redis = get_redis_connection() - imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - print_ingest_status(imanager, redis) + try: + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_status(imanager, redis) + except TypeError as err: + print(f"\nNo current `{group_name}` job found in redis: {err}") @ingest_cli.command("chunk") @click.argument("queue", type=str) @click.argument("chunk_info", nargs=4, type=int) +@job_type_guard(group_name) def ingest_chunk(queue: str, chunk_info): """Manually queue chunk when a job is stuck for whatever reason.""" redis = get_redis_connection() @@ -135,6 +149,7 @@ def ingest_chunk(queue: str, chunk_info): @click.argument("graph_id", type=str) @click.argument("chunk_info", nargs=4, type=int) @click.option("--n_threads", type=int, default=1) +@job_type_guard(group_name) def ingest_chunk_local(graph_id: str, chunk_info, n_threads: int): """Manually ingest a chunk on a local machine.""" layer, coords = chunk_info[0], chunk_info[1:] @@ -150,6 +165,7 @@ def ingest_chunk_local(graph_id: str, chunk_info, n_threads: int): @ingest_cli.command("rate") @click.argument("layer", type=int) @click.option("--span", default=10, help="Time span to calculate rate.") +@job_type_guard(group_name) def rate(layer: int, span: int): redis = 
get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) @@ -158,5 +174,6 @@ def rate(layer: int, span: int): @ingest_cli.command("run_tests") @click.argument("graph_id", type=str) +@job_type_guard(group_name) def run_tests(graph_id): run_all(ChunkedGraph(graph_id=graph_id)) diff --git a/pychunkedgraph/ingest/cli_upgrade.py b/pychunkedgraph/ingest/cli_upgrade.py index c77c0be64..84939544b 100644 --- a/pychunkedgraph/ingest/cli_upgrade.py +++ b/pychunkedgraph/ingest/cli_upgrade.py @@ -24,15 +24,17 @@ from .utils import ( chunk_id_str, print_completion_rate, - print_ingest_status, + print_status, queue_layer_helper, start_ocdbt_server, + job_type_guard, ) from ..graph.chunkedgraph import ChunkedGraph, ChunkedGraphMeta from ..utils.redis import get_redis_connection from ..utils.redis import keys as r_keys -upgrade_cli = AppGroup("upgrade") +group_name = "upgrade" +upgrade_cli = AppGroup(group_name) def init_upgrade_cmds(app): @@ -40,6 +42,8 @@ def init_upgrade_cmds(app): @upgrade_cli.command("flush_redis") +@click.confirmation_option(prompt="Are you sure you want to flush redis?") +@job_type_guard(group_name) def flush_redis(): """FLush redis db.""" redis = get_redis_connection() @@ -50,11 +54,13 @@ def flush_redis(): @click.argument("graph_id", type=str) @click.option("--test", is_flag=True, help="Test 8 chunks at the center of dataset.") @click.option("--ocdbt", is_flag=True, help="Store edges using ts ocdbt kv store.") +@job_type_guard(group_name) def upgrade_graph(graph_id: str, test: bool, ocdbt: bool): """ - Main upgrade command. - Takes upgrade config from a yaml file and queues atomic tasks. + Main upgrade command. Queues atomic tasks. 
""" + redis = get_redis_connection() + redis.set(r_keys.JOB_TYPE, group_name) ingest_config = IngestConfig(TEST_RUN=test) cg = ChunkedGraph(graph_id=graph_id) cg.client.add_graph_version(__version__, overwrite=True) @@ -91,6 +97,7 @@ def upgrade_graph(graph_id: str, test: bool, ocdbt: bool): @upgrade_cli.command("layer") @click.argument("parent_layer", type=int) +@job_type_guard(group_name) def queue_layer(parent_layer): """ Queue all chunk tasks at a given layer. @@ -103,17 +110,22 @@ def queue_layer(parent_layer): @upgrade_cli.command("status") -def ingest_status(): +@job_type_guard(group_name) +def upgrade_status(): """Print upgrade status to console.""" redis = get_redis_connection() - imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - print_ingest_status(imanager, redis, upgrade=True) + try: + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_status(imanager, redis, upgrade=True) + except TypeError as err: + print(f"\nNo current `{group_name}` job found in redis: {err}") @upgrade_cli.command("chunk") @click.argument("queue", type=str) @click.argument("chunk_info", nargs=4, type=int) -def ingest_chunk(queue: str, chunk_info): +@job_type_guard(group_name) +def upgrade_chunk(queue: str, chunk_info): """Manually queue chunk when a job is stuck for whatever reason.""" redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) @@ -137,6 +149,7 @@ def ingest_chunk(queue: str, chunk_info): @upgrade_cli.command("rate") @click.argument("layer", type=int) @click.option("--span", default=10, help="Time span to calculate rate.") +@job_type_guard(group_name) def rate(layer: int, span: int): redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index a7e79b8f0..7c95cc1b6 100644 --- 
a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -6,7 +6,7 @@ import fastremap import numpy as np -from multiwrapper import multiprocessing_utils as mu +from tqdm import tqdm from pychunkedgraph.graph import ChunkedGraph from pychunkedgraph.graph.attributes import Connectivity, Hierarchy @@ -51,7 +51,7 @@ def _get_cx_edges_at_timestamp(node, response, ts): def _populate_cx_edges_with_timestamps( - cg: ChunkedGraph, layer: int, nodes: list, nodes_ts:list, earliest_ts + cg: ChunkedGraph, layer: int, nodes: list, nodes_ts: list, earliest_ts ): """ Collect timestamps of edits from children, since we use the same timestamp @@ -83,7 +83,6 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l try: cx_edges_d = CX_EDGES[node][node_ts] except KeyError: - print(CX_EDGES) raise KeyError(f"{node}:{node_ts}") edges = np.concatenate([empty_2d] + list(cx_edges_d.values())) if edges.size: @@ -158,15 +157,14 @@ def update_chunk( chunked_nodes_ts = chunked(nodes_ts, task_size) cg_info = cg.get_serialized_info() - multi_args = [] + tasks = [] for chunk, ts_chunk in zip(chunked_nodes, chunked_nodes_ts): args = (cg_info, layer, chunk, ts_chunk, earliest_ts) - multi_args.append(args) - - print(f"nodes: {len(nodes)}, tasks: {len(multi_args)}, size: {task_size}") - mu.multiprocess_func( - _update_cross_edges_helper, - multi_args, - n_threads=min(len(multi_args), mp.cpu_count()), - ) + tasks.append(args) + + with mp.Pool(min(mp.cpu_count(), len(tasks))) as pool: + tqdm( + pool.imap_unordered(_update_cross_edges_helper, tasks), + total=len(tasks), + ) print(f"total elaspsed time: {time.time() - start}") diff --git a/pychunkedgraph/ingest/utils.py b/pychunkedgraph/ingest/utils.py index 3d573ce37..1692db43b 100644 --- a/pychunkedgraph/ingest/utils.py +++ b/pychunkedgraph/ingest/utils.py @@ -1,6 +1,7 @@ # pylint: disable=invalid-name, missing-docstring import logging +import functools from os import environ 
from time import sleep from typing import Any, Generator, Tuple @@ -16,6 +17,8 @@ from ..graph.client import BackendClientInfo from ..graph.client.bigtable import BigTableConfig from ..utils.general import chunked +from ..utils.redis import get_redis_connection +from ..utils.redis import keys as r_keys chunk_id_str = lambda layer, coords: f"{layer}_{'_'.join(map(str, coords))}" @@ -116,7 +119,7 @@ def print_completion_rate(imanager: IngestionManager, layer: int, span: int = 10 print(f"{rate} chunks per second.") -def print_ingest_status(imanager: IngestionManager, redis, upgrade: bool = False): +def print_status(imanager: IngestionManager, redis, upgrade: bool = False): """ Helper to print status to console. If `upgrade=True`, status does not include the root layer, @@ -128,6 +131,7 @@ def print_ingest_status(imanager: IngestionManager, redis, upgrade: bool = False layer_counts = imanager.cg_meta.layer_chunk_counts pipeline = redis.pipeline() + pipeline.get(r_keys.JOB_TYPE) worker_busy = [] for layer in layers: pipeline.scard(f"{layer}c") @@ -138,25 +142,32 @@ def print_ingest_status(imanager: IngestionManager, redis, upgrade: bool = False worker_busy.append(sum([w.get_state() == WorkerStatus.BUSY for w in workers])) results = pipeline.execute() + job_type = "not_available" + if results[0] is not None: + job_type = results[0].decode() completed = [] queued = [] failed = [] - for i in range(0, len(results), 3): + for i in range(1, len(results), 3): result = results[i : i + 3] completed.append(result[0]) queued.append(result[1]) failed.append(result[2]) - print(f"version: \t{imanager.cg.version}") - print(f"graph_id: \t{imanager.cg.graph_id}") - print(f"chunk_size: \t{imanager.cg.meta.graph_config.CHUNK_SIZE}") - print("\nlayer status:") + header = ( + f"\njob_type: \t{job_type}" + f"\nversion: \t{imanager.cg.version}" + f"\ngraph_id: \t{imanager.cg.graph_id}" + f"\nchunk_size: \t{imanager.cg.meta.graph_config.CHUNK_SIZE}" + "\n\nlayer status:" + ) + print(header) 
for layer, done, count in zip(layers, completed, layer_counts): - print(f"{layer}\t: {done:<9} / {count}") + print(f"{layer}\t| {done:9} / {count} \t| {done/count:6.1%}") print("\n\nqueue status:") for layer, q, f, wb in zip(layers, queued, failed, worker_busy): - print(f"l{layer}\t: queued: {q:<10} failed: {f:<10} busy: {wb}") + print(f"l{layer}\t| queued: {q:<10} failed: {f:<10} busy: {wb}") def queue_layer_helper(parent_layer: int, imanager: IngestionManager, fn): @@ -190,3 +201,25 @@ def queue_layer_helper(parent_layer: int, imanager: IngestionManager, fn): ) ) q.enqueue_many(job_datas) + + +def job_type_guard(job_type: str): + def decorator_job_type_guard(func): + @functools.wraps(func) + def wrapper_job_type_guard(*args, **kwargs): + redis = get_redis_connection() + current_type = redis.get(r_keys.JOB_TYPE) + if current_type is not None: + current_type = current_type.decode() + msg = ( + f"Currently running `{current_type}`. You're attempting to run `{job_type}`." + f"\nRun `[flask] {current_type} flush_redis` to clear the current job and restart." 
+ ) + if current_type != job_type: + print(f"\n*WARNING*\n{msg}") + exit(1) + return func(*args, **kwargs) + + return wrapper_job_type_guard + + return decorator_job_type_guard diff --git a/pychunkedgraph/utils/redis.py b/pychunkedgraph/utils/redis.py index 420a849f1..fa43c867a 100644 --- a/pychunkedgraph/utils/redis.py +++ b/pychunkedgraph/utils/redis.py @@ -19,8 +19,8 @@ REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "") REDIS_URL = f"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0" -keys_fields = ("INGESTION_MANAGER",) -keys_defaults = ("pcg:imanager",) +keys_fields = ("INGESTION_MANAGER", "JOB_TYPE") +keys_defaults = ("pcg:imanager", "pcg:job_type") Keys = namedtuple("keys", keys_fields, defaults=keys_defaults) keys = Keys() From 4e22c06579dcfcbbbc9546811e9137dcc734a2dd Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 10 Nov 2024 19:47:48 +0000 Subject: [PATCH 102/116] fix(upgrade): include timestamps for partner supervoxel parents --- pychunkedgraph/__init__.py | 2 +- pychunkedgraph/ingest/upgrade/atomic_layer.py | 8 ++++++-- pychunkedgraph/ingest/upgrade/parent_layer.py | 8 +++++--- pychunkedgraph/ingest/utils.py | 3 ++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 6ed01825f..c11769ec9 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.6" +__version__ = "3.0.7" diff --git a/pychunkedgraph/ingest/upgrade/atomic_layer.py b/pychunkedgraph/ingest/upgrade/atomic_layer.py index a975146de..c9c8bdb11 100644 --- a/pychunkedgraph/ingest/upgrade/atomic_layer.py +++ b/pychunkedgraph/ingest/upgrade/atomic_layer.py @@ -12,7 +12,7 @@ def update_cross_edges( - cg: ChunkedGraph, node, cx_edges_d, node_ts, timestamps, earliest_ts + cg: ChunkedGraph, node, cx_edges_d: dict, node_ts, timestamps: set, earliest_ts ) -> list: """ Helper function to update a single L2 ID. 
@@ -27,7 +27,11 @@ def update_cross_edges( assert not exists_as_parent(cg, node, edges[:, 0]) return rows - for ts in timestamps: + partner_parent_ts_d = get_parent_timestamps(cg, edges[:, 1]) + for v in partner_parent_ts_d.values(): + timestamps.update(v) + + for ts in sorted(timestamps): if ts < earliest_ts: ts = earliest_ts val_dict = {} diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index 7c95cc1b6..dace88b43 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -163,8 +163,10 @@ def update_chunk( tasks.append(args) with mp.Pool(min(mp.cpu_count(), len(tasks))) as pool: - tqdm( - pool.imap_unordered(_update_cross_edges_helper, tasks), - total=len(tasks), + _ = list( + tqdm( + pool.imap_unordered(_update_cross_edges_helper, tasks), + total=len(tasks), + ) ) print(f"total elaspsed time: {time.time() - start}") diff --git a/pychunkedgraph/ingest/utils.py b/pychunkedgraph/ingest/utils.py index 1692db43b..45b6e728f 100644 --- a/pychunkedgraph/ingest/utils.py +++ b/pychunkedgraph/ingest/utils.py @@ -2,6 +2,7 @@ import logging import functools +import math from os import environ from time import sleep from typing import Any, Generator, Tuple @@ -163,7 +164,7 @@ def print_status(imanager: IngestionManager, redis, upgrade: bool = False): ) print(header) for layer, done, count in zip(layers, completed, layer_counts): - print(f"{layer}\t| {done:9} / {count} \t| {done/count:6.1%}") + print(f"{layer}\t| {done:9} / {count} \t| {math.floor((done/count)*100):6}%") print("\n\nqueue status:") for layer, q, f, wb in zip(layers, queued, failed, worker_busy): From 28f9773b3fd54c3a85093839617cbc751d1cd4ae Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 21 Nov 2024 18:49:27 +0000 Subject: [PATCH 103/116] fix(upgrade): use timestamps of partners at layers > 2 --- pychunkedgraph/__init__.py | 2 +- pychunkedgraph/ingest/upgrade/parent_layer.py | 9 ++++++++- 
2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index c11769ec9..35c154a9d 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.7" +__version__ = "3.0.8" diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index dace88b43..6f0b08711 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -66,7 +66,14 @@ def _populate_cx_edges_with_timestamps( for node, node_ts in zip(nodes, nodes_ts): CX_EDGES[node] = {} timestamps = timestamps_d[node] - timestamps.add(node_ts) + cx_edges_d_node_ts = _get_cx_edges_at_timestamp(node, response, node_ts) + + edges = np.concatenate([empty_2d] + list(cx_edges_d_node_ts.values())) + partner_parent_ts_d = get_parent_timestamps(cg, edges[:, 1]) + for v in partner_parent_ts_d.values(): + timestamps.update(v) + CX_EDGES[node][node_ts] = cx_edges_d_node_ts + for ts in sorted(timestamps): if ts < earliest_ts: ts = earliest_ts From 9bef0145780e1032fb3db907f7d5152a329fb6ac Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 5 Dec 2024 20:33:53 +0000 Subject: [PATCH 104/116] version 3.0.9 --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 6526fbc66..250e55eff 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.1 +current_version = 3.0.9 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 35c154a9d..67ae584d7 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.8" +__version__ = "3.0.9" From e0565b7723b7226681ea34bb0a5adc734deb6632 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Mon, 9 Dec 2024 22:33:47 +0000 Subject: [PATCH 105/116] 
feat: use mesh dir and dynamic dir from metadata --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 250e55eff..2a9dad726 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.9 +current_version = 3.0.10 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 67ae584d7..84994dc59 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.9" +__version__ = "3.0.10" From 5c6306c43589de15f846d0a5f5bd866003eb58c4 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 15 Jul 2025 21:44:19 +0000 Subject: [PATCH 106/116] ingest: change job batch size, more logging --- pychunkedgraph/ingest/cluster.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index 485251568..f557ac45a 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -197,12 +197,12 @@ def _get_test_chunks(meta: ChunkedGraphMeta): def _queue_tasks(imanager: IngestionManager, chunk_fn: Callable, coords: Iterable): queue_name = "l2" q = imanager.get_task_queue(queue_name) - batch_size = int(environ.get("JOB_BATCH_SIZE", 100000)) + batch_size = int(environ.get("JOB_BATCH_SIZE", 10000)) batches = chunked(coords, batch_size) for batch in batches: _coords = get_chunks_not_done(imanager, 2, batch) # buffer for optimal use of redis memory - if len(q) > int(environ.get("QUEUE_SIZE", 100000)): + if len(q) > int(environ.get("QUEUE_SIZE", 1000000)): interval = int(environ.get("QUEUE_INTERVAL", 300)) logging.info(f"Queue full; sleeping {interval}s...") sleep(interval) @@ -219,6 +219,7 @@ def _queue_tasks(imanager: IngestionManager, chunk_fn: Callable, coords: Iterabl ) ) q.enqueue_many(job_datas) + logging.info(f"Queued {len(job_datas)} chunks.") 
def enqueue_l2_tasks(imanager: IngestionManager, chunk_fn: Callable): From 5ddfa812f8ef9387151ec34650add416fc82c185 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 15 Jul 2025 21:44:25 +0000 Subject: [PATCH 107/116] =?UTF-8?q?Bump=20version:=203.0.10=20=E2=86=92=20?= =?UTF-8?q?3.0.11?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 2a9dad726..5f550ff4a 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.10 +current_version = 3.0.11 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 84994dc59..6c5152b5f 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.10" +__version__ = "3.0.11" From 2e6fcc8efbcb9c5366351202127602d24b4897c2 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 15 Jul 2025 22:20:26 +0000 Subject: [PATCH 108/116] ingest: add socket_timeout for redis connections --- pychunkedgraph/utils/redis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pychunkedgraph/utils/redis.py b/pychunkedgraph/utils/redis.py index fa43c867a..45ccfbdcc 100644 --- a/pychunkedgraph/utils/redis.py +++ b/pychunkedgraph/utils/redis.py @@ -27,9 +27,9 @@ def get_redis_connection(redis_url=REDIS_URL): - return redis.Redis.from_url(redis_url) + return redis.Redis.from_url(redis_url, socket_timeout=60) def get_rq_queue(queue): - connection = redis.Redis.from_url(REDIS_URL) + connection = redis.Redis.from_url(REDIS_URL, socket_timeout=60) return Queue(queue, connection=connection) From 19af42d8f3a34a1cb31af9e12e7ef30abc0d89bc Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 15 Jul 2025 22:20:34 +0000 Subject: [PATCH 109/116] 
=?UTF-8?q?Bump=20version:=203.0.11=20=E2=86=92=20?= =?UTF-8?q?3.0.12?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5f550ff4a..7592461a3 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.11 +current_version = 3.0.12 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 6c5152b5f..730a6c4a9 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.11" +__version__ = "3.0.12" From cc5d0f20e8f44d0c81448f28139a70b1fd9517ee Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 17 Jul 2025 21:01:45 +0000 Subject: [PATCH 110/116] fix(edits): descriptive error message --- pychunkedgraph/graph/edits.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index afe1b3abf..899d1ce42 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -621,7 +621,9 @@ def _update_cross_edge_cache(self, parent, children): continue edges = fastremap.remap(edges, edge_parents_d, preserve_missing_labels=True) new_cx_edges_d[layer] = np.unique(edges, axis=0) - assert np.all(edges[:, 0] == parent), f"{parent}, {np.unique(edges[:, 0])}" + assert np.all( + edges[:, 0] == parent + ), f"OP {self._operation_id}: parent mismatch {parent} != {np.unique(edges[:, 0])}" self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d def _create_new_parents(self, layer: int): From b428147044bf0419c38b46cf10d3d250b64ba59f Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Thu, 17 Jul 2025 21:01:57 +0000 Subject: [PATCH 111/116] =?UTF-8?q?Bump=20version:=203.0.12=20=E2=86=92=20?= =?UTF-8?q?3.0.13?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 7592461a3..1e9b72ac5 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.12 +current_version = 3.0.13 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 730a6c4a9..1adf1ce9e 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.12" +__version__ = "3.0.13" From aee77f47e14c9bf15c23b7186b931c4312b1f5a6 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 29 Jul 2025 23:51:48 +0000 Subject: [PATCH 112/116] fix(edits): find stale edges and their latest nodes --- pychunkedgraph/graph/chunkedgraph.py | 40 +++++ pychunkedgraph/graph/chunks/hierarchy.py | 15 +- pychunkedgraph/graph/edges/__init__.py | 168 +++++++++++++++++- pychunkedgraph/graph/edges/utils.py | 2 +- pychunkedgraph/graph/edits.py | 54 +++--- pychunkedgraph/ingest/upgrade/parent_layer.py | 3 +- 6 files changed, 255 insertions(+), 27 deletions(-) diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 7823695db..143d1ba9e 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -940,6 +940,11 @@ def get_parent_chunk_id( self.meta, node_or_chunk_id, parent_layer ) + def get_parent_chunk_id_multiple(self, node_or_chunk_ids: typing.Sequence): + return chunk_hierarchy.get_parent_chunk_id_multiple( + self.meta, node_or_chunk_ids + ) + def get_parent_chunk_ids(self, node_or_chunk_id: basetypes.NODE_ID): return chunk_hierarchy.get_parent_chunk_ids(self.meta, node_or_chunk_id) @@ -984,3 +989,38 @@ def get_operation_ids(self, node_ids: typing.Sequence): except KeyError: ... 
return result + + def get_single_leaf_multiple(self, node_ids): + """Returns the first supervoxel found for each node_id.""" + result = {} + node_ids_copy = np.copy(node_ids) + children = np.copy(node_ids) + children_d = self.get_children(node_ids) + while True: + children = [children_d[k][0] for k in children] + children = np.array(children, dtype=basetypes.NODE_ID) + mask = self.get_chunk_layers(children) == 1 + result.update( + [(node, sv) for node, sv in zip(node_ids[mask], children[mask])] + ) + node_ids = node_ids[~mask] + children = children[~mask] + if children.size == 0: + break + children_d = self.get_children(children) + return np.array([result[k] for k in node_ids_copy], dtype=basetypes.NODE_ID) + + def get_chunk_layers_and_coordinates(self, node_or_chunk_ids: typing.Sequence): + """ + Helper function that wraps get chunk layer and coordinates for nodes at any layer. + """ + node_or_chunk_ids = np.array(node_or_chunk_ids, dtype=basetypes.NODE_ID) + layers = self.get_chunk_layers(node_or_chunk_ids) + chunk_coords = np.zeros(shape=(len(node_or_chunk_ids), 3)) + for _layer in np.unique(layers): + mask = layers == _layer + _nodes = node_or_chunk_ids[mask] + chunk_coords[mask] = chunk_utils.get_chunk_coordinates_multiple( + self.meta, _nodes + ) + return layers, chunk_coords diff --git a/pychunkedgraph/graph/chunks/hierarchy.py b/pychunkedgraph/graph/chunks/hierarchy.py index 32d6029ee..6128d5914 100644 --- a/pychunkedgraph/graph/chunks/hierarchy.py +++ b/pychunkedgraph/graph/chunks/hierarchy.py @@ -43,7 +43,7 @@ def get_children_chunk_ids( else: children_coords = get_children_chunk_coords(meta, layer, (x, y, z)) children_chunk_ids = [] - for (x, y, z) in children_coords: + for x, y, z in children_coords: children_chunk_ids.append( utils.get_chunk_id(meta, layer=layer - 1, x=x, y=y, z=z) ) @@ -62,6 +62,19 @@ def get_parent_chunk_id( return utils.get_chunk_id(meta, layer=parent_layer, x=x, y=y, z=z) +def get_parent_chunk_id_multiple( + meta: ChunkedGraphMeta, 
node_or_chunk_ids: np.ndarray +) -> np.ndarray: + """Parent chunk IDs for multiple nodes. Assumes nodes at same layer.""" + + node_layers = utils.get_chunk_layers(meta, node_or_chunk_ids) + assert np.unique(node_layers).size == 1, np.unique(node_layers) + parent_layer = node_layers[0] + 1 + coords = utils.get_chunk_coordinates_multiple(meta, node_or_chunk_ids) + coords = coords // meta.graph_config.FANOUT + return utils.get_chunk_ids_from_coords(meta, layer=parent_layer, coords=coords) + + def get_parent_chunk_ids( meta: ChunkedGraphMeta, node_or_chunk_id: np.uint64 ) -> np.ndarray: diff --git a/pychunkedgraph/graph/edges/__init__.py b/pychunkedgraph/graph/edges/__init__.py index 430ab9fa7..2bc523313 100644 --- a/pychunkedgraph/graph/edges/__init__.py +++ b/pychunkedgraph/graph/edges/__init__.py @@ -3,14 +3,20 @@ """ from collections import namedtuple +import datetime from os import environ -from typing import Optional +from copy import copy +from typing import Iterable, Optional import numpy as np import tensorstore as ts import zstandard as zstd from graph_tool import Graph +from pychunkedgraph.graph import types +from pychunkedgraph.graph.chunks import utils as chunk_utils +from pychunkedgraph.graph.utils import basetypes + from ..utils import basetypes @@ -189,3 +195,163 @@ def get_edges(source: str, nodes: np.ndarray) -> Edges: affinities=np.concatenate(affinities), areas=np.concatenate(areas), ) + + +def get_stale_nodes( + cg, edge_nodes: Iterable[basetypes.NODE_ID], parent_ts: datetime.datetime = None +): + """ + Checks to see if partner nodes in edges (edges[:,1]) are stale. + This is done by getting a supervoxel of the node and check + if it has a new parent at the same layer as the node. 
+ """ + edge_supervoxels = cg.get_single_leaf_multiple(edge_nodes) + # nodes can be at different layers due to skip connections + edge_nodes_layers = cg.get_chunk_layers(edge_nodes) + stale_nodes = [types.empty_1d] + for layer in np.unique(edge_nodes_layers): + _mask = edge_nodes_layers == layer + layer_nodes = edge_nodes[_mask] + _nodes = cg.get_roots( + edge_supervoxels[_mask], + stop_layer=layer, + ceil=False, + time_stamp=parent_ts, + ) + stale_mask = layer_nodes != _nodes + stale_nodes.append(layer_nodes[stale_mask]) + return np.concatenate(stale_nodes), edge_supervoxels + + +def get_latest_edges( + cg, + stale_edges: Iterable, + edge_layers: Iterable, + parent_ts: datetime.datetime = None, +) -> dict: + """ + For each of stale_edges [[`node`, `partner`]], get their L2 edge equivalent. + Then get supervoxels of those L2 IDs and get parent(s) at `node` level. + These parents would be the new identities for the stale `partner`. + """ + _nodes = np.unique(stale_edges[:, 1]) + nodes_ts_map = dict(zip(_nodes, cg.get_node_timestamps(_nodes, return_numpy=False))) + _nodes = np.unique(stale_edges) + layers, coords = cg.get_chunk_layers_and_coordinates(_nodes) + layers_d = dict(zip(_nodes, layers)) + coords_d = dict(zip(_nodes, coords)) + + def _get_normalized_coords(node_a, node_b) -> tuple: + max_layer = layers_d[node_a] + coord_a, coord_b = coords_d[node_a], coords_d[node_b] + if layers_d[node_a] != layers_d[node_b]: + # normalize if nodes are not from the same layer + max_layer = max(layers_d[node_a], layers_d[node_b]) + chunk_a = cg.get_parent_chunk_id(node_a, parent_layer=max_layer) + chunk_b = cg.get_parent_chunk_id(node_b, parent_layer=max_layer) + coord_a, coord_b = cg.get_chunk_coordinates_multiple([chunk_a, chunk_b]) + return max_layer, coord_a, coord_b + + def _get_l2chunkids_along_boundary(max_layer, coord_a, coord_b): + direction = coord_a - coord_b + axis = np.flatnonzero(direction) + assert len(axis) == 1, f"{direction}, {coord_a}, {coord_b}" + axis = 
axis[0] + children_a = chunk_utils.get_bounding_children_chunks( + cg.meta, max_layer, coord_a, children_layer=2 + ) + children_b = chunk_utils.get_bounding_children_chunks( + cg.meta, max_layer, coord_b, children_layer=2 + ) + if direction[axis] > 0: + mid = coord_a[axis] * 2 ** (max_layer - 2) + l2chunks_a = children_a[children_a[:, axis] == mid] + l2chunks_b = children_b[children_b[:, axis] == mid - 1] + else: + mid = coord_b[axis] * 2 ** (max_layer - 2) + l2chunks_a = children_a[children_a[:, axis] == mid - 1] + l2chunks_b = children_b[children_b[:, axis] == mid] + + l2chunk_ids_a = chunk_utils.get_chunk_ids_from_coords(cg.meta, 2, l2chunks_a) + l2chunk_ids_b = chunk_utils.get_chunk_ids_from_coords(cg.meta, 2, l2chunks_b) + return l2chunk_ids_a, l2chunk_ids_b + + def _get_filtered_l2ids(node_a, node_b, chunks_map): + def _filter(node): + result = [] + children = cg.get_children(node) + while True: + chunk_ids = cg.get_chunk_ids_from_node_ids(children) + mask = np.isin(chunk_ids, chunks_map[node]) + children = children[mask] + + mask = cg.get_chunk_layers(children) == 2 + result.append(children[mask]) + + mask = cg.get_chunk_layers(children) > 2 + if children[mask].size == 0: + break + children = cg.get_children(children[mask], flatten=True) + return np.concatenate(result) + + return _filter(node_a), _filter(node_b) + + result = [] + chunks_map = {} + for edge_layer, _edge in zip(edge_layers, stale_edges): + node_a, node_b = _edge + mlayer, coord_a, coord_b = _get_normalized_coords(node_a, node_b) + chunks_a, chunks_b = _get_l2chunkids_along_boundary(mlayer, coord_a, coord_b) + + chunks_map[node_a] = [] + chunks_map[node_b] = [] + _layer = 2 + while _layer < mlayer: + chunks_map[node_a].append(chunks_a) + chunks_map[node_b].append(chunks_b) + chunks_a = np.unique(cg.get_parent_chunk_id_multiple(chunks_a)) + chunks_b = np.unique(cg.get_parent_chunk_id_multiple(chunks_b)) + _layer += 1 + chunks_map[node_a] = np.concatenate(chunks_map[node_a]) + chunks_map[node_b] 
= np.concatenate(chunks_map[node_b]) + + l2ids_a, l2ids_b = _get_filtered_l2ids(node_a, node_b, chunks_map) + edges_d = cg.get_cross_chunk_edges( + node_ids=l2ids_a, time_stamp=nodes_ts_map[node_b], raw_only=True + ) + + _edges = [] + for v in edges_d.values(): + _edges.append(v.get(edge_layer, types.empty_2d)) + _edges = np.concatenate(_edges) + mask = np.isin(_edges[:, 1], l2ids_b) + + children_a = cg.get_children(_edges[mask][:, 0], flatten=True) + children_b = cg.get_children(_edges[mask][:, 1], flatten=True) + if 85431849467249595 in children_a and 85502218144317440 in children_b: + print("woohoo0") + continue + + if 85502218144317440 in children_a and 85431849467249595 in children_b: + print("woohoo1") + continue + parents_a = np.unique( + cg.get_roots( + children_a, stop_layer=mlayer, ceil=False, time_stamp=parent_ts + ) + ) + assert parents_a.size == 1 and parents_a[0] == node_a, ( + node_a, + parents_a, + children_a, + ) + + parents_b = np.unique( + cg.get_roots( + children_b, stop_layer=mlayer, ceil=False, time_stamp=parent_ts + ) + ) + + parents_a = np.array([node_a] * parents_b.size, dtype=basetypes.NODE_ID) + result.append(np.column_stack((parents_a, parents_b))) + return np.concatenate(result) diff --git a/pychunkedgraph/graph/edges/utils.py b/pychunkedgraph/graph/edges/utils.py index 76f8ea1d8..b49a9a547 100644 --- a/pychunkedgraph/graph/edges/utils.py +++ b/pychunkedgraph/graph/edges/utils.py @@ -135,7 +135,7 @@ def categorize_edges_v2( def get_cross_chunk_edges_layer(meta: ChunkedGraphMeta, cross_edges: Iterable): - """Computes the layer in which a cross chunk edge becomes relevant. + """Computes the layer in which an atomic cross chunk edge becomes relevant. I.e. if a cross chunk edge links two nodes in layer 4 this function returns 3. 
:param cross_edges: n x 2 array diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index 899d1ce42..af96ebb93 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -15,6 +15,7 @@ from . import types from . import attributes from . import cache as cache_utils +from .edges import get_latest_edges, get_stale_nodes from .edges.utils import concatenate_cross_edge_dicts from .edges.utils import merge_cross_edge_dicts from .utils import basetypes @@ -497,25 +498,6 @@ def _update_neighbor_cross_edges( return updated_entries -def get_supervoxels(cg, node_ids): - """Returns the first supervoxel found for each node_id.""" - result = {} - node_ids_copy = np.copy(node_ids) - children = np.copy(node_ids) - children_d = cg.get_children(node_ids) - while True: - children = [children_d[k][0] for k in children] - children = np.array(children, dtype=basetypes.NODE_ID) - mask = cg.get_chunk_layers(children) == 1 - result.update([(node, sv) for node, sv in zip(node_ids[mask], children[mask])]) - node_ids = node_ids[~mask] - children = children[~mask] - if children.size == 0: - break - children_d = cg.get_children(children) - return np.array([result[k] for k in node_ids_copy], dtype=basetypes.NODE_ID) - - class CreateParentNodes: def __init__( self, @@ -605,10 +587,38 @@ def _update_cross_edge_cache(self, parent, children): children, time_stamp=self._last_successful_ts ) cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values()) - edge_nodes = np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) - edge_supervoxels = get_supervoxels(self.cg, edge_nodes) + + _cx_edges = [types.empty_2d] + _edge_layers = [types.empty_1d] + for k, v in cx_edges_d.items(): + _cx_edges.append(v) + _edge_layers.append([k] * len(v)) + _cx_edges = np.concatenate(_cx_edges) + _edge_layers = np.concatenate(_edge_layers, dtype=int) + + edge_nodes = np.unique(_cx_edges) + stale_nodes, edge_supervoxels = get_stale_nodes( + self.cg, edge_nodes, 
parent_ts=self._last_successful_ts + ) + stale_nodes_mask = np.isin(edge_nodes, stale_nodes) + + latest_edges = types.empty_2d.copy() + if np.any(stale_nodes_mask): + stalte_edges_mask = _cx_edges[:, 1] == stale_nodes + stale_edges = _cx_edges[stalte_edges_mask] + stale_edge_layers = _edge_layers[stalte_edges_mask] + latest_edges = get_latest_edges( + self.cg, + stale_edges, + stale_edge_layers, + parent_ts=self._last_successful_ts, + ) + + _cx_edges = np.concatenate([_cx_edges, latest_edges]) + edge_nodes = np.unique(_cx_edges) + edge_parents = self.cg.get_roots( - edge_supervoxels, + edge_nodes, stop_layer=parent_layer, ceil=False, time_stamp=self._last_successful_ts, diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index 6f0b08711..80558a362 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -10,7 +10,6 @@ from pychunkedgraph.graph import ChunkedGraph from pychunkedgraph.graph.attributes import Connectivity, Hierarchy -from pychunkedgraph.graph.edits import get_supervoxels from pychunkedgraph.graph.utils import serializers from pychunkedgraph.graph.types import empty_2d from pychunkedgraph.utils.general import chunked @@ -110,7 +109,7 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l if edges.size == 0: continue nodes = np.unique(edges[:, 1]) - svs = get_supervoxels(cg, nodes) + svs = cg.get_single_leaf_multiple(nodes) parents = cg.get_roots(svs, time_stamp=ts, stop_layer=layer, ceil=False) edge_parents_d = dict(zip(nodes, parents)) val_dict = {} From 20b4cf2f7e4c39076a714c6d7ac5ea6a594a915a Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Sun, 3 Aug 2025 21:26:08 +0000 Subject: [PATCH 113/116] fix(edits): more precise filter for latest edges; error on chunk_id mismatch (ingest); bump version --- .bumpversion.cfg | 2 +- pychunkedgraph/__init__.py | 2 +- pychunkedgraph/graph/chunkedgraph.py | 3 +- 
pychunkedgraph/graph/edges/__init__.py | 77 ++++++++++++------- pychunkedgraph/graph/edits.py | 34 +------- pychunkedgraph/ingest/cluster.py | 11 +-- pychunkedgraph/ingest/upgrade/parent_layer.py | 15 ++-- pychunkedgraph/ingest/utils.py | 1 + 8 files changed, 71 insertions(+), 74 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 1e9b72ac5..59a83e91b 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 3.0.13 +current_version = 3.1.0 commit = True tag = True diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 1adf1ce9e..f5f41e567 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "3.0.13" +__version__ = "3.1.0" diff --git a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 143d1ba9e..1754315d8 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -216,6 +216,7 @@ def get_parents( if fail_to_zero: parents.append(0) else: + exc.add_note(f"timestamp: {time_stamp}") raise KeyError from exc parents = np.array(parents, dtype=basetypes.NODE_ID) else: @@ -1016,7 +1017,7 @@ def get_chunk_layers_and_coordinates(self, node_or_chunk_ids: typing.Sequence): """ node_or_chunk_ids = np.array(node_or_chunk_ids, dtype=basetypes.NODE_ID) layers = self.get_chunk_layers(node_or_chunk_ids) - chunk_coords = np.zeros(shape=(len(node_or_chunk_ids), 3)) + chunk_coords = np.zeros(shape=(len(node_or_chunk_ids), 3), dtype=int) for _layer in np.unique(layers): mask = layers == _layer _nodes = node_or_chunk_ids[mask] diff --git a/pychunkedgraph/graph/edges/__init__.py b/pychunkedgraph/graph/edges/__init__.py index 2bc523313..1d54248c2 100644 --- a/pychunkedgraph/graph/edges/__init__.py +++ b/pychunkedgraph/graph/edges/__init__.py @@ -220,7 +220,7 @@ def get_stale_nodes( ) stale_mask = layer_nodes != _nodes stale_nodes.append(layer_nodes[stale_mask]) - return np.concatenate(stale_nodes), 
edge_supervoxels + return np.concatenate(stale_nodes) def get_latest_edges( @@ -279,7 +279,7 @@ def _get_l2chunkids_along_boundary(max_layer, coord_a, coord_b): def _get_filtered_l2ids(node_a, node_b, chunks_map): def _filter(node): result = [] - children = cg.get_children(node) + children = np.array([node], dtype=basetypes.NODE_ID) while True: chunk_ids = cg.get_chunk_ids_from_node_ids(children) mask = np.isin(chunk_ids, chunks_map[node]) @@ -296,15 +296,15 @@ def _filter(node): return _filter(node_a), _filter(node_b) - result = [] + result = [types.empty_2d] chunks_map = {} for edge_layer, _edge in zip(edge_layers, stale_edges): node_a, node_b = _edge mlayer, coord_a, coord_b = _get_normalized_coords(node_a, node_b) chunks_a, chunks_b = _get_l2chunkids_along_boundary(mlayer, coord_a, coord_b) - chunks_map[node_a] = [] - chunks_map[node_b] = [] + chunks_map[node_a] = [np.array([cg.get_chunk_id(node_a)])] + chunks_map[node_b] = [np.array([cg.get_chunk_id(node_b)])] _layer = 2 while _layer < mlayer: chunks_map[node_a].append(chunks_a) @@ -312,8 +312,8 @@ def _filter(node): chunks_a = np.unique(cg.get_parent_chunk_id_multiple(chunks_a)) chunks_b = np.unique(cg.get_parent_chunk_id_multiple(chunks_b)) _layer += 1 - chunks_map[node_a] = np.concatenate(chunks_map[node_a]) - chunks_map[node_b] = np.concatenate(chunks_map[node_b]) + chunks_map[node_a] = np.concatenate(chunks_map[node_a]).astype(basetypes.NODE_ID) + chunks_map[node_b] = np.concatenate(chunks_map[node_b]).astype(basetypes.NODE_ID) l2ids_a, l2ids_b = _get_filtered_l2ids(node_a, node_b, chunks_map) edges_d = cg.get_cross_chunk_edges( @@ -326,32 +326,57 @@ def _filter(node): _edges = np.concatenate(_edges) mask = np.isin(_edges[:, 1], l2ids_b) - children_a = cg.get_children(_edges[mask][:, 0], flatten=True) children_b = cg.get_children(_edges[mask][:, 1], flatten=True) - if 85431849467249595 in children_a and 85502218144317440 in children_b: - print("woohoo0") - continue - - if 85502218144317440 in children_a 
and 85431849467249595 in children_b: - print("woohoo1") - continue - parents_a = np.unique( - cg.get_roots( - children_a, stop_layer=mlayer, ceil=False, time_stamp=parent_ts - ) - ) - assert parents_a.size == 1 and parents_a[0] == node_a, ( - node_a, - parents_a, - children_a, - ) + parents_a = _edges[mask][:, 0] + parents_b = np.unique(cg.get_parents(children_b, time_stamp=parent_ts)) + _cx_edges_d = cg.get_cross_chunk_edges(parents_b) + parents_b = [] + for _node, _edges_d in _cx_edges_d.items(): + for _edges in _edges_d.values(): + _mask = np.isin(_edges[:,1], parents_a) + if np.any(_mask): + parents_b.append(_node) + + parents_b = np.array(parents_b, dtype=basetypes.NODE_ID) parents_b = np.unique( cg.get_roots( - children_b, stop_layer=mlayer, ceil=False, time_stamp=parent_ts + parents_b, stop_layer=mlayer, ceil=False, time_stamp=parent_ts ) ) parents_a = np.array([node_a] * parents_b.size, dtype=basetypes.NODE_ID) result.append(np.column_stack((parents_a, parents_b))) return np.concatenate(result) + + +def get_latest_edges_wrapper( + cg, + cx_edges_d: dict, + parent_ts: datetime.datetime = None, +) -> np.ndarray: + """Helper function to filter stale edges and replace with latest edges.""" + _cx_edges = [types.empty_2d] + _edge_layers = [types.empty_1d] + for k, v in cx_edges_d.items(): + _cx_edges.append(v) + _edge_layers.append([k] * len(v)) + _cx_edges = np.concatenate(_cx_edges) + _edge_layers = np.concatenate(_edge_layers, dtype=int) + + edge_nodes = np.unique(_cx_edges) + stale_nodes = get_stale_nodes(cg, edge_nodes, parent_ts=parent_ts) + stale_nodes_mask = np.isin(edge_nodes, stale_nodes) + + latest_edges = types.empty_2d.copy() + if np.any(stale_nodes_mask): + stalte_edges_mask = np.isin(_cx_edges[:, 1], stale_nodes) + stale_edges = _cx_edges[stalte_edges_mask] + stale_edge_layers = _edge_layers[stalte_edges_mask] + latest_edges = get_latest_edges( + cg, + stale_edges, + stale_edge_layers, + parent_ts=parent_ts, + ) + return np.concatenate([_cx_edges, 
latest_edges]) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index af96ebb93..4ac9352a8 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -15,7 +15,7 @@ from . import types from . import attributes from . import cache as cache_utils -from .edges import get_latest_edges, get_stale_nodes +from .edges import get_latest_edges, get_latest_edges_wrapper, get_stale_nodes from .edges.utils import concatenate_cross_edge_dicts from .edges.utils import merge_cross_edge_dicts from .utils import basetypes @@ -587,36 +587,10 @@ def _update_cross_edge_cache(self, parent, children): children, time_stamp=self._last_successful_ts ) cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values()) - - _cx_edges = [types.empty_2d] - _edge_layers = [types.empty_1d] - for k, v in cx_edges_d.items(): - _cx_edges.append(v) - _edge_layers.append([k] * len(v)) - _cx_edges = np.concatenate(_cx_edges) - _edge_layers = np.concatenate(_edge_layers, dtype=int) - - edge_nodes = np.unique(_cx_edges) - stale_nodes, edge_supervoxels = get_stale_nodes( - self.cg, edge_nodes, parent_ts=self._last_successful_ts + _cx_edges = get_latest_edges_wrapper( + self.cg, cx_edges_d, parent_ts=self._last_successful_ts ) - stale_nodes_mask = np.isin(edge_nodes, stale_nodes) - - latest_edges = types.empty_2d.copy() - if np.any(stale_nodes_mask): - stalte_edges_mask = _cx_edges[:, 1] == stale_nodes - stale_edges = _cx_edges[stalte_edges_mask] - stale_edge_layers = _edge_layers[stalte_edges_mask] - latest_edges = get_latest_edges( - self.cg, - stale_edges, - stale_edge_layers, - parent_ts=self._last_successful_ts, - ) - - _cx_edges = np.concatenate([_cx_edges, latest_edges]) - edge_nodes = np.unique(_cx_edges) - + edge_nodes = np.unique(_cx_edges) edge_parents = self.cg.get_roots( edge_nodes, stop_layer=parent_layer, diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index f557ac45a..1ae13a353 100644 --- 
a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -108,15 +108,8 @@ def _check_edges_direction( chunk_id = cg.get_chunk_id(layer=1, x=x, y=y, z=z) for edge_type in [EDGE_TYPES.between_chunk, EDGE_TYPES.cross_chunk]: edges = chunk_edges[edge_type] - e1 = edges.node_ids1 - e2 = edges.node_ids2 - - e2_chunk_ids = cg.get_chunk_ids_from_node_ids(e2) - mask = e2_chunk_ids == chunk_id - e1[mask], e2[mask] = e2[mask], e1[mask] - - e1_chunk_ids = cg.get_chunk_ids_from_node_ids(e1) - mask = e1_chunk_ids == chunk_id + chunk_ids = cg.get_chunk_ids_from_node_ids(edges.node_ids1) + mask = chunk_ids == chunk_id assert np.all(mask), "all IDs must belong to same chunk" diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index 80558a362..b8503f1d9 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -10,6 +10,7 @@ from pychunkedgraph.graph import ChunkedGraph from pychunkedgraph.graph.attributes import Connectivity, Hierarchy +from pychunkedgraph.graph.edges import get_latest_edges_wrapper from pychunkedgraph.graph.utils import serializers from pychunkedgraph.graph.types import empty_2d from pychunkedgraph.utils.general import chunked @@ -104,14 +105,17 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l assert not exists_as_parent(cg, node, edges[:, 0]), f"{node}, {node_ts}" return rows + row_id = serializers.serialize_uint64(node) for ts, cx_edges_d in CX_EDGES[node].items(): - edges = np.concatenate([empty_2d] + list(cx_edges_d.values())) + if node_ts > ts: + continue + edges = get_latest_edges_wrapper(cg, cx_edges_d, parent_ts=ts) if edges.size == 0: continue - nodes = np.unique(edges[:, 1]) - svs = cg.get_single_leaf_multiple(nodes) - parents = cg.get_roots(svs, time_stamp=ts, stop_layer=layer, ceil=False) - edge_parents_d = dict(zip(nodes, parents)) + + edge_nodes = np.unique(edges) + parents = 
cg.get_roots(edge_nodes, time_stamp=ts, stop_layer=layer, ceil=False) + edge_parents_d = dict(zip(edge_nodes, parents)) val_dict = {} for _layer, layer_edges in cx_edges_d.items(): layer_edges = fastremap.remap( @@ -121,7 +125,6 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l layer_edges = np.unique(layer_edges, axis=0) col = Connectivity.CrossChunkEdge[_layer] val_dict[col] = layer_edges - row_id = serializers.serialize_uint64(node) rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp=ts)) return rows diff --git a/pychunkedgraph/ingest/utils.py b/pychunkedgraph/ingest/utils.py index 45b6e728f..5c51242ac 100644 --- a/pychunkedgraph/ingest/utils.py +++ b/pychunkedgraph/ingest/utils.py @@ -202,6 +202,7 @@ def queue_layer_helper(parent_layer: int, imanager: IngestionManager, fn): ) ) q.enqueue_many(job_datas) + logging.info(f"Queued {len(job_datas)} chunks.") def job_type_guard(job_type: str): From 266275179e8831aac22aa9abebbaefa98d5a9c56 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 5 Aug 2025 19:39:37 +0000 Subject: [PATCH 114/116] fix(edits): account for fake edges when finding latest edges --- pychunkedgraph/graph/chunks/atomic.py | 4 +- pychunkedgraph/graph/chunks/utils.py | 7 +- pychunkedgraph/graph/edges/__init__.py | 131 +++++++++++------- pychunkedgraph/ingest/upgrade/parent_layer.py | 3 +- 4 files changed, 94 insertions(+), 51 deletions(-) diff --git a/pychunkedgraph/graph/chunks/atomic.py b/pychunkedgraph/graph/chunks/atomic.py index b609f4cfb..ec0109c69 100644 --- a/pychunkedgraph/graph/chunks/atomic.py +++ b/pychunkedgraph/graph/chunks/atomic.py @@ -62,4 +62,6 @@ def get_bounding_atomic_chunks( chunkedgraph_meta: ChunkedGraphMeta, layer: int, chunk_coords: Sequence[int] ) -> List: """Atomic chunk coordinates along the boundary of a chunk""" - return get_bounding_children_chunks(chunkedgraph_meta, layer, chunk_coords, 2) + return get_bounding_children_chunks( + chunkedgraph_meta, layer, 
tuple(chunk_coords), 2 + ) diff --git a/pychunkedgraph/graph/chunks/utils.py b/pychunkedgraph/graph/chunks/utils.py index 4d01258bd..f22a4d84a 100644 --- a/pychunkedgraph/graph/chunks/utils.py +++ b/pychunkedgraph/graph/chunks/utils.py @@ -1,11 +1,13 @@ # pylint: disable=invalid-name, missing-docstring -from typing import List from typing import Union from typing import Optional from typing import Sequence +from typing import Tuple from typing import Iterable +from functools import lru_cache + import numpy as np @@ -210,8 +212,9 @@ def _get_chunk_coordinates_from_vol_coordinates( return coords.astype(int) +@lru_cache() def get_bounding_children_chunks( - cg_meta, layer: int, chunk_coords: Sequence[int], children_layer, return_unique=True + cg_meta, layer: int, chunk_coords: Tuple[int], children_layer, return_unique=True ) -> np.ndarray: """Children chunk coordinates at given layer, along the boundary of a chunk""" chunk_coords = np.array(chunk_coords, dtype=int) diff --git a/pychunkedgraph/graph/edges/__init__.py b/pychunkedgraph/graph/edges/__init__.py index 1d54248c2..ad039f4b4 100644 --- a/pychunkedgraph/graph/edges/__init__.py +++ b/pychunkedgraph/graph/edges/__init__.py @@ -3,7 +3,7 @@ """ from collections import namedtuple -import datetime +import datetime, logging from os import environ from copy import copy from typing import Iterable, Optional @@ -14,7 +14,10 @@ from graph_tool import Graph from pychunkedgraph.graph import types -from pychunkedgraph.graph.chunks import utils as chunk_utils +from pychunkedgraph.graph.chunks.utils import ( + get_bounding_children_chunks, + get_chunk_ids_from_coords, +) from pychunkedgraph.graph.utils import basetypes from ..utils import basetypes @@ -235,7 +238,9 @@ def get_latest_edges( These parents would be the new identities for the stale `partner`. 
""" _nodes = np.unique(stale_edges[:, 1]) - nodes_ts_map = dict(zip(_nodes, cg.get_node_timestamps(_nodes, return_numpy=False))) + nodes_ts_map = dict( + zip(_nodes, cg.get_node_timestamps(_nodes, return_numpy=False, normalize=True)) + ) _nodes = np.unique(stale_edges) layers, coords = cg.get_chunk_layers_and_coordinates(_nodes) layers_d = dict(zip(_nodes, layers)) @@ -252,31 +257,55 @@ def _get_normalized_coords(node_a, node_b) -> tuple: coord_a, coord_b = cg.get_chunk_coordinates_multiple([chunk_a, chunk_b]) return max_layer, coord_a, coord_b - def _get_l2chunkids_along_boundary(max_layer, coord_a, coord_b): + def _get_l2chunkids_along_boundary(mlayer: int, coord_a, coord_b, padding: int = 0): + """ + Gets L2 Chunk IDs along opposing faces for larger chunks. + If padding is enabled, more faces of L2 chunks are padded on both sides. + This is necessary to find fake edges that can span more than 2 L2 chunks. + """ direction = coord_a - coord_b - axis = np.flatnonzero(direction) - assert len(axis) == 1, f"{direction}, {coord_a}, {coord_b}" - axis = axis[0] - children_a = chunk_utils.get_bounding_children_chunks( - cg.meta, max_layer, coord_a, children_layer=2 - ) - children_b = chunk_utils.get_bounding_children_chunks( - cg.meta, max_layer, coord_b, children_layer=2 - ) - if direction[axis] > 0: - mid = coord_a[axis] * 2 ** (max_layer - 2) - l2chunks_a = children_a[children_a[:, axis] == mid] - l2chunks_b = children_b[children_b[:, axis] == mid - 1] - else: - mid = coord_b[axis] * 2 ** (max_layer - 2) - l2chunks_a = children_a[children_a[:, axis] == mid - 1] - l2chunks_b = children_b[children_b[:, axis] == mid] + major_axis = np.argmax(np.abs(direction)) + bounds_a = get_bounding_children_chunks(cg.meta, mlayer, tuple(coord_a), 2) + bounds_b = get_bounding_children_chunks(cg.meta, mlayer, tuple(coord_b), 2) + + l2chunk_count = 2 ** (mlayer - 2) + max_coord = coord_a if direction[major_axis] > 0 else coord_b + + skip = abs(direction[major_axis]) - 1 + l2_skip = skip 
* l2chunk_count - l2chunk_ids_a = chunk_utils.get_chunk_ids_from_coords(cg.meta, 2, l2chunks_a) - l2chunk_ids_b = chunk_utils.get_chunk_ids_from_coords(cg.meta, 2, l2chunks_b) + mid = max_coord[major_axis] * l2chunk_count + face_a = mid if direction[major_axis] > 0 else (mid - l2_skip - 1) + face_b = mid if direction[major_axis] < 0 else (mid - l2_skip - 1) + + l2chunks_a = [bounds_a[bounds_a[:, major_axis] == face_a]] + l2chunks_b = [bounds_b[bounds_b[:, major_axis] == face_b]] + + step_a, step_b = (1, -1) if direction[major_axis] > 0 else (-1, 1) + for _ in range(padding): + _l2_chunks_a = copy(l2chunks_a[-1]) + _l2_chunks_b = copy(l2chunks_b[-1]) + _l2_chunks_a[:, major_axis] += step_a + _l2_chunks_b[:, major_axis] += step_b + l2chunks_a.append(_l2_chunks_a) + l2chunks_b.append(_l2_chunks_b) + + l2chunks_a = np.concatenate(l2chunks_a) + l2chunks_b = np.concatenate(l2chunks_b) + + l2chunk_ids_a = get_chunk_ids_from_coords(cg.meta, 2, l2chunks_a) + l2chunk_ids_b = get_chunk_ids_from_coords(cg.meta, 2, l2chunks_b) return l2chunk_ids_a, l2chunk_ids_b - def _get_filtered_l2ids(node_a, node_b, chunks_map): + def _get_filtered_l2ids(node_a, node_b, padding: int): + """ + Finds L2 IDs along opposing faces for given nodes. + Filterting is done by first finding L2 chunks along these faces. + Then get their parent chunks iteratively. + Then filter children iteratively using these chunks. 
+ """ + chunks_map = {} + def _filter(node): result = [] children = np.array([node], dtype=basetypes.NODE_ID) @@ -294,17 +323,13 @@ def _filter(node): children = cg.get_children(children[mask], flatten=True) return np.concatenate(result) - return _filter(node_a), _filter(node_b) - - result = [types.empty_2d] - chunks_map = {} - for edge_layer, _edge in zip(edge_layers, stale_edges): - node_a, node_b = _edge mlayer, coord_a, coord_b = _get_normalized_coords(node_a, node_b) - chunks_a, chunks_b = _get_l2chunkids_along_boundary(mlayer, coord_a, coord_b) + chunks_a, chunks_b = _get_l2chunkids_along_boundary( + mlayer, coord_a, coord_b, padding + ) - chunks_map[node_a] = [np.array([cg.get_chunk_id(node_a)])] - chunks_map[node_b] = [np.array([cg.get_chunk_id(node_b)])] + chunks_map[node_a] = [[cg.get_chunk_id(node_a)]] + chunks_map[node_b] = [[cg.get_chunk_id(node_b)]] _layer = 2 while _layer < mlayer: chunks_map[node_a].append(chunks_a) @@ -312,41 +337,53 @@ def _filter(node): chunks_a = np.unique(cg.get_parent_chunk_id_multiple(chunks_a)) chunks_b = np.unique(cg.get_parent_chunk_id_multiple(chunks_b)) _layer += 1 - chunks_map[node_a] = np.concatenate(chunks_map[node_a]).astype(basetypes.NODE_ID) - chunks_map[node_b] = np.concatenate(chunks_map[node_b]).astype(basetypes.NODE_ID) + chunks_map[node_a] = np.concatenate(chunks_map[node_a]) + chunks_map[node_b] = np.concatenate(chunks_map[node_b]) + return int(mlayer), _filter(node_a), _filter(node_b) - l2ids_a, l2ids_b = _get_filtered_l2ids(node_a, node_b, chunks_map) + result = [types.empty_2d] + for edge_layer, _edge in zip(edge_layers, stale_edges): + node_a, node_b = _edge + mlayer, l2ids_a, l2ids_b = _get_filtered_l2ids(node_a, node_b, padding=0) + if l2ids_a.size == 0 or l2ids_b.size == 0: + logging.info(f"{node_a}, {node_b}, expanding search with padding.") + mlayer, l2ids_a, l2ids_b = _get_filtered_l2ids(node_a, node_b, padding=2) + logging.info(f"Found {l2ids_a} and {l2ids_b}") + + _edges = [] edges_d = 
cg.get_cross_chunk_edges( node_ids=l2ids_a, time_stamp=nodes_ts_map[node_b], raw_only=True ) - - _edges = [] for v in edges_d.values(): _edges.append(v.get(edge_layer, types.empty_2d)) - _edges = np.concatenate(_edges) - mask = np.isin(_edges[:, 1], l2ids_b) - children_b = cg.get_children(_edges[mask][:, 1], flatten=True) + try: + _edges = np.concatenate(_edges) + except ValueError as exc: + logging.warning(f"No edges found for {node_a}, {node_b}") + raise ValueError from exc + mask = np.isin(_edges[:, 1], l2ids_b) parents_a = _edges[mask][:, 0] + children_b = cg.get_children(_edges[mask][:, 1], flatten=True) parents_b = np.unique(cg.get_parents(children_b, time_stamp=parent_ts)) - _cx_edges_d = cg.get_cross_chunk_edges(parents_b) + _cx_edges_d = cg.get_cross_chunk_edges(parents_b, time_stamp=parent_ts) parents_b = [] for _node, _edges_d in _cx_edges_d.items(): for _edges in _edges_d.values(): - _mask = np.isin(_edges[:,1], parents_a) + _mask = np.isin(_edges[:, 1], parents_a) if np.any(_mask): parents_b.append(_node) parents_b = np.array(parents_b, dtype=basetypes.NODE_ID) parents_b = np.unique( - cg.get_roots( - parents_b, stop_layer=mlayer, ceil=False, time_stamp=parent_ts - ) + cg.get_roots(parents_b, stop_layer=mlayer, ceil=False, time_stamp=parent_ts) ) parents_a = np.array([node_a] * parents_b.size, dtype=basetypes.NODE_ID) - result.append(np.column_stack((parents_a, parents_b))) + _new_edges = np.column_stack((parents_a, parents_b)) + assert _new_edges.size, f"No edge found for {node_a}, {node_b} at {parent_ts}" + result.append(_new_edges) return np.concatenate(result) diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index b8503f1d9..8c92e9e77 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -1,6 +1,6 @@ # pylint: disable=invalid-name, missing-docstring, c-extension-no-member -import math, random, time +import logging, math, random, 
time import multiprocessing as mp from collections import defaultdict @@ -171,6 +171,7 @@ def update_chunk( args = (cg_info, layer, chunk, ts_chunk, earliest_ts) tasks.append(args) + logging.info(f"Processing {len(nodes)} nodes.") with mp.Pool(min(mp.cpu_count(), len(tasks))) as pool: _ = list( tqdm( From 7c2edc51c2bf285bd630d0c3076c8d3382c79eb1 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Tue, 21 Oct 2025 17:17:26 +0000 Subject: [PATCH 115/116] fix(upgrade): use start and end timestamps to filter out irreleveant timestamps --- pychunkedgraph/app/meshing/common.py | 40 ++------- pychunkedgraph/graph/edges/__init__.py | 10 ++- pychunkedgraph/ingest/upgrade/atomic_layer.py | 82 +++++++++++++++---- pychunkedgraph/ingest/upgrade/parent_layer.py | 39 ++++++--- pychunkedgraph/ingest/upgrade/utils.py | 47 +++++++---- pychunkedgraph/utils/redis.py | 6 +- requirements.in | 4 +- requirements.txt | 5 +- 8 files changed, 148 insertions(+), 85 deletions(-) diff --git a/pychunkedgraph/app/meshing/common.py b/pychunkedgraph/app/meshing/common.py index 8f1a0c20a..10306543a 100644 --- a/pychunkedgraph/app/meshing/common.py +++ b/pychunkedgraph/app/meshing/common.py @@ -4,8 +4,6 @@ import threading import numpy as np -import redis -from rq import Queue, Connection, Retry from flask import Response, current_app, jsonify, make_response, request from pychunkedgraph import __version__ @@ -145,37 +143,15 @@ def _check_post_options(cg, resp, data, seg_ids): def handle_remesh(table_id): current_app.request_type = "remesh_enque" current_app.table_id = table_id - is_priority = request.args.get("priority", True, type=str2bool) - is_redisjob = request.args.get("use_redis", False, type=str2bool) - new_lvl2_ids = json.loads(request.data)["new_lvl2_ids"] - - if is_redisjob: - with Connection(redis.from_url(current_app.config["REDIS_URL"])): - - if is_priority: - retry = Retry(max=3, interval=[1, 10, 60]) - queue_name = "mesh-chunks" - else: - retry = Retry(max=3, interval=[60, 60, 60]) 
- queue_name = "mesh-chunks-low-priority" - q = Queue(queue_name, retry=retry, default_timeout=1200) - task = q.enqueue(meshing_tasks.remeshing, table_id, new_lvl2_ids) - - response_object = {"status": "success", "data": {"task_id": task.get_id()}} - - return jsonify(response_object), 202 - else: - new_lvl2_ids = np.array(new_lvl2_ids, dtype=np.uint64) - cg = app_utils.get_cg(table_id) - - if len(new_lvl2_ids) > 0: - t = threading.Thread( - target=_remeshing, args=(cg.get_serialized_info(), new_lvl2_ids) - ) - t.start() - - return Response(status=202) + new_lvl2_ids = np.array(new_lvl2_ids, dtype=np.uint64) + cg = app_utils.get_cg(table_id) + if len(new_lvl2_ids) > 0: + t = threading.Thread( + target=_remeshing, args=(cg.get_serialized_info(), new_lvl2_ids) + ) + t.start() + return Response(status=202) def _remeshing(serialized_cg_info, lvl2_nodes): diff --git a/pychunkedgraph/graph/edges/__init__.py b/pychunkedgraph/graph/edges/__init__.py index ad039f4b4..3359cefdd 100644 --- a/pychunkedgraph/graph/edges/__init__.py +++ b/pychunkedgraph/graph/edges/__init__.py @@ -237,11 +237,10 @@ def get_latest_edges( Then get supervoxels of those L2 IDs and get parent(s) at `node` level. These parents would be the new identities for the stale `partner`. 
""" - _nodes = np.unique(stale_edges[:, 1]) + _nodes = np.unique(stale_edges) nodes_ts_map = dict( zip(_nodes, cg.get_node_timestamps(_nodes, return_numpy=False, normalize=True)) ) - _nodes = np.unique(stale_edges) layers, coords = cg.get_chunk_layers_and_coordinates(_nodes) layers_d = dict(zip(_nodes, layers)) coords_d = dict(zip(_nodes, coords)) @@ -352,7 +351,9 @@ def _filter(node): _edges = [] edges_d = cg.get_cross_chunk_edges( - node_ids=l2ids_a, time_stamp=nodes_ts_map[node_b], raw_only=True + node_ids=l2ids_a, + time_stamp=max(nodes_ts_map[node_a], nodes_ts_map[node_b]), + raw_only=True, ) for v in edges_d.values(): _edges.append(v.get(edge_layer, types.empty_2d)) @@ -382,7 +383,8 @@ def _filter(node): parents_a = np.array([node_a] * parents_b.size, dtype=basetypes.NODE_ID) _new_edges = np.column_stack((parents_a, parents_b)) - assert _new_edges.size, f"No edge found for {node_a}, {node_b} at {parent_ts}" + err = f"No edge found for {node_a}, {node_b} at {edge_layer}; {parent_ts}" + assert _new_edges.size, err result.append(_new_edges) return np.concatenate(result) diff --git a/pychunkedgraph/ingest/upgrade/atomic_layer.py b/pychunkedgraph/ingest/upgrade/atomic_layer.py index c9c8bdb11..99a67b1de 100644 --- a/pychunkedgraph/ingest/upgrade/atomic_layer.py +++ b/pychunkedgraph/ingest/upgrade/atomic_layer.py @@ -1,18 +1,23 @@ # pylint: disable=invalid-name, missing-docstring, c-extension-no-member -from datetime import timedelta +from concurrent.futures import ThreadPoolExecutor, as_completed +import logging, math, time import fastremap import numpy as np +from tqdm import tqdm from pychunkedgraph.graph import ChunkedGraph -from pychunkedgraph.graph.attributes import Connectivity +from pychunkedgraph.graph.attributes import Connectivity, Hierarchy from pychunkedgraph.graph.utils import serializers +from pychunkedgraph.utils.general import chunked -from .utils import exists_as_parent, get_parent_timestamps +from .utils import exists_as_parent, 
get_end_timestamps, get_parent_timestamps + +CHILDREN = {} def update_cross_edges( - cg: ChunkedGraph, node, cx_edges_d: dict, node_ts, timestamps: set, earliest_ts + cg: ChunkedGraph, node, cx_edges_d: dict, node_ts, node_end_ts, timestamps: set ) -> list: """ Helper function to update a single L2 ID. @@ -27,13 +32,15 @@ def update_cross_edges( assert not exists_as_parent(cg, node, edges[:, 0]) return rows - partner_parent_ts_d = get_parent_timestamps(cg, edges[:, 1]) + partner_parent_ts_d = get_parent_timestamps(cg, np.unique(edges[:, 1])) for v in partner_parent_ts_d.values(): timestamps.update(v) for ts in sorted(timestamps): - if ts < earliest_ts: - ts = earliest_ts + if ts < node_ts: + continue + if ts > node_end_ts: + break val_dict = {} svs = edges[:, 1] parents = cg.get_parents(svs, time_stamp=ts) @@ -51,35 +58,78 @@ def update_cross_edges( return rows -def update_nodes(cg: ChunkedGraph, nodes) -> list: - nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) - earliest_ts = cg.get_earliest_timestamp() +def update_nodes(cg: ChunkedGraph, nodes, nodes_ts, children_map=None) -> list: + if children_map is None: + children_map = CHILDREN + end_timestamps = get_end_timestamps(cg, nodes, nodes_ts, children_map) timestamps_d = get_parent_timestamps(cg, nodes) cx_edges_d = cg.get_atomic_cross_edges(nodes) rows = [] - for node, node_ts in zip(nodes, nodes_ts): + for node, node_ts, end_ts in zip(nodes, nodes_ts, end_timestamps): if cg.get_parent(node) is None: - # invalid id caused by failed ingest task + # invalid id caused by failed ingest task / edits continue _cx_edges_d = cx_edges_d.get(node, {}) if not _cx_edges_d: continue _rows = update_cross_edges( - cg, node, _cx_edges_d, node_ts, timestamps_d[node], earliest_ts + cg, node, _cx_edges_d, node_ts, end_ts, timestamps_d[node] ) rows.extend(_rows) return rows -def update_chunk(cg: ChunkedGraph, chunk_coords: list[int], layer: int = 2): +def _update_nodes_helper(args): + cg, nodes, nodes_ts 
= args + return update_nodes(cg, nodes, nodes_ts) + + +def update_chunk( + cg: ChunkedGraph, chunk_coords: list[int], layer: int = 2, debug: bool = False +): """ Iterate over all L2 IDs in a chunk and update their cross chunk edges, within the periods they were valid/active. """ + global CHILDREN + + start = time.time() x, y, z = chunk_coords chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) cg.copy_fake_edges(chunk_id) rr = cg.range_read_chunk(chunk_id) - nodes = list(rr.keys()) - rows = update_nodes(cg, nodes) + + nodes = [] + nodes_ts = [] + earliest_ts = cg.get_earliest_timestamp() + for k, v in rr.items(): + nodes.append(k) + CHILDREN[k] = v[Hierarchy.Child][0].value + ts = v[Hierarchy.Child][0].timestamp + nodes_ts.append(earliest_ts if ts < earliest_ts else ts) + + if len(nodes) > 0: + logging.info(f"Processing {len(nodes)} nodes.") + assert len(CHILDREN) > 0, (nodes, CHILDREN) + else: + return + + if debug: + rows = update_nodes(cg, nodes, nodes_ts) + else: + task_size = int(math.ceil(len(nodes) / 64)) + chunked_nodes = chunked(nodes, task_size) + chunked_nodes_ts = chunked(nodes_ts, task_size) + tasks = [] + for chunk, ts_chunk in zip(chunked_nodes, chunked_nodes_ts): + args = (cg, chunk, ts_chunk) + tasks.append(args) + + rows = [] + with ThreadPoolExecutor(max_workers=8) as executor: + futures = [executor.submit(_update_nodes_helper, task) for task in tasks] + for future in tqdm(as_completed(futures), total=len(futures)): + rows.extend(future.result()) + cg.client.write(rows) + print(f"total elaspsed time: {time.time() - start}") diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index 8c92e9e77..79d97b9fe 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -3,6 +3,7 @@ import logging, math, random, time import multiprocessing as mp from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed 
import fastremap import numpy as np @@ -15,7 +16,7 @@ from pychunkedgraph.graph.types import empty_2d from pychunkedgraph.utils.general import chunked -from .utils import exists_as_parent, get_parent_timestamps +from .utils import exists_as_parent, get_end_timestamps, get_parent_timestamps CHILDREN = {} @@ -51,7 +52,7 @@ def _get_cx_edges_at_timestamp(node, response, ts): def _populate_cx_edges_with_timestamps( - cg: ChunkedGraph, layer: int, nodes: list, nodes_ts: list, earliest_ts + cg: ChunkedGraph, layer: int, nodes: list, nodes_ts: list ): """ Collect timestamps of edits from children, since we use the same timestamp @@ -63,7 +64,8 @@ def _populate_cx_edges_with_timestamps( all_children = np.concatenate(list(CHILDREN.values())) response = cg.client.read_nodes(node_ids=all_children, properties=attrs) timestamps_d = get_parent_timestamps(cg, nodes) - for node, node_ts in zip(nodes, nodes_ts): + end_timestamps = get_end_timestamps(cg, nodes, nodes_ts, CHILDREN) + for node, node_ts, node_end_ts in zip(nodes, nodes_ts, end_timestamps): CX_EDGES[node] = {} timestamps = timestamps_d[node] cx_edges_d_node_ts = _get_cx_edges_at_timestamp(node, response, node_ts) @@ -75,8 +77,8 @@ def _populate_cx_edges_with_timestamps( CX_EDGES[node][node_ts] = cx_edges_d_node_ts for ts in sorted(timestamps): - if ts < earliest_ts: - ts = earliest_ts + if ts > node_end_ts: + break CX_EDGES[node][ts] = _get_cx_edges_at_timestamp(node, response, ts) @@ -107,7 +109,7 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l row_id = serializers.serialize_uint64(node) for ts, cx_edges_d in CX_EDGES[node].items(): - if node_ts > ts: + if ts < node_ts: continue edges = get_latest_edges_wrapper(cg, cx_edges_d, parent_ts=ts) if edges.size == 0: @@ -129,17 +131,29 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l return rows +def _update_cross_edges_helper_thread(args): + cg, layer, node, node_ts, earliest_ts = args + return 
update_cross_edges(cg, layer, node, node_ts, earliest_ts) + + def _update_cross_edges_helper(args): cg_info, layer, nodes, nodes_ts, earliest_ts = args rows = [] cg = ChunkedGraph(**cg_info) parents = cg.get_parents(nodes, fail_to_zero=True) + + tasks = [] for node, parent, node_ts in zip(nodes, parents, nodes_ts): if parent == 0: - # invalid id caused by failed ingest task + # invalid id caused by failed ingest task / edits continue - _rows = update_cross_edges(cg, layer, node, node_ts, earliest_ts) - rows.extend(_rows) + tasks.append((cg, layer, node, node_ts, earliest_ts)) + + with ThreadPoolExecutor(max_workers=4) as executor: + futures = [executor.submit(_update_cross_edges_helper_thread, task) for task in tasks] + for future in tqdm(as_completed(futures), total=len(futures)): + rows.extend(future.result()) + cg.client.write(rows) @@ -159,7 +173,7 @@ def update_chunk( nodes = list(CHILDREN.keys()) random.shuffle(nodes) nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) - _populate_cx_edges_with_timestamps(cg, layer, nodes, nodes_ts, earliest_ts) + _populate_cx_edges_with_timestamps(cg, layer, nodes, nodes_ts) task_size = int(math.ceil(len(nodes) / mp.cpu_count() / 2)) chunked_nodes = chunked(nodes, task_size) @@ -171,8 +185,9 @@ def update_chunk( args = (cg_info, layer, chunk, ts_chunk, earliest_ts) tasks.append(args) - logging.info(f"Processing {len(nodes)} nodes.") - with mp.Pool(min(mp.cpu_count(), len(tasks))) as pool: + processes = min(mp.cpu_count() * 2, len(tasks)) + logging.info(f"Processing {len(nodes)} nodes with {processes} workers.") + with mp.Pool(processes) as pool: _ = list( tqdm( pool.imap_unordered(_update_cross_edges_helper, tasks), diff --git a/pychunkedgraph/ingest/upgrade/utils.py b/pychunkedgraph/ingest/upgrade/utils.py index cc43b561a..3407ea7b5 100644 --- a/pychunkedgraph/ingest/upgrade/utils.py +++ b/pychunkedgraph/ingest/upgrade/utils.py @@ -1,7 +1,7 @@ # pylint: disable=invalid-name, missing-docstring from 
collections import defaultdict -from datetime import timedelta +from datetime import datetime, timedelta, timezone import numpy as np from pychunkedgraph.graph import ChunkedGraph @@ -33,31 +33,50 @@ def get_edit_timestamps(cg: ChunkedGraph, edges_d, start_ts, end_ts) -> list: return sorted(timestamps) -def get_end_ts(cg: ChunkedGraph, children, start_ts): - # get end_ts when node becomes invalid (bigtable resolution is in ms) - start = start_ts + timedelta(milliseconds=1) - _timestamps = get_parent_timestamps(cg, children, start_time=start) - try: - end_ts = sorted(_timestamps)[0] - except IndexError: - # start_ts == end_ts means there has been no edit involving this node - # meaning only one timestamp to update cross edges, start_ts - end_ts = start_ts - return end_ts +def get_end_timestamps(cg: ChunkedGraph, nodes, nodes_ts, children_map): + """ + Gets the last timestamp for each node at which to update its cross edges. + For this, we get parent timestamps for all children of a node. + The first timestamp > node_timestamp among these is the last timestamp. + This is the timestamp at which one of node's children + got a new parent that superseded the current node. 
+ """ + result = [] + children = np.concatenate([*children_map.values()]) + timestamps_d = get_parent_timestamps(cg, children) + for node, node_ts in zip(nodes, nodes_ts): + node_children = children_map[node] + _timestamps = set().union(*[timestamps_d[k] for k in node_children]) + try: + _timestamps = sorted(_timestamps) + _index = np.searchsorted(_timestamps, node_ts) + assert _timestamps[_index] == node_ts, (_index, node_ts, _timestamps) + end_ts = _timestamps[_index + 1] - timedelta(milliseconds=1) + except IndexError: + # this node has not been edited, but might have it edges updated + end_ts = datetime.now(timezone.utc) + result.append(end_ts) + return result -def get_parent_timestamps(cg: ChunkedGraph, nodes) -> dict[int, set]: +def get_parent_timestamps( + cg: ChunkedGraph, nodes, start_time=None, end_time=None +) -> dict[int, set]: """ Timestamps of when the given nodes were edited. """ + earliest_ts = cg.get_earliest_timestamp() response = cg.client.read_nodes( node_ids=nodes, properties=[Hierarchy.Parent], + start_time=start_time, + end_time=end_time, end_time_inclusive=False, ) result = defaultdict(set) for k, v in response.items(): for cell in v[Hierarchy.Parent]: - result[k].add(cell.timestamp) + ts = cell.timestamp + result[k].add(earliest_ts if ts < earliest_ts else ts) return result diff --git a/pychunkedgraph/utils/redis.py b/pychunkedgraph/utils/redis.py index 45ccfbdcc..82921f030 100644 --- a/pychunkedgraph/utils/redis.py +++ b/pychunkedgraph/utils/redis.py @@ -18,6 +18,7 @@ ) REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "") REDIS_URL = f"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0" +CONNECTION = redis.Redis.from_url(REDIS_URL, socket_timeout=60) keys_fields = ("INGESTION_MANAGER", "JOB_TYPE") keys_defaults = ("pcg:imanager", "pcg:job_type") @@ -27,9 +28,10 @@ def get_redis_connection(redis_url=REDIS_URL): + if redis_url == REDIS_URL: + return CONNECTION return redis.Redis.from_url(redis_url, socket_timeout=60) def 
get_rq_queue(queue): - connection = redis.Redis.from_url(REDIS_URL, socket_timeout=60) - return Queue(queue, connection=connection) + return Queue(queue, connection=CONNECTION) diff --git a/requirements.in b/requirements.in index 1ec536a5c..4fcd353ed 100644 --- a/requirements.in +++ b/requirements.in @@ -10,8 +10,8 @@ google-cloud-datastore>=1.8 flask flask_cors python-json-logger -redis -rq<2 +redis>5 +rq>2 pyyaml cachetools werkzeug diff --git a/requirements.txt b/requirements.txt index 059b8fd91..0eedacb31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -97,7 +97,6 @@ gevent==23.9.1 # task-queue google-api-core[grpc]==2.11.1 # via - # google-api-core # google-cloud-bigtable # google-cloud-core # google-cloud-datastore @@ -295,7 +294,7 @@ pytz==2023.3.post1 # via pandas pyyaml==6.0.1 # via -r requirements.in -redis==5.0.0 +redis==6.4.0 # via # -r requirements.in # rq @@ -316,7 +315,7 @@ rpds-py==0.10.3 # via # jsonschema # referencing -rq==1.15.1 +rq==2.4.1 # via -r requirements.in rsa==4.9 # via From 52447f741dc3e5c2d26a9cfb593242ef64213c22 Mon Sep 17 00:00:00 2001 From: Akhilesh Halageri Date: Fri, 24 Oct 2025 16:13:28 +0000 Subject: [PATCH 116/116] feat(upgrade): cache stale timestamp info; remove unnecessary checks to reduce latency --- pychunkedgraph/graph/attributes.py | 6 ++ pychunkedgraph/graph/edges/__init__.py | 19 ++--- pychunkedgraph/ingest/cluster.py | 2 +- pychunkedgraph/ingest/upgrade/atomic_layer.py | 81 +++++++++++-------- pychunkedgraph/ingest/upgrade/parent_layer.py | 56 ++++++------- pychunkedgraph/ingest/upgrade/utils.py | 37 ++++++--- pychunkedgraph/utils/general.py | 7 -- 7 files changed, 118 insertions(+), 90 deletions(-) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index b431a159b..6b7a277f0 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -160,6 +160,12 @@ class Hierarchy: serializer=serializers.NumPyValue(dtype=basetypes.NODE_ID), ) + # track when 
nodes became stale, required for migration + # will be eventually deleted by GC rule for column family_id 3. + StaleTimeStamp = _Attribute( + key=b"stale_ts", family_id="3", serializer=serializers.Pickle() + ) + class GraphMeta: key = b"meta" diff --git a/pychunkedgraph/graph/edges/__init__.py b/pychunkedgraph/graph/edges/__init__.py index 3359cefdd..1a8baf225 100644 --- a/pychunkedgraph/graph/edges/__init__.py +++ b/pychunkedgraph/graph/edges/__init__.py @@ -201,22 +201,23 @@ def get_edges(source: str, nodes: np.ndarray) -> Edges: def get_stale_nodes( - cg, edge_nodes: Iterable[basetypes.NODE_ID], parent_ts: datetime.datetime = None + cg, nodes: Iterable[basetypes.NODE_ID], parent_ts: datetime.datetime = None ): """ - Checks to see if partner nodes in edges (edges[:,1]) are stale. - This is done by getting a supervoxel of the node and check + Checks to see if given nodes are stale. + This is done by getting a supervoxel of a node and checking if it has a new parent at the same layer as the node. 
""" - edge_supervoxels = cg.get_single_leaf_multiple(edge_nodes) + nodes = np.array(nodes, dtype=basetypes.NODE_ID) + supervoxels = cg.get_single_leaf_multiple(nodes) # nodes can be at different layers due to skip connections - edge_nodes_layers = cg.get_chunk_layers(edge_nodes) + node_layers = cg.get_chunk_layers(nodes) stale_nodes = [types.empty_1d] - for layer in np.unique(edge_nodes_layers): - _mask = edge_nodes_layers == layer - layer_nodes = edge_nodes[_mask] + for layer in np.unique(node_layers): + _mask = node_layers == layer + layer_nodes = nodes[_mask] _nodes = cg.get_roots( - edge_supervoxels[_mask], + supervoxels[_mask], stop_layer=layer, ceil=False, time_stamp=parent_ts, diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index 1ae13a353..d87576ca0 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -135,7 +135,7 @@ def upgrade_atomic_chunk(coords: Sequence[int]): redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) coords = np.array(list(coords), dtype=int) - update_atomic_chunk(imanager.cg, coords, layer=2) + update_atomic_chunk(imanager.cg, coords) _post_task_completion(imanager, 2, coords) diff --git a/pychunkedgraph/ingest/upgrade/atomic_layer.py b/pychunkedgraph/ingest/upgrade/atomic_layer.py index 99a67b1de..e4bd18b62 100644 --- a/pychunkedgraph/ingest/upgrade/atomic_layer.py +++ b/pychunkedgraph/ingest/upgrade/atomic_layer.py @@ -1,23 +1,31 @@ # pylint: disable=invalid-name, missing-docstring, c-extension-no-member +from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import timedelta import logging, math, time +from copy import copy import fastremap import numpy as np from tqdm import tqdm -from pychunkedgraph.graph import ChunkedGraph +from pychunkedgraph.graph import ChunkedGraph, types from pychunkedgraph.graph.attributes import Connectivity, Hierarchy from 
pychunkedgraph.graph.utils import serializers from pychunkedgraph.utils.general import chunked -from .utils import exists_as_parent, get_end_timestamps, get_parent_timestamps +from .utils import get_end_timestamps, get_parent_timestamps CHILDREN = {} def update_cross_edges( - cg: ChunkedGraph, node, cx_edges_d: dict, node_ts, node_end_ts, timestamps: set + cg: ChunkedGraph, + node, + cx_edges_d: dict, + node_ts, + node_end_ts, + timestamps_d: defaultdict[int, set], ) -> list: """ Helper function to update a single L2 ID. @@ -25,26 +33,21 @@ def update_cross_edges( """ rows = [] edges = np.concatenate(list(cx_edges_d.values())) - uparents = np.unique(cg.get_parents(edges[:, 0], time_stamp=node_ts)) - assert uparents.size <= 1, f"{node}, {node_ts}, {uparents}" - if uparents.size == 0 or node != uparents[0]: - # if node is not the parent at this ts, it must be invalid - assert not exists_as_parent(cg, node, edges[:, 0]) - return rows + partners = np.unique(edges[:, 1]) - partner_parent_ts_d = get_parent_timestamps(cg, np.unique(edges[:, 1])) - for v in partner_parent_ts_d.values(): - timestamps.update(v) + timestamps = copy(timestamps_d[node]) + for partner in partners: + timestamps.update(timestamps_d[partner]) for ts in sorted(timestamps): if ts < node_ts: continue if ts > node_end_ts: break + val_dict = {} - svs = edges[:, 1] - parents = cg.get_parents(svs, time_stamp=ts) - edge_parents_d = dict(zip(svs, parents)) + parents = cg.get_parents(partners, time_stamp=ts) + edge_parents_d = dict(zip(partners, parents)) for layer, layer_edges in cx_edges_d.items(): layer_edges = fastremap.remap( layer_edges, edge_parents_d, preserve_missing_labels=True @@ -61,20 +64,26 @@ def update_cross_edges( def update_nodes(cg: ChunkedGraph, nodes, nodes_ts, children_map=None) -> list: if children_map is None: children_map = CHILDREN - end_timestamps = get_end_timestamps(cg, nodes, nodes_ts, children_map) - timestamps_d = get_parent_timestamps(cg, nodes) + end_timestamps = 
get_end_timestamps(cg, nodes, nodes_ts, children_map, layer=2) + cx_edges_d = cg.get_atomic_cross_edges(nodes) + all_cx_edges = [types.empty_2d] + for _cx_edges_d in cx_edges_d.values(): + if _cx_edges_d: + all_cx_edges.append(np.concatenate(list(_cx_edges_d.values()))) + all_partners = np.unique(np.concatenate(all_cx_edges)[:, 1]) + timestamps_d = get_parent_timestamps(cg, np.concatenate([nodes, all_partners])) + rows = [] for node, node_ts, end_ts in zip(nodes, nodes_ts, end_timestamps): - if cg.get_parent(node) is None: - # invalid id caused by failed ingest task / edits - continue + end_ts -= timedelta(milliseconds=1) _cx_edges_d = cx_edges_d.get(node, {}) if not _cx_edges_d: continue - _rows = update_cross_edges( - cg, node, _cx_edges_d, node_ts, end_ts, timestamps_d[node] - ) + _rows = update_cross_edges(cg, node, _cx_edges_d, node_ts, end_ts, timestamps_d) + row_id = serializers.serialize_uint64(node) + val_dict = {Hierarchy.StaleTimeStamp: 0} + _rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp=end_ts)) rows.extend(_rows) return rows @@ -84,9 +93,7 @@ def _update_nodes_helper(args): return update_nodes(cg, nodes, nodes_ts) -def update_chunk( - cg: ChunkedGraph, chunk_coords: list[int], layer: int = 2, debug: bool = False -): +def update_chunk(cg: ChunkedGraph, chunk_coords: list[int], debug: bool = False): """ Iterate over all L2 IDs in a chunk and update their cross chunk edges, within the periods they were valid/active. 
@@ -95,7 +102,7 @@ def update_chunk( start = time.time() x, y, z = chunk_coords - chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) + chunk_id = cg.get_chunk_id(layer=2, x=x, y=y, z=z) cg.copy_fake_edges(chunk_id) rr = cg.range_read_chunk(chunk_id) @@ -103,13 +110,18 @@ def update_chunk( nodes_ts = [] earliest_ts = cg.get_earliest_timestamp() for k, v in rr.items(): - nodes.append(k) - CHILDREN[k] = v[Hierarchy.Child][0].value - ts = v[Hierarchy.Child][0].timestamp - nodes_ts.append(earliest_ts if ts < earliest_ts else ts) + try: + _ = v[Hierarchy.Parent] + nodes.append(k) + CHILDREN[k] = v[Hierarchy.Child][0].value + ts = v[Hierarchy.Child][0].timestamp + nodes_ts.append(earliest_ts if ts < earliest_ts else ts) + except KeyError: + # invalid nodes from failed tasks w/o parent column entry + continue if len(nodes) > 0: - logging.info(f"Processing {len(nodes)} nodes.") + logging.info(f"processing {len(nodes)} nodes.") assert len(CHILDREN) > 0, (nodes, CHILDREN) else: return @@ -117,13 +129,14 @@ def update_chunk( if debug: rows = update_nodes(cg, nodes, nodes_ts) else: - task_size = int(math.ceil(len(nodes) / 64)) + task_size = int(math.ceil(len(nodes) / 16)) chunked_nodes = chunked(nodes, task_size) chunked_nodes_ts = chunked(nodes_ts, task_size) tasks = [] for chunk, ts_chunk in zip(chunked_nodes, chunked_nodes_ts): args = (cg, chunk, ts_chunk) tasks.append(args) + logging.info(f"task size {task_size}, count {len(tasks)}.") rows = [] with ThreadPoolExecutor(max_workers=8) as executor: @@ -132,4 +145,4 @@ def update_chunk( rows.extend(future.result()) cg.client.write(rows) - print(f"total elaspsed time: {time.time() - start}") + logging.info(f"total elaspsed time: {time.time() - start}") diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py index 79d97b9fe..b205f1753 100644 --- a/pychunkedgraph/ingest/upgrade/parent_layer.py +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -16,7 +16,7 @@ from 
pychunkedgraph.graph.types import empty_2d from pychunkedgraph.utils.general import chunked -from .utils import exists_as_parent, get_end_timestamps, get_parent_timestamps +from .utils import get_end_timestamps, get_parent_timestamps CHILDREN = {} @@ -64,7 +64,9 @@ def _populate_cx_edges_with_timestamps( all_children = np.concatenate(list(CHILDREN.values())) response = cg.client.read_nodes(node_ids=all_children, properties=attrs) timestamps_d = get_parent_timestamps(cg, nodes) - end_timestamps = get_end_timestamps(cg, nodes, nodes_ts, CHILDREN) + end_timestamps = get_end_timestamps(cg, nodes, nodes_ts, CHILDREN, layer=layer) + + rows = [] for node, node_ts, node_end_ts in zip(nodes, nodes_ts, end_timestamps): CX_EDGES[node] = {} timestamps = timestamps_d[node] @@ -81,32 +83,18 @@ def _populate_cx_edges_with_timestamps( break CX_EDGES[node][ts] = _get_cx_edges_at_timestamp(node, response, ts) + row_id = serializers.serialize_uint64(node) + val_dict = {Hierarchy.StaleTimeStamp: 0} + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp=node_end_ts)) + cg.client.write(rows) + -def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> list: +def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts) -> list: """ Helper function to update a single ID. Returns a list of mutations with timestamps. 
""" rows = [] - if node_ts > earliest_ts: - try: - cx_edges_d = CX_EDGES[node][node_ts] - except KeyError: - raise KeyError(f"{node}:{node_ts}") - edges = np.concatenate([empty_2d] + list(cx_edges_d.values())) - if edges.size: - parents = cg.get_roots( - edges[:, 0], time_stamp=node_ts, stop_layer=layer, ceil=False - ) - uparents = np.unique(parents) - layers = cg.get_chunk_layers(uparents) - uparents = uparents[layers == layer] - assert uparents.size <= 1, f"{node}, {node_ts}, {uparents}" - if uparents.size == 0 or node != uparents[0]: - # if node is not the parent at this ts, it must be invalid - assert not exists_as_parent(cg, node, edges[:, 0]), f"{node}, {node_ts}" - return rows - row_id = serializers.serialize_uint64(node) for ts, cx_edges_d in CX_EDGES[node].items(): if ts < node_ts: @@ -132,12 +120,12 @@ def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> l def _update_cross_edges_helper_thread(args): - cg, layer, node, node_ts, earliest_ts = args - return update_cross_edges(cg, layer, node, node_ts, earliest_ts) + cg, layer, node, node_ts = args + return update_cross_edges(cg, layer, node, node_ts) def _update_cross_edges_helper(args): - cg_info, layer, nodes, nodes_ts, earliest_ts = args + cg_info, layer, nodes, nodes_ts = args rows = [] cg = ChunkedGraph(**cg_info) parents = cg.get_parents(nodes, fail_to_zero=True) @@ -147,7 +135,7 @@ def _update_cross_edges_helper(args): if parent == 0: # invalid id caused by failed ingest task / edits continue - tasks.append((cg, layer, node, node_ts, earliest_ts)) + tasks.append((cg, layer, node, node_ts)) with ThreadPoolExecutor(max_workers=4) as executor: futures = [executor.submit(_update_cross_edges_helper_thread, task) for task in tasks] @@ -163,10 +151,10 @@ def update_chunk( """ Iterate over all layer IDs in a chunk and update their cross chunk edges. 
""" + debug = nodes is not None start = time.time() x, y, z = chunk_coords chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) - earliest_ts = cg.get_earliest_timestamp() _populate_nodes_and_children(cg, chunk_id, nodes=nodes) if not CHILDREN: return @@ -175,6 +163,14 @@ def update_chunk( nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) _populate_cx_edges_with_timestamps(cg, layer, nodes, nodes_ts) + if debug: + rows = [] + for node, node_ts in zip(nodes, nodes_ts): + rows.extend(update_cross_edges(cg, layer, node, node_ts)) + cg.client.write(rows) + logging.info(f"total elaspsed time: {time.time() - start}") + return + task_size = int(math.ceil(len(nodes) / mp.cpu_count() / 2)) chunked_nodes = chunked(nodes, task_size) chunked_nodes_ts = chunked(nodes_ts, task_size) @@ -182,11 +178,11 @@ def update_chunk( tasks = [] for chunk, ts_chunk in zip(chunked_nodes, chunked_nodes_ts): - args = (cg_info, layer, chunk, ts_chunk, earliest_ts) + args = (cg_info, layer, chunk, ts_chunk) tasks.append(args) processes = min(mp.cpu_count() * 2, len(tasks)) - logging.info(f"Processing {len(nodes)} nodes with {processes} workers.") + logging.info(f"processing {len(nodes)} nodes with {processes} workers.") with mp.Pool(processes) as pool: _ = list( tqdm( @@ -194,4 +190,4 @@ def update_chunk( total=len(tasks), ) ) - print(f"total elaspsed time: {time.time() - start}") + logging.info(f"total elaspsed time: {time.time() - start}") diff --git a/pychunkedgraph/ingest/upgrade/utils.py b/pychunkedgraph/ingest/upgrade/utils.py index 3407ea7b5..17f5db84e 100644 --- a/pychunkedgraph/ingest/upgrade/utils.py +++ b/pychunkedgraph/ingest/upgrade/utils.py @@ -1,7 +1,7 @@ # pylint: disable=invalid-name, missing-docstring from collections import defaultdict -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone import numpy as np from pychunkedgraph.graph import ChunkedGraph @@ -33,25 +33,44 @@ def get_edit_timestamps(cg: 
ChunkedGraph, edges_d, start_ts, end_ts) -> list: return sorted(timestamps) -def get_end_timestamps(cg: ChunkedGraph, nodes, nodes_ts, children_map): +def _get_end_timestamps_helper(cg: ChunkedGraph, nodes: list) -> defaultdict[int, set]: + result = defaultdict(set) + response = cg.client.read_nodes(node_ids=nodes, properties=Hierarchy.StaleTimeStamp) + for k, v in response.items(): + result[k].add(v[0].timestamp) + return result + + +def get_end_timestamps( + cg: ChunkedGraph, nodes: list, nodes_ts: datetime, children_map: dict, layer: int +): """ Gets the last timestamp for each node at which to update its cross edges. - For this, we get parent timestamps for all children of a node. - The first timestamp > node_timestamp among these is the last timestamp. - This is the timestamp at which one of node's children - got a new parent that superseded the current node. + For layer 2: + Get parent timestamps for all children of a node. + The first timestamp > node_timestamp among these is the last timestamp. + This is the timestamp at which one of node's children + got a new parent that superseded the current node. + These are cached in database. + For all nodes in each layer > 2: + Pick the earliest child node_end_ts > node_ts and cache in database. 
""" result = [] children = np.concatenate([*children_map.values()]) - timestamps_d = get_parent_timestamps(cg, children) + if layer == 2: + timestamps_d = get_parent_timestamps(cg, children) + else: + timestamps_d = _get_end_timestamps_helper(cg, children) + for node, node_ts in zip(nodes, nodes_ts): node_children = children_map[node] _timestamps = set().union(*[timestamps_d[k] for k in node_children]) + _timestamps.add(node_ts) try: _timestamps = sorted(_timestamps) _index = np.searchsorted(_timestamps, node_ts) assert _timestamps[_index] == node_ts, (_index, node_ts, _timestamps) - end_ts = _timestamps[_index + 1] - timedelta(milliseconds=1) + end_ts = _timestamps[_index + 1] except IndexError: # this node has not been edited, but might have it edges updated end_ts = datetime.now(timezone.utc) @@ -61,7 +80,7 @@ def get_end_timestamps(cg: ChunkedGraph, nodes, nodes_ts, children_map): def get_parent_timestamps( cg: ChunkedGraph, nodes, start_time=None, end_time=None -) -> dict[int, set]: +) -> defaultdict[int, set]: """ Timestamps of when the given nodes were edited. """ diff --git a/pychunkedgraph/utils/general.py b/pychunkedgraph/utils/general.py index c299d3b9b..ac4929660 100644 --- a/pychunkedgraph/utils/general.py +++ b/pychunkedgraph/utils/general.py @@ -26,10 +26,6 @@ def reverse_dictionary(dictionary): def chunked(l: Sequence, n: int): - """ - Yield successive n-sized chunks from l. - NOTE: Use itertools.batched from python 3.12 - """ """ Yield successive n-sized chunks from l. NOTE: Use itertools.batched from python 3.12 @@ -39,9 +35,6 @@ def chunked(l: Sequence, n: int): it = iter(l) while batch := tuple(islice(it, n)): yield batch - it = iter(l) - while batch := tuple(islice(it, n)): - yield batch def in2d(arr1: np.ndarray, arr2: np.ndarray) -> np.ndarray: