From 445f9e9d9b3379ebf6d252eeb2c91cdacc44fdc2 Mon Sep 17 00:00:00 2001 From: Gilles Daviet Date: Sun, 26 May 2024 17:22:28 -0700 Subject: [PATCH] NanoVDB Index grid support --- docs/modules/functions.rst | 55 + docs/modules/runtime.rst | 41 +- warp/builtins.py | 181 + warp/context.py | 88 +- warp/native/exports.h | 1 + warp/native/mat.h | 12 + warp/native/nanovdb/GridHandle.h | 366 + warp/native/nanovdb/HostBuffer.h | 590 ++ warp/native/nanovdb/NanoVDB.h | 6156 +++++++++++------ warp/native/nanovdb/PNanoVDB.h | 1035 ++- warp/native/nanovdb/PNanoVDBWrite.h | 295 - warp/native/nanovdb/cuda/DeviceBuffer.h | 231 + warp/native/nanovdb/cuda/GridHandle.cuh | 76 + warp/native/nanovdb/math/Math.h | 1448 ++++ .../nanovdb/tools/cuda/PointsToGrid.cuh | 1291 ++++ warp/native/nanovdb/util/Util.h | 657 ++ warp/native/nanovdb/util/cuda/Timer.h | 116 + warp/native/nanovdb/util/cuda/Util.h | 193 + warp/native/volume.cpp | 438 +- warp/native/volume.cu | 52 +- warp/native/volume.h | 1028 ++- warp/native/volume_builder.cu | 741 +- warp/native/volume_builder.h | 17 +- warp/native/volume_impl.h | 61 + warp/native/warp.h | 22 +- warp/stubs.py | 11 + warp/tests/assets/test_index_grid.nvdb | 3 + warp/tests/test_volume.py | 248 +- warp/tests/test_volume_write.py | 86 + warp/types.py | 480 +- 30 files changed, 12606 insertions(+), 3413 deletions(-) create mode 100644 warp/native/nanovdb/GridHandle.h create mode 100644 warp/native/nanovdb/HostBuffer.h delete mode 100644 warp/native/nanovdb/PNanoVDBWrite.h create mode 100644 warp/native/nanovdb/cuda/DeviceBuffer.h create mode 100644 warp/native/nanovdb/cuda/GridHandle.cuh create mode 100644 warp/native/nanovdb/math/Math.h create mode 100644 warp/native/nanovdb/tools/cuda/PointsToGrid.cuh create mode 100644 warp/native/nanovdb/util/Util.h create mode 100644 warp/native/nanovdb/util/cuda/Timer.h create mode 100644 warp/native/nanovdb/util/cuda/Util.h create mode 100644 warp/native/volume_impl.h create mode 100644 warp/tests/assets/test_index_grid.nvdb diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index e112444a..d010ccf9 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -1563,6 +1563,32 @@ Geometry Volumes --------------- +.. py:function:: volume_sample(id: uint64, uvw: vec3f, sampling_mode: int32, dtype: Any) + + Sample the volume of type `dtype` given by ``id`` at the volume local-space point ``uvw``. + + Interpolation should be :attr:`warp.Volume.CLOSEST` or :attr:`wp.Volume.LINEAR.` + + +.. py:function:: volume_sample_grad(id: uint64, uvw: vec3f, sampling_mode: int32, grad: Any, dtype: Any) + + Sample the volume given by ``id`` and its gradient at the volume local-space point ``uvw``. + + Interpolation should be :attr:`warp.Volume.CLOSEST` or :attr:`wp.Volume.LINEAR.` + + +.. py:function:: volume_lookup(id: uint64, i: int32, j: int32, k: int32, dtype: Any) + + Returns the value of voxel with coordinates ``i``, ``j``, ``k`` for a volume of type type `dtype`. + + If the voxel at this index does not exist, this function returns the background value. + + +.. py:function:: volume_store(id: uint64, i: int32, j: int32, k: int32, value: Any) + + Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``. + + .. py:function:: volume_sample_f(id: uint64, uvw: vec3f, sampling_mode: int32) -> float Sample the volume given by ``id`` at the volume local-space point ``uvw``. @@ -1625,6 +1651,35 @@ Volumes Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``. +.. 
py:function:: volume_sample_index(id: uint64, uvw: vec3f, sampling_mode: int32, voxel_data: Array[Any], background: Any) + + Sample the volume given by ``id`` at the volume local-space point ``uvw``. + + Values for allocated voxels are read from the ``voxel_data`` array, and `background` is used as the value of non-existing voxels. + Interpolation should be :attr:`warp.Volume.CLOSEST` or :attr:`wp.Volume.LINEAR`. + This function is available for both index grids and classical volumes. + + + +.. py:function:: volume_sample_grad_index(id: uint64, uvw: vec3f, sampling_mode: int32, voxel_data: Array[Any], background: Any, grad: Any) + + Sample the volume given by ``id`` and its gradient at the volume local-space point ``uvw``. + + Values for allocated voxels are read from the ``voxel_data`` array, and `background` is used as the value of non-existing voxels. + Interpolation should be :attr:`warp.Volume.CLOSEST` or :attr:`wp.Volume.LINEAR`. + This function is available for both index grids and classical volumes. + + + +.. py:function:: volume_lookup_index(id: uint64, i: int32, j: int32, k: int32) -> int32 + + Returns the index associated to the voxel with coordinates ``i``, ``j``, ``k``. + + If the voxel at this index does not exist, this function returns -1. + This function is available for both index grids and classical volumes. + + + .. py:function:: volume_index_to_world(id: uint64, uvw: vec3f) -> vec3f Transform a point ``uvw`` defined in volume index space to world space given the volume's intrinsic affine transformation. diff --git a/docs/modules/runtime.rst b/docs/modules/runtime.rst index 85400bb9..c4db7348 100644 --- a/docs/modules/runtime.rst +++ b/docs/modules/runtime.rst @@ -979,12 +979,12 @@ or use built-in closest-point or trilinear interpolation to sample grid data fro Volume objects can be created directly from Warp arrays containing a NanoVDB grid, from the contents of a standard ``.nvdb`` file using :func:`load_from_nvdb() `, +from an uncompressed in-memory buffer using :func:`load_from_address() `, or from a dense 3D NumPy array using :func:`load_from_numpy() `. -Volumes can also be created using :func:`allocate() ` or -:func:`allocate_by_tiles() `. The values for a Volume object can be modified in a Warp -kernel using :func:`wp.volume_store_f() `, :func:`wp.volume_store_v() `, and -:func:`wp.volume_store_i() `. +Volumes can also be created using :func:`allocate() `, +:func:`allocate_by_tiles() ` or :func:`allocate_by_voxels() `. +The values for a Volume object can be modified in a Warp kernel using :func:`wp.volume_store() `. .. note:: Warp does not currently support modifying the topology of sparse volumes at runtime. @@ -999,8 +999,11 @@ Below we give an example of creating a Volume object from an existing NanoVDB fi .. note:: Files written by the NanoVDB library, commonly marked by the ``.nvdb`` extension, can contain multiple grids with - various compression methods, but a :class:`Volume` object represents a single NanoVDB grid therefore only files with - a single grid are supported. NanoVDB's uncompressed and zip-compressed file formats are supported. + various compression methods, but a :class:`Volume` object represents a single NanoVDB grid. + The first grid is loaded by default, then Warp volumes corresponding to the other grids in the file can be created + using repeated calls to :func:`load_next_grid() `. 
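A minimal loading sketch, assuming a multi-grid file named ``multi_grid.nvdb`` exists and assuming :func:`load_next_grid()` returns the next :class:`Volume` in the file and ``None`` once every grid has been read::

    import warp as wp

    wp.init()

    volumes = []
    volume = wp.Volume.load_from_nvdb(open("multi_grid.nvdb", "rb"), device="cuda:0")
    while volume is not None:
        volumes.append(volume)
        volume = volume.load_next_grid()  # assumed to return None after the last grid
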
+ NanoVDB's uncompressed and zip-compressed file formats are supported out-of-the-box, blosc compressed files require + the `blosc` Python package to be installed. To sample the volume inside a kernel we pass a reference to it by ID, and use the built-in sampling modes:: @@ -1018,11 +1021,35 @@ To sample the volume inside a kernel we pass a reference to it by ID, and use th q = wp.volume_world_to_index(volume, p) # sample volume with trilinear interpolation - f = wp.volume_sample_f(volume, q, wp.Volume.LINEAR) + f = wp.volume_sample(volume, q, wp.Volume.LINEAR, dtype=float) # write result samples[tid] = f +Warp also supports NanoVDB index grids, which provide a memory-efficient linearization of voxel indices that can refer +to values in arbitrarily shaped arrays:: + + @wp.kernel + def sample_index_grid(volume: wp.uint64, + points: wp.array(dtype=wp.vec3), + voxel_values: wp.array(dtype=Any)): + + tid = wp.tid() + + # load sample point in world-space + p = points[tid] + + # transform position to the volume's local-space + q = wp.volume_world_to_index(volume, p) + + # sample volume with trilinear interpolation + background_value = voxel_values.dtype(0.0) + f = wp.volume_sample_index(volume, q, wp.Volume.LINEAR, voxel_values, background_value) + +The coordinates of all indexable voxels can be recovered using :func:`get_voxels() `. +NanoVDB grids may also contains embedded *blind* data arrays; those can be accessed with the +:func:`feature_array() ` function. + .. autoclass:: Volume :members: :undoc-members: diff --git a/warp/builtins.py b/warp/builtins.py index cd66410b..e6a45249 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -2103,6 +2103,112 @@ def spatial_vector_constructor_value_func(arg_types, kwds, templates): # --------------------------------- # Volumes +_volume_supported_value_types = { + int32, + int64, + uint32, + float32, + float64, + vec3f, + vec3d, + vec4f, + vec4d, +} + + +def volume_value_func(arg_types, kwds, templates): + try: + dtype = kwds["dtype"] + except KeyError as err: + raise RuntimeError( + "'dtype' keyword argument must be specified when calling generic volume lookup or sampling functions" + ) from err + + if dtype not in _volume_supported_value_types: + raise RuntimeError(f"Unsupported volume type '{type_repr(dtype)}'") + + templates.append(dtype) + + return dtype + + +add_builtin( + "volume_sample", + input_types={"id": uint64, "uvw": vec3, "sampling_mode": int, "dtype": Any}, + value_func=volume_value_func, + export=False, + group="Volumes", + doc="""Sample the volume of type `dtype` given by ``id`` at the volume local-space point ``uvw``. 
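For illustration, a hedged kernel sketch using the generic sampler registered above — the velocity-volume and array names are placeholders, and ``dtype`` must be one of the types listed in ``_volume_supported_value_types``::

    import warp as wp

    @wp.kernel
    def sample_velocity(volume: wp.uint64,
                        points: wp.array(dtype=wp.vec3),
                        out: wp.array(dtype=wp.vec3)):
        tid = wp.tid()
        # transform the query point to the volume's index space
        q = wp.volume_world_to_index(volume, points[tid])
        # the value type of the generic sampler is selected with the dtype keyword
        out[tid] = wp.volume_sample(volume, q, wp.Volume.LINEAR, dtype=wp.vec3)
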
+ + Interpolation should be :attr:`warp.Volume.CLOSEST` or :attr:`wp.Volume.LINEAR.`""", +) + + +def check_volume_value_grad_compatibility(dtype, grad_dtype): + if type_is_vector(dtype): + expected = matrix(shape=(type_length(dtype), 3), dtype=type_scalar_type(dtype)) + else: + expected = vector(length=3, dtype=dtype) + + if not types_equal(grad_dtype, expected): + raise RuntimeError(f"Incompatible gradient type, expected {type_repr(expected)}, got {type_repr(grad_dtype)}") + + +def volume_sample_grad_value_func(arg_types, kwds, templates): + dtype = volume_value_func(arg_types, kwds, templates) + + if len(arg_types) < 4: + raise RuntimeError("'volume_sample_grad' requires 4 positional arguments") + + grad_type = arg_types[3] + check_volume_value_grad_compatibility(dtype, grad_type) + return dtype + + +add_builtin( + "volume_sample_grad", + input_types={"id": uint64, "uvw": vec3, "sampling_mode": int, "grad": Any, "dtype": Any}, + value_func=volume_sample_grad_value_func, + export=False, + group="Volumes", + doc="""Sample the volume given by ``id`` and its gradient at the volume local-space point ``uvw``. + + Interpolation should be :attr:`warp.Volume.CLOSEST` or :attr:`wp.Volume.LINEAR.`""", +) + +add_builtin( + "volume_lookup", + input_types={"id": uint64, "i": int, "j": int, "k": int, "dtype": Any}, + value_type=int, + value_func=volume_value_func, + export=False, + group="Volumes", + doc="""Returns the value of voxel with coordinates ``i``, ``j``, ``k`` for a volume of type type `dtype`. + + If the voxel at this index does not exist, this function returns the background value.""", +) + + +def volume_store_value_func(arg_types, kwds, templates): + if len(arg_types) < 4: + raise RuntimeError("'volume_store' requires 5 positional arguments") + + dtype = arg_types[4] + if dtype not in _volume_supported_value_types: + raise RuntimeError(f"Unsupported volume type '{type_repr(dtype)}'") + + return None + + +add_builtin( + "volume_store", + value_func=volume_store_value_func, + input_types={"id": uint64, "i": int, "j": int, "k": int, "value": Any}, + export=False, + group="Volumes", + doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""", +) + add_builtin( "volume_sample_f", input_types={"id": uint64, "uvw": vec3, "sampling_mode": int}, @@ -2192,6 +2298,81 @@ def spatial_vector_constructor_value_func(arg_types, kwds, templates): doc="""Store ``value`` at the voxel with coordinates ``i``, ``j``, ``k``.""", ) + +def volume_sample_index_value_func(arg_types, kwds, templates): + if len(arg_types) != 5: + raise RuntimeError("'volume_sample_index' requires 5 positional arguments") + + dtype = arg_types[3].dtype + + if not types_equal(dtype, arg_types[4]): + raise RuntimeError("The 'voxel_data' array and the 'background' value must have the same dtype") + + return dtype + + +add_builtin( + "volume_sample_index", + input_types={"id": uint64, "uvw": vec3, "sampling_mode": int, "voxel_data": array(dtype=Any), "background": Any}, + value_func=volume_sample_index_value_func, + export=False, + group="Volumes", + doc="""Sample the volume given by ``id`` at the volume local-space point ``uvw``. + + Values for allocated voxels are read from the ``voxel_data`` array, and `background` is used as the value of non-existing voxels. + Interpolation should be :attr:`warp.Volume.CLOSEST` or :attr:`wp.Volume.LINEAR`. + This function is available for both index grids and classical volumes. 
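As a hedged end-to-end sketch of the index-grid path described above — the ``volume`` and ``points`` objects are assumed to exist already, and the ordering of ``voxel_values`` is assumed to match the grid's internal voxel indexing::

    import warp as wp

    @wp.kernel
    def sample_index_volume(volume: wp.uint64,
                            voxel_values: wp.array(dtype=float),
                            points: wp.array(dtype=wp.vec3),
                            out: wp.array(dtype=float)):
        tid = wp.tid()
        q = wp.volume_world_to_index(volume, points[tid])
        # voxels not allocated in the grid fall back to the background value (0.0 here)
        out[tid] = wp.volume_sample_index(volume, q, wp.Volume.LINEAR, voxel_values, 0.0)

    # host side: one value per allocated voxel (placeholder values for the sketch)
    voxel_count = volume.get_voxels().shape[0]
    voxel_values = wp.zeros(voxel_count, dtype=float, device=volume.device)
    out = wp.empty(points.shape[0], dtype=float, device=volume.device)
    wp.launch(sample_index_volume, dim=points.shape[0],
              inputs=[volume.id, voxel_values, points, out])
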
+ """, +) + + +def volume_sample_grad_index_value_func(arg_types, kwds, templates): + if len(arg_types) != 6: + raise RuntimeError("'volume_sample_grad_index' requires 6 positional arguments") + + dtype = arg_types[3].dtype + + if not types_equal(dtype, arg_types[4]): + raise RuntimeError("The 'voxel_data' array and the 'background' value must have the same dtype") + + grad_type = arg_types[5] + check_volume_value_grad_compatibility(dtype, grad_type) + return dtype + + +add_builtin( + "volume_sample_grad_index", + input_types={ + "id": uint64, + "uvw": vec3, + "sampling_mode": int, + "voxel_data": array(dtype=Any), + "background": Any, + "grad": Any, + }, + value_func=volume_sample_grad_index_value_func, + export=False, + group="Volumes", + doc="""Sample the volume given by ``id`` and its gradient at the volume local-space point ``uvw``. + + Values for allocated voxels are read from the ``voxel_data`` array, and `background` is used as the value of non-existing voxels. + Interpolation should be :attr:`warp.Volume.CLOSEST` or :attr:`wp.Volume.LINEAR`. + This function is available for both index grids and classical volumes. + """, +) + +add_builtin( + "volume_lookup_index", + input_types={"id": uint64, "i": int, "j": int, "k": int}, + value_type=int32, + group="Volumes", + doc="""Returns the index associated to the voxel with coordinates ``i``, ``j``, ``k``. + + If the voxel at this index does not exist, this function returns -1. + This function is available for both index grids and classical volumes. + """, +) + add_builtin( "volume_index_to_world", input_types={"id": uint64, "uvw": vec3}, diff --git a/warp/context.py b/warp/context.py index 98053f7f..49a3ac4e 100644 --- a/warp/context.py +++ b/warp/context.py @@ -2622,22 +2622,36 @@ def __init__(self): ] self.core.cutlass_gemm.restype = ctypes.c_bool - self.core.volume_create_host.argtypes = [ctypes.c_void_p, ctypes.c_uint64] + self.core.volume_create_host.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_bool, ctypes.c_bool] self.core.volume_create_host.restype = ctypes.c_uint64 - self.core.volume_get_buffer_info_host.argtypes = [ + self.core.volume_get_tiles_host.argtypes = [ ctypes.c_uint64, - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_uint64), + ctypes.c_void_p, ] - self.core.volume_get_tiles_host.argtypes = [ + self.core.volume_get_voxels_host.argtypes = [ ctypes.c_uint64, - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_uint64), + ctypes.c_void_p, ] self.core.volume_destroy_host.argtypes = [ctypes.c_uint64] - self.core.volume_create_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_uint64] + self.core.volume_create_device.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_uint64, + ctypes.c_bool, + ctypes.c_bool, + ] self.core.volume_create_device.restype = ctypes.c_uint64 + self.core.volume_get_tiles_device.argtypes = [ + ctypes.c_uint64, + ctypes.c_void_p, + ] + self.core.volume_get_voxels_device.argtypes = [ + ctypes.c_uint64, + ctypes.c_void_p, + ] + self.core.volume_destroy_device.argtypes = [ctypes.c_uint64] + self.core.volume_f_from_tiles_device.argtypes = [ ctypes.c_void_p, ctypes.c_void_p, @@ -2676,24 +2690,68 @@ def __init__(self): ctypes.c_bool, ] self.core.volume_i_from_tiles_device.restype = ctypes.c_uint64 - self.core.volume_get_buffer_info_device.argtypes = [ - ctypes.c_uint64, - ctypes.POINTER(ctypes.c_void_p), - ctypes.POINTER(ctypes.c_uint64), + self.core.volume_index_from_tiles_device.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + 
ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_bool, ] - self.core.volume_get_tiles_device.argtypes = [ + self.core.volume_index_from_tiles_device.restype = ctypes.c_uint64 + self.core.volume_from_active_voxels_device.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_int, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_float, + ctypes.c_bool, + ] + self.core.volume_from_active_voxels_device.restype = ctypes.c_uint64 + + self.core.volume_get_buffer_info.argtypes = [ ctypes.c_uint64, ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_uint64), ] - self.core.volume_destroy_device.argtypes = [ctypes.c_uint64] - self.core.volume_get_voxel_size.argtypes = [ ctypes.c_uint64, ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float), ] + self.core.volume_get_tile_and_voxel_count.argtypes = [ + ctypes.c_uint64, + ctypes.POINTER(ctypes.c_uint32), + ctypes.POINTER(ctypes.c_uint64), + ] + self.core.volume_get_grid_info.argtypes = [ + ctypes.c_uint64, + ctypes.POINTER(ctypes.c_uint64), + ctypes.POINTER(ctypes.c_uint32), + ctypes.POINTER(ctypes.c_uint32), + ctypes.c_float * 3, + ctypes.c_float * 9, + ctypes.c_char * 16, + ] + self.core.volume_get_grid_info.restype = ctypes.c_char_p + self.core.volume_get_blind_data_count.argtypes = [ + ctypes.c_uint64, + ] + self.core.volume_get_blind_data_count.restype = ctypes.c_uint64 + self.core.volume_get_blind_data_info.argtypes = [ + ctypes.c_uint64, + ctypes.c_uint32, + ctypes.POINTER(ctypes.c_void_p), + ctypes.POINTER(ctypes.c_uint64), + ctypes.POINTER(ctypes.c_uint32), + ctypes.c_char * 16, + ] + self.core.volume_get_blind_data_info.restype = ctypes.c_char_p bsr_matrix_from_triplets_argtypes = [ ctypes.c_int, diff --git a/warp/native/exports.h b/warp/native/exports.h index 636c7da8..f6f085a4 100644 --- a/warp/native/exports.h +++ b/warp/native/exports.h @@ -968,6 +968,7 @@ WP_API void builtin_volume_sample_v_uint64_vec3f_int32(uint64 id, vec3f& uvw, in WP_API void builtin_volume_lookup_v_uint64_int32_int32_int32(uint64 id, int32 i, int32 j, int32 k, vec3f* ret) { *ret = wp::volume_lookup_v(id, i, j, k); } WP_API void builtin_volume_sample_i_uint64_vec3f(uint64 id, vec3f& uvw, int* ret) { *ret = wp::volume_sample_i(id, uvw); } WP_API void builtin_volume_lookup_i_uint64_int32_int32_int32(uint64 id, int32 i, int32 j, int32 k, int* ret) { *ret = wp::volume_lookup_i(id, i, j, k); } +WP_API void builtin_volume_lookup_index_uint64_int32_int32_int32(uint64 id, int32 i, int32 j, int32 k, int32* ret) { *ret = wp::volume_lookup_index(id, i, j, k); } WP_API void builtin_volume_index_to_world_uint64_vec3f(uint64 id, vec3f& uvw, vec3f* ret) { *ret = wp::volume_index_to_world(id, uvw); } WP_API void builtin_volume_world_to_index_uint64_vec3f(uint64 id, vec3f& xyz, vec3f* ret) { *ret = wp::volume_world_to_index(id, xyz); } WP_API void builtin_volume_index_to_world_dir_uint64_vec3f(uint64 id, vec3f& uvw, vec3f* ret) { *ret = wp::volume_index_to_world_dir(id, uvw); } diff --git a/warp/native/mat.h b/warp/native/mat.h index 5e569d89..6a2c099f 100644 --- a/warp/native/mat.h +++ b/warp/native/mat.h @@ -877,6 +877,18 @@ inline CUDA_CALLABLE mat_t outer(const vec_t& a, cons return ret; } +template +inline CUDA_CALLABLE vec_t outer(Type a, const vec_t& b) +{ + return mul(a, b); +} + +template +inline CUDA_CALLABLE vec_t outer(const vec_t& a, Type b) +{ + return mul(a, b); +} + template inline CUDA_CALLABLE mat_t<3,3,Type> skew(const vec_t<3,Type>& a) { diff --git 
a/warp/native/nanovdb/GridHandle.h b/warp/native/nanovdb/GridHandle.h new file mode 100644 index 00000000..9aa3ea66 --- /dev/null +++ b/warp/native/nanovdb/GridHandle.h @@ -0,0 +1,366 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/GridHandle.h + + \author Ken Museth + + \date January 8, 2020 + + \brief Defines GridHandle, which manages a host, and possibly a device, + memory buffer containing one or more NanoVDB grids. +*/ + +#ifndef NANOVDB_GRID_HANDLE_H_HAS_BEEN_INCLUDED +#define NANOVDB_GRID_HANDLE_H_HAS_BEEN_INCLUDED + +#include // for std::ifstream +#include // for std::cerr/cout +#include +#include + +#include // for toGridType +#include + +namespace nanovdb { + +// --------------------------> GridHandle <------------------------------------ + +struct GridHandleMetaData {uint64_t offset, size; GridType gridType;}; + +/// @brief This class serves to manage a buffer containing one or more NanoVDB Grids. +/// +/// @note It is important to note that this class does NOT depend on OpenVDB. +template +class GridHandle +{ + std::vector mMetaData; + BufferT mBuffer; + + template + static T* no_const(const T* ptr) { return const_cast(ptr); } + +public: + using BufferType = BufferT; + + /// @brief Move constructor from a host buffer + /// @param buffer buffer containing one or more NanoGrids that will be moved into this GridHandle + /// @throw Will throw and error with the buffer does not contain a valid NanoGrid! + template::hasDeviceDual, int>::type = 0> + GridHandle(T&& buffer); + + /// @brief Move constructor from a dual host-device buffer + /// @param buffer buffer containing one or more NanoGrids that will be moved into this GridHandle + /// @throw Will throw and error with the buffer does not contain a valid NanoGrid! + template::hasDeviceDual, int>::type = 0> + GridHandle(T&& buffer); + + /// @brief Constructs an empty GridHandle + GridHandle() = default; + + /// @brief Disallow copy-construction + GridHandle(const GridHandle&) = delete; + + /// @brief Move copy-constructor + GridHandle(GridHandle&& other) noexcept { + mBuffer = std::move(other.mBuffer); + mMetaData = std::move(other.mMetaData); + } + + /// @brief clear this GridHandle to an empty handle + void reset() { + mBuffer.clear(); + mMetaData.clear(); + } + + /// @brief Disallow copy assignment operation + GridHandle& operator=(const GridHandle&) = delete; + + /// @brief Move copy assignment operation + GridHandle& operator=(GridHandle&& other) noexcept { + mBuffer = std::move(other.mBuffer); + mMetaData = std::move(other.mMetaData); + return *this; + } + + /// @brief Performs a deep copy of the GridHandle, possibly templated on a different buffer type + /// @tparam OtherBufferT Buffer type of the deep copy + /// @param buffer optional buffer used for allocation + /// @return A new handle of the specified buffer type that contains a deep copy of the current handle + template + GridHandle copy(const OtherBufferT& buffer = OtherBufferT()) const; + + /// @brief Return a reference to the buffer + BufferT& buffer() { return mBuffer; } + + /// @brief Return a const reference to the buffer + const BufferT& buffer() const { return mBuffer; } + + /// @brief Returns a non-const pointer to the data. + /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized + void* data() { return mBuffer.data(); } + + /// @brief Returns a const pointer to the data. 
+ /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized + const void* data() const { return mBuffer.data(); } + + template + typename util::enable_if::hasDeviceDual, const void*>::type + deviceData() const { return mBuffer.deviceData(); } + template + typename util::enable_if::hasDeviceDual, void*>::type + deviceData() { return mBuffer.deviceData(); } + + /// @brief Returns the size in bytes of the raw memory buffer managed by this GridHandle. + uint64_t size() const { return mBuffer.size(); } + + //@{ + /// @brief Return true if this handle is empty, i.e. has no allocated memory + bool empty() const { return this->size() == 0; } + bool isEmpty() const { return this->size() == 0; } + //@} + + /// @brief Return true if this handle contains any grids + operator bool() const { return !this->empty(); } + + /// @brief Returns a const host pointer to the @a n'th NanoVDB grid encoded in this GridHandle. + /// @tparam ValueT Value type of the grid point to be returned + /// @param n Index of the (host) grid pointer to be returned + /// @warning Note that the return pointer can be NULL if the GridHandle no host grid, @a n is invalid + /// or if the template parameter does not match the specified grid! + template + const NanoGrid* grid(uint32_t n = 0) const; + + /// @brief Returns a host pointer to the @a n'th NanoVDB grid encoded in this GridHandle. + /// @tparam ValueT Value type of the grid point to be returned + /// @param n Index of the (host) grid pointer to be returned + /// @warning Note that the return pointer can be NULL if the GridHandle no host grid, @a n is invalid + /// or if the template parameter does not match the specified grid! + template + NanoGrid* grid(uint32_t n = 0) {return const_cast*>(static_cast(this)->template grid(n));} + + /// @brief Return a const pointer to the @a n'th grid encoded in this GridHandle on the device, e.g. GPU + /// @tparam ValueT Value type of the grid point to be returned + /// @param n Index of the (device) grid pointer to be returned + /// @warning Note that the return pointer can be NULL if the GridHandle has no device grid, @a n is invalid, + /// or if the template parameter does not match the specified grid. + template + typename util::enable_if::hasDeviceDual, const NanoGrid*>::type + deviceGrid(uint32_t n=0) const; + + /// @brief Return a const pointer to the @a n'th grid encoded in this GridHandle on the device, e.g. GPU + /// @tparam ValueT Value type of the grid point to be returned + /// @param n Index if of the grid pointer to be returned + /// @param verbose if non-zero error messages will be printed in case something failed + /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized, @a n is invalid, + /// or if the template parameter does not match the specified grid. + template + typename util::enable_if::hasDeviceDual, NanoGrid*>::type + deviceGrid(uint32_t n=0){return const_cast*>(static_cast(this)->template deviceGrid(n));} + + /// @brief Upload the grid to the device, e.g. from CPU to GPU + /// @note This method is only available if the buffer supports devices + template + typename util::enable_if::hasDeviceDual, void>::type + deviceUpload(void* stream = nullptr, bool sync = true) { mBuffer.deviceUpload(stream, sync); } + + /// @brief Download the grid to from the device, e.g. 
from GPU to CPU + /// @note This method is only available if the buffer supports devices + template + typename util::enable_if::hasDeviceDual, void>::type + deviceDownload(void* stream = nullptr, bool sync = true) { mBuffer.deviceDownload(stream, sync); } + + /// @brief Check if the buffer is this handle has any padding, i.e. if the buffer is larger than the combined size of all its grids + /// @return true is the combined size of all grid is smaller than the buffer size + bool isPadded() const {return mMetaData.empty() ? false : mMetaData.back().offset + mMetaData.back().size != mBuffer.size();} + + /// @brief Return the total number of grids contained in this buffer + uint32_t gridCount() const {return static_cast(mMetaData.size());} + + /// @brief Return the grid size of the @a n'th grid in this GridHandle + /// @param n index of the grid (assumed to be less than gridCount()) + /// @return Return the byte size of the specified grid + uint64_t gridSize(uint32_t n = 0) const {return mMetaData[n].size; } + + /// @brief Return the GridType of the @a n'th grid in this GridHandle + /// @param n index of the grid (assumed to be less than gridCount()) + /// @return Return the GridType of the specified grid + GridType gridType(uint32_t n = 0) const {return mMetaData[n].gridType; } + + /// @brief Access to the GridData of the n'th grid in the current handle + /// @param n zero-based ID of the grid + /// @return Const pointer to the n'th GridData in the current handle + const GridData* gridData(uint32_t n = 0) const; + + /// @brief Returns a const point to the @a n'th grid meta data + /// @param n zero-based ID of the grid + /// @warning Note that the return pointer can be NULL if the GridHandle was not initialized + const GridMetaData* gridMetaData(uint32_t n = 0) const; + + /// @brief Write a specific grid in this buffer to an output stream + /// @param os output stream that the buffer will be written to + /// @param n zero-based index of the grid to be written to stream + void write(std::ostream& os, uint32_t n) const { + if (const GridData* data = this->gridData(n)) { + os.write((const char*)data, data->mGridSize); + } else { + throw std::runtime_error("GridHandle does not contain a #" + std::to_string(n) + " grid"); + } + } + + /// @brief Write the entire grid buffer to an output stream + /// @param os output stream that the buffer will be written to + void write(std::ostream& os) const { + for (uint32_t n=0; ngridCount(); ++n) this->write(os, n); + } + + /// @brief Write this entire grid buffer to a file + /// @param fileName string name of the output file + void write(const std::string &fileName) const { + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); + if (!os.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); + this->write(os); + } + + /// @brief Write a specific grid to file + /// @param fileName string name of the output file + /// @param n zero-based index of the grid to be written to file + void write(const std::string &fileName, uint32_t n) const { + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); + if (!os.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for output"); + this->write(os, n); + } + + /// @brief Read an entire raw grid buffer from an input stream + /// @param is input stream containing a raw grid buffer + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a 
std::logic_error if the stream does not contain a valid raw grid + void read(std::istream& is, const BufferT& pool = BufferT()); + + /// @brief Read a specific grid from an input stream containing a raw grid buffer + /// @param is input stream containing a raw grid buffer + /// @param n zero-based index of the grid to be read + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::logic_error if the stream does not contain a valid raw grid + void read(std::istream& is, uint32_t n, const BufferT& pool = BufferT()); + + /// @brief Read a specific grid from an input stream containing a raw grid buffer + /// @param is input stream containing a raw grid buffer + /// @param gridName string name of the grid to be read + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::logic_error if the stream does not contain a valid raw grid with the speficied name + void read(std::istream& is, const std::string &gridName, const BufferT& pool = BufferT()); + + /// @brief Read a raw grid buffer from a file + /// @param filename string name of the input file containing a raw grid buffer + /// @param pool optional pool from which to allocate the new grid buffe + void read(const std::string &fileName, const BufferT& pool = BufferT()) { + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + this->read(is, pool); + } + + /// @brief Read a specific grid from a file containing a raw grid buffer + /// @param filename string name of the input file containing a raw grid buffer + /// @param n zero-based index of the grid to be read + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::ios_base::failure if the file does not exist and a + /// std::logic_error if the files does not contain a valid raw grid + void read(const std::string &fileName, uint32_t n, const BufferT& pool = BufferT()) { + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + this->read(is, n, pool); + } + + /// @brief Read a specific grid from a file containing a raw grid buffer + /// @param filename string name of the input file containing a raw grid buffer + /// @param gridName string name of the grid to be read + /// @param pool optional pool from which to allocate the new grid buffer + /// @throw Will throw a std::ios_base::failure if the file does not exist and a + /// std::logic_error if the files does not contain a valid raw grid withe the specified name + void read(const std::string &fileName, const std::string &gridName, const BufferT& pool = BufferT()) { + std::ifstream is(fileName, std::ios::in | std::ios::binary); + if (!is.is_open()) throw std::ios_base::failure("Unable to open file named \"" + fileName + "\" for input"); + this->read(is, gridName, pool); + } +}; // GridHandle + +// --------------------------> Implementation of private methods in GridHandle <------------------------------------ + +template +inline const GridData* GridHandle::gridData(uint32_t n) const +{ + const void *data = this->data(); + if (data == nullptr || n >= mMetaData.size()) return nullptr; + return util::PtrAdd(data, mMetaData[n].offset); +}// const GridData* GridHandle::gridData(uint32_t n) const + +template +inline const GridMetaData* GridHandle::gridMetaData(uint32_t 
n) const +{ + const auto *data = this->data(); + if (data == nullptr || n >= mMetaData.size()) return nullptr; + return util::PtrAdd(data, mMetaData[n].offset); +}// const GridMetaData* GridHandle::gridMetaData(uint32_t n) const + +inline __hostdev__ void cpyGridHandleMeta(const GridData *data, GridHandleMetaData *meta) +{ + uint64_t offset = 0; + for (auto *p=meta, *q=p+data->mGridCount; p!=q; ++p) { + *p = {offset, data->mGridSize, data->mGridType}; + offset += p->size; + data = util::PtrAdd(data, p->size); + } +}// void cpyGridHandleMeta(const GridData *data, GridHandleMetaData *meta) + +template +template::hasDeviceDual, int>::type> +GridHandle::GridHandle(T&& buffer) +{ + static_assert(util::is_same::value, "Expected U==BufferT"); + mBuffer = std::move(buffer); + if (auto *data = reinterpret_cast(mBuffer.data())) { + if (!data->isValid()) throw std::runtime_error("GridHandle was constructed with an invalid host buffer"); + mMetaData.resize(data->mGridCount); + cpyGridHandleMeta(data, mMetaData.data()); + } +}// GridHandle::GridHandle(T&& buffer) + +template +template +inline GridHandle GridHandle::copy(const OtherBufferT& other) const +{ + if (mBuffer.isEmpty()) return GridHandle();// return an empty handle + auto buffer = OtherBufferT::create(mBuffer.size(), &other); + std::memcpy(buffer.data(), mBuffer.data(), mBuffer.size());// deep copy of buffer + return GridHandle(std::move(buffer)); +}// GridHandle GridHandle::copy(const OtherBufferT& other) const + +template +template +inline const NanoGrid* GridHandle::grid(uint32_t n) const +{ + const void *data = mBuffer.data(); + if (data == nullptr || n >= mMetaData.size() || mMetaData[n].gridType != toGridType()) return nullptr; + return util::PtrAdd>(data, mMetaData[n].offset); +}// const NanoGrid* GridHandle::grid(uint32_t n) const + +template +template +inline typename util::enable_if::hasDeviceDual, const NanoGrid*>::type +GridHandle::deviceGrid(uint32_t n) const +{ + const void *data = mBuffer.deviceData(); + if (data == nullptr || n >= mMetaData.size() || mMetaData[n].gridType != toGridType()) return nullptr; + return util::PtrAdd>(data, mMetaData[n].offset); +}// GridHandle::deviceGrid(uint32_t n) cons + + +} // namespace nanovdb + +#if defined(__CUDACC__) +#include +#endif// defined(__CUDACC__) + +#endif // NANOVDB_GRID_HANDLE_H_HAS_BEEN_INCLUDED diff --git a/warp/native/nanovdb/HostBuffer.h b/warp/native/nanovdb/HostBuffer.h new file mode 100644 index 00000000..c664856a --- /dev/null +++ b/warp/native/nanovdb/HostBuffer.h @@ -0,0 +1,590 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + @file nanovdb/HostBuffer.h + + @date April 20, 2021 + + @brief HostBuffer - a buffer that contains a shared or private bump + pool to either externally or internally managed host memory. + + @details This HostBuffer can be used in multiple ways, most of which are + demonstrated in the examples below. Memory in the pool can + be managed or unmanged (e.g. internal or external) and can + be shared between multiple buffers or belong to a single buffer. + + Example that uses HostBuffer::create inside io::readGrids to create a + full self-managed buffer, i.e. not shared and without padding, per grid in the file. + @code + auto handles = nanovdb::io::readGrids("file.nvdb"); + @endcode + + Example that uses HostBuffer::createFull. 
Assuming you have a raw pointer + to a NanoVDB grid of unknown type, this examples shows how to create its + GridHandle which can be used to enquire about the grid type and meta data. + @code + void *data;// pointer to a NanoVDB grid of unknown type + uint64_t size;// byte size of NanoVDB grid of unknown type + auto buffer = nanovdb::HostBuffer::createFull(size, data); + nanovdb::GridHandle<> gridHandle(std::move(buffer)); + @endcode + + Example that uses HostBuffer::createPool for internally managed host memory. + Suppose you want to read multiple grids in multiple files, but reuse the same + fixed sized memory buffer to both avoid memory fragmentation as well as + exceeding the fixed memory ceiling! + @code + auto pool = nanovdb::HostBuffer::createPool(1 << 30);// 1 GB memory pool + std::vector> frames;// vector of grid names + for (int i=0; i array(new char[size + NANOVDB_DATA_ALIGNMENT]);// scoped pool of 1 GB with padding + void *buffer = nanovdb::alignPtr(array.get());// 32B aligned buffer + auto pool = nanovdb::HostBuffer::createPool(poolSize, buffer); + auto handles = nanovdb::io::readGrids("file.nvdb", 0, pool); + @endcode +*/ + +#ifndef NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED +#define NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED + +#include // for NANOVDB_DATA_ALIGNMENT; +#include // for types like int32_t etc +#include // for fprintf +#include // for std::malloc/std::realloc/std::free +#include // for std::make_shared +#include // for std::mutex +#include // for std::unordered_set +#include // for assert +#include // for std::stringstream +#include // for memcpy + +#define checkPtr(ptr, msg) \ + { \ + ptrAssert((ptr), (msg), __FILE__, __LINE__); \ + } + +namespace nanovdb { + +template +struct BufferTraits +{ + static constexpr bool hasDeviceDual = false; +}; + +// ----------------------------> HostBuffer <-------------------------------------- + +/// @brief This is a buffer that contains a shared or private pool +/// to either externally or internally managed host memory. +/// +/// @note Terminology: +/// Pool: 0 = buffer.size() < buffer.poolSize() +/// Buffer: 0 < buffer.size() < buffer.poolSize() +/// Full: 0 < buffer.size() = buffer.poolSize() +/// Empty: 0 = buffer.size() = buffer.poolSize() +class HostBuffer +{ + struct Pool;// forward declaration of private pool struct + std::shared_ptr mPool; + uint64_t mSize; // total number of bytes for the NanoVDB grid. + void* mData; // raw buffer for the NanoVDB grid. 
+ +#if defined(DEBUG) || defined(_DEBUG) + static inline void ptrAssert(void* ptr, const char* msg, const char* file, int line, bool abort = true) + { + if (ptr == nullptr) { + fprintf(stderr, "NULL pointer error: %s %s %d\n", msg, file, line); + if (abort) + exit(1); + } + if (uint64_t(ptr) % NANOVDB_DATA_ALIGNMENT) { + fprintf(stderr, "Alignment pointer error: %s %s %d\n", msg, file, line); + if (abort) + exit(1); + } + } +#else + static inline void ptrAssert(void*, const char*, const char*, int, bool = true) + { + } +#endif + +public: + /// @brief Return a full buffer or an empty buffer + HostBuffer(uint64_t bufferSize = 0); + + /// @brief Move copy-constructor + HostBuffer(HostBuffer&& other); + + /// @brief Custom descructor + ~HostBuffer() { this->clear(); } + + /// @brief Move copy assignment operation + HostBuffer& operator=(HostBuffer&& other); + + /// @brief Disallow copy-construction + HostBuffer(const HostBuffer&) = delete; + + /// @brief Disallow copy assignment operation + HostBuffer& operator=(const HostBuffer&) = delete; + + /// @brief Return a pool buffer which satisfies: buffer.size == 0, + /// buffer.poolSize() == poolSize, and buffer.data() == nullptr. + /// If data==nullptr, memory for the pool will be allocated. + /// + /// @throw If poolSize is zero. + static HostBuffer createPool(uint64_t poolSize, void *data = nullptr); + + /// @brief Return a full buffer which satisfies: buffer.size == bufferSize, + /// buffer.poolSize() == bufferSize, and buffer.data() == data. + /// If data==nullptr, memory for the pool will be allocated. + /// + /// @throw If bufferSize is zero. + static HostBuffer createFull(uint64_t bufferSize, void *data = nullptr); + + /// @brief Return a buffer with @c bufferSize bytes managed by + /// the specified memory @c pool. If none is provided, i.e. + /// @c pool == nullptr or @c pool->poolSize() == 0, one is + /// created with size @c bufferSize, i.e. a full buffer is returned. + /// + /// @throw If the specified @c pool has insufficient memory for + /// the requested buffer size. + static HostBuffer create(uint64_t bufferSize, const HostBuffer* pool = nullptr); + + /// @brief Initialize as a full buffer with the specified size. If data is NULL + /// the memory is internally allocated. + void init(uint64_t bufferSize, void *data = nullptr); + + //@{ + /// @brief Retuns a pointer to the raw memory buffer managed by this allocator. + /// + /// @warning Note that the pointer can be NULL if the allocator was not initialized! + const void* data() const { return mData; } + void* data() { return mData; } + //@} + + //@{ + /// @brief Returns the size in bytes associated with this buffer. + uint64_t bufferSize() const { return mSize; } + uint64_t size() const { return this->bufferSize(); } + //@} + + /// @brief Returns the size in bytes of the memory pool shared with this instance. + uint64_t poolSize() const; + + /// @brief Return true if memory is managed (using std::malloc and std:free) by the + /// shared pool in this buffer. Else memory is assumed to be managed externally. + bool isManaged() const; + + //@{ + /// @brief Returns true if this buffer has no memory associated with it + bool isEmpty() const { return !mPool || mSize == 0 || mData == nullptr; } + bool empty() const { return this->isEmpty(); } + //@} + + /// @brief Return true if this is a pool, i.e. an empty buffer with a nonempty + /// internal pool, i.e. 
this->size() == 0 and this->poolSize() != 0 + bool isPool() const { return mSize == 0 && this->poolSize() > 0; } + + /// @brief Return true if the pool exists, is nonempty but has no more available memory + bool isFull() const; + + /// @brief Clear this buffer so it is empty. + void clear(); + + /// @brief Clears all existing buffers that are registered against the memory pool + /// and resets the pool so it can be reused to create new buffers. + /// + /// @throw If this instance is not empty or contains no pool. + /// + /// @warning This method is not thread-safe! + void reset(); + + /// @brief Total number of bytes from the pool currently in use by buffers + uint64_t poolUsage() const; + + /// @brief resize the pool size. It will attempt to resize the existing + /// memory block, but if that fails a deep copy is performed. + /// If @c data is not NULL it will be used as new externally + /// managed memory for the pool. All registered buffers are + /// updated so GridHandle::grid might return a new address (if + /// deep copy was performed). + /// + /// @note This method can be use to resize the memory pool and even + /// change it from internally to externally managed memory or vice versa. + /// + /// @throw if @c poolSize is less than this->poolUsage() the used memory + /// or allocations fail. + void resizePool(uint64_t poolSize, void *data = nullptr); + +}; // HostBuffer class + +// --------------------------> Implementation of HostBuffer::Pool <------------------------------------ + +// This is private struct of HostBuffer so you can safely ignore the API +struct HostBuffer::Pool +{ + using HashTableT = std::unordered_set; + std::mutex mMutex; // mutex for updating mRegister and mFree + HashTableT mRegister; + void *mData, *mFree; + uint64_t mSize, mPadding; + bool mManaged; + + /// @brief External memory ctor + Pool(uint64_t size = 0, void* data = nullptr) + : mData(data) + , mFree(mData) + , mSize(size) + , mPadding(0) + , mManaged(data == nullptr) + { + if (mManaged) { + mData = Pool::alloc(mSize); + if (mData == nullptr) throw std::runtime_error("Pool::Pool malloc failed"); + } + mPadding = alignmentPadding(mData); + if (!mManaged && mPadding != 0) { + throw std::runtime_error("Pool::Pool: external memory buffer is not aligned to " + + std::to_string(NANOVDB_DATA_ALIGNMENT) + + " bytes.\nHint: use nanovdb::alignPtr or std::aligned_alloc (C++17 only)"); + } + mFree = util::PtrAdd(mData, mPadding); + } + + /// @brief Custom destructor + ~Pool() + { + assert(mRegister.empty()); + if (mManaged) std::free(mData); + } + + /// @brief Disallow copy-construction + Pool(const Pool&) = delete; + + /// @brief Disallow move-construction + Pool(const Pool&&) = delete; + + /// @brief Disallow copy assignment operation + Pool& operator=(const Pool&) = delete; + + /// @brief Disallow move assignment operation + Pool& operator=(const Pool&&) = delete; + + /// @brief Return the total number of bytes used from this Pool by buffers + uint64_t usage() const { return util::PtrDiff(mFree, mData) - mPadding; } + + /// @brief Allocate a buffer of the specified size and add it to the register + void add(HostBuffer* buffer, uint64_t size) + { + void *alignedFree = util::PtrAdd(mFree, alignmentPadding(mFree)); + + if (util::PtrAdd(alignedFree, size) > util::PtrAdd(mData, mPadding + mSize)) { + std::stringstream ss; + ss << "HostBuffer::Pool: insufficient memory\n" + << "\tA buffer requested " << size << " bytes with " << NANOVDB_DATA_ALIGNMENT + << "-bytes alignment from a pool with " + << mSize << " bytes 
of which\n\t" << (util::PtrDiff(alignedFree, mData) - mPadding) + << " bytes are used by " << mRegister.size() << " other buffer(s). " + << "Pool is " << (mManaged ? "internally" : "externally") << " managed.\n"; + //std::cerr << ss.str(); + throw std::runtime_error(ss.str()); + } + buffer->mSize = size; + const std::lock_guard lock(mMutex); + mRegister.insert(buffer); + buffer->mData = alignedFree; + mFree = util::PtrAdd(alignedFree, size); + } + + /// @brief Remove the specified buffer from the register + void remove(HostBuffer *buffer) + { + const std::lock_guard lock(mMutex); + mRegister.erase(buffer); + } + + /// @brief Replaces buffer1 with buffer2 in the register + void replace(HostBuffer *buffer1, HostBuffer *buffer2) + { + const std::lock_guard lock(mMutex); + mRegister.erase( buffer1); + mRegister.insert(buffer2); + } + + /// @brief Reset the register and all its buffers + void reset() + { + for (HostBuffer *buffer : mRegister) { + buffer->mPool.reset(); + buffer->mSize = 0; + buffer->mData = nullptr; + } + mRegister.clear(); + mFree = util::PtrAdd(mData, mPadding); + } + + /// @brief Resize this Pool and update registered buffers as needed. If data is no NULL + /// it is used as externally managed memory. + void resize(uint64_t size, void *data = nullptr) + { + const uint64_t memUsage = this->usage(); + + const bool managed = (data == nullptr); + + if (!managed && alignmentPadding(data) != 0) { + throw std::runtime_error("Pool::resize: external memory buffer is not aligned to " + + std::to_string(NANOVDB_DATA_ALIGNMENT) + " bytes"); + } + + if (memUsage > size) { + throw std::runtime_error("Pool::resize: insufficient memory"); + } + + uint64_t padding = 0; + if (mManaged && managed && size != mSize) { // managed -> managed + padding = mPadding; + data = Pool::realloc(mData, memUsage, size, padding); // performs both copy and free of mData + } else if (!mManaged && managed) { // un-managed -> managed + data = Pool::alloc(size); + padding = alignmentPadding(data); + } + + if (data == nullptr) { + throw std::runtime_error("Pool::resize: allocation failed"); + } else if (data != mData) { + void* paddedData = util::PtrAdd(data, padding); + + if (!(mManaged && managed)) { // no need to copy if managed -> managed + memcpy(paddedData, util::PtrAdd(mData, mPadding), memUsage); + } + + for (HostBuffer* buffer : mRegister) { // update registered buffers + //buffer->mData = paddedData + ptrdiff_t(buffer->mData - (mData + mPadding)); + buffer->mData = util::PtrAdd(paddedData, util::PtrDiff(buffer->mData, util::PtrAdd(mData, mPadding))); + } + mFree = util::PtrAdd(paddedData, memUsage); // update the free pointer + if (mManaged && !managed) {// only free if managed -> un-managed + std::free(mData); + } + + mData = data; + mPadding = padding; + } + mSize = size; + mManaged = managed; + } + /// @brief Return true is all the memory in this pool is in use. + bool isFull() const + { + assert(mFree <= util::PtrAdd(mData, mPadding + mSize)); + return mSize > 0 ? 
mFree == util::PtrAdd(mData, mPadding + mSize) : false; + } + +private: + + static void* alloc(uint64_t size) + { +//#if (__cplusplus >= 201703L) +// return std::aligned_alloc(NANOVDB_DATA_ALIGNMENT, size);//C++17 or newer +//#else + // make sure we alloc enough space to align the result + return std::malloc(size + NANOVDB_DATA_ALIGNMENT); +//#endif + } + + static void* realloc(void* const origData, + uint64_t origSize, + uint64_t desiredSize, + uint64_t& padding) + { + // make sure we alloc enough space to align the result + void* data = std::realloc(origData, desiredSize + NANOVDB_DATA_ALIGNMENT); + + if (data != nullptr && data != origData) { + uint64_t newPadding = alignmentPadding(data); + // Number of padding bytes may have changed -- move data if that's the case + if (newPadding != padding) { + // Realloc should not happen when shrinking down buffer, but let's be safe + std::memmove(util::PtrAdd(data, newPadding), + util::PtrAdd(data, padding), + math::Min(origSize, desiredSize)); + padding = newPadding; + } + } + + return data; + } + +};// struct HostBuffer::Pool + +// --------------------------> Implementation of HostBuffer <------------------------------------ + +inline HostBuffer::HostBuffer(uint64_t size) : mPool(nullptr), mSize(size), mData(nullptr) +{ + if (size>0) { + mPool = std::make_shared(size); + mData = mPool->mFree; + mPool->mRegister.insert(this); + mPool->mFree = util::PtrAdd(mPool->mFree, size); + } +} + +inline HostBuffer::HostBuffer(HostBuffer&& other) : mPool(other.mPool), mSize(other.mSize), mData(other.mData) +{ + if (mPool && mSize != 0) { + mPool->replace(&other, this); + } + other.mPool.reset(); + other.mSize = 0; + other.mData = nullptr; +} + +inline void HostBuffer::init(uint64_t bufferSize, void *data) +{ + if (bufferSize == 0) { + throw std::runtime_error("HostBuffer: invalid buffer size"); + } + if (mPool) { + mPool.reset(); + } + if (!mPool || mPool->mSize != bufferSize) { + mPool = std::make_shared(bufferSize, data); + } + mPool->add(this, bufferSize); +} + +inline HostBuffer& HostBuffer::operator=(HostBuffer&& other) +{ + if (mPool) { + mPool->remove(this); + } + mPool = other.mPool; + mSize = other.mSize; + mData = other.mData; + if (mPool && mSize != 0) { + mPool->replace(&other, this); + } + other.mPool.reset(); + other.mSize = 0; + other.mData = nullptr; + return *this; +} + +inline uint64_t HostBuffer::poolSize() const +{ + return mPool ? mPool->mSize : 0u; +} + +inline uint64_t HostBuffer::poolUsage() const +{ + return mPool ? mPool->usage(): 0u; +} + +inline bool HostBuffer::isManaged() const +{ + return mPool ? mPool->mManaged : false; +} + +inline bool HostBuffer::isFull() const +{ + return mPool ? 
mPool->isFull() : false; +} + +inline HostBuffer HostBuffer::createPool(uint64_t poolSize, void *data) +{ + if (poolSize == 0) { + throw std::runtime_error("HostBuffer: invalid pool size"); + } + HostBuffer buffer; + buffer.mPool = std::make_shared(poolSize, data); + // note the buffer is NOT registered by its pool since it is not using its memory + buffer.mSize = 0; + buffer.mData = nullptr; + return buffer; +} + +inline HostBuffer HostBuffer::createFull(uint64_t bufferSize, void *data) +{ + if (bufferSize == 0) { + throw std::runtime_error("HostBuffer: invalid buffer size"); + } + HostBuffer buffer; + buffer.mPool = std::make_shared(bufferSize, data); + buffer.mPool->add(&buffer, bufferSize); + return buffer; +} + +inline HostBuffer HostBuffer::create(uint64_t bufferSize, const HostBuffer* pool) +{ + HostBuffer buffer; + if (pool == nullptr || !pool->mPool) { + buffer.mPool = std::make_shared(bufferSize); + } else { + buffer.mPool = pool->mPool; + } + buffer.mPool->add(&buffer, bufferSize); + return buffer; +} + +inline void HostBuffer::clear() +{ + if (mPool) {// remove self from the buffer register in the pool + mPool->remove(this); + } + mPool.reset(); + mSize = 0; + mData = nullptr; +} + +inline void HostBuffer::reset() +{ + if (this->size()>0) { + throw std::runtime_error("HostBuffer: only empty buffers can call reset"); + } + if (!mPool) { + throw std::runtime_error("HostBuffer: this buffer contains no pool to reset"); + } + mPool->reset(); +} + +inline void HostBuffer::resizePool(uint64_t size, void *data) +{ + if (!mPool) { + throw std::runtime_error("HostBuffer: this buffer contains no pool to resize"); + } + mPool->resize(size, data); +} + +} // namespace nanovdb + +#endif // end of NANOVDB_HOSTBUFFER_H_HAS_BEEN_INCLUDED diff --git a/warp/native/nanovdb/NanoVDB.h b/warp/native/nanovdb/NanoVDB.h index 2cbf6564..53265ca9 100644 --- a/warp/native/nanovdb/NanoVDB.h +++ b/warp/native/nanovdb/NanoVDB.h @@ -2,7 +2,7 @@ // SPDX-License-Identifier: MPL-2.0 /*! - \file NanoVDB.h + \file nanovdb/NanoVDB.h \author Ken Museth @@ -29,7 +29,7 @@ structure can safely be ignored by most client codes)! - \warning NanoVDB grids can only be constructed via tools like openToNanoVDB + \warning NanoVDB grids can only be constructed via tools like createNanoGrid or the GridBuilder. This explains why none of the grid nodes defined below have public constructors or destructors. @@ -38,6 +38,8 @@ ACM Transactions on Graphics 32(3), 2013, which can be found here: http://www.museth.org/Ken/Publications_files/Museth_TOG13.pdf + NanoVDB was first published there: https://dl.acm.org/doi/fullHtml/10.1145/3450623.3464653 + Overview: This file implements the following fundamental class that when combined forms the backbone of the VDB tree data structure: @@ -64,8 +66,15 @@ Memory layout: + It's important to emphasize that all the grid data (defined below) are explicitly 32 byte + aligned, which implies that any memory buffer that contains a NanoVDB grid must also be at + 32 byte aligned. That is, the memory address of the beginning of a buffer (see ascii diagram below) + must be divisible by 32, i.e. uintptr_t(&buffer)%32 == 0! If this is not the case, the C++ standard + says the behaviour is undefined! Normally this is not a concerns on GPUs, because they use 256 byte + aligned allocations, but the same cannot be said about the CPU. + GridData is always at the very beginning of the buffer immediately followed by TreeData! 
- The remaining nodes and blind-data are allowed to be scattered thoughout the buffer, + The remaining nodes and blind-data are allowed to be scattered throughout the buffer, though in practice they are arranged as: GridData: 672 bytes (e.g. magic, checksum, major, flags, index, count, size, name, map, world bbox, voxel size, class, type, offset, count) @@ -91,214 +100,315 @@ Array of: LeafNodes of size 8^3: bbox, bit masks, 512 voxel values, and min/max/avg/standard deviation values - Example layout: ("---" implies it has a custom offset, "..." implies zero or more) - [GridData(672B)][TreeData(64B)]---[RootData][N x Root::Tile]---[NodeData<5>]---[ModeData<4>]---[LeafData<3>]---[BLINDMETA...]---[BLIND0]---[BLIND1]---etc. + Notation: "]---[" implies it has optional padding, and "][" implies zero padding + + [GridData(672B)][TreeData(64B)]---[RootData][N x Root::Tile]---[InternalData<5>]---[InternalData<4>]---[LeafData<3>]---[BLINDMETA...]---[BLIND0]---[BLIND1]---etc. + ^ ^ ^ ^ ^ ^ + | | | | | | + +-- Start of 32B aligned buffer | | | | +-- Node0::DataType* leafData + GridType::DataType* gridData | | | | + | | | +-- Node1::DataType* lowerData + RootType::DataType* rootData --+ | | + | +-- Node2::DataType* upperData + | + +-- RootType::DataType::Tile* tile */ #ifndef NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED #define NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED -#define NANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL // "NanoVDB0" in hex - little endian (uint64_t) +// The following two header files are the only mandatory dependencies +#include // for __hostdev__ and lots of other utility functions +#include // for Coord, BBox, Vec3, Vec4 etc + +// Do not change this value! 32 byte alignment is fixed in NanoVDB +#define NANOVDB_DATA_ALIGNMENT 32 + +// NANOVDB_MAGIC_NUMB is currently used for both grids and files (starting with v32.6.0) +// NANOVDB_MAGIC_GRID will soon be used exclusively for grids (serialized to a single buffer) +// NANOVDB_MAGIC_FILE will soon be used exclusively for files +// NANOVDB_MAGIC_NODE will soon be used exclusively for NodeManager +// NANOVDB_MAGIC_FRAG will soon be used exclusively for a fragmented grid, i.e. 
a grid that is not serialized +// | : 0 in 30 corresponds to 0 in NanoVDB0 +#define NANOVDB_MAGIC_NUMB 0x304244566f6e614eUL // "NanoVDB0" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_GRID 0x314244566f6e614eUL // "NanoVDB1" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_FILE 0x324244566f6e614eUL // "NanoVDB2" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_NODE 0x334244566f6e614eUL // "NanoVDB3" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_FRAG 0x344244566f6e614eUL // "NanoVDB4" in hex - little endian (uint64_t) +#define NANOVDB_MAGIC_MASK 0x00FFFFFFFFFFFFFFUL // use this mask to remove the number + +//#define NANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL +//#define NANOVDB_USE_NEW_MAGIC_NUMBERS// used to enable use of the new magic numbers described above #define NANOVDB_MAJOR_VERSION_NUMBER 32 // reflects changes to the ABI and hence also the file format -#define NANOVDB_MINOR_VERSION_NUMBER 3 // reflects changes to the API but not ABI -#define NANOVDB_PATCH_VERSION_NUMBER 3 // reflects changes that does not affect the ABI or API +#define NANOVDB_MINOR_VERSION_NUMBER 7 // reflects changes to the API but not ABI +#define NANOVDB_PATCH_VERSION_NUMBER 0 // reflects changes that does not affect the ABI or API + +#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 // This replaces a Coord key at the root level with a single uint64_t -#define USE_SINGLE_ROOT_KEY +#define NANOVDB_USE_SINGLE_ROOT_KEY // This replaces three levels of Coord keys in the ReadAccessor with one Coord -//#define USE_SINGLE_ACCESSOR_KEY +//#define NANOVDB_USE_SINGLE_ACCESSOR_KEY -#define NANOVDB_FPN_BRANCHLESS +// Use this to switch between std::ofstream or FILE implementations +//#define NANOVDB_USE_IOSTREAMS -#define NANOVDB_DATA_ALIGNMENT 32 +// Use this to switch between old and new accessor methods +#define NANOVDB_NEW_ACCESSOR_METHODS + +#define NANOVDB_FPN_BRANCHLESS #if !defined(NANOVDB_ALIGN) #define NANOVDB_ALIGN(n) alignas(n) #endif // !defined(NANOVDB_ALIGN) -#ifdef __CUDACC_RTC__ - -typedef signed char int8_t; -typedef short int16_t; -typedef int int32_t; -typedef long long int64_t; -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned short uint16_t; -typedef unsigned long long uint64_t; +namespace nanovdb {// ================================================================= -#define NANOVDB_ASSERT(x) +// --------------------------> Build types <------------------------------------ -#define UINT64_C(x) (x ## ULL) +/// @brief Dummy type for a voxel whose value equals an offset into an external value array +class ValueIndex{}; -#else // __CUDACC_RTC__ +/// @brief Dummy type for a voxel whose value equals an offset into an external value array of active values +class ValueOnIndex{}; -#include // for abs in clang7 -#include // for types like int32_t etc -#include // for size_t type -#include // for assert -#include // for sprinf -#include // for sqrt and fma -#include // for numeric_limits +/// @brief Like @c ValueIndex but with a mutable mask +class ValueIndexMask{}; -// All asserts can be disabled here, even for debug builds -#if 1 -#define NANOVDB_ASSERT(x) assert(x) -#else -#define NANOVDB_ASSERT(x) -#endif +/// @brief Like @c ValueOnIndex but with a mutable mask +class ValueOnIndexMask{}; -#if defined(NANOVDB_USE_INTRINSICS) && defined(_MSC_VER) -#include -#pragma intrinsic(_BitScanReverse) -#pragma intrinsic(_BitScanForward) -#pragma intrinsic(_BitScanReverse64) -#pragma intrinsic(_BitScanForward64) -#endif +/// @brief Dummy type for a voxel whose 
value equals its binary active state +class ValueMask{}; -#endif // __CUDACC_RTC__ - -#if defined(__CUDACC__) || defined(__HIP__) -// Only define __hostdev__ when using NVIDIA CUDA or HIP compiler -#define __hostdev__ __host__ __device__ -#else -#define __hostdev__ -#endif - -// The following macro will suppress annoying warnings when nvcc -// compiles functions that call (host) intrinsics (which is perfectly valid) -#if defined(_MSC_VER) && defined(__CUDACC__) -#define NANOVDB_HOSTDEV_DISABLE_WARNING __pragma("hd_warning_disable") -#elif defined(__GNUC__) && defined(__CUDACC__) -#define NANOVDB_HOSTDEV_DISABLE_WARNING _Pragma("hd_warning_disable") -#else -#define NANOVDB_HOSTDEV_DISABLE_WARNING -#endif - -// A portable implementation of offsetof - unfortunately it doesn't work with static_assert -#define NANOVDB_OFFSETOF(CLASS, MEMBER) ((int)(size_t)((char*)&((CLASS*)0)->MEMBER - (char*)0)) - -namespace nanovdb { - -// --------------------------> Build types <------------------------------------ - -/// @brief Dummy type for a voxel with a binary mask value, e.g. the active state -class ValueMask {}; - -/// @brief Dummy type for a 16 bit floating point values -class Half {}; +/// @brief Dummy type for a 16 bit floating point values (placeholder for IEEE 754 Half) +class Half{}; /// @brief Dummy type for a 4bit quantization of float point values -class Fp4 {}; +class Fp4{}; /// @brief Dummy type for a 8bit quantization of float point values -class Fp8 {}; +class Fp8{}; /// @brief Dummy type for a 16bit quantization of float point values -class Fp16 {}; +class Fp16{}; /// @brief Dummy type for a variable bit quantization of floating point values -class FpN {}; +class FpN{}; + +/// @brief Dummy type for indexing points into voxels +class Point{}; // --------------------------> GridType <------------------------------------ +/// @brief return the number of characters (including null termination) required to convert enum type to a string +template +__hostdev__ inline constexpr uint32_t strlen(){return (uint32_t)EnumT::StrLen - (uint32_t)EnumT::End;} + /// @brief List of types that are currently supported by NanoVDB /// /// @note To expand on this list do: /// 1) Add the new type between Unknown and End in the enum below /// 2) Add the new type to OpenToNanoVDB::processGrid that maps OpenVDB types to GridType /// 3) Verify that the ConvertTrait in NanoToOpenVDB.h works correctly with the new type -/// 4) Add the new type to mapToGridType (defined below) that maps NanoVDB types to GridType +/// 4) Add the new type to toGridType (defined below) that maps NanoVDB types to GridType /// 5) Add the new type to toStr (defined below) -enum class GridType : uint32_t { Unknown = 0, - Float = 1, // single precision floating point value - Double = 2,// double precision floating point value - Int16 = 3,// half precision signed integer value - Int32 = 4,// single precision signed integer value - Int64 = 5,// double precision signed integer value - Vec3f = 6,// single precision floating 3D vector - Vec3d = 7,// double precision floating 3D vector - Mask = 8,// no value, just the active state - Half = 9,// half precision floating point value - UInt32 = 10,// single precision unsigned integer value - Boolean = 11,// boolean value, encoded in bit array - RGBA8 = 12,// RGBA packed into 32bit word in reverse-order. R in low bits. 
- Fp4 = 13,// 4bit quantization of float point value - Fp8 = 14,// 8bit quantization of float point value - Fp16 = 15,// 16bit quantization of float point value - FpN = 16,// variable bit quantization of floating point value - Vec4f = 17,// single precision floating 4D vector - Vec4d = 18,// double precision floating 4D vector - End = 19 }; - -#ifndef __CUDACC_RTC__ -/// @brief Retuns a c-string used to describe a GridType -inline const char* toStr(GridType gridType) -{ - static const char * LUT[] = { "?", "float", "double" , "int16", "int32", - "int64", "Vec3f", "Vec3d", "Mask", "Half", - "uint32", "bool", "RGBA8", "Float4", "Float8", - "Float16", "FloatN", "Vec4f", "Vec4d", "End" }; - static_assert( sizeof(LUT)/sizeof(char*) - 1 == int(GridType::End), "Unexpected size of LUT" ); - return LUT[static_cast(gridType)]; +enum class GridType : uint32_t { Unknown = 0, // unknown value type - should rarely be used + Float = 1, // single precision floating point value + Double = 2, // double precision floating point value + Int16 = 3, // half precision signed integer value + Int32 = 4, // single precision signed integer value + Int64 = 5, // double precision signed integer value + Vec3f = 6, // single precision floating 3D vector + Vec3d = 7, // double precision floating 3D vector + Mask = 8, // no value, just the active state + Half = 9, // half precision floating point value (placeholder for IEEE 754 Half) + UInt32 = 10, // single precision unsigned integer value + Boolean = 11, // boolean value, encoded in bit array + RGBA8 = 12, // RGBA packed into 32bit word in reverse-order, i.e. R is lowest byte. + Fp4 = 13, // 4bit quantization of floating point value + Fp8 = 14, // 8bit quantization of floating point value + Fp16 = 15, // 16bit quantization of floating point value + FpN = 16, // variable bit quantization of floating point value + Vec4f = 17, // single precision floating 4D vector + Vec4d = 18, // double precision floating 4D vector + Index = 19, // index into an external array of active and inactive values + OnIndex = 20, // index into an external array of active values + IndexMask = 21, // like Index but with a mutable mask + OnIndexMask = 22, // like OnIndex but with a mutable mask + PointIndex = 23, // voxels encode indices to co-located points + Vec3u8 = 24, // 8bit quantization of floating point 3D vector (only as blind data) + Vec3u16 = 25, // 16bit quantization of floating point 3D vector (only as blind data) + UInt8 = 26, // 8 bit unsigned integer values (eg 0 -> 255 gray scale) + End = 27,// total number of types in this enum (excluding StrLen since it's not a type) + StrLen = End + 12};// this entry is used to determine the minimum size of c-string + +/// @brief Maps a GridType to a c-string +/// @param dst destination string of size 12 or larger +/// @param gridType GridType enum to be mapped to a string +/// @return Retuns a c-string used to describe a GridType +__hostdev__ inline char* toStr(char *dst, GridType gridType) +{ + switch (gridType){ + case GridType::Unknown: return util::strcpy(dst, "?"); + case GridType::Float: return util::strcpy(dst, "float"); + case GridType::Double: return util::strcpy(dst, "double"); + case GridType::Int16: return util::strcpy(dst, "int16"); + case GridType::Int32: return util::strcpy(dst, "int32"); + case GridType::Int64: return util::strcpy(dst, "int64"); + case GridType::Vec3f: return util::strcpy(dst, "Vec3f"); + case GridType::Vec3d: return util::strcpy(dst, "Vec3d"); + case GridType::Mask: return util::strcpy(dst, "Mask"); + case 
GridType::Half: return util::strcpy(dst, "Half"); + case GridType::UInt32: return util::strcpy(dst, "uint32"); + case GridType::Boolean: return util::strcpy(dst, "bool"); + case GridType::RGBA8: return util::strcpy(dst, "RGBA8"); + case GridType::Fp4: return util::strcpy(dst, "Float4"); + case GridType::Fp8: return util::strcpy(dst, "Float8"); + case GridType::Fp16: return util::strcpy(dst, "Float16"); + case GridType::FpN: return util::strcpy(dst, "FloatN"); + case GridType::Vec4f: return util::strcpy(dst, "Vec4f"); + case GridType::Vec4d: return util::strcpy(dst, "Vec4d"); + case GridType::Index: return util::strcpy(dst, "Index"); + case GridType::OnIndex: return util::strcpy(dst, "OnIndex"); + case GridType::IndexMask: return util::strcpy(dst, "IndexMask"); + case GridType::OnIndexMask: return util::strcpy(dst, "OnIndexMask"); + case GridType::PointIndex: return util::strcpy(dst, "PointIndex"); + case GridType::Vec3u8: return util::strcpy(dst, "Vec3u8"); + case GridType::Vec3u16: return util::strcpy(dst, "Vec3u16"); + case GridType::UInt8: return util::strcpy(dst, "uint8"); + default: return util::strcpy(dst, "End"); + } } -#endif // --------------------------> GridClass <------------------------------------ -/// @brief Classes (defined in OpenVDB) that are currently supported by NanoVDB +/// @brief Classes (superset of OpenVDB) that are currently supported by NanoVDB enum class GridClass : uint32_t { Unknown = 0, - LevelSet = 1, // narrow band level set, e.g. SDF - FogVolume = 2, // fog volume, e.g. density - Staggered = 3, // staggered MAC grid, e.g. velocity + LevelSet = 1, // narrow band level set, e.g. SDF + FogVolume = 2, // fog volume, e.g. density + Staggered = 3, // staggered MAC grid, e.g. velocity PointIndex = 4, // point index grid - PointData = 5, // point data grid + PointData = 5, // point data grid Topology = 6, // grid with active states only (no values) - VoxelVolume = 7, // volume of geometric cubes, e.g. minecraft - End = 8 }; + VoxelVolume = 7, // volume of geometric cubes, e.g. colors cubes in Minecraft + IndexGrid = 8, // grid whose values are offsets, e.g. 
into an external array + TensorGrid = 9, // Index grid for indexing learnable tensor features + End = 10,// total number of types in this enum (excluding StrLen since it's not a type) + StrLen = End + 7};// this entry is used to determine the minimum size of c-string + -#ifndef __CUDACC_RTC__ /// @brief Retuns a c-string used to describe a GridClass -inline const char* toStr(GridClass gridClass) -{ - static const char * LUT[] = { "?", "SDF", "FOG" , "MAC", "PNTIDX", - "PNTDAT", "TOPO", "VOX", "END" }; - static_assert( sizeof(LUT)/sizeof(char*) - 1 == int(GridClass::End), "Unexpected size of LUT" ); - return LUT[static_cast(gridClass)]; +/// @param dst destination string of size 7 or larger +/// @param gridClass GridClass enum to be converted to a string +__hostdev__ inline char* toStr(char *dst, GridClass gridClass) +{ + switch (gridClass){ + case GridClass::Unknown: return util::strcpy(dst, "?"); + case GridClass::LevelSet: return util::strcpy(dst, "SDF"); + case GridClass::FogVolume: return util::strcpy(dst, "FOG"); + case GridClass::Staggered: return util::strcpy(dst, "MAC"); + case GridClass::PointIndex: return util::strcpy(dst, "PNTIDX"); + case GridClass::PointData: return util::strcpy(dst, "PNTDAT"); + case GridClass::Topology: return util::strcpy(dst, "TOPO"); + case GridClass::VoxelVolume: return util::strcpy(dst, "VOX"); + case GridClass::IndexGrid: return util::strcpy(dst, "INDEX"); + case GridClass::TensorGrid: return util::strcpy(dst, "TENSOR"); + default: return util::strcpy(dst, "END"); + } } -#endif // --------------------------> GridFlags <------------------------------------ /// @brief Grid flags which indicate what extra information is present in the grid buffer. enum class GridFlags : uint32_t { - HasLongGridName = 1 << 0,// grid name is longer than 256 characters - HasBBox = 1 << 1,// nodes contain bounding-boxes of active values - HasMinMax = 1 << 2,// nodes contain min/max of active values - HasAverage = 1 << 3,// nodes contain averages of active values - HasStdDeviation = 1 << 4,// nodes contain standard deviations of active values - IsBreadthFirst = 1 << 5,// nodes are arranged breadth-first in memory - End = 1 << 6, + HasLongGridName = 1 << 0, // grid name is longer than 256 characters + HasBBox = 1 << 1, // nodes contain bounding-boxes of active values + HasMinMax = 1 << 2, // nodes contain min/max of active values + HasAverage = 1 << 3, // nodes contain averages of active values + HasStdDeviation = 1 << 4, // nodes contain standard deviations of active values + IsBreadthFirst = 1 << 5, // nodes are typically arranged breadth-first in memory + End = 1 << 6, // use End - 1 as a mask for the 5 lower bit flags + StrLen = End + 23,// this entry is used to determine the minimum size of c-string }; -#ifndef __CUDACC_RTC__ /// @brief Retuns a c-string used to describe a GridFlags -inline const char* toStr(GridFlags gridFlags) -{ - static const char * LUT[] = { "has long grid name", - "has bbox", - "has min/max", - "has average", - "has standard deviation", - "is breadth-first", - "end" }; - static_assert( 1 << (sizeof(LUT)/sizeof(char*) - 1) == int(GridFlags::End), "Unexpected size of LUT" ); - return LUT[static_cast(gridFlags)]; +/// @param dst destination string of size 23 or larger +/// @param gridFlags GridFlags enum to be converted to a string +__hostdev__ inline const char* toStr(char *dst, GridFlags gridFlags) +{ + switch (gridFlags){ + case GridFlags::HasLongGridName: return util::strcpy(dst, "has long grid name"); + case GridFlags::HasBBox: return 
util::strcpy(dst, "has bbox"); + case GridFlags::HasMinMax: return util::strcpy(dst, "has min/max"); + case GridFlags::HasAverage: return util::strcpy(dst, "has average"); + case GridFlags::HasStdDeviation: return util::strcpy(dst, "has standard deviation"); + case GridFlags::IsBreadthFirst: return util::strcpy(dst, "is breadth-first"); + default: return util::strcpy(dst, "end"); + } } -#endif + +// --------------------------> MagicType <------------------------------------ + +/// @brief Enums used to identify magic numbers recognized by NanoVDB +enum class MagicType : uint32_t { Unknown = 0,// first 64 bits are neither of the cases below + OpenVDB = 1,// first 32 bits = 0x56444220UL + NanoVDB = 2,// first 64 bits = NANOVDB_MAGIC_NUMB + NanoGrid = 3,// first 64 bits = NANOVDB_MAGIC_GRID + NanoFile = 4,// first 64 bits = NANOVDB_MAGIC_FILE + NanoNode = 5,// first 64 bits = NANOVDB_MAGIC_NODE + NanoFrag = 6,// first 64 bits = NANOVDB_MAGIC_FRAG + End = 7, + StrLen = End + 25};// this entry is used to determine the minimum size of c-string + +/// @brief maps 64 bits of magic number to enum +__hostdev__ inline MagicType toMagic(uint64_t magic) +{ + switch (magic){ + case NANOVDB_MAGIC_NUMB: return MagicType::NanoVDB; + case NANOVDB_MAGIC_GRID: return MagicType::NanoGrid; + case NANOVDB_MAGIC_FILE: return MagicType::NanoFile; + case NANOVDB_MAGIC_NODE: return MagicType::NanoNode; + case NANOVDB_MAGIC_FRAG: return MagicType::NanoFrag; + default: return (magic & ~uint32_t(0)) == 0x56444220UL ? MagicType::OpenVDB : MagicType::Unknown; + } +} + +/// @brief print 64-bit magic number to string +/// @param dst destination string of size 25 or larger +/// @param magic 64 bit magic number to be printed +/// @return return destination string @c dst +__hostdev__ inline char* toStr(char *dst, MagicType magic) +{ + switch (magic){ + case MagicType::Unknown: return util::strcpy(dst, "unknown"); + case MagicType::NanoVDB: return util::strcpy(dst, "nanovdb"); + case MagicType::NanoGrid: return util::strcpy(dst, "nanovdb::Grid"); + case MagicType::NanoFile: return util::strcpy(dst, "nanovdb::File"); + case MagicType::NanoNode: return util::strcpy(dst, "nanovdb::NodeManager"); + case MagicType::NanoFrag: return util::strcpy(dst, "fragmented nanovdb::Grid"); + case MagicType::OpenVDB: return util::strcpy(dst, "openvdb"); + default: return util::strcpy(dst, "end"); + } +} + +// --------------------------> PointType enums <------------------------------------ + +// Define the type used when the points are encoded as blind data in the output grid +enum class PointType : uint32_t { Disable = 0,// no point information e.g. when BuildT != Point + PointID = 1,// linear index of type uint32_t to points + World64 = 2,// Vec3d in world space + World32 = 3,// Vec3f in world space + Grid64 = 4,// Vec3d in grid space + Grid32 = 5,// Vec3f in grid space + Voxel32 = 6,// Vec3f in voxel space + Voxel16 = 7,// Vec3u16 in voxel space + Voxel8 = 8,// Vec3u8 in voxel space + Default = 9,// output matches input, i.e. 
Vec3d or Vec3f in world space + End =10 }; // --------------------------> GridBlindData enums <------------------------------------ @@ -307,82 +417,79 @@ enum class GridBlindDataClass : uint32_t { Unknown = 0, IndexArray = 1, AttributeArray = 2, GridName = 3, - End = 4 }; + ChannelArray = 4, + End = 5 }; /// @brief Blind-data Semantics that are currently understood by NanoVDB enum class GridBlindDataSemantic : uint32_t { Unknown = 0, - PointPosition = 1, + PointPosition = 1, // 3D coordinates in an unknown space PointColor = 2, PointNormal = 3, PointRadius = 4, PointVelocity = 5, PointId = 6, - End = 7 }; + WorldCoords = 7, // 3D coordinates in world space, e.g. (0.056, 0.8, 1,8) + GridCoords = 8, // 3D coordinates in grid space, e.g. (1.2, 4.0, 5.7), aka index-space + VoxelCoords = 9, // 3D coordinates in voxel space, e.g. (0.2, 0.0, 0.7) + End = 10 }; -// --------------------------> is_same <------------------------------------ - -/// @brief C++11 implementation of std::is_same -template -struct is_same -{ - static constexpr bool value = false; -}; +// --------------------------> BuildTraits <------------------------------------ +/// @brief Define static boolean tests for template build types template -struct is_same -{ - static constexpr bool value = true; -}; +struct BuildTraits +{ + // check if T is an index type + static constexpr bool is_index = util::is_same::value; + static constexpr bool is_onindex = util::is_same::value; + static constexpr bool is_offindex = util::is_same::value; + static constexpr bool is_indexmask = util::is_same::value; + // check if T is a compressed float type with fixed bit precision + static constexpr bool is_FpX = util::is_same::value; + // check if T is a compressed float type with fixed or variable bit precision + static constexpr bool is_Fp = util::is_same::value; + // check if T is a POD float type, i.e float or double + static constexpr bool is_float = util::is_floating_point::value; + // check if T is a template specialization of LeafData, i.e. has T mValues[512] + static constexpr bool is_special = is_index || is_Fp || util::is_same::value; +}; // BuildTraits + +// --------------------------> BuildToValueMap <------------------------------------ -// --------------------------> enable_if <------------------------------------ - -/// @brief C++11 implementation of std::enable_if -template -struct enable_if -{ -}; - -template -struct enable_if +/// @brief Maps one type (e.g. the build types above) to other (actual) types +template +struct BuildToValueMap { + using Type = T; using type = T; }; -// --------------------------> is_floating_point <------------------------------------ - -/// @brief C++11 implementation of std::is_floating_point -template -struct is_floating_point +template<> +struct BuildToValueMap { - static const bool value = is_same::value || is_same::value; + using Type = uint64_t; + using type = uint64_t; }; -// --------------------------> is_specialization <------------------------------------ - -/// @brief Metafunction used to determine if the first template -/// parameter is a specialization of the class template -/// given in the second template parameter. 
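As a quick, illustrative check of the BuildTraits introduced above (a sketch based on the trait comments, not part of the patch; it assumes NanoVDB.h is included):

    using namespace nanovdb;
    // ValueOnIndex is one of the index build types used by index grids.
    static_assert(BuildTraits<ValueOnIndex>::is_index, "expected an index build type");
    // Plain float is a POD float, whereas Fp8 is only a compressed-float dummy type.
    static_assert(BuildTraits<float>::is_float && !BuildTraits<Fp8>::is_float,
                  "Fp8 is not a POD floating point type");
    // FpN is a compressed float type, but not one with a fixed bit width.
    static_assert(BuildTraits<FpN>::is_Fp && !BuildTraits<FpN>::is_FpX,
                  "FpN has variable bit precision");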
-/// -/// @details is_specialization, Vec3>::value == true; -template class TemplateType> -struct is_specialization +template<> +struct BuildToValueMap { - static const bool value = false; + using Type = uint64_t; + using type = uint64_t; }; -template class TemplateType> -struct is_specialization, TemplateType> + +template<> +struct BuildToValueMap { - static const bool value = true; + using Type = uint64_t; + using type = uint64_t; }; -// --------------------------> Value Map <------------------------------------ - -/// @brief Maps one type (e.g. the build types above) to other (actual) types -template -struct BuildToValueMap +template<> +struct BuildToValueMap { - using Type = T; - using type = T; + using Type = uint64_t; + using type = uint64_t; }; template<> @@ -427,888 +534,197 @@ struct BuildToValueMap using type = float; }; -// --------------------------> PtrDiff PtrAdd <------------------------------------ - -template -__hostdev__ inline static int64_t PtrDiff(const T1* p, const T2* q) +template<> +struct BuildToValueMap { - NANOVDB_ASSERT(p && q); - return reinterpret_cast(p) - reinterpret_cast(q); -} + using Type = uint64_t; + using type = uint64_t; +}; -template -__hostdev__ inline static DstT* PtrAdd(SrcT *p, int64_t offset) -{ - NANOVDB_ASSERT(p); - return reinterpret_cast(reinterpret_cast(p) + offset); -} +// --------------------------> utility functions related to alignment <------------------------------------ -template -__hostdev__ inline static const DstT* PtrAdd(const SrcT *p, int64_t offset) +/// @brief return true if the specified pointer is 32 byte aligned +__hostdev__ inline static bool isAligned(const void* p){return uint64_t(p) % NANOVDB_DATA_ALIGNMENT == 0;} + +/// @brief return the smallest number of bytes that when added to the specified pointer results in a 32 byte aligned pointer. +__hostdev__ inline static uint64_t alignmentPadding(const void* p) { NANOVDB_ASSERT(p); - return reinterpret_cast(reinterpret_cast(p) + offset); + return (NANOVDB_DATA_ALIGNMENT - (uint64_t(p) % NANOVDB_DATA_ALIGNMENT)) % NANOVDB_DATA_ALIGNMENT; } -// --------------------------> Rgba8 <------------------------------------ -/// @brief 8-bit red, green, blue, alpha packed into 32 bit unsigned int -class Rgba8 -{ - union { - uint8_t c[4];// 4 color channels of red, green, blue and alpha components. 
- uint32_t packed;// 32 bit packed representation - } mData; -public: - static const int SIZE = 4; - using ValueType = uint8_t; - - Rgba8(const Rgba8&) = default; - Rgba8(Rgba8&&) = default; - Rgba8& operator=(Rgba8&&) = default; - Rgba8& operator=(const Rgba8&) = default; - __hostdev__ Rgba8() : mData{0,0,0,0} {static_assert(sizeof(uint32_t) == sizeof(Rgba8),"Unexpected sizeof");} - __hostdev__ Rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a = 255u) : mData{r, g, b, a} {} - explicit __hostdev__ Rgba8(uint8_t v) : Rgba8(v,v,v,v) {} - __hostdev__ Rgba8(float r, float g, float b, float a = 1.0f) - : mData{(uint8_t(0.5f + r * 255.0f)),// round to nearest - (uint8_t(0.5f + g * 255.0f)),// round to nearest - (uint8_t(0.5f + b * 255.0f)),// round to nearest - (uint8_t(0.5f + a * 255.0f))}// round to nearest - { - } - __hostdev__ bool operator<(const Rgba8& rhs) const { return mData.packed < rhs.mData.packed; } - __hostdev__ bool operator==(const Rgba8& rhs) const { return mData.packed == rhs.mData.packed; } - __hostdev__ float lengthSqr() const - { - return 0.0000153787005f*(float(mData.c[0])*mData.c[0] + - float(mData.c[1])*mData.c[1] + - float(mData.c[2])*mData.c[2]);//1/255^2 - } - __hostdev__ float length() const { return sqrtf(this->lengthSqr() ); } - __hostdev__ const uint8_t& operator[](int n) const { return mData.c[n]; } - __hostdev__ uint8_t& operator[](int n) { return mData.c[n]; } - __hostdev__ const uint32_t& packed() const { return mData.packed; } - __hostdev__ uint32_t& packed() { return mData.packed; } - __hostdev__ const uint8_t& r() const { return mData.c[0]; } - __hostdev__ const uint8_t& g() const { return mData.c[1]; } - __hostdev__ const uint8_t& b() const { return mData.c[2]; } - __hostdev__ const uint8_t& a() const { return mData.c[3]; } - __hostdev__ uint8_t& r() { return mData.c[0]; } - __hostdev__ uint8_t& g() { return mData.c[1]; } - __hostdev__ uint8_t& b() { return mData.c[2]; } - __hostdev__ uint8_t& a() { return mData.c[3]; } -};// Rgba8 - -using PackedRGBA8 = Rgba8;// for backwards compatibility +/// @brief offset the specified pointer so it is 32 byte aligned. Works with both const and non-const pointers. +template +__hostdev__ inline static T* alignPtr(T* p){return util::PtrAdd(p, alignmentPadding(p));} -// --------------------------> isValue(GridType, GridClass) <------------------------------------ +// --------------------------> isFloatingPoint(GridType) <------------------------------------ -/// @brief return true if the GridType maps to a floating point value. +/// @brief return true if the GridType maps to a floating point type __hostdev__ inline bool isFloatingPoint(GridType gridType) { - return gridType == GridType::Float || + return gridType == GridType::Float || gridType == GridType::Double || - gridType == GridType::Fp4 || - gridType == GridType::Fp8 || - gridType == GridType::Fp16 || + gridType == GridType::Half || + gridType == GridType::Fp4 || + gridType == GridType::Fp8 || + gridType == GridType::Fp16 || gridType == GridType::FpN; } -// --------------------------> isValue(GridType, GridClass) <------------------------------------ - -/// @brief return true if the combination of GridType and GridClass is valid. 
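For illustration (a sketch, not part of the patch, assuming NanoVDB.h and <cassert> are included), the alignment helpers above can place a grid at the next 32-byte boundary inside a larger, arbitrarily aligned staging buffer:

    // 'staging' points somewhere inside a larger host allocation and may be misaligned.
    inline char* placeGrid(char* staging)
    {
        const uint64_t pad = nanovdb::alignmentPadding(staging); // bytes to skip, in [0, 31]
        char* gridStart = nanovdb::alignPtr(staging);            // equals staging + pad
        assert(nanovdb::isAligned(gridStart) && pad < 32);
        return gridStart;
    }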
-__hostdev__ inline bool isValid(GridType gridType, GridClass gridClass) -{ - if (gridClass == GridClass::LevelSet || gridClass == GridClass::FogVolume) { - return isFloatingPoint(gridType); - } else if (gridClass == GridClass::Staggered) { - return gridType == GridType::Vec3f || gridType == GridType::Vec3d || - gridType == GridType::Vec4f || gridType == GridType::Vec4d; - } else if (gridClass == GridClass::PointIndex || gridClass == GridClass::PointData) { - return gridType == GridType::UInt32; - } else if (gridClass == GridClass::VoxelVolume) { - return gridType == GridType::RGBA8 || gridType == GridType::Float || gridType == GridType::Double || gridType == GridType::Vec3f || gridType == GridType::Vec3d || gridType == GridType::UInt32; - } - return gridClass < GridClass::End && gridType < GridType::End;// any valid combination -} - -// ----------------------------> Version class <------------------------------------- - -/// @brief Bit-compacted representation of all three version numbers -/// -/// @details major is the top 11 bits, minor is the 11 middle bits and patch is the lower 10 bits -class Version -{ - uint32_t mData;// 11 + 11 + 10 bit packing of major + minor + patch -public: - __hostdev__ Version() : mData( uint32_t(NANOVDB_MAJOR_VERSION_NUMBER) << 21 | - uint32_t(NANOVDB_MINOR_VERSION_NUMBER) << 10 | - uint32_t(NANOVDB_PATCH_VERSION_NUMBER) ) - { - } - __hostdev__ Version(uint32_t major, uint32_t minor, uint32_t patch) - : mData( major << 21 | minor << 10 | patch ) - { - NANOVDB_ASSERT(major < (1u << 11));// max value of major is 2047 - NANOVDB_ASSERT(minor < (1u << 11));// max value of minor is 2047 - NANOVDB_ASSERT(patch < (1u << 10));// max value of patch is 1023 - } - __hostdev__ bool operator==(const Version &rhs) const {return mData == rhs.mData;} - __hostdev__ bool operator< (const Version &rhs) const {return mData < rhs.mData;} - __hostdev__ bool operator<=(const Version &rhs) const {return mData <= rhs.mData;} - __hostdev__ bool operator> (const Version &rhs) const {return mData > rhs.mData;} - __hostdev__ bool operator>=(const Version &rhs) const {return mData >= rhs.mData;} - __hostdev__ uint32_t id() const { return mData; } - __hostdev__ uint32_t getMajor() const { return (mData >> 21) & ((1u << 11) - 1);} - __hostdev__ uint32_t getMinor() const { return (mData >> 10) & ((1u << 11) - 1);} - __hostdev__ uint32_t getPatch() const { return mData & ((1u << 10) - 1);} - -#ifndef __CUDACC_RTC__ - const char* c_str() const - { - char *buffer = (char*)malloc(4 + 1 + 4 + 1 + 4 + 1);// xxxx.xxxx.xxxx\n - sprintf(buffer, "%d.%d.%d", this->getMajor(), this->getMinor(), this->getPatch()); - return buffer; - } -#endif -};// Version - -// ----------------------------> Various math functions <------------------------------------- - -//@{ -/// Tolerance for floating-point comparison -template -struct Tolerance; -template<> -struct Tolerance -{ - __hostdev__ static float value() { return 1e-8f; } -}; -template<> -struct Tolerance -{ - __hostdev__ static double value() { return 1e-15; } -}; -//@} - -//@{ -/// Delta for small floating-point offsets -template -struct Delta; -template<> -struct Delta -{ - __hostdev__ static float value() { return 1e-5f; } -}; -template<> -struct Delta -{ - __hostdev__ static double value() { return 1e-9; } -}; -//@} - -//@{ -/// Maximum floating-point values -template -struct Maximum; -#if defined(__CUDA_ARCH__) || defined(__HIP__) -template<> -struct Maximum -{ - __hostdev__ static int value() { return 2147483647; } -}; -template<> -struct Maximum -{ - 
__hostdev__ static uint32_t value() { return 4294967295; } -}; -template<> -struct Maximum -{ - __hostdev__ static float value() { return 1e+38f; } -}; -template<> -struct Maximum -{ - __hostdev__ static double value() { return 1e+308; } -}; -#else -template -struct Maximum -{ - static T value() { return std::numeric_limits::max(); } -}; -#endif -//@} - -template -__hostdev__ inline bool isApproxZero(const Type& x) -{ - return !(x > Tolerance::value()) && !(x < -Tolerance::value()); -} - -template -__hostdev__ inline Type Min(Type a, Type b) -{ - return (a < b) ? a : b; -} -__hostdev__ inline int32_t Min(int32_t a, int32_t b) -{ - return int32_t(fminf(float(a), float(b))); -} -__hostdev__ inline uint32_t Min(uint32_t a, uint32_t b) -{ - return uint32_t(fminf(float(a), float(b))); -} -__hostdev__ inline float Min(float a, float b) -{ - return fminf(a, b); -} -__hostdev__ inline double Min(double a, double b) -{ - return fmin(a, b); -} -template -__hostdev__ inline Type Max(Type a, Type b) -{ - return (a > b) ? a : b; -} - -__hostdev__ inline int32_t Max(int32_t a, int32_t b) -{ - return int32_t(fmaxf(float(a), float(b))); -} -__hostdev__ inline uint32_t Max(uint32_t a, uint32_t b) -{ - return uint32_t(fmaxf(float(a), float(b))); -} -__hostdev__ inline float Max(float a, float b) -{ - return fmaxf(a, b); -} -__hostdev__ inline double Max(double a, double b) -{ - return fmax(a, b); -} -__hostdev__ inline float Clamp(float x, float a, float b) -{ - return Max(Min(x, b), a); -} -__hostdev__ inline double Clamp(double x, double a, double b) -{ - return Max(Min(x, b), a); -} - -__hostdev__ inline float Fract(float x) -{ - return x - floorf(x); -} -__hostdev__ inline double Fract(double x) -{ - return x - ::floor(x); -} - -__hostdev__ inline int32_t Floor(float x) -{ - return int32_t(floorf(x)); -} -__hostdev__ inline int32_t Floor(double x) -{ - return int32_t(::floor(x)); -} - -__hostdev__ inline int32_t Ceil(float x) -{ - return int32_t(ceilf(x)); -} -__hostdev__ inline int32_t Ceil(double x) -{ - return int32_t(::ceil(x)); -} - -template -__hostdev__ inline T Pow2(T x) -{ - return x * x; -} - -template -__hostdev__ inline T Pow3(T x) -{ - return x * x * x; -} - -template -__hostdev__ inline T Pow4(T x) -{ - return Pow2(x * x); -} -template -__hostdev__ inline T Abs(T x) -{ - return x < 0 ? -x : x; -} +// --------------------------> isFloatingPointVector(GridType) <------------------------------------ -template<> -__hostdev__ inline float Abs(float x) +/// @brief return true if the GridType maps to a floating point vec3. +__hostdev__ inline bool isFloatingPointVector(GridType gridType) { - return fabsf(x); + return gridType == GridType::Vec3f || + gridType == GridType::Vec3d || + gridType == GridType::Vec4f || + gridType == GridType::Vec4d; } -template<> -__hostdev__ inline double Abs(double x) -{ - return fabs(x); -} +// --------------------------> isInteger(GridType) <------------------------------------ -template<> -__hostdev__ inline int Abs(int x) +/// @brief Return true if the GridType maps to a POD integer type. 
+/// @details These types are used to associate a voxel with a POD integer type +__hostdev__ inline bool isInteger(GridType gridType) { - return ::abs(x); + return gridType == GridType::Int16 || + gridType == GridType::Int32 || + gridType == GridType::Int64 || + gridType == GridType::UInt32|| + gridType == GridType::UInt8; } -template class Vec3T> -__hostdev__ inline CoordT Round(const Vec3T& xyz); - -template class Vec3T> -__hostdev__ inline CoordT Round(const Vec3T& xyz) -{ - return CoordT(int32_t(rintf(xyz[0])), int32_t(rintf(xyz[1])), int32_t(rintf(xyz[2]))); - //return CoordT(int32_t(roundf(xyz[0])), int32_t(roundf(xyz[1])), int32_t(roundf(xyz[2])) ); - //return CoordT(int32_t(floorf(xyz[0] + 0.5f)), int32_t(floorf(xyz[1] + 0.5f)), int32_t(floorf(xyz[2] + 0.5f))); -} +// --------------------------> isIndex(GridType) <------------------------------------ -template class Vec3T> -__hostdev__ inline CoordT Round(const Vec3T& xyz) +/// @brief Return true if the GridType maps to a special index type (not a POD integer type). +/// @details These types are used to index from a voxel into an external array of values, e.g. sidecar or blind data. +__hostdev__ inline bool isIndex(GridType gridType) { - return CoordT(int32_t(floor(xyz[0] + 0.5)), int32_t(floor(xyz[1] + 0.5)), int32_t(floor(xyz[2] + 0.5))); + return gridType == GridType::Index ||// index both active and inactive values + gridType == GridType::OnIndex ||// index active values only + gridType == GridType::IndexMask ||// as Index, but with an additional mask + gridType == GridType::OnIndexMask;// as OnIndex, but with an additional mask } -template class Vec3T> -__hostdev__ inline CoordT RoundDown(const Vec3T& xyz) -{ - return CoordT(Floor(xyz[0]), Floor(xyz[1]), Floor(xyz[2])); -} +// --------------------------> isValue(GridType, GridClass) <------------------------------------ -//@{ -/// Return the square root of a floating-point value. -__hostdev__ inline float Sqrt(float x) -{ - return sqrtf(x); -} -__hostdev__ inline double Sqrt(double x) +/// @brief return true if the combination of GridType and GridClass is valid. +__hostdev__ inline bool isValid(GridType gridType, GridClass gridClass) { - return ::sqrt(x); + if (gridClass == GridClass::LevelSet || gridClass == GridClass::FogVolume) { + return isFloatingPoint(gridType); + } else if (gridClass == GridClass::Staggered) { + return isFloatingPointVector(gridType); + } else if (gridClass == GridClass::PointIndex || gridClass == GridClass::PointData) { + return gridType == GridType::PointIndex || gridType == GridType::UInt32; + } else if (gridClass == GridClass::Topology) { + return gridType == GridType::Mask; + } else if (gridClass == GridClass::IndexGrid) { + return isIndex(gridType); + } else if (gridClass == GridClass::VoxelVolume) { + return gridType == GridType::RGBA8 || gridType == GridType::Float || + gridType == GridType::Double || gridType == GridType::Vec3f || + gridType == GridType::Vec3d || gridType == GridType::UInt32 || + gridType == GridType::UInt8; + } + return gridClass < GridClass::End && gridType < GridType::End; // any valid combination } -//@} -/// Return the sign of the given value as an integer (either -1, 0 or 1). 
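The predicates above make it easy to reject a grid whose header does not describe an index grid before reading voxel offsets from it. A minimal sketch, illustrative only and not part of the patch, using only the enums and helpers defined in this header:

    // Returns true if (t, c) describes an index grid, i.e. a grid whose voxels
    // hold offsets into an external value array rather than values themselves.
    inline bool isUsableIndexGrid(nanovdb::GridType t, nanovdb::GridClass c)
    {
        // GridClass::IndexGrid is only valid when paired with one of the four
        // index GridTypes (Index, OnIndex, IndexMask, OnIndexMask).
        return c == nanovdb::GridClass::IndexGrid &&
               nanovdb::isIndex(t) &&
               nanovdb::isValid(t, c);
    }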
-template -__hostdev__ inline T Sign(const T &x) { return ((T(0) < x)?T(1):T(0)) - ((x < T(0))?T(1):T(0)); } - -template -__hostdev__ inline int MinIndex(const Vec3T& v) -{ -#if 0 - static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values - const int hashKey = ((v[0] < v[1]) << 2) + ((v[0] < v[2]) << 1) + (v[1] < v[2]); // ?*4+?*2+?*1 - return hashTable[hashKey]; -#else - if (v[0] < v[1] && v[0] < v[2]) - return 0; - if (v[1] < v[2]) - return 1; - else - return 2; -#endif +// --------------------------> validation of blind data meta data <------------------------------------ + +/// @brief return true if the combination of GridBlindDataClass, GridBlindDataSemantic and GridType is valid. +__hostdev__ inline bool isValid(const GridBlindDataClass& blindClass, + const GridBlindDataSemantic& blindSemantics, + const GridType& blindType) +{ + bool test = false; + switch (blindClass) { + case GridBlindDataClass::IndexArray: + test = (blindSemantics == GridBlindDataSemantic::Unknown || + blindSemantics == GridBlindDataSemantic::PointId) && + isInteger(blindType); + break; + case GridBlindDataClass::AttributeArray: + if (blindSemantics == GridBlindDataSemantic::PointPosition || + blindSemantics == GridBlindDataSemantic::WorldCoords) { + test = blindType == GridType::Vec3f || blindType == GridType::Vec3d; + } else if (blindSemantics == GridBlindDataSemantic::GridCoords) { + test = blindType == GridType::Vec3f; + } else if (blindSemantics == GridBlindDataSemantic::VoxelCoords) { + test = blindType == GridType::Vec3f || blindType == GridType::Vec3u8 || blindType == GridType::Vec3u16; + } else { + test = blindSemantics != GridBlindDataSemantic::PointId; + } + break; + case GridBlindDataClass::GridName: + test = blindSemantics == GridBlindDataSemantic::Unknown && blindType == GridType::Unknown; + break; + default: // captures blindClass == Unknown and ChannelArray + test = blindClass < GridBlindDataClass::End && + blindSemantics < GridBlindDataSemantic::End && + blindType < GridType::End; // any valid combination + break; + } + //if (!test) printf("Invalid combination: GridBlindDataClass=%u, GridBlindDataSemantic=%u, GridType=%u\n",(uint32_t)blindClass, (uint32_t)blindSemantics, (uint32_t)blindType); + return test; } -template -__hostdev__ inline int MaxIndex(const Vec3T& v) -{ -#if 0 - static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values - const int hashKey = ((v[0] > v[1]) << 2) + ((v[0] > v[2]) << 1) + (v[1] > v[2]); // ?*4+?*2+?*1 - return hashTable[hashKey]; -#else - if (v[0] > v[1] && v[0] > v[2]) - return 0; - if (v[1] > v[2]) - return 1; - else - return 2; -#endif -} +// ----------------------------> Version class <------------------------------------- -/// @brief round up byteSize to the nearest wordSize, e.g. to align to machine word: AlignUp -__hostdev__ inline uint64_t AlignUp(uint64_t byteCount) -{ - const uint64_t r = byteCount % wordSize; - return r ? byteCount - r + wordSize : byteCount; -} - -// ------------------------------> Coord <-------------------------------------- - -// forward decleration so we can define Coord::asVec3s and Coord::asVec3d -template class Vec3; - -/// @brief Signed (i, j, k) 32-bit integer coordinate class, similar to openvdb::math::Coord -class Coord -{ - int32_t mVec[3]; // private member data - three signed index coordinates -public: - using ValueType = int32_t; - using IndexType = uint32_t; - - /// @brief Initialize all coordinates to zero. 
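Two concrete cases of the blind-data validation rule defined above, as an illustrative sketch (not part of the patch; assumes NanoVDB.h and <cassert> are included):

    inline void checkBlindMetaData()
    {
        using namespace nanovdb;
        // Point positions quantized to 16-bit voxel-space coordinates are accepted,
        assert( isValid(GridBlindDataClass::AttributeArray,
                        GridBlindDataSemantic::VoxelCoords, GridType::Vec3u16));
        // but an index array must hold a POD integer type, so Vec3f is rejected.
        assert(!isValid(GridBlindDataClass::IndexArray,
                        GridBlindDataSemantic::PointId, GridType::Vec3f));
    }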
- __hostdev__ Coord() - : mVec{0, 0, 0} - { - } - - /// @brief Initializes all coordinates to the given signed integer. - __hostdev__ explicit Coord(ValueType n) - : mVec{n, n, n} - { - } - - /// @brief Initializes coordinate to the given signed integers. - __hostdev__ Coord(ValueType i, ValueType j, ValueType k) - : mVec{i, j, k} - { - } - - __hostdev__ Coord(ValueType *ptr) - : mVec{ptr[0], ptr[1], ptr[2]} - { - } - - __hostdev__ int32_t x() const { return mVec[0]; } - __hostdev__ int32_t y() const { return mVec[1]; } - __hostdev__ int32_t z() const { return mVec[2]; } - - __hostdev__ int32_t& x() { return mVec[0]; } - __hostdev__ int32_t& y() { return mVec[1]; } - __hostdev__ int32_t& z() { return mVec[2]; } - - __hostdev__ static Coord max() { return Coord(int32_t((1u << 31) - 1)); } - - __hostdev__ static Coord min() { return Coord(-int32_t((1u << 31) - 1) - 1); } - - __hostdev__ static size_t memUsage() { return sizeof(Coord); } - - /// @brief Return a const reference to the given Coord component. - /// @warning The argument is assumed to be 0, 1, or 2. - __hostdev__ const ValueType& operator[](IndexType i) const { return mVec[i]; } - - /// @brief Return a non-const reference to the given Coord component. - /// @warning The argument is assumed to be 0, 1, or 2. - __hostdev__ ValueType& operator[](IndexType i) { return mVec[i]; } - - /// @brief Assignment operator that works with openvdb::Coord - template - __hostdev__ Coord& operator=(const CoordT &other) - { - static_assert(sizeof(Coord) == sizeof(CoordT), "Mis-matched sizeof"); - mVec[0] = other[0]; - mVec[1] = other[1]; - mVec[2] = other[2]; - return *this; - } - - /// @brief Return a new instance with coordinates masked by the given unsigned integer. - __hostdev__ Coord operator&(IndexType n) const { return Coord(mVec[0] & n, mVec[1] & n, mVec[2] & n); } - - // @brief Return a new instance with coordinates left-shifted by the given unsigned integer. - __hostdev__ Coord operator<<(IndexType n) const { return Coord(mVec[0] << n, mVec[1] << n, mVec[2] << n); } - - // @brief Return a new instance with coordinates right-shifted by the given unsigned integer. - __hostdev__ Coord operator>>(IndexType n) const { return Coord(mVec[0] >> n, mVec[1] >> n, mVec[2] >> n); } - - /// @brief Return true if this Coord is lexicographically less than the given Coord. - __hostdev__ bool operator<(const Coord& rhs) const - { - return mVec[0] < rhs[0] ? true : mVec[0] > rhs[0] ? false : mVec[1] < rhs[1] ? true : mVec[1] > rhs[1] ? false : mVec[2] < rhs[2] ? true : false; - } - - // @brief Return true if the Coord components are identical. 
- __hostdev__ bool operator==(const Coord& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } - __hostdev__ bool operator!=(const Coord& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } - __hostdev__ Coord& operator&=(int n) - { - mVec[0] &= n; - mVec[1] &= n; - mVec[2] &= n; - return *this; - } - __hostdev__ Coord& operator<<=(uint32_t n) - { - mVec[0] <<= n; - mVec[1] <<= n; - mVec[2] <<= n; - return *this; - } - __hostdev__ Coord& operator+=(int n) - { - mVec[0] += n; - mVec[1] += n; - mVec[2] += n; - return *this; - } - __hostdev__ Coord operator+(const Coord& rhs) const { return Coord(mVec[0] + rhs[0], mVec[1] + rhs[1], mVec[2] + rhs[2]); } - __hostdev__ Coord operator-(const Coord& rhs) const { return Coord(mVec[0] - rhs[0], mVec[1] - rhs[1], mVec[2] - rhs[2]); } - __hostdev__ Coord& operator+=(const Coord& rhs) - { - mVec[0] += rhs[0]; - mVec[1] += rhs[1]; - mVec[2] += rhs[2]; - return *this; - } - __hostdev__ Coord& operator-=(const Coord& rhs) - { - mVec[0] -= rhs[0]; - mVec[1] -= rhs[1]; - mVec[2] -= rhs[2]; - return *this; - } - - /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Coord& minComponent(const Coord& other) - { - if (other[0] < mVec[0]) - mVec[0] = other[0]; - if (other[1] < mVec[1]) - mVec[1] = other[1]; - if (other[2] < mVec[2]) - mVec[2] = other[2]; - return *this; - } - - /// @brief Perform a component-wise maximum with the other Coord. - __hostdev__ Coord& maxComponent(const Coord& other) - { - if (other[0] > mVec[0]) - mVec[0] = other[0]; - if (other[1] > mVec[1]) - mVec[1] = other[1]; - if (other[2] > mVec[2]) - mVec[2] = other[2]; - return *this; - } - - __hostdev__ Coord offsetBy(ValueType dx, ValueType dy, ValueType dz) const - { - return Coord(mVec[0] + dx, mVec[1] + dy, mVec[2] + dz); - } - - __hostdev__ Coord offsetBy(ValueType n) const { return this->offsetBy(n, n, n); } - - /// Return true if any of the components of @a a are smaller than the - /// corresponding components of @a b. - __hostdev__ static inline bool lessThan(const Coord& a, const Coord& b) - { - return (a[0] < b[0] || a[1] < b[1] || a[2] < b[2]); - } - - /// @brief Return the largest integer coordinates that are not greater - /// than @a xyz (node centered conversion). - template - __hostdev__ static Coord Floor(const Vec3T& xyz) { return Coord(nanovdb::Floor(xyz[0]), nanovdb::Floor(xyz[1]), nanovdb::Floor(xyz[2])); } - - /// @brief Return a hash key derived from the existing coordinates. - /// @details For details on this hash function please see the VDB paper. 
- template - __hostdev__ uint32_t hash() const { return ((1 << Log2N) - 1) & (mVec[0] * 73856093 ^ mVec[1] * 19349663 ^ mVec[2] * 83492791); } - - /// @brief Return the octant of this Coord - //__hostdev__ size_t octant() const { return (uint32_t(mVec[0])>>31) | ((uint32_t(mVec[1])>>31)<<1) | ((uint32_t(mVec[2])>>31)<<2); } - __hostdev__ uint8_t octant() const { return (uint8_t(bool(mVec[0] & (1u << 31)))) | - (uint8_t(bool(mVec[1] & (1u << 31))) << 1) | - (uint8_t(bool(mVec[2] & (1u << 31))) << 2); } - - /// @brief Return a single precision floating-point vector of this coordinate - __hostdev__ inline Vec3 asVec3s() const; - - /// @brief Return a double precision floating-point vector of this coordinate - __hostdev__ inline Vec3 asVec3d() const; -}; // Coord class - -// ----------------------------> Vec3 <-------------------------------------- - -/// @brief A simple vector class with three double components, similar to openvdb::math::Vec3 -template -class Vec3 -{ - T mVec[3]; - -public: - static const int SIZE = 3; - using ValueType = T; - Vec3() = default; - __hostdev__ explicit Vec3(T x) - : mVec{x, x, x} - { - } - __hostdev__ Vec3(T x, T y, T z) - : mVec{x, y, z} - { - } - template - __hostdev__ explicit Vec3(const Vec3& v) - : mVec{T(v[0]), T(v[1]), T(v[2])} - { - } - __hostdev__ explicit Vec3(const Coord& ijk) - : mVec{T(ijk[0]), T(ijk[1]), T(ijk[2])} - { - } - __hostdev__ bool operator==(const Vec3& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } - __hostdev__ bool operator!=(const Vec3& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } - template - __hostdev__ Vec3& operator=(const Vec3T& rhs) - { - mVec[0] = rhs[0]; - mVec[1] = rhs[1]; - mVec[2] = rhs[2]; - return *this; - } - __hostdev__ const T& operator[](int i) const { return mVec[i]; } - __hostdev__ T& operator[](int i) { return mVec[i]; } - template - __hostdev__ T dot(const Vec3T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2]; } - template - __hostdev__ Vec3 cross(const Vec3T& v) const - { - return Vec3(mVec[1] * v[2] - mVec[2] * v[1], - mVec[2] * v[0] - mVec[0] * v[2], - mVec[0] * v[1] - mVec[1] * v[0]); - } - __hostdev__ T lengthSqr() const - { - return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2]; // 5 flops - } - __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } - __hostdev__ Vec3 operator-() const { return Vec3(-mVec[0], -mVec[1], -mVec[2]); } - __hostdev__ Vec3 operator*(const Vec3& v) const { return Vec3(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2]); } - __hostdev__ Vec3 operator/(const Vec3& v) const { return Vec3(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2]); } - __hostdev__ Vec3 operator+(const Vec3& v) const { return Vec3(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2]); } - __hostdev__ Vec3 operator-(const Vec3& v) const { return Vec3(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2]); } - __hostdev__ Vec3 operator*(const T& s) const { return Vec3(s * mVec[0], s * mVec[1], s * mVec[2]); } - __hostdev__ Vec3 operator/(const T& s) const { return (T(1) / s) * (*this); } - __hostdev__ Vec3& operator+=(const Vec3& v) - { - mVec[0] += v[0]; - mVec[1] += v[1]; - mVec[2] += v[2]; - return *this; - } - __hostdev__ Vec3& operator-=(const Vec3& v) - { - mVec[0] -= v[0]; - mVec[1] -= v[1]; - mVec[2] -= v[2]; - return *this; - } - __hostdev__ Vec3& operator*=(const T& s) - { - mVec[0] *= s; - mVec[1] *= s; - mVec[2] *= s; - return *this; - } - __hostdev__ Vec3& operator/=(const T& s) { return (*this) *= T(1) / 
s; } - __hostdev__ Vec3& normalize() { return (*this) /= this->length(); } - /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Vec3& minComponent(const Vec3& other) - { - if (other[0] < mVec[0]) - mVec[0] = other[0]; - if (other[1] < mVec[1]) - mVec[1] = other[1]; - if (other[2] < mVec[2]) - mVec[2] = other[2]; - return *this; - } - - /// @brief Perform a component-wise maximum with the other Coord. - __hostdev__ Vec3& maxComponent(const Vec3& other) - { - if (other[0] > mVec[0]) - mVec[0] = other[0]; - if (other[1] > mVec[1]) - mVec[1] = other[1]; - if (other[2] > mVec[2]) - mVec[2] = other[2]; - return *this; - } - /// @brief Return the smallest vector component - __hostdev__ ValueType min() const - { - return mVec[0] < mVec[1] ? (mVec[0] < mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] < mVec[2] ? mVec[1] : mVec[2]); - } - /// @brief Return the largest vector component - __hostdev__ ValueType max() const - { - return mVec[0] > mVec[1] ? (mVec[0] > mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] > mVec[2] ? mVec[1] : mVec[2]); - } - __hostdev__ Coord floor() const { return Coord(Floor(mVec[0]), Floor(mVec[1]), Floor(mVec[2])); } - __hostdev__ Coord ceil() const { return Coord(Ceil(mVec[0]), Ceil(mVec[1]), Ceil(mVec[2])); } - __hostdev__ Coord round() const { return Coord(Floor(mVec[0] + 0.5), Floor(mVec[1] + 0.5), Floor(mVec[2] + 0.5)); } -}; // Vec3 - -template -__hostdev__ inline Vec3 operator*(T1 scalar, const Vec3& vec) -{ - return Vec3(scalar * vec[0], scalar * vec[1], scalar * vec[2]); -} -template -__hostdev__ inline Vec3 operator/(T1 scalar, const Vec3& vec) -{ - return Vec3(scalar / vec[0], scalar / vec[1], scalar / vec[2]); -} - -using Vec3R = Vec3; -using Vec3d = Vec3; -using Vec3f = Vec3; -using Vec3i = Vec3; - -/// @brief Return a single precision floating-point vector of this coordinate -__hostdev__ inline Vec3f Coord::asVec3s() const { return Vec3f(float(mVec[0]), float(mVec[1]), float(mVec[2])); } - -/// @brief Return a double precision floating-point vector of this coordinate -__hostdev__ inline Vec3d Coord::asVec3d() const { return Vec3d(double(mVec[0]), double(mVec[1]), double(mVec[2])); } - -// ----------------------------> Vec4 <-------------------------------------- - -/// @brief A simple vector class with three double components, similar to openvdb::math::Vec4 -template -class Vec4 +/// @details major is the top 11 bits, minor is the 11 middle bits and patch is the lower 10 bits +class Version { - T mVec[4]; - + uint32_t mData; // 11 + 11 + 10 bit packing of major + minor + patch public: - static const int SIZE = 4; - using ValueType = T; - Vec4() = default; - __hostdev__ explicit Vec4(T x) - : mVec{x, x, x, x} + static constexpr uint32_t End = 0, StrLen = 8;// for strlen() + /// @brief Default constructor + __hostdev__ Version() + : mData(uint32_t(NANOVDB_MAJOR_VERSION_NUMBER) << 21 | + uint32_t(NANOVDB_MINOR_VERSION_NUMBER) << 10 | + uint32_t(NANOVDB_PATCH_VERSION_NUMBER)) { - } - __hostdev__ Vec4(T x, T y, T z, T w) - : mVec{x, y, z, w} - { - } - template - __hostdev__ explicit Vec4(const Vec4& v) - : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])} - { - } - __hostdev__ bool operator==(const Vec4& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2] && mVec[3] == rhs[3]; } - __hostdev__ bool operator!=(const Vec4& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2] || mVec[3] != rhs[3]; } - template - __hostdev__ Vec4& operator=(const Vec4T& rhs) - { - mVec[0] = rhs[0]; - mVec[1] = rhs[1]; - mVec[2] 
= rhs[2]; - mVec[3] = rhs[3]; - return *this; - } - __hostdev__ const T& operator[](int i) const { return mVec[i]; } - __hostdev__ T& operator[](int i) { return mVec[i]; } - template - __hostdev__ T dot(const Vec4T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2] + mVec[3] * v[3]; } - __hostdev__ T lengthSqr() const - { - return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2] + mVec[3] * mVec[3]; // 7 flops - } - __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } - __hostdev__ Vec4 operator-() const { return Vec4(-mVec[0], -mVec[1], -mVec[2], -mVec[3]); } - __hostdev__ Vec4 operator*(const Vec4& v) const { return Vec4(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2], mVec[3] * v[3]); } - __hostdev__ Vec4 operator/(const Vec4& v) const { return Vec4(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2], mVec[3] / v[3]); } - __hostdev__ Vec4 operator+(const Vec4& v) const { return Vec4(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2], mVec[3] + v[3]); } - __hostdev__ Vec4 operator-(const Vec4& v) const { return Vec4(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2], mVec[3] - v[3]); } - __hostdev__ Vec4 operator*(const T& s) const { return Vec4(s * mVec[0], s * mVec[1], s * mVec[2], s * mVec[3]); } - __hostdev__ Vec4 operator/(const T& s) const { return (T(1) / s) * (*this); } - __hostdev__ Vec4& operator+=(const Vec4& v) - { - mVec[0] += v[0]; - mVec[1] += v[1]; - mVec[2] += v[2]; - mVec[3] += v[3]; - return *this; - } - __hostdev__ Vec4& operator-=(const Vec4& v) - { - mVec[0] -= v[0]; - mVec[1] -= v[1]; - mVec[2] -= v[2]; - mVec[3] -= v[3]; - return *this; - } - __hostdev__ Vec4& operator*=(const T& s) - { - mVec[0] *= s; - mVec[1] *= s; - mVec[2] *= s; - mVec[3] *= s; - return *this; - } - __hostdev__ Vec4& operator/=(const T& s) { return (*this) *= T(1) / s; } - __hostdev__ Vec4& normalize() { return (*this) /= this->length(); } - /// @brief Perform a component-wise minimum with the other Coord. - __hostdev__ Vec4& minComponent(const Vec4& other) - { - if (other[0] < mVec[0]) - mVec[0] = other[0]; - if (other[1] < mVec[1]) - mVec[1] = other[1]; - if (other[2] < mVec[2]) - mVec[2] = other[2]; - if (other[3] < mVec[3]) - mVec[3] = other[3]; - return *this; - } - - /// @brief Perform a component-wise maximum with the other Coord. 
- __hostdev__ Vec4& maxComponent(const Vec4& other) - { - if (other[0] > mVec[0]) - mVec[0] = other[0]; - if (other[1] > mVec[1]) - mVec[1] = other[1]; - if (other[2] > mVec[2]) - mVec[2] = other[2]; - if (other[3] > mVec[3]) - mVec[3] = other[3]; - return *this; - } -}; // Vec4 - -template -__hostdev__ inline Vec4 operator*(T1 scalar, const Vec4& vec) -{ - return Vec4(scalar * vec[0], scalar * vec[1], scalar * vec[2], scalar * vec[3]); -} -template -__hostdev__ inline Vec4 operator/(T1 scalar, const Vec3& vec) -{ - return Vec4(scalar / vec[0], scalar / vec[1], scalar / vec[2], scalar / vec[3]); + } + /// @brief Constructor from a raw uint32_t data representation + __hostdev__ Version(uint32_t data) : mData(data) {} + /// @brief Constructor from major.minor.patch version numbers + __hostdev__ Version(uint32_t major, uint32_t minor, uint32_t patch) + : mData(major << 21 | minor << 10 | patch) + { + NANOVDB_ASSERT(major < (1u << 11)); // max value of major is 2047 + NANOVDB_ASSERT(minor < (1u << 11)); // max value of minor is 2047 + NANOVDB_ASSERT(patch < (1u << 10)); // max value of patch is 1023 + } + __hostdev__ bool operator==(const Version& rhs) const { return mData == rhs.mData; } + __hostdev__ bool operator<( const Version& rhs) const { return mData < rhs.mData; } + __hostdev__ bool operator<=(const Version& rhs) const { return mData <= rhs.mData; } + __hostdev__ bool operator>( const Version& rhs) const { return mData > rhs.mData; } + __hostdev__ bool operator>=(const Version& rhs) const { return mData >= rhs.mData; } + __hostdev__ uint32_t id() const { return mData; } + __hostdev__ uint32_t getMajor() const { return (mData >> 21) & ((1u << 11) - 1); } + __hostdev__ uint32_t getMinor() const { return (mData >> 10) & ((1u << 11) - 1); } + __hostdev__ uint32_t getPatch() const { return mData & ((1u << 10) - 1); } + __hostdev__ bool isCompatible() const { return this->getMajor() == uint32_t(NANOVDB_MAJOR_VERSION_NUMBER); } + /// @brief Returns the difference between major version of this instance and NANOVDB_MAJOR_VERSION_NUMBER + /// @return return 0 if the major version equals NANOVDB_MAJOR_VERSION_NUMBER, else a negative age if this + /// instance has a smaller major verion (is older), and a positive age if it is newer, i.e. larger. + __hostdev__ int age() const {return int(this->getMajor()) - int(NANOVDB_MAJOR_VERSION_NUMBER);} +}; // Version + +/// @brief print the verion number to a c-string +/// @param dst destination string of size 8 or more +/// @param v version to be printed +/// @return returns destination string @c dst +__hostdev__ inline char* toStr(char *dst, const Version &v) +{ + return util::sprint(dst, v.getMajor(), ".",v.getMinor(), ".",v.getPatch()); } -using Vec4R = Vec4; -using Vec4d = Vec4; -using Vec4f = Vec4; -using Vec4i = Vec4; - // ----------------------------> TensorTraits <-------------------------------------- -template::value || - is_specialization::value || - is_same::value) ? 1 : 0> +template::value || util::is_specialization::value || util::is_same::value) ? 
1 : 0> struct TensorTraits; template @@ -1354,438 +770,261 @@ struct FloatTraits }; template<> -struct FloatTraits +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = uint64_t; +}; + +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte { using FloatType = bool; }; -// ----------------------------> mapping ValueType -> GridType <-------------------------------------- +template<> +struct FloatTraits // size of empty class in C++ is 1 byte and not 0 byte +{ + using FloatType = double; +}; + +// ----------------------------> mapping BuildType -> GridType <-------------------------------------- -/// @brief Maps from a templated value type to a GridType enum +/// @brief Maps from a templated build type to a GridType enum template -__hostdev__ inline GridType mapToGridType() +__hostdev__ inline GridType toGridType() { - if (is_same::value) { // resolved at compile-time + if (util::is_same::value) { // resolved at compile-time return GridType::Float; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Double; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Int16; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Int32; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Int64; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Vec3f; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Vec3d; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::UInt32; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Mask; - } else if (is_same::value) { + } else if (util::is_same::value) { + return GridType::Half; + } else if (util::is_same::value) { + return GridType::Index; + } else if (util::is_same::value) { + return GridType::OnIndex; + } else if (util::is_same::value) { + return GridType::IndexMask; + } else if (util::is_same::value) { + return GridType::OnIndexMask; + } else if (util::is_same::value) { return GridType::Boolean; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::RGBA8; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Fp4; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Fp8; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Fp16; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::FpN; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Vec4f; - } else if (is_same::value) { + } else if (util::is_same::value) { return GridType::Vec4d; + } else if (util::is_same::value) { + return GridType::PointIndex; + } else if (util::is_same::value) { + return GridType::Vec3u8; + } else if (util::is_same::value) { + return GridType::Vec3u16; + } else if (util::is_same::value) { + return GridType::UInt8; } return GridType::Unknown; -} - -// ----------------------------> matMult 
<-------------------------------------- +}// toGridType -template -__hostdev__ inline Vec3T matMult(const float* mat, const Vec3T& xyz) -{ - return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[1], xyz[2] * mat[2])), - fmaf(xyz[0], mat[3], fmaf(xyz[1], mat[4], xyz[2] * mat[5])), - fmaf(xyz[0], mat[6], fmaf(xyz[1], mat[7], xyz[2] * mat[8]))); // 6 fmaf + 3 mult = 9 flops -} +template +[[deprecated("Use toGridType() instead.")]] +__hostdev__ inline GridType mapToGridType(){return toGridType();} -template -__hostdev__ inline Vec3T matMult(const double* mat, const Vec3T& xyz) -{ - return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), - fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), - fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops -} +// ----------------------------> mapping BuildType -> GridClass <-------------------------------------- -template -__hostdev__ inline Vec3T matMult(const float* mat, const float* vec, const Vec3T& xyz) -{ - return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[1], fmaf(xyz[2], mat[2], vec[0]))), - fmaf(xyz[0], mat[3], fmaf(xyz[1], mat[4], fmaf(xyz[2], mat[5], vec[1]))), - fmaf(xyz[0], mat[6], fmaf(xyz[1], mat[7], fmaf(xyz[2], mat[8], vec[2])))); // 9 fmaf = 9 flops +/// @brief Maps from a templated build type to a GridClass enum +template +__hostdev__ inline GridClass toGridClass(GridClass defaultClass = GridClass::Unknown) +{ + if (util::is_same::value) { + return GridClass::Topology; + } else if (BuildTraits::is_index) { + return GridClass::IndexGrid; + } else if (util::is_same::value) { + return GridClass::VoxelVolume; + } else if (util::is_same::value) { + return GridClass::PointIndex; + } + return defaultClass; } -template -__hostdev__ inline Vec3T matMult(const double* mat, const double* vec, const Vec3T& xyz) +template +[[deprecated("Use toGridClass() instead.")]] +__hostdev__ inline GridClass mapToGridClass(GridClass defaultClass = GridClass::Unknown) { - return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], fma(static_cast(xyz[2]), mat[2], vec[0]))), - fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[5], vec[1]))), - fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops + return toGridClass(); } -// matMultT: Multiply with the transpose: +// ----------------------------> BitFlags <-------------------------------------- -template -__hostdev__ inline Vec3T matMultT(const float* mat, const Vec3T& xyz) +template +struct BitArray; +template<> +struct BitArray<8> { - return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[3], xyz[2] * mat[6])), - fmaf(xyz[0], mat[1], fmaf(xyz[1], mat[4], xyz[2] * mat[7])), - fmaf(xyz[0], mat[2], fmaf(xyz[1], mat[5], xyz[2] * mat[8]))); // 6 fmaf + 3 mult = 9 flops -} - -template -__hostdev__ inline Vec3T matMultT(const double* mat, const Vec3T& xyz) + uint8_t mFlags{0}; +}; +template<> +struct BitArray<16> { - return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), - fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), - fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops -} - -template -__hostdev__ inline Vec3T matMultT(const float* 
mat, const float* vec, const Vec3T& xyz) + uint16_t mFlags{0}; +}; +template<> +struct BitArray<32> { - return Vec3T(fmaf(xyz[0], mat[0], fmaf(xyz[1], mat[3], fmaf(xyz[2], mat[6], vec[0]))), - fmaf(xyz[0], mat[1], fmaf(xyz[1], mat[4], fmaf(xyz[2], mat[7], vec[1]))), - fmaf(xyz[0], mat[2], fmaf(xyz[1], mat[5], fmaf(xyz[2], mat[8], vec[2])))); // 9 fmaf = 9 flops -} - -template -__hostdev__ inline Vec3T matMultT(const double* mat, const double* vec, const Vec3T& xyz) + uint32_t mFlags{0}; +}; +template<> +struct BitArray<64> { - return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], fma(static_cast(xyz[2]), mat[6], vec[0]))), - fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[7], vec[1]))), - fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops -} - -// ----------------------------> BBox <------------------------------------- + uint64_t mFlags{0}; +}; -// Base-class for static polymorphism (cannot be constructed directly) -template -struct BaseBBox +template +class BitFlags : public BitArray { - Vec3T mCoord[2]; - __hostdev__ bool operator==(const BaseBBox& rhs) const { return mCoord[0] == rhs.mCoord[0] && mCoord[1] == rhs.mCoord[1]; }; - __hostdev__ bool operator!=(const BaseBBox& rhs) const { return mCoord[0] != rhs.mCoord[0] || mCoord[1] != rhs.mCoord[1]; }; - __hostdev__ const Vec3T& operator[](int i) const { return mCoord[i]; } - __hostdev__ Vec3T& operator[](int i) { return mCoord[i]; } - __hostdev__ Vec3T& min() { return mCoord[0]; } - __hostdev__ Vec3T& max() { return mCoord[1]; } - __hostdev__ const Vec3T& min() const { return mCoord[0]; } - __hostdev__ const Vec3T& max() const { return mCoord[1]; } - __hostdev__ Coord& translate(const Vec3T& xyz) - { - mCoord[0] += xyz; - mCoord[1] += xyz; - return *this; +protected: + using BitArray::mFlags; + +public: + using Type = decltype(mFlags); + BitFlags() {} + BitFlags(Type mask) : BitArray{mask} {} + BitFlags(std::initializer_list list) + { + for (auto bit : list) mFlags |= static_cast(1 << bit); } - // @brief Expand this bounding box to enclose point (i, j, k). 
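// A small sketch of the compile-time build-type mapping provided by toGridType()/toGridClass()
// above; buildTypeMappingSketch is an illustrative name only and assumes the declarations above
// are visible. ValueOnIndex is the build type behind index grids: it maps to GridType::OnIndex
// and GridClass::IndexGrid, while a plain float build type maps to GridType::Float.
inline bool buildTypeMappingSketch()
{
    const bool scalarOk = toGridType<float>() == GridType::Float;
    const bool indexOk  = toGridType<ValueOnIndex>()  == GridType::OnIndex &&
                          toGridClass<ValueOnIndex>() == GridClass::IndexGrid;
    return scalarOk && indexOk;
}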
- __hostdev__ BaseBBox& expand(const Vec3T& xyz) + template + BitFlags(std::initializer_list list) { - mCoord[0].minComponent(xyz); - mCoord[1].maxComponent(xyz); - return *this; + for (auto mask : list) mFlags |= static_cast(mask); } - //__hostdev__ BaseBBox expandBy(typename Vec3T::ValueType padding) const - //{ - // return BaseBBox(mCoord[0].offsetBy(-padding),mCoord[1].offsetBy(padding)); - //} - __hostdev__ bool isInside(const Vec3T& xyz) + __hostdev__ Type data() const { return mFlags; } + __hostdev__ Type& data() { return mFlags; } + __hostdev__ void initBit(std::initializer_list list) { - if (xyz[0] < mCoord[0][0] || xyz[1] < mCoord[0][1] || xyz[2] < mCoord[0][2]) - return false; - if (xyz[0] > mCoord[1][0] || xyz[1] > mCoord[1][1] || xyz[2] > mCoord[1][2]) - return false; - return true; + mFlags = 0u; + for (auto bit : list) mFlags |= static_cast(1 << bit); } - -protected: - __hostdev__ BaseBBox() {} - __hostdev__ BaseBBox(const Vec3T& min, const Vec3T& max) - : mCoord{min, max} + template + __hostdev__ void initMask(std::initializer_list list) { + mFlags = 0u; + for (auto mask : list) mFlags |= static_cast(mask); } -}; // BaseBBox + //__hostdev__ Type& data() { return mFlags; } + //__hostdev__ Type data() const { return mFlags; } + __hostdev__ Type getFlags() const { return mFlags & (static_cast(GridFlags::End) - 1u); } // mask out everything except relevant bits -template::value> -struct BBox; + __hostdev__ void setOn() { mFlags = ~Type(0u); } + __hostdev__ void setOff() { mFlags = Type(0u); } -/// @brief Partial template specialization for floating point coordinate types. -/// -/// @note Min is inclusive and max is exclusive. If min = max the dimension of -/// the bounding box is zero and therefore it is also empty. -template -struct BBox : public BaseBBox -{ - using Vec3Type = Vec3T; - using ValueType = typename Vec3T::ValueType; - static_assert(is_floating_point::value, "Expected a floating point coordinate type"); - using BaseT = BaseBBox; - using BaseT::mCoord; - __hostdev__ BBox() - : BaseT(Vec3T( Maximum::value()), - Vec3T(-Maximum::value())) + __hostdev__ void setBitOn(uint8_t bit) { mFlags |= static_cast(1 << bit); } + __hostdev__ void setBitOff(uint8_t bit) { mFlags &= ~static_cast(1 << bit); } + + __hostdev__ void setBitOn(std::initializer_list list) { + for (auto bit : list) mFlags |= static_cast(1 << bit); } - __hostdev__ BBox(const Vec3T& min, const Vec3T& max) - : BaseT(min, max) + __hostdev__ void setBitOff(std::initializer_list list) { + for (auto bit : list) mFlags &= ~static_cast(1 << bit); } - __hostdev__ BBox(const Coord& min, const Coord& max) - : BaseT(Vec3T(ValueType(min[0]), ValueType(min[1]), ValueType(min[2])), - Vec3T(ValueType(max[0] + 1), ValueType(max[1] + 1), ValueType(max[2] + 1))) + + template + __hostdev__ void setMaskOn(MaskT mask) { mFlags |= static_cast(mask); } + template + __hostdev__ void setMaskOff(MaskT mask) { mFlags &= ~static_cast(mask); } + + template + __hostdev__ void setMaskOn(std::initializer_list list) { + for (auto mask : list) mFlags |= static_cast(mask); } - __hostdev__ BBox(const BaseBBox& bbox) : BBox(bbox[0], bbox[1]) {} - __hostdev__ bool empty() const { return mCoord[0][0] >= mCoord[1][0] || - mCoord[0][1] >= mCoord[1][1] || - mCoord[0][2] >= mCoord[1][2]; } - __hostdev__ Vec3T dim() const { return this->empty() ? 
Vec3T(0) : this->max() - this->min(); } - __hostdev__ bool isInside(const Vec3T& p) const + template + __hostdev__ void setMaskOff(std::initializer_list list) { - return p[0] > mCoord[0][0] && p[1] > mCoord[0][1] && p[2] > mCoord[0][2] && - p[0] < mCoord[1][0] && p[1] < mCoord[1][1] && p[2] < mCoord[1][2]; + for (auto mask : list) mFlags &= ~static_cast(mask); } -};// BBox -/// @brief Partial template specialization for integer coordinate types -/// -/// @note Both min and max are INCLUDED in the bbox so dim = max - min + 1. So, -/// if min = max the bounding box contains exactly one point and dim = 1! -template -struct BBox : public BaseBBox -{ - static_assert(is_same::value, "Expected \"int\" coordinate type"); - using BaseT = BaseBBox; - using BaseT::mCoord; - /// @brief Iterator over the domain covered by a BBox - /// @details z is the fastest-moving coordinate. - class Iterator + __hostdev__ void setBit(uint8_t bit, bool on) { on ? this->setBitOn(bit) : this->setBitOff(bit); } + template + __hostdev__ void setMask(MaskT mask, bool on) { on ? this->setMaskOn(mask) : this->setMaskOff(mask); } + + __hostdev__ bool isOn() const { return mFlags == ~Type(0u); } + __hostdev__ bool isOff() const { return mFlags == Type(0u); } + __hostdev__ bool isBitOn(uint8_t bit) const { return 0 != (mFlags & static_cast(1 << bit)); } + __hostdev__ bool isBitOff(uint8_t bit) const { return 0 == (mFlags & static_cast(1 << bit)); } + template + __hostdev__ bool isMaskOn(MaskT mask) const { return 0 != (mFlags & static_cast(mask)); } + template + __hostdev__ bool isMaskOff(MaskT mask) const { return 0 == (mFlags & static_cast(mask)); } + /// @brief return true if any of the masks in the list are on + template + __hostdev__ bool isMaskOn(std::initializer_list list) const { - const BBox& mBBox; - CoordT mPos; - public: - __hostdev__ Iterator(const BBox& b) - : mBBox(b) - , mPos(b.min()) - { - } - __hostdev__ Iterator& operator++() - { - if (mPos[2] < mBBox[1][2]) {// this is the most common case - ++mPos[2]; - } else if (mPos[1] < mBBox[1][1]) { - mPos[2] = mBBox[0][2]; - ++mPos[1]; - } else if (mPos[0] <= mBBox[1][0]) { - mPos[2] = mBBox[0][2]; - mPos[1] = mBBox[0][1]; - ++mPos[0]; - } - return *this; - } - __hostdev__ Iterator operator++(int) - { - auto tmp = *this; - ++(*this); - return tmp; + for (auto mask : list) { + if (0 != (mFlags & static_cast(mask))) return true; } - /// @brief Return @c true if the iterator still points to a valid coordinate. - __hostdev__ operator bool() const { return mPos[0] <= mBBox[1][0]; } - __hostdev__ const CoordT& operator*() const { return mPos; } - }; // Iterator - __hostdev__ Iterator begin() const { return Iterator{*this}; } - __hostdev__ BBox() - : BaseT(CoordT::max(), CoordT::min()) - { - } - __hostdev__ BBox(const CoordT& min, const CoordT& max) - : BaseT(min, max) - { - } - template - __hostdev__ BBox(BBox& other, const SplitT&) - : BaseT(other.mCoord[0], other.mCoord[1]) - { - NANOVDB_ASSERT(this->is_divisible()); - const int n = MaxIndex(this->dim()); - mCoord[1][n] = (mCoord[0][n] + mCoord[1][n]) >> 1; - other.mCoord[0][n] = mCoord[1][n] + 1; - } - __hostdev__ bool is_divisible() const { return mCoord[0][0] < mCoord[1][0] && - mCoord[0][1] < mCoord[1][1] && - mCoord[0][2] < mCoord[1][2]; } - /// @brief Return true if this bounding box is empty, i.e. uninitialized - __hostdev__ bool empty() const { return mCoord[0][0] > mCoord[1][0] || - mCoord[0][1] > mCoord[1][1] || - mCoord[0][2] > mCoord[1][2]; } - __hostdev__ CoordT dim() const { return this->empty() ? 
Coord(0) : this->max() - this->min() + Coord(1); } - __hostdev__ uint64_t volume() const { auto d = this->dim(); return uint64_t(d[0])*uint64_t(d[1])*uint64_t(d[2]); } - __hostdev__ bool isInside(const CoordT& p) const { return !(CoordT::lessThan(p, this->min()) || CoordT::lessThan(this->max(), p)); } - __hostdev__ bool isInside(const BBox& b) const - { - return !(CoordT::lessThan(b.min(), this->min()) || CoordT::lessThan(this->max(), b.max())); + return false; } - - /// @warning This converts a CoordBBox into a floating-point bounding box which implies that max += 1 ! - template - __hostdev__ BBox> asReal() const + /// @brief return true if any of the masks in the list are off + template + __hostdev__ bool isMaskOff(std::initializer_list list) const { - static_assert(is_floating_point::value, "CoordBBox::asReal: Expected a floating point coordinate"); - return BBox>(Vec3(RealT(mCoord[0][0]), RealT(mCoord[0][1]), RealT(mCoord[0][2])), - Vec3(RealT(mCoord[1][0] + 1), RealT(mCoord[1][1] + 1), RealT(mCoord[1][2] + 1))); + for (auto mask : list) { + if (0 == (mFlags & static_cast(mask))) return true; + } + return false; } - /// @brief Return a new instance that is expanded by the specified padding. - __hostdev__ BBox expandBy(typename CoordT::ValueType padding) const + /// @brief required for backwards compatibility + __hostdev__ BitFlags& operator=(Type n) { - return BBox(mCoord[0].offsetBy(-padding), mCoord[1].offsetBy(padding)); + mFlags = n; + return *this; } -};// BBox - -using CoordBBox = BBox; -using BBoxR = BBox; - -// -------------------> Find lowest and highest bit in a word <---------------------------- - -/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 32 bit word -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! -NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ static inline uint32_t FindLowestOn(uint32_t v) -{ - NANOVDB_ASSERT(v); -#if defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanForward(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return static_cast(__builtin_ctzl(v)); -#else - static const unsigned char DeBruijn[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; -// disable unary minus on unsigned warning -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(push) -#pragma warning(disable : 4146) -#endif - return DeBruijn[uint32_t((v & -v) * 0x077CB531U) >> 27]; -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(pop) -#endif - -#endif -} - -/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 32 bit word -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! 
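// A minimal sketch of BitFlags used with an enum-class mask, mirroring how GridData later stores
// its GridFlags in a BitFlags<32>; DemoFlags and bitFlagsSketch are illustrative names only.
enum class DemoFlags : uint32_t { HasMinMax = 1u << 0, HasBBox = 1u << 1 };
inline bool bitFlagsSketch()
{
    BitFlags<32> flags;                       // default-constructed: all bits off
    flags.setMask(DemoFlags::HasBBox, true);  // set a whole mask ...
    flags.setBitOn(0);                        // ... or address a single bit (here the HasMinMax bit)
    return flags.isMaskOn(DemoFlags::HasBBox) && flags.isMaskOn(DemoFlags::HasMinMax) && !flags.isOff();
}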
-NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ static inline uint32_t FindHighestOn(uint32_t v) -{ - NANOVDB_ASSERT(v); -#if defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanReverse(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return sizeof(unsigned long) * 8 - 1 - __builtin_clzl(v); - -#else - static const unsigned char DeBruijn[32] = { - 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; - v |= v >> 1; // first round down to one less than a power of 2 - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return DeBruijn[uint32_t(v * 0x07C4ACDDU) >> 27]; -#endif -} - -/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 64 bit word -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! -NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ static inline uint32_t FindLowestOn(uint64_t v) -{ - NANOVDB_ASSERT(v); -#if defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanForward64(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return static_cast(__builtin_ctzll(v)); -#else - static const unsigned char DeBruijn[64] = { - 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, - 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, - 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, - 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12, - }; -// disable unary minus on unsigned warning -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(push) -#pragma warning(disable : 4146) -#endif - return DeBruijn[uint64_t((v & -v) * UINT64_C(0x022FDD63CC95386D)) >> 58]; -#if defined(_MSC_VER) && !defined(__NVCC__) -#pragma warning(pop) -#endif - -#endif -} - -/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 64 bit word -/// -/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! -NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ static inline uint32_t FindHighestOn(uint64_t v) -{ - NANOVDB_ASSERT(v); -#if defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) - unsigned long index; - _BitScanReverse64(&index, v); - return static_cast(index); -#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) - return sizeof(unsigned long) * 8 - 1 - __builtin_clzll(v); -#else - const uint32_t* p = reinterpret_cast(&v); - return p[1] ? 
32u + FindHighestOn(p[1]) : FindHighestOn(p[0]); -#endif -} - -// ----------------------------> CountOn <-------------------------------------- - -/// @return Number of bits that are on in the specified 64-bit word -NANOVDB_HOSTDEV_DISABLE_WARNING -__hostdev__ inline uint32_t CountOn(uint64_t v) -{ -// __popcnt* intrinsic support was added in VS 2019 16.8 -#if defined(_MSC_VER) && defined(_M_X64) && (_MSC_VER >= 1928) - v = __popcnt64(v); -#elif (defined(__GNUC__) || defined(__clang__)) - v = __builtin_popcountll(v); -#else - // Software Implementation - v = v - ((v >> 1) & uint64_t(0x5555555555555555)); - v = (v & uint64_t(0x3333333333333333)) + ((v >> 2) & uint64_t(0x3333333333333333)); - v = (((v + (v >> 4)) & uint64_t(0xF0F0F0F0F0F0F0F)) * uint64_t(0x101010101010101)) >> 56; -#endif - return static_cast(v); -} +}; // BitFlags // ----------------------------> Mask <-------------------------------------- @@ -1794,11 +1033,10 @@ __hostdev__ inline uint32_t CountOn(uint64_t v) template class Mask { +public: static constexpr uint32_t SIZE = 1U << (3 * LOG2DIM); // Number of bits in mask static constexpr uint32_t WORD_COUNT = SIZE >> 6; // Number of 64 bit words - uint64_t mWords[WORD_COUNT]; -public: /// @brief Return the memory footprint in bytes of this Mask __hostdev__ static size_t memUsage() { return sizeof(Mask); } @@ -1808,14 +1046,25 @@ class Mask /// @brief Return the number of machine words used by this Mask __hostdev__ static uint32_t wordCount() { return WORD_COUNT; } + /// @brief Return the total number of set bits in this Mask __hostdev__ uint32_t countOn() const { - uint32_t sum = 0, n = WORD_COUNT; + uint32_t sum = 0; + for (const uint64_t *w = mWords, *q = w + WORD_COUNT; w != q; ++w) + sum += util::countOn(*w); + return sum; + } + + /// @brief Return the number of lower set bits in mask up to but excluding the i'th bit + inline __hostdev__ uint32_t countOn(uint32_t i) const + { + uint32_t n = i >> 6, sum = util::countOn(mWords[n] & ((uint64_t(1) << (i & 63u)) - 1u)); for (const uint64_t* w = mWords; n--; ++w) - sum += CountOn(*w); + sum += util::countOn(*w); return sum; } + template class Iterator { public: @@ -1831,17 +1080,60 @@ class Mask } Iterator& operator=(const Iterator&) = default; __hostdev__ uint32_t operator*() const { return mPos; } - __hostdev__ operator bool() const { return mPos != Mask::SIZE; } + __hostdev__ uint32_t pos() const { return mPos; } + __hostdev__ operator bool() const { return mPos != Mask::SIZE; } __hostdev__ Iterator& operator++() { - mPos = mParent->findNextOn(mPos + 1); + mPos = mParent->findNext(mPos + 1); return *this; } + __hostdev__ Iterator operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } private: uint32_t mPos; const Mask* mParent; - }; // Member class MaskIterator + }; // Member class Iterator + + class DenseIterator + { + public: + __hostdev__ DenseIterator(uint32_t pos = Mask::SIZE) + : mPos(pos) + { + } + DenseIterator& operator=(const DenseIterator&) = default; + __hostdev__ uint32_t operator*() const { return mPos; } + __hostdev__ uint32_t pos() const { return mPos; } + __hostdev__ operator bool() const { return mPos != Mask::SIZE; } + __hostdev__ DenseIterator& operator++() + { + ++mPos; + return *this; + } + __hostdev__ DenseIterator operator++(int) + { + auto tmp = *this; + ++mPos; + return tmp; + } + + private: + uint32_t mPos; + }; // Member class DenseIterator + + using OnIterator = Iterator; + using OffIterator = Iterator; + + __hostdev__ OnIterator beginOn() const { return 
OnIterator(this->findFirst(), this); } + + __hostdev__ OffIterator beginOff() const { return OffIterator(this->findFirst(), this); } + + __hostdev__ DenseIterator beginAll() const { return DenseIterator(0); } /// @brief Initialize all bits to zero. __hostdev__ Mask() @@ -1863,44 +1155,44 @@ class Mask mWords[i] = other.mWords[i]; } - /// @brief Return the nth word of the bit mask, for a word of arbitrary size. - template - __hostdev__ WordT getWord(int n) const - { - NANOVDB_ASSERT(n * 8 * sizeof(WordT) < SIZE); - return reinterpret_cast(mWords)[n]; - } + /// @brief Return a pointer to the list of words of the bit mask + __hostdev__ uint64_t* words() { return mWords; } + __hostdev__ const uint64_t* words() const { return mWords; } /// @brief Assignment operator that works with openvdb::util::NodeMask - template - __hostdev__ Mask& operator=(const MaskT& other) + template + __hostdev__ typename util::enable_if::value, Mask&>::type operator=(const MaskT& other) { static_assert(sizeof(Mask) == sizeof(MaskT), "Mismatching sizeof"); static_assert(WORD_COUNT == MaskT::WORD_COUNT, "Mismatching word count"); static_assert(LOG2DIM == MaskT::LOG2DIM, "Mismatching LOG2DIM"); - auto *src = reinterpret_cast(&other); - uint64_t *dst = mWords; - for (uint32_t i = 0; i < WORD_COUNT; ++i) { - *dst++ = *src++; - } + auto* src = reinterpret_cast(&other); + for (uint64_t *dst = mWords, *end = dst + WORD_COUNT; dst != end; ++dst) + *dst = *src++; return *this; } + //__hostdev__ Mask& operator=(const Mask& other){return *util::memcpy(this, &other);} + Mask& operator=(const Mask&) = default; + __hostdev__ bool operator==(const Mask& other) const { for (uint32_t i = 0; i < WORD_COUNT; ++i) { - if (mWords[i] != other.mWords[i]) return false; + if (mWords[i] != other.mWords[i]) + return false; } return true; } __hostdev__ bool operator!=(const Mask& other) const { return !((*this) == other); } - __hostdev__ Iterator beginOn() const { return Iterator(this->findFirstOn(), this); } - /// @brief Return true if the given bit is set. __hostdev__ bool isOn(uint32_t n) const { return 0 != (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } + /// @brief Return true if the given bit is NOT set. + __hostdev__ bool isOff(uint32_t n) const { return 0 == (mWords[n >> 6] & (uint64_t(1) << (n & 63))); } + + /// @brief Return true if all the bits are set in this Mask. __hostdev__ bool isOn() const { for (uint32_t i = 0; i < WORD_COUNT; ++i) @@ -1909,6 +1201,7 @@ class Mask return true; } + /// @brief Return true if none of the bits are set in this Mask. __hostdev__ bool isOff() const { for (uint32_t i = 0; i < WORD_COUNT; ++i) @@ -1917,86 +1210,144 @@ class Mask return true; } - /// @brief Set the given bit on. + /// @brief Set the specified bit on. __hostdev__ void setOn(uint32_t n) { mWords[n >> 6] |= uint64_t(1) << (n & 63); } + /// @brief Set the specified bit off. __hostdev__ void setOff(uint32_t n) { mWords[n >> 6] &= ~(uint64_t(1) << (n & 63)); } - __hostdev__ void set(uint32_t n, bool On) +#if defined(__CUDACC__) // the following functions only run on the GPU! + __device__ inline void setOnAtomic(uint32_t n) + { + atomicOr(reinterpret_cast(this) + (n >> 6), 1ull << (n & 63)); + } + __device__ inline void setOffAtomic(uint32_t n) + { + atomicAnd(reinterpret_cast(this) + (n >> 6), ~(1ull << (n & 63))); + } + __device__ inline void setAtomic(uint32_t n, bool on) + { + on ? this->setOnAtomic(n) : this->setOffAtomic(n); + } +#endif + /// @brief Set the specified bit on or off. 
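// A sketch of the two Mask features that matter most for index grids: countOn(i) is the prefix
// count (number of set bits strictly below bit i) that turns an active voxel's bit position into
// an offset into an external value array, and the __CUDACC__-guarded atomic setters let concurrent
// GPU threads activate bits in the same 512-bit leaf mask safely. maskPrefixCountSketch and
// activateBitsSketch are illustrative names only.
inline bool maskPrefixCountSketch()
{
    Mask<3> mask;            // 8^3 = 512 bits, zero-initialized, e.g. a leaf node's value mask
    mask.setOn(2);
    mask.setOn(5);
    mask.setOn(130);         // lives in word 2 (130 >> 6), bit 2 (130 & 63)
    return mask.countOn()    == 3u &&  // total population count
           mask.countOn(5)   == 1u &&  // only bit 2 lies below bit 5
           mask.countOn(131) == 3u;    // bits 2, 5 and 130 all lie below bit 131
}
#if defined(__CUDACC__)
__global__ void activateBitsSketch(Mask<3>* leafMask, const uint32_t* bits, int count)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < count) leafMask->setOnAtomic(bits[tid]); // atomicOr on the containing 64-bit word
}
#endif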
+ __hostdev__ void set(uint32_t n, bool on) { -#if 1 // switch between branchless - auto &word = mWords[n >> 6]; +#if 1 // switch between branchless + auto& word = mWords[n >> 6]; n &= 63; word &= ~(uint64_t(1) << n); - word |= uint64_t(On) << n; + word |= uint64_t(on) << n; #else - On ? this->setOn(n) : this->setOff(n); + on ? this->setOn(n) : this->setOff(n); #endif } /// @brief Set all bits on __hostdev__ void setOn() { - for (uint32_t i = 0; i < WORD_COUNT; ++i) - mWords[i] = ~uint64_t(0); + for (uint32_t i = 0; i < WORD_COUNT; ++i)mWords[i] = ~uint64_t(0); } /// @brief Set all bits off __hostdev__ void setOff() { - for (uint32_t i = 0; i < WORD_COUNT; ++i) - mWords[i] = uint64_t(0); + for (uint32_t i = 0; i < WORD_COUNT; ++i) mWords[i] = uint64_t(0); } /// @brief Set all bits off __hostdev__ void set(bool on) { const uint64_t v = on ? ~uint64_t(0) : uint64_t(0); - for (uint32_t i = 0; i < WORD_COUNT; ++i) - mWords[i] = v; + for (uint32_t i = 0; i < WORD_COUNT; ++i) mWords[i] = v; } /// brief Toggle the state of all bits in the mask __hostdev__ void toggle() { uint32_t n = WORD_COUNT; - for (auto* w = mWords; n--; ++w) - *w = ~*w; + for (auto* w = mWords; n--; ++w) *w = ~*w; } __hostdev__ void toggle(uint32_t n) { mWords[n >> 6] ^= uint64_t(1) << (n & 63); } -private: + /// @brief Bitwise intersection + __hostdev__ Mask& operator&=(const Mask& other) + { + uint64_t* w1 = mWords; + const uint64_t* w2 = other.mWords; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 &= *w2; + return *this; + } + /// @brief Bitwise union + __hostdev__ Mask& operator|=(const Mask& other) + { + uint64_t* w1 = mWords; + const uint64_t* w2 = other.mWords; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 |= *w2; + return *this; + } + /// @brief Bitwise difference + __hostdev__ Mask& operator-=(const Mask& other) + { + uint64_t* w1 = mWords; + const uint64_t* w2 = other.mWords; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 &= ~*w2; + return *this; + } + /// @brief Bitwise XOR + __hostdev__ Mask& operator^=(const Mask& other) + { + uint64_t* w1 = mWords; + const uint64_t* w2 = other.mWords; + for (uint32_t n = WORD_COUNT; n--; ++w1, ++w2) *w1 ^= *w2; + return *this; + } NANOVDB_HOSTDEV_DISABLE_WARNING - __hostdev__ uint32_t findFirstOn() const + template + __hostdev__ uint32_t findFirst() const { - uint32_t n = 0; + uint32_t n = 0u; const uint64_t* w = mWords; - for (; n < WORD_COUNT && !*w; ++w, ++n) - ; - return n == WORD_COUNT ? SIZE : (n << 6) + FindLowestOn(*w); + for (; n < WORD_COUNT && !(ON ? *w : ~*w); ++w, ++n); + return n < WORD_COUNT ? (n << 6) + util::findLowestOn(ON ? *w : ~*w) : SIZE; + } + + NANOVDB_HOSTDEV_DISABLE_WARNING + template + __hostdev__ uint32_t findNext(uint32_t start) const + { + uint32_t n = start >> 6; // initiate + if (n >= WORD_COUNT) return SIZE; // check for out of bounds + uint32_t m = start & 63u; + uint64_t b = ON ? mWords[n] : ~mWords[n]; + if (b & (uint64_t(1u) << m)) return start; // simple case: start is on/off + b &= ~uint64_t(0u) << m; // mask out lower bits + while (!b && ++n < WORD_COUNT) b = ON ? mWords[n] : ~mWords[n]; // find next non-zero word + return b ? 
(n << 6) + util::findLowestOn(b) : SIZE; // catch last word=0 } NANOVDB_HOSTDEV_DISABLE_WARNING - __hostdev__ uint32_t findNextOn(uint32_t start) const + template + __hostdev__ uint32_t findPrev(uint32_t start) const { uint32_t n = start >> 6; // initiate - if (n >= WORD_COUNT) - return SIZE; // check for out of bounds - uint32_t m = start & 63; - uint64_t b = mWords[n]; - if (b & (uint64_t(1) << m)) - return start; // simple case: start is on - b &= ~uint64_t(0) << m; // mask out lower bits - while (!b && ++n < WORD_COUNT) - b = mWords[n]; // find next non-zero word - return (!b ? SIZE : (n << 6) + FindLowestOn(b)); // catch last word=0 + if (n >= WORD_COUNT) return SIZE; // check for out of bounds + uint32_t m = start & 63u; + uint64_t b = ON ? mWords[n] : ~mWords[n]; + if (b & (uint64_t(1u) << m)) return start; // simple case: start is on/off + b &= (uint64_t(1u) << m) - 1u; // mask out higher bits + while (!b && n) b = ON ? mWords[--n] : ~mWords[--n]; // find previous non-zero word + return b ? (n << 6) + util::findHighestOn(b) : SIZE; // catch first word=0 } + +private: + uint64_t mWords[WORD_COUNT]; }; // Mask class // ----------------------------> Map <-------------------------------------- /// @brief Defines an affine transform and its inverse represented as a 3x3 matrix and a vec3 translation struct Map -{ +{ // 264B (not 32B aligned!) float mMatF[9]; // 9*4B <- 3x3 matrix float mInvMatF[9]; // 9*4B <- 3x3 matrix float mVecF[3]; // 3*4B <- translation @@ -2006,93 +1357,241 @@ struct Map double mVecD[3]; // 3*8B <- translation double mTaperD; // 8B, placeholder for taper value - // This method can only be called on the host to initialize the member data + /// @brief Default constructor for the identity map + __hostdev__ Map() + : mMatF{ 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} + , mInvMatF{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f} + , mVecF{0.0f, 0.0f, 0.0f} + , mTaperF{1.0f} + , mMatD{ 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0} + , mInvMatD{1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0} + , mVecD{0.0, 0.0, 0.0} + , mTaperD{1.0} + { + } + __hostdev__ Map(double s, const Vec3d& t = Vec3d(0.0, 0.0, 0.0)) + : mMatF{float(s), 0.0f, 0.0f, 0.0f, float(s), 0.0f, 0.0f, 0.0f, float(s)} + , mInvMatF{1.0f / float(s), 0.0f, 0.0f, 0.0f, 1.0f / float(s), 0.0f, 0.0f, 0.0f, 1.0f / float(s)} + , mVecF{float(t[0]), float(t[1]), float(t[2])} + , mTaperF{1.0f} + , mMatD{s, 0.0, 0.0, 0.0, s, 0.0, 0.0, 0.0, s} + , mInvMatD{1.0 / s, 0.0, 0.0, 0.0, 1.0 / s, 0.0, 0.0, 0.0, 1.0 / s} + , mVecD{t[0], t[1], t[2]} + , mTaperD{1.0} + { + } + + /// @brief Initialize the member data from 3x3 or 4x4 matrices + /// @note This is not _hostdev__ since then MatT=openvdb::Mat4d will produce warnings + template + void set(const MatT& mat, const MatT& invMat, const Vec3T& translate, double taper = 1.0); + + /// @brief Initialize the member data from 4x4 matrices + /// @note The last (4th) row of invMat is actually ignored. + /// This is not _hostdev__ since then Mat4T=openvdb::Mat4d will produce warnings template - __hostdev__ void set(const Mat4T& mat, const Mat4T& invMat, double taper); + void set(const Mat4T& mat, const Mat4T& invMat, double taper = 1.0) { this->set(mat, invMat, mat[3], taper); } template - __hostdev__ Vec3T applyMap(const Vec3T& xyz) const { return matMult(mMatD, mVecD, xyz); } + void set(double scale, const Vec3T& translation, double taper = 1.0); + + /// @brief Apply the forward affine transformation to a vector using 64bit floating point arithmetics. 
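// A short usage sketch for the Mask iterators defined above: beginOn() walks exactly the bits
// reported by findFirst<true>()/findNext<true>(), so the loop below visits the same bits that
// countOn() tallies; onBitIterationSketch is an illustrative name only.
inline uint32_t onBitIterationSketch(const Mask<3>& mask)
{
    uint32_t n = 0;
    for (auto it = mask.beginOn(); it; ++it) ++n; // *it yields the bit index of each on-bit
    return n; // equals mask.countOn()
}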
+ /// @note Typically this operation is used for the scale, rotation and translation of index -> world mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return Forward mapping for affine transformation, i.e. (mat x ijk) + translation template - __hostdev__ Vec3T applyMapF(const Vec3T& xyz) const { return matMult(mMatF, mVecF, xyz); } + __hostdev__ Vec3T applyMap(const Vec3T& ijk) const { return math::matMult(mMatD, mVecD, ijk); } + /// @brief Apply the forward affine transformation to a vector using 32bit floating point arithmetics. + /// @note Typically this operation is used for the scale, rotation and translation of index -> world mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return Forward mapping for affine transformation, i.e. (mat x ijk) + translation template - __hostdev__ Vec3T applyJacobian(const Vec3T& xyz) const { return matMult(mMatD, xyz); } + __hostdev__ Vec3T applyMapF(const Vec3T& ijk) const { return math::matMult(mMatF, mVecF, ijk); } + + /// @brief Apply the linear forward 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, + /// e.g. scale and rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from index -> world mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear forward 3x3 mapping of the input vector template - __hostdev__ Vec3T applyJacobianF(const Vec3T& xyz) const { return matMult(mMatF, xyz); } + __hostdev__ Vec3T applyJacobian(const Vec3T& ijk) const { return math::matMult(mMatD, ijk); } + + /// @brief Apply the linear forward 3x3 transformation to an input 3d vector using 32bit floating point arithmetics, + /// e.g. scale and rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from index -> world mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear forward 3x3 mapping of the input vector + template + __hostdev__ Vec3T applyJacobianF(const Vec3T& ijk) const { return math::matMult(mMatF, ijk); } + /// @brief Apply the inverse affine mapping to a vector using 64bit floating point arithmetics. + /// @note Typically this operation is used for the world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param xyz 3D vector to be mapped - typically floating point world coordinates + /// @return Inverse affine mapping of the input @c xyz i.e. (xyz - translation) x mat^-1 template __hostdev__ Vec3T applyInverseMap(const Vec3T& xyz) const { - return matMult(mInvMatD, Vec3T(xyz[0] - mVecD[0], xyz[1] - mVecD[1], xyz[2] - mVecD[2])); + return math::matMult(mInvMatD, Vec3T(xyz[0] - mVecD[0], xyz[1] - mVecD[1], xyz[2] - mVecD[2])); } + + /// @brief Apply the inverse affine mapping to a vector using 32bit floating point arithmetics. + /// @note Typically this operation is used for the world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param xyz 3D vector to be mapped - typically floating point world coordinates + /// @return Inverse affine mapping of the input @c xyz i.e. 
(xyz - translation) x mat^-1 template __hostdev__ Vec3T applyInverseMapF(const Vec3T& xyz) const { - return matMult(mInvMatF, Vec3T(xyz[0] - mVecF[0], xyz[1] - mVecF[1], xyz[2] - mVecF[2])); + return math::matMult(mInvMatF, Vec3T(xyz[0] - mVecF[0], xyz[1] - mVecF[1], xyz[2] - mVecF[2])); } + /// @brief Apply the linear inverse 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, + /// e.g. inverse scale and inverse rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear inverse 3x3 mapping of the input vector i.e. xyz x mat^-1 template - __hostdev__ Vec3T applyInverseJacobian(const Vec3T& xyz) const { return matMult(mInvMatD, xyz); } + __hostdev__ Vec3T applyInverseJacobian(const Vec3T& xyz) const { return math::matMult(mInvMatD, xyz); } + + /// @brief Apply the linear inverse 3x3 transformation to an input 3d vector using 32bit floating point arithmetics, + /// e.g. inverse scale and inverse rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear inverse 3x3 mapping of the input vector i.e. xyz x mat^-1 template - __hostdev__ Vec3T applyInverseJacobianF(const Vec3T& xyz) const { return matMult(mInvMatF, xyz); } - + __hostdev__ Vec3T applyInverseJacobianF(const Vec3T& xyz) const { return math::matMult(mInvMatF, xyz); } + + /// @brief Apply the transposed inverse 3x3 transformation to an input 3d vector using 64bit floating point arithmetics, + /// e.g. inverse scale and inverse rotation WITHOUT translation. + /// @note Typically this operation is used for scale and rotation from world -> index mapping + /// @tparam Vec3T Template type of the 3D vector to be mapped + /// @param ijk 3D vector to be mapped - typically floating point index coordinates + /// @return linear inverse 3x3 mapping of the input vector i.e. 
xyz x mat^-1 template - __hostdev__ Vec3T applyIJT(const Vec3T& xyz) const { return matMultT(mInvMatD, xyz); } + __hostdev__ Vec3T applyIJT(const Vec3T& xyz) const { return math::matMultT(mInvMatD, xyz); } template - __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return matMultT(mInvMatF, xyz); } + __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return math::matMultT(mInvMatF, xyz); } + + /// @brief Return a voxels size in each coordinate direction, measured at the origin + __hostdev__ Vec3d getVoxelSize() const { return this->applyMap(Vec3d(1)) - this->applyMap(Vec3d(0)); } }; // Map -template -__hostdev__ void Map::set(const Mat4T& mat, const Mat4T& invMat, double taper) +template +inline void Map::set(const MatT& mat, const MatT& invMat, const Vec3T& translate, double taper) { - float * mf = mMatF, *vf = mVecF; - float* mif = mInvMatF; - double *md = mMatD, *vd = mVecD; - double* mid = mInvMatD; + float * mf = mMatF, *vf = mVecF, *mif = mInvMatF; + double *md = mMatD, *vd = mVecD, *mid = mInvMatD; mTaperF = static_cast(taper); mTaperD = taper; for (int i = 0; i < 3; ++i) { - *vd++ = mat[3][i]; //translation - *vf++ = static_cast(mat[3][i]); + *vd++ = translate[i]; //translation + *vf++ = static_cast(translate[i]); //translation for (int j = 0; j < 3; ++j) { *md++ = mat[j][i]; //transposed *mid++ = invMat[j][i]; - *mf++ = static_cast(mat[j][i]); + *mf++ = static_cast(mat[j][i]); //transposed *mif++ = static_cast(invMat[j][i]); } } } +template +inline void Map::set(double dx, const Vec3T& trans, double taper) +{ + NANOVDB_ASSERT(dx > 0.0); + const double mat[3][3] = { {dx, 0.0, 0.0}, // row 0 + {0.0, dx, 0.0}, // row 1 + {0.0, 0.0, dx} }; // row 2 + const double idx = 1.0 / dx; + const double invMat[3][3] = { {idx, 0.0, 0.0}, // row 0 + {0.0, idx, 0.0}, // row 1 + {0.0, 0.0, idx} }; // row 2 + this->set(mat, invMat, trans, taper); +} + // ----------------------------> GridBlindMetaData <-------------------------------------- struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridBlindMetaData -{ - static const int MaxNameSize = 256;// due to NULL termination the maximum length is one less! - int64_t mByteOffset; // byte offset to the blind data, relative to the GridData. - uint64_t mElementCount; // number of elements, e.g. point count - uint32_t mFlags; // flags +{ // 288 bytes + static const int MaxNameSize = 256; // due to NULL termination the maximum length is one less! + int64_t mDataOffset; // byte offset to the blind data, relative to this GridBlindMetaData. + uint64_t mValueCount; // number of blind values, e.g. point count + uint32_t mValueSize;// byte size of each value, e.g. 4 if mDataType=Float and 1 if mDataType=Unknown since that amounts to char GridBlindDataSemantic mSemantic; // semantic meaning of the data. GridBlindDataClass mDataClass; // 4 bytes GridType mDataType; // 4 bytes - char mName[MaxNameSize];// note this include the NULL termination - - /// @brief return memory usage in bytes for the class (note this computes for all blindMetaData structures.) - __hostdev__ static uint64_t memUsage(uint64_t blindDataCount = 0) - { - return blindDataCount * sizeof(GridBlindMetaData); + char mName[MaxNameSize]; // note this includes the NULL termination + // no padding required for 32 byte alignment + + // disallow copy-construction since methods like blindData and getBlindData uses the this pointer! + GridBlindMetaData(const GridBlindMetaData&) = delete; + + // disallow copy-assignment since methods like blindData and getBlindData uses the this pointer! 
+ const GridBlindMetaData& operator=(const GridBlindMetaData&) = delete; + + __hostdev__ void setBlindData(void* blindData) { mDataOffset = util::PtrDiff(blindData, this); } + + // unsafe + __hostdev__ const void* blindData() const {return util::PtrAdd(this, mDataOffset);} + + /// @brief Get a const pointer to the blind data represented by this meta data + /// @tparam BlindDataT Expected value type of the blind data. + /// @return Returns NULL if mGridType!=toGridType(), else a const point of type BlindDataT. + /// @note Use mDataType=Unknown if BlindDataT is a custom data type unknown to NanoVDB. + template + __hostdev__ const BlindDataT* getBlindData() const + { + //if (mDataType != toGridType()) printf("getBlindData mismatch\n"); + return mDataType == toGridType() ? util::PtrAdd(this, mDataOffset) : nullptr; + } + + /// @brief return true if this meta data has a valid combination of semantic, class and value tags + __hostdev__ bool isValid() const + { + auto check = [&]()->bool{ + switch (mDataType){ + case GridType::Unknown: return mValueSize==1u;// i.e. we encode data as mValueCount chars + case GridType::Float: return mValueSize==4u; + case GridType::Double: return mValueSize==8u; + case GridType::Int16: return mValueSize==2u; + case GridType::Int32: return mValueSize==4u; + case GridType::Int64: return mValueSize==8u; + case GridType::Vec3f: return mValueSize==12u; + case GridType::Vec3d: return mValueSize==24u; + case GridType::Half: return mValueSize==2u; + case GridType::RGBA8: return mValueSize==4u; + case GridType::Fp8: return mValueSize==1u; + case GridType::Fp16: return mValueSize==2u; + case GridType::Vec4f: return mValueSize==16u; + case GridType::Vec4d: return mValueSize==32u; + case GridType::Vec3u8: return mValueSize==3u; + case GridType::Vec3u16: return mValueSize==6u; + default: return true;}// all other combinations are valid + }; + return nanovdb::isValid(mDataClass, mSemantic, mDataType) && check(); + } + + /// @brief return size in bytes of the blind data represented by this blind meta data + /// @note This size includes possible padding for 32 byte alignment. 
The actual amount + /// of bind data is mValueCount * mValueSize + __hostdev__ uint64_t blindDataSize() const + { + return math::AlignUp(mValueCount * mValueSize); } - - __hostdev__ void setBlindData(void *ptr) { mByteOffset = PtrDiff(ptr, this); } - - template - __hostdev__ const T* getBlindData() const { return PtrAdd(this, mByteOffset); } - }; // GridBlindMetaData // ----------------------------> NodeTrait <-------------------------------------- /// @brief Struct to derive node type from its level in a given -/// grid, tree or root while perserving constness +/// grid, tree or root while preserving constness template struct NodeTrait; @@ -2100,14 +1599,14 @@ struct NodeTrait; template struct NodeTrait { - static_assert(GridOrTreeOrRootT::RootType::LEVEL == 3, "Tree depth is not supported"); + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); using Type = typename GridOrTreeOrRootT::LeafNodeType; using type = typename GridOrTreeOrRootT::LeafNodeType; }; template struct NodeTrait { - static_assert(GridOrTreeOrRootT::RootType::LEVEL == 3, "Tree depth is not supported"); + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); using Type = const typename GridOrTreeOrRootT::LeafNodeType; using type = const typename GridOrTreeOrRootT::LeafNodeType; }; @@ -2115,46 +1614,172 @@ struct NodeTrait template struct NodeTrait { - static_assert(GridOrTreeOrRootT::RootType::LEVEL == 3, "Tree depth is not supported"); - using Type = typename GridOrTreeOrRootT::RootType::ChildNodeType::ChildNodeType; - using type = typename GridOrTreeOrRootT::RootType::ChildNodeType::ChildNodeType; + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = typename GridOrTreeOrRootT::RootNodeType::ChildNodeType::ChildNodeType; + using type = typename GridOrTreeOrRootT::RootNodeType::ChildNodeType::ChildNodeType; }; template struct NodeTrait { - static_assert(GridOrTreeOrRootT::RootType::LEVEL == 3, "Tree depth is not supported"); - using Type = const typename GridOrTreeOrRootT::RootType::ChildNodeType::ChildNodeType; - using type = const typename GridOrTreeOrRootT::RootType::ChildNodeType::ChildNodeType; + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = const typename GridOrTreeOrRootT::RootNodeType::ChildNodeType::ChildNodeType; + using type = const typename GridOrTreeOrRootT::RootNodeType::ChildNodeType::ChildNodeType; }; template struct NodeTrait { - static_assert(GridOrTreeOrRootT::RootType::LEVEL == 3, "Tree depth is not supported"); - using Type = typename GridOrTreeOrRootT::RootType::ChildNodeType; - using type = typename GridOrTreeOrRootT::RootType::ChildNodeType; + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = typename GridOrTreeOrRootT::RootNodeType::ChildNodeType; + using type = typename GridOrTreeOrRootT::RootNodeType::ChildNodeType; }; template struct NodeTrait { - static_assert(GridOrTreeOrRootT::RootType::LEVEL == 3, "Tree depth is not supported"); - using Type = const typename GridOrTreeOrRootT::RootType::ChildNodeType; - using type = const typename GridOrTreeOrRootT::RootType::ChildNodeType; + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = const typename GridOrTreeOrRootT::RootNodeType::ChildNodeType; + using type = const typename GridOrTreeOrRootT::RootNodeType::ChildNodeType; }; template struct NodeTrait 
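// A sketch of how a blind-data channel is addressed through the meta data above: the payload is
// located via the byte offset stored relative to the GridBlindMetaData itself, getBlindData<T>()
// returns nullptr unless T matches the recorded mDataType, and blindDataSize() reports the payload
// size including its padding to 32-byte alignment; floatChannelSketch is an illustrative name only.
inline const float* floatChannelSketch(const GridBlindMetaData& meta, uint64_t* paddedBytes)
{
    if (paddedBytes) *paddedBytes = meta.blindDataSize(); // mValueCount * mValueSize, aligned up
    return meta.getBlindData<float>();                    // non-null only if mDataType == GridType::Float
}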
{ - static_assert(GridOrTreeOrRootT::RootType::LEVEL == 3, "Tree depth is not supported"); - using Type = typename GridOrTreeOrRootT::RootType; - using type = typename GridOrTreeOrRootT::RootType; + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = typename GridOrTreeOrRootT::RootNodeType; + using type = typename GridOrTreeOrRootT::RootNodeType; +}; + +template +struct NodeTrait +{ + static_assert(GridOrTreeOrRootT::RootNodeType::LEVEL == 3, "Tree depth is not supported"); + using Type = const typename GridOrTreeOrRootT::RootNodeType; + using type = const typename GridOrTreeOrRootT::RootNodeType; }; -template -struct NodeTrait +// ----------------------------> Froward decelerations of random access methods <-------------------------------------- + +template +struct GetValue; +template +struct SetValue; +template +struct SetVoxel; +template +struct GetState; +template +struct GetDim; +template +struct GetLeaf; +template +struct ProbeValue; +template +struct GetNodeInfo; + +// ----------------------------> CheckMode <---------------------------------- + +/// @brief List of different modes for computing for a checksum +enum class CheckMode : uint32_t { Disable = 0, // no computation + Empty = 0, + Half = 1, + Partial = 1, // fast but approximate + Default = 1, // defaults to Partial + Full = 2, // slow but accurate + End = 3, // marks the end of the enum list + StrLen = 9 + End}; + +/// @brief Prints CheckMode enum to a c-string +/// @param dst Destination c-string +/// @param mode CheckMode enum to be converted to string +/// @return destinations string @c dst +__hostdev__ inline char* toStr(char *dst, CheckMode mode) +{ + switch (mode){ + case CheckMode::Half: return util::strcpy(dst, "half"); + case CheckMode::Full: return util::strcpy(dst, "full"); + default: return util::strcpy(dst, "disabled"); + } +} + +// ----------------------------> Checksum <---------------------------------- + +/// @brief Class that encapsulates two CRC32 checksums, one for the Grid, Tree and Root node meta data +/// and one for the remaining grid nodes. +class Checksum { - static_assert(GridOrTreeOrRootT::RootType::LEVEL == 3, "Tree depth is not supported"); - using Type = const typename GridOrTreeOrRootT::RootType; - using type = const typename GridOrTreeOrRootT::RootType; -}; + /// Three types of checksums: + /// 1) Empty: all 64 bits are on (used to signify a disabled or undefined checksum) + /// 2) Half: Upper 32 bits are on and not all of lower 32 bits are on (lower 32 bits checksum head of grid) + /// 3) Full: Not all of the 64 bits are one (lower 32 bits checksum head of grid and upper 32 bits checksum tail of grid) + union { uint32_t mCRC32[2]; uint64_t mCRC64; };// mCRC32[0] is checksum of Grid, Tree and Root, and mCRC32[1] is checksum of nodes + +public: + + static constexpr uint32_t EMPTY32 = ~uint32_t{0}; + static constexpr uint64_t EMPTY64 = ~uint64_t(0); + + /// @brief default constructor initiates checksum to EMPTY + __hostdev__ Checksum() : mCRC64{EMPTY64} {} + + /// @brief Constructor that allows the two 32bit checksums to be initiated explicitly + /// @param head Initial 32bit CRC checksum of grid, tree and root data + /// @param tail Initial 32bit CRC checksum of all the nodes and blind data + __hostdev__ Checksum(uint32_t head, uint32_t tail) : mCRC32{head, tail} {} + + /// @brief + /// @param checksum + /// @param mode + __hostdev__ Checksum(uint64_t checksum, CheckMode mode = CheckMode::Full) : mCRC64{mode == CheckMode::Disable ? 
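// A compile-time sketch of the NodeTrait helper above, which recovers the node type at a given
// tree level (0 = leaf, 1 = lower internal, 2 = upper internal, 3 = root) from any grid, tree or
// root type while preserving constness; TreeT and NodeTypesSketch are illustrative placeholders.
template<typename TreeT>
struct NodeTypesSketch
{
    using LeafT  = typename NodeTrait<TreeT, 0>::Type; // leaf node
    using LowerT = typename NodeTrait<TreeT, 1>::Type; // lower internal node
    using UpperT = typename NodeTrait<TreeT, 2>::Type; // upper internal node
    using RootT  = typename NodeTrait<TreeT, 3>::Type; // root node
};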
EMPTY64 : checksum} + { + if (mode == CheckMode::Partial) mCRC32[1] = EMPTY32; + } + + /// @brief return the 64 bit checksum of this instance + [[deprecated("Use Checksum::data instead.")]] + __hostdev__ uint64_t checksum() const { return mCRC64; } + [[deprecated("Use Checksum::head and Ckecksum::tail instead.")]] + __hostdev__ uint32_t& checksum(int i) {NANOVDB_ASSERT(i==0 || i==1); return mCRC32[i]; } + [[deprecated("Use Checksum::head and Ckecksum::tail instead.")]] + __hostdev__ uint32_t checksum(int i) const {NANOVDB_ASSERT(i==0 || i==1); return mCRC32[i]; } + + __hostdev__ uint64_t full() const { return mCRC64; } + __hostdev__ uint64_t& full() { return mCRC64; } + __hostdev__ uint32_t head() const { return mCRC32[0]; } + __hostdev__ uint32_t& head() { return mCRC32[0]; } + __hostdev__ uint32_t tail() const { return mCRC32[1]; } + __hostdev__ uint32_t& tail() { return mCRC32[1]; } + + /// @brief return true if the 64 bit checksum is partial, i.e. of head only + [[deprecated("Use Checksum::isHalf instead.")]] + __hostdev__ bool isPartial() const { return mCRC32[0] != EMPTY32 && mCRC32[1] == EMPTY32; } + __hostdev__ bool isHalf() const { return mCRC32[0] != EMPTY32 && mCRC32[1] == EMPTY32; } + + /// @brief return true if the 64 bit checksum is fill, i.e. of both had and nodes + __hostdev__ bool isFull() const { return mCRC64 != EMPTY64 && mCRC32[1] != EMPTY32; } + + /// @brief return true if the 64 bit checksum is disables (unset) + __hostdev__ bool isEmpty() const { return mCRC64 == EMPTY64; } + + __hostdev__ void disable() { mCRC64 = EMPTY64; } + + /// @brief return the mode of the 64 bit checksum + __hostdev__ CheckMode mode() const + { + return mCRC64 == EMPTY64 ? CheckMode::Disable : + mCRC32[1] == EMPTY32 ? CheckMode::Partial : CheckMode::Full; + } + + /// @brief return true if the checksums are identical + /// @param rhs other Checksum + __hostdev__ bool operator==(const Checksum &rhs) const {return mCRC64 == rhs.mCRC64;} + + /// @brief return true if the checksums are not identical + /// @param rhs other Checksum + __hostdev__ bool operator!=(const Checksum &rhs) const {return mCRC64 != rhs.mCRC64;} +};// Checksum + +/// @brief Maps 64 bit checksum to CheckMode enum +/// @param checksum 64 bit checksum with two CRC32 codes +/// @return CheckMode enum +__hostdev__ inline CheckMode toCheckMode(const Checksum &checksum){return checksum.mode();} // ----------------------------> Grid <-------------------------------------- @@ -2173,7 +1798,7 @@ struct NodeTrait N0 LeafNodes each with a bit mask, N0 ValueTypes and min/max Example layout: ("---" implies it has a custom offset, "..." implies zero or more) - [GridData][TreeData]---[RootData][ROOT TILES...]---[NodeData<5>]---[ModeData<4>]---[LeafData<3>]---[BLINDMETA...]---[BLIND0]---[BLIND1]---etc. + [GridData][TreeData]---[RootData][ROOT TILES...]---[InternalData<5>]---[InternalData<4>]---[LeafData<3>]---[BLINDMETA...]---[BLIND0]---[BLIND1]---etc. */ /// @brief Struct with all the member data of the Grid (useful during serialization of an openvdb grid) @@ -2183,76 +1808,83 @@ struct NodeTrait /// /// @note No client code should (or can) interface with this struct so it can safely be ignored! struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData -{// sizeof(GridData) = 672B - static const int MaxNameSize = 256;// due to NULL termination the maximum length is one less - uint64_t mMagic; // 8B magic to validate it is valid grid data. - uint64_t mChecksum; // 8B. Checksum of grid buffer. 
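// A sketch of the three states encoded by the Checksum class above: default construction leaves it
// disabled (all 64 bits on), a valid head CRC32 with an empty tail makes it "half", and two valid
// CRC32s make it "full"; checksumModeSketch and the CRC values are illustrative only.
inline bool checksumModeSketch()
{
    const Checksum disabled;                              // EMPTY64, i.e. checksumming disabled
    const Checksum half(0x12345678u, Checksum::EMPTY32);  // head CRC32 only
    const Checksum full(0x12345678u, 0x9abcdef0u);        // head and tail CRC32s
    return disabled.mode() == CheckMode::Disable &&
           half.mode()     == CheckMode::Half    &&
           full.mode()     == CheckMode::Full;
}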
- Version mVersion;// 4B major, minor, and patch version numbers - uint32_t mFlags; // 4B. flags for grid. - uint32_t mGridIndex;// 4B. Index of this grid in the buffer - uint32_t mGridCount; // 4B. Total number of grids in the buffer - uint64_t mGridSize; // 8B. byte count of this entire grid occupied in the buffer. - char mGridName[MaxNameSize]; // 256B - Map mMap; // 264B. affine transformation between index and world space in both single and double precision - BBox mWorldBBox; // 48B. floating-point AABB of active values in WORLD SPACE (2 x 3 doubles) - Vec3R mVoxelSize; // 24B. size of a voxel in world units - GridClass mGridClass; // 4B. - GridType mGridType; // 4B. - int64_t mBlindMetadataOffset; // 8B. offset of GridBlindMetaData structures that follow this grid. - uint32_t mBlindMetadataCount; // 4B. count of GridBlindMetaData structures that follow this grid. - - - // Set and unset various bit flags - __hostdev__ void setFlagsOff() { mFlags = uint32_t(0); } - __hostdev__ void setMinMaxOn(bool on = true) - { - if (on) { - mFlags |= static_cast(GridFlags::HasMinMax); - } else { - mFlags &= ~static_cast(GridFlags::HasMinMax); - } - } - __hostdev__ void setBBoxOn(bool on = true) - { - if (on) { - mFlags |= static_cast(GridFlags::HasBBox); - } else { - mFlags &= ~static_cast(GridFlags::HasBBox); - } - } - __hostdev__ void setLongGridNameOn(bool on = true) - { - if (on) { - mFlags |= static_cast(GridFlags::HasLongGridName); - } else { - mFlags &= ~static_cast(GridFlags::HasLongGridName); - } - } - __hostdev__ void setAverageOn(bool on = true) - { - if (on) { - mFlags |= static_cast(GridFlags::HasAverage); - } else { - mFlags &= ~static_cast(GridFlags::HasAverage); - } - } - __hostdev__ void setStdDeviationOn(bool on = true) - { - if (on) { - mFlags |= static_cast(GridFlags::HasStdDeviation); - } else { - mFlags &= ~static_cast(GridFlags::HasStdDeviation); - } +{ // sizeof(GridData) = 672B + static const int MaxNameSize = 256; // due to NULL termination the maximum length is one less + uint64_t mMagic; // 8B (0) magic to validate it is valid grid data. + Checksum mChecksum; // 8B (8). Checksum of grid buffer. + Version mVersion; // 4B (16) major, minor, and patch version numbers + BitFlags<32> mFlags; // 4B (20). flags for grid. + uint32_t mGridIndex; // 4B (24). Index of this grid in the buffer + uint32_t mGridCount; // 4B (28). Total number of grids in the buffer + uint64_t mGridSize; // 8B (32). byte count of this entire grid occupied in the buffer. + char mGridName[MaxNameSize]; // 256B (40) + Map mMap; // 264B (296). affine transformation between index and world space in both single and double precision + Vec3dBBox mWorldBBox; // 48B (560). floating-point AABB of active values in WORLD SPACE (2 x 3 doubles) + Vec3d mVoxelSize; // 24B (608). size of a voxel in world units + GridClass mGridClass; // 4B (632). + GridType mGridType; // 4B (636). + int64_t mBlindMetadataOffset; // 8B (640). offset to beginning of GridBlindMetaData structures that follow this grid. + uint32_t mBlindMetadataCount; // 4B (648). count of GridBlindMetaData structures that follow this grid. 
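// The byte offsets annotated above continue with mData0/mData1/mData2 just below
// (652/656/664), padding the struct to the documented 672 bytes. A compile-time sanity
// check along these lines can catch accidental layout changes; it assumes the full GridData
// definition is visible and that NANOVDB_DATA_ALIGNMENT keeps its usual value of 32:
static_assert(sizeof(nanovdb::GridData) == 672, "GridData layout differs from the documented 672B");
static_assert(sizeof(nanovdb::GridData) % NANOVDB_DATA_ALIGNMENT == 0, "GridData is not 32B aligned");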
+ uint32_t mData0; // 4B (652) unused + uint64_t mData1; // 8B (656) is use for the total number of values indexed by an IndexGrid + uint64_t mData2; // 8B (664) padding to 32 B alignment + /// @brief Use this method to initiate most member data + GridData& operator=(const GridData&) = default; + //__hostdev__ GridData& operator=(const GridData& other){return *util::memcpy(this, &other);} + __hostdev__ void init(std::initializer_list list = {GridFlags::IsBreadthFirst}, + uint64_t gridSize = 0u, + const Map& map = Map(), + GridType gridType = GridType::Unknown, + GridClass gridClass = GridClass::Unknown) + { +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + mMagic = NANOVDB_MAGIC_GRID; +#else + mMagic = NANOVDB_MAGIC_NUMB; +#endif + mChecksum.disable();// all 64 bits ON means checksum is disabled + mVersion = Version(); + mFlags.initMask(list); + mGridIndex = 0u; + mGridCount = 1u; + mGridSize = gridSize; + mGridName[0] = '\0'; + mMap = map; + mWorldBBox = Vec3dBBox();// invalid bbox + mVoxelSize = map.getVoxelSize(); + mGridClass = gridClass; + mGridType = gridType; + mBlindMetadataOffset = mGridSize; // i.e. no blind data + mBlindMetadataCount = 0u; // i.e. no blind data + mData0 = 0u; // zero padding + mData1 = 0u; // only used for index and point grids + mData2 = NANOVDB_MAGIC_GRID; // since version 32.6.0 (will change in the future) + } + /// @brief return true if the magic number and the version are both valid + __hostdev__ bool isValid() const { + // Before v32.6.0: toMagic(mMagic) = MagicType::NanoVDB and mData2 was undefined + // For v32.6.0: toMagic(mMagic) = MagicType::NanoVDB and toMagic(mData2) = MagicType::NanoGrid + // After v32.7.X: toMagic(mMagic) = MagicType::NanoGrid and mData2 will again be undefined + const MagicType magic = toMagic(mMagic); + if (magic == MagicType::NanoGrid || toMagic(mData2) == MagicType::NanoGrid) return true; + bool test = magic == MagicType::NanoVDB;// could be GridData or io::FileHeader + if (test) test = mVersion.isCompatible(); + if (test) test = mGridCount > 0u && mGridIndex < mGridCount; + if (test) test = mGridClass < GridClass::End && mGridType < GridType::End; + return test; } - __hostdev__ void setBreadthFirstOn(bool on = true) + // Set and unset various bit flags + __hostdev__ void setMinMaxOn(bool on = true) { mFlags.setMask(GridFlags::HasMinMax, on); } + __hostdev__ void setBBoxOn(bool on = true) { mFlags.setMask(GridFlags::HasBBox, on); } + __hostdev__ void setLongGridNameOn(bool on = true) { mFlags.setMask(GridFlags::HasLongGridName, on); } + __hostdev__ void setAverageOn(bool on = true) { mFlags.setMask(GridFlags::HasAverage, on); } + __hostdev__ void setStdDeviationOn(bool on = true) { mFlags.setMask(GridFlags::HasStdDeviation, on); } + __hostdev__ bool setGridName(const char* src) { - if (on) { - mFlags |= static_cast(GridFlags::IsBreadthFirst); - } else { - mFlags &= ~static_cast(GridFlags::IsBreadthFirst); - } + const bool success = (util::strncpy(mGridName, src, MaxNameSize)[MaxNameSize-1] == '\0'); + if (!success) mGridName[MaxNameSize-1] = '\0'; + return success; // returns true if input grid name is NOT longer than MaxNameSize characters } - // Affine transformations based on double precision template __hostdev__ Vec3T applyMap(const Vec3T& xyz) const { return mMap.applyMap(xyz); } // Pos: index -> world @@ -2277,10 +1909,42 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData __hostdev__ Vec3T applyIJTF(const Vec3T& xyz) const { return mMap.applyIJTF(xyz); } // @brief Return a non-const void pointer to the tree - __hostdev__ 
void* treePtr() { return this + 1; } + __hostdev__ void* treePtr() { return this + 1; }// TreeData is always right after GridData // @brief Return a const void pointer to the tree - __hostdev__ const void* treePtr() const { return this + 1; } + __hostdev__ const void* treePtr() const { return this + 1; }// TreeData is always right after GridData + + /// @brief Return a non-const void pointer to the first node at @c LEVEL + /// @tparam LEVEL Level of the node. LEVEL 0 means leaf node and LEVEL 3 means root node + template + __hostdev__ const void* nodePtr() const + { + static_assert(LEVEL >= 0 && LEVEL <= 3, "invalid LEVEL template parameter"); + const void *treeData = this + 1;// TreeData is always right after GridData + const uint64_t nodeOffset = *util::PtrAdd(treeData, 8*LEVEL);// skip LEVEL uint64_t + return nodeOffset ? util::PtrAdd(treeData, nodeOffset) : nullptr; + } + + /// @brief Return a non-const void pointer to the first node at @c LEVEL + /// @tparam LEVEL of the node. LEVEL 0 means leaf node and LEVEL 3 means root node + /// @warning If not nodes exist at @c LEVEL NULL is returned + template + __hostdev__ void* nodePtr() + { + static_assert(LEVEL >= 0 && LEVEL <= 3, "invalid LEVEL template parameter"); + void *treeData = this + 1;// TreeData is always right after GridData + const uint64_t nodeOffset = *util::PtrAdd(treeData, 8*LEVEL);// skip LEVEL uint64_t + return nodeOffset ? util::PtrAdd(treeData, nodeOffset) : nullptr; + } + + /// @brief Return number of nodes at @c LEVEL + /// @tparam Level of the node. LEVEL 0 means leaf node and LEVEL 2 means upper node + template + __hostdev__ uint32_t nodeCount() const + { + static_assert(LEVEL >= 0 && LEVEL < 3, "invalid LEVEL template parameter"); + return *util::PtrAdd(this + 1, 4*(8 + LEVEL));// TreeData is always right after GridData + } /// @brief Returns a const reference to the blindMetaData at the specified linear offset. /// @@ -2288,16 +1952,55 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) GridData __hostdev__ const GridBlindMetaData* blindMetaData(uint32_t n) const { NANOVDB_ASSERT(n < mBlindMetadataCount); - return PtrAdd(this, mBlindMetadataOffset) + n; + return util::PtrAdd(this, mBlindMetadataOffset) + n; + } + + __hostdev__ const char* gridName() const + { + if (mFlags.isMaskOn(GridFlags::HasLongGridName)) {// search for first blind meta data that contains a name + NANOVDB_ASSERT(mBlindMetadataCount > 0); + for (uint32_t i = 0; i < mBlindMetadataCount; ++i) { + const auto* metaData = this->blindMetaData(i);// EXTREMELY important to be a pointer + if (metaData->mDataClass == GridBlindDataClass::GridName) { + NANOVDB_ASSERT(metaData->mDataType == GridType::Unknown); + return metaData->template getBlindData(); + } + } + NANOVDB_ASSERT(false); // should never hit this! + } + return mGridName; + } + + /// @brief Return memory usage in bytes for this class only. + __hostdev__ static uint64_t memUsage() { return sizeof(GridData); } + + /// @brief return AABB of active values in world space + __hostdev__ const Vec3dBBox& worldBBox() const { return mWorldBBox; } + + /// @brief return AABB of active values in index space + __hostdev__ const CoordBBox& indexBBox() const {return *(const CoordBBox*)(this->nodePtr<3>());} + + /// @brief return the root table has size + __hostdev__ uint32_t rootTableSize() const + { + const void *root = this->nodePtr<3>(); + return root ? 
*util::PtrAdd(root, sizeof(CoordBBox)) : 0u; } + /// @brief test if the grid is empty, e.i the root table has size 0 + /// @return true if this grid contains not data whatsoever + __hostdev__ bool isEmpty() const {return this->rootTableSize() == 0u;} + + /// @brief return true if RootData follows TreeData in memory without any extra padding + /// @details TreeData is always following right after GridData, but the same might not be true for RootData + __hostdev__ bool isRootConnected() const { return *(const uint64_t*)((const char*)(this + 1) + 24) == 64u;} }; // GridData // Forward declaration of accelerated random access class -template +template class ReadAccessor; -template +template using DefaultReadAccessor = ReadAccessor; /// @brief Highest level of the data structure. Contains a tree and a world->index @@ -2305,14 +2008,18 @@ using DefaultReadAccessor = ReadAccessor; /// /// @note This the API of this class to interface with client code template -class Grid : private GridData +class Grid : public GridData { public: - using TreeType = TreeT; - using RootType = typename TreeT::RootType; - using DataType = GridData; + using TreeType = TreeT; + using RootType = typename TreeT::RootType; + using RootNodeType = RootType; + using UpperNodeType = typename RootNodeType::ChildNodeType; + using LowerNodeType = typename UpperNodeType::ChildNodeType; + using LeafNodeType = typename RootType::LeafNodeType; + using DataType = GridData; using ValueType = typename TreeT::ValueType; - using BuildType = typename TreeT::BuildType;// in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + using BuildType = typename TreeT::BuildType; // in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool using CoordType = typename TreeT::CoordType; using AccessorType = DefaultReadAccessor; @@ -2330,7 +2037,7 @@ class Grid : private GridData __hostdev__ const DataType* data() const { return reinterpret_cast(this); } /// @brief Return memory usage in bytes for this class only. - __hostdev__ static uint64_t memUsage() { return sizeof(GridData); } + //__hostdev__ static uint64_t memUsage() { return sizeof(GridData); } /// @brief Return the memory footprint of the entire grid, i.e. 
including all nodes and blind data __hostdev__ uint64_t gridSize() const { return DataType::mGridSize; } @@ -2341,6 +2048,20 @@ class Grid : private GridData /// @brief Return total number of grids in the buffer __hostdev__ uint32_t gridCount() const { return DataType::mGridCount; } + /// @brief @brief Return the total number of values indexed by this IndexGrid + /// + /// @note This method is only defined for IndexGrid = NanoGrid + template + __hostdev__ typename util::enable_if::is_index, const uint64_t&>::type + valueCount() const { return DataType::mData1; } + + /// @brief @brief Return the total number of points indexed by this PointGrid + /// + /// @note This method is only defined for PointGrid = NanoGrid + template + __hostdev__ typename util::enable_if::value, const uint64_t&>::type + pointCount() const { return DataType::mData1; } + /// @brief Return a const reference to the tree __hostdev__ const TreeT& tree() const { return *reinterpret_cast(this->treePtr()); } @@ -2351,7 +2072,7 @@ class Grid : private GridData __hostdev__ AccessorType getAccessor() const { return AccessorType(this->tree().root()); } /// @brief Return a const reference to the size of a voxel in world units - __hostdev__ const Vec3R& voxelSize() const { return DataType::mVoxelSize; } + __hostdev__ const Vec3d& voxelSize() const { return DataType::mVoxelSize; } /// @brief Return a const reference to the Map for this grid __hostdev__ const Map& map() const { return DataType::mMap; } @@ -2403,84 +2124,97 @@ class Grid : private GridData __hostdev__ Vec3T indexToWorldGradF(const Vec3T& grad) const { return DataType::applyIJTF(grad); } /// @brief Computes a AABB of active values in world space - __hostdev__ const BBox& worldBBox() const { return DataType::mWorldBBox; } + //__hostdev__ const Vec3dBBox& worldBBox() const { return DataType::mWorldBBox; } /// @brief Computes a AABB of active values in index space /// /// @note This method is returning a floating point bounding box and not a CoordBBox. This makes /// it more useful for clipping rays. - __hostdev__ const BBox& indexBBox() const { return this->tree().bbox(); } + //__hostdev__ const BBox& indexBBox() const { return this->tree().bbox(); } /// @brief Return the total number of active voxels in this tree. 
__hostdev__ uint64_t activeVoxelCount() const { return this->tree().activeVoxelCount(); } /// @brief Methods related to the classification of this grid - __hostdev__ bool isValid() const { return DataType::mMagic == NANOVDB_MAGIC_NUMBER; } + __hostdev__ bool isValid() const { return DataType::isValid(); } __hostdev__ const GridType& gridType() const { return DataType::mGridType; } __hostdev__ const GridClass& gridClass() const { return DataType::mGridClass; } __hostdev__ bool isLevelSet() const { return DataType::mGridClass == GridClass::LevelSet; } __hostdev__ bool isFogVolume() const { return DataType::mGridClass == GridClass::FogVolume; } __hostdev__ bool isStaggered() const { return DataType::mGridClass == GridClass::Staggered; } __hostdev__ bool isPointIndex() const { return DataType::mGridClass == GridClass::PointIndex; } + __hostdev__ bool isGridIndex() const { return DataType::mGridClass == GridClass::IndexGrid; } __hostdev__ bool isPointData() const { return DataType::mGridClass == GridClass::PointData; } __hostdev__ bool isMask() const { return DataType::mGridClass == GridClass::Topology; } __hostdev__ bool isUnknown() const { return DataType::mGridClass == GridClass::Unknown; } - __hostdev__ bool hasMinMax() const { return DataType::mFlags & static_cast(GridFlags::HasMinMax); } - __hostdev__ bool hasBBox() const { return DataType::mFlags & static_cast(GridFlags::HasBBox); } - __hostdev__ bool hasLongGridName() const { return DataType::mFlags & static_cast(GridFlags::HasLongGridName); } - __hostdev__ bool hasAverage() const { return DataType::mFlags & static_cast(GridFlags::HasAverage); } - __hostdev__ bool hasStdDeviation() const { return DataType::mFlags & static_cast(GridFlags::HasStdDeviation); } - __hostdev__ bool isBreadthFirst() const { return DataType::mFlags & static_cast(GridFlags::IsBreadthFirst); } + __hostdev__ bool hasMinMax() const { return DataType::mFlags.isMaskOn(GridFlags::HasMinMax); } + __hostdev__ bool hasBBox() const { return DataType::mFlags.isMaskOn(GridFlags::HasBBox); } + __hostdev__ bool hasLongGridName() const { return DataType::mFlags.isMaskOn(GridFlags::HasLongGridName); } + __hostdev__ bool hasAverage() const { return DataType::mFlags.isMaskOn(GridFlags::HasAverage); } + __hostdev__ bool hasStdDeviation() const { return DataType::mFlags.isMaskOn(GridFlags::HasStdDeviation); } + __hostdev__ bool isBreadthFirst() const { return DataType::mFlags.isMaskOn(GridFlags::IsBreadthFirst); } /// @brief return true if the specified node type is layed out breadth-first in memory and has a fixed size. /// This allows for sequential access to the nodes. - template + template __hostdev__ bool isSequential() const { return NodeT::FIXED_SIZE && this->isBreadthFirst(); } /// @brief return true if the specified node level is layed out breadth-first in memory and has a fixed size. /// This allows for sequential access to the nodes. 
- template - __hostdev__ bool isSequential() const { return NodeTrait::type::FIXED_SIZE && this->isBreadthFirst(); } + template + __hostdev__ bool isSequential() const { return NodeTrait::type::FIXED_SIZE && this->isBreadthFirst(); } + + /// @brief return true if nodes at all levels can safely be accessed with simple linear offsets + __hostdev__ bool isSequential() const { return UpperNodeType::FIXED_SIZE && LowerNodeType::FIXED_SIZE && LeafNodeType::FIXED_SIZE && this->isBreadthFirst(); } /// @brief Return a c-string with the name of this grid - __hostdev__ const char* gridName() const - { - if (this->hasLongGridName()) { - const auto &metaData = this->blindMetaData(DataType::mBlindMetadataCount-1);// always the last - NANOVDB_ASSERT(metaData.mDataClass == GridBlindDataClass::GridName); - return metaData.template getBlindData(); - } - return DataType::mGridName; - } + __hostdev__ const char* gridName() const { return DataType::gridName(); } /// @brief Return a c-string with the name of this grid, truncated to 255 characters __hostdev__ const char* shortGridName() const { return DataType::mGridName; } /// @brief Return checksum of the grid buffer. - __hostdev__ uint64_t checksum() const { return DataType::mChecksum; } + __hostdev__ const Checksum& checksum() const { return DataType::mChecksum; } /// @brief Return true if this grid is empty, i.e. contains no values or nodes. - __hostdev__ bool isEmpty() const { return this->tree().isEmpty(); } + //__hostdev__ bool isEmpty() const { return this->tree().isEmpty(); } /// @brief Return the count of blind-data encoded in this grid - __hostdev__ int blindDataCount() const { return DataType::mBlindMetadataCount; } + __hostdev__ uint32_t blindDataCount() const { return DataType::mBlindMetadataCount; } + + /// @brief Return the index of the first blind data with specified name if found, otherwise -1. + __hostdev__ int findBlindData(const char* name) const; - /// @brief Return the index of the blind data with specified semantic if found, otherwise -1. + /// @brief Return the index of the first blind data with specified semantic if found, otherwise -1. __hostdev__ int findBlindDataForSemantic(GridBlindDataSemantic semantic) const; /// @brief Returns a const pointer to the blindData at the specified linear offset. /// - /// @warning Point might be NULL and the linear offset is assumed to be in the valid range + /// @warning Pointer might be NULL and the linear offset is assumed to be in the valid range + // this method is deprecated !!!! + [[deprecated("Use Grid::getBlindData() instead.")]] __hostdev__ const void* blindData(uint32_t n) const { - if (DataType::mBlindMetadataCount == 0) { - return nullptr; - } + printf("\nnanovdb::Grid::blindData is unsafe and hence deprecated! 
Please use nanovdb::Grid::getBlindData instead.\n\n"); NANOVDB_ASSERT(n < DataType::mBlindMetadataCount); - return this->blindMetaData(n).template getBlindData(); + return this->blindMetaData(n).blindData(); } - __hostdev__ const GridBlindMetaData& blindMetaData(int n) const { return *DataType::blindMetaData(n); } + template + __hostdev__ const BlindDataT* getBlindData(uint32_t n) const + { + if (n >= DataType::mBlindMetadataCount) return nullptr;// index is out of bounds + return this->blindMetaData(n).template getBlindData();// NULL if mismatching BlindDataT + } + + template + __hostdev__ BlindDataT* getBlindData(uint32_t n) + { + if (n >= DataType::mBlindMetadataCount) return nullptr;// index is out of bounds + return const_cast(this->blindMetaData(n).template getBlindData());// NULL if mismatching BlindDataT + } + + __hostdev__ const GridBlindMetaData& blindMetaData(uint32_t n) const { return *DataType::blindMetaData(n); } private: static_assert(sizeof(GridData) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(GridData) is misaligned"); @@ -2489,40 +2223,70 @@ class Grid : private GridData template __hostdev__ int Grid::findBlindDataForSemantic(GridBlindDataSemantic semantic) const { - for (uint32_t i = 0, n = this->blindDataCount(); i < n; ++i) + for (uint32_t i = 0, n = this->blindDataCount(); i < n; ++i) { if (this->blindMetaData(i).mSemantic == semantic) return int(i); + } + return -1; +} + +template +__hostdev__ int Grid::findBlindData(const char* name) const +{ + auto test = [&](int n) { + const char* str = this->blindMetaData(n).mName; + for (int i = 0; i < GridBlindMetaData::MaxNameSize; ++i) { + if (name[i] != str[i]) + return false; + if (name[i] == '\0' && str[i] == '\0') + return true; + } + return true; // all len characters matched + }; + for (int i = 0, n = this->blindDataCount(); i < n; ++i) + if (test(i)) + return i; return -1; } // ----------------------------> Tree <-------------------------------------- -template struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) TreeData -{// sizeof(TreeData<3>) == 64B - static_assert(ROOT_LEVEL == 3, "Root level is assumed to be three"); - uint64_t mNodeOffset[4];//32B, byte offset from this tree to first leaf, lower, upper and root node - uint32_t mNodeCount[3];// 12B, total number of nodes of type: leaf, lower internal, upper internal - uint32_t mTileCount[3];// 12B, total number of tiles of type: leaf, lower internal, upper internal (node, only active tiles!) - uint64_t mVoxelCount;// 8B, total number of active voxels in the root and all its child nodes. +{ // sizeof(TreeData) == 64B + int64_t mNodeOffset[4];// 32B, byte offset from this tree to first leaf, lower, upper and root node. If mNodeCount[N]=0 => mNodeOffset[N]==mNodeOffset[N+1] + uint32_t mNodeCount[3]; // 12B, total number of nodes of type: leaf, lower internal, upper internal + uint32_t mTileCount[3]; // 12B, total number of active tile values at the lower internal, upper internal and root node levels + uint64_t mVoxelCount; // 8B, total number of active voxels in the root and all its child nodes. 
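// The node offsets and counts stored above are what make linear, breadth-first traversal of
// the tree possible. A host-side sketch of visiting every leaf node through the sequential
// layout; nanovdb::FloatGrid and the exact nodeCount/getFirstLeaf spellings follow the API in
// this header but should be treated as assumptions:
inline void visitLeavesLinearly(const nanovdb::FloatGrid& grid)
{
    const auto& tree = grid.tree();
    if (!grid.isSequential()) return;                 // requires breadth-first, fixed-size nodes
    const auto* leaf = tree.getFirstLeaf();           // first of nodeCount(0) contiguous leaves
    for (uint32_t i = 0, n = tree.nodeCount(0); i < n; ++i, ++leaf) {
        // process leaf number i here, e.g. gather statistics or copy values out
        (void)leaf;
    }
}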
+ // No padding since it's always 32B aligned + //__hostdev__ TreeData& operator=(const TreeData& other){return *util::memcpy(this, &other);} + TreeData& operator=(const TreeData&) = default; + __hostdev__ void setRoot(const void* root) { + NANOVDB_ASSERT(root); + mNodeOffset[3] = util::PtrDiff(root, this); + } - template - __hostdev__ void setRoot(const RootT* root) { mNodeOffset[3] = PtrDiff(root, this); } - template - __hostdev__ RootT* getRoot() { return PtrAdd(this, mNodeOffset[3]); } - template - __hostdev__ const RootT* getRoot() const { return PtrAdd(this, mNodeOffset[3]); } + /// @brief Get a non-const void pointer to the root node (never NULL) + __hostdev__ void* getRoot() { return util::PtrAdd(this, mNodeOffset[3]); } - template - __hostdev__ void setFirstNode(const NodeT* node) - { - mNodeOffset[NodeT::LEVEL] = node ? PtrDiff(node, this) : 0; - } -}; + /// @brief Get a const void pointer to the root node (never NULL) + __hostdev__ const void* getRoot() const { return util::PtrAdd(this, mNodeOffset[3]); } + + template + __hostdev__ void setFirstNode(const NodeT* node) {mNodeOffset[NodeT::LEVEL] = (node ? util::PtrDiff(node, this) : 0);} + + /// @brief Return true if the root is empty, i.e. has not child nodes or constant tiles + __hostdev__ bool isEmpty() const {return mNodeOffset[3] ? *util::PtrAdd(this, mNodeOffset[3] + sizeof(CoordBBox)) == 0 : true;} + + /// @brief Return the index bounding box of all the active values in this tree, i.e. in all nodes of the tree + __hostdev__ CoordBBox bbox() const {return mNodeOffset[3] ? *util::PtrAdd(this, mNodeOffset[3]) : CoordBBox();} + + /// @brief return true if RootData is layout out immediately after TreeData in memory + __hostdev__ bool isRootNext() const {return mNodeOffset[3] ? mNodeOffset[3] == sizeof(TreeData) : false; } +};// TreeData // ----------------------------> GridTree <-------------------------------------- -/// @brief defines a tree type from a grid type while perserving constness +/// @brief defines a tree type from a grid type while preserving constness template struct GridTree { @@ -2540,7 +2304,7 @@ struct GridTree /// @brief VDB Tree, which is a thin wrapper around a RootNode. template -class Tree : private TreeData +class Tree : public TreeData { static_assert(RootT::LEVEL == 3, "Tree depth is not supported"); static_assert(RootT::ChildNodeType::LOG2DIM == 5, "Tree configuration is not supported"); @@ -2548,11 +2312,14 @@ class Tree : private TreeData static_assert(RootT::LeafNodeType::LOG2DIM == 3, "Tree configuration is not supported"); public: - using DataType = TreeData; + using DataType = TreeData; using RootType = RootT; - using LeafNodeType = typename RootT::LeafNodeType; + using RootNodeType = RootT; + using UpperNodeType = typename RootNodeType::ChildNodeType; + using LowerNodeType = typename UpperNodeType::ChildNodeType; + using LeafNodeType = typename RootType::LeafNodeType; using ValueType = typename RootT::ValueType; - using BuildType = typename RootT::BuildType;// in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + using BuildType = typename RootT::BuildType; // in rare cases BuildType != ValueType, e.g. 
then BuildType = ValueMask and ValueType = bool using CoordType = typename RootT::CoordType; using AccessorType = DefaultReadAccessor; @@ -2574,20 +2341,21 @@ class Tree : private TreeData /// @brief return memory usage in bytes for the class __hostdev__ static uint64_t memUsage() { return sizeof(DataType); } - __hostdev__ RootT& root() { return *DataType::template getRoot(); } + __hostdev__ RootT& root() {return *reinterpret_cast(DataType::getRoot());} - __hostdev__ const RootT& root() const { return *DataType::template getRoot(); } + __hostdev__ const RootT& root() const {return *reinterpret_cast(DataType::getRoot());} __hostdev__ AccessorType getAccessor() const { return AccessorType(this->root()); } /// @brief Return the value of the given voxel (regardless of state or location in the tree.) __hostdev__ ValueType getValue(const CoordType& ijk) const { return this->root().getValue(ijk); } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->root().getValue(CoordType(i, j, k)); } /// @brief Return the active state of the given voxel (regardless of state or location in the tree.) __hostdev__ bool isActive(const CoordType& ijk) const { return this->root().isActive(ijk); } /// @brief Return true if this tree is empty, i.e. contains no values or nodes - __hostdev__ bool isEmpty() const { return this->root().isEmpty(); } + //__hostdev__ bool isEmpty() const { return this->root().isEmpty(); } /// @brief Combines the previous two methods in a single call __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->root().probeValue(ijk, v); } @@ -2599,18 +2367,20 @@ class Tree : private TreeData __hostdev__ void extrema(ValueType& min, ValueType& max) const; /// @brief Return a const reference to the index bounding box of all the active values in this tree, i.e. in all nodes of the tree - __hostdev__ const BBox& bbox() const { return this->root().bbox(); } + //__hostdev__ const BBox& bbox() const { return this->root().bbox(); } /// @brief Return the total number of active voxels in this tree. __hostdev__ uint64_t activeVoxelCount() const { return DataType::mVoxelCount; } /// @brief Return the total number of active tiles at the specified level of the tree. /// - /// @details n = 0 corresponds to leaf level tiles. - __hostdev__ const uint32_t& activeTileCount(uint32_t n) const + /// @details level = 1,2,3 corresponds to active tile count in lower internal nodes, upper + /// internal nodes, and the root level. Note active values at the leaf level are + /// referred to as active voxels (see activeVoxelCount defined above). + __hostdev__ const uint32_t& activeTileCount(uint32_t level) const { - NANOVDB_ASSERT(n < 3); - return DataType::mTileCount[n]; + NANOVDB_ASSERT(level > 0 && level <= 3); // 1, 2, or 3 + return DataType::mTileCount[level - 1]; } template @@ -2626,44 +2396,67 @@ class Tree : private TreeData return DataType::mNodeCount[level]; } + __hostdev__ uint32_t totalNodeCount() const + { + return DataType::mNodeCount[0] + DataType::mNodeCount[1] + DataType::mNodeCount[2]; + } + /// @brief return a pointer to the first node of the specified type /// /// @warning Note it may return NULL if no nodes exist - template + template __hostdev__ NodeT* getFirstNode() { - const uint64_t offset = DataType::mNodeOffset[NodeT::LEVEL]; - return offset>0 ? PtrAdd(this, offset) : nullptr; + const int64_t nodeOffset = DataType::mNodeOffset[NodeT::LEVEL]; + return nodeOffset ? 
util::PtrAdd(this, nodeOffset) : nullptr; } /// @brief return a const pointer to the first node of the specified type /// /// @warning Note it may return NULL if no nodes exist - template + template __hostdev__ const NodeT* getFirstNode() const { - const uint64_t offset = DataType::mNodeOffset[NodeT::LEVEL]; - return offset>0 ? PtrAdd(this, offset) : nullptr; + const int64_t nodeOffset = DataType::mNodeOffset[NodeT::LEVEL]; + return nodeOffset ? util::PtrAdd(this, nodeOffset) : nullptr; } /// @brief return a pointer to the first node at the specified level /// /// @warning Note it may return NULL if no nodes exist - template - __hostdev__ typename NodeTrait::type* - getFirstNode() + template + __hostdev__ typename NodeTrait::type* getFirstNode() { - return this->template getFirstNode::type>(); + return this->template getFirstNode::type>(); } /// @brief return a const pointer to the first node of the specified level /// /// @warning Note it may return NULL if no nodes exist - template - __hostdev__ const typename NodeTrait::type* - getFirstNode() const + template + __hostdev__ const typename NodeTrait::type* getFirstNode() const + { + return this->template getFirstNode::type>(); + } + + /// @brief Template specializations of getFirstNode + __hostdev__ LeafNodeType* getFirstLeaf() { return this->getFirstNode(); } + __hostdev__ const LeafNodeType* getFirstLeaf() const { return this->getFirstNode(); } + __hostdev__ typename NodeTrait::type* getFirstLower() { return this->getFirstNode<1>(); } + __hostdev__ const typename NodeTrait::type* getFirstLower() const { return this->getFirstNode<1>(); } + __hostdev__ typename NodeTrait::type* getFirstUpper() { return this->getFirstNode<2>(); } + __hostdev__ const typename NodeTrait::type* getFirstUpper() const { return this->getFirstNode<2>(); } + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { + return this->root().template get(ijk, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) { - return this->template getFirstNode::type>(); + return this->root().template set(ijk, args...); } private: @@ -2678,7 +2471,7 @@ __hostdev__ void Tree::extrema(ValueType& min, ValueType& max) const max = this->root().maximum(); } -// --------------------------> RootNode <------------------------------------ +// --------------------------> RootData <------------------------------------ /// @brief Struct with all the member data of the RootNode (useful during serialization of an openvdb RootNode) /// @@ -2687,15 +2480,15 @@ template struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData { using ValueT = typename ChildT::ValueType; - using BuildT = typename ChildT::BuildType;// in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + using BuildT = typename ChildT::BuildType; // in rare cases BuildType != ValueType, e.g. 
then BuildType = ValueMask and ValueType = bool using CoordT = typename ChildT::CoordType; using StatsT = typename ChildT::FloatType; static constexpr bool FIXED_SIZE = false; /// @brief Return a key based on the coordinates of a voxel -#ifdef USE_SINGLE_ROOT_KEY +#ifdef NANOVDB_USE_SINGLE_ROOT_KEY using KeyT = uint64_t; - template + template __hostdev__ static KeyT CoordToKey(const CoordType& ijk) { static_assert(sizeof(CoordT) == sizeof(CoordType), "Mismatching sizeof"); @@ -2706,44 +2499,55 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData } __hostdev__ static CoordT KeyToCoord(const KeyT& key) { - static constexpr uint64_t MASK = (1u << 21) - 1; - return CoordT(((key >> 42) & MASK) << ChildT::TOTAL, - ((key >> 21) & MASK) << ChildT::TOTAL, - (key & MASK) << ChildT::TOTAL); + static constexpr uint64_t MASK = (1u << 21) - 1; // used to mask out 21 lower bits + return CoordT(((key >> 42) & MASK) << ChildT::TOTAL, // x are the upper 21 bits + ((key >> 21) & MASK) << ChildT::TOTAL, // y are the middle 21 bits + (key & MASK) << ChildT::TOTAL); // z are the lower 21 bits } #else using KeyT = CoordT; __hostdev__ static KeyT CoordToKey(const CoordT& ijk) { return ijk & ~ChildT::MASK; } __hostdev__ static CoordT KeyToCoord(const KeyT& key) { return key; } #endif - BBox mBBox; // 24B. AABB if active values in index space. - uint32_t mTableSize; // 4B. number of tiles and child pointers in the root node + math::BBox mBBox; // 24B. AABB of active values in index space. + uint32_t mTableSize; // 4B. number of tiles and child pointers in the root node ValueT mBackground; // background value, i.e. value of any unset voxel - ValueT mMinimum; // typically 4B, minmum of all the active values + ValueT mMinimum; // typically 4B, minimum of all the active values ValueT mMaximum; // typically 4B, maximum of all the active values StatsT mAverage; // typically 4B, average of all the active values in this node and its child nodes StatsT mStdDevi; // typically 4B, standard deviation of all the active values in this node and its child nodes + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(RootData) - (24 + 4 + 3 * sizeof(ValueT) + 2 * sizeof(StatsT)); + } + struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) Tile { - template - __hostdev__ void setChild(const CoordType& k, const ChildT *ptr, const RootData *data) + template + __hostdev__ void setChild(const CoordType& k, const void* ptr, const RootData* data) { key = CoordToKey(k); - child = PtrDiff(ptr, data); + state = false; + child = util::PtrDiff(ptr, data); } - template - __hostdev__ void setValue(const CoordType& k, bool s, const ValueType &v) + template + __hostdev__ void setValue(const CoordType& k, bool s, const ValueType& v) { key = CoordToKey(k); state = s; value = v; child = 0; } - __hostdev__ bool isChild() const { return child; } + __hostdev__ bool isChild() const { return child != 0; } + __hostdev__ bool isValue() const { return child == 0; } + __hostdev__ bool isActive() const { return child == 0 && state; } __hostdev__ CoordT origin() const { return KeyToCoord(key); } - KeyT key; // USE_SINGLE_ROOT_KEY ? 8B : 12B + KeyT key; // NANOVDB_USE_SINGLE_ROOT_KEY ? 8B : 12B int64_t child; // 8B. signed byte offset from this node to the child node. 0 means it is a constant tile, so use value. uint32_t state; // 4B. 
state of tile value ValueT value; // value of tile (i.e. no child node) @@ -2763,57 +2567,322 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) RootData return reinterpret_cast(this + 1) + n; } + __hostdev__ Tile* probeTile(const CoordT& ijk) + { +#if 1 // switch between linear and binary seach + const auto key = CoordToKey(ijk); + for (Tile *p = reinterpret_cast(this + 1), *q = p + mTableSize; p < q; ++p) + if (p->key == key) + return p; + return nullptr; +#else // do not enable binary search if tiles are not guaranteed to be sorted!!!!!! + int32_t low = 0, high = mTableSize; // low is inclusive and high is exclusive + while (low != high) { + int mid = low + ((high - low) >> 1); + const Tile* tile = &tiles[mid]; + if (tile->key == key) { + return tile; + } else if (tile->key < key) { + low = mid + 1; + } else { + high = mid; + } + } + return nullptr; +#endif + } + + __hostdev__ inline const Tile* probeTile(const CoordT& ijk) const + { + return const_cast(this)->probeTile(ijk); + } + /// @brief Returns a const reference to the child node in the specified tile. /// /// @warning A child node is assumed to exist in the specified tile __hostdev__ ChildT* getChild(const Tile* tile) { NANOVDB_ASSERT(tile->child); - return PtrAdd(this, tile->child); + return util::PtrAdd(this, tile->child); } __hostdev__ const ChildT* getChild(const Tile* tile) const { NANOVDB_ASSERT(tile->child); - return PtrAdd(this, tile->child); + return util::PtrAdd(this, tile->child); } - __hostdev__ const ValueT& getMin() const { return mMinimum; } - __hostdev__ const ValueT& getMax() const { return mMaximum; } - __hostdev__ const StatsT& average() const { return mAverage; } + __hostdev__ const ValueT& getMin() const { return mMinimum; } + __hostdev__ const ValueT& getMax() const { return mMaximum; } + __hostdev__ const StatsT& average() const { return mAverage; } __hostdev__ const StatsT& stdDeviation() const { return mStdDevi; } - __hostdev__ void setMin(const ValueT& v) { mMinimum = v; } - __hostdev__ void setMax(const ValueT& v) { mMaximum = v; } - __hostdev__ void setAvg(const StatsT& v) { mAverage = v; } - __hostdev__ void setDev(const StatsT& v) { mStdDevi = v; } + __hostdev__ void setMin(const ValueT& v) { mMinimum = v; } + __hostdev__ void setMax(const ValueT& v) { mMaximum = v; } + __hostdev__ void setAvg(const StatsT& v) { mAverage = v; } + __hostdev__ void setDev(const StatsT& v) { mStdDevi = v; } + + /// @brief This class cannot be constructed or deleted + RootData() = delete; + RootData(const RootData&) = delete; + RootData& operator=(const RootData&) = delete; + ~RootData() = delete; +}; // RootData + +// --------------------------> RootNode <------------------------------------ + +/// @brief Top-most node of the VDB tree structure. +template +class RootNode : public RootData +{ +public: + using DataType = RootData; + using ChildNodeType = ChildT; + using RootType = RootNode; // this allows RootNode to behave like a Tree + using RootNodeType = RootType; + using UpperNodeType = ChildT; + using LowerNodeType = typename UpperNodeType::ChildNodeType; + using LeafNodeType = typename ChildT::LeafNodeType; + using ValueType = typename DataType::ValueT; + using FloatType = typename DataType::StatsT; + using BuildType = typename DataType::BuildT; // in rare cases BuildType != ValueType, e.g. 
then BuildType = ValueMask and ValueType = bool + + using CoordType = typename ChildT::CoordType; + using BBoxType = math::BBox; + using AccessorType = DefaultReadAccessor; + using Tile = typename DataType::Tile; + static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; + + static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + + template + class BaseIter + { + protected: + using DataT = typename util::match_const::type; + using TileT = typename util::match_const::type; + DataT* mData; + uint32_t mPos, mSize; + __hostdev__ BaseIter(DataT* data = nullptr, uint32_t n = 0) + : mData(data) + , mPos(0) + , mSize(n) + { + } + + public: + __hostdev__ operator bool() const { return mPos < mSize; } + __hostdev__ uint32_t pos() const { return mPos; } + __hostdev__ void next() { ++mPos; } + __hostdev__ TileT* tile() const { return mData->tile(mPos); } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(*this); + return this->tile()->origin(); + } + __hostdev__ CoordType getCoord() const + { + NANOVDB_ASSERT(*this); + return this->tile()->origin(); + } + }; // Member class BaseIter + + template + class ChildIter : public BaseIter + { + static_assert(util::is_same::type, RootNode>::value, "Invalid RootT"); + using BaseT = BaseIter; + using NodeT = typename util::match_const::type; + + public: + __hostdev__ ChildIter() + : BaseT() + { + } + __hostdev__ ChildIter(RootT* parent) + : BaseT(parent->data(), parent->tileCount()) + { + NANOVDB_ASSERT(BaseT::mData); + while (*this && !this->tile()->isChild()) + this->next(); + } + __hostdev__ NodeT& operator*() const + { + NANOVDB_ASSERT(*this); + return *BaseT::mData->getChild(this->tile()); + } + __hostdev__ NodeT* operator->() const + { + NANOVDB_ASSERT(*this); + return BaseT::mData->getChild(this->tile()); + } + __hostdev__ ChildIter& operator++() + { + NANOVDB_ASSERT(BaseT::mData); + this->next(); + while (*this && this->tile()->isValue()) + this->next(); + return *this; + } + __hostdev__ ChildIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ChildIter + + using ChildIterator = ChildIter; + using ConstChildIterator = ChildIter; + + __hostdev__ ChildIterator beginChild() { return ChildIterator(this); } + __hostdev__ ConstChildIterator cbeginChild() const { return ConstChildIterator(this); } + + template + class ValueIter : public BaseIter + { + using BaseT = BaseIter; + + public: + __hostdev__ ValueIter() + : BaseT() + { + } + __hostdev__ ValueIter(RootT* parent) + : BaseT(parent->data(), parent->tileCount()) + { + NANOVDB_ASSERT(BaseT::mData); + while (*this && this->tile()->isChild()) + this->next(); + } + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return this->tile()->value; + } + __hostdev__ bool isActive() const + { + NANOVDB_ASSERT(*this); + return this->tile()->state; + } + __hostdev__ ValueIter& operator++() + { + NANOVDB_ASSERT(BaseT::mData); + this->next(); + while (*this && this->tile()->isChild()) + this->next(); + return *this; + } + __hostdev__ ValueIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIter + + using ValueIterator = ValueIter; + using ConstValueIterator = ValueIter; + + __hostdev__ ValueIterator beginValue() { return ValueIterator(this); } + __hostdev__ ConstValueIterator cbeginValueAll() const { return ConstValueIterator(this); } + + template + class ValueOnIter : public BaseIter + { + using BaseT = BaseIter; + + public: + __hostdev__ ValueOnIter() + : BaseT() + { + } + 
__hostdev__ ValueOnIter(RootT* parent) + : BaseT(parent->data(), parent->tileCount()) + { + NANOVDB_ASSERT(BaseT::mData); + while (*this && !this->tile()->isActive()) + ++BaseT::mPos; + } + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return this->tile()->value; + } + __hostdev__ ValueOnIter& operator++() + { + NANOVDB_ASSERT(BaseT::mData); + this->next(); + while (*this && !this->tile()->isActive()) + this->next(); + return *this; + } + __hostdev__ ValueOnIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueOnIter + + using ValueOnIterator = ValueOnIter; + using ConstValueOnIterator = ValueOnIter; - /// @brief This class cannot be constructed or deleted - RootData() = delete; - RootData(const RootData&) = delete; - RootData& operator=(const RootData&) = delete; - ~RootData() = delete; -}; // RootData + __hostdev__ ValueOnIterator beginValueOn() { return ValueOnIterator(this); } + __hostdev__ ConstValueOnIterator cbeginValueOn() const { return ConstValueOnIterator(this); } -/// @brief Top-most node of the VDB tree structure. -template -class RootNode : private RootData -{ -public: - using DataType = RootData; - using LeafNodeType = typename ChildT::LeafNodeType; - using ChildNodeType = ChildT; - using RootType = RootNode;// this allows RootNode to behave like a Tree + template + class DenseIter : public BaseIter + { + using BaseT = BaseIter; + using NodeT = typename util::match_const::type; - using ValueType = typename DataType::ValueT; - using FloatType = typename DataType::StatsT; - using BuildType = typename DataType::BuildT;// in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + public: + __hostdev__ DenseIter() + : BaseT() + { + } + __hostdev__ DenseIter(RootT* parent) + : BaseT(parent->data(), parent->tileCount()) + { + NANOVDB_ASSERT(BaseT::mData); + } + __hostdev__ NodeT* probeChild(ValueType& value) const + { + NANOVDB_ASSERT(*this); + NodeT* child = nullptr; + auto* t = this->tile(); + if (t->isChild()) { + child = BaseT::mData->getChild(t); + } else { + value = t->value; + } + return child; + } + __hostdev__ bool isValueOn() const + { + NANOVDB_ASSERT(*this); + return this->tile()->state; + } + __hostdev__ DenseIter& operator++() + { + NANOVDB_ASSERT(BaseT::mData); + this->next(); + return *this; + } + __hostdev__ DenseIter operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class DenseIter - using CoordType = typename ChildT::CoordType; - using AccessorType = DefaultReadAccessor; - using Tile = typename DataType::Tile; - static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; + using DenseIterator = DenseIter; + using ConstDenseIterator = DenseIter; - static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf + __hostdev__ DenseIterator beginDense() { return DenseIterator(this); } + __hostdev__ ConstDenseIterator cbeginDense() const { return ConstDenseIterator(this); } + __hostdev__ ConstDenseIterator cbeginChildAll() const { return ConstDenseIterator(this); } /// @brief This class cannot be constructed or deleted RootNode() = delete; @@ -2828,7 +2897,7 @@ class RootNode : private RootData __hostdev__ const DataType* data() const { return reinterpret_cast(this); } /// @brief Return a const reference to the index bounding box of all the active values in this tree, i.e. 
in all nodes of the tree - __hostdev__ const BBox& bbox() const { return DataType::mBBox; } + __hostdev__ const BBoxType& bbox() const { return DataType::mBBox; } /// @brief Return the total number of active voxels in the root and all its child nodes. @@ -2838,18 +2907,19 @@ class RootNode : private RootData /// @brief Return the number of tiles encoded in this root node __hostdev__ const uint32_t& tileCount() const { return DataType::mTableSize; } + __hostdev__ const uint32_t& getTableSize() const { return DataType::mTableSize; } /// @brief Return a const reference to the minimum active value encoded in this root node and any of its child nodes - __hostdev__ const ValueType& minimum() const { return this->getMin(); } + __hostdev__ const ValueType& minimum() const { return DataType::mMinimum; } /// @brief Return a const reference to the maximum active value encoded in this root node and any of its child nodes - __hostdev__ const ValueType& maximum() const { return this->getMax(); } + __hostdev__ const ValueType& maximum() const { return DataType::mMaximum; } /// @brief Return a const reference to the average of all the active values encoded in this root node and any of its child nodes __hostdev__ const FloatType& average() const { return DataType::mAverage; } /// @brief Return the variance of all the active values encoded in this root node and any of its child nodes - __hostdev__ FloatType variance() const { return DataType::mStdDevi * DataType::mStdDevi; } + __hostdev__ FloatType variance() const { return math::Pow2(DataType::mStdDevi); } /// @brief Return a const reference to the standard deviation of all the active values encoded in this root node and any of its child nodes __hostdev__ const FloatType& stdDeviation() const { return DataType::mStdDevi; } @@ -2860,31 +2930,42 @@ class RootNode : private RootData /// @brief Return the actual memory footprint of this root node __hostdev__ uint64_t memUsage() const { return sizeof(RootNode) + DataType::mTableSize * sizeof(Tile); } + /// @brief Return true if this RootNode is empty, i.e. contains no values or nodes + __hostdev__ bool isEmpty() const { return DataType::mTableSize == uint32_t(0); } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + /// @brief Return the value of the given voxel + __hostdev__ ValueType getValue(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + /// @brief return the state and updates the value of the specified voxel + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + /// @brief Return the value of the given voxel __hostdev__ ValueType getValue(const CoordType& ijk) const { - if (const Tile* tile = this->findTile(ijk)) { + if (const Tile* tile = DataType::probeTile(ijk)) { return tile->isChild() ? this->getChild(tile)->getValue(ijk) : tile->value; } return DataType::mBackground; } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->getValue(CoordType(i, j, k)); } __hostdev__ bool isActive(const CoordType& ijk) const { - if (const Tile* tile = this->findTile(ijk)) { + if (const Tile* tile = DataType::probeTile(ijk)) { return tile->isChild() ? 
this->getChild(tile)->isActive(ijk) : tile->state; } return false; } - /// @brief Return true if this RootNode is empty, i.e. contains no values or nodes - __hostdev__ bool isEmpty() const { return DataType::mTableSize == uint32_t(0); } - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { - if (const Tile* tile = this->findTile(ijk)) { + if (const Tile* tile = DataType::probeTile(ijk)) { if (tile->isChild()) { - const auto *child = this->getChild(tile); + const auto* child = this->getChild(tile); return child->probeValue(ijk, v); } v = tile->value; @@ -2896,14 +2977,52 @@ class RootNode : private RootData __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const { - const Tile* tile = this->findTile(ijk); + const Tile* tile = DataType::probeTile(ijk); if (tile && tile->isChild()) { - const auto *child = this->getChild(tile); + const auto* child = this->getChild(tile); return child->probeLeaf(ijk); } return nullptr; } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + __hostdev__ const ChildNodeType* probeChild(const CoordType& ijk) const + { + const Tile* tile = DataType::probeTile(ijk); + return tile && tile->isChild() ? this->getChild(tile) : nullptr; + } + + __hostdev__ ChildNodeType* probeChild(const CoordType& ijk) + { + const Tile* tile = DataType::probeTile(ijk); + return tile && tile->isChild() ? this->getChild(tile) : nullptr; + } + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { + if (const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) + return this->getChild(tile)->template get(ijk, args...); + return OpT::get(*tile, args...); + } + return OpT::get(*this, args...); + } + + template + // __hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval()...)) + set(const CoordType& ijk, ArgsT&&... args) + { + if (Tile* tile = DataType::probeTile(ijk)) { + if (tile->isChild()) + return this->getChild(tile)->template set(ijk, args...); + return OpT::set(*tile, args...); + } + return OpT::set(*this, args...); + } + private: static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(RootData) is misaligned"); static_assert(sizeof(typename DataType::Tile) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(RootData::Tile) is misaligned"); @@ -2913,62 +3032,30 @@ class RootNode : private RootData template friend class Tree; - - /// @brief Private method to find a Tile of this root node by means of binary-search. This is obviously - /// much slower then direct lookup into a linear array (as in the other nodes) which is exactly - /// why it is important to use the ReadAccessor which amortizes this overhead by node caching and - /// inverse tree traversal! - __hostdev__ const Tile* findTile(const CoordType& ijk) const - { - const Tile* tiles = reinterpret_cast(this + 1); - const auto key = DataType::CoordToKey(ijk); -#if 1 // switch between linear and binary seach - for (uint32_t i = 0; i < DataType::mTableSize; ++i) { - if (tiles[i].key == key) return &tiles[i]; - } -#else// do not enable binary search if tiles are not guaranteed to be sorted!!!!!! 
- // binary-search of pre-sorted elements - int32_t low = 0, high = DataType::mTableSize; // low is inclusive and high is exclusive - while (low != high) { - int mid = low + ((high - low) >> 1); - const Tile* tile = &tiles[mid]; - if (tile->key == key) { - return tile; - } else if (tile->key < key) { - low = mid + 1; - } else { - high = mid; - } - } -#endif - return nullptr; - } - +#ifndef NANOVDB_NEW_ACCESSOR_METHODS /// @brief Private method to return node information and update a ReadAccessor template __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& ijk, const AccT& acc) const { using NodeInfoT = typename AccT::NodeInfo; - if (const Tile* tile = this->findTile(ijk)) { + if (const Tile* tile = this->probeTile(ijk)) { if (tile->isChild()) { - const auto *child = this->getChild(tile); + const auto* child = this->getChild(tile); acc.insert(ijk, child); return child->getNodeInfoAndCache(ijk, acc); } - return NodeInfoT{LEVEL, ChildT::dim(), tile->value, tile->value, tile->value, - 0, tile->origin(), tile->origin() + CoordType(ChildT::DIM)}; + return NodeInfoT{LEVEL, ChildT::dim(), tile->value, tile->value, tile->value, 0, tile->origin(), tile->origin() + CoordType(ChildT::DIM)}; } - return NodeInfoT{LEVEL, ChildT::dim(), this->minimum(), this->maximum(), - this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; + return NodeInfoT{LEVEL, ChildT::dim(), this->minimum(), this->maximum(), this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; } /// @brief Private method to return a voxel value and update a ReadAccessor template __hostdev__ ValueType getValueAndCache(const CoordType& ijk, const AccT& acc) const { - if (const Tile* tile = this->findTile(ijk)) { + if (const Tile* tile = this->probeTile(ijk)) { if (tile->isChild()) { - const auto *child = this->getChild(tile); + const auto* child = this->getChild(tile); acc.insert(ijk, child); return child->getValueAndCache(ijk, acc); } @@ -2980,9 +3067,9 @@ class RootNode : private RootData template __hostdev__ bool isActiveAndCache(const CoordType& ijk, const AccT& acc) const { - const Tile* tile = this->findTile(ijk); + const Tile* tile = this->probeTile(ijk); if (tile && tile->isChild()) { - const auto *child = this->getChild(tile); + const auto* child = this->getChild(tile); acc.insert(ijk, child); return child->isActiveAndCache(ijk, acc); } @@ -2992,9 +3079,9 @@ class RootNode : private RootData template __hostdev__ bool probeValueAndCache(const CoordType& ijk, ValueType& v, const AccT& acc) const { - if (const Tile* tile = this->findTile(ijk)) { + if (const Tile* tile = this->probeTile(ijk)) { if (tile->isChild()) { - const auto *child = this->getChild(tile); + const auto* child = this->getChild(tile); acc.insert(ijk, child); return child->probeValueAndCache(ijk, v, acc); } @@ -3008,21 +3095,22 @@ class RootNode : private RootData template __hostdev__ const LeafNodeType* probeLeafAndCache(const CoordType& ijk, const AccT& acc) const { - const Tile* tile = this->findTile(ijk); + const Tile* tile = this->probeTile(ijk); if (tile && tile->isChild()) { - const auto *child = this->getChild(tile); + const auto* child = this->getChild(tile); acc.insert(ijk, child); return child->probeLeafAndCache(ijk, acc); } return nullptr; } +#endif // NANOVDB_NEW_ACCESSOR_METHODS template __hostdev__ uint32_t getDimAndCache(const CoordType& ijk, const RayT& ray, const AccT& acc) const { - if (const Tile* tile = this->findTile(ijk)) { + if (const Tile* tile = this->probeTile(ijk)) { if (tile->isChild()) { 
- const auto *child = this->getChild(tile); + const auto* child = this->getChild(tile); acc.insert(ijk, child); return child->getDimAndCache(ijk, ray, acc); } @@ -3030,6 +3118,39 @@ class RootNode : private RootData } return ChildNodeType::dim(); // background } + + template + //__hostdev__ decltype(OpT::get(util::declval(), util::declval()...)) + __hostdev__ auto + getAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) const + { + if (const Tile* tile = this->probeTile(ijk)) { + if (tile->isChild()) { + const ChildT* child = this->getChild(tile); + acc.insert(ijk, child); + return child->template getAndCache(ijk, acc, args...); + } + return OpT::get(*tile, args...); + } + return OpT::get(*this, args...); + } + + template + // __hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval()...)) + setAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) + { + if (Tile* tile = DataType::probeTile(ijk)) { + if (tile->isChild()) { + ChildT* child = this->getChild(tile); + acc.insert(ijk, child); + return child->template setAndCache(ijk, acc, args...); + } + return OpT::set(*tile, args...); + } + return OpT::set(*this, args...); + } + }; // RootNode class // After the RootNode the memory layout is assumed to be the sorted Tiles @@ -3043,16 +3164,16 @@ template struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData { using ValueT = typename ChildT::ValueType; - using BuildT = typename ChildT::BuildType;// in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool + using BuildT = typename ChildT::BuildType; // in rare cases BuildType != ValueType, e.g. then BuildType = ValueMask and ValueType = bool using StatsT = typename ChildT::FloatType; using CoordT = typename ChildT::CoordType; - using MaskT = typename ChildT::template MaskType; + using MaskT = typename ChildT::template MaskType; static constexpr bool FIXED_SIZE = true; union Tile { ValueT value; - int64_t child;//signed 64 bit byte offset relative to the InternalData!! + int64_t child; //signed 64 bit byte offset relative to this InternalData, i.e. child-pointer = Tile::child + this /// @brief This class cannot be constructed or deleted Tile() = delete; Tile(const Tile&) = delete; @@ -3060,7 +3181,7 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData ~Tile() = delete; }; - BBox mBBox; // 24B. node bounding box. | + math::BBox mBBox; // 24B. node bounding box. | uint64_t mFlags; // 8B. node flags. | 32B aligned MaskT mValueMask; // LOG2DIM(5): 4096B, LOG2DIM(4): 512B | 32B aligned MaskT mChildMask; // LOG2DIM(5): 4096B, LOG2DIM(4): 512B | 32B aligned @@ -3069,16 +3190,27 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData ValueT mMaximum; // typically 4B StatsT mAverage; // typically 4B, average of all the active values in this node and its child nodes StatsT mStdDevi; // typically 4B, standard deviation of all the active values in this node and its child nodes + // possible padding, e.g. 28 byte padding when ValueType = bool + + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(InternalData) - (24u + 8u + 2 * (sizeof(MaskT) + sizeof(ValueT) + sizeof(StatsT)) + (1u << (3 * LOG2DIM)) * (sizeof(ValueT) > 8u ? 
sizeof(ValueT) : 8u)); + } alignas(32) Tile mTable[1u << (3 * LOG2DIM)]; // sizeof(ValueT) x (16*16*16 or 32*32*32) - __hostdev__ void setChild(uint32_t n, const void *ptr) + __hostdev__ static uint64_t memUsage() { return sizeof(InternalData); } + + __hostdev__ void setChild(uint32_t n, const void* ptr) { NANOVDB_ASSERT(mChildMask.isOn(n)); - mTable[n].child = PtrDiff(ptr, this); + mTable[n].child = util::PtrDiff(ptr, this); } - template - __hostdev__ void setValue(uint32_t n, const ValueT &v) + template + __hostdev__ void setValue(uint32_t n, const ValueT& v) { NANOVDB_ASSERT(!mChildMask.isOn(n)); mTable[n].value = v; @@ -3088,26 +3220,47 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData __hostdev__ ChildT* getChild(uint32_t n) { NANOVDB_ASSERT(mChildMask.isOn(n)); - return PtrAdd(this, mTable[n].child); + return util::PtrAdd(this, mTable[n].child); } __hostdev__ const ChildT* getChild(uint32_t n) const { NANOVDB_ASSERT(mChildMask.isOn(n)); - return PtrAdd(this, mTable[n].child); + return util::PtrAdd(this, mTable[n].child); + } + + __hostdev__ ValueT getValue(uint32_t n) const + { + NANOVDB_ASSERT(mChildMask.isOff(n)); + return mTable[n].value; + } + + __hostdev__ bool isActive(uint32_t n) const + { + NANOVDB_ASSERT(mChildMask.isOff(n)); + return mValueMask.isOn(n); } - template + __hostdev__ bool isChild(uint32_t n) const { return mChildMask.isOn(n); } + + template __hostdev__ void setOrigin(const T& ijk) { mBBox[0] = ijk; } - __hostdev__ const ValueT& getMin() const { return mMinimum; } - __hostdev__ const ValueT& getMax() const { return mMaximum; } - __hostdev__ const StatsT& average() const { return mAverage; } + __hostdev__ const ValueT& getMin() const { return mMinimum; } + __hostdev__ const ValueT& getMax() const { return mMaximum; } + __hostdev__ const StatsT& average() const { return mAverage; } __hostdev__ const StatsT& stdDeviation() const { return mStdDevi; } +#if defined(__GNUC__) && !defined(__APPLE__) && !defined(__llvm__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif __hostdev__ void setMin(const ValueT& v) { mMinimum = v; } __hostdev__ void setMax(const ValueT& v) { mMaximum = v; } __hostdev__ void setAvg(const StatsT& v) { mAverage = v; } __hostdev__ void setDev(const StatsT& v) { mStdDevi = v; } +#if defined(__GNUC__) && !defined(__APPLE__) && !defined(__llvm__) +#pragma GCC diagnostic pop +#endif /// @brief This class cannot be constructed or deleted InternalData() = delete; @@ -3116,9 +3269,9 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData ~InternalData() = delete; }; // InternalData -/// @brief Internal nodes of a VDB treedim(), +/// @brief Internal nodes of a VDB tree template -class InternalNode : private InternalData +class InternalNode : public InternalData { public: using DataType = InternalData; @@ -3131,6 +3284,8 @@ class InternalNode : private InternalData static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; template using MaskType = typename ChildT::template MaskType; + template + using MaskIterT = typename Mask::template Iterator; static constexpr uint32_t LOG2DIM = Log2Dim; static constexpr uint32_t TOTAL = LOG2DIM + ChildT::TOTAL; // dimension in index space @@ -3140,6 +3295,169 @@ class InternalNode : private InternalData static constexpr uint32_t LEVEL = 1 + ChildT::LEVEL; // level 0 = leaf static constexpr uint64_t NUM_VALUES = uint64_t(1) << (3 * TOTAL); // total voxel count represented by this node + /// @brief Visits child nodes of this node only + template + class ChildIter 
: public MaskIterT + { + static_assert(util::is_same::type, InternalNode>::value, "Invalid ParentT"); + using BaseT = MaskIterT; + using NodeT = typename util::match_const::type; + ParentT* mParent; + + public: + __hostdev__ ChildIter() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ChildIter(ParentT* parent) + : BaseT(parent->mChildMask.beginOn()) + , mParent(parent) + { + } + ChildIter& operator=(const ChildIter&) = default; + __hostdev__ NodeT& operator*() const + { + NANOVDB_ASSERT(*this); + return *mParent->getChild(BaseT::pos()); + } + __hostdev__ NodeT* operator->() const + { + NANOVDB_ASSERT(*this); + return mParent->getChild(BaseT::pos()); + } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(*this); + return (*this)->origin(); + } + __hostdev__ CoordType getCoord() const {return this->getOrigin();} + }; // Member class ChildIter + + using ChildIterator = ChildIter; + using ConstChildIterator = ChildIter; + + __hostdev__ ChildIterator beginChild() { return ChildIterator(this); } + __hostdev__ ConstChildIterator cbeginChild() const { return ConstChildIterator(this); } + + /// @brief Visits all tile values in this node, i.e. both inactive and active tiles + class ValueIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode* mParent; + + public: + __hostdev__ ValueIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ValueIterator(const InternalNode* parent) + : BaseT(parent->data()->mChildMask.beginOff()) + , mParent(parent) + { + } + ValueIterator& operator=(const ValueIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->data()->getValue(BaseT::pos()); + } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + __hostdev__ CoordType getCoord() const {return this->getOrigin();} + __hostdev__ bool isActive() const + { + NANOVDB_ASSERT(*this); + return mParent->data()->isActive(BaseT::mPos); + } + }; // Member class ValueIterator + + __hostdev__ ValueIterator beginValue() const { return ValueIterator(this); } + __hostdev__ ValueIterator cbeginValueAll() const { return ValueIterator(this); } + + /// @brief Visits active tile values of this node only + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const InternalNode* mParent; + + public: + __hostdev__ ValueOnIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ValueOnIterator(const InternalNode* parent) + : BaseT(parent->data()->mValueMask.beginOn()) + , mParent(parent) + { + } + ValueOnIterator& operator=(const ValueOnIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->data()->getValue(BaseT::pos()); + } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + __hostdev__ CoordType getCoord() const {return this->getOrigin();} + }; // Member class ValueOnIterator + + __hostdev__ ValueOnIterator beginValueOn() const { return ValueOnIterator(this); } + __hostdev__ ValueOnIterator cbeginValueOn() const { return ValueOnIterator(this); } + + /// @brief Visits all tile values and child nodes of this node + class DenseIterator : public Mask::DenseIterator + { + using BaseT = typename Mask::DenseIterator; + const DataType* mParent; + + public: + __hostdev__ DenseIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ DenseIterator(const InternalNode* parent) + : BaseT(0) + , 
mParent(parent->data()) + { + } + DenseIterator& operator=(const DenseIterator&) = default; + __hostdev__ const ChildT* probeChild(ValueType& value) const + { + NANOVDB_ASSERT(mParent && bool(*this)); + const ChildT* child = nullptr; + if (mParent->mChildMask.isOn(BaseT::pos())) { + child = mParent->getChild(BaseT::pos()); + } else { + value = mParent->getValue(BaseT::pos()); + } + return child; + } + __hostdev__ bool isValueOn() const + { + NANOVDB_ASSERT(mParent && bool(*this)); + return mParent->isActive(BaseT::pos()); + } + __hostdev__ CoordType getOrigin() const + { + NANOVDB_ASSERT(mParent && bool(*this)); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + __hostdev__ CoordType getCoord() const {return this->getOrigin();} + }; // Member class DenseIterator + + __hostdev__ DenseIterator beginDense() const { return DenseIterator(this); } + __hostdev__ DenseIterator cbeginChildAll() const { return DenseIterator(this); } // matches openvdb + /// @brief This class cannot be constructed or deleted InternalNode() = delete; InternalNode(const InternalNode&) = delete; @@ -3154,13 +3472,15 @@ class InternalNode : private InternalData __hostdev__ static uint32_t dim() { return 1u << TOTAL; } /// @brief Return memory usage in bytes for the class - __hostdev__ static size_t memUsage() { return sizeof(DataType); } + __hostdev__ static size_t memUsage() { return DataType::memUsage(); } /// @brief Return a const reference to the bit mask of active voxels in this internal node __hostdev__ const MaskType& valueMask() const { return DataType::mValueMask; } + __hostdev__ const MaskType& getValueMask() const { return DataType::mValueMask; } /// @brief Return a const reference to the bit mask of child nodes in this internal node __hostdev__ const MaskType& childMask() const { return DataType::mChildMask; } + __hostdev__ const MaskType& getChildMask() const { return DataType::mChildMask; } /// @brief Return the origin in index space of this leaf node __hostdev__ CoordType origin() const { return DataType::mBBox.min() & ~MASK; } @@ -3175,36 +3495,54 @@ class InternalNode : private InternalData __hostdev__ const FloatType& average() const { return DataType::mAverage; } /// @brief Return the variance of all the active values encoded in this internal node and any of its child nodes - __hostdev__ FloatType variance() const { return DataType::mStdDevi*DataType::mStdDevi; } + __hostdev__ FloatType variance() const { return DataType::mStdDevi * DataType::mStdDevi; } /// @brief Return a const reference to the standard deviation of all the active values encoded in this internal node and any of its child nodes __hostdev__ const FloatType& stdDeviation() const { return DataType::mStdDevi; } /// @brief Return a const reference to the bounding box in index space of active values in this internal node and any of its child nodes - __hostdev__ const BBox& bbox() const { return DataType::mBBox; } + __hostdev__ const math::BBox& bbox() const { return DataType::mBBox; } + + /// @brief If the first entry in this node's table is a tile, return the tile's value. + /// Otherwise, return the result of calling getFirstValue() on the child. + __hostdev__ ValueType getFirstValue() const + { + return DataType::mChildMask.isOn(0) ? this->getChild(0)->getFirstValue() : DataType::getValue(0); + } + /// @brief If the last entry in this node's table is a tile, return the tile's value. + /// Otherwise, return the result of calling getLastValue() on the child. 
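// Illustrative sketch (not part of the patch): the iterators added above let client code
// visit an internal node's children and tile values without touching mChildMask/mValueMask
// directly. For example, summing the active tile values of any instantiated InternalNode
// type (a minimal usage sketch, assuming the node type is already known):
template<typename NodeT>
__hostdev__ typename NodeT::ValueType sumActiveTiles(const NodeT& node)
{
    typename NodeT::ValueType sum{};
    for (auto it = node.beginValueOn(); it; ++it) // visits active tiles only, skips child entries
        sum += *it;
    return sum;
}
// beginChild()/cbeginChild() similarly iterate over the child nodes only, and
// cbeginChildAll() visits every table entry (matching the OpenVDB naming).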
+ __hostdev__ ValueType getLastValue() const + { + return DataType::mChildMask.isOn(SIZE - 1) ? this->getChild(SIZE - 1)->getLastValue() : DataType::getValue(SIZE - 1); + } + +#ifdef NANOVDB_NEW_ACCESSOR_METHODS /// @brief Return the value of the given voxel + __hostdev__ ValueType getValue(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + /// @brief return the state and updates the value of the specified voxel + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS __hostdev__ ValueType getValue(const CoordType& ijk) const { const uint32_t n = CoordToOffset(ijk); - return DataType::mChildMask.isOn(n) ? this->getChild(n)->getValue(ijk) : DataType::mTable[n].value; + return DataType::mChildMask.isOn(n) ? this->getChild(n)->getValue(ijk) : DataType::getValue(n); } - __hostdev__ bool isActive(const CoordType& ijk) const { const uint32_t n = CoordToOffset(ijk); - return DataType::mChildMask.isOn(n) ? this->getChild(n)->isActive(ijk) : DataType::mValueMask.isOn(n); + return DataType::mChildMask.isOn(n) ? this->getChild(n)->isActive(ijk) : DataType::isActive(n); } - __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { const uint32_t n = CoordToOffset(ijk); if (DataType::mChildMask.isOn(n)) return this->getChild(n)->probeValue(ijk, v); - v = DataType::mTable[n].value; - return DataType::mValueMask.isOn(n); + v = DataType::getValue(n); + return DataType::isActive(n); } - __hostdev__ const LeafNodeType* probeLeaf(const CoordType& ijk) const { const uint32_t n = CoordToOffset(ijk); @@ -3213,18 +3551,25 @@ class InternalNode : private InternalData return nullptr; } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + __hostdev__ ChildNodeType* probeChild(const CoordType& ijk) + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? this->getChild(n) : nullptr; + } + __hostdev__ const ChildNodeType* probeChild(const CoordType& ijk) const + { + const uint32_t n = CoordToOffset(ijk); + return DataType::mChildMask.isOn(n) ? this->getChild(n) : nullptr; + } + /// @brief Return the linear offset corresponding to the given coordinate __hostdev__ static uint32_t CoordToOffset(const CoordType& ijk) { -#if 0 - return (((ijk[0] & MASK) >> ChildT::TOTAL) << (2 * LOG2DIM)) + - (((ijk[1] & MASK) >> ChildT::TOTAL) << (LOG2DIM)) + - ((ijk[2] & MASK) >> ChildT::TOTAL); -#else - return (((ijk[0] & MASK) >> ChildT::TOTAL) << (2 * LOG2DIM)) | + return (((ijk[0] & MASK) >> ChildT::TOTAL) << (2 * LOG2DIM)) | // note, we're using bitwise OR instead of + (((ijk[1] & MASK) >> ChildT::TOTAL) << (LOG2DIM)) | ((ijk[2] & MASK) >> ChildT::TOTAL); -#endif } /// @return the local coordinate of the n'th tile or child node @@ -3249,15 +3594,31 @@ class InternalNode : private InternalData return ijk; } - /// @brief Retrun true if this node or any of its child nodes contain active values - __hostdev__ bool isActive() const + /// @brief Return true if this node or any of its child nodes contain active values + __hostdev__ bool isActive() const { return DataType::mFlags & uint32_t(2); } + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... 
args) const + { + const uint32_t n = CoordToOffset(ijk); + if (this->isChild(n)) + return this->getChild(n)->template get(ijk, args...); + return OpT::get(*this, n, args...); + } + + template + //__hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) + set(const CoordType& ijk, ArgsT&&... args) { - return DataType::mFlags & uint32_t(2); + const uint32_t n = CoordToOffset(ijk); + if (this->isChild(n)) + return this->getChild(n)->template set(ijk, args...); + return OpT::set(*this, n, args...); } private: static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(InternalData) is misaligned"); - //static_assert(offsetof(DataType, mTable) % 32 == 0, "InternalData::mTable is misaligned"); template friend class ReadAccessor; @@ -3267,72 +3628,69 @@ class InternalNode : private InternalData template friend class InternalNode; +#ifndef NANOVDB_NEW_ACCESSOR_METHODS /// @brief Private read access method used by the ReadAccessor template __hostdev__ ValueType getValueAndCache(const CoordType& ijk, const AccT& acc) const { const uint32_t n = CoordToOffset(ijk); - if (!DataType::mChildMask.isOn(n)) - return DataType::mTable[n].value; + if (DataType::mChildMask.isOff(n)) + return DataType::getValue(n); const ChildT* child = this->getChild(n); acc.insert(ijk, child); return child->getValueAndCache(ijk, acc); } - - template - __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& ijk, const AccT& acc) const - { - using NodeInfoT = typename AccT::NodeInfo; - const uint32_t n = CoordToOffset(ijk); - if (!DataType::mChildMask.isOn(n)) { - return NodeInfoT{LEVEL, this->dim(), this->minimum(), this->maximum(), this->average(), - this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; - } - const ChildT* child = this->getChild(n); - acc.insert(ijk, child); - return child->getNodeInfoAndCache(ijk, acc); - } - template __hostdev__ bool isActiveAndCache(const CoordType& ijk, const AccT& acc) const { const uint32_t n = CoordToOffset(ijk); - if (!DataType::mChildMask.isOn(n)) - return DataType::mValueMask.isOn(n); + if (DataType::mChildMask.isOff(n)) + return DataType::isActive(n); const ChildT* child = this->getChild(n); acc.insert(ijk, child); return child->isActiveAndCache(ijk, acc); } - template __hostdev__ bool probeValueAndCache(const CoordType& ijk, ValueType& v, const AccT& acc) const { const uint32_t n = CoordToOffset(ijk); - if (!DataType::mChildMask.isOn(n)) { - v = DataType::mTable[n].value; - return DataType::mValueMask.isOn(n); + if (DataType::mChildMask.isOff(n)) { + v = DataType::getValue(n); + return DataType::isActive(n); } const ChildT* child = this->getChild(n); acc.insert(ijk, child); return child->probeValueAndCache(ijk, v, acc); } - template __hostdev__ const LeafNodeType* probeLeafAndCache(const CoordType& ijk, const AccT& acc) const { const uint32_t n = CoordToOffset(ijk); - if (!DataType::mChildMask.isOn(n)) + if (DataType::mChildMask.isOff(n)) return nullptr; const ChildT* child = this->getChild(n); acc.insert(ijk, child); return child->probeLeafAndCache(ijk, acc); } + template + __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& ijk, const AccT& acc) const + { + using NodeInfoT = typename AccT::NodeInfo; + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) { + return NodeInfoT{LEVEL, this->dim(), this->minimum(), this->maximum(), this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; + } + const ChildT* child = 
this->getChild(n); + acc.insert(ijk, child); + return child->getNodeInfoAndCache(ijk, acc); + } +#endif // NANOVDB_NEW_ACCESSOR_METHODS template __hostdev__ uint32_t getDimAndCache(const CoordType& ijk, const RayT& ray, const AccT& acc) const { - if (DataType::mFlags & uint32_t(1)) - this->dim(); //ship this node if first bit is set + if (DataType::mFlags & uint32_t(1u)) + return this->dim(); // skip this node if the 1st bit is set //if (!ray.intersects( this->bbox() )) return 1< return ChildNodeType::dim(); // tile value } + template + __hostdev__ auto + //__hostdev__ decltype(OpT::get(util::declval(), util::declval(), util::declval()...)) + getAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) const + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) + return OpT::get(*this, n, args...); + const ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->template getAndCache(ijk, acc, args...); + } + + template + //__hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) + setAndCache(const CoordType& ijk, const AccT& acc, ArgsT&&... args) + { + const uint32_t n = CoordToOffset(ijk); + if (DataType::mChildMask.isOff(n)) + return OpT::set(*this, n, args...); + ChildT* child = this->getChild(n); + acc.insert(ijk, child); + return child->template setAndCache(ijk, acc, args...); + } + }; // InternalNode class -// --------------------------> LeafNode <------------------------------------ +// --------------------------> LeafData <------------------------------------ /// @brief Stuct with all the member data of the LeafNode (useful during serialization of an openvdb LeafNode) /// @@ -3359,12 +3743,12 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData using ValueType = ValueT; using BuildType = ValueT; using FloatType = typename FloatTraits::FloatType; - using ArrayType = ValueT;// type used for the internal mValue array + using ArrayType = ValueT; // type used for the internal mValue array static constexpr bool FIXED_SIZE = true; CoordT mBBoxMin; // 12B. uint8_t mBBoxDif[3]; // 3B. - uint8_t mFlags; // 1B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN MaskT mValueMask; // LOG2DIM(3): 64B. ValueType mMinimum; // typically 4B @@ -3373,14 +3757,25 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData FloatType mStdDevi; // typically 4B, standard deviation of all the active values in this node and its child nodes alignas(32) ValueType mValues[1u << 3 * LOG2DIM]; - //__hostdev__ const ValueType* values() const { return mValues; } + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. 
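// Illustrative sketch (an assumption based on the bit-field comments above, not an
// official API): the single mFlags byte in the leaf structs encodes several independent
// properties, which the node methods further below test with plain bit masks.
struct LeafFlagsView
{
    uint8_t raw; // copy of LeafData::mFlags
    __hostdev__ bool skipRender() const { return raw & uint8_t(1); }              // bit 0
    __hostdev__ bool hasBBox() const { return raw & uint8_t(2); }                 // bit 1, see LeafNode::hasBBox()
    __hostdev__ bool hasStats() const { return raw & (uint8_t(1) << 4); }         // bit 4, see LeafIndexBase::hasStats()
    __hostdev__ uint8_t fpnBitWidth() const { return uint8_t(1u << (raw >> 5)); } // bits 5-7, FpN leaves only
};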
+ __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafData) - (12 + 3 + 1 + sizeof(MaskT) + 2 * (sizeof(ValueT) + sizeof(FloatType)) + (1u << (3 * LOG2DIM)) * sizeof(ValueT)); + } + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + + __hostdev__ static bool hasStats() { return true; } + __hostdev__ ValueType getValue(uint32_t i) const { return mValues[i]; } - __hostdev__ void setValueOnly(uint32_t offset, const ValueType& value) { mValues[offset] = value; } - __hostdev__ void setValue(uint32_t offset, const ValueType& value) + __hostdev__ void setValueOnly(uint32_t offset, const ValueType& value) { mValues[offset] = value; } + __hostdev__ void setValue(uint32_t offset, const ValueType& value) { mValueMask.setOn(offset); mValues[offset] = value; } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } __hostdev__ ValueType getMin() const { return mMinimum; } __hostdev__ ValueType getMax() const { return mMaximum; } @@ -3392,9 +3787,15 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData __hostdev__ void setAvg(const FloatType& v) { mAverage = v; } __hostdev__ void setDev(const FloatType& v) { mStdDevi = v; } - template + template __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + __hostdev__ void fill(const ValueType& v) + { + for (auto *p = mValues, *q = p + 512; p != q; ++p) + *p = v; + } + /// @brief This class cannot be constructed or deleted LeafData() = delete; LeafData(const LeafData&) = delete; @@ -3402,6 +3803,8 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData ~LeafData() = delete; }; // LeafData +// --------------------------> LeafFnBase <------------------------------------ + /// @brief Base-class for quantized float leaf nodes template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafFnBase @@ -3413,47 +3816,62 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafFnBase CoordT mBBoxMin; // 12B. uint8_t mBBoxDif[3]; // 3B. - uint8_t mFlags; // 1B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN MaskT mValueMask; // LOG2DIM(3): 64B. - float mMinimum; // 4B - minimum of ALL values in this node - float mQuantum; // = (max - min)/15 4B - uint16_t mMin, mMax, mAvg, mDev;// quantized representations of statistics of active values + float mMinimum; // 4B - minimum of ALL values in this node + float mQuantum; // = (max - min)/15 4B + uint16_t mMin, mMax, mAvg, mDev; // quantized representations of statistics of active values + // no padding since it's always 32B aligned + __hostdev__ static uint64_t memUsage() { return sizeof(LeafFnBase); } - void init(float min, float max, uint8_t bitWidth) + __hostdev__ static bool hasStats() { return true; } + + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. 
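// Illustrative sketch (not part of the patch) of the fixed-point scheme used by
// LeafFnBase above and the Fp4/Fp8/Fp16 leaves below: a leaf stores a per-node
// (mMinimum, mQuantum) pair and each voxel value as an integer code, so
//   code  = round((v - min) / quantum),   value = code * quantum + min,
// with quantum = (max - min) / (2^bitWidth - 1). For Fp4, two codes share one byte.
#include <cstdint>
struct FpSketch
{
    float minimum, quantum;
    FpSketch(float mn, float mx, uint8_t bitWidth)
        : minimum(mn)
        , quantum((mx - mn) / float((1u << bitWidth) - 1u))
    {
    }
    uint16_t encode(float v) const { return uint16_t((v - minimum) / quantum + 0.5f); }
    float    decode(uint16_t code) const { return code * quantum + minimum; }
    // 4-bit variant: even voxel indices live in the low nibble, odd ones in the high nibble
    static uint8_t unpack4(const uint8_t* codes, uint32_t i) { return (codes[i >> 1] >> ((i & 1u) << 2)) & uint8_t(15); }
};
// The price of the compression is a quantization error of at most quantum/2 per voxel,
// which is why these leaf types are typically produced by quantizing an existing
// full-precision float grid rather than edited in place.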
+ __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafFnBase) - (12 + 3 + 1 + sizeof(MaskT) + 2 * 4 + 4 * 2); + } + __hostdev__ void init(float min, float max, uint8_t bitWidth) { mMinimum = min; - mQuantum = (max - min)/float((1 << bitWidth)-1); + mQuantum = (max - min) / float((1 << bitWidth) - 1); } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + /// @brief return the quantized minimum of the active values in this node - __hostdev__ float getMin() const { return mMin*mQuantum + mMinimum; } + __hostdev__ float getMin() const { return mMin * mQuantum + mMinimum; } /// @brief return the quantized maximum of the active values in this node - __hostdev__ float getMax() const { return mMax*mQuantum + mMinimum; } + __hostdev__ float getMax() const { return mMax * mQuantum + mMinimum; } /// @brief return the quantized average of the active values in this node - __hostdev__ float getAvg() const { return mAvg*mQuantum + mMinimum; } + __hostdev__ float getAvg() const { return mAvg * mQuantum + mMinimum; } /// @brief return the quantized standard deviation of the active values in this node /// @note 0 <= StdDev <= max-min or 0 <= StdDev/(max-min) <= 1 - __hostdev__ float getDev() const { return mDev*mQuantum; } + __hostdev__ float getDev() const { return mDev * mQuantum; } /// @note min <= X <= max or 0 <= (X-min)/(min-max) <= 1 - __hostdev__ void setMin(float min) { mMin = uint16_t((min - mMinimum)/mQuantum + 0.5f); } + __hostdev__ void setMin(float min) { mMin = uint16_t((min - mMinimum) / mQuantum + 0.5f); } /// @note min <= X <= max or 0 <= (X-min)/(min-max) <= 1 - __hostdev__ void setMax(float max) { mMax = uint16_t((max - mMinimum)/mQuantum + 0.5f); } + __hostdev__ void setMax(float max) { mMax = uint16_t((max - mMinimum) / mQuantum + 0.5f); } /// @note min <= avg <= max or 0 <= (avg-min)/(min-max) <= 1 - __hostdev__ void setAvg(float avg) { mAvg = uint16_t((avg - mMinimum)/mQuantum + 0.5f); } + __hostdev__ void setAvg(float avg) { mAvg = uint16_t((avg - mMinimum) / mQuantum + 0.5f); } /// @note 0 <= StdDev <= max-min or 0 <= StdDev/(max-min) <= 1 - __hostdev__ void setDev(float dev) { mDev = uint16_t(dev/mQuantum + 0.5f); } + __hostdev__ void setDev(float dev) { mDev = uint16_t(dev / mQuantum + 0.5f); } - template + template __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } -};// LeafFnBase +}; // LeafFnBase + +// --------------------------> LeafData <------------------------------------ /// @brief Stuct with all the member data of the LeafNode (useful during serialization of an openvdb LeafNode) /// @@ -3464,18 +3882,25 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData; using BuildType = Fp4; - using ArrayType = uint8_t;// type used for the internal mValue array + using ArrayType = uint8_t; // type used for the internal mValue array static constexpr bool FIXED_SIZE = true; - alignas(32) uint8_t mCode[1u << (3 * LOG2DIM - 1)]; + alignas(32) uint8_t mCode[1u << (3 * LOG2DIM - 1)]; // LeafFnBase is 32B aligned and so is mCode + + __hostdev__ static constexpr uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static constexpr uint32_t padding() + { + static_assert(BaseT::padding() == 0, "expected no padding in LeafFnBase"); + return sizeof(LeafData) - sizeof(BaseT) - (1u << (3 * LOG2DIM - 1)); + } __hostdev__ static constexpr uint8_t bitWidth() { return 4u; } - __hostdev__ float getValue(uint32_t i) const + __hostdev__ float getValue(uint32_t i) const { #if 0 const uint8_t c = mCode[i>>1]; return ( (i&1) ? 
c >> 4 : c & uint8_t(15) )*BaseT::mQuantum + BaseT::mMinimum; #else - return ((mCode[i>>1] >> ((i&1)<<2)) & uint8_t(15))*BaseT::mQuantum + BaseT::mMinimum; + return ((mCode[i >> 1] >> ((i & 1) << 2)) & uint8_t(15)) * BaseT::mQuantum + BaseT::mMinimum; #endif } @@ -3486,20 +3911,28 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +// --------------------------> LeafBase <------------------------------------ + template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData : public LeafFnBase { using BaseT = LeafFnBase; using BuildType = Fp8; - using ArrayType = uint8_t;// type used for the internal mValue array + using ArrayType = uint8_t; // type used for the internal mValue array static constexpr bool FIXED_SIZE = true; alignas(32) uint8_t mCode[1u << 3 * LOG2DIM]; + __hostdev__ static constexpr int64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static constexpr uint32_t padding() + { + static_assert(BaseT::padding() == 0, "expected no padding in LeafFnBase"); + return sizeof(LeafData) - sizeof(BaseT) - (1u << 3 * LOG2DIM); + } __hostdev__ static constexpr uint8_t bitWidth() { return 8u; } - __hostdev__ float getValue(uint32_t i) const + __hostdev__ float getValue(uint32_t i) const { - return mCode[i]*BaseT::mQuantum + BaseT::mMinimum;// code * (max-min)/255 + min + return mCode[i] * BaseT::mQuantum + BaseT::mMinimum; // code * (max-min)/255 + min } /// @brief This class cannot be constructed or deleted LeafData() = delete; @@ -3508,20 +3941,29 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +// --------------------------> LeafData <------------------------------------ + template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData : public LeafFnBase { using BaseT = LeafFnBase; using BuildType = Fp16; - using ArrayType = uint16_t;// type used for the internal mValue array + using ArrayType = uint16_t; // type used for the internal mValue array static constexpr bool FIXED_SIZE = true; alignas(32) uint16_t mCode[1u << 3 * LOG2DIM]; + __hostdev__ static constexpr uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static constexpr uint32_t padding() + { + static_assert(BaseT::padding() == 0, "expected no padding in LeafFnBase"); + return sizeof(LeafData) - sizeof(BaseT) - 2 * (1u << 3 * LOG2DIM); + } + __hostdev__ static constexpr uint8_t bitWidth() { return 16u; } - __hostdev__ float getValue(uint32_t i) const + __hostdev__ float getValue(uint32_t i) const { - return mCode[i]*BaseT::mQuantum + BaseT::mMinimum;// code * (max-min)/65535 + min + return mCode[i] * BaseT::mQuantum + BaseT::mMinimum; // code * (max-min)/65535 + min } /// @brief This class cannot be constructed or deleted @@ -3531,54 +3973,61 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +// --------------------------> LeafData <------------------------------------ + template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData : public LeafFnBase -{ +{ // this class has no additional data members, however every instance is immediately followed by + // bitWidth*64 bytes. 
Since its base class is 32B aligned so are the bitWidth*64 bytes using BaseT = LeafFnBase; using BuildType = FpN; static constexpr bool FIXED_SIZE = false; + __hostdev__ static constexpr uint32_t padding() + { + static_assert(BaseT::padding() == 0, "expected no padding in LeafFnBase"); + return 0; + } - __hostdev__ uint8_t bitWidth() const { return 1 << (BaseT::mFlags >> 5); }// 4,8,16,32 = 2^(2,3,4,5) - __hostdev__ size_t memUsage() const { return sizeof(*this) + this->bitWidth()*64; } - __hostdev__ static size_t memUsage(uint32_t bitWidth) { return 96u + bitWidth*64; } - __hostdev__ float getValue(uint32_t i) const + __hostdev__ uint8_t bitWidth() const { return 1 << (BaseT::mFlags >> 5); } // 4,8,16,32 = 2^(2,3,4,5) + __hostdev__ size_t memUsage() const { return sizeof(*this) + this->bitWidth() * 64; } + __hostdev__ static size_t memUsage(uint32_t bitWidth) { return 96u + bitWidth * 64; } + __hostdev__ float getValue(uint32_t i) const { -#ifdef NANOVDB_FPN_BRANCHLESS// faster - const int b = BaseT::mFlags >> 5;// b = 0, 1, 2, 3, 4 corresponding to 1, 2, 4, 8, 16 bits -#if 0// use LUT +#ifdef NANOVDB_FPN_BRANCHLESS // faster + const int b = BaseT::mFlags >> 5; // b = 0, 1, 2, 3, 4 corresponding to 1, 2, 4, 8, 16 bits +#if 0 // use LUT uint16_t code = reinterpret_cast(this + 1)[i >> (4 - b)]; const static uint8_t shift[5] = {15, 7, 3, 1, 0}; const static uint16_t mask[5] = {1, 3, 15, 255, 65535}; code >>= (i & shift[b]) << b; code &= mask[b]; -#else// no LUT +#else // no LUT uint32_t code = reinterpret_cast(this + 1)[i >> (5 - b)]; - //code >>= (i & ((16 >> b) - 1)) << b; code >>= (i & ((32 >> b) - 1)) << b; - code &= (1 << (1 << b)) - 1; + code &= (1 << (1 << b)) - 1; #endif -#else// use branched version (slow) +#else // use branched version (slow) float code; - auto *values = reinterpret_cast(this+1); + auto* values = reinterpret_cast(this + 1); switch (BaseT::mFlags >> 5) { - case 0u:// 1 bit float - code = float((values[i>>3] >> (i&7) ) & uint8_t(1)); - break; - case 1u:// 2 bits float - code = float((values[i>>2] >> ((i&3)<<1)) & uint8_t(3)); - break; - case 2u:// 4 bits float - code = float((values[i>>1] >> ((i&1)<<2)) & uint8_t(15)); - break; - case 3u:// 8 bits float - code = float(values[i]); - break; - default:// 16 bits float - code = float(reinterpret_cast(values)[i]); + case 0u: // 1 bit float + code = float((values[i >> 3] >> (i & 7)) & uint8_t(1)); + break; + case 1u: // 2 bits float + code = float((values[i >> 2] >> ((i & 3) << 1)) & uint8_t(3)); + break; + case 2u: // 4 bits float + code = float((values[i >> 1] >> ((i & 1) << 2)) & uint8_t(15)); + break; + case 3u: // 8 bits float + code = float(values[i]); + break; + default: // 16 bits float + code = float(reinterpret_cast(values)[i]); } #endif - return float(code) * BaseT::mQuantum + BaseT::mMinimum;// code * (max-min)/UNITS + min + return float(code) * BaseT::mQuantum + BaseT::mMinimum; // code * (max-min)/UNITS + min } /// @brief This class cannot be constructed or deleted @@ -3588,6 +4037,8 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +// --------------------------> LeafData <------------------------------------ + // Partial template specialization of LeafData with bool template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData @@ -3596,34 +4047,37 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData) == sizeof(Mask), "Mismatching sizeof"); using ValueType = bool; using BuildType = bool; - using FloatType = bool;// dummy value type - using ArrayType = MaskT;// type 
used for the internal mValue array + using FloatType = bool; // dummy value type + using ArrayType = MaskT; // type used for the internal mValue array static constexpr bool FIXED_SIZE = true; CoordT mBBoxMin; // 12B. uint8_t mBBoxDif[3]; // 3B. - uint8_t mFlags; // 1B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN MaskT mValueMask; // LOG2DIM(3): 64B. MaskT mValues; // LOG2DIM(3): 64B. + uint64_t mPadding[2]; // 16B padding to 32B alignment - //__hostdev__ const ValueType* values() const { return nullptr; } + __hostdev__ static constexpr uint32_t padding() { return sizeof(LeafData) - 12u - 3u - 1u - 2 * sizeof(MaskT) - 16u; } + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static bool hasStats() { return false; } __hostdev__ bool getValue(uint32_t i) const { return mValues.isOn(i); } - __hostdev__ bool getMin() const { return false; }// dummy - __hostdev__ bool getMax() const { return false; }// dummy - __hostdev__ bool getAvg() const { return false; }// dummy - __hostdev__ bool getDev() const { return false; }// dummy + __hostdev__ bool getMin() const { return false; } // dummy + __hostdev__ bool getMax() const { return false; } // dummy + __hostdev__ bool getAvg() const { return false; } // dummy + __hostdev__ bool getDev() const { return false; } // dummy __hostdev__ void setValue(uint32_t offset, bool v) { mValueMask.setOn(offset); mValues.set(offset, v); } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + __hostdev__ void setMin(const bool&) {} // no-op + __hostdev__ void setMax(const bool&) {} // no-op + __hostdev__ void setAvg(const bool&) {} // no-op + __hostdev__ void setDev(const bool&) {} // no-op - __hostdev__ void setMin(const bool&) {}// no-op - __hostdev__ void setMax(const bool&) {}// no-op - __hostdev__ void setAvg(const bool&) {}// no-op - __hostdev__ void setDev(const bool&) {}// no-op - - template + template __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } /// @brief This class cannot be constructed or deleted @@ -3633,6 +4087,8 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +// --------------------------> LeafData <------------------------------------ + // Partial template specialization of LeafData with ValueMask template class MaskT, uint32_t LOG2DIM> struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData @@ -3641,32 +4097,217 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData) == sizeof(Mask), "Mismatching sizeof"); using ValueType = bool; using BuildType = ValueMask; - using FloatType = bool;// dummy value type - using ArrayType = void;// type used for the internal mValue array - void means missing + using FloatType = bool; // dummy value type + using ArrayType = void; // type used for the internal mValue array - void means missing static constexpr bool FIXED_SIZE = true; CoordT mBBoxMin; // 12B. uint8_t mBBoxDif[3]; // 3B. - uint8_t mFlags; // 1B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN MaskT mValueMask; // LOG2DIM(3): 64B. 
+ uint64_t mPadding[2]; // 16B padding to 32B alignment + + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ static bool hasStats() { return false; } + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafData) - (12u + 3u + 1u + sizeof(MaskT) + 2 * 8u); + } - //__hostdev__ const ValueType* values() const { return nullptr; } __hostdev__ bool getValue(uint32_t i) const { return mValueMask.isOn(i); } - __hostdev__ bool getMin() const { return false; }// dummy - __hostdev__ bool getMax() const { return false; }// dummy - __hostdev__ bool getAvg() const { return false; }// dummy - __hostdev__ bool getDev() const { return false; }// dummy - __hostdev__ void setValue(uint32_t offset, bool) + __hostdev__ bool getMin() const { return false; } // dummy + __hostdev__ bool getMax() const { return false; } // dummy + __hostdev__ bool getAvg() const { return false; } // dummy + __hostdev__ bool getDev() const { return false; } // dummy + __hostdev__ void setValue(uint32_t offset, bool) { mValueMask.setOn(offset); } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + __hostdev__ void setMin(const ValueType&) {} // no-op + __hostdev__ void setMax(const ValueType&) {} // no-op + __hostdev__ void setAvg(const FloatType&) {} // no-op + __hostdev__ void setDev(const FloatType&) {} // no-op + + template + __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + + /// @brief This class cannot be constructed or deleted + LeafData() = delete; + LeafData(const LeafData&) = delete; + LeafData& operator=(const LeafData&) = delete; + ~LeafData() = delete; +}; // LeafData + +// --------------------------> LeafIndexBase <------------------------------------ + +// Partial template specialization of LeafData with ValueIndex +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafIndexBase +{ + static_assert(sizeof(CoordT) == sizeof(Coord), "Mismatching sizeof"); + static_assert(sizeof(MaskT) == sizeof(Mask), "Mismatching sizeof"); + using ValueType = uint64_t; + using FloatType = uint64_t; + using ArrayType = void; // type used for the internal mValue array - void means missing + static constexpr bool FIXED_SIZE = true; + + CoordT mBBoxMin; // 12B. + uint8_t mBBoxDif[3]; // 3B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN + MaskT mValueMask; // LOG2DIM(3): 64B. 
+ uint64_t mOffset, mPrefixSum; // 8B offset to first value in this leaf node and 9-bit prefix sum + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafIndexBase) - (12u + 3u + 1u + sizeof(MaskT) + 2 * 8u); + } + __hostdev__ static uint64_t memUsage() { return sizeof(LeafIndexBase); } + __hostdev__ bool hasStats() const { return mFlags & (uint8_t(1) << 4); } + // return the offset to the first value indexed by this leaf node + __hostdev__ const uint64_t& firstOffset() const { return mOffset; } + __hostdev__ void setMin(const ValueType&) {} // no-op + __hostdev__ void setMax(const ValueType&) {} // no-op + __hostdev__ void setAvg(const FloatType&) {} // no-op + __hostdev__ void setDev(const FloatType&) {} // no-op + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + template + __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } + +protected: + /// @brief This class should be used as an abstract class and only constructed or deleted via child classes + LeafIndexBase() = default; + LeafIndexBase(const LeafIndexBase&) = default; + LeafIndexBase& operator=(const LeafIndexBase&) = default; + ~LeafIndexBase() = default; +}; // LeafIndexBase + +// --------------------------> LeafData <------------------------------------ + +// Partial template specialization of LeafData with ValueIndex +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafIndexBase +{ + using BaseT = LeafIndexBase; + using BuildType = ValueIndex; + // return the total number of values indexed by this leaf node, excluding the optional 4 stats + __hostdev__ static uint32_t valueCount() { return uint32_t(512); } // 8^3 = 2^9 + // return the offset to the last value indexed by this leaf node (disregarding optional stats) + __hostdev__ uint64_t lastOffset() const { return BaseT::mOffset + 511u; } // 2^9 - 1 + // if stats are available, they are always placed after the last voxel value in this leaf node + __hostdev__ uint64_t getMin() const { return this->hasStats() ? BaseT::mOffset + 512u : 0u; } + __hostdev__ uint64_t getMax() const { return this->hasStats() ? BaseT::mOffset + 513u : 0u; } + __hostdev__ uint64_t getAvg() const { return this->hasStats() ? BaseT::mOffset + 514u : 0u; } + __hostdev__ uint64_t getDev() const { return this->hasStats() ? BaseT::mOffset + 515u : 0u; } + __hostdev__ uint64_t getValue(uint32_t i) const { return BaseT::mOffset + i; } // dense leaf node with active and inactive voxels +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafIndexBase +{ + using BaseT = LeafIndexBase; + using BuildType = ValueOnIndex; + __hostdev__ uint32_t valueCount() const + { + return util::countOn(BaseT::mValueMask.words()[7]) + (BaseT::mPrefixSum >> 54u & 511u); // last 9 bits of mPrefixSum do not account for the last word in mValueMask + } + __hostdev__ uint64_t lastOffset() const { return BaseT::mOffset + this->valueCount() - 1u; } + __hostdev__ uint64_t getMin() const { return this->hasStats() ? this->lastOffset() + 1u : 0u; } + __hostdev__ uint64_t getMax() const { return this->hasStats() ? this->lastOffset() + 2u : 0u; } + __hostdev__ uint64_t getAvg() const { return this->hasStats() ? this->lastOffset() + 3u : 0u; } + __hostdev__ uint64_t getDev() const { return this->hasStats() ? 
this->lastOffset() + 4u : 0u; } + __hostdev__ uint64_t getValue(uint32_t i) const + { + //return mValueMask.isOn(i) ? mOffset + mValueMask.countOn(i) : 0u;// for debugging + uint32_t n = i >> 6; + const uint64_t w = BaseT::mValueMask.words()[n], mask = uint64_t(1) << (i & 63u); + if (!(w & mask)) return uint64_t(0); // if i'th value is inactive return offset to background value + uint64_t sum = BaseT::mOffset + util::countOn(w & (mask - 1u)); + if (n--) sum += BaseT::mPrefixSum >> (9u * n) & 511u; + return sum; + } +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafData +{ + using BuildType = ValueIndexMask; + MaskT mMask; + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ bool isMaskOn(uint32_t offset) const { return mMask.isOn(offset); } + __hostdev__ void setMask(uint32_t offset, bool v) { mMask.set(offset, v); } +}; // LeafData + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData + : public LeafData +{ + using BuildType = ValueOnIndexMask; + MaskT mMask; + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + __hostdev__ bool isMaskOn(uint32_t offset) const { return mMask.isOn(offset); } + __hostdev__ void setMask(uint32_t offset, bool v) { mMask.set(offset, v); } +}; // LeafData + +// --------------------------> LeafData <------------------------------------ + +template class MaskT, uint32_t LOG2DIM> +struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +{ + static_assert(sizeof(CoordT) == sizeof(Coord), "Mismatching sizeof"); + static_assert(sizeof(MaskT) == sizeof(Mask), "Mismatching sizeof"); + using ValueType = uint64_t; + using BuildType = Point; + using FloatType = typename FloatTraits::FloatType; + using ArrayType = uint16_t; // type used for the internal mValue array + static constexpr bool FIXED_SIZE = true; + + CoordT mBBoxMin; // 12B. + uint8_t mBBoxDif[3]; // 3B. + uint8_t mFlags; // 1B. bit0: skip render?, bit1: has bbox?, bit3: unused, bit4: has stats, bits5,6,7: bit-width for FpN + MaskT mValueMask; // LOG2DIM(3): 64B. + + uint64_t mOffset; // 8B + uint64_t mPointCount; // 8B + alignas(32) uint16_t mValues[1u << 3 * LOG2DIM]; // 1KB + // no padding + + /// @brief Return padding of this class in bytes, due to aliasing and 32B alignment + /// + /// @note The extra bytes are not necessarily at the end, but can come from aliasing of individual data members. + __hostdev__ static constexpr uint32_t padding() + { + return sizeof(LeafData) - (12u + 3u + 1u + sizeof(MaskT) + 2 * 8u + (1u << 3 * LOG2DIM) * 2u); + } + __hostdev__ static uint64_t memUsage() { return sizeof(LeafData); } + + __hostdev__ uint64_t offset() const { return mOffset; } + __hostdev__ uint64_t pointCount() const { return mPointCount; } + __hostdev__ uint64_t first(uint32_t i) const { return i ? 
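// Illustrative sketch (not part of the patch) of the index-leaf addressing above. A
// ValueIndex leaf gives every one of its 512 voxels a slot, so getValue(i) is simply
// mOffset + i. A ValueOnIndex leaf only gives slots to active voxels: mPrefixSum packs,
// in seven 9-bit fields, the running population counts of mask words 0..6, and the slot
// of an active voxel i is the base offset plus the number of active voxels before it.
// A standalone version of that lookup (countOn standing in for util::countOn):
#include <cstdint>
inline uint32_t countOn(uint64_t w) { return uint32_t(__builtin_popcountll(w)); } // assumes GCC/Clang

inline uint64_t buildPrefixSum(const uint64_t words[8])
{
    uint64_t prefix = 0, sum = 0;
    for (int n = 0; n < 7; ++n) { // the last mask word never needs a partial sum
        sum += countOn(words[n]);
        prefix |= sum << (9 * n); // each partial sum is <= 448 and fits in 9 bits
    }
    return prefix;
}

inline uint64_t activeValueIndex(const uint64_t words[8], uint64_t prefixSum, uint64_t offset, uint32_t i)
{
    uint32_t n = i >> 6; // which 64-bit mask word voxel i falls into
    const uint64_t w = words[n], mask = uint64_t(1) << (i & 63u);
    if (!(w & mask)) return 0u; // inactive voxels all map to the shared background slot 0
    uint64_t sum = offset + countOn(w & (mask - 1u)); // active voxels before i in word n
    if (n--) sum += (prefixSum >> (9u * n)) & 511u;   // plus active voxels in words 0..n-1
    return sum;
}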
uint64_t(mValues[i - 1u]) + mOffset : mOffset; } + __hostdev__ uint64_t last(uint32_t i) const { return uint64_t(mValues[i]) + mOffset; } + __hostdev__ uint64_t getValue(uint32_t i) const { return uint64_t(mValues[i]); } + __hostdev__ void setValueOnly(uint32_t offset, uint16_t value) { mValues[offset] = value; } + __hostdev__ void setValue(uint32_t offset, uint16_t value) { mValueMask.setOn(offset); + mValues[offset] = value; } + __hostdev__ void setOn(uint32_t offset) { mValueMask.setOn(offset); } + + __hostdev__ ValueType getMin() const { return mOffset; } + __hostdev__ ValueType getMax() const { return mPointCount; } + __hostdev__ FloatType getAvg() const { return 0.0f; } + __hostdev__ FloatType getDev() const { return 0.0f; } - __hostdev__ void setMin(const ValueType&) {}// no-op - __hostdev__ void setMax(const ValueType&) {}// no-op - __hostdev__ void setAvg(const FloatType&) {}// no-op - __hostdev__ void setDev(const FloatType&) {}// no-op + __hostdev__ void setMin(const ValueType&) {} + __hostdev__ void setMax(const ValueType&) {} + __hostdev__ void setAvg(const FloatType&) {} + __hostdev__ void setDev(const FloatType&) {} - template + template __hostdev__ void setOrigin(const T& ijk) { mBBoxMin = ijk; } /// @brief This class cannot be constructed or deleted @@ -3674,18 +4315,22 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) LeafData +}; // LeafData + +// --------------------------> LeafNode <------------------------------------ /// @brief Leaf nodes of the VDB tree. (defaults to 8x8x8 = 512 voxels) template class MaskT = Mask, uint32_t Log2Dim = 3> -class LeafNode : private LeafData +class LeafNode : public LeafData { public: struct ChildNodeType { + static constexpr uint32_t TOTAL = 0; + static constexpr uint32_t DIM = 1; __hostdev__ static uint32_t dim() { return 1u; } }; // Voxel using LeafNodeType = LeafNode; @@ -3697,8 +4342,127 @@ class LeafNode : private LeafData static constexpr bool FIXED_SIZE = DataType::FIXED_SIZE; template using MaskType = MaskT; + template + using MaskIterT = typename Mask::template Iterator; + + /// @brief Visits all active values in a leaf node + class ValueOnIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode* mParent; + + public: + __hostdev__ ValueOnIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ValueOnIterator(const LeafNode* parent) + : BaseT(parent->data()->mValueMask.beginOn()) + , mParent(parent) + { + } + ValueOnIterator& operator=(const ValueOnIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->getValue(BaseT::pos()); + } + __hostdev__ CoordT getCoord() const + { + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + }; // Member class ValueOnIterator + + __hostdev__ ValueOnIterator beginValueOn() const { return ValueOnIterator(this); } + __hostdev__ ValueOnIterator cbeginValueOn() const { return ValueOnIterator(this); } + + /// @brief Visits all inactive values in a leaf node + class ValueOffIterator : public MaskIterT + { + using BaseT = MaskIterT; + const LeafNode* mParent; + + public: + __hostdev__ ValueOffIterator() + : BaseT() + , mParent(nullptr) + { + } + __hostdev__ ValueOffIterator(const LeafNode* parent) + : BaseT(parent->data()->mValueMask.beginOff()) + , mParent(parent) + { + } + ValueOffIterator& operator=(const ValueOffIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->getValue(BaseT::pos()); + } + __hostdev__ CoordT getCoord() const + 
{ + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(BaseT::pos()); + } + }; // Member class ValueOffIterator + + __hostdev__ ValueOffIterator beginValueOff() const { return ValueOffIterator(this); } + __hostdev__ ValueOffIterator cbeginValueOff() const { return ValueOffIterator(this); } + + /// @brief Visits all values in a leaf node, i.e. both active and inactive values + class ValueIterator + { + const LeafNode* mParent; + uint32_t mPos; + + public: + __hostdev__ ValueIterator() + : mParent(nullptr) + , mPos(1u << 3 * Log2Dim) + { + } + __hostdev__ ValueIterator(const LeafNode* parent) + : mParent(parent) + , mPos(0) + { + NANOVDB_ASSERT(parent); + } + ValueIterator& operator=(const ValueIterator&) = default; + __hostdev__ ValueType operator*() const + { + NANOVDB_ASSERT(*this); + return mParent->getValue(mPos); + } + __hostdev__ CoordT getCoord() const + { + NANOVDB_ASSERT(*this); + return mParent->offsetToGlobalCoord(mPos); + } + __hostdev__ bool isActive() const + { + NANOVDB_ASSERT(*this); + return mParent->isActive(mPos); + } + __hostdev__ operator bool() const { return mPos < (1u << 3 * Log2Dim); } + __hostdev__ ValueIterator& operator++() + { + ++mPos; + return *this; + } + __hostdev__ ValueIterator operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + }; // Member class ValueIterator - static_assert(is_same::Type>::value, "Mismatching BuildType"); + __hostdev__ ValueIterator beginValue() const { return ValueIterator(this); } + __hostdev__ ValueIterator cbeginValueAll() const { return ValueIterator(this); } + + static_assert(util::is_same::Type>::value, "Mismatching BuildType"); static constexpr uint32_t LOG2DIM = Log2Dim; static constexpr uint32_t TOTAL = LOG2DIM; // needed by parent nodes static constexpr uint32_t DIM = 1u << TOTAL; // number of voxels along each axis of this node @@ -3713,18 +4477,19 @@ class LeafNode : private LeafData /// @brief Return a const reference to the bit mask of active voxels in this leaf node __hostdev__ const MaskType& valueMask() const { return DataType::mValueMask; } + __hostdev__ const MaskType& getValueMask() const { return DataType::mValueMask; } /// @brief Return a const reference to the minimum active value encoded in this leaf node - __hostdev__ ValueType minimum() const { return this->getMin(); } + __hostdev__ ValueType minimum() const { return DataType::getMin(); } /// @brief Return a const reference to the maximum active value encoded in this leaf node - __hostdev__ ValueType maximum() const { return this->getMax(); } + __hostdev__ ValueType maximum() const { return DataType::getMax(); } /// @brief Return a const reference to the average of all the active values encoded in this leaf node __hostdev__ FloatType average() const { return DataType::getAvg(); } /// @brief Return the variance of all the active values encoded in this leaf node - __hostdev__ FloatType variance() const { return DataType::getDev()*DataType::getDev(); } + __hostdev__ FloatType variance() const { return Pow2(DataType::getDev()); } /// @brief Return a const reference to the standard deviation of all the active values encoded in this leaf node __hostdev__ FloatType stdDeviation() const { return DataType::getDev(); } @@ -3734,6 +4499,9 @@ class LeafNode : private LeafData /// @brief Return the origin in index space of this leaf node __hostdev__ CoordT origin() const { return DataType::mBBoxMin & ~MASK; } + /// @brief Compute the local coordinates from a linear offset + /// @param n Linear offset into this nodes dense table + /// @return 
Local (vs global) 3D coordinates __hostdev__ static CoordT OffsetToLocalCoord(uint32_t n) { NANOVDB_ASSERT(n < SIZE); @@ -3753,15 +4521,15 @@ class LeafNode : private LeafData __hostdev__ static uint32_t dim() { return 1u << LOG2DIM; } /// @brief Return the bounding box in index space of active values in this leaf node - __hostdev__ BBox bbox() const + __hostdev__ math::BBox bbox() const { - BBox bbox(DataType::mBBoxMin, DataType::mBBoxMin); - if ( this->isActive() ) { + math::BBox bbox(DataType::mBBoxMin, DataType::mBBoxMin); + if (this->hasBBox()) { bbox.max()[0] += DataType::mBBoxDif[0]; bbox.max()[1] += DataType::mBBoxDif[1]; bbox.max()[2] += DataType::mBBoxDif[2]; - } else {// very rare case - bbox = BBox();// invalid + } else { // very rare case + bbox = math::BBox(); // invalid } return bbox; } @@ -3769,8 +4537,10 @@ class LeafNode : private LeafData /// @brief Return the total number of voxels (e.g. values) encoded in this leaf node __hostdev__ static uint32_t voxelCount() { return 1u << (3 * LOG2DIM); } - /// @brief return memory usage in bytes for the class - __hostdev__ static uint64_t memUsage() { return sizeof(LeafNodeType); } + __hostdev__ static uint32_t padding() { return DataType::padding(); } + + /// @brief return memory usage in bytes for the leaf node + __hostdev__ uint64_t memUsage() const { return DataType::memUsage(); } /// @brief This class cannot be constructed or deleted LeafNode() = delete; @@ -3779,10 +4549,15 @@ class LeafNode : private LeafData ~LeafNode() = delete; /// @brief Return the voxel value at the given offset. - __hostdev__ ValueType getValue(uint32_t offset) const { return DataType::getValue(offset); } + __hostdev__ ValueType getValue(uint32_t offset) const { return DataType::getValue(offset); } /// @brief Return the voxel value at the given coordinate. - __hostdev__ ValueType getValue(const CoordT& ijk) const { return DataType::getValue(CoordToOffset(ijk)); } + __hostdev__ ValueType getValue(const CoordT& ijk) const { return DataType::getValue(CoordToOffset(ijk)); } + + /// @brief Return the first value in this leaf node. + __hostdev__ ValueType getFirstValue() const { return this->getValue(0); } + /// @brief Return the last value in this leaf node. + __hostdev__ ValueType getLastValue() const { return this->getValue(SIZE - 1); } /// @brief Sets the value at the specified location and activate its state. /// @@ -3802,16 +4577,18 @@ class LeafNode : private LeafData /// @brief Return @c true if any of the voxel value are active in this leaf node. __hostdev__ bool isActive() const { - NANOVDB_ASSERT( bool(DataType::mFlags & uint8_t(2)) != DataType::mValueMask.isOff() ); - return DataType::mFlags & uint8_t(2); + //NANOVDB_ASSERT( bool(DataType::mFlags & uint8_t(2)) != DataType::mValueMask.isOff() ); + //return DataType::mFlags & uint8_t(2); + return !DataType::mValueMask.isOff(); } + __hostdev__ bool hasBBox() const { return DataType::mFlags & uint8_t(2); } /// @brief Return @c true if the voxel value at the given coordinate is active and updates @c v with the value. 
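// Illustrative sketch (not part of the patch) of the coordinate packing used by
// CoordToOffset() below: with the default Log2Dim = 3 a leaf covers 8x8x8 voxels and the
// linear offset is just the three low 3-bit fields of the global coordinate, packed with
// shifts (bitwise OR and + are interchangeable here because the fields never overlap).
#include <cstdint>
inline uint32_t leafOffset(int i, int j, int k)
{
    return ((uint32_t(i) & 7u) << 6) | ((uint32_t(j) & 7u) << 3) | (uint32_t(k) & 7u);
}
// e.g. leafOffset(1, 2, 3) == 1*64 + 2*8 + 3 == 83; the internal nodes above apply the
// same packing to the next-higher bit fields of the coordinate.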
__hostdev__ bool probeValue(const CoordT& ijk, ValueType& v) const { const uint32_t n = CoordToOffset(ijk); - v = DataType::getValue(n); + v = DataType::getValue(n); return DataType::mValueMask.isOn(n); } @@ -3820,25 +4597,44 @@ class LeafNode : private LeafData /// @brief Return the linear offset corresponding to the given coordinate __hostdev__ static uint32_t CoordToOffset(const CoordT& ijk) { - #if 0 - return ((ijk[0] & MASK) << (2 * LOG2DIM)) + ((ijk[1] & MASK) << LOG2DIM) + (ijk[2] & MASK); - #else return ((ijk[0] & MASK) << (2 * LOG2DIM)) | ((ijk[1] & MASK) << LOG2DIM) | (ijk[2] & MASK); - #endif } - /// @brief Updates the local bounding box of active voxels in this node. + /// @brief Updates the local bounding box of active voxels in this node. Return true if bbox was updated. /// /// @warning It assumes that the origin and value mask have already been set. /// /// @details This method is based on few (intrinsic) bit operations and hence is relatively fast. - /// However, it should only only be called of either the value mask has changed or if the - /// active bounding box is still undefined. e.g. during constrution of this node. - __hostdev__ void updateBBox(); + /// However, it should only only be called if either the value mask has changed or if the + /// active bounding box is still undefined. e.g. during construction of this node. + __hostdev__ bool updateBBox(); + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { + return OpT::get(*this, CoordToOffset(ijk), args...); + } + + template + __hostdev__ auto get(const uint32_t n, ArgsT&&... args) const + { + return OpT::get(*this, n, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) + { + return OpT::set(*this, CoordToOffset(ijk), args...); + } + + template + __hostdev__ auto set(const uint32_t n, ArgsT&&... args) + { + return OpT::set(*this, n, args...); + } private: static_assert(sizeof(DataType) % NANOVDB_DATA_ALIGNMENT == 0, "sizeof(LeafData) is misaligned"); - //static_assert(offsetof(DataType, mValues) % 32 == 0, "LeafData::mValues is misaligned"); template friend class ReadAccessor; @@ -3848,16 +4644,17 @@ class LeafNode : private LeafData template friend class InternalNode; +#ifndef NANOVDB_NEW_ACCESSOR_METHODS /// @brief Private method to return a voxel value and update a (dummy) ReadAccessor template __hostdev__ ValueType getValueAndCache(const CoordT& ijk, const AccT&) const { return this->getValue(ijk); } /// @brief Return the node information. 
template - __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& /*ijk*/, const AccT& /*acc*/) const { + __hostdev__ typename AccT::NodeInfo getNodeInfoAndCache(const CoordType& /*ijk*/, const AccT& /*acc*/) const + { using NodeInfoT = typename AccT::NodeInfo; - return NodeInfoT{LEVEL, this->dim(), this->minimum(), this->maximum(), - this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; + return NodeInfoT{LEVEL, this->dim(), this->minimum(), this->maximum(), this->average(), this->stdDeviation(), this->bbox()[0], this->bbox()[1]}; } template @@ -3868,48 +4665,71 @@ class LeafNode : private LeafData template __hostdev__ const LeafNode* probeLeafAndCache(const CoordT&, const AccT&) const { return this; } +#endif template __hostdev__ uint32_t getDimAndCache(const CoordT&, const RayT& /*ray*/, const AccT&) const { - if (DataType::mFlags & uint8_t(1)) - return this->dim(); // skip this node if first bit is set + if (DataType::mFlags & uint8_t(1u)) + return this->dim(); // skip this node if the 1st bit is set + //if (!ray.intersects( this->bbox() )) return 1 << LOG2DIM; return ChildNodeType::dim(); } + template + __hostdev__ auto + //__hostdev__ decltype(OpT::get(util::declval(), util::declval(), util::declval()...)) + getAndCache(const CoordType& ijk, const AccT&, ArgsT&&... args) const + { + return OpT::get(*this, CoordToOffset(ijk), args...); + } + + template + //__hostdev__ auto // occasionally fails with NVCC + __hostdev__ decltype(OpT::set(util::declval(), util::declval(), util::declval()...)) + setAndCache(const CoordType& ijk, const AccT&, ArgsT&&... args) + { + return OpT::set(*this, CoordToOffset(ijk), args...); + } + }; // LeafNode class +// --------------------------> LeafNode::updateBBox <------------------------------------ + template class MaskT, uint32_t LOG2DIM> -__hostdev__ inline void LeafNode::updateBBox() +__hostdev__ inline bool LeafNode::updateBBox() { static_assert(LOG2DIM == 3, "LeafNode::updateBBox: only supports LOGDIM = 3!"); - if (!this->isActive()) return; + if (DataType::mValueMask.isOff()) { + DataType::mFlags &= ~uint8_t(2); // set 2nd bit off, which indicates that this nodes has no bbox + return false; + } auto update = [&](uint32_t min, uint32_t max, int axis) { NANOVDB_ASSERT(min <= max && max < 8); DataType::mBBoxMin[axis] = (DataType::mBBoxMin[axis] & ~MASK) + int(min); DataType::mBBoxDif[axis] = uint8_t(max - min); }; - uint64_t word64 = DataType::mValueMask.template getWord(0); - uint32_t Xmin = word64 ? 0u : 8u; - uint32_t Xmax = Xmin; - for (int i = 1; i < 8; ++i) { // last loop over 8 64 words - if (uint64_t w = DataType::mValueMask.template getWord(i)) { // skip if word has no set bits - word64 |= w; // union 8 x 64 bits words into one 64 bit word - if (Xmin == 8) { + uint64_t *w = DataType::mValueMask.words(), word64 = *w; + uint32_t Xmin = word64 ? 
0u : 8u, Xmax = Xmin; + for (int i = 1; i < 8; ++i) { // last loop over 8 64 bit words + if (w[i]) { // skip if word has no set bits + word64 |= w[i]; // union 8 x 64 bits words into one 64 bit word + if (Xmin == 8) Xmin = i; // only set once - } Xmax = i; } } NANOVDB_ASSERT(word64); update(Xmin, Xmax, 0); - update(FindLowestOn(word64) >> 3, FindHighestOn(word64) >> 3, 1); + update(util::findLowestOn(word64) >> 3, util::findHighestOn(word64) >> 3, 1); const uint32_t *p = reinterpret_cast(&word64), word32 = p[0] | p[1]; const uint16_t *q = reinterpret_cast(&word32), word16 = q[0] | q[1]; - const uint8_t *b = reinterpret_cast(&word16), byte = b[0] | b[1]; + const uint8_t *b = reinterpret_cast(&word16), byte = b[0] | b[1]; NANOVDB_ASSERT(byte); - update(FindLowestOn(static_cast(byte)), FindHighestOn(static_cast(byte)), 2); + update(util::findLowestOn(static_cast(byte)), util::findHighestOn(static_cast(byte)), 2); + DataType::mFlags |= uint8_t(2); // set 2nd bit on, which indicates that this nodes has a bbox + return true; } // LeafNode::updateBBox // --------------------------> Template specializations and traits <------------------------------------ @@ -3959,31 +4779,131 @@ struct NanoNode using type = NanoRoot; }; -using FloatTree = NanoTree; +using FloatTree = NanoTree; +using Fp4Tree = NanoTree; +using Fp8Tree = NanoTree; +using Fp16Tree = NanoTree; +using FpNTree = NanoTree; using DoubleTree = NanoTree; -using Int32Tree = NanoTree; +using Int32Tree = NanoTree; using UInt32Tree = NanoTree; -using Int64Tree = NanoTree; -using Vec3fTree = NanoTree; -using Vec3dTree = NanoTree; -using Vec4fTree = NanoTree; -using Vec4dTree = NanoTree; -using Vec3ITree = NanoTree; -using MaskTree = NanoTree; -using BoolTree = NanoTree; - -using FloatGrid = Grid; +using Int64Tree = NanoTree; +using Vec3fTree = NanoTree; +using Vec3dTree = NanoTree; +using Vec4fTree = NanoTree; +using Vec4dTree = NanoTree; +using Vec3ITree = NanoTree; +using MaskTree = NanoTree; +using BoolTree = NanoTree; +using IndexTree = NanoTree; +using OnIndexTree = NanoTree; +using IndexMaskTree = NanoTree; +using OnIndexMaskTree = NanoTree; + +using FloatGrid = Grid; +using Fp4Grid = Grid; +using Fp8Grid = Grid; +using Fp16Grid = Grid; +using FpNGrid = Grid; using DoubleGrid = Grid; -using Int32Grid = Grid; +using Int32Grid = Grid; using UInt32Grid = Grid; -using Int64Grid = Grid; -using Vec3fGrid = Grid; -using Vec3dGrid = Grid; -using Vec4fGrid = Grid; -using Vec4dGrid = Grid; -using Vec3IGrid = Grid; -using MaskGrid = Grid; -using BoolGrid = Grid; +using Int64Grid = Grid; +using Vec3fGrid = Grid; +using Vec3dGrid = Grid; +using Vec4fGrid = Grid; +using Vec4dGrid = Grid; +using Vec3IGrid = Grid; +using MaskGrid = Grid; +using BoolGrid = Grid; +using PointGrid = Grid; +using IndexGrid = Grid; +using OnIndexGrid = Grid; +using IndexMaskGrid = Grid; +using OnIndexMaskGrid = Grid; + +// --------------------------> callNanoGrid <------------------------------------ + +/** +* @brief Below is an example of the struct used for generic programming with callNanoGrid +* @details For an example see "struct Crc32TailOld" in nanovdb/tools/GridChecksum.h or +* "struct IsNanoGridValid" in nanovdb/tools/GridValidator.h +* @code +* struct OpT { + // define these two static functions with non-const GridData +* template +* static auto known( GridData *gridData, args...); +* static auto unknown( GridData *gridData, args...); +* // or alternatively these two static functions with const GridData +* template +* static auto known(const GridData *gridData, 
args...); +* static auto unknown(const GridData *gridData, args...); +* }; +* @endcode +* +* @brief Here is an example of how to use callNanoGrid in client code +* @code +* return callNanoGrid(gridData, args...); +* @endcode +*/ + +/// @brief Use this function, which depends a pointer to GridData, to call +/// other functions that depend on a NanoGrid of a known ValueType. +/// @details This function allows for generic programming by converting GridData +/// to a NanoGrid of the type encoded in GridData::mGridType. +template +auto callNanoGrid(GridDataT *gridData, ArgsT&&... args) +{ + static_assert(util::is_same::value, "Expected gridData to be of type GridData* or const GridData*"); + switch (gridData->mGridType){ + case GridType::Float: + return OpT::template known(gridData, args...); + case GridType::Double: + return OpT::template known(gridData, args...); + case GridType::Int16: + return OpT::template known(gridData, args...); + case GridType::Int32: + return OpT::template known(gridData, args...); + case GridType::Int64: + return OpT::template known(gridData, args...); + case GridType::Vec3f: + return OpT::template known(gridData, args...); + case GridType::Vec3d: + return OpT::template known(gridData, args...); + case GridType::UInt32: + return OpT::template known(gridData, args...); + case GridType::Mask: + return OpT::template known(gridData, args...); + case GridType::Index: + return OpT::template known(gridData, args...); + case GridType::OnIndex: + return OpT::template known(gridData, args...); + case GridType::IndexMask: + return OpT::template known(gridData, args...); + case GridType::OnIndexMask: + return OpT::template known(gridData, args...); + case GridType::Boolean: + return OpT::template known(gridData, args...); + case GridType::RGBA8: + return OpT::template known(gridData, args...); + case GridType::Fp4: + return OpT::template known(gridData, args...); + case GridType::Fp8: + return OpT::template known(gridData, args...); + case GridType::Fp16: + return OpT::template known(gridData, args...); + case GridType::FpN: + return OpT::template known(gridData, args...); + case GridType::Vec4f: + return OpT::template known(gridData, args...); + case GridType::Vec4d: + return OpT::template known(gridData, args...); + case GridType::UInt8: + return OpT::template known(gridData, args...); + default: + return OpT::unknown(gridData, args...); + } +}// callNanoGrid // --------------------------> ReadAccessor <------------------------------------ @@ -3994,7 +4914,7 @@ using BoolGrid = Grid; /// @note By virtue of the fact that a value accessor accelerates random access operations /// by re-using cached access patterns, this access should be reused for multiple access /// operations. In other words, never create an instance of this accessor for a single -/// acccess only. In general avoid single access operations with this accessor, and +/// access only. In general avoid single access operations with this accessor, and /// if that is not possible call the corresponding method on the tree instead. /// /// @warning Since this ReadAccessor internally caches raw pointers to the nodes of the tree @@ -4006,22 +4926,26 @@ using BoolGrid = Grid; /// O(1) random access operations by means of inverse tree traversal, /// which amortizes the non-const time complexity of the root node. 
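// A minimal usage sketch of the accessor pattern described above, assuming `grid`
// is a const NanoGrid<float>& and `coords` is a range of nanovdb::Coord lookups;
// the accessor is created once (via createAccessor, defined further below) and
// reused so that its node cache can amortize repeated top-down tree traversals:
//
//   auto acc = nanovdb::createAccessor<0, 1, 2>(grid); // cache leaf, lower and upper nodes
//   float sum = 0.0f;
//   for (const nanovdb::Coord& ijk : coords) sum += acc.getValue(ijk);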
-template +template class ReadAccessor { - using RootT = NanoRoot; // root node - using LeafT = NanoLeaf; // Leaf node + using GridT = NanoGrid; // grid + using TreeT = NanoTree; // tree + using RootT = NanoRoot; // root node + using LeafT = NanoLeaf; // Leaf node using FloatType = typename RootT::FloatType; using CoordValueType = typename RootT::CoordType::ValueType; mutable const RootT* mRoot; // 8 bytes (mutable to allow for access methods to be const) public: + using BuildType = BuildT; using ValueType = typename RootT::ValueType; using CoordType = typename RootT::CoordType; static const int CacheLevels = 0; - - struct NodeInfo { +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + struct NodeInfo + { uint32_t mLevel; // 4B uint32_t mDim; // 4B ValueType mMinimum; // typically 4B @@ -4031,9 +4955,28 @@ class ReadAccessor CoordType mBBoxMin; // 3*4B CoordType mBBoxMax; // 3*4B }; - +#endif /// @brief Constructor from a root node - __hostdev__ ReadAccessor(const RootT& root) : mRoot{&root} {} + __hostdev__ ReadAccessor(const RootT& root) + : mRoot{&root} + { + } + + /// @brief Constructor from a grid + __hostdev__ ReadAccessor(const GridT& grid) + : ReadAccessor(grid.tree().root()) + { + } + + /// @brief Constructor from a tree + __hostdev__ ReadAccessor(const TreeT& tree) + : ReadAccessor(tree.root()) + { + } + + /// @brief Reset this access to its initial state, i.e. with an empty cache + /// @node Noop since this template specialization has no cache + __hostdev__ void clear() {} __hostdev__ const RootT& root() const { return *mRoot; } @@ -4041,11 +4984,35 @@ class ReadAccessor ReadAccessor(const ReadAccessor&) = default; ~ReadAccessor() = default; ReadAccessor& operator=(const ReadAccessor&) = default; - +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + return this->template get>(ijk); + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ ValueType operator()(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType operator()(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ auto getNodeInfo(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS __hostdev__ ValueType getValue(const CoordType& ijk) const { return mRoot->getValueAndCache(ijk, *this); } + __hostdev__ ValueType getValue(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ ValueType operator()(const CoordType& ijk) const + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const { @@ -4066,12 +5033,23 @@ class ReadAccessor { return mRoot->probeLeafAndCache(ijk, *this); } - +#endif // NANOVDB_NEW_ACCESSOR_METHODS template __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const { return mRoot->getDimAndCache(ijk, ray, *this); } + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... 
args) const + { + return mRoot->template get(ijk, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) const + { + return const_cast(mRoot)->template set(ijk, args...); + } private: /// @brief Allow nodes to insert themselves into the cache. @@ -4088,15 +5066,16 @@ class ReadAccessor }; // ReadAccessor class /// @brief Node caching at a single tree level -template -class ReadAccessor//e.g. 0, 1, 2 +template +class ReadAccessor //e.g. 0, 1, 2 { static_assert(LEVEL0 >= 0 && LEVEL0 <= 2, "LEVEL0 should be 0, 1, or 2"); - using TreeT = NanoTree; - using RootT = NanoRoot; // root node - using LeafT = NanoLeaf; // Leaf node - using NodeT = typename NodeTrait::type; + using GridT = NanoGrid; // grid + using TreeT = NanoTree; + using RootT = NanoRoot; // root node + using LeafT = NanoLeaf; // Leaf node + using NodeT = typename NodeTrait::type; using CoordT = typename RootT::CoordType; using ValueT = typename RootT::ValueType; @@ -4109,13 +5088,14 @@ class ReadAccessor//e.g. 0, 1, 2 mutable const NodeT* mNode; // 8 bytes public: + using BuildType = BuildT; using ValueType = ValueT; using CoordType = CoordT; static const int CacheLevels = 1; - +#ifndef NANOVDB_NEW_ACCESSOR_METHODS using NodeInfo = typename ReadAccessor::NodeInfo; - +#endif /// @brief Constructor from a root node __hostdev__ ReadAccessor(const RootT& root) : mKey(CoordType::max()) @@ -4124,6 +5104,25 @@ class ReadAccessor//e.g. 0, 1, 2 { } + /// @brief Constructor from a grid + __hostdev__ ReadAccessor(const GridT& grid) + : ReadAccessor(grid.tree().root()) + { + } + + /// @brief Constructor from a tree + __hostdev__ ReadAccessor(const TreeT& tree) + : ReadAccessor(tree.root()) + { + } + + /// @brief Reset this access to its initial state, i.e. with an empty cache + __hostdev__ void clear() + { + mKey = CoordType::max(); + mNode = nullptr; + } + __hostdev__ const RootT& root() const { return *mRoot; } /// @brief Defaults constructors @@ -4138,55 +5137,90 @@ class ReadAccessor//e.g. 
0, 1, 2 (ijk[2] & int32_t(~NodeT::MASK)) == mKey[2]; } +#ifdef NANOVDB_NEW_ACCESSOR_METHODS __hostdev__ ValueType getValue(const CoordType& ijk) const { - if (this->isCached(ijk)) { + return this->template get>(ijk); + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ ValueType operator()(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType operator()(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ auto getNodeInfo(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + if (this->isCached(ijk)) return mNode->getValueAndCache(ijk, *this); - } return mRoot->getValueAndCache(ijk, *this); } + __hostdev__ ValueType getValue(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ ValueType operator()(const CoordType& ijk) const + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const { - if (this->isCached(ijk)) { + if (this->isCached(ijk)) return mNode->getNodeInfoAndCache(ijk, *this); - } return mRoot->getNodeInfoAndCache(ijk, *this); } __hostdev__ bool isActive(const CoordType& ijk) const { - if (this->isCached(ijk)) { + if (this->isCached(ijk)) return mNode->isActiveAndCache(ijk, *this); - } return mRoot->isActiveAndCache(ijk, *this); } __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { - if (this->isCached(ijk)) { + if (this->isCached(ijk)) return mNode->probeValueAndCache(ijk, v, *this); - } return mRoot->probeValueAndCache(ijk, v, *this); } __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { - if (this->isCached(ijk)) { + if (this->isCached(ijk)) return mNode->probeLeafAndCache(ijk, *this); - } return mRoot->probeLeafAndCache(ijk, *this); } - +#endif // NANOVDB_NEW_ACCESSOR_METHODS template __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const { - if (this->isCached(ijk)) { + if (this->isCached(ijk)) return mNode->getDimAndCache(ijk, ray, *this); - } return mRoot->getDimAndCache(ijk, ray, *this); } + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { + if (this->isCached(ijk)) + return mNode->template getAndCache(ijk, *this, args...); + return mRoot->template getAndCache(ijk, *this, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) const + { + if (this->isCached(ijk)) + return const_cast(mNode)->template setAndCache(ijk, *this, args...); + return const_cast(mRoot)->template setAndCache(ijk, *this, args...); + } + private: /// @brief Allow nodes to insert themselves into the cache. template @@ -4209,15 +5243,16 @@ class ReadAccessor//e.g. 0, 1, 2 }; // ReadAccessor -template -class ReadAccessor//e.g. (0,1), (1,2), (0,2) +template +class ReadAccessor //e.g. 
(0,1), (1,2), (0,2) { static_assert(LEVEL0 >= 0 && LEVEL0 <= 2, "LEVEL0 must be 0, 1, 2"); static_assert(LEVEL1 >= 0 && LEVEL1 <= 2, "LEVEL1 must be 0, 1, 2"); static_assert(LEVEL0 < LEVEL1, "Level 0 must be lower than level 1"); - using TreeT = NanoTree; - using RootT = NanoRoot; - using LeafT = NanoLeaf; + using GridT = NanoGrid; // grid + using TreeT = NanoTree; + using RootT = NanoRoot; + using LeafT = NanoLeaf; using Node1T = typename NodeTrait::type; using Node2T = typename NodeTrait::type; using CoordT = typename RootT::CoordType; @@ -4226,7 +5261,7 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) using CoordValueType = typename RootT::CoordT::ValueType; // All member data are mutable to allow for access methods to be const -#ifdef USE_SINGLE_ACCESSOR_KEY // 44 bytes total +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY // 44 bytes total mutable CoordT mKey; // 3*4 = 12 bytes #else // 68 bytes total mutable CoordT mKeys[2]; // 2*3*4 = 24 bytes @@ -4236,24 +5271,49 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) mutable const Node2T* mNode2; public: + using BuildType = BuildT; using ValueType = ValueT; using CoordType = CoordT; static const int CacheLevels = 2; +#ifndef NANOVDB_NEW_ACCESSOR_METHODS + using NodeInfo = typename ReadAccessor::NodeInfo; +#endif + /// @brief Constructor from a root node + __hostdev__ ReadAccessor(const RootT& root) +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + : mKey(CoordType::max()) +#else + : mKeys{CoordType::max(), CoordType::max()} +#endif + , mRoot(&root) + , mNode1(nullptr) + , mNode2(nullptr) + { + } + + /// @brief Constructor from a grid + __hostdev__ ReadAccessor(const GridT& grid) + : ReadAccessor(grid.tree().root()) + { + } - using NodeInfo = typename ReadAccessor::NodeInfo; + /// @brief Constructor from a tree + __hostdev__ ReadAccessor(const TreeT& tree) + : ReadAccessor(tree.root()) + { + } - /// @brief Constructor from a root node - __hostdev__ ReadAccessor(const RootT& root) -#ifdef USE_SINGLE_ACCESSOR_KEY - : mKey(CoordType::max()) + /// @brief Reset this access to its initial state, i.e. with an empty cache + __hostdev__ void clear() + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + mKey = CoordType::max(); #else - : mKeys{CoordType::max(), CoordType::max()} + mKeys[0] = mKeys[1] = CoordType::max(); #endif - , mRoot(&root) - , mNode1(nullptr) - , mNode2(nullptr) - { + mNode1 = nullptr; + mNode2 = nullptr; } __hostdev__ const RootT& root() const { return *mRoot; } @@ -4263,7 +5323,7 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) ~ReadAccessor() = default; ReadAccessor& operator=(const ReadAccessor&) = default; -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY __hostdev__ bool isCached1(CoordValueType dirty) const { if (!mNode1) @@ -4303,9 +5363,23 @@ class ReadAccessor//e.g. 
(0,1), (1,2), (0,2) } #endif +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + return this->template get>(ijk); + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ ValueType operator()(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType operator()(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ auto getNodeInfo(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4317,10 +5391,21 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) } return mRoot->getValueAndCache(ijk, *this); } - + __hostdev__ ValueType operator()(const CoordType& ijk) const + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ ValueType getValue(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4335,7 +5420,7 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) __hostdev__ bool isActive(const CoordType& ijk) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4350,7 +5435,7 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4365,7 +5450,7 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4377,11 +5462,12 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) } return mRoot->probeLeafAndCache(ijk, *this); } +#endif // NANOVDB_NEW_ACCESSOR_METHODS template __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4394,6 +5480,38 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) return mRoot->getDimAndCache(ijk, ray, *this); } + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... 
args) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return mNode1->template getAndCache(ijk, *this, args...); + } else if (this->isCached2(dirty)) { + return mNode2->template getAndCache(ijk, *this, args...); + } + return mRoot->template getAndCache(ijk, *this, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached1(dirty)) { + return const_cast(mNode1)->template setAndCache(ijk, *this, args...); + } else if (this->isCached2(dirty)) { + return const_cast(mNode2)->template setAndCache(ijk, *this, args...); + } + return const_cast(mRoot)->template setAndCache(ijk, *this, args...); + } + private: /// @brief Allow nodes to insert themselves into the cache. template @@ -4406,7 +5524,7 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) /// @brief Inserts a leaf node and key pair into this ReadAccessor __hostdev__ void insert(const CoordType& ijk, const Node1T* node) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY mKey = ijk; #else mKeys[0] = ijk & ~Node1T::MASK; @@ -4415,27 +5533,27 @@ class ReadAccessor//e.g. (0,1), (1,2), (0,2) } __hostdev__ void insert(const CoordType& ijk, const Node2T* node) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY mKey = ijk; #else mKeys[1] = ijk & ~Node2T::MASK; #endif mNode2 = node; } - template + template __hostdev__ void insert(const CoordType&, const OtherNodeT*) const {} }; // ReadAccessor - /// @brief Node caching at all (three) tree levels -template +template class ReadAccessor { - using TreeT = NanoTree; - using RootT = NanoRoot; // root node + using GridT = NanoGrid; // grid + using TreeT = NanoTree; + using RootT = NanoRoot; // root node using NodeT2 = NanoUpper; // upper internal node using NodeT1 = NanoLower; // lower internal node - using LeafT = NanoLeaf< BuildT>; // Leaf node + using LeafT = NanoLeaf; // Leaf node using CoordT = typename RootT::CoordType; using ValueT = typename RootT::ValueType; @@ -4443,25 +5561,26 @@ class ReadAccessor using CoordValueType = typename RootT::CoordT::ValueType; // All member data are mutable to allow for access methods to be const -#ifdef USE_SINGLE_ACCESSOR_KEY // 44 bytes total +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY // 44 bytes total mutable CoordT mKey; // 3*4 = 12 bytes #else // 68 bytes total mutable CoordT mKeys[3]; // 3*3*4 = 36 bytes #endif mutable const RootT* mRoot; - mutable const void* mNode[3]; // 4*8 = 32 bytes + mutable const void* mNode[3]; // 4*8 = 32 bytes public: + using BuildType = BuildT; using ValueType = ValueT; using CoordType = CoordT; static const int CacheLevels = 3; - +#ifndef NANOVDB_NEW_ACCESSOR_METHODS using NodeInfo = typename ReadAccessor::NodeInfo; - +#endif /// @brief Constructor from a root node __hostdev__ ReadAccessor(const RootT& root) -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY : mKey(CoordType::max()) #else : mKeys{CoordType::max(), CoordType::max(), CoordType::max()} @@ -4471,6 +5590,18 @@ class ReadAccessor { } + /// @brief Constructor from a grid + __hostdev__ ReadAccessor(const GridT& grid) + : ReadAccessor(grid.tree().root()) + { + } + + /// @brief Constructor from a tree + __hostdev__ ReadAccessor(const TreeT& tree) + : ReadAccessor(tree.root()) + { + } + 
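    // A minimal sketch of the grid constructor above and the clear() method below,
    // assuming `grid` is a NanoGrid<BuildT> that outlives the accessor:
    //
    //   ReadAccessor<BuildT, 0, 1, 2> acc(grid);      // same as ReadAccessor(grid.tree().root())
    //   auto v0 = acc.getValue(CoordType(0, 0, 0));
    //   acc.clear();                                  // drop all cached node pointers and keys
    //   auto v1 = acc.getValue(CoordType(8, 0, 0));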
__hostdev__ const RootT& root() const { return *mRoot; } /// @brief Defaults constructors @@ -4485,11 +5616,30 @@ class ReadAccessor __hostdev__ const NodeT* getNode() const { using T = typename NodeTrait::type; - static_assert(is_same::value, "ReadAccessor::getNode: Invalid node type"); + static_assert(util::is_same::value, "ReadAccessor::getNode: Invalid node type"); return reinterpret_cast(mNode[NodeT::LEVEL]); } -#ifdef USE_SINGLE_ACCESSOR_KEY + template + __hostdev__ const typename NodeTrait::type* getNode() const + { + using T = typename NodeTrait::type; + static_assert(LEVEL >= 0 && LEVEL <= 2, "ReadAccessor::getNode: Invalid node type"); + return reinterpret_cast(mNode[LEVEL]); + } + + /// @brief Reset this access to its initial state, i.e. with an empty cache + __hostdev__ void clear() + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + mKey = CoordType::max(); +#else + mKeys[0] = mKeys[1] = mKeys[2] = CoordType::max(); +#endif + mNode[0] = mNode[1] = mNode[2] = nullptr; + } + +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY template __hostdev__ bool isCached(CoordValueType dirty) const { @@ -4510,13 +5660,29 @@ class ReadAccessor template __hostdev__ bool isCached(const CoordType& ijk) const { - return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; + return (ijk[0] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][0] && + (ijk[1] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][1] && + (ijk[2] & int32_t(~NodeT::MASK)) == mKeys[NodeT::LEVEL][2]; } #endif +#ifdef NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const + { + return this->template get>(ijk); + } + __hostdev__ ValueType getValue(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ ValueType operator()(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ ValueType operator()(int i, int j, int k) const { return this->template get>(CoordType(i, j, k)); } + __hostdev__ auto getNodeInfo(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool isActive(const CoordType& ijk) const { return this->template get>(ijk); } + __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { return this->template get>(ijk, v); } + __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { return this->template get>(ijk); } +#else // NANOVDB_NEW_ACCESSOR_METHODS + __hostdev__ ValueType getValue(const CoordType& ijk) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4530,10 +5696,22 @@ class ReadAccessor } return mRoot->getValueAndCache(ijk, *this); } + __hostdev__ ValueType operator()(const CoordType& ijk) const + { + return this->getValue(ijk); + } + __hostdev__ ValueType operator()(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } + __hostdev__ ValueType getValue(int i, int j, int k) const + { + return this->getValue(CoordType(i, j, k)); + } __hostdev__ NodeInfo getNodeInfo(const CoordType& ijk) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4550,7 +5728,7 @@ class ReadAccessor __hostdev__ bool isActive(const CoordType& ijk) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const 
CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4567,7 +5745,7 @@ class ReadAccessor __hostdev__ bool probeValue(const CoordType& ijk, ValueType& v) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4581,10 +5759,9 @@ class ReadAccessor } return mRoot->probeValueAndCache(ijk, v, *this); } - __hostdev__ const LeafT* probeLeaf(const CoordType& ijk) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4598,11 +5775,48 @@ class ReadAccessor } return mRoot->probeLeafAndCache(ijk, *this); } +#endif // NANOVDB_NEW_ACCESSOR_METHODS + + template + __hostdev__ auto get(const CoordType& ijk, ArgsT&&... args) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((const LeafT*)mNode[0])->template getAndCache(ijk, *this, args...); + } else if (this->isCached(dirty)) { + return ((const NodeT1*)mNode[1])->template getAndCache(ijk, *this, args...); + } else if (this->isCached(dirty)) { + return ((const NodeT2*)mNode[2])->template getAndCache(ijk, *this, args...); + } + return mRoot->template getAndCache(ijk, *this, args...); + } + + template + __hostdev__ auto set(const CoordType& ijk, ArgsT&&... args) const + { +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY + const CoordValueType dirty = this->computeDirty(ijk); +#else + auto&& dirty = ijk; +#endif + if (this->isCached(dirty)) { + return ((LeafT*)mNode[0])->template setAndCache(ijk, *this, args...); + } else if (this->isCached(dirty)) { + return ((NodeT1*)mNode[1])->template setAndCache(ijk, *this, args...); + } else if (this->isCached(dirty)) { + return ((NodeT2*)mNode[2])->template setAndCache(ijk, *this, args...); + } + return ((RootT*)mRoot)->template setAndCache(ijk, *this, args...); + } template __hostdev__ uint32_t getDim(const CoordType& ijk, const RayT& ray) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY const CoordValueType dirty = this->computeDirty(ijk); #else auto&& dirty = ijk; @@ -4630,7 +5844,7 @@ class ReadAccessor template __hostdev__ void insert(const CoordType& ijk, const NodeT* node) const { -#ifdef USE_SINGLE_ACCESSOR_KEY +#ifdef NANOVDB_USE_SINGLE_ACCESSOR_KEY mKey = ijk; #else mKeys[NodeT::LEVEL] = ijk & ~NodeT::MASK; @@ -4653,20 +5867,20 @@ class ReadAccessor /// createAccessor<1,2>(grid): Caching of lower and upper internal nodes /// createAccessor<0,1,2>(grid): Caching of all nodes at all tree levels -template -ReadAccessor createAccessor(const NanoGrid &grid) +template +ReadAccessor createAccessor(const NanoGrid& grid) { - return ReadAccessor(grid.tree().root()); + return ReadAccessor(grid); } -template -ReadAccessor createAccessor(const NanoTree &tree) +template +ReadAccessor createAccessor(const NanoTree& tree) { - return ReadAccessor(tree().root()); + return ReadAccessor(tree); } -template -ReadAccessor createAccessor(const NanoRoot &root) +template +ReadAccessor createAccessor(const NanoRoot& root) { return ReadAccessor(root); } @@ -4678,70 +5892,115 @@ ReadAccessor createAccessor(const NanoRoot; - __hostdev__ const GridT& grid() const { return *reinterpret_cast(this); } +{ // 768 bytes (32 byte aligned) + GridData mGridData; // 672B + TreeData mTreeData; // 64B + CoordBBox mIndexBBox; // 24B. 
AABB of active values in index space. + uint32_t mRootTableSize, mPadding{0}; // 8B public: - __hostdev__ bool isValid() const { return this->grid().isValid(); } - __hostdev__ uint64_t gridSize() const { return this->grid().gridSize(); } - __hostdev__ uint32_t gridIndex() const { return this->grid().gridIndex(); } - __hostdev__ uint32_t gridCount() const { return this->grid().gridCount(); } - __hostdev__ const char* shortGridName() const { return this->grid().shortGridName(); } - __hostdev__ GridType gridType() const { return this->grid().gridType(); } - __hostdev__ GridClass gridClass() const { return this->grid().gridClass(); } - __hostdev__ bool isLevelSet() const { return this->grid().isLevelSet(); } - __hostdev__ bool isFogVolume() const { return this->grid().isFogVolume(); } - __hostdev__ bool isPointIndex() const { return this->grid().isPointIndex(); } - __hostdev__ bool isPointData() const { return this->grid().isPointData(); } - __hostdev__ bool isMask() const { return this->grid().isMask(); } - __hostdev__ bool isStaggered() const { return this->grid().isStaggered(); } - __hostdev__ bool isUnknown() const { return this->grid().isUnknown(); } - __hostdev__ const Map& map() const { return this->grid().map(); } - __hostdev__ const BBox& worldBBox() const { return this->grid().worldBBox(); } - __hostdev__ const BBox& indexBBox() const { return this->grid().indexBBox(); } - __hostdev__ Vec3R voxelSize() const { return this->grid().voxelSize(); } - __hostdev__ int blindDataCount() const { return this->grid().blindDataCount(); } - __hostdev__ const GridBlindMetaData& blindMetaData(int n) const { return this->grid().blindMetaData(n); } - __hostdev__ uint64_t activeVoxelCount() const { return this->grid().activeVoxelCount(); } - __hostdev__ uint32_t activeTileCount(uint32_t n) const { return this->grid().tree().activeTileCount(n); } - __hostdev__ uint32_t nodeCount(uint32_t level) const { return this->grid().tree().nodeCount(level); } - __hostdev__ uint64_t checksum() const { return this->grid().checksum(); } - __hostdev__ bool isEmpty() const { return this->grid().isEmpty(); } - __hostdev__ Version version() const { return this->grid().version(); } + template + GridMetaData(const NanoGrid& grid) + { + mGridData = *grid.data(); + mTreeData = *grid.tree().data(); + mIndexBBox = grid.indexBBox(); + mRootTableSize = grid.tree().root().getTableSize(); + } + GridMetaData(const GridData* gridData) + { + if (GridMetaData::safeCast(gridData)) { + *this = *reinterpret_cast(gridData); + //util::memcpy(this, (const GridMetaData*)gridData); + } else {// otherwise copy each member individually + mGridData = *gridData; + mTreeData = *reinterpret_cast(gridData->treePtr()); + mIndexBBox = gridData->indexBBox(); + mRootTableSize = gridData->rootTableSize(); + } + } + GridMetaData& operator=(const GridMetaData&) = default; + /// @brief return true if the RootData follows right after the TreeData. + /// If so, this implies that it's safe to cast the grid from which + /// this instance was constructed to a GridMetaData + __hostdev__ bool safeCast() const { return mTreeData.isRootNext(); } + + /// @brief return true if it is safe to cast the grid to a pointer + /// of type GridMetaData, i.e. construction can be avoided. + __hostdev__ static bool safeCast(const GridData *gridData){ + NANOVDB_ASSERT(gridData && gridData->isValid()); + return gridData->isRootConnected(); + } + /// @brief return true if it is safe to cast the grid to a pointer + /// of type GridMetaData, i.e. construction can be avoided. 
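    // A minimal usage sketch, assuming `data` points to the first byte of a valid
    // (possibly type-erased) grid buffer, e.g. what a GridHandle's data() returns:
    //
    //   const auto* gridData = reinterpret_cast<const nanovdb::GridData*>(data);
    //   nanovdb::GridMetaData meta(gridData);  // cheap when safeCast(gridData) is true
    //   const uint64_t voxelCount  = meta.activeVoxelCount();
    //   const bool     isIndexGrid = meta.isGridIndex();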
+ template + __hostdev__ static bool safeCast(const NanoGrid& grid){return grid.tree().isRootNext();} + __hostdev__ bool isValid() const { return mGridData.isValid(); } + __hostdev__ const GridType& gridType() const { return mGridData.mGridType; } + __hostdev__ const GridClass& gridClass() const { return mGridData.mGridClass; } + __hostdev__ bool isLevelSet() const { return mGridData.mGridClass == GridClass::LevelSet; } + __hostdev__ bool isFogVolume() const { return mGridData.mGridClass == GridClass::FogVolume; } + __hostdev__ bool isStaggered() const { return mGridData.mGridClass == GridClass::Staggered; } + __hostdev__ bool isPointIndex() const { return mGridData.mGridClass == GridClass::PointIndex; } + __hostdev__ bool isGridIndex() const { return mGridData.mGridClass == GridClass::IndexGrid; } + __hostdev__ bool isPointData() const { return mGridData.mGridClass == GridClass::PointData; } + __hostdev__ bool isMask() const { return mGridData.mGridClass == GridClass::Topology; } + __hostdev__ bool isUnknown() const { return mGridData.mGridClass == GridClass::Unknown; } + __hostdev__ bool hasMinMax() const { return mGridData.mFlags.isMaskOn(GridFlags::HasMinMax); } + __hostdev__ bool hasBBox() const { return mGridData.mFlags.isMaskOn(GridFlags::HasBBox); } + __hostdev__ bool hasLongGridName() const { return mGridData.mFlags.isMaskOn(GridFlags::HasLongGridName); } + __hostdev__ bool hasAverage() const { return mGridData.mFlags.isMaskOn(GridFlags::HasAverage); } + __hostdev__ bool hasStdDeviation() const { return mGridData.mFlags.isMaskOn(GridFlags::HasStdDeviation); } + __hostdev__ bool isBreadthFirst() const { return mGridData.mFlags.isMaskOn(GridFlags::IsBreadthFirst); } + __hostdev__ uint64_t gridSize() const { return mGridData.mGridSize; } + __hostdev__ uint32_t gridIndex() const { return mGridData.mGridIndex; } + __hostdev__ uint32_t gridCount() const { return mGridData.mGridCount; } + __hostdev__ const char* shortGridName() const { return mGridData.mGridName; } + __hostdev__ const Map& map() const { return mGridData.mMap; } + __hostdev__ const Vec3dBBox& worldBBox() const { return mGridData.mWorldBBox; } + __hostdev__ const CoordBBox& indexBBox() const { return mIndexBBox; } + __hostdev__ Vec3d voxelSize() const { return mGridData.mVoxelSize; } + __hostdev__ int blindDataCount() const { return mGridData.mBlindMetadataCount; } + __hostdev__ uint64_t activeVoxelCount() const { return mTreeData.mVoxelCount; } + __hostdev__ const uint32_t& activeTileCount(uint32_t level) const { return mTreeData.mTileCount[level - 1]; } + __hostdev__ uint32_t nodeCount(uint32_t level) const { return mTreeData.mNodeCount[level]; } + __hostdev__ const Checksum& checksum() const { return mGridData.mChecksum; } + __hostdev__ uint32_t rootTableSize() const { return mRootTableSize; } + __hostdev__ bool isEmpty() const { return mRootTableSize == 0; } + __hostdev__ Version version() const { return mGridData.mVersion; } }; // GridMetaData /// @brief Class to access points at a specific voxel location -template -class PointAccessor : public DefaultReadAccessor +/// +/// @note If GridClass::PointIndex AttT should be uint32_t and if GridClass::PointData Vec3f +template +class PointAccessor : public DefaultReadAccessor { - using AccT = DefaultReadAccessor; - const UInt32Grid* mGrid; - const AttT* mData; + using AccT = DefaultReadAccessor; + const NanoGrid& mGrid; + const AttT* mData; public: - using LeafNodeType = typename NanoRoot::LeafNodeType; - - PointAccessor(const UInt32Grid& grid) + PointAccessor(const 
NanoGrid& grid) : AccT(grid.tree().root()) - , mGrid(&grid) - , mData(reinterpret_cast(grid.blindData(0))) + , mGrid(grid) + , mData(grid.template getBlindData(0)) { - NANOVDB_ASSERT(grid.gridType() == GridType::UInt32); - NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && is_same::value) || - (grid.gridClass() == GridClass::PointData && is_same::value)); - NANOVDB_ASSERT(grid.blindDataCount() >= 1); + NANOVDB_ASSERT(grid.gridType() == toGridType()); + NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value)); } + + /// @brief return true if this access was initialized correctly + __hostdev__ operator bool() const { return mData != nullptr; } + + __hostdev__ const NanoGrid& grid() const { return mGrid; } + /// @brief Return the total number of point in the grid and set the /// iterators to the complete range of points. __hostdev__ uint64_t gridPoints(const AttT*& begin, const AttT*& end) const { - const uint64_t count = mGrid->blindMetaData(0).mElementCount; + const uint64_t count = mGrid.blindMetaData(0u).mValueCount; begin = mData; end = begin + count; return count; @@ -4760,23 +6019,606 @@ class PointAccessor : public DefaultReadAccessor return leaf->maximum(); } - /// @brief get iterators over offsets to points at a specific voxel location + /// @brief get iterators over attributes to points at a specific voxel location __hostdev__ uint64_t voxelPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const + { + begin = end = nullptr; + if (auto* leaf = this->probeLeaf(ijk)) { + const uint32_t offset = NanoLeaf::CoordToOffset(ijk); + if (leaf->isActive(offset)) { + begin = mData + leaf->minimum(); + end = begin + leaf->getValue(offset); + if (offset > 0u) + begin += leaf->getValue(offset - 1); + } + } + return end - begin; + } +}; // PointAccessor + +template +class PointAccessor : public DefaultReadAccessor +{ + using AccT = DefaultReadAccessor; + const NanoGrid& mGrid; + const AttT* mData; + +public: + PointAccessor(const NanoGrid& grid) + : AccT(grid.tree().root()) + , mGrid(grid) + , mData(grid.template getBlindData(0)) + { + NANOVDB_ASSERT(mData); + NANOVDB_ASSERT(grid.gridType() == GridType::PointIndex); + NANOVDB_ASSERT((grid.gridClass() == GridClass::PointIndex && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value) || + (grid.gridClass() == GridClass::PointData && util::is_same::value)); + } + + /// @brief return true if this access was initialized correctly + __hostdev__ operator bool() const { return mData != nullptr; } + + __hostdev__ const NanoGrid& grid() const { return mGrid; } + + /// @brief Return the total number of point in the grid and set the + /// iterators to the complete range of points. + __hostdev__ uint64_t gridPoints(const AttT*& begin, const AttT*& end) const + { + const uint64_t count = mGrid.blindMetaData(0u).mValueCount; + begin = mData; + end = begin + count; + return count; + } + /// @brief Return the number of points in the leaf node containing the coordinate @a ijk. + /// If this return value is larger than zero then the iterators @a begin and @a end + /// will point to all the attributes contained within that leaf node. 
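    // A minimal usage sketch, assuming `grid` is a NanoGrid<Point> whose 0th blind
    // data channel stores per-point Vec3f positions and `ijk` is the voxel to query:
    //
    //   PointAccessor<Vec3f, Point> acc(grid);
    //   const Vec3f *begin = nullptr, *end = nullptr;
    //   if (acc.voxelPoints(ijk, begin, end)) {            // number of points in voxel ijk
    //       for (const Vec3f* p = begin; p != end; ++p) { /* visit point *p */ }
    //   }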
+ __hostdev__ uint64_t leafPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const { auto* leaf = this->probeLeaf(ijk); if (leaf == nullptr) return 0; - const uint32_t offset = LeafNodeType::CoordToOffset(ijk); - if (leaf->isActive(offset)) { - auto* p = mData + leaf->minimum(); - begin = p + (offset == 0 ? 0 : leaf->getValue(offset - 1)); - end = p + leaf->getValue(offset); - return end - begin; + begin = mData + leaf->offset(); + end = begin + leaf->pointCount(); + return leaf->pointCount(); + } + + /// @brief get iterators over attributes to points at a specific voxel location + __hostdev__ uint64_t voxelPoints(const Coord& ijk, const AttT*& begin, const AttT*& end) const + { + if (auto* leaf = this->probeLeaf(ijk)) { + const uint32_t n = NanoLeaf::CoordToOffset(ijk); + if (leaf->isActive(n)) { + begin = mData + leaf->first(n); + end = mData + leaf->last(n); + return end - begin; + } } - return 0; + begin = end = nullptr; + return 0u; // no leaf or inactive voxel } -}; // PointAccessor +}; // PointAccessor + +/// @brief Class to access values in channels at a specific voxel location. +/// +/// @note The ChannelT template parameter can be either const and non-const. +template +class ChannelAccessor : public DefaultReadAccessor +{ + static_assert(BuildTraits::is_index, "Expected an index build type"); + using BaseT = DefaultReadAccessor; + + const NanoGrid& mGrid; + ChannelT* mChannel; + +public: + using ValueType = ChannelT; + using TreeType = NanoTree; + using AccessorType = ChannelAccessor; + + /// @brief Ctor from an IndexGrid and an integer ID of an internal channel + /// that is assumed to exist as blind data in the IndexGrid. + __hostdev__ ChannelAccessor(const NanoGrid& grid, uint32_t channelID = 0u) + : BaseT(grid.tree().root()) + , mGrid(grid) + , mChannel(nullptr) + { + NANOVDB_ASSERT(isIndex(grid.gridType())); + NANOVDB_ASSERT(grid.gridClass() == GridClass::IndexGrid); + this->setChannel(channelID); + } + + /// @brief Ctor from an IndexGrid and an external channel + __hostdev__ ChannelAccessor(const NanoGrid& grid, ChannelT* channelPtr) + : BaseT(grid.tree().root()) + , mGrid(grid) + , mChannel(channelPtr) + { + NANOVDB_ASSERT(isIndex(grid.gridType())); + NANOVDB_ASSERT(grid.gridClass() == GridClass::IndexGrid); + } + + /// @brief return true if this access was initialized correctly + __hostdev__ operator bool() const { return mChannel != nullptr; } + + /// @brief Return a const reference to the IndexGrid + __hostdev__ const NanoGrid& grid() const { return mGrid; } + + /// @brief Return a const reference to the tree of the IndexGrid + __hostdev__ const TreeType& tree() const { return mGrid.tree(); } + + /// @brief Return a vector of the axial voxel sizes + __hostdev__ const Vec3d& voxelSize() const { return mGrid.voxelSize(); } + + /// @brief Return total number of values indexed by the IndexGrid + __hostdev__ const uint64_t& valueCount() const { return mGrid.valueCount(); } + + /// @brief Change to an external channel + /// @return Pointer to channel data + __hostdev__ ChannelT* setChannel(ChannelT* channelPtr) {return mChannel = channelPtr;} + + /// @brief Change to an internal channel, assuming it exists as as blind data + /// in the IndexGrid. 
+ /// @return Pointer to channel data, which could be NULL if channelID is out of range or + /// if ChannelT does not match the value type of the blind data + __hostdev__ ChannelT* setChannel(uint32_t channelID) + { + return mChannel = const_cast(mGrid.template getBlindData(channelID)); + } + + /// @brief Return the linear offset into a channel that maps to the specified coordinate + __hostdev__ uint64_t getIndex(const math::Coord& ijk) const { return BaseT::getValue(ijk); } + __hostdev__ uint64_t idx(int i, int j, int k) const { return BaseT::getValue(math::Coord(i, j, k)); } + + /// @brief Return the value from a cached channel that maps to the specified coordinate + __hostdev__ ChannelT& getValue(const math::Coord& ijk) const { return mChannel[BaseT::getValue(ijk)]; } + __hostdev__ ChannelT& operator()(const math::Coord& ijk) const { return this->getValue(ijk); } + __hostdev__ ChannelT& operator()(int i, int j, int k) const { return this->getValue(math::Coord(i, j, k)); } + + /// @brief return the state and updates the value of the specified voxel + __hostdev__ bool probeValue(const math::Coord& ijk, typename util::remove_const::type& v) const + { + uint64_t idx; + const bool isActive = BaseT::probeValue(ijk, idx); + v = mChannel[idx]; + return isActive; + } + /// @brief Return the value from a specified channel that maps to the specified coordinate + /// + /// @note The template parameter can be either const or non-const + template + __hostdev__ T& getValue(const math::Coord& ijk, T* channelPtr) const { return channelPtr[BaseT::getValue(ijk)]; } + +}; // ChannelAccessor + +#if 0 +// This MiniGridHandle class is only included as a stand-alone example. Note that aligned_alloc is a C++17 feature! +// Normally we recommend using GridHandle defined in util/GridHandle.h but this minimal implementation could be an +// alternative when using the IO methods defined below. +struct MiniGridHandle { + struct BufferType { + uint8_t *data; + uint64_t size; + BufferType(uint64_t n=0) : data(std::aligned_alloc(NANOVDB_DATA_ALIGNMENT, n)), size(n) {assert(isValid(data));} + BufferType(BufferType &&other) : data(other.data), size(other.size) {other.data=nullptr; other.size=0;} + ~BufferType() {std::free(data);} + BufferType& operator=(const BufferType &other) = delete; + BufferType& operator=(BufferType &&other){data=other.data; size=other.size; other.data=nullptr; other.size=0; return *this;} + static BufferType create(size_t n, BufferType* dummy = nullptr) {return BufferType(n);} + } buffer; + MiniGridHandle(BufferType &&buf) : buffer(std::move(buf)) {} + const uint8_t* data() const {return buffer.data;} +};// MiniGridHandle +#endif + +namespace io { + +/// @brief Define compression codecs +/// +/// @note NONE is the default, ZIP is slow but compact and BLOSC offers a great balance. +/// +/// @throw NanoVDB optionally supports ZIP and BLOSC compression and will throw an exception +/// if its support is required but missing. 
+enum class Codec : uint16_t { NONE = 0, + ZIP = 1, + BLOSC = 2, + End = 3, + StrLen = 6 + End }; + +__hostdev__ inline const char* toStr(char *dst, Codec codec) +{ + switch (codec){ + case Codec::NONE: return util::strcpy(dst, "NONE"); + case Codec::ZIP: return util::strcpy(dst, "ZIP"); + case Codec::BLOSC : return util::strcpy(dst, "BLOSC"); + default: return util::strcpy(dst, "END"); + } +} + +__hostdev__ inline Codec toCodec(const char *str) +{ + if (util::streq(str, "none")) return Codec::NONE; + if (util::streq(str, "zip")) return Codec::ZIP; + if (util::streq(str, "blosc")) return Codec::BLOSC; + return Codec::End; +} + +/// @brief Data encoded at the head of each segment of a file or stream. +/// +/// @note A file or stream is composed of one or more segments that each contain +// one or more grids. +struct FileHeader {// 16 bytes + uint64_t magic;// 8 bytes + Version version;// 4 bytes version numbers + uint16_t gridCount;// 2 bytes + Codec codec;// 2 bytes + bool isValid() const {return magic == NANOVDB_MAGIC_NUMB || magic == NANOVDB_MAGIC_FILE;} +}; // FileHeader ( 16 bytes = 2 words ) + +// @brief Data encoded for each of the grids associated with a segment. +// Grid size in memory (uint64_t) | +// Grid size on disk (uint64_t) | +// Grid name hash key (uint64_t) | +// Numer of active voxels (uint64_t) | +// Grid type (uint32_t) | +// Grid class (uint32_t) | +// Characters in grid name (uint32_t) | +// AABB in world space (2*3*double) | one per grid in file +// AABB in index space (2*3*int) | +// Size of a voxel in world units (3*double) | +// Byte size of the grid name (uint32_t) | +// Number of nodes per level (4*uint32_t) | +// Numer of active tiles per level (3*uint32_t) | +// Codec for file compression (uint16_t) | +// Padding due to 8B alignment (uint16_t) | +// Version number (uint32_t) | +struct FileMetaData +{// 176 bytes + uint64_t gridSize, fileSize, nameKey, voxelCount; // 4 * 8 = 32B. + GridType gridType; // 4B. + GridClass gridClass; // 4B. + Vec3dBBox worldBBox; // 2 * 3 * 8 = 48B. + CoordBBox indexBBox; // 2 * 3 * 4 = 24B. + Vec3d voxelSize; // 24B. + uint32_t nameSize; // 4B. + uint32_t nodeCount[4]; //4 x 4 = 16B + uint32_t tileCount[3];// 3 x 4 = 12B + Codec codec; // 2B + uint16_t padding;// 2B, due to 8B alignment from uint64_t + Version version;// 4B +}; // FileMetaData + +// the following code block uses std and therefore needs to be ignored by CUDA and HIP +#if !defined(__CUDA_ARCH__) && !defined(__HIP__) + +// Note that starting with version 32.6.0 it is possible to write and read raw grid buffers to +// files, e.g. os.write((const char*)&buffer.data(), buffer.size()) or more conveniently as +// handle.write(fileName). In addition to this simple approach we offer the methods below to +// write traditional uncompressed nanovdb files that unlike raw files include metadata that +// is used for tools like nanovdb_print. + +/// +/// @brief This is a standalone alternative to io::writeGrid(...,Codec::NONE) defined in util/IO.h +/// Unlike the latter this function has no dependencies at all, not even NanoVDB.h, so it also +/// works if client code only includes PNanoVDB.h! +/// +/// @details Writes a raw NanoVDB buffer, possibly with multiple grids, to a stream WITHOUT compression. +/// It follows all the conventions in util/IO.h so the stream can be read by all existing client +/// code of NanoVDB. +/// +/// @note This method will always write uncompressed grids to the stream, i.e. Blosc or ZIP compression +/// is never applied! 
This is a fundamental limitation and feature of this standalone function. +/// +/// @throw std::invalid_argument if buffer does not point to a valid NanoVDB grid. +/// +/// @warning This is pretty ugly code that involves lots of pointer and bit manipulations - not for the faint of heart :) +template // StreamT class must support: "void write(const char*, size_t)" +void writeUncompressedGrid(StreamT& os, const GridData* gridData, bool raw = false) +{ + NANOVDB_ASSERT(gridData->mMagic == NANOVDB_MAGIC_NUMB || gridData->mMagic == NANOVDB_MAGIC_GRID); + NANOVDB_ASSERT(gridData->mVersion.isCompatible()); + if (!raw) {// segment with a single grid: FileHeader, FileMetaData, gridName, Grid +#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS + FileHeader head{NANOVDB_MAGIC_FILE, gridData->mVersion, 1u, Codec::NONE}; +#else + FileHeader head{NANOVDB_MAGIC_NUMB, gridData->mVersion, 1u, Codec::NONE}; +#endif + const char* gridName = gridData->gridName(); + const uint32_t nameSize = util::strlen(gridName) + 1;// include '\0' + const TreeData* treeData = (const TreeData*)(gridData->treePtr()); + FileMetaData meta{gridData->mGridSize, gridData->mGridSize, 0u, treeData->mVoxelCount, + gridData->mGridType, gridData->mGridClass, gridData->mWorldBBox, + treeData->bbox(), gridData->mVoxelSize, nameSize, + {treeData->mNodeCount[0], treeData->mNodeCount[1], treeData->mNodeCount[2], 1u}, + {treeData->mTileCount[0], treeData->mTileCount[1], treeData->mTileCount[2]}, + Codec::NONE, 0u, gridData->mVersion }; // FileMetaData + os.write((const char*)&head, sizeof(FileHeader)); // write header + os.write((const char*)&meta, sizeof(FileMetaData)); // write meta data + os.write(gridName, nameSize); // write grid name + } + os.write((const char*)gridData, gridData->mGridSize);// write the grid +}// writeUncompressedGrid + +/// @brief write multiple NanoVDB grids to a single file, without compression. 
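For reference, a small client-side sketch (illustrative only, not part of this header) of driving the writeUncompressedGrid function defined just above with a std::ofstream; the saveSingleGrid helper name and the origin of the gridData pointer (e.g. a GridHandle that owns the buffer) are assumptions.

#include <fstream>

// Write one in-memory NanoVDB grid to disk without compression. With raw=false the
// FileHeader/FileMetaData/grid-name segment described above is prepended, so tools
// such as nanovdb_print can inspect the file.
inline void saveSingleGrid(const nanovdb::GridData* gridData, const char* fileName)
{
    std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc);
    nanovdb::io::writeUncompressedGrid(os, gridData, /*raw=*/false);
}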
+/// @note To write all grids in a single GridHandle simply use handle.write("fieNane") +template class VecT> +void writeUncompressedGrids(const char* fileName, const VecT& handles, bool raw = false) +{ +#ifdef NANOVDB_USE_IOSTREAMS // use this to switch between std::ofstream or FILE implementations + std::ofstream os(fileName, std::ios::out | std::ios::binary | std::ios::trunc); +#else + struct StreamT { + FILE* fptr; + StreamT(const char* name) { fptr = fopen(name, "wb"); } + ~StreamT() { fclose(fptr); } + void write(const char* data, size_t n) { fwrite(data, 1, n, fptr); } + bool is_open() const { return fptr != NULL; } + } os(fileName); +#endif + if (!os.is_open()) { + fprintf(stderr, "nanovdb::writeUncompressedGrids: Unable to open file \"%s\"for output\n", fileName); + exit(EXIT_FAILURE); + } + for (auto& h : handles) { + for (uint32_t n=0; n class VecT> +VecT readUncompressedGrids(StreamT& is, const typename GridHandleT::BufferType& pool = typename GridHandleT::BufferType()) +{ + VecT handles; + GridData data; + is.read((char*)&data, sizeof(GridData)); + if (data.isValid()) {// stream contains a raw grid buffer + uint64_t size = data.mGridSize, sum = 0u; + while(data.mGridIndex + 1u < data.mGridCount) { + is.skip(data.mGridSize - sizeof(GridData));// skip grid + is.read((char*)&data, sizeof(GridData));// read sizeof(GridData) bytes + sum += data.mGridSize; + } + is.skip(-int64_t(sum + sizeof(GridData)));// rewind to start + auto buffer = GridHandleT::BufferType::create(size + sum, &pool); + is.read((char*)(buffer.data()), buffer.size()); + handles.emplace_back(std::move(buffer)); + } else {// Header0, MetaData0, gridName0, Grid0...HeaderN, MetaDataN, gridNameN, GridN + is.skip(-sizeof(GridData));// rewind + FileHeader head; + while(is.read((char*)&head, sizeof(FileHeader))) { + if (!head.isValid()) { + fprintf(stderr, "nanovdb::readUncompressedGrids: invalid magic number = \"%s\"\n", (const char*)&(head.magic)); + exit(EXIT_FAILURE); + } else if (!head.version.isCompatible()) { + char str[20]; + fprintf(stderr, "nanovdb::readUncompressedGrids: invalid major version = \"%s\"\n", toStr(str, head.version)); + exit(EXIT_FAILURE); + } else if (head.codec != Codec::NONE) { + char str[8]; + fprintf(stderr, "nanovdb::readUncompressedGrids: invalid codec = \"%s\"\n", toStr(str, head.codec)); + exit(EXIT_FAILURE); + } + FileMetaData meta; + for (uint16_t i = 0; i < head.gridCount; ++i) { // read all grids in segment + is.read((char*)&meta, sizeof(FileMetaData));// read meta data + is.skip(meta.nameSize); // skip grid name + auto buffer = GridHandleT::BufferType::create(meta.gridSize, &pool); + is.read((char*)buffer.data(), meta.gridSize);// read grid + handles.emplace_back(std::move(buffer)); + }// loop over grids in segment + }// loop over segments + } + return handles; +} // readUncompressedGrids + +/// @brief Read a multiple un-compressed NanoVDB grids from a file and return them as a vector. 
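To show the intended round trip, here is a client-side sketch (illustrative only, not part of this header) pairing the file-based reader declared next with writeUncompressedGrids above; the include paths, the GridHandle<HostBuffer> handle type, and the file names are assumptions for the example.

#include <vector>
#include <nanovdb/GridHandle.h> // assumed include paths for nanovdb::GridHandle
#include <nanovdb/HostBuffer.h> // and the default host-side buffer it is paired with

// Read every grid stored in an uncompressed .nvdb file, then write them all back out
// to a second uncompressed file.
inline void copyUncompressedFile(const char* srcName, const char* dstName)
{
    using HandleT = nanovdb::GridHandle<nanovdb::HostBuffer>;
    std::vector<HandleT> handles =
        nanovdb::io::readUncompressedGrids<HandleT, std::vector>(srcName);
    nanovdb::io::writeUncompressedGrids<HandleT, std::vector>(dstName, handles); // raw defaults to false
}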
+template class VecT> +VecT readUncompressedGrids(const char* fileName, const typename GridHandleT::BufferType& buffer = typename GridHandleT::BufferType()) +{ +#ifdef NANOVDB_USE_IOSTREAMS // use this to switch between std::ifstream or FILE implementations + struct StreamT : public std::ifstream { + StreamT(const char* name) : std::ifstream(name, std::ios::in | std::ios::binary){} + void skip(int64_t off) { this->seekg(off, std::ios_base::cur); } + }; +#else + struct StreamT { + FILE* fptr; + StreamT(const char* name) { fptr = fopen(name, "rb"); } + ~StreamT() { fclose(fptr); } + bool read(char* data, size_t n) { + size_t m = fread(data, 1, n, fptr); + return n == m; + } + void skip(int64_t off) { fseek(fptr, (long int)off, SEEK_CUR); } + bool is_open() const { return fptr != NULL; } + }; +#endif + StreamT is(fileName); + if (!is.is_open()) { + fprintf(stderr, "nanovdb::readUncompressedGrids: Unable to open file \"%s\"for input\n", fileName); + exit(EXIT_FAILURE); + } + return readUncompressedGrids(is, buffer); +} // readUncompressedGrids + +#endif // if !defined(__CUDA_ARCH__) && !defined(__HIP__) + +} // namespace io + +// ----------------------------> Implementations of random access methods <-------------------------------------- + +/// @brief Implements Tree::getValue(math::Coord), i.e. return the value associated with a specific coordinate @c ijk. +/// @tparam BuildT Build type of the grid being called +/// @details The value at a coordinate maps to the background, a tile value or a leaf value. +template +struct GetValue +{ + __hostdev__ static auto get(const NanoRoot& root) { return root.mBackground; } + __hostdev__ static auto get(const typename NanoRoot::Tile& tile) { return tile.value; } + __hostdev__ static auto get(const NanoUpper& node, uint32_t n) { return node.mTable[n].value; } + __hostdev__ static auto get(const NanoLower& node, uint32_t n) { return node.mTable[n].value; } + __hostdev__ static auto get(const NanoLeaf& leaf, uint32_t n) { return leaf.getValue(n); } // works with all build types +}; // GetValue + +template +struct SetValue +{ + static_assert(!BuildTraits::is_special, "SetValue does not support special value types"); + using ValueT = typename NanoLeaf::ValueType; + __hostdev__ static auto set(NanoRoot&, const ValueT&) {} // no-op + __hostdev__ static auto set(typename NanoRoot::Tile& tile, const ValueT& v) { tile.value = v; } + __hostdev__ static auto set(NanoUpper& node, uint32_t n, const ValueT& v) { node.mTable[n].value = v; } + __hostdev__ static auto set(NanoLower& node, uint32_t n, const ValueT& v) { node.mTable[n].value = v; } + __hostdev__ static auto set(NanoLeaf& leaf, uint32_t n, const ValueT& v) { leaf.mValues[n] = v; } +}; // SetValue + +template +struct SetVoxel +{ + static_assert(!BuildTraits::is_special, "SetVoxel does not support special value types"); + using ValueT = typename NanoLeaf::ValueType; + __hostdev__ static auto set(NanoRoot&, const ValueT&) {} // no-op + __hostdev__ static auto set(typename NanoRoot::Tile&, const ValueT&) {} // no-op + __hostdev__ static auto set(NanoUpper&, uint32_t, const ValueT&) {} // no-op + __hostdev__ static auto set(NanoLower&, uint32_t, const ValueT&) {} // no-op + __hostdev__ static auto set(NanoLeaf& leaf, uint32_t n, const ValueT& v) { leaf.mValues[n] = v; } +}; // SetVoxel + +/// @brief Implements Tree::isActive(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct GetState +{ + __hostdev__ static auto get(const NanoRoot&) { return false; } + __hostdev__ static auto 
get(const typename NanoRoot::Tile& tile) { return tile.state > 0; } + __hostdev__ static auto get(const NanoUpper& node, uint32_t n) { return node.mValueMask.isOn(n); } + __hostdev__ static auto get(const NanoLower& node, uint32_t n) { return node.mValueMask.isOn(n); } + __hostdev__ static auto get(const NanoLeaf& leaf, uint32_t n) { return leaf.mValueMask.isOn(n); } +}; // GetState + +/// @brief Implements Tree::getDim(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct GetDim +{ + __hostdev__ static uint32_t get(const NanoRoot&) { return 0u; } // background + __hostdev__ static uint32_t get(const typename NanoRoot::Tile&) { return 4096u; } + __hostdev__ static uint32_t get(const NanoUpper&, uint32_t) { return 128u; } + __hostdev__ static uint32_t get(const NanoLower&, uint32_t) { return 8u; } + __hostdev__ static uint32_t get(const NanoLeaf&, uint32_t) { return 1u; } +}; // GetDim + +/// @brief Return the pointer to the leaf node that contains math::Coord. Implements Tree::probeLeaf(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct GetLeaf +{ + __hostdev__ static const NanoLeaf* get(const NanoRoot&) { return nullptr; } + __hostdev__ static const NanoLeaf* get(const typename NanoRoot::Tile&) { return nullptr; } + __hostdev__ static const NanoLeaf* get(const NanoUpper&, uint32_t) { return nullptr; } + __hostdev__ static const NanoLeaf* get(const NanoLower&, uint32_t) { return nullptr; } + __hostdev__ static const NanoLeaf* get(const NanoLeaf& leaf, uint32_t) { return &leaf; } +}; // GetLeaf + +/// @brief Return point to the lower internal node where math::Coord maps to one of its values, i.e. terminates +/// @tparam BuildT Build type of the grid being called +template +struct GetLower +{ + __hostdev__ static const NanoLower* get(const NanoRoot&) { return nullptr; } + __hostdev__ static const NanoLower* get(const typename NanoRoot::Tile&) { return nullptr; } + __hostdev__ static const NanoLower* get(const NanoUpper&, uint32_t) { return nullptr; } + __hostdev__ static const NanoLower* get(const NanoLower& node, uint32_t) { return &node; } + __hostdev__ static const NanoLower* get(const NanoLeaf&, uint32_t) { return nullptr; } +}; // GetLower + +/// @brief Return point to the upper internal node where math::Coord maps to one of its values, i.e. 
terminates +/// @tparam BuildT Build type of the grid being called +template +struct GetUpper +{ + __hostdev__ static const NanoUpper* get(const NanoRoot&) { return nullptr; } + __hostdev__ static const NanoUpper* get(const typename NanoRoot::Tile&) { return nullptr; } + __hostdev__ static const NanoUpper* get(const NanoUpper& node, uint32_t) { return &node; } + __hostdev__ static const NanoUpper* get(const NanoLower& node, uint32_t) { return nullptr; } + __hostdev__ static const NanoUpper* get(const NanoLeaf&, uint32_t) { return nullptr; } +}; // GetUpper + +/// @brief Implements Tree::probeLeaf(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct ProbeValue +{ + using ValueT = typename BuildToValueMap::Type; + __hostdev__ static bool get(const NanoRoot& root, ValueT& v) + { + v = root.mBackground; + return false; + } + __hostdev__ static bool get(const typename NanoRoot::Tile& tile, ValueT& v) + { + v = tile.value; + return tile.state > 0u; + } + __hostdev__ static bool get(const NanoUpper& node, uint32_t n, ValueT& v) + { + v = node.mTable[n].value; + return node.mValueMask.isOn(n); + } + __hostdev__ static bool get(const NanoLower& node, uint32_t n, ValueT& v) + { + v = node.mTable[n].value; + return node.mValueMask.isOn(n); + } + __hostdev__ static bool get(const NanoLeaf& leaf, uint32_t n, ValueT& v) + { + v = leaf.getValue(n); + return leaf.mValueMask.isOn(n); + } +}; // ProbeValue + +/// @brief Implements Tree::getNodeInfo(math::Coord) +/// @tparam BuildT Build type of the grid being called +template +struct GetNodeInfo +{ + using ValueType = typename NanoLeaf::ValueType; + using FloatType = typename NanoLeaf::FloatType; + struct NodeInfo + { + uint32_t level, dim; + ValueType minimum, maximum; + FloatType average, stdDevi; + CoordBBox bbox; + }; + __hostdev__ static NodeInfo get(const NanoRoot& root) + { + return NodeInfo{3u, NanoUpper::DIM, root.minimum(), root.maximum(), root.average(), root.stdDeviation(), root.bbox()}; + } + __hostdev__ static NodeInfo get(const typename NanoRoot::Tile& tile) + { + return NodeInfo{3u, NanoUpper::DIM, tile.value, tile.value, static_cast(tile.value), 0, CoordBBox::createCube(tile.origin(), NanoUpper::DIM)}; + } + __hostdev__ static NodeInfo get(const NanoUpper& node, uint32_t n) + { + return NodeInfo{2u, node.dim(), node.minimum(), node.maximum(), node.average(), node.stdDeviation(), node.bbox()}; + } + __hostdev__ static NodeInfo get(const NanoLower& node, uint32_t n) + { + return NodeInfo{1u, node.dim(), node.minimum(), node.maximum(), node.average(), node.stdDeviation(), node.bbox()}; + } + __hostdev__ static NodeInfo get(const NanoLeaf& leaf, uint32_t n) + { + return NodeInfo{0u, leaf.dim(), leaf.minimum(), leaf.maximum(), leaf.average(), leaf.stdDeviation(), leaf.bbox()}; + } +}; // GetNodeInfo -} // namespace nanovdb +} // namespace nanovdb =================================================================== #endif // end of NANOVDB_NANOVDB_H_HAS_BEEN_INCLUDED diff --git a/warp/native/nanovdb/PNanoVDB.h b/warp/native/nanovdb/PNanoVDB.h index ac5796b5..40888f24 100644 --- a/warp/native/nanovdb/PNanoVDB.h +++ b/warp/native/nanovdb/PNanoVDB.h @@ -3,11 +3,11 @@ // SPDX-License-Identifier: MPL-2.0 /*! - \file PNanoVDB.h + \file nanovdb/PNanoVDB.h \author Andrew Reidmeyer - \brief This file is a portable (e.g. pointer-less) C99/GLSL/HLSL port + \brief This file is a portable (e.g. pointer-less) C99/GLSL/HLSL port of NanoVDB.h, which is compatible with most graphics APIs. 
*/ @@ -56,8 +56,10 @@ #endif #ifdef PNANOVDB_CMATH +#ifndef __CUDACC_RTC__ #include #endif +#endif // ------------------------------------------------ Buffer ----------------------------------------------------------- @@ -72,38 +74,11 @@ #endif #if defined(PNANOVDB_BUF_C) -// #include -#if !defined(_STDINT) && !defined(__GNUC__) -//typedef signed char int8_t; -//typedef signed short int16_t; -typedef signed int int32_t; -typedef signed long long int64_t; -//typedef signed char int_fast8_t; -//typedef signed short int_fast16_t; -//typedef signed int int_fast32_t; -//typedef signed long long int_fast64_t; -//typedef signed char int_least8_t; -//typedef signed short int_least16_t; -//typedef signed int int_least32_t; -//typedef signed long long int_least64_t; -//typedef signed long long intmax_t; -//typedef signed long intptr_t; -//typedef unsigned char uint8_t; -//typedef unsigned short uint16_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; -//typedef unsigned char uint_fast8_t; -//typedef unsigned short uint_fast16_t; -//typedef unsigned int uint_fast32_t; -//typedef unsigned long long uint_fast64_t; -//typedef unsigned char uint_least8_t; -//typedef unsigned short uint_least16_t; -//typedef unsigned int uint_least32_t; -//typedef unsigned long long uint_least64_t; -//typedef unsigned long long uintmax_t; +#ifndef __CUDACC_RTC__ +#include #endif #if defined(__CUDACC__) -#define PNANOVDB_BUF_FORCE_INLINE __host__ __device__ static __forceinline__ +#define PNANOVDB_BUF_FORCE_INLINE static __host__ __device__ __forceinline__ #elif defined(_WIN32) #define PNANOVDB_BUF_FORCE_INLINE static inline __forceinline #else @@ -146,6 +121,32 @@ PNANOVDB_BUF_FORCE_INLINE uint64_t pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, return data64[wordaddress64]; #endif } +PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint32_t byte_offset, uint32_t value) +{ + uint32_t wordaddress = (byte_offset >> 2u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + if (wordaddress < buf.size_in_words) + { + buf.data[wordaddress] = value; +} +#else + buf.data[wordaddress] = value; +#endif +} +PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint32_t byte_offset, uint64_t value) +{ + uint64_t* data64 = (uint64_t*)buf.data; + uint32_t wordaddress64 = (byte_offset >> 3u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + uint64_t size_in_words64 = buf.size_in_words >> 1u; + if (wordaddress64 < size_in_words64) + { + data64[wordaddress64] = value; + } +#else + data64[wordaddress64] = value; +#endif +} #elif defined(PNANOVDB_ADDRESS_64) PNANOVDB_BUF_FORCE_INLINE uint32_t pnanovdb_buf_read_uint32(pnanovdb_buf_t buf, uint64_t byte_offset) { @@ -167,6 +168,32 @@ PNANOVDB_BUF_FORCE_INLINE uint64_t pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, return data64[wordaddress64]; #endif } +PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint64_t byte_offset, uint32_t value) +{ + uint64_t wordaddress = (byte_offset >> 2u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + if (wordaddress < buf.size_in_words) + { + buf.data[wordaddress] = value; + } +#else + buf.data[wordaddress] = value; +#endif +} +PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint64_t byte_offset, uint64_t value) +{ + uint64_t* data64 = (uint64_t*)buf.data; + uint64_t wordaddress64 = (byte_offset >> 3u); +#ifdef PNANOVDB_BUF_BOUNDS_CHECK + uint64_t size_in_words64 = buf.size_in_words >> 1u; + if (wordaddress64 < size_in_words64) + { + data64[wordaddress64] = value; + } +#else + 
data64[wordaddress64] = value; +#endif +} #endif typedef uint32_t pnanovdb_grid_type_t; #define PNANOVDB_GRID_TYPE_GET(grid_typeIn, nameIn) pnanovdb_grid_type_constants[grid_typeIn].nameIn @@ -184,6 +211,14 @@ uint2 pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint byte_offset) ret.y = pnanovdb_buf_read_uint32(buf, byte_offset + 4u); return ret; } +void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint byte_offset, uint value) +{ + // NOP, by default no write in HLSL +} +void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint byte_offset, uint2 value) +{ + // NOP, by default no write in HLSL +} #elif defined(PNANOVDB_ADDRESS_64) #define pnanovdb_buf_t StructuredBuffer uint pnanovdb_buf_read_uint32(pnanovdb_buf_t buf, uint64_t byte_offset) @@ -197,6 +232,14 @@ uint64_t pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint64_t byte_offset) ret = ret + (uint64_t(pnanovdb_buf_read_uint32(buf, byte_offset + 4u)) << 32u); return ret; } +void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint64_t byte_offset, uint value) +{ + // NOP, by default no write in HLSL +} +void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint64_t byte_offset, uint64_t value) +{ + // NOP, by default no write in HLSL +} #endif #define pnanovdb_grid_type_t uint #define PNANOVDB_GRID_TYPE_GET(grid_typeIn, nameIn) pnanovdb_grid_type_constants[grid_typeIn].nameIn @@ -216,6 +259,14 @@ uvec2 pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint byte_offset) ret.y = pnanovdb_buf_read_uint32(buf, byte_offset + 4u); return ret; } +void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint byte_offset, uint value) +{ + // NOP, by default no write in HLSL +} +void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint byte_offset, uvec2 value) +{ + // NOP, by default no write in HLSL +} #define pnanovdb_grid_type_t uint #define PNANOVDB_GRID_TYPE_GET(grid_typeIn, nameIn) pnanovdb_grid_type_constants[grid_typeIn].nameIn #endif @@ -225,7 +276,7 @@ uvec2 pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint byte_offset) // force inline #if defined(PNANOVDB_C) #if defined(__CUDACC__) -#define PNANOVDB_FORCE_INLINE __host__ __device__ static __forceinline__ +#define PNANOVDB_FORCE_INLINE static __host__ __device__ __forceinline__ #elif defined(_WIN32) #define PNANOVDB_FORCE_INLINE static inline __forceinline #else @@ -240,7 +291,11 @@ uvec2 pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint byte_offset) // struct typedef, static const, inout #if defined(PNANOVDB_C) #define PNANOVDB_STRUCT_TYPEDEF(X) typedef struct X X; +#if defined(__CUDA_ARCH__) +#define PNANOVDB_STATIC_CONST constexpr __constant__ +#else #define PNANOVDB_STATIC_CONST static const +#endif #define PNANOVDB_INOUT(X) X* #define PNANOVDB_IN(X) const X* #define PNANOVDB_DEREF(X) (*X) @@ -264,9 +319,13 @@ uvec2 pnanovdb_buf_read_uint64(pnanovdb_buf_t buf, uint byte_offset) // basic types, type conversion #if defined(PNANOVDB_C) #define PNANOVDB_NATIVE_64 -// #include +#ifndef __CUDACC_RTC__ +#include +#endif #if !defined(PNANOVDB_MEMCPY_CUSTOM) +#ifndef __CUDACC_RTC__ #include +#endif #define pnanovdb_memcpy memcpy #endif typedef uint32_t pnanovdb_uint32_t; @@ -288,10 +347,10 @@ PNANOVDB_FORCE_INLINE pnanovdb_int32_t pnanovdb_uint32_as_int32(pnanovdb_uint32_ PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_uint64_as_int64(pnanovdb_uint64_t v) { return (pnanovdb_int64_t)v; } PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_int64_as_uint64(pnanovdb_int64_t v) { return (pnanovdb_uint64_t)v; } PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_int32_as_uint32(pnanovdb_int32_t v) { return 
(pnanovdb_uint32_t)v; } -// PNANOVDB_FORCE_INLINE float pnanovdb_uint32_as_float(pnanovdb_uint32_t v) { return *((float*)&v); } -// PNANOVDB_FORCE_INLINE double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { return *((double*)&v); } PNANOVDB_FORCE_INLINE float pnanovdb_uint32_as_float(pnanovdb_uint32_t v) { float vf; pnanovdb_memcpy(&vf, &v, sizeof(vf)); return vf; } +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_float_as_uint32(float v) { return *((pnanovdb_uint32_t*)(&v)); } PNANOVDB_FORCE_INLINE double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { double vf; pnanovdb_memcpy(&vf, &v, sizeof(vf)); return vf; } +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { return *((pnanovdb_uint64_t*)(&v)); } PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_low(pnanovdb_uint64_t v) { return (pnanovdb_uint32_t)v; } PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_high(pnanovdb_uint64_t v) { return (pnanovdb_uint32_t)(v >> 32u); } PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint32_as_uint64(pnanovdb_uint32_t x, pnanovdb_uint32_t y) { return ((pnanovdb_uint64_t)x) | (((pnanovdb_uint64_t)y) << 32u); } @@ -317,6 +376,7 @@ typedef float3 pnanovdb_vec3_t; pnanovdb_int32_t pnanovdb_uint32_as_int32(pnanovdb_uint32_t v) { return int(v); } pnanovdb_uint32_t pnanovdb_int32_as_uint32(pnanovdb_int32_t v) { return uint(v); } float pnanovdb_uint32_as_float(pnanovdb_uint32_t v) { return asfloat(v); } +pnanovdb_uint32_t pnanovdb_float_as_uint32(float v) { return asuint(v); } float pnanovdb_floor(float v) { return floor(v); } pnanovdb_int32_t pnanovdb_float_to_int32(float v) { return int(v); } float pnanovdb_int32_to_float(pnanovdb_int32_t v) { return float(v); } @@ -329,6 +389,7 @@ typedef int2 pnanovdb_int64_t; pnanovdb_int64_t pnanovdb_uint64_as_int64(pnanovdb_uint64_t v) { return int2(v); } pnanovdb_uint64_t pnanovdb_int64_as_uint64(pnanovdb_int64_t v) { return uint2(v); } double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { return asdouble(v.x, v.y); } +pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { uint2 ret; asuint(v, ret.x, ret.y); return ret; } pnanovdb_uint32_t pnanovdb_uint64_low(pnanovdb_uint64_t v) { return v.x; } pnanovdb_uint32_t pnanovdb_uint64_high(pnanovdb_uint64_t v) { return v.y; } pnanovdb_uint64_t pnanovdb_uint32_as_uint64(pnanovdb_uint32_t x, pnanovdb_uint32_t y) { return uint2(x, y); } @@ -341,6 +402,7 @@ typedef int64_t pnanovdb_int64_t; pnanovdb_int64_t pnanovdb_uint64_as_int64(pnanovdb_uint64_t v) { return int64_t(v); } pnanovdb_uint64_t pnanovdb_int64_as_uint64(pnanovdb_int64_t v) { return uint64_t(v); } double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { return asdouble(uint(v), uint(v >> 32u)); } +pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { uint2 ret; asuint(v, ret.x, ret.y); return uint64_t(ret.x) + (uint64_t(ret.y) << 32u); } pnanovdb_uint32_t pnanovdb_uint64_low(pnanovdb_uint64_t v) { return uint(v); } pnanovdb_uint32_t pnanovdb_uint64_high(pnanovdb_uint64_t v) { return uint(v >> 32u); } pnanovdb_uint64_t pnanovdb_uint32_as_uint64(pnanovdb_uint32_t x, pnanovdb_uint32_t y) { return uint64_t(x) + (uint64_t(y) << 32u); } @@ -363,7 +425,9 @@ pnanovdb_int64_t pnanovdb_uint64_as_int64(pnanovdb_uint64_t v) { return ivec2(v) pnanovdb_uint64_t pnanovdb_int64_as_uint64(pnanovdb_int64_t v) { return uvec2(v); } pnanovdb_uint32_t pnanovdb_int32_as_uint32(pnanovdb_int32_t v) { return uint(v); } float pnanovdb_uint32_as_float(pnanovdb_uint32_t v) { return uintBitsToFloat(v); } +pnanovdb_uint32_t 
pnanovdb_float_as_uint32(float v) { return floatBitsToUint(v); } double pnanovdb_uint64_as_double(pnanovdb_uint64_t v) { return packDouble2x32(uvec2(v.x, v.y)); } +pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { return unpackDouble2x32(v); } pnanovdb_uint32_t pnanovdb_uint64_low(pnanovdb_uint64_t v) { return v.x; } pnanovdb_uint32_t pnanovdb_uint64_high(pnanovdb_uint64_t v) { return v.y; } pnanovdb_uint64_t pnanovdb_uint32_as_uint64(pnanovdb_uint32_t x, pnanovdb_uint32_t y) { return uvec2(x, y); } @@ -437,14 +501,6 @@ PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_vec3_max(const pnanovdb_vec3_t a, v.z = a.z > b.z ? a.z : b.z; return v; } -PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_vec3_round_to_coord(const pnanovdb_vec3_t a) -{ - pnanovdb_coord_t v; - v.x = pnanovdb_float_to_int32(roundf(a.x)); - v.y = pnanovdb_float_to_int32(roundf(a.y)); - v.z = pnanovdb_float_to_int32(roundf(a.z)); - return v; -} PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_coord_to_vec3(const pnanovdb_coord_t coord) { pnanovdb_vec3_t v; @@ -493,6 +549,119 @@ pnanovdb_coord_t pnanovdb_coord_uniform(pnanovdb_int32_t a) { return ivec3(a, a, pnanovdb_coord_t pnanovdb_coord_add(pnanovdb_coord_t a, pnanovdb_coord_t b) { return a + b; } #endif +// ------------------------------------------------ Uint64 Utils ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint32_countbits(pnanovdb_uint32_t value) +{ +#if defined(PNANOVDB_C) +#if defined(_MSC_VER) && (_MSC_VER >= 1928) && defined(PNANOVDB_USE_INTRINSICS) + return __popcnt(value); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(PNANOVDB_USE_INTRINSICS) + return __builtin_popcount(value); +#else + value = value - ((value >> 1) & 0x55555555); + value = (value & 0x33333333) + ((value >> 2) & 0x33333333); + value = (value + (value >> 4)) & 0x0F0F0F0F; + return (value * 0x01010101) >> 24; +#endif +#elif defined(PNANOVDB_HLSL) + return countbits(value); +#elif defined(PNANOVDB_GLSL) + return bitCount(value); +#endif +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_countbits(pnanovdb_uint64_t value) +{ + return pnanovdb_uint32_countbits(pnanovdb_uint64_low(value)) + pnanovdb_uint32_countbits(pnanovdb_uint64_high(value)); +} + +#if defined(PNANOVDB_ADDRESS_32) +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_offset(pnanovdb_uint64_t a, pnanovdb_uint32_t b) +{ + pnanovdb_uint32_t low = pnanovdb_uint64_low(a); + pnanovdb_uint32_t high = pnanovdb_uint64_high(a); + low += b; + if (low < b) + { + high += 1u; + } + return pnanovdb_uint32_as_uint64(low, high); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_dec(pnanovdb_uint64_t a) +{ + pnanovdb_uint32_t low = pnanovdb_uint64_low(a); + pnanovdb_uint32_t high = pnanovdb_uint64_high(a); + if (low == 0u) + { + high -= 1u; + } + low -= 1u; + return pnanovdb_uint32_as_uint64(low, high); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_to_uint32_lsr(pnanovdb_uint64_t a, pnanovdb_uint32_t b) +{ + pnanovdb_uint32_t low = pnanovdb_uint64_low(a); + pnanovdb_uint32_t high = pnanovdb_uint64_high(a); + return (b >= 32u) ? + (high >> (b - 32)) : + ((low >> b) | ((b > 0) ? (high << (32u - b)) : 0u)); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_bit_mask(pnanovdb_uint32_t bit_idx) +{ + pnanovdb_uint32_t mask_low = bit_idx < 32u ? 1u << bit_idx : 0u; + pnanovdb_uint32_t mask_high = bit_idx >= 32u ? 
1u << (bit_idx - 32u) : 0u; + return pnanovdb_uint32_as_uint64(mask_low, mask_high); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_and(pnanovdb_uint64_t a, pnanovdb_uint64_t b) +{ + return pnanovdb_uint32_as_uint64( + pnanovdb_uint64_low(a) & pnanovdb_uint64_low(b), + pnanovdb_uint64_high(a) & pnanovdb_uint64_high(b) + ); +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_uint64_any_bit(pnanovdb_uint64_t a) +{ + return pnanovdb_uint64_low(a) != 0u || pnanovdb_uint64_high(a) != 0u; +} + +#else +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_offset(pnanovdb_uint64_t a, pnanovdb_uint32_t b) +{ + return a + b; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_dec(pnanovdb_uint64_t a) +{ + return a - 1u; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_uint64_to_uint32_lsr(pnanovdb_uint64_t a, pnanovdb_uint32_t b) +{ + return pnanovdb_uint64_low(a >> b); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_bit_mask(pnanovdb_uint32_t bit_idx) +{ + return 1llu << bit_idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_uint64_and(pnanovdb_uint64_t a, pnanovdb_uint64_t b) +{ + return a & b; +} + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_uint64_any_bit(pnanovdb_uint64_t a) +{ + return a != 0llu; +} +#endif + // ------------------------------------------------ Address Type ----------------------------------------------------------- #if defined(PNANOVDB_ADDRESS_32) @@ -527,6 +696,12 @@ PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset64(pnanovdb_addr ret.byte_offset += pnanovdb_uint64_low(byte_offset); return ret; } +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset64_product(pnanovdb_address_t address, pnanovdb_uint64_t byte_offset, pnanovdb_uint32_t multiplier) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += pnanovdb_uint64_low(byte_offset) * multiplier; + return ret; +} PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_address_mask(pnanovdb_address_t address, pnanovdb_uint32_t mask) { return address.byte_offset & mask; @@ -581,6 +756,12 @@ PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset64(pnanovdb_addr ret.byte_offset += byte_offset; return ret; } +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_address_offset64_product(pnanovdb_address_t address, pnanovdb_uint64_t byte_offset, pnanovdb_uint32_t multiplier) +{ + pnanovdb_address_t ret = address; + ret.byte_offset += byte_offset * pnanovdb_uint32_as_uint64_low(multiplier); + return ret; +} PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_address_mask(pnanovdb_address_t address, pnanovdb_uint32_t mask) { return pnanovdb_uint64_low(address.byte_offset) & mask; @@ -640,12 +821,41 @@ PNANOVDB_FORCE_INLINE pnanovdb_coord_t pnanovdb_read_coord(pnanovdb_buf_t buf, p ret.z = pnanovdb_uint32_as_int32(pnanovdb_read_uint32(buf, pnanovdb_address_offset(address, 8u))); return ret; } -PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_read_vec3f(pnanovdb_buf_t buf, pnanovdb_address_t address) +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_read_vec3(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_vec3_t ret; + ret.x = pnanovdb_read_float(buf, pnanovdb_address_offset(address, 0u)); + ret.y = pnanovdb_read_float(buf, pnanovdb_address_offset(address, 4u)); + ret.z = pnanovdb_read_float(buf, pnanovdb_address_offset(address, 8u)); + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_read_uint16(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_uint32_t raw = pnanovdb_read_uint32(buf, 
pnanovdb_address_mask_inv(address, 3u)); + return (raw >> (pnanovdb_address_mask(address, 2) << 3)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_read_uint8(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_uint32_t raw = pnanovdb_read_uint32(buf, pnanovdb_address_mask_inv(address, 3u)); + return (raw >> (pnanovdb_address_mask(address, 3) << 3)) & 255; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_read_vec3u16(pnanovdb_buf_t buf, pnanovdb_address_t address) { pnanovdb_vec3_t ret; - ret.x = pnanovdb_uint32_as_float(pnanovdb_read_uint32(buf, pnanovdb_address_offset(address, 0u))); - ret.y = pnanovdb_uint32_as_float(pnanovdb_read_uint32(buf, pnanovdb_address_offset(address, 4u))); - ret.z = pnanovdb_uint32_as_float(pnanovdb_read_uint32(buf, pnanovdb_address_offset(address, 8u))); + const float scale = 1.f / 65535.f; + ret.x = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint16(buf, pnanovdb_address_offset(address, 0u))) - 0.5f; + ret.y = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint16(buf, pnanovdb_address_offset(address, 2u))) - 0.5f; + ret.z = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint16(buf, pnanovdb_address_offset(address, 4u))) - 0.5f; + return ret; +} +PNANOVDB_FORCE_INLINE pnanovdb_vec3_t pnanovdb_read_vec3u8(pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + pnanovdb_vec3_t ret; + const float scale = 1.f / 255.f; + ret.x = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint8(buf, pnanovdb_address_offset(address, 0u))) - 0.5f; + ret.y = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint8(buf, pnanovdb_address_offset(address, 1u))) - 0.5f; + ret.z = scale * pnanovdb_uint32_to_float(pnanovdb_read_uint8(buf, pnanovdb_address_offset(address, 2u))) - 0.5f; return ret; } @@ -677,13 +887,54 @@ PNANOVDB_FORCE_INLINE float pnanovdb_read_half(pnanovdb_buf_t buf, pnanovdb_addr } #endif +// ------------------------------------------------ High Level Buffer Write ----------------------------------------------------------- + +PNANOVDB_FORCE_INLINE void pnanovdb_write_uint32(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_uint32_t value) +{ + pnanovdb_buf_write_uint32(buf, address.byte_offset, value); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_uint64(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_uint64_t value) +{ + pnanovdb_buf_write_uint64(buf, address.byte_offset, value); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_int32(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_int32_t value) +{ + pnanovdb_write_uint32(buf, address, pnanovdb_int32_as_uint32(value)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_int64(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_int64_t value) +{ + pnanovdb_buf_write_uint64(buf, address.byte_offset, pnanovdb_int64_as_uint64(value)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_float(pnanovdb_buf_t buf, pnanovdb_address_t address, float value) +{ + pnanovdb_write_uint32(buf, address, pnanovdb_float_as_uint32(value)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_double(pnanovdb_buf_t buf, pnanovdb_address_t address, double value) +{ + pnanovdb_write_uint64(buf, address, pnanovdb_double_as_uint64(value)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_coord(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) value) +{ + pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 0u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).x)); + pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 4u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).y)); + 
pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 8u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).z)); +} +PNANOVDB_FORCE_INLINE void pnanovdb_write_vec3(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_vec3_t) value) +{ + pnanovdb_write_float(buf, pnanovdb_address_offset(address, 0u), PNANOVDB_DEREF(value).x); + pnanovdb_write_float(buf, pnanovdb_address_offset(address, 4u), PNANOVDB_DEREF(value).y); + pnanovdb_write_float(buf, pnanovdb_address_offset(address, 8u), PNANOVDB_DEREF(value).z); +} + // ------------------------------------------------ Core Structures ----------------------------------------------------------- #define PNANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL// "NanoVDB0" in hex - little endian (uint64_t) +#define PNANOVDB_MAGIC_GRID 0x314244566f6e614eUL// "NanoVDB1" in hex - little endian (uint64_t) +#define PNANOVDB_MAGIC_FILE 0x324244566f6e614eUL// "NanoVDB2" in hex - little endian (uint64_t) #define PNANOVDB_MAJOR_VERSION_NUMBER 32// reflects changes to the ABI -#define PNANOVDB_MINOR_VERSION_NUMBER 3// reflects changes to the API but not ABI -#define PNANOVDB_PATCH_VERSION_NUMBER 3// reflects bug-fixes with no ABI or API changes +#define PNANOVDB_MINOR_VERSION_NUMBER 7// reflects changes to the API but not ABI +#define PNANOVDB_PATCH_VERSION_NUMBER 0// reflects bug-fixes with no ABI or API changes #define PNANOVDB_GRID_TYPE_UNKNOWN 0 #define PNANOVDB_GRID_TYPE_FLOAT 1 @@ -704,17 +955,27 @@ PNANOVDB_FORCE_INLINE float pnanovdb_read_half(pnanovdb_buf_t buf, pnanovdb_addr #define PNANOVDB_GRID_TYPE_FPN 16 #define PNANOVDB_GRID_TYPE_VEC4F 17 #define PNANOVDB_GRID_TYPE_VEC4D 18 -#define PNANOVDB_GRID_TYPE_END 19 +#define PNANOVDB_GRID_TYPE_INDEX 19 +#define PNANOVDB_GRID_TYPE_ONINDEX 20 +#define PNANOVDB_GRID_TYPE_INDEXMASK 21 +#define PNANOVDB_GRID_TYPE_ONINDEXMASK 22 +#define PNANOVDB_GRID_TYPE_POINTINDEX 23 +#define PNANOVDB_GRID_TYPE_VEC3U8 24 +#define PNANOVDB_GRID_TYPE_VEC3U16 25 +#define PNANOVDB_GRID_TYPE_UINT8 26 +#define PNANOVDB_GRID_TYPE_END 27 #define PNANOVDB_GRID_CLASS_UNKNOWN 0 -#define PNANOVDB_GRID_CLASS_LEVEL_SET 1 // narrow band levelset, e.g. SDF +#define PNANOVDB_GRID_CLASS_LEVEL_SET 1 // narrow band level set, e.g. SDF #define PNANOVDB_GRID_CLASS_FOG_VOLUME 2 // fog volume, e.g. density #define PNANOVDB_GRID_CLASS_STAGGERED 3 // staggered MAC grid, e.g. velocity #define PNANOVDB_GRID_CLASS_POINT_INDEX 4 // point index grid #define PNANOVDB_GRID_CLASS_POINT_DATA 5 // point data grid #define PNANOVDB_GRID_CLASS_TOPOLOGY 6 // grid with active states only (no values) #define PNANOVDB_GRID_CLASS_VOXEL_VOLUME 7 // volume of geometric cubes, e.g. minecraft -#define PNANOVDB_GRID_CLASS_END 8 +#define PNANOVDB_GRID_CLASS_INDEX_GRID 8 // grid whose values are offsets, e.g. 
into an external array +#define PNANOVDB_GRID_CLASS_TENSOR_GRID 9 // grid which can have extra metadata and features +#define PNANOVDB_GRID_CLASS_END 10 #define PNANOVDB_GRID_FLAGS_HAS_LONG_GRID_NAME (1 << 0) #define PNANOVDB_GRID_FLAGS_HAS_BBOX (1 << 1) @@ -727,13 +988,23 @@ PNANOVDB_FORCE_INLINE float pnanovdb_read_half(pnanovdb_buf_t buf, pnanovdb_addr #define PNANOVDB_LEAF_TYPE_DEFAULT 0 #define PNANOVDB_LEAF_TYPE_LITE 1 #define PNANOVDB_LEAF_TYPE_FP 2 - -PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_value_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 0, 16, 32, 1, 32, 4, 8, 16, 0, 128, 256 }; -PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_table_strides_bits[PNANOVDB_GRID_TYPE_END] = { 64, 64, 64, 64, 64, 64, 128, 192, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 256 }; -PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 8, 16, 32, 8, 32, 32, 32, 32, 32, 128, 256 }; -PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_aligns_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 32, 64, 8, 16, 32, 8, 32, 32, 32, 32, 32, 32, 64 }; -PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_stat_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 32, 32, 64, 32, 64, 8, 32, 32, 8, 32, 32, 32, 32, 32, 32, 64 }; -PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_leaf_type[PNANOVDB_GRID_TYPE_END] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 2, 2, 0, 0 }; +#define PNANOVDB_LEAF_TYPE_INDEX 3 +#define PNANOVDB_LEAF_TYPE_INDEXMASK 4 +#define PNANOVDB_LEAF_TYPE_POINTINDEX 5 + +// BuildType = Unknown, float, double, int16_t, int32_t, int64_t, Vec3f, Vec3d, Mask, ... +// bit count of values in leaf nodes, i.e. 8*sizeof(*nanovdb::LeafNode::mValues) or zero if no values are stored +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_value_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 0, 16, 32, 1, 32, 4, 8, 16, 0, 128, 256, 0, 0, 0, 0, 16, 24, 48, 8 }; +// bit count of the Tile union in InternalNodes, i.e. 8*sizeof(nanovdb::InternalData::Tile) +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_table_strides_bits[PNANOVDB_GRID_TYPE_END] = { 64, 64, 64, 64, 64, 64, 128, 192, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 256, 64, 64, 64, 64, 64, 64, 64, 64 }; +// bit count of min/max values, i.e. 8*sizeof(nanovdb::LeafData::mMinimum) or zero if no min/max exists +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 96, 192, 8, 16, 32, 8, 32, 32, 32, 32, 32, 128, 256, 64, 64, 64, 64, 64, 24, 48, 8 }; +// bit alignment of the value type, controlled by the smallest native type, which is why it is always 0, 8, 16, 32, or 64, e.g. for Vec3f it is 32 +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_minmax_aligns_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 16, 32, 64, 32, 64, 8, 16, 32, 8, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 8, 16, 8 }; +// bit alignment of the stats (avg/std-dev) types, e.g. 8*sizeof(nanovdb::LeafData::mAverage) +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_stat_strides_bits[PNANOVDB_GRID_TYPE_END] = { 0, 32, 64, 32, 32, 64, 32, 64, 8, 32, 32, 8, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 32, 32, 32 }; +// one of the 4 leaf types defined above, e.g. 
PNANOVDB_LEAF_TYPE_INDEX = 3 +PNANOVDB_STATIC_CONST pnanovdb_uint32_t pnanovdb_grid_type_leaf_type[PNANOVDB_GRID_TYPE_END] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 2, 2, 0, 0, 3, 3, 4, 4, 5, 0, 0, 0 }; struct pnanovdb_map_t { @@ -786,6 +1057,31 @@ PNANOVDB_FORCE_INLINE double pnanovdb_map_get_taperd(pnanovdb_buf_t buf, pnanovd return pnanovdb_read_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERD)); } +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_matf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float matf) { + pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_MATF + 4u * index), matf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_invmatf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float invmatf) { + pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_INVMATF + 4u * index), invmatf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_vecf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float vecf) { + pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_VECF + 4u * index), vecf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_taperf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float taperf) { + pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERF), taperf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_matd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double matd) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_MATD + 8u * index), matd); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_invmatd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double invmatd) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_INVMATD + 8u * index), invmatd); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_vecd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double vecd) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_VECD + 8u * index), vecd); +} +PNANOVDB_FORCE_INLINE void pnanovdb_map_set_taperd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double taperd) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERD), taperd); +} + struct pnanovdb_grid_t { pnanovdb_uint64_t magic; // 8 bytes, 0 @@ -875,6 +1171,54 @@ PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_grid_get_blind_metadata_count(p return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_COUNT)); } +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_magic(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t magic) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_MAGIC), magic); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_checksum(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t checksum) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_CHECKSUM), checksum); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_version(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t version) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_VERSION), version); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_flags(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t flags) { + pnanovdb_write_uint32(buf, 
pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_FLAGS), flags); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_index(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_index) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_INDEX), grid_index); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_count(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_count) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_COUNT), grid_count); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_size(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t grid_size) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_SIZE), grid_size); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_name(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, pnanovdb_uint32_t grid_name) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_NAME + 4u * index), grid_name); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_world_bbox(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, double world_bbox) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_WORLD_BBOX + 8u * index), world_bbox); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_voxel_size(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, double voxel_size) { + pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_VOXEL_SIZE + 8u * index), voxel_size); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_class(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_class) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_CLASS), grid_class); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_type(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_type) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_TYPE), grid_type); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_blind_metadata_offset(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t blind_metadata_offset) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_OFFSET), blind_metadata_offset); +} +PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_blind_metadata_count(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t metadata_count) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_COUNT), metadata_count); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_make_version(pnanovdb_uint32_t major, pnanovdb_uint32_t minor, pnanovdb_uint32_t patch_num) +{ + return (major << 21u) | (minor << 10u) | patch_num; +} + PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_version_get_major(pnanovdb_uint32_t version) { return (version >> 21u) & ((1u << 11u) - 1u); @@ -890,9 +1234,9 @@ PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_version_get_patch(pnanovdb_uint struct pnanovdb_gridblindmetadata_t { - pnanovdb_int64_t byte_offset; // 8 bytes, 0 - pnanovdb_uint64_t element_count; // 8 bytes, 8 - pnanovdb_uint32_t flags; // 4 bytes, 16 + pnanovdb_int64_t data_offset; // 8 bytes, 0 + pnanovdb_uint64_t value_count; // 8 bytes, 8 + pnanovdb_uint32_t value_size; // 4 bytes, 16 pnanovdb_uint32_t semantic; // 4 bytes, 20 pnanovdb_uint32_t data_class; // 4 bytes, 24 pnanovdb_uint32_t 
data_type; // 4 bytes, 28 @@ -904,22 +1248,22 @@ PNANOVDB_STRUCT_TYPEDEF(pnanovdb_gridblindmetadata_handle_t) #define PNANOVDB_GRIDBLINDMETADATA_SIZE 288 -#define PNANOVDB_GRIDBLINDMETADATA_OFF_BYTE_OFFSET 0 -#define PNANOVDB_GRIDBLINDMETADATA_OFF_ELEMENT_COUNT 8 -#define PNANOVDB_GRIDBLINDMETADATA_OFF_FLAGS 16 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_OFFSET 0 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_COUNT 8 +#define PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_SIZE 16 #define PNANOVDB_GRIDBLINDMETADATA_OFF_SEMANTIC 20 #define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_CLASS 24 #define PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_TYPE 28 #define PNANOVDB_GRIDBLINDMETADATA_OFF_NAME 32 -PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_gridblindmetadata_get_byte_offset(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { - return pnanovdb_read_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_BYTE_OFFSET)); +PNANOVDB_FORCE_INLINE pnanovdb_int64_t pnanovdb_gridblindmetadata_get_data_offset(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { + return pnanovdb_read_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_DATA_OFFSET)); } -PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_gridblindmetadata_get_element_count(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { - return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_ELEMENT_COUNT)); +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_gridblindmetadata_get_value_count(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { + return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_COUNT)); } -PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_flags(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { - return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_FLAGS)); +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_value_size(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { + return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_VALUE_SIZE)); } PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_gridblindmetadata_get_semantic(pnanovdb_buf_t buf, pnanovdb_gridblindmetadata_handle_t p) { return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRIDBLINDMETADATA_OFF_SEMANTIC)); @@ -1000,6 +1344,40 @@ PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_tree_get_voxel_count(pnanovdb_b return pnanovdb_read_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_VOXEL_COUNT)); } +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_leaf) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_LEAF), node_offset_leaf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_lower) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_LOWER), node_offset_lower); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_upper) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_UPPER), node_offset_upper); +} +PNANOVDB_FORCE_INLINE void 
pnanovdb_tree_set_node_offset_root(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_root) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_ROOT), node_offset_root); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_leaf) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_LEAF), node_count_leaf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_lower) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_LOWER), node_count_lower); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_upper) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_UPPER), node_count_upper); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_leaf) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_LEAF), tile_count_leaf); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_lower) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_LOWER), tile_count_lower); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_upper) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_UPPER), tile_count_upper); +} +PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_voxel_count(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t voxel_count) { + pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_VOXEL_COUNT), voxel_count); +} + struct pnanovdb_root_t { pnanovdb_coord_t bbox_min; @@ -1028,6 +1406,16 @@ PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_root_get_tile_count(pnanovdb_bu return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_TABLE_SIZE)); } +PNANOVDB_FORCE_INLINE void pnanovdb_root_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_BBOX_MIN), bbox_min); +} +PNANOVDB_FORCE_INLINE void pnanovdb_root_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_BBOX_MAX), bbox_max); +} +PNANOVDB_FORCE_INLINE void pnanovdb_root_set_tile_count(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, pnanovdb_uint32_t tile_count) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_TABLE_SIZE), tile_count); +} + struct pnanovdb_root_tile_t { pnanovdb_uint64_t key; @@ -1056,6 +1444,16 @@ PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_root_tile_get_state(pnanovdb_bu return pnanovdb_read_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_STATE)); } +PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_key(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p, pnanovdb_uint64_t key) { + pnanovdb_write_uint64(buf, 
pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_KEY), key); +} +PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_child(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p, pnanovdb_int64_t child) { + pnanovdb_write_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_CHILD), child); +} +PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_state(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p, pnanovdb_uint32_t state) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_STATE), state); +} + struct pnanovdb_upper_t { pnanovdb_coord_t bbox_min; @@ -1097,6 +1495,20 @@ PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_upper_get_child_mask(pnanovdb_buf return ((value >> (bit_index & 31u)) & 1) != 0u; } +PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_BBOX_MIN), bbox_min); +} +PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_BBOX_MAX), bbox_max); +} +PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_child_mask(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, pnanovdb_uint32_t bit_index, pnanovdb_bool_t value) { + pnanovdb_address_t addr = pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_CHILD_MASK + 4u * (bit_index >> 5u)); + pnanovdb_uint32_t valueMask = pnanovdb_read_uint32(buf, addr); + if (!value) { valueMask &= ~(1u << (bit_index & 31u)); } + if (value) valueMask |= (1u << (bit_index & 31u)); + pnanovdb_write_uint32(buf, addr, valueMask); +} + struct pnanovdb_lower_t { pnanovdb_coord_t bbox_min; @@ -1138,6 +1550,20 @@ PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_lower_get_child_mask(pnanovdb_buf return ((value >> (bit_index & 31u)) & 1) != 0u; } +PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_BBOX_MIN), bbox_min); +} +PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_BBOX_MAX), bbox_max); +} +PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_child_mask(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, pnanovdb_uint32_t bit_index, pnanovdb_bool_t value) { + pnanovdb_address_t addr = pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_CHILD_MASK + 4u * (bit_index >> 5u)); + pnanovdb_uint32_t valueMask = pnanovdb_read_uint32(buf, addr); + if (!value) { valueMask &= ~(1u << (bit_index & 31u)); } + if (value) valueMask |= (1u << (bit_index & 31u)); + pnanovdb_write_uint32(buf, addr, valueMask); +} + struct pnanovdb_leaf_t { pnanovdb_coord_t bbox_min; @@ -1172,6 +1598,13 @@ PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_get_value_mask(pnanovdb_buf_ return ((value >> (bit_index & 31u)) & 1) != 0u; } +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { + pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_BBOX_MIN), bbox_min); +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_set_bbox_dif_and_flags(pnanovdb_buf_t buf, 
pnanovdb_leaf_handle_t p, pnanovdb_uint32_t bbox_dif_and_flags) { + pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_BBOX_DIF_AND_FLAGS), bbox_dif_and_flags); +} + struct pnanovdb_grid_type_constants_t { pnanovdb_uint32_t root_off_background; @@ -1205,27 +1638,36 @@ struct pnanovdb_grid_type_constants_t }; PNANOVDB_STRUCT_TYPEDEF(pnanovdb_grid_type_constants_t) +// The following table with offsets will nedd to be updates as new GridTypes are added in NanoVDB.h PNANOVDB_STATIC_CONST pnanovdb_grid_type_constants_t pnanovdb_grid_type_constants[PNANOVDB_GRID_TYPE_END] = { - {28, 28, 28, 28, 28, 32, 0, 8, 20, 32, 8224, 8224, 8224, 8224, 8224, 270368, 1056, 1056, 1056, 1056, 1056, 33824, 80, 80, 80, 80, 96, 96}, - {28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, - {32, 40, 48, 56, 64, 96, 64, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 104, 128, 4224}, - {28, 30, 32, 36, 40, 64, 16, 8, 20, 32, 8224, 8226, 8228, 8232, 8256, 270400, 1056, 1058, 1060, 1064, 1088, 33856, 80, 82, 84, 88, 96, 1120}, - {28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, - {32, 40, 48, 56, 64, 96, 64, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 104, 128, 4224}, - {28, 40, 52, 64, 68, 96, 96, 16, 20, 32, 8224, 8236, 8248, 8252, 8256, 532544, 1056, 1068, 1080, 1084, 1088, 66624, 80, 92, 104, 108, 128, 6272}, - {32, 56, 80, 104, 112, 128, 192, 24, 24, 64, 8224, 8248, 8272, 8280, 8288, 794720, 1056, 1080, 1104, 1112, 1120, 99424, 80, 104, 128, 136, 160, 12448}, - {28, 29, 30, 31, 32, 64, 0, 8, 20, 32, 8224, 8225, 8226, 8227, 8256, 270400, 1056, 1057, 1058, 1059, 1088, 33856, 80, 80, 80, 80, 96, 96}, - {28, 30, 32, 36, 40, 64, 16, 8, 20, 32, 8224, 8226, 8228, 8232, 8256, 270400, 1056, 1058, 1060, 1064, 1088, 33856, 80, 82, 84, 88, 96, 1120}, - {28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, - {28, 29, 30, 31, 32, 64, 1, 8, 20, 32, 8224, 8225, 8226, 8227, 8256, 270400, 1056, 1057, 1058, 1059, 1088, 33856, 80, 80, 80, 80, 96, 160}, - {28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, - {28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 352}, - {28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 608}, - {28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 1120}, - {28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 96}, - {28, 44, 60, 76, 80, 96, 128, 16, 20, 64, 8224, 8240, 8256, 8260, 8288, 532576, 1056, 1072, 1088, 1092, 1120, 66656, 80, 96, 112, 116, 128, 8320}, - {32, 64, 96, 128, 136, 160, 256, 32, 24, 64, 8224, 8256, 8288, 8296, 8320, 1056896, 1056, 1088, 1120, 1128, 1152, 132224, 80, 112, 144, 152, 160, 16544}, +{28, 28, 28, 28, 28, 32, 0, 8, 20, 32, 8224, 8224, 8224, 8224, 8224, 270368, 1056, 1056, 1056, 1056, 1056, 33824, 80, 80, 80, 80, 96, 96}, +{28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 
8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, +{32, 40, 48, 56, 64, 96, 64, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 104, 128, 4224}, +{28, 30, 32, 36, 40, 64, 16, 8, 20, 32, 8224, 8226, 8228, 8232, 8256, 270400, 1056, 1058, 1060, 1064, 1088, 33856, 80, 82, 84, 88, 96, 1120}, +{28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, +{32, 40, 48, 56, 64, 96, 64, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 104, 128, 4224}, +{28, 40, 52, 64, 68, 96, 96, 16, 20, 32, 8224, 8236, 8248, 8252, 8256, 532544, 1056, 1068, 1080, 1084, 1088, 66624, 80, 92, 104, 108, 128, 6272}, +{32, 56, 80, 104, 112, 128, 192, 24, 24, 64, 8224, 8248, 8272, 8280, 8288, 794720, 1056, 1080, 1104, 1112, 1120, 99424, 80, 104, 128, 136, 160, 12448}, +{28, 29, 30, 31, 32, 64, 0, 8, 20, 32, 8224, 8225, 8226, 8227, 8256, 270400, 1056, 1057, 1058, 1059, 1088, 33856, 80, 80, 80, 80, 96, 96}, +{28, 30, 32, 36, 40, 64, 16, 8, 20, 32, 8224, 8226, 8228, 8232, 8256, 270400, 1056, 1058, 1060, 1064, 1088, 33856, 80, 82, 84, 88, 96, 1120}, +{28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, +{28, 29, 30, 31, 32, 64, 1, 8, 20, 32, 8224, 8225, 8226, 8227, 8256, 270400, 1056, 1057, 1058, 1059, 1088, 33856, 80, 80, 80, 80, 96, 160}, +{28, 32, 36, 40, 44, 64, 32, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 80, 84, 88, 92, 96, 2144}, +{28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 352}, +{28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 608}, +{28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 1120}, +{28, 32, 36, 40, 44, 64, 0, 8, 20, 32, 8224, 8228, 8232, 8236, 8256, 270400, 1056, 1060, 1064, 1068, 1088, 33856, 88, 90, 92, 94, 96, 96}, +{28, 44, 60, 76, 80, 96, 128, 16, 20, 64, 8224, 8240, 8256, 8260, 8288, 532576, 1056, 1072, 1088, 1092, 1120, 66656, 80, 96, 112, 116, 128, 8320}, +{32, 64, 96, 128, 136, 160, 256, 32, 24, 64, 8224, 8256, 8288, 8296, 8320, 1056896, 1056, 1088, 1120, 1128, 1152, 132224, 80, 112, 144, 152, 160, 16544}, +{32, 40, 48, 56, 64, 96, 0, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 80, 80, 80, 80, 96}, +{32, 40, 48, 56, 64, 96, 0, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 80, 80, 80, 80, 96}, +{32, 40, 48, 56, 64, 96, 0, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 80, 80, 80, 80, 160}, +{32, 40, 48, 56, 64, 96, 0, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 80, 80, 80, 80, 160}, +{32, 40, 48, 56, 64, 96, 16, 8, 24, 32, 8224, 8232, 8240, 8248, 8256, 270400, 1056, 1064, 1072, 1080, 1088, 33856, 80, 88, 96, 96, 96, 1120}, +{28, 31, 34, 40, 44, 64, 24, 8, 20, 32, 8224, 8227, 8232, 8236, 8256, 270400, 1056, 1059, 1064, 1068, 1088, 33856, 80, 83, 88, 92, 96, 1632}, +{28, 34, 40, 48, 52, 64, 48, 8, 20, 32, 8224, 8230, 8236, 8240, 8256, 270400, 1056, 1062, 1068, 1072, 1088, 33856, 80, 86, 92, 96, 128, 3200}, +{28, 29, 30, 32, 
36, 64, 8, 8, 20, 32, 8224, 8225, 8228, 8232, 8256, 270400, 1056, 1057, 1060, 1064, 1088, 33856, 80, 81, 84, 88, 96, 608}, }; // ------------------------------------------------ Basic Lookup ----------------------------------------------------------- @@ -1239,12 +1681,11 @@ PNANOVDB_FORCE_INLINE pnanovdb_gridblindmetadata_handle_t pnanovdb_grid_get_grid return meta; } -PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanodvb_grid_get_gridblindmetadata_value_address(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, pnanovdb_uint32_t index) +PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_grid_get_gridblindmetadata_value_address(pnanovdb_buf_t buf, pnanovdb_grid_handle_t grid, pnanovdb_uint32_t index) { pnanovdb_gridblindmetadata_handle_t meta = pnanovdb_grid_get_gridblindmetadata(buf, grid, index); - pnanovdb_int64_t byte_offset = pnanovdb_gridblindmetadata_get_byte_offset(buf, meta); - pnanovdb_address_t address = grid.address; - address = pnanovdb_address_offset64(address, pnanovdb_int64_as_uint64(byte_offset)); + pnanovdb_int64_t byte_offset = pnanovdb_gridblindmetadata_get_data_offset(buf, meta); + pnanovdb_address_t address = pnanovdb_address_offset64(meta.address, pnanovdb_int64_as_uint64(byte_offset)); return address; } @@ -1319,6 +1760,8 @@ PNANOVDB_FORCE_INLINE pnanovdb_root_tile_handle_t pnanovdb_root_find_tile(pnanov return null_handle; } +// ----------------------------- Leaf Node --------------------------------------- + PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_leaf_coord_to_offset(PNANOVDB_IN(pnanovdb_coord_t) ijk) { return (((PNANOVDB_DEREF(ijk).x & 7) >> 0) << (2 * 3)) + @@ -1362,6 +1805,8 @@ PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_leaf_get_value_address(pnanovd return pnanovdb_leaf_get_table_address(grid_type, buf, leaf, n); } +// ----------------------------- Leaf FP Types Specialization --------------------------------------- + PNANOVDB_FORCE_INLINE float pnanovdb_leaf_fp_read_float(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t value_log_bits) { // value_log_bits // 2 3 4 @@ -1401,11 +1846,301 @@ PNANOVDB_FORCE_INLINE float pnanovdb_leaf_fpn_read_float(pnanovdb_buf_t buf, pna return pnanovdb_leaf_fp_read_float(buf, address, ijk, value_log_bits); } +// ----------------------------- Leaf Index Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_index_has_stats(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return (pnanovdb_leaf_get_bbox_dif_and_flags(buf, leaf) & (1u << 28u)) != 0u; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_min_index(pnanovdb_buf_t buf, pnanovdb_address_t min_address) +{ + return pnanovdb_uint64_offset(pnanovdb_read_uint64(buf, min_address), 512u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_max_index(pnanovdb_buf_t buf, pnanovdb_address_t max_address) +{ + return pnanovdb_uint64_offset(pnanovdb_read_uint64(buf, max_address), 513u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_ave_index(pnanovdb_buf_t buf, pnanovdb_address_t ave_address) +{ + return pnanovdb_uint64_offset(pnanovdb_read_uint64(buf, ave_address), 514u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_dev_index(pnanovdb_buf_t buf, pnanovdb_address_t dev_address) +{ + return pnanovdb_uint64_offset(pnanovdb_read_uint64(buf, dev_address), 515u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_index_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t 
value_address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + pnanovdb_uint64_t offset = pnanovdb_read_uint64(buf, value_address); + return pnanovdb_uint64_offset(offset, n); +} + +// ----------------------------- Leaf IndexMask Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_indexmask_has_stats(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_leaf_index_has_stats(buf, leaf); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_min_index(pnanovdb_buf_t buf, pnanovdb_address_t min_address) +{ + return pnanovdb_leaf_index_get_min_index(buf, min_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_max_index(pnanovdb_buf_t buf, pnanovdb_address_t max_address) +{ + return pnanovdb_leaf_index_get_max_index(buf, max_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_ave_index(pnanovdb_buf_t buf, pnanovdb_address_t ave_address) +{ + return pnanovdb_leaf_index_get_ave_index(buf, ave_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_dev_index(pnanovdb_buf_t buf, pnanovdb_address_t dev_address) +{ + return pnanovdb_leaf_index_get_dev_index(buf, dev_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_indexmask_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t value_address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return pnanovdb_leaf_index_get_value_index(buf, value_address, ijk); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_indexmask_get_mask_bit(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t n) +{ + pnanovdb_uint32_t word_idx = n >> 5; + pnanovdb_uint32_t bit_idx = n & 31; + pnanovdb_uint32_t val_mask = + pnanovdb_read_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx)); + return (val_mask & (1u << bit_idx)) != 0u; +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_indexmask_set_mask_bit(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t n, pnanovdb_bool_t v) +{ + pnanovdb_uint32_t word_idx = n >> 5; + pnanovdb_uint32_t bit_idx = n & 31; + pnanovdb_uint32_t val_mask = + pnanovdb_read_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx)); + if (v) + { + val_mask = val_mask | (1u << bit_idx); + } + else + { + val_mask = val_mask & ~(1u << bit_idx); + } + pnanovdb_write_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx), val_mask); +} + +// ----------------------------- Leaf OnIndex Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_leaf_onindex_get_value_count(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + pnanovdb_uint64_t val_mask = pnanovdb_read_uint64(buf, pnanovdb_address_offset(leaf.address, PNANOVDB_LEAF_OFF_VALUE_MASK + 8u * 7u)); + pnanovdb_uint64_t prefix_sum = pnanovdb_read_uint64( + buf, pnanovdb_address_offset(leaf.address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table) + 8u)); + return pnanovdb_uint64_countbits(val_mask) + (pnanovdb_uint64_to_uint32_lsr(prefix_sum, 54u) & 511u); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_last_offset(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_uint64_offset( + pnanovdb_read_uint64(buf, pnanovdb_address_offset(leaf.address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table))), + pnanovdb_leaf_onindex_get_value_count(buf, leaf) - 1u); +} + 
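+// Layout note for the OnIndex specialization: an OnIndex leaf stores values only for active
+// voxels. Table word 0 holds the leaf's base offset into the external value array and table
+// word 1 packs seven 9-bit prefix sums of the value-mask popcounts, so a voxel's linear index
+// is the base offset plus the rank of its bit in the value mask, without scanning all eight
+// mask words; inactive voxels resolve to index 0 (the background value). When bit 28 of
+// bbox_dif_and_flags is set, per-leaf min/max/average/deviation stats are stored directly
+// after the last active value, at offsets +1 to +4.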
+PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_onindex_has_stats(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return (pnanovdb_leaf_get_bbox_dif_and_flags(buf, leaf) & (1u << 28u)) != 0u; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_min_index(pnanovdb_buf_t buf, pnanovdb_address_t min_address) +{ + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(min_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + pnanovdb_uint64_t idx = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_leaf_onindex_has_stats(buf, leaf)) + { + idx = pnanovdb_uint64_offset(pnanovdb_leaf_onindex_get_last_offset(buf, leaf), 1u); + } + return idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_max_index(pnanovdb_buf_t buf, pnanovdb_address_t max_address) +{ + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(max_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + pnanovdb_uint64_t idx = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_leaf_onindex_has_stats(buf, leaf)) + { + idx = pnanovdb_uint64_offset(pnanovdb_leaf_onindex_get_last_offset(buf, leaf), 2u); + } + return idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_ave_index(pnanovdb_buf_t buf, pnanovdb_address_t ave_address) +{ + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(ave_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + pnanovdb_uint64_t idx = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_leaf_onindex_has_stats(buf, leaf)) + { + idx = pnanovdb_uint64_offset(pnanovdb_leaf_onindex_get_last_offset(buf, leaf), 3u); + } + return idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_dev_index(pnanovdb_buf_t buf, pnanovdb_address_t dev_address) +{ + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(dev_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + pnanovdb_uint64_t idx = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_leaf_onindex_has_stats(buf, leaf)) + { + idx = pnanovdb_uint64_offset(pnanovdb_leaf_onindex_get_last_offset(buf, leaf), 4u); + } + return idx; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindex_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t value_address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(value_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_ONINDEX, leaf_off_table)) }; + + pnanovdb_uint32_t word_idx = n >> 6u; + pnanovdb_uint32_t bit_idx = n & 63u; + pnanovdb_uint64_t val_mask = pnanovdb_read_uint64(buf, pnanovdb_address_offset(leaf.address, PNANOVDB_LEAF_OFF_VALUE_MASK + 8u * word_idx)); + pnanovdb_uint64_t mask = pnanovdb_uint64_bit_mask(bit_idx); + pnanovdb_uint64_t value_index = pnanovdb_uint32_as_uint64_low(0u); + if (pnanovdb_uint64_any_bit(pnanovdb_uint64_and(val_mask, mask))) + { + pnanovdb_uint32_t sum = 0u; + sum += pnanovdb_uint64_countbits(pnanovdb_uint64_and(val_mask, pnanovdb_uint64_dec(mask))); + if (word_idx > 0u) + { + pnanovdb_uint64_t prefix_sum = pnanovdb_read_uint64(buf, pnanovdb_address_offset(value_address, 8u)); + sum += pnanovdb_uint64_to_uint32_lsr(prefix_sum, 9u * (word_idx - 1u)) & 511u; + } + pnanovdb_uint64_t offset = pnanovdb_read_uint64(buf, value_address); + value_index = pnanovdb_uint64_offset(offset, sum); + } + return value_index; +} + +// ----------------------------- Leaf 
OnIndexMask Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_leaf_onindexmask_get_value_count(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_leaf_onindex_get_value_count(buf, leaf); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_last_offset(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_leaf_onindex_get_last_offset(buf, leaf); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_onindexmask_has_stats(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_leaf_onindex_has_stats(buf, leaf); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_min_index(pnanovdb_buf_t buf, pnanovdb_address_t min_address) +{ + return pnanovdb_leaf_onindex_get_min_index(buf, min_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_max_index(pnanovdb_buf_t buf, pnanovdb_address_t max_address) +{ + return pnanovdb_leaf_onindex_get_max_index(buf, max_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_ave_index(pnanovdb_buf_t buf, pnanovdb_address_t ave_address) +{ + return pnanovdb_leaf_onindex_get_ave_index(buf, ave_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_dev_index(pnanovdb_buf_t buf, pnanovdb_address_t dev_address) +{ + return pnanovdb_leaf_onindex_get_dev_index(buf, dev_address); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_onindexmask_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t value_address, PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + return pnanovdb_leaf_onindex_get_value_index(buf, value_address, ijk); +} +PNANOVDB_FORCE_INLINE pnanovdb_bool_t pnanovdb_leaf_onindexmask_get_mask_bit(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t n) +{ + pnanovdb_uint32_t word_idx = n >> 5; + pnanovdb_uint32_t bit_idx = n & 31; + pnanovdb_uint32_t val_mask = + pnanovdb_read_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx)); + return (val_mask & (1u << bit_idx)) != 0u; +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_onindexmask_set_mask_bit(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t n, pnanovdb_bool_t v) +{ + pnanovdb_uint32_t word_idx = n >> 5; + pnanovdb_uint32_t bit_idx = n & 31; + pnanovdb_uint32_t val_mask = + pnanovdb_read_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx)); + if (v) + { + val_mask = val_mask | (1u << bit_idx); + } + else + { + val_mask = val_mask & ~(1u << bit_idx); + } + pnanovdb_write_uint32(buf, pnanovdb_address_offset(leaf.address, 96u + 4u * word_idx), val_mask); +} + +// ----------------------------- Leaf PointIndex Specialization --------------------------------------- + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_offset(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_read_uint64(buf, pnanovdb_leaf_get_min_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_point_count(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf) +{ + return pnanovdb_read_uint64(buf, pnanovdb_leaf_get_max_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf)); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_first(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i) +{ + return pnanovdb_uint64_offset(pnanovdb_leaf_pointindex_get_offset(buf, leaf), + (i == 0u ? 
0u : pnanovdb_read_uint16(buf, pnanovdb_leaf_get_table_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf, i - 1u)))); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_last(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i) +{ + return pnanovdb_uint64_offset(pnanovdb_leaf_pointindex_get_offset(buf, leaf), + pnanovdb_read_uint16(buf, pnanovdb_leaf_get_table_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf, i))); +} +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_leaf_pointindex_get_value(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i) +{ + return pnanovdb_uint32_as_uint64_low(pnanovdb_read_uint16(buf, pnanovdb_leaf_get_table_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf, i))); +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_pointindex_set_value_only(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i, pnanovdb_uint32_t value) +{ + pnanovdb_address_t addr = pnanovdb_leaf_get_table_address(PNANOVDB_GRID_TYPE_POINTINDEX, buf, leaf, i); + pnanovdb_uint32_t raw32 = pnanovdb_read_uint32(buf, pnanovdb_address_mask_inv(addr, 3u)); + if ((i & 1) == 0u) + { + raw32 = (raw32 & 0xFFFF0000) | (value & 0x0000FFFF); + } + else + { + raw32 = (raw32 & 0x0000FFFF) | (value << 16u); + } + pnanovdb_write_uint32(buf, addr, raw32); +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_pointindex_set_on(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i) +{ + pnanovdb_uint32_t word_idx = i >> 5; + pnanovdb_uint32_t bit_idx = i & 31; + pnanovdb_address_t addr = pnanovdb_address_offset(leaf.address, PNANOVDB_LEAF_OFF_VALUE_MASK + 4u * word_idx); + pnanovdb_uint32_t val_mask = pnanovdb_read_uint32(buf, addr); + val_mask = val_mask | (1u << bit_idx); + pnanovdb_write_uint32(buf, addr, val_mask); +} +PNANOVDB_FORCE_INLINE void pnanovdb_leaf_pointindex_set_value(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t leaf, pnanovdb_uint32_t i, pnanovdb_uint32_t value) +{ + pnanovdb_leaf_pointindex_set_on(buf, leaf, i); + pnanovdb_leaf_pointindex_set_value_only(buf, leaf, i, value); +} + +// ------------------------------------------------ Lower Node ----------------------------------------------------------- + PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_lower_coord_to_offset(PNANOVDB_IN(pnanovdb_coord_t) ijk) { return (((PNANOVDB_DEREF(ijk).x & 127) >> 3) << (2 * 4)) + - (((PNANOVDB_DEREF(ijk).y & 127) >> 3) << (4)) + - ((PNANOVDB_DEREF(ijk).z & 127) >> 3); + (((PNANOVDB_DEREF(ijk).y & 127) >> 3) << (4)) + + ((PNANOVDB_DEREF(ijk).z & 127) >> 3); } PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_min_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node) @@ -1475,11 +2210,13 @@ PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_value_address(pnanov return pnanovdb_lower_get_value_address_and_level(grid_type, buf, lower, ijk, PNANOVDB_REF(level)); } +// ------------------------------------------------ Upper Node ----------------------------------------------------------- + PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_upper_coord_to_offset(PNANOVDB_IN(pnanovdb_coord_t) ijk) { return (((PNANOVDB_DEREF(ijk).x & 4095) >> 7) << (2 * 5)) + - (((PNANOVDB_DEREF(ijk).y & 4095) >> 7) << (5)) + - ((PNANOVDB_DEREF(ijk).z & 4095) >> 7); + (((PNANOVDB_DEREF(ijk).y & 4095) >> 7) << (5)) + + ((PNANOVDB_DEREF(ijk).z & 4095) >> 7); } PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_min_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node) @@ -1548,6 
+2285,14 @@ PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_value_address(pnanov return pnanovdb_upper_get_value_address_and_level(grid_type, buf, upper, ijk, PNANOVDB_REF(level)); } +PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_table_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node, pnanovdb_uint32_t n, pnanovdb_int64_t child) +{ + pnanovdb_address_t bufAddress = pnanovdb_upper_get_table_address(grid_type, buf, node, n); + pnanovdb_write_int64(buf, bufAddress, child); +} + +// ------------------------------------------------ Root ----------------------------------------------------------- + PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_root_get_min_address(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_root_handle_t root) { pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, root_off_min); @@ -1670,6 +2415,92 @@ PNANOVDB_FORCE_INLINE float pnanovdb_root_fpn_read_float(pnanovdb_buf_t buf, pna return ret; } +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_index_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t level) +{ + pnanovdb_uint64_t ret; + if (level == 0) + { + ret = pnanovdb_leaf_index_get_value_index(buf, address, ijk); + } + else + { + ret = pnanovdb_read_uint64(buf, address); + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_onindex_get_value_index(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) ijk, pnanovdb_uint32_t level) +{ + pnanovdb_uint64_t ret; + if (level == 0) + { + ret = pnanovdb_leaf_onindex_get_value_index(buf, address, ijk); + } + else + { + ret = pnanovdb_read_uint64(buf, address); + } + return ret; +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_pointindex_get_point_range( + pnanovdb_buf_t buf, + pnanovdb_address_t value_address, + PNANOVDB_IN(pnanovdb_coord_t) ijk, + pnanovdb_uint32_t level, + PNANOVDB_INOUT(pnanovdb_uint64_t)range_begin, + PNANOVDB_INOUT(pnanovdb_uint64_t)range_end +) +{ + pnanovdb_uint32_t local_range_begin = 0u; + pnanovdb_uint32_t local_range_end = 0u; + pnanovdb_uint64_t offset = pnanovdb_uint32_as_uint64_low(0u); + if (level == 0) + { + pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + // recover leaf address + pnanovdb_leaf_handle_t leaf = { pnanovdb_address_offset_neg(value_address, PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_POINTINDEX, leaf_off_table) + 2u * n) }; + if (n > 0u) + { + local_range_begin = pnanovdb_read_uint16(buf, pnanovdb_address_offset_neg(value_address, 2u)); + } + local_range_end = pnanovdb_read_uint16(buf, value_address); + offset = pnanovdb_leaf_pointindex_get_offset(buf, leaf); + } + PNANOVDB_DEREF(range_begin) = pnanovdb_uint64_offset(offset, local_range_begin); + PNANOVDB_DEREF(range_end) = pnanovdb_uint64_offset(offset, local_range_end); + return pnanovdb_uint32_as_uint64_low(local_range_end - local_range_begin); +} + +PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_root_pointindex_get_point_address_range( + pnanovdb_buf_t buf, + pnanovdb_grid_type_t value_type, + pnanovdb_address_t value_address, + pnanovdb_address_t blindmetadata_value_address, + PNANOVDB_IN(pnanovdb_coord_t) ijk, + pnanovdb_uint32_t level, + PNANOVDB_INOUT(pnanovdb_address_t)address_begin, + PNANOVDB_INOUT(pnanovdb_address_t)address_end +) +{ + pnanovdb_uint64_t range_begin; + pnanovdb_uint64_t range_end; + pnanovdb_uint64_t range_size = pnanovdb_root_pointindex_get_point_range(buf, value_address, ijk, level, 
PNANOVDB_REF(range_begin), PNANOVDB_REF(range_end)); + + pnanovdb_uint32_t stride = 12u; // vec3f + if (value_type == PNANOVDB_GRID_TYPE_VEC3U8) + { + stride = 3u; + } + else if (value_type == PNANOVDB_GRID_TYPE_VEC3U16) + { + stride = 6u; + } + PNANOVDB_DEREF(address_begin) = pnanovdb_address_offset64_product(blindmetadata_value_address, range_begin, stride); + PNANOVDB_DEREF(address_end) = pnanovdb_address_offset64_product(blindmetadata_value_address, range_end, stride); + return range_size; +} + // ------------------------------------------------ ReadAccessor ----------------------------------------------------------- struct pnanovdb_readaccessor_t @@ -1760,6 +2591,12 @@ PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_lower_get_value_address_and_ca return pnanovdb_lower_get_value_address_and_level_and_cache(grid_type, buf, lower, ijk, acc, PNANOVDB_REF(level)); } +PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_table_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node, pnanovdb_uint32_t n, pnanovdb_int64_t child) +{ + pnanovdb_address_t table_address = pnanovdb_lower_get_table_address(grid_type, buf, node, n); + pnanovdb_write_int64(buf, table_address, child); +} + PNANOVDB_FORCE_INLINE pnanovdb_address_t pnanovdb_upper_get_value_address_and_level_and_cache(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t upper, PNANOVDB_IN(pnanovdb_coord_t) ijk, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_INOUT(pnanovdb_uint32_t) level) { pnanovdb_uint32_t n = pnanovdb_upper_coord_to_offset(ijk); diff --git a/warp/native/nanovdb/PNanoVDBWrite.h b/warp/native/nanovdb/PNanoVDBWrite.h deleted file mode 100644 index 916c48d8..00000000 --- a/warp/native/nanovdb/PNanoVDBWrite.h +++ /dev/null @@ -1,295 +0,0 @@ - -// Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 - -/*! - \file PNanoVDBWrite.h - - \author Andrew Reidmeyer - - \brief This file is a portable (e.g. pointer-less) C99/GLSL/HLSL port - of NanoVDBWrite.h, which is compatible with most graphics APIs. 
-*/ - -#ifndef NANOVDB_PNANOVDB_WRITE_H_HAS_BEEN_INCLUDED -#define NANOVDB_PNANOVDB_WRITE_H_HAS_BEEN_INCLUDED - -#if defined(PNANOVDB_BUF_C) -#if defined(PNANOVDB_ADDRESS_32) -PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint32_t byte_offset, uint32_t value) -{ - uint32_t wordaddress = (byte_offset >> 2u); -#ifdef PNANOVDB_BUF_BOUNDS_CHECK - if (wordaddress < buf.size_in_words) - { - buf.data[wordaddress] = value; - } -#else - buf.data[wordaddress] = value; -#endif -} -PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint32_t byte_offset, uint64_t value) -{ - uint64_t* data64 = (uint64_t*)buf.data; - uint32_t wordaddress64 = (byte_offset >> 3u); -#ifdef PNANOVDB_BUF_BOUNDS_CHECK - uint64_t size_in_words64 = buf.size_in_words >> 1u; - if (wordaddress64 < size_in_words64) - { - data64[wordaddress64] = value; - } -#else - data64[wordaddress64] = value; -#endif -} -#elif defined(PNANOVDB_ADDRESS_64) -PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint32(pnanovdb_buf_t buf, uint64_t byte_offset, uint32_t value) -{ - uint64_t wordaddress = (byte_offset >> 2u); -#ifdef PNANOVDB_BUF_BOUNDS_CHECK - if (wordaddress < buf.size_in_words) - { - buf.data[wordaddress] = value; - } -#else - buf.data[wordaddress] = value; -#endif -} -PNANOVDB_BUF_FORCE_INLINE void pnanovdb_buf_write_uint64(pnanovdb_buf_t buf, uint64_t byte_offset, uint64_t value) -{ - uint64_t* data64 = (uint64_t*)buf.data; - uint64_t wordaddress64 = (byte_offset >> 3u); -#ifdef PNANOVDB_BUF_BOUNDS_CHECK - uint64_t size_in_words64 = buf.size_in_words >> 1u; - if (wordaddress64 < size_in_words64) - { - data64[wordaddress64] = value; - } -#else - data64[wordaddress64] = value; -#endif -} -#endif -#endif - -#if defined(PNANOVDB_C) -PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_float_as_uint32(float v) { return *((pnanovdb_uint32_t*)(&v)); } -PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { return *((pnanovdb_uint64_t*)(&v)); } -#elif defined(PNANOVDB_HLSL) -PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_float_as_uint32(float v) { return asuint(v); } -PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { uint2 ret; asuint(v, ret.x, ret.y); return ret; } -#elif defined(PNANOVDB_GLSL) -PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_float_as_uint32(float v) { return floatBitsToUint(v); } -PNANOVDB_FORCE_INLINE pnanovdb_uint64_t pnanovdb_double_as_uint64(double v) { return unpackDouble2x32(v); } -#endif - -PNANOVDB_FORCE_INLINE void pnanovdb_write_uint32(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_uint32_t value) -{ - pnanovdb_buf_write_uint32(buf, address.byte_offset, value); -} -PNANOVDB_FORCE_INLINE void pnanovdb_write_uint64(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_uint64_t value) -{ - pnanovdb_buf_write_uint64(buf, address.byte_offset, value); -} -PNANOVDB_FORCE_INLINE void pnanovdb_write_int32(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_int32_t value) -{ - pnanovdb_write_uint32(buf, address, pnanovdb_int32_as_uint32(value)); -} -PNANOVDB_FORCE_INLINE void pnanovdb_write_int64(pnanovdb_buf_t buf, pnanovdb_address_t address, pnanovdb_int64_t value) -{ - pnanovdb_buf_write_uint64(buf, address.byte_offset, pnanovdb_int64_as_uint64(value)); -} -PNANOVDB_FORCE_INLINE void pnanovdb_write_float(pnanovdb_buf_t buf, pnanovdb_address_t address, float value) -{ - pnanovdb_write_uint32(buf, address, pnanovdb_float_as_uint32(value)); -} -PNANOVDB_FORCE_INLINE void 
pnanovdb_write_double(pnanovdb_buf_t buf, pnanovdb_address_t address, double value) -{ - pnanovdb_write_uint64(buf, address, pnanovdb_double_as_uint64(value)); -} -PNANOVDB_FORCE_INLINE void pnanovdb_write_coord(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_coord_t) value) -{ - pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 0u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).x)); - pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 4u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).y)); - pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 8u), pnanovdb_int32_as_uint32(PNANOVDB_DEREF(value).z)); -} -PNANOVDB_FORCE_INLINE void pnanovdb_write_vec3(pnanovdb_buf_t buf, pnanovdb_address_t address, PNANOVDB_IN(pnanovdb_vec3_t) value) -{ - pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 0u), pnanovdb_float_as_uint32(PNANOVDB_DEREF(value).x)); - pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 4u), pnanovdb_float_as_uint32(PNANOVDB_DEREF(value).y)); - pnanovdb_write_uint32(buf, pnanovdb_address_offset(address, 8u), pnanovdb_float_as_uint32(PNANOVDB_DEREF(value).z)); -} - -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_leaf) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_LEAF), node_offset_leaf); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_lower) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_LOWER), node_offset_lower); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_upper) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_UPPER), node_offset_upper); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_offset_root(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t node_offset_root) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_OFFSET_ROOT), node_offset_root); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_leaf) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_LEAF), node_count_leaf); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_lower) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_LOWER), node_count_lower); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_node_count_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t node_count_upper) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_NODE_COUNT_UPPER), node_count_upper); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_leaf(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_leaf) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_LEAF), tile_count_leaf); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_lower(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_lower) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, 
PNANOVDB_TREE_OFF_TILE_COUNT_LOWER), tile_count_lower); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_tile_count_upper(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint32_t tile_count_upper) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_TILE_COUNT_UPPER), tile_count_upper); -} -PNANOVDB_FORCE_INLINE void pnanovdb_tree_set_voxel_count(pnanovdb_buf_t buf, pnanovdb_tree_handle_t p, pnanovdb_uint64_t voxel_count) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_TREE_OFF_VOXEL_COUNT), voxel_count); -} - -PNANOVDB_FORCE_INLINE void pnanovdb_root_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { - pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_BBOX_MIN), bbox_min); -} -PNANOVDB_FORCE_INLINE void pnanovdb_root_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { - pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_BBOX_MAX), bbox_max); -} -PNANOVDB_FORCE_INLINE void pnanovdb_root_set_tile_count(pnanovdb_buf_t buf, pnanovdb_root_handle_t p, pnanovdb_uint32_t tile_count) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_OFF_TABLE_SIZE), tile_count); -} - -PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_key(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p, pnanovdb_uint64_t key) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_KEY), key); -} -PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_child(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p, pnanovdb_int64_t child) { - pnanovdb_write_int64(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_CHILD), child); -} -PNANOVDB_FORCE_INLINE void pnanovdb_root_tile_set_state(pnanovdb_buf_t buf, pnanovdb_root_tile_handle_t p, pnanovdb_uint32_t state) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_ROOT_TILE_OFF_STATE), state); -} - -PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { - pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_BBOX_MIN), bbox_min); -} -PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { - pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_BBOX_MAX), bbox_max); -} -PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_child_mask(pnanovdb_buf_t buf, pnanovdb_upper_handle_t p, pnanovdb_uint32_t bit_index, pnanovdb_bool_t value) { - pnanovdb_address_t addr = pnanovdb_address_offset(p.address, PNANOVDB_UPPER_OFF_CHILD_MASK + 4u * (bit_index >> 5u)); - pnanovdb_uint32_t valueMask = pnanovdb_read_uint32(buf, addr); - if (!value) { valueMask &= ~(1u << (bit_index & 31u)); } - if (value) valueMask |= (1u << (bit_index & 31u)); - pnanovdb_write_uint32(buf, addr, valueMask); -} -PNANOVDB_FORCE_INLINE void pnanovdb_upper_set_table_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_upper_handle_t node, pnanovdb_uint32_t n, pnanovdb_int64_t child) -{ - pnanovdb_address_t bufAddress = pnanovdb_upper_get_table_address(grid_type, buf, node, n); - pnanovdb_write_int64(buf, bufAddress, child); -} - -PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { - 
pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_BBOX_MIN), bbox_min); -} -PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_bbox_max(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_max) { - pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_BBOX_MAX), bbox_max); -} -PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_child_mask(pnanovdb_buf_t buf, pnanovdb_lower_handle_t p, pnanovdb_uint32_t bit_index, pnanovdb_bool_t value) { - pnanovdb_address_t addr = pnanovdb_address_offset(p.address, PNANOVDB_LOWER_OFF_CHILD_MASK + 4u * (bit_index >> 5u)); - pnanovdb_uint32_t valueMask = pnanovdb_read_uint32(buf, addr); - if (!value) { valueMask &= ~(1u << (bit_index & 31u)); } - if (value) valueMask |= (1u << (bit_index & 31u)); - pnanovdb_write_uint32(buf, addr, valueMask); -} -PNANOVDB_FORCE_INLINE void pnanovdb_lower_set_table_child(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, pnanovdb_lower_handle_t node, pnanovdb_uint32_t n, pnanovdb_int64_t child) -{ - pnanovdb_address_t table_address = pnanovdb_lower_get_table_address(grid_type, buf, node, n); - pnanovdb_write_int64(buf, table_address, child); -} - -PNANOVDB_FORCE_INLINE void pnanovdb_leaf_set_bbox_min(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t p, PNANOVDB_IN(pnanovdb_coord_t) bbox_min) { - pnanovdb_write_coord(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_BBOX_MIN), bbox_min); -} -PNANOVDB_FORCE_INLINE void pnanovdb_leaf_set_bbox_dif_and_flags(pnanovdb_buf_t buf, pnanovdb_leaf_handle_t p, pnanovdb_uint32_t bbox_dif_and_flags) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_LEAF_OFF_BBOX_DIF_AND_FLAGS), bbox_dif_and_flags); -} - -PNANOVDB_FORCE_INLINE void pnanovdb_map_set_matf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float matf) { - pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_MATF + 4u * index), matf); -} -PNANOVDB_FORCE_INLINE void pnanovdb_map_set_invmatf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float invmatf) { - pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_INVMATF + 4u * index), invmatf); -} -PNANOVDB_FORCE_INLINE void pnanovdb_map_set_vecf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float vecf) { - pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_VECF + 4u * index), vecf); -} -PNANOVDB_FORCE_INLINE void pnanovdb_map_set_taperf(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, float taperf) { - pnanovdb_write_float(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERF), taperf); -} -PNANOVDB_FORCE_INLINE void pnanovdb_map_set_matd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double matd) { - pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_MATD + 8u * index), matd); -} -PNANOVDB_FORCE_INLINE void pnanovdb_map_set_invmatd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double invmatd) { - pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_INVMATD + 8u * index), invmatd); -} -PNANOVDB_FORCE_INLINE void pnanovdb_map_set_vecd(pnanovdb_buf_t buf, pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double vecd) { - pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_VECD + 8u * index), vecd); -} -PNANOVDB_FORCE_INLINE void pnanovdb_map_set_taperd(pnanovdb_buf_t buf, 
pnanovdb_map_handle_t p, pnanovdb_uint32_t index, double taperd) { - pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_MAP_OFF_TAPERD), taperd); -} - -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_magic(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t magic) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_MAGIC), magic); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_checksum(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t checksum) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_CHECKSUM), checksum); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_version(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t version) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_VERSION), version); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_flags(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t flags) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_FLAGS), flags); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_get_grid_index(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_index) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_INDEX), grid_index); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_get_grid_count(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_count) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_COUNT), grid_count); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_size(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t grid_size) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_SIZE), grid_size); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_name(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, pnanovdb_uint32_t grid_name) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_NAME + 4u * index), grid_name); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_world_bbox(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, double world_bbox) { - pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_WORLD_BBOX + 8u * index), world_bbox); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_voxel_size(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t index, double voxel_size) { - pnanovdb_write_double(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_VOXEL_SIZE + 8u * index), voxel_size); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_class(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_class) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_CLASS), grid_class); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_grid_type(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint32_t grid_type) { - pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_GRID_TYPE), grid_type); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_blind_metadata_offset(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, pnanovdb_uint64_t blind_metadata_offset) { - pnanovdb_write_uint64(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_OFFSET), blind_metadata_offset); -} -PNANOVDB_FORCE_INLINE void pnanovdb_grid_set_blind_metadata_count(pnanovdb_buf_t buf, pnanovdb_grid_handle_t p, 
pnanovdb_uint32_t metadata_count) {
-    pnanovdb_write_uint32(buf, pnanovdb_address_offset(p.address, PNANOVDB_GRID_OFF_BLIND_METADATA_COUNT), metadata_count);
-}
-
-PNANOVDB_FORCE_INLINE pnanovdb_uint32_t pnanovdb_make_version(pnanovdb_uint32_t major, pnanovdb_uint32_t minor, pnanovdb_uint32_t patch)
-{
-    return (major << 21u) | (minor << 10u) | (patch);
-}
-
-#endif
\ No newline at end of file
diff --git a/warp/native/nanovdb/cuda/DeviceBuffer.h b/warp/native/nanovdb/cuda/DeviceBuffer.h
new file mode 100644
index 00000000..52c151ca
--- /dev/null
+++ b/warp/native/nanovdb/cuda/DeviceBuffer.h
@@ -0,0 +1,231 @@
+// Copyright Contributors to the OpenVDB Project
+// SPDX-License-Identifier: MPL-2.0
+
+/*!
+    \file DeviceBuffer.h
+
+    \author Ken Museth
+
+    \date January 8, 2020
+
+    \brief Implements a simple dual (host/device) CUDA buffer.
+
+    \note This file has no device-only kernel functions,
+          which explains why it's a .h and not .cuh file.
+*/
+
+#ifndef NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED
+#define NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED
+
+#include // for BufferTraits
+#include // for cudaMalloc/cudaMallocManaged/cudaFree
+
+namespace nanovdb {// ================================================================
+
+namespace cuda {// ===================================================================
+
+// ----------------------------> DeviceBuffer <--------------------------------------
+
+/// @brief Simple memory buffer using un-managed pinned host memory when compiled with NVCC.
+/// Obviously this class is making explicit use of CUDA so replace it with your own memory
+/// allocator if you are not using CUDA.
+/// @note While CUDA's pinned host memory allows for asynchronous memory copy between host and device
+/// it is significantly slower than cached (un-pinned) memory on the host.
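+///
+/// @par Example
+/// A minimal usage sketch of the host/device round trip, using only the methods declared
+/// below (the buffer size and the kernel launches are placeholders):
+/// @code
+/// nanovdb::cuda::DeviceBuffer buffer = nanovdb::cuda::DeviceBuffer::create(byteCount);
+/// /* ... fill buffer.data() on the host ... */
+/// buffer.deviceUpload();   // allocates the device copy on demand, then copies host -> device
+/// /* ... launch kernels that read buffer.deviceData() ... */
+/// buffer.deviceDownload(); // copies device -> host
+/// @endcode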
+class DeviceBuffer +{ + uint64_t mSize; // total number of bytes managed by this buffer (assumed to be identical for host and device) + void *mCpuData, *mGpuData; // raw pointers to the host and device buffers + bool mManaged; + +public: + /// @brief Static factory method that return an instance of this buffer + /// @param size byte size of buffer to be initialized + /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @param stream optional stream argument (defaults to stream NULL) + /// @return An instance of this class using move semantics + static DeviceBuffer create(uint64_t size, const DeviceBuffer* dummy = nullptr, bool host = true, void* stream = nullptr); + + /// @brief Static factory method that return an instance of this buffer that wraps externally managed memory + /// @param size byte size of buffer specified by external memory + /// @param cpuData pointer to externally managed host memory + /// @param gpuData pointer to externally managed device memory + /// @return An instance of this class using move semantics + static DeviceBuffer create(uint64_t size, void* cpuData, void* gpuData); + + /// @brief Constructor + /// @param size byte size of buffer to be initialized + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @param stream optional stream argument (defaults to stream NULL) + DeviceBuffer(uint64_t size = 0, bool host = true, void* stream = nullptr) + : mSize(0) + , mCpuData(nullptr) + , mGpuData(nullptr) + , mManaged(false) + { + if (size > 0) this->init(size, host, stream); + } + + DeviceBuffer(uint64_t size, void* cpuData, void* gpuData) + : mSize(size) + , mCpuData(cpuData) + , mGpuData(gpuData) + , mManaged(false) + { + } + + /// @brief Disallow copy-construction + DeviceBuffer(const DeviceBuffer&) = delete; + + /// @brief Move copy-constructor + DeviceBuffer(DeviceBuffer&& other) noexcept + : mSize(other.mSize) + , mCpuData(other.mCpuData) + , mGpuData(other.mGpuData) + , mManaged(other.mManaged) + { + other.mSize = 0; + other.mCpuData = nullptr; + other.mGpuData = nullptr; + other.mManaged = false; + } + + /// @brief Disallow copy assignment operation + DeviceBuffer& operator=(const DeviceBuffer&) = delete; + + /// @brief Move copy assignment operation + DeviceBuffer& operator=(DeviceBuffer&& other) noexcept + { + this->clear(); + mSize = other.mSize; + mCpuData = other.mCpuData; + mGpuData = other.mGpuData; + mManaged = other.mManaged; + other.mSize = 0; + other.mCpuData = nullptr; + other.mGpuData = nullptr; + other.mManaged = false; + return *this; + } + + /// @brief Destructor frees memory on both the host and device + ~DeviceBuffer() { this->clear(); }; + + /// @brief Initialize buffer + /// @param size byte size of buffer to be initialized + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @note All existing buffers are first cleared + /// @warning size is expected to be non-zero. Use clear() clear buffer! + void init(uint64_t size, bool host = true, void* stream = nullptr); + + /// @brief Retuns a raw pointer to the host/CPU buffer managed by this allocator. + /// @warning Note that the pointer can be NULL! + void* data() const { return mCpuData; } + + /// @brief Retuns a raw pointer to the device/GPU buffer managed by this allocator. + /// @warning Note that the pointer can be NULL! 
+ void* deviceData() const { return mGpuData; } + + /// @brief Upload this buffer from the host to the device, i.e. CPU -> GPU. + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + /// @param sync if false the memory copy is asynchronous + /// @note If the device/GPU buffer does not exist it is first allocated + /// @warning Assumes that the host/CPU buffer already exists + void deviceUpload(void* stream = nullptr, bool sync = true) const; + + /// @brief Upload this buffer from the device to the host, i.e. GPU -> CPU. + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + /// @param sync if false the memory copy is asynchronous + /// @note If the host/CPU buffer does not exist it is first allocated + /// @warning Assumes that the device/GPU buffer already exists + void deviceDownload(void* stream = nullptr, bool sync = true) const; + + /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator. + uint64_t size() const { return mSize; } + + //@{ + /// @brief Returns true if this allocator is empty, i.e. has no allocated memory + bool empty() const { return mSize == 0; } + bool isEmpty() const { return mSize == 0; } + //@} + + /// @brief De-allocate all memory managed by this allocator and set all pointers to NULL + void clear(void* stream = nullptr); + +}; // DeviceBuffer class + +// --------------------------> Implementations below <------------------------------------ + +inline DeviceBuffer DeviceBuffer::create(uint64_t size, const DeviceBuffer*, bool host, void* stream) +{ + return DeviceBuffer(size, host, stream); +} + +inline DeviceBuffer DeviceBuffer::create(uint64_t size, void* cpuData, void* gpuData) +{ + return DeviceBuffer(size, cpuData, gpuData); +} + +inline void DeviceBuffer::init(uint64_t size, bool host, void* stream) +{ + if (mSize>0) this->clear(stream); + NANOVDB_ASSERT(size > 0); + if (host) { + cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned + checkPtr(mCpuData, "cuda::DeviceBuffer::init: failed to allocate host buffer"); + } else { + cudaCheck(util::cuda::mallocAsync((void**)&mGpuData, size, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! + checkPtr(mGpuData, "cuda::DeviceBuffer::init: failed to allocate device buffer"); + } + mSize = size; + mManaged = true; +} // DeviceBuffer::init + +inline void DeviceBuffer::deviceUpload(void* stream, bool sync) const +{ + if (!mManaged) throw std::runtime_error("DeviceBuffer::deviceUpload called on externally managed memory. Replace deviceUpload call with the appropriate external copy operation."); + + checkPtr(mCpuData, "uninitialized cpu data"); + if (mGpuData == nullptr) { + cudaCheck(util::cuda::mallocAsync((void**)&mGpuData, mSize, reinterpret_cast(stream))); // un-managed memory on the device, always 32B aligned! + } + checkPtr(mGpuData, "uninitialized gpu data"); + cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast(stream))); + if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast(stream))); +} // DeviceBuffer::gpuUpload + +inline void DeviceBuffer::deviceDownload(void* stream, bool sync) const +{ + if (!mManaged) throw std::runtime_error("DeviceBuffer::deviceDownload called on externally managed memory. 
Replace deviceDownload call with the appropriate external copy operation."); + + checkPtr(mGpuData, "uninitialized gpu data"); + if (mCpuData == nullptr) { + cudaCheck(cudaMallocHost((void**)&mCpuData, mSize)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned + } + checkPtr(mCpuData, "uninitialized cpu data"); + cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData, mSize, cudaMemcpyDeviceToHost, reinterpret_cast(stream))); + if (sync) cudaCheck(cudaStreamSynchronize(reinterpret_cast(stream))); +} // DeviceBuffer::gpuDownload + +inline void DeviceBuffer::clear(void *stream) +{ + if (mManaged && mGpuData) cudaCheck(util::cuda::freeAsync(mGpuData, reinterpret_cast(stream))); + if (mManaged && mCpuData) cudaCheck(cudaFreeHost(mCpuData)); + mCpuData = mGpuData = nullptr; + mSize = 0; + mManaged = false; +} // DeviceBuffer::clear + +}// namespace cuda + +using CudaDeviceBuffer [[deprecated("Use nanovdb::cuda::DeviceBudder instead")]] = cuda::DeviceBuffer; + +template<> +struct BufferTraits +{ + static constexpr bool hasDeviceDual = true; +}; + +}// namespace nanovdb + +#endif // end of NANOVDB_CUDA_DEVICEBUFFER_H_HAS_BEEN_INCLUDED diff --git a/warp/native/nanovdb/cuda/GridHandle.cuh b/warp/native/nanovdb/cuda/GridHandle.cuh new file mode 100644 index 00000000..602087be --- /dev/null +++ b/warp/native/nanovdb/cuda/GridHandle.cuh @@ -0,0 +1,76 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/cuda/GridHandle.cuh + + \author Ken Museth, Doyub Kim + + \date August 3, 2023 + + \brief Contains cuda kernels for GridHandle + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED +#define NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED + +#include // required for instantiation of move c-tor of GridHandle +#include + +namespace nanovdb { + +namespace cuda { + +namespace {// anonymous namespace +__global__ void cpyGridHandleMeta(const GridData *d_data, GridHandleMetaData *d_meta) +{ + nanovdb::cpyGridHandleMeta(d_data, d_meta); +} + +__global__ void updateGridCount(GridData *d_data, uint32_t gridIndex, uint32_t gridCount, bool *d_dirty) +{ + NANOVDB_ASSERT(gridIndex < gridCount); + if (*d_dirty = d_data->mGridIndex != gridIndex || d_data->mGridCount != gridCount) { + d_data->mGridIndex = gridIndex; + d_data->mGridCount = gridCount; + if (d_data->mChecksum.isEmpty()) *d_dirty = false;// no need to update checksum if it didn't already exist + } +} +}// anonymous namespace + +}// namespace cuda + +template +template::hasDeviceDual, int>::type> +GridHandle::GridHandle(T&& buffer) +{ + static_assert(util::is_same::value, "Expected U==BufferT"); + mBuffer = std::move(buffer); + if (auto *data = reinterpret_cast(mBuffer.data())) { + if (!data->isValid()) throw std::runtime_error("GridHandle was constructed with an invalid host buffer"); + mMetaData.resize(data->mGridCount); + cpyGridHandleMeta(data, mMetaData.data()); + } else { + if (auto *d_data = reinterpret_cast(mBuffer.deviceData())) { + GridData tmp; + cudaCheck(cudaMemcpy(&tmp, d_data, sizeof(GridData), cudaMemcpyDeviceToHost)); + if (!tmp.isValid()) throw std::runtime_error("GridHandle was constructed with an invalid device buffer"); + GridHandleMetaData *d_metaData; + cudaMalloc((void**)&d_metaData, tmp.mGridCount*sizeof(GridHandleMetaData)); + cuda::cpyGridHandleMeta<<<1,1>>>(d_data, d_metaData); + mMetaData.resize(tmp.mGridCount); + 
cudaCheck(cudaMemcpy(mMetaData.data(), d_metaData,tmp.mGridCount*sizeof(GridHandleMetaData), cudaMemcpyDeviceToHost)); + cudaCheck(cudaFree(d_metaData)); + } + } +}// GridHandle(T&& buffer) + +// Dummy function that ensures instantiation of the move-constructor above when BufferT=cuda::DeviceBuffer +namespace {auto __dummy(){return GridHandle(std::move(cuda::DeviceBuffer()));}} + +} // namespace nanovdb + +#endif // NANOVDB_CUDA_GRIDHANDLE_CUH_HAS_BEEN_INCLUDED diff --git a/warp/native/nanovdb/math/Math.h b/warp/native/nanovdb/math/Math.h new file mode 100644 index 00000000..d9523074 --- /dev/null +++ b/warp/native/nanovdb/math/Math.h @@ -0,0 +1,1448 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file Math.h + + \author Ken Museth + + \date January 8, 2020 + + \brief Math functions and classes + +*/ + +#ifndef NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED +#define NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED + +#include // for __hostdev__ and lots of other utility functions + +namespace nanovdb {// ================================================================= + +namespace math {// ============================================================= + +// ----------------------------> Various math functions <------------------------------------- + +//@{ +/// @brief Pi constant taken from Boost to match old behaviour +template +inline __hostdev__ constexpr T pi() +{ + return 3.141592653589793238462643383279502884e+00; +} +template<> +inline __hostdev__ constexpr float pi() +{ + return 3.141592653589793238462643383279502884e+00F; +} +template<> +inline __hostdev__ constexpr double pi() +{ + return 3.141592653589793238462643383279502884e+00; +} +template<> +inline __hostdev__ constexpr long double pi() +{ + return 3.141592653589793238462643383279502884e+00L; +} +//@} + +//@{ +/// Tolerance for floating-point comparison +template +struct Tolerance; +template<> +struct Tolerance +{ + __hostdev__ static float value() { return 1e-8f; } +}; +template<> +struct Tolerance +{ + __hostdev__ static double value() { return 1e-15; } +}; +//@} + +//@{ +/// Delta for small floating-point offsets +template +struct Delta; +template<> +struct Delta +{ + __hostdev__ static float value() { return 1e-5f; } +}; +template<> +struct Delta +{ + __hostdev__ static double value() { return 1e-9; } +}; +//@} + +//@{ +/// Maximum floating-point values +template +struct Maximum; +#if defined(__CUDA_ARCH__) || defined(__HIP__) +template<> +struct Maximum +{ + __hostdev__ static int value() { return 2147483647; } +}; +template<> +struct Maximum +{ + __hostdev__ static uint32_t value() { return 4294967295u; } +}; +template<> +struct Maximum +{ + __hostdev__ static float value() { return 1e+38f; } +}; +template<> +struct Maximum +{ + __hostdev__ static double value() { return 1e+308; } +}; +#else +template +struct Maximum +{ + static T value() { return std::numeric_limits::max(); } +}; +#endif +//@} + +template +__hostdev__ inline bool isApproxZero(const Type& x) +{ + return !(x > Tolerance::value()) && !(x < -Tolerance::value()); +} + +template +__hostdev__ inline Type Min(Type a, Type b) +{ + return (a < b) ? 
a : b; +} +__hostdev__ inline int32_t Min(int32_t a, int32_t b) +{ + return int32_t(fminf(float(a), float(b))); +} +__hostdev__ inline uint32_t Min(uint32_t a, uint32_t b) +{ + return uint32_t(fminf(float(a), float(b))); +} +__hostdev__ inline float Min(float a, float b) +{ + return fminf(a, b); +} +__hostdev__ inline double Min(double a, double b) +{ + return fmin(a, b); +} +template +__hostdev__ inline Type Max(Type a, Type b) +{ + return (a > b) ? a : b; +} + +__hostdev__ inline int32_t Max(int32_t a, int32_t b) +{ + return int32_t(fmaxf(float(a), float(b))); +} +__hostdev__ inline uint32_t Max(uint32_t a, uint32_t b) +{ + return uint32_t(fmaxf(float(a), float(b))); +} +__hostdev__ inline float Max(float a, float b) +{ + return fmaxf(a, b); +} +__hostdev__ inline double Max(double a, double b) +{ + return fmax(a, b); +} +__hostdev__ inline float Clamp(float x, float a, float b) +{ + return Max(Min(x, b), a); +} +__hostdev__ inline double Clamp(double x, double a, double b) +{ + return Max(Min(x, b), a); +} + +__hostdev__ inline float Fract(float x) +{ + return x - floorf(x); +} +__hostdev__ inline double Fract(double x) +{ + return x - floor(x); +} + +__hostdev__ inline int32_t Floor(float x) +{ + return int32_t(floorf(x)); +} +__hostdev__ inline int32_t Floor(double x) +{ + return int32_t(floor(x)); +} + +__hostdev__ inline int32_t Ceil(float x) +{ + return int32_t(ceilf(x)); +} +__hostdev__ inline int32_t Ceil(double x) +{ + return int32_t(ceil(x)); +} + +template +__hostdev__ inline T Pow2(T x) +{ + return x * x; +} + +template +__hostdev__ inline T Pow3(T x) +{ + return x * x * x; +} + +template +__hostdev__ inline T Pow4(T x) +{ + return Pow2(x * x); +} +template +__hostdev__ inline T Abs(T x) +{ + return x < 0 ? -x : x; +} + +template<> +__hostdev__ inline float Abs(float x) +{ + return fabsf(x); +} + +template<> +__hostdev__ inline double Abs(double x) +{ + return fabs(x); +} + +template<> +__hostdev__ inline int Abs(int x) +{ + return abs(x); +} + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz); + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz) +{ + return CoordT(int32_t(rintf(xyz[0])), int32_t(rintf(xyz[1])), int32_t(rintf(xyz[2]))); + //return CoordT(int32_t(roundf(xyz[0])), int32_t(roundf(xyz[1])), int32_t(roundf(xyz[2])) ); + //return CoordT(int32_t(floorf(xyz[0] + 0.5f)), int32_t(floorf(xyz[1] + 0.5f)), int32_t(floorf(xyz[2] + 0.5f))); +} + +template class Vec3T> +__hostdev__ inline CoordT Round(const Vec3T& xyz) +{ + return CoordT(int32_t(floor(xyz[0] + 0.5)), int32_t(floor(xyz[1] + 0.5)), int32_t(floor(xyz[2] + 0.5))); +} + +template class Vec3T> +__hostdev__ inline CoordT RoundDown(const Vec3T& xyz) +{ + return CoordT(Floor(xyz[0]), Floor(xyz[1]), Floor(xyz[2])); +} + +//@{ +/// Return the square root of a floating-point value. +__hostdev__ inline float Sqrt(float x) +{ + return sqrtf(x); +} +__hostdev__ inline double Sqrt(double x) +{ + return sqrt(x); +} +//@} + +/// Return the sign of the given value as an integer (either -1, 0 or 1). +template +__hostdev__ inline T Sign(const T& x) +{ + return ((T(0) < x) ? T(1) : T(0)) - ((x < T(0)) ? 
T(1) : T(0)); +} + +template +__hostdev__ inline int MinIndex(const Vec3T& v) +{ +#if 0 + static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values + const int hashKey = ((v[0] < v[1]) << 2) + ((v[0] < v[2]) << 1) + (v[1] < v[2]); // ?*4+?*2+?*1 + return hashTable[hashKey]; +#else + if (v[0] < v[1] && v[0] < v[2]) + return 0; + if (v[1] < v[2]) + return 1; + else + return 2; +#endif +} + +template +__hostdev__ inline int MaxIndex(const Vec3T& v) +{ +#if 0 + static const int hashTable[8] = {2, 1, 9, 1, 2, 9, 0, 0}; //9 are dummy values + const int hashKey = ((v[0] > v[1]) << 2) + ((v[0] > v[2]) << 1) + (v[1] > v[2]); // ?*4+?*2+?*1 + return hashTable[hashKey]; +#else + if (v[0] > v[1] && v[0] > v[2]) + return 0; + if (v[1] > v[2]) + return 1; + else + return 2; +#endif +} + +/// @brief round up byteSize to the nearest wordSize, e.g. to align to machine word: AlignUp +__hostdev__ inline uint64_t AlignUp(uint64_t byteCount) +{ + const uint64_t r = byteCount % wordSize; + return r ? byteCount - r + wordSize : byteCount; +} + +// ------------------------------> Coord <-------------------------------------- + +// forward declaration so we can define Coord::asVec3s and Coord::asVec3d +template +class Vec3; + +/// @brief Signed (i, j, k) 32-bit integer coordinate class, similar to openvdb::math::Coord +class Coord +{ + int32_t mVec[3]; // private member data - three signed index coordinates +public: + using ValueType = int32_t; + using IndexType = uint32_t; + + /// @brief Initialize all coordinates to zero. + __hostdev__ Coord() + : mVec{0, 0, 0} + { + } + + /// @brief Initializes all coordinates to the given signed integer. + __hostdev__ explicit Coord(ValueType n) + : mVec{n, n, n} + { + } + + /// @brief Initializes coordinate to the given signed integers. + __hostdev__ Coord(ValueType i, ValueType j, ValueType k) + : mVec{i, j, k} + { + } + + __hostdev__ Coord(ValueType* ptr) + : mVec{ptr[0], ptr[1], ptr[2]} + { + } + + __hostdev__ int32_t x() const { return mVec[0]; } + __hostdev__ int32_t y() const { return mVec[1]; } + __hostdev__ int32_t z() const { return mVec[2]; } + + __hostdev__ int32_t& x() { return mVec[0]; } + __hostdev__ int32_t& y() { return mVec[1]; } + __hostdev__ int32_t& z() { return mVec[2]; } + + __hostdev__ static Coord max() { return Coord(int32_t((1u << 31) - 1)); } + + __hostdev__ static Coord min() { return Coord(-int32_t((1u << 31) - 1) - 1); } + + __hostdev__ static size_t memUsage() { return sizeof(Coord); } + + /// @brief Return a const reference to the given Coord component. + /// @warning The argument is assumed to be 0, 1, or 2. + __hostdev__ const ValueType& operator[](IndexType i) const { return mVec[i]; } + + /// @brief Return a non-const reference to the given Coord component. + /// @warning The argument is assumed to be 0, 1, or 2. + __hostdev__ ValueType& operator[](IndexType i) { return mVec[i]; } + + /// @brief Assignment operator that works with openvdb::Coord + template + __hostdev__ Coord& operator=(const CoordT& other) + { + static_assert(sizeof(Coord) == sizeof(CoordT), "Mis-matched sizeof"); + mVec[0] = other[0]; + mVec[1] = other[1]; + mVec[2] = other[2]; + return *this; + } + + /// @brief Return a new instance with coordinates masked by the given unsigned integer. + __hostdev__ Coord operator&(IndexType n) const { return Coord(mVec[0] & n, mVec[1] & n, mVec[2] & n); } + + // @brief Return a new instance with coordinates left-shifted by the given unsigned integer. 
+ __hostdev__ Coord operator<<(IndexType n) const { return Coord(mVec[0] << n, mVec[1] << n, mVec[2] << n); } + + // @brief Return a new instance with coordinates right-shifted by the given unsigned integer. + __hostdev__ Coord operator>>(IndexType n) const { return Coord(mVec[0] >> n, mVec[1] >> n, mVec[2] >> n); } + + /// @brief Return true if this Coord is lexicographically less than the given Coord. + __hostdev__ bool operator<(const Coord& rhs) const + { + return mVec[0] < rhs[0] ? true + : mVec[0] > rhs[0] ? false + : mVec[1] < rhs[1] ? true + : mVec[1] > rhs[1] ? false + : mVec[2] < rhs[2] ? true : false; + } + + /// @brief Return true if this Coord is lexicographically less or equal to the given Coord. + __hostdev__ bool operator<=(const Coord& rhs) const + { + return mVec[0] < rhs[0] ? true + : mVec[0] > rhs[0] ? false + : mVec[1] < rhs[1] ? true + : mVec[1] > rhs[1] ? false + : mVec[2] <=rhs[2] ? true : false; + } + + // @brief Return true if this Coord is lexicographically greater than the given Coord. + __hostdev__ bool operator>(const Coord& rhs) const + { + return mVec[0] > rhs[0] ? true + : mVec[0] < rhs[0] ? false + : mVec[1] > rhs[1] ? true + : mVec[1] < rhs[1] ? false + : mVec[2] > rhs[2] ? true : false; + } + + // @brief Return true if this Coord is lexicographically greater or equal to the given Coord. + __hostdev__ bool operator>=(const Coord& rhs) const + { + return mVec[0] > rhs[0] ? true + : mVec[0] < rhs[0] ? false + : mVec[1] > rhs[1] ? true + : mVec[1] < rhs[1] ? false + : mVec[2] >=rhs[2] ? true : false; + } + + // @brief Return true if the Coord components are identical. + __hostdev__ bool operator==(const Coord& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } + __hostdev__ bool operator!=(const Coord& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } + __hostdev__ Coord& operator&=(int n) + { + mVec[0] &= n; + mVec[1] &= n; + mVec[2] &= n; + return *this; + } + __hostdev__ Coord& operator<<=(uint32_t n) + { + mVec[0] <<= n; + mVec[1] <<= n; + mVec[2] <<= n; + return *this; + } + __hostdev__ Coord& operator>>=(uint32_t n) + { + mVec[0] >>= n; + mVec[1] >>= n; + mVec[2] >>= n; + return *this; + } + __hostdev__ Coord& operator+=(int n) + { + mVec[0] += n; + mVec[1] += n; + mVec[2] += n; + return *this; + } + __hostdev__ Coord operator+(const Coord& rhs) const { return Coord(mVec[0] + rhs[0], mVec[1] + rhs[1], mVec[2] + rhs[2]); } + __hostdev__ Coord operator-(const Coord& rhs) const { return Coord(mVec[0] - rhs[0], mVec[1] - rhs[1], mVec[2] - rhs[2]); } + __hostdev__ Coord operator-() const { return Coord(-mVec[0], -mVec[1], -mVec[2]); } + __hostdev__ Coord& operator+=(const Coord& rhs) + { + mVec[0] += rhs[0]; + mVec[1] += rhs[1]; + mVec[2] += rhs[2]; + return *this; + } + __hostdev__ Coord& operator-=(const Coord& rhs) + { + mVec[0] -= rhs[0]; + mVec[1] -= rhs[1]; + mVec[2] -= rhs[2]; + return *this; + } + + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Coord& minComponent(const Coord& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. 
+ __hostdev__ Coord& maxComponent(const Coord& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + return *this; + } +#if defined(__CUDACC__) // the following functions only run on the GPU! + __device__ inline Coord& minComponentAtomic(const Coord& other) + { + atomicMin(&mVec[0], other[0]); + atomicMin(&mVec[1], other[1]); + atomicMin(&mVec[2], other[2]); + return *this; + } + __device__ inline Coord& maxComponentAtomic(const Coord& other) + { + atomicMax(&mVec[0], other[0]); + atomicMax(&mVec[1], other[1]); + atomicMax(&mVec[2], other[2]); + return *this; + } +#endif + + __hostdev__ Coord offsetBy(ValueType dx, ValueType dy, ValueType dz) const + { + return Coord(mVec[0] + dx, mVec[1] + dy, mVec[2] + dz); + } + + __hostdev__ Coord offsetBy(ValueType n) const { return this->offsetBy(n, n, n); } + + /// Return true if any of the components of @a a are smaller than the + /// corresponding components of @a b. + __hostdev__ static inline bool lessThan(const Coord& a, const Coord& b) + { + return (a[0] < b[0] || a[1] < b[1] || a[2] < b[2]); + } + + /// @brief Return the largest integer coordinates that are not greater + /// than @a xyz (node centered conversion). + template + __hostdev__ static Coord Floor(const Vec3T& xyz) { return Coord(math::Floor(xyz[0]), math::Floor(xyz[1]), math::Floor(xyz[2])); } + + /// @brief Return a hash key derived from the existing coordinates. + /// @details The hash function is originally taken from the SIGGRAPH paper: + /// "VDB: High-resolution sparse volumes with dynamic topology" + /// and the prime numbers are modified based on the ACM Transactions on Graphics paper: + /// "Real-time 3D reconstruction at scale using voxel hashing" (the second number had a typo!) 
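+    /// @par Example
+    /// A minimal sketch (values are illustrative) of using the hash as a bucket index into a
+    /// spatial hash table with 2^12 entries:
+    /// @code
+    /// nanovdb::math::Coord ijk(1, 2, 3);
+    /// uint32_t bucket = ijk.hash<12>(); // bucket is guaranteed to be < 4096
+    /// @endcode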
+ template + __hostdev__ uint32_t hash() const { return ((1 << Log2N) - 1) & (mVec[0] * 73856093 ^ mVec[1] * 19349669 ^ mVec[2] * 83492791); } + + /// @brief Return the octant of this Coord + //__hostdev__ size_t octant() const { return (uint32_t(mVec[0])>>31) | ((uint32_t(mVec[1])>>31)<<1) | ((uint32_t(mVec[2])>>31)<<2); } + __hostdev__ uint8_t octant() const { return (uint8_t(bool(mVec[0] & (1u << 31)))) | + (uint8_t(bool(mVec[1] & (1u << 31))) << 1) | + (uint8_t(bool(mVec[2] & (1u << 31))) << 2); } + + /// @brief Return a single precision floating-point vector of this coordinate + __hostdev__ inline Vec3 asVec3s() const; + + /// @brief Return a double precision floating-point vector of this coordinate + __hostdev__ inline Vec3 asVec3d() const; + + // returns a copy of itself, so it mimics the behaviour of Vec3::round() + __hostdev__ inline Coord round() const { return *this; } +}; // Coord class + +// ----------------------------> Vec3 <-------------------------------------- + +/// @brief A simple vector class with three components, similar to openvdb::math::Vec3 +template +class Vec3 +{ + T mVec[3]; + +public: + static const int SIZE = 3; + static const int size = 3; // in openvdb::math::Tuple + using ValueType = T; + Vec3() = default; + __hostdev__ explicit Vec3(T x) + : mVec{x, x, x} + { + } + __hostdev__ Vec3(T x, T y, T z) + : mVec{x, y, z} + { + } + template class Vec3T, class T2> + __hostdev__ Vec3(const Vec3T& v) + : mVec{T(v[0]), T(v[1]), T(v[2])} + { + static_assert(Vec3T::size == size, "expected Vec3T::size==3!"); + } + template + __hostdev__ explicit Vec3(const Vec3& v) + : mVec{T(v[0]), T(v[1]), T(v[2])} + { + } + __hostdev__ explicit Vec3(const Coord& ijk) + : mVec{T(ijk[0]), T(ijk[1]), T(ijk[2])} + { + } + __hostdev__ bool operator==(const Vec3& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2]; } + __hostdev__ bool operator!=(const Vec3& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2]; } + template class Vec3T, class T2> + __hostdev__ Vec3& operator=(const Vec3T& rhs) + { + static_assert(Vec3T::size == size, "expected Vec3T::size==3!"); + mVec[0] = rhs[0]; + mVec[1] = rhs[1]; + mVec[2] = rhs[2]; + return *this; + } + __hostdev__ const T& operator[](int i) const { return mVec[i]; } + __hostdev__ T& operator[](int i) { return mVec[i]; } + template + __hostdev__ T dot(const Vec3T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2]; } + template + __hostdev__ Vec3 cross(const Vec3T& v) const + { + return Vec3(mVec[1] * v[2] - mVec[2] * v[1], + mVec[2] * v[0] - mVec[0] * v[2], + mVec[0] * v[1] - mVec[1] * v[0]); + } + __hostdev__ T lengthSqr() const + { + return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2]; // 5 flops + } + __hostdev__ T length() const { return Sqrt(this->lengthSqr()); } + __hostdev__ Vec3 operator-() const { return Vec3(-mVec[0], -mVec[1], -mVec[2]); } + __hostdev__ Vec3 operator*(const Vec3& v) const { return Vec3(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2]); } + __hostdev__ Vec3 operator/(const Vec3& v) const { return Vec3(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2]); } + __hostdev__ Vec3 operator+(const Vec3& v) const { return Vec3(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2]); } + __hostdev__ Vec3 operator-(const Vec3& v) const { return Vec3(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2]); } + __hostdev__ Vec3 operator+(const Coord& ijk) const { return Vec3(mVec[0] + ijk[0], mVec[1] + ijk[1], mVec[2] + ijk[2]); } + __hostdev__ Vec3 operator-(const Coord& 
ijk) const { return Vec3(mVec[0] - ijk[0], mVec[1] - ijk[1], mVec[2] - ijk[2]); } + __hostdev__ Vec3 operator*(const T& s) const { return Vec3(s * mVec[0], s * mVec[1], s * mVec[2]); } + __hostdev__ Vec3 operator/(const T& s) const { return (T(1) / s) * (*this); } + __hostdev__ Vec3& operator+=(const Vec3& v) + { + mVec[0] += v[0]; + mVec[1] += v[1]; + mVec[2] += v[2]; + return *this; + } + __hostdev__ Vec3& operator+=(const Coord& ijk) + { + mVec[0] += T(ijk[0]); + mVec[1] += T(ijk[1]); + mVec[2] += T(ijk[2]); + return *this; + } + __hostdev__ Vec3& operator-=(const Vec3& v) + { + mVec[0] -= v[0]; + mVec[1] -= v[1]; + mVec[2] -= v[2]; + return *this; + } + __hostdev__ Vec3& operator-=(const Coord& ijk) + { + mVec[0] -= T(ijk[0]); + mVec[1] -= T(ijk[1]); + mVec[2] -= T(ijk[2]); + return *this; + } + __hostdev__ Vec3& operator*=(const T& s) + { + mVec[0] *= s; + mVec[1] *= s; + mVec[2] *= s; + return *this; + } + __hostdev__ Vec3& operator/=(const T& s) { return (*this) *= T(1) / s; } + __hostdev__ Vec3& normalize() { return (*this) /= this->length(); } + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Vec3& minComponent(const Vec3& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. + __hostdev__ Vec3& maxComponent(const Vec3& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + return *this; + } + /// @brief Return the smallest vector component + __hostdev__ ValueType min() const + { + return mVec[0] < mVec[1] ? (mVec[0] < mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] < mVec[2] ? mVec[1] : mVec[2]); + } + /// @brief Return the largest vector component + __hostdev__ ValueType max() const + { + return mVec[0] > mVec[1] ? (mVec[0] > mVec[2] ? mVec[0] : mVec[2]) : (mVec[1] > mVec[2] ? 
mVec[1] : mVec[2]); + } + /// @brief Round each component if this Vec up to its integer value + /// @return Return an integer Coord + __hostdev__ Coord floor() const { return Coord(Floor(mVec[0]), Floor(mVec[1]), Floor(mVec[2])); } + /// @brief Round each component if this Vec down to its integer value + /// @return Return an integer Coord + __hostdev__ Coord ceil() const { return Coord(Ceil(mVec[0]), Ceil(mVec[1]), Ceil(mVec[2])); } + /// @brief Round each component if this Vec to its closest integer value + /// @return Return an integer Coord + __hostdev__ Coord round() const + { + if (util::is_same::value) { + return Coord(Floor(mVec[0] + 0.5f), Floor(mVec[1] + 0.5f), Floor(mVec[2] + 0.5f)); + } else if (util::is_same::value) { + return Coord(mVec[0], mVec[1], mVec[2]); + } else { + return Coord(Floor(mVec[0] + 0.5), Floor(mVec[1] + 0.5), Floor(mVec[2] + 0.5)); + } + } + + /// @brief return a non-const raw constant pointer to array of three vector components + __hostdev__ T* asPointer() { return mVec; } + /// @brief return a const raw constant pointer to array of three vector components + __hostdev__ const T* asPointer() const { return mVec; } +}; // Vec3 + +template +__hostdev__ inline Vec3 operator*(T1 scalar, const Vec3& vec) +{ + return Vec3(scalar * vec[0], scalar * vec[1], scalar * vec[2]); +} +template +__hostdev__ inline Vec3 operator/(T1 scalar, const Vec3& vec) +{ + return Vec3(scalar / vec[0], scalar / vec[1], scalar / vec[2]); +} + +/// @brief Return a single precision floating-point vector of this coordinate +__hostdev__ inline Vec3 Coord::asVec3s() const +{ + return Vec3(float(mVec[0]), float(mVec[1]), float(mVec[2])); +} + +/// @brief Return a double precision floating-point vector of this coordinate +__hostdev__ inline Vec3 Coord::asVec3d() const +{ + return Vec3(double(mVec[0]), double(mVec[1]), double(mVec[2])); +} + +// ----------------------------> Vec4 <-------------------------------------- + +/// @brief A simple vector class with four components, similar to openvdb::math::Vec4 +template +class Vec4 +{ + T mVec[4]; + +public: + static const int SIZE = 4; + static const int size = 4; + using ValueType = T; + Vec4() = default; + __hostdev__ explicit Vec4(T x) + : mVec{x, x, x, x} + { + } + __hostdev__ Vec4(T x, T y, T z, T w) + : mVec{x, y, z, w} + { + } + template + __hostdev__ explicit Vec4(const Vec4& v) + : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])} + { + } + template class Vec4T, class T2> + __hostdev__ Vec4(const Vec4T& v) + : mVec{T(v[0]), T(v[1]), T(v[2]), T(v[3])} + { + static_assert(Vec4T::size == size, "expected Vec4T::size==4!"); + } + __hostdev__ bool operator==(const Vec4& rhs) const { return mVec[0] == rhs[0] && mVec[1] == rhs[1] && mVec[2] == rhs[2] && mVec[3] == rhs[3]; } + __hostdev__ bool operator!=(const Vec4& rhs) const { return mVec[0] != rhs[0] || mVec[1] != rhs[1] || mVec[2] != rhs[2] || mVec[3] != rhs[3]; } + template class Vec4T, class T2> + __hostdev__ Vec4& operator=(const Vec4T& rhs) + { + static_assert(Vec4T::size == size, "expected Vec4T::size==4!"); + mVec[0] = rhs[0]; + mVec[1] = rhs[1]; + mVec[2] = rhs[2]; + mVec[3] = rhs[3]; + return *this; + } + + __hostdev__ const T& operator[](int i) const { return mVec[i]; } + __hostdev__ T& operator[](int i) { return mVec[i]; } + template + __hostdev__ T dot(const Vec4T& v) const { return mVec[0] * v[0] + mVec[1] * v[1] + mVec[2] * v[2] + mVec[3] * v[3]; } + __hostdev__ T lengthSqr() const + { + return mVec[0] * mVec[0] + mVec[1] * mVec[1] + mVec[2] * mVec[2] + mVec[3] * mVec[3]; // 7 flops + } + 
__hostdev__ T length() const { return Sqrt(this->lengthSqr()); } + __hostdev__ Vec4 operator-() const { return Vec4(-mVec[0], -mVec[1], -mVec[2], -mVec[3]); } + __hostdev__ Vec4 operator*(const Vec4& v) const { return Vec4(mVec[0] * v[0], mVec[1] * v[1], mVec[2] * v[2], mVec[3] * v[3]); } + __hostdev__ Vec4 operator/(const Vec4& v) const { return Vec4(mVec[0] / v[0], mVec[1] / v[1], mVec[2] / v[2], mVec[3] / v[3]); } + __hostdev__ Vec4 operator+(const Vec4& v) const { return Vec4(mVec[0] + v[0], mVec[1] + v[1], mVec[2] + v[2], mVec[3] + v[3]); } + __hostdev__ Vec4 operator-(const Vec4& v) const { return Vec4(mVec[0] - v[0], mVec[1] - v[1], mVec[2] - v[2], mVec[3] - v[3]); } + __hostdev__ Vec4 operator*(const T& s) const { return Vec4(s * mVec[0], s * mVec[1], s * mVec[2], s * mVec[3]); } + __hostdev__ Vec4 operator/(const T& s) const { return (T(1) / s) * (*this); } + __hostdev__ Vec4& operator+=(const Vec4& v) + { + mVec[0] += v[0]; + mVec[1] += v[1]; + mVec[2] += v[2]; + mVec[3] += v[3]; + return *this; + } + __hostdev__ Vec4& operator-=(const Vec4& v) + { + mVec[0] -= v[0]; + mVec[1] -= v[1]; + mVec[2] -= v[2]; + mVec[3] -= v[3]; + return *this; + } + __hostdev__ Vec4& operator*=(const T& s) + { + mVec[0] *= s; + mVec[1] *= s; + mVec[2] *= s; + mVec[3] *= s; + return *this; + } + __hostdev__ Vec4& operator/=(const T& s) { return (*this) *= T(1) / s; } + __hostdev__ Vec4& normalize() { return (*this) /= this->length(); } + /// @brief Perform a component-wise minimum with the other Coord. + __hostdev__ Vec4& minComponent(const Vec4& other) + { + if (other[0] < mVec[0]) + mVec[0] = other[0]; + if (other[1] < mVec[1]) + mVec[1] = other[1]; + if (other[2] < mVec[2]) + mVec[2] = other[2]; + if (other[3] < mVec[3]) + mVec[3] = other[3]; + return *this; + } + + /// @brief Perform a component-wise maximum with the other Coord. + __hostdev__ Vec4& maxComponent(const Vec4& other) + { + if (other[0] > mVec[0]) + mVec[0] = other[0]; + if (other[1] > mVec[1]) + mVec[1] = other[1]; + if (other[2] > mVec[2]) + mVec[2] = other[2]; + if (other[3] > mVec[3]) + mVec[3] = other[3]; + return *this; + } +}; // Vec4 + +template +__hostdev__ inline Vec4 operator*(T1 scalar, const Vec4& vec) +{ + return Vec4(scalar * vec[0], scalar * vec[1], scalar * vec[2], scalar * vec[3]); +} +template +__hostdev__ inline Vec4 operator/(T1 scalar, const Vec4& vec) +{ + return Vec4(scalar / vec[0], scalar / vec[1], scalar / vec[2], scalar / vec[3]); +} + +// ----------------------------> matMult <-------------------------------------- + +/// @brief Multiply a 3x3 matrix and a 3d vector using 32bit floating point arithmetics +/// @note This corresponds to a linear mapping, e.g. scaling, rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the matrix +/// @return result of matrix-vector multiplication, i.e. mat x xyz +template +__hostdev__ inline Vec3T matMult(const float* mat, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), + fmaf(static_cast(xyz[0]), mat[3], fmaf(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), + fmaf(static_cast(xyz[0]), mat[6], fmaf(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply a 3x3 matrix and a 3d vector using 64bit floating point arithmetics +/// @note This corresponds to a linear mapping, e.g. 
scaling, rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the matrix +/// @return result of matrix-vector multiplication, i.e. mat x xyz +template +__hostdev__ inline Vec3T matMult(const double* mat, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], static_cast(xyz[2]) * mat[2])), + fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[5])), + fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply a 3x3 matrix to a 3d vector and add another 3d vector using 32bit floating point arithmetics +/// @note This corresponds to an affine transformation, i.e a linear mapping followed by a translation. e.g. scale/rotation and translation +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param vec 3d vector to be added AFTER the matrix multiplication +/// @param xyz input vector to be multiplied by the matrix and a translated by @c vec +/// @return result of affine transformation, i.e. (mat x xyz) + vec +template +__hostdev__ inline Vec3T matMult(const float* mat, const float* vec, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[1], fmaf(static_cast(xyz[2]), mat[2], vec[0]))), + fmaf(static_cast(xyz[0]), mat[3], fmaf(static_cast(xyz[1]), mat[4], fmaf(static_cast(xyz[2]), mat[5], vec[1]))), + fmaf(static_cast(xyz[0]), mat[6], fmaf(static_cast(xyz[1]), mat[7], fmaf(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fmaf = 9 flops +} + +/// @brief Multiply a 3x3 matrix to a 3d vector and add another 3d vector using 64bit floating point arithmetics +/// @note This corresponds to an affine transformation, i.e a linear mapping followed by a translation. e.g. scale/rotation and translation +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param vec 3d vector to be added AFTER the matrix multiplication +/// @param xyz input vector to be multiplied by the matrix and a translated by @c vec +/// @return result of affine transformation, i.e. (mat x xyz) + vec +template +__hostdev__ inline Vec3T matMult(const double* mat, const double* vec, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[1], fma(static_cast(xyz[2]), mat[2], vec[0]))), + fma(static_cast(xyz[0]), mat[3], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[5], vec[1]))), + fma(static_cast(xyz[0]), mat[6], fma(static_cast(xyz[1]), mat[7], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops +} + +/// @brief Multiply the transposed of a 3x3 matrix and a 3d vector using 32bit floating point arithmetics +/// @note This corresponds to an inverse linear mapping, e.g. inverse scaling, inverse rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the transposed matrix +/// @return result of matrix-vector multiplication, i.e. 
mat^T x xyz +template +__hostdev__ inline Vec3T matMultT(const float* mat, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), + fmaf(static_cast(xyz[0]), mat[1], fmaf(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), + fmaf(static_cast(xyz[0]), mat[2], fmaf(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +/// @brief Multiply the transposed of a 3x3 matrix and a 3d vector using 64bit floating point arithmetics +/// @note This corresponds to an inverse linear mapping, e.g. inverse scaling, inverse rotation etc. +/// @tparam Vec3T Template type of the input and output 3d vectors +/// @param mat pointer to an array of floats with the 3x3 matrix +/// @param xyz input vector to be multiplied by the transposed matrix +/// @return result of matrix-vector multiplication, i.e. mat^T x xyz +template +__hostdev__ inline Vec3T matMultT(const double* mat, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], static_cast(xyz[2]) * mat[6])), + fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], static_cast(xyz[2]) * mat[7])), + fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], static_cast(xyz[2]) * mat[8]))); // 6 fmaf + 3 mult = 9 flops +} + +template +__hostdev__ inline Vec3T matMultT(const float* mat, const float* vec, const Vec3T& xyz) +{ + return Vec3T(fmaf(static_cast(xyz[0]), mat[0], fmaf(static_cast(xyz[1]), mat[3], fmaf(static_cast(xyz[2]), mat[6], vec[0]))), + fmaf(static_cast(xyz[0]), mat[1], fmaf(static_cast(xyz[1]), mat[4], fmaf(static_cast(xyz[2]), mat[7], vec[1]))), + fmaf(static_cast(xyz[0]), mat[2], fmaf(static_cast(xyz[1]), mat[5], fmaf(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fmaf = 9 flops +} + +template +__hostdev__ inline Vec3T matMultT(const double* mat, const double* vec, const Vec3T& xyz) +{ + return Vec3T(fma(static_cast(xyz[0]), mat[0], fma(static_cast(xyz[1]), mat[3], fma(static_cast(xyz[2]), mat[6], vec[0]))), + fma(static_cast(xyz[0]), mat[1], fma(static_cast(xyz[1]), mat[4], fma(static_cast(xyz[2]), mat[7], vec[1]))), + fma(static_cast(xyz[0]), mat[2], fma(static_cast(xyz[1]), mat[5], fma(static_cast(xyz[2]), mat[8], vec[2])))); // 9 fma = 9 flops +} + +// ----------------------------> BBox <------------------------------------- + +// Base-class for static polymorphism (cannot be constructed directly) +template +struct BaseBBox +{ + Vec3T mCoord[2]; + __hostdev__ bool operator==(const BaseBBox& rhs) const { return mCoord[0] == rhs.mCoord[0] && mCoord[1] == rhs.mCoord[1]; }; + __hostdev__ bool operator!=(const BaseBBox& rhs) const { return mCoord[0] != rhs.mCoord[0] || mCoord[1] != rhs.mCoord[1]; }; + __hostdev__ const Vec3T& operator[](int i) const { return mCoord[i]; } + __hostdev__ Vec3T& operator[](int i) { return mCoord[i]; } + __hostdev__ Vec3T& min() { return mCoord[0]; } + __hostdev__ Vec3T& max() { return mCoord[1]; } + __hostdev__ const Vec3T& min() const { return mCoord[0]; } + __hostdev__ const Vec3T& max() const { return mCoord[1]; } + __hostdev__ BaseBBox& translate(const Vec3T& xyz) + { + mCoord[0] += xyz; + mCoord[1] += xyz; + return *this; + } + /// @brief Expand this bounding box to enclose point @c xyz. + __hostdev__ BaseBBox& expand(const Vec3T& xyz) + { + mCoord[0].minComponent(xyz); + mCoord[1].maxComponent(xyz); + return *this; + } + + /// @brief Expand this bounding box to enclose the given bounding box. 
+ __hostdev__ BaseBBox& expand(const BaseBBox& bbox) + { + mCoord[0].minComponent(bbox[0]); + mCoord[1].maxComponent(bbox[1]); + return *this; + } + + /// @brief Intersect this bounding box with the given bounding box. + __hostdev__ BaseBBox& intersect(const BaseBBox& bbox) + { + mCoord[0].maxComponent(bbox[0]); + mCoord[1].minComponent(bbox[1]); + return *this; + } + + //__hostdev__ BaseBBox expandBy(typename Vec3T::ValueType padding) const + //{ + // return BaseBBox(mCoord[0].offsetBy(-padding),mCoord[1].offsetBy(padding)); + //} + __hostdev__ bool isInside(const Vec3T& xyz) + { + if (xyz[0] < mCoord[0][0] || xyz[1] < mCoord[0][1] || xyz[2] < mCoord[0][2]) + return false; + if (xyz[0] > mCoord[1][0] || xyz[1] > mCoord[1][1] || xyz[2] > mCoord[1][2]) + return false; + return true; + } + +protected: + __hostdev__ BaseBBox() {} + __hostdev__ BaseBBox(const Vec3T& min, const Vec3T& max) + : mCoord{min, max} + { + } +}; // BaseBBox + +template::value> +struct BBox; + +/// @brief Partial template specialization for floating point coordinate types. +/// +/// @note Min is inclusive and max is exclusive. If min = max the dimension of +/// the bounding box is zero and therefore it is also empty. +template +struct BBox : public BaseBBox +{ + using Vec3Type = Vec3T; + using ValueType = typename Vec3T::ValueType; + static_assert(util::is_floating_point::value, "Expected a floating point coordinate type"); + using BaseT = BaseBBox; + using BaseT::mCoord; + /// @brief Default construction sets BBox to an empty bbox + __hostdev__ BBox() + : BaseT(Vec3T( Maximum::value()), + Vec3T(-Maximum::value())) + { + } + __hostdev__ BBox(const Vec3T& min, const Vec3T& max) + : BaseT(min, max) + { + } + __hostdev__ BBox(const Coord& min, const Coord& max) + : BaseT(Vec3T(ValueType(min[0]), ValueType(min[1]), ValueType(min[2])), + Vec3T(ValueType(max[0] + 1), ValueType(max[1] + 1), ValueType(max[2] + 1))) + { + } + __hostdev__ static BBox createCube(const Coord& min, typename Coord::ValueType dim) + { + return BBox(min, min.offsetBy(dim)); + } + + __hostdev__ BBox(const BaseBBox& bbox) + : BBox(bbox[0], bbox[1]) + { + } + __hostdev__ bool empty() const { return mCoord[0][0] >= mCoord[1][0] || + mCoord[0][1] >= mCoord[1][1] || + mCoord[0][2] >= mCoord[1][2]; } + __hostdev__ operator bool() const { return mCoord[0][0] < mCoord[1][0] && + mCoord[0][1] < mCoord[1][1] && + mCoord[0][2] < mCoord[1][2]; } + __hostdev__ Vec3T dim() const { return *this ? this->max() - this->min() : Vec3T(0); } + __hostdev__ bool isInside(const Vec3T& p) const + { + return p[0] > mCoord[0][0] && p[1] > mCoord[0][1] && p[2] > mCoord[0][2] && + p[0] < mCoord[1][0] && p[1] < mCoord[1][1] && p[2] < mCoord[1][2]; + } + +}; // BBox + +/// @brief Partial template specialization for integer coordinate types +/// +/// @note Both min and max are INCLUDED in the bbox so dim = max - min + 1. So, +/// if min = max the bounding box contains exactly one point and dim = 1! +template +struct BBox : public BaseBBox +{ + static_assert(util::is_same::value, "Expected \"int\" coordinate type"); + using BaseT = BaseBBox; + using BaseT::mCoord; + /// @brief Iterator over the domain covered by a BBox + /// @details z is the fastest-moving coordinate. 
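+    /// @par Example
+    /// A minimal usage sketch (coordinates are illustrative) that visits every voxel in a box,
+    /// with the z component varying fastest:
+    /// @code
+    /// nanovdb::CoordBBox bbox(nanovdb::Coord(0), nanovdb::Coord(7));
+    /// for (auto it = bbox.begin(); it; ++it) {
+    ///     const nanovdb::Coord& ijk = *it; // (0,0,0), (0,0,1), ..., (7,7,7)
+    /// }
+    /// @endcode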
+ class Iterator + { + const BBox& mBBox; + CoordT mPos; + + public: + __hostdev__ Iterator(const BBox& b) + : mBBox(b) + , mPos(b.min()) + { + } + __hostdev__ Iterator(const BBox& b, const Coord& p) + : mBBox(b) + , mPos(p) + { + } + __hostdev__ Iterator& operator++() + { + if (mPos[2] < mBBox[1][2]) { // this is the most common case + ++mPos[2];// increment z + } else if (mPos[1] < mBBox[1][1]) { + mPos[2] = mBBox[0][2];// reset z + ++mPos[1];// increment y + } else if (mPos[0] <= mBBox[1][0]) { + mPos[2] = mBBox[0][2];// reset z + mPos[1] = mBBox[0][1];// reset y + ++mPos[0];// increment x + } + return *this; + } + __hostdev__ Iterator operator++(int) + { + auto tmp = *this; + ++(*this); + return tmp; + } + __hostdev__ bool operator==(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos == rhs.mPos; + } + __hostdev__ bool operator!=(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos != rhs.mPos; + } + __hostdev__ bool operator<(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos < rhs.mPos; + } + __hostdev__ bool operator<=(const Iterator& rhs) const + { + NANOVDB_ASSERT(mBBox == rhs.mBBox); + return mPos <= rhs.mPos; + } + /// @brief Return @c true if the iterator still points to a valid coordinate. + __hostdev__ operator bool() const { return mPos <= mBBox[1]; } + __hostdev__ const CoordT& operator*() const { return mPos; } + }; // Iterator + __hostdev__ Iterator begin() const { return Iterator{*this}; } + __hostdev__ Iterator end() const { return Iterator{*this, CoordT(mCoord[1][0]+1, mCoord[0][1], mCoord[0][2])}; } + __hostdev__ BBox() + : BaseT(CoordT::max(), CoordT::min()) + { + } + __hostdev__ BBox(const CoordT& min, const CoordT& max) + : BaseT(min, max) + { + } + + template + __hostdev__ BBox(BBox& other, const SplitT&) + : BaseT(other.mCoord[0], other.mCoord[1]) + { + NANOVDB_ASSERT(this->is_divisible()); + const int n = MaxIndex(this->dim()); + mCoord[1][n] = (mCoord[0][n] + mCoord[1][n]) >> 1; + other.mCoord[0][n] = mCoord[1][n] + 1; + } + + __hostdev__ static BBox createCube(const CoordT& min, typename CoordT::ValueType dim) + { + return BBox(min, min.offsetBy(dim - 1)); + } + + __hostdev__ static BBox createCube(typename CoordT::ValueType min, typename CoordT::ValueType max) + { + return BBox(CoordT(min), CoordT(max)); + } + + __hostdev__ bool is_divisible() const { return mCoord[0][0] < mCoord[1][0] && + mCoord[0][1] < mCoord[1][1] && + mCoord[0][2] < mCoord[1][2]; } + /// @brief Return true if this bounding box is empty, e.g. uninitialized + __hostdev__ bool empty() const { return mCoord[0][0] > mCoord[1][0] || + mCoord[0][1] > mCoord[1][1] || + mCoord[0][2] > mCoord[1][2]; } + /// @brief Convert this BBox to boolean true if it is not empty + __hostdev__ operator bool() const { return mCoord[0][0] <= mCoord[1][0] && + mCoord[0][1] <= mCoord[1][1] && + mCoord[0][2] <= mCoord[1][2]; } + __hostdev__ CoordT dim() const { return *this ? this->max() - this->min() + Coord(1) : Coord(0); } + __hostdev__ uint64_t volume() const + { + auto d = this->dim(); + return uint64_t(d[0]) * uint64_t(d[1]) * uint64_t(d[2]); + } + __hostdev__ bool isInside(const CoordT& p) const { return !(CoordT::lessThan(p, this->min()) || CoordT::lessThan(this->max(), p)); } + /// @brief Return @c true if the given bounding box is inside this bounding box. 
+ __hostdev__ bool isInside(const BBox& b) const + { + return !(CoordT::lessThan(b.min(), this->min()) || CoordT::lessThan(this->max(), b.max())); + } + + /// @brief Return @c true if the given bounding box overlaps with this bounding box. + __hostdev__ bool hasOverlap(const BBox& b) const + { + return !(CoordT::lessThan(this->max(), b.min()) || CoordT::lessThan(b.max(), this->min())); + } + + /// @warning This converts a CoordBBox into a floating-point bounding box which implies that max += 1 ! + template + __hostdev__ BBox> asReal() const + { + static_assert(util::is_floating_point::value, "CoordBBox::asReal: Expected a floating point coordinate"); + return BBox>(Vec3(RealT(mCoord[0][0]), RealT(mCoord[0][1]), RealT(mCoord[0][2])), + Vec3(RealT(mCoord[1][0] + 1), RealT(mCoord[1][1] + 1), RealT(mCoord[1][2] + 1))); + } + /// @brief Return a new instance that is expanded by the specified padding. + __hostdev__ BBox expandBy(typename CoordT::ValueType padding) const + { + return BBox(mCoord[0].offsetBy(-padding), mCoord[1].offsetBy(padding)); + } + + /// @brief @brief transform this coordinate bounding box by the specified map + /// @param map mapping of index to world coordinates + /// @return world bounding box + template + __hostdev__ auto transform(const Map& map) const + { + using Vec3T = Vec3; + const Vec3T tmp = map.applyMap(Vec3T(mCoord[0][0], mCoord[0][1], mCoord[0][2])); + BBox bbox(tmp, tmp);// return value + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[0][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[1][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[0][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[1][1], mCoord[0][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[0][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[0][0], mCoord[1][1], mCoord[1][2]))); + bbox.expand(map.applyMap(Vec3T(mCoord[1][0], mCoord[1][1], mCoord[1][2]))); + return bbox; + } + +#if defined(__CUDACC__) // the following functions only run on the GPU! + __device__ inline BBox& expandAtomic(const CoordT& ijk) + { + mCoord[0].minComponentAtomic(ijk); + mCoord[1].maxComponentAtomic(ijk); + return *this; + } + __device__ inline BBox& expandAtomic(const BBox& bbox) + { + mCoord[0].minComponentAtomic(bbox[0]); + mCoord[1].maxComponentAtomic(bbox[1]); + return *this; + } + __device__ inline BBox& intersectAtomic(const BBox& bbox) + { + mCoord[0].maxComponentAtomic(bbox[0]); + mCoord[1].minComponentAtomic(bbox[1]); + return *this; + } +#endif +}; // BBox + +// --------------------------> Rgba8 <------------------------------------ + +/// @brief 8-bit red, green, blue, alpha packed into 32 bit unsigned int +class Rgba8 +{ + union + { + uint8_t c[4]; // 4 integer color channels of red, green, blue and alpha components. 
+ uint32_t packed; // 32 bit packed representation + } mData; + +public: + static const int SIZE = 4; + using ValueType = uint8_t; + + /// @brief Default copy constructor + Rgba8(const Rgba8&) = default; + + /// @brief Default move constructor + Rgba8(Rgba8&&) = default; + + /// @brief Default move assignment operator + /// @return non-const reference to this instance + Rgba8& operator=(Rgba8&&) = default; + + /// @brief Default copy assignment operator + /// @return non-const reference to this instance + Rgba8& operator=(const Rgba8&) = default; + + /// @brief Default ctor initializes all channels to zero + __hostdev__ Rgba8() + : mData{{0, 0, 0, 0}} + { + static_assert(sizeof(uint32_t) == sizeof(Rgba8), "Unexpected sizeof"); + } + + /// @brief integer r,g,b,a ctor where alpha channel defaults to opaque + /// @note all values should be in the range 0u to 255u + __hostdev__ Rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a = 255u) + : mData{{r, g, b, a}} + { + } + + /// @brief @brief ctor where all channels are initialized to the same value + /// @note value should be in the range 0u to 255u + explicit __hostdev__ Rgba8(uint8_t v) + : mData{{v, v, v, v}} + { + } + + /// @brief floating-point r,g,b,a ctor where alpha channel defaults to opaque + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(float r, float g, float b, float a = 1.0f) + : mData{{static_cast(0.5f + r * 255.0f), // round floats to nearest integers + static_cast(0.5f + g * 255.0f), // double {{}} is needed due to union + static_cast(0.5f + b * 255.0f), + static_cast(0.5f + a * 255.0f)}} + { + } + + /// @brief Vec3f r,g,b ctor (alpha channel it set to 1) + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(const Vec3& rgb) + : Rgba8(rgb[0], rgb[1], rgb[2]) + { + } + + /// @brief Vec4f r,g,b,a ctor + /// @note all values should be in the range 0.0f to 1.0f + __hostdev__ Rgba8(const Vec4& rgba) + : Rgba8(rgba[0], rgba[1], rgba[2], rgba[3]) + { + } + + __hostdev__ bool operator< (const Rgba8& rhs) const { return mData.packed < rhs.mData.packed; } + __hostdev__ bool operator==(const Rgba8& rhs) const { return mData.packed == rhs.mData.packed; } + __hostdev__ float lengthSqr() const + { + return 0.0000153787005f * (float(mData.c[0]) * mData.c[0] + + float(mData.c[1]) * mData.c[1] + + float(mData.c[2]) * mData.c[2]); //1/255^2 + } + __hostdev__ float length() const { return sqrtf(this->lengthSqr()); } + /// @brief return n'th color channel as a float in the range 0 to 1 + __hostdev__ float asFloat(int n) const { return 0.003921569f*float(mData.c[n]); }// divide by 255 + __hostdev__ const uint8_t& operator[](int n) const { return mData.c[n]; } + __hostdev__ uint8_t& operator[](int n) { return mData.c[n]; } + __hostdev__ const uint32_t& packed() const { return mData.packed; } + __hostdev__ uint32_t& packed() { return mData.packed; } + __hostdev__ const uint8_t& r() const { return mData.c[0]; } + __hostdev__ const uint8_t& g() const { return mData.c[1]; } + __hostdev__ const uint8_t& b() const { return mData.c[2]; } + __hostdev__ const uint8_t& a() const { return mData.c[3]; } + __hostdev__ uint8_t& r() { return mData.c[0]; } + __hostdev__ uint8_t& g() { return mData.c[1]; } + __hostdev__ uint8_t& b() { return mData.c[2]; } + __hostdev__ uint8_t& a() { return mData.c[3]; } + __hostdev__ operator Vec3() const { + return Vec3(this->asFloat(0), this->asFloat(1), this->asFloat(2)); + } + __hostdev__ operator Vec4() const { + return Vec4(this->asFloat(0), this->asFloat(1), 
this->asFloat(2), this->asFloat(3)); + } +}; // Rgba8 + +using Vec3d = Vec3; +using Vec3f = Vec3; +using Vec3i = Vec3; +using Vec3u = Vec3; +using Vec3u8 = Vec3; +using Vec3u16 = Vec3; + +using Vec4R = Vec4; +using Vec4d = Vec4; +using Vec4f = Vec4; +using Vec4i = Vec4; + +}// namespace math =============================================================== + +using Rgba8 [[deprecated("Use math::Rgba8 instead.")]] = math::Rgba8; +using math::Coord; + +using Vec3d = math::Vec3; +using Vec3f = math::Vec3; +using Vec3i = math::Vec3; +using Vec3u = math::Vec3; +using Vec3u8 = math::Vec3; +using Vec3u16 = math::Vec3; + +using Vec4R = math::Vec4; +using Vec4d = math::Vec4; +using Vec4f = math::Vec4; +using Vec4i = math::Vec4; + +using CoordBBox = math::BBox; +using Vec3dBBox = math::BBox; +using BBoxR [[deprecated("Use Vec3dBBox instead.")]] = math::BBox; + +} // namespace nanovdb =================================================================== + +#endif // end of NANOVDB_MATH_MATH_H_HAS_BEEN_INCLUDED diff --git a/warp/native/nanovdb/tools/cuda/PointsToGrid.cuh b/warp/native/nanovdb/tools/cuda/PointsToGrid.cuh new file mode 100644 index 00000000..3404d887 --- /dev/null +++ b/warp/native/nanovdb/tools/cuda/PointsToGrid.cuh @@ -0,0 +1,1291 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/tools/cuda/PointsToGrid.cuh + + \authors Greg Klar (initial version) and Ken Museth (final version) + + \brief Generates NanoVDB grids from a list of voxels or points on the device + + \warning The header file contains cuda device code so be sure + to only include it in .cu files (or other .cuh files) +*/ + +#ifndef NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED +#define NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + Note: 4.29 billion (=2^32) coordinates of type Vec3f have a memory footprint of 48 GB! +*/ + +namespace nanovdb {// ================================================================================ + +namespace tools { namespace cuda {// ============================================================================ + +/// @brief Generates a NanoGrid from a list of point coordinates on the device. This method is +/// mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering. +/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d. +/// @tparam BufferT Template type of buffer used for memory allocation on the device +/// @tparam AllocT Template type of optional device allocator for internal temporary memory +/// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device +/// @param pointCount number of point in the list @c d_world +/// @param voxelSize Size of a voxel in world units used for the output grid +/// @param type Defined the way point information is represented in the output grid (see PointType enum NanoVDB.h) +/// Should not be PointType::Disable! +/// @param buffer Instance of the device buffer used for memory allocation +/// @param stream optional CUDA stream (defaults to CUDA stream 0) +/// @return Returns a handle with a grid of type NanoGrid where point information, e.g. coordinates, +/// are represented as blind data defined by @c type. 
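+/// @par Example
+/// A minimal sketch of the intended call pattern; @c d_points and @c pointCount are placeholders for a
+/// device array of world-space points and its length, and the default device buffer type is assumed:
+/// @code
+/// // d_points: device pointer to pointCount world-space points (const nanovdb::Vec3f*)
+/// auto handle = nanovdb::tools::cuda::pointsToGrid(d_points, pointCount, /*voxelSize=*/0.5);
+/// auto* d_grid = handle.deviceGrid<nanovdb::Point>(); // NanoGrid<Point> residing on the device
+/// @endcode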
+template
+GridHandle
+pointsToGrid(const PtrT dWorldPoints,
+             int pointCount,
+             double voxelSize,
+             PointType type = PointType::Default,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+//-----------------------------------------------------------------------------------------------------
+
+/// @brief Generates a NanoGrid from a list of point coordinates on the device. This method is
+/// mainly used as a means to build a BVH acceleration structure for points, e.g. for efficient rendering.
+/// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world space. Dereferencing should return Vec3f or Vec3d.
+/// @tparam BufferT Template type of buffer used for memory allocation on the device
+/// @tparam AllocT Template type of optional device allocator for internal temporary memory
+/// @param dWorldPoints Raw or fancy pointer to list of point coordinates in world space on the device
+/// @param pointCount total number of points in the list @c dWorldPoints
+/// @param maxPointsPerVoxel Max density of points per voxel, i.e. maximum number of points in any voxel
+/// @param tolerance allow for point density to vary by the specified tolerance (defaults to 1). That is, the voxel size
+/// is selected such that the max density is +/- the tolerance.
+/// @param maxIterations Maximum number of iterations used to search for a voxel size that produces a point density
+/// within the specified tolerance.
+/// @param type Defines the way point information is represented in the output grid (see PointType enum in NanoVDB.h)
+/// Should not be PointType::Disable!
+/// @param buffer Instance of the device buffer used for memory allocation
+/// @param stream optional CUDA stream (defaults to CUDA stream 0)
+/// @return Returns a handle with a grid of type NanoGrid where point information, e.g. coordinates,
+/// are represented as blind data defined by @c type.
+template
+GridHandle
+pointsToGrid(const PtrT dWorldPoints,
+             int pointCount,
+             int maxPointPerVoxel,
+             int tolerance = 1,
+             int maxIterations = 10,
+             PointType type = PointType::Default,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+//-----------------------------------------------------------------------------------------------------
+
+template
+GridHandle
+pointsToGrid(std::vector> pointSet,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+//-----------------------------------------------------------------------------------------------------
+
+/// @brief Generates a NanoGrid of any type from a list of voxel coordinates on the device. Unlike @c cudaPointsToGrid
+/// this method only builds the grid but does not encode the coordinates as blind data. It is mainly useful as a
+/// means to generate a grid that is known to contain the voxels given in the list.
+/// @tparam BuildT Template type of the return grid
+/// @tparam PtrT Template type to a raw or fancy-pointer of voxel coordinates in grid (or index) space. Dereferencing should return Vec3f or Vec3d.
+/// @tparam BufferT Template type of buffer used for memory allocation on the device
+/// @tparam AllocT Template type of optional device allocator for internal temporary memory
+/// @param dGridVoxels Raw or fancy pointer to list of voxel coordinates in grid (or index) space on the device
+/// @param voxelCount number of voxels in the list @c dGridVoxels
+/// @param voxelSize Size of a voxel in world units used for the output grid
+/// @param buffer Instance of the device buffer used for memory allocation
+/// @return Returns a handle with the grid of type NanoGrid
+template
+GridHandle
+voxelsToGrid(const PtrT dGridVoxels,
+             size_t voxelCount,
+             double voxelSize = 1.0,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+//-------------------------------------------------------------------------------------------------------
+
+template
+GridHandle
+voxelsToGrid(std::vector> pointSet,
+             const BufferT &buffer = BufferT(),
+             cudaStream_t stream = 0);
+
+} }// namespace tools::cuda ========================================================================
+
+/// @brief Example class of a fancy pointer that can optionally be used as a template for writing
+/// a custom fancy pointer that allows for particle coordinates to be arranged non-linearly
+/// in memory. For instance, when coordinates are interlaced with other data, i.e. an array
+/// of structs, a custom implementation of fancy_ptr::operator[](size_t i) can account for
+/// strides that skip the other interlaced data.
+/// @tparam T Template type that specifies the type used for the coordinates of the points
+template <typename T>
+class fancy_ptr
+{
+    const T* mPtr;
+public:
+    /// @brief Default constructor.
+    /// @note This method is actually not required by cuda::PointsToGrid
+    /// @param ptr Pointer to array of elements
+    __hostdev__ explicit fancy_ptr(const T* ptr = nullptr) : mPtr(ptr) {}
+    /// @brief Index access into the array pointed to by the stored pointer.
+    /// @note This method is required by cuda::PointsToGrid!
+    /// @param i Unsigned index of the element to be returned
+    /// @return Const reference to the element at the i'th position
+    __hostdev__ inline const T& operator[](size_t i) const {return mPtr[i];}
+    /// @brief Dummy implementation required by pointer_traits.
+    /// @note Note that only the return type matters!
+    /// @details Unlike operator[] it is safe to assume that all pointer types have operator*,
+    /// which is why pointer_traits makes use of it to determine the element_type that
+    /// a pointer class is pointing to. E.g. operator[] is not always defined for std::shared_ptr!
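+    /// @par Example
+    /// Sketch of how pointer_traits (declared further below) deduces the element type via operator*:
+    /// @code
+    /// using elem_t = nanovdb::pointer_traits<nanovdb::fancy_ptr<nanovdb::Vec3f>>::element_type; // nanovdb::Vec3f
+    /// static_assert(sizeof(elem_t) == nanovdb::pointer_traits<nanovdb::fancy_ptr<nanovdb::Vec3f>>::element_size, "");
+    /// @endcode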
+ __hostdev__ inline const T& operator*() const {return *mPtr;} +};// fancy_ptr + +/// @brief Simple stand-alone function that can be used to conveniently construct a fancy_ptr +/// @tparam T Template type that specifies the type use for the coordinates of the points +/// @param ptr Raw pointer to data +/// @return a new instance of a fancy_ptr +template +fancy_ptr make_fancy(const T* ptr = nullptr) {return fancy_ptr(ptr);} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +/// @brief Trait of points, like type of pointer and size of the pointer type +template +struct pointer_traits; + +template +struct pointer_traits { + using element_type = T; + static constexpr size_t element_size = sizeof(T); +}; + +template +struct pointer_traits { + using element_type = typename util::remove_reference())>::type;// assumes T::operator*() exists! + static constexpr size_t element_size = sizeof(element_type); +}; + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +/// @brief computes the relative 8-bit voxel offsets from a world coordinate +/// @tparam Vec3T Type of the world coordinate +/// @param voxel 8-bit output coordinates that are relative to a voxel +/// @param world input world coordinates +/// @param indexToWorld Transform from index to world space +template +__hostdev__ inline static void worldToVoxel(Vec3u8 &voxel, const Vec3T &world, const Map &indexToWorld) +{ + const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index + static constexpr double encode = double((1<<8) - 1); + voxel[0] = uint8_t( encode*(ijk[0] - math::Floor(ijk[0] + 0.5) + 0.5) ); + voxel[1] = uint8_t( encode*(ijk[1] - math::Floor(ijk[1] + 0.5) + 0.5) ); + voxel[2] = uint8_t( encode*(ijk[2] - math::Floor(ijk[2] + 0.5) + 0.5) ); +} + +/// @brief computes the relative 16-bit voxel offsets from a world coordinate +/// @tparam Vec3T Type of the world coordinate +/// @param voxel 16-bit output coordinates that are relative to a voxel +/// @param world input world coordinates +/// @param indexToWorld Transform from index to world space +template +__hostdev__ inline static void worldToVoxel(Vec3u16 &voxel, const Vec3T &world, const Map &indexToWorld) +{ + const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index + static constexpr double encode = double((1<<16) - 1); + voxel[0] = uint16_t( encode*(ijk[0] - math::Floor(ijk[0] + 0.5) + 0.5) ); + voxel[1] = uint16_t( encode*(ijk[1] - math::Floor(ijk[1] + 0.5) + 0.5) ); + voxel[2] = uint16_t( encode*(ijk[2] - math::Floor(ijk[2] + 0.5) + 0.5) ); +} + +/// @brief computes the relative float voxel offsets from a world coordinate +/// @tparam Vec3T Type of the world coordinate +/// @param voxel float output coordinates that are relative to a voxel +/// @param world input world coordinates +/// @param indexToWorld Transform from index to world space +template +__hostdev__ inline static void worldToVoxel(Vec3f &voxel, const Vec3T &world, const Map &indexToWorld) +{ + const Vec3d ijk = indexToWorld.applyInverseMap(world);// world -> index + voxel[0] = float( ijk[0] - math::Floor(ijk[0] + 0.5) ); + voxel[1] = float( ijk[1] - math::Floor(ijk[1] + 0.5) ); + voxel[2] = float( ijk[2] - math::Floor(ijk[2] + 0.5) ); +} + 
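+// Round-trip sketch (names are placeholders): a world-space position is represented by the voxel it
+// rounds to plus a quantized in-voxel offset, and the voxelToWorld() overloads below recover the
+// position up to the quantization error of the chosen offset type:
+//
+//   const nanovdb::Vec3d idx = map.applyInverseMap(worldPos); // world -> index space
+//   const nanovdb::Coord ijk = idx.round();                   // voxel that owns the point
+//   nanovdb::Vec3u16 offset;
+//   worldToVoxel(offset, worldPos, map);                      // encode 16-bit relative offset
+//   nanovdb::Vec3d approx = voxelToWorld<nanovdb::Vec3d>(offset, ijk, map); // approximately worldPos
+//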
+//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3u8 &voxel, const Coord &ijk, const Map &map) +{ + static constexpr double decode = 1.0/double((1<<8) - 1); + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + decode*voxel[0] - 0.5, ijk[1] + decode*voxel[1] - 0.5, ijk[2] + decode*voxel[2] - 0.5)); + } else { + return map.applyMapF(Vec3f(ijk[0] + decode*voxel[0] - 0.5f, ijk[1] + decode*voxel[1] - 0.5f, ijk[2] + decode*voxel[2] - 0.5f)); + } +} + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3u16 &voxel, const Coord &ijk, const Map &map) +{ + static constexpr double decode = 1.0/double((1<<16) - 1); + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + decode*voxel[0] - 0.5, ijk[1] + decode*voxel[1] - 0.5, ijk[2] + decode*voxel[2] - 0.5)); + } else { + return map.applyMapF(Vec3f(ijk[0] + decode*voxel[0] - 0.5f, ijk[1] + decode*voxel[1] - 0.5f, ijk[2] + decode*voxel[2] - 0.5f)); + } +} + +template +__hostdev__ inline static Vec3T voxelToWorld(const Vec3f &voxel, const Coord &ijk, const Map &map) +{ + if constexpr(util::is_same::value) { + return map.applyMap( Vec3d(ijk[0] + voxel[0], ijk[1] + voxel[1], ijk[2] + voxel[2])); + } else { + return map.applyMapF(Vec3f(ijk[0] + voxel[0], ijk[1] + voxel[1], ijk[2] + voxel[2])); + } +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +namespace tools{ namespace cuda { + +template +class PointsToGrid +{ +public: + + struct Data { + Map map; + void *d_bufferPtr; + uint64_t *d_keys, *d_tile_keys, *d_lower_keys, *d_leaf_keys;// device pointer to 64 bit keys + uint64_t grid, tree, root, upper, lower, leaf, meta, blind, size;// byte offsets to nodes in buffer + uint32_t *d_indx;// device pointer to point indices (or IDs) + uint32_t nodeCount[3], *pointsPerLeafPrefix, *pointsPerLeaf;// 0=leaf,1=lower, 2=upper + uint32_t voxelCount, *pointsPerVoxelPrefix, *pointsPerVoxel; + BitFlags<16> flags; + __hostdev__ NanoGrid& getGrid() const {return *util::PtrAdd>(d_bufferPtr, grid);} + __hostdev__ NanoTree& getTree() const {return *util::PtrAdd>(d_bufferPtr, tree);} + __hostdev__ NanoRoot& getRoot() const {return *util::PtrAdd>(d_bufferPtr, root);} + __hostdev__ NanoUpper& getUpper(int i) const {return *(util::PtrAdd>(d_bufferPtr, upper)+i);} + __hostdev__ NanoLower& getLower(int i) const {return *(util::PtrAdd>(d_bufferPtr, lower)+i);} + __hostdev__ NanoLeaf& getLeaf(int i) const {return *(util::PtrAdd>(d_bufferPtr, leaf)+i);} + __hostdev__ GridBlindMetaData& getMeta() const { return *util::PtrAdd(d_bufferPtr, meta);}; + template + __hostdev__ Vec3T& getPoint(int i) const {return *(util::PtrAdd(d_bufferPtr, blind)+i);} + };// Data + + /// @brief Map constructor, which other constructors might call + /// @param map Map to be used for the output device grid + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + PointsToGrid(const Map &map, cudaStream_t stream = 0) + : mStream(stream) + , mPointType(util::is_same::value ? 
PointType::Default : PointType::Disable) + { + mData.map = map; + mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst}); + mDeviceData = mMemPool.template alloc(mStream); + } + + /// @brief Default constructor that calls the Map constructor defined above + /// @param scale Voxel size in world units + /// @param trans Translation of origin in world units + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + PointsToGrid(const double scale = 1.0, const Vec3d &trans = Vec3d(0.0), cudaStream_t stream = 0) + : PointsToGrid(Map(scale, trans), stream){} + + /// @brief Constructor from a target maximum number of particles per voxel. Calls the Map constructor defined above + /// @param maxPointsPerVoxel Maximum number of points oer voxel + /// @param stream optional CUDA stream (defaults to CUDA stream 0) + PointsToGrid(int maxPointsPerVoxel, int tolerance = 1, int maxIterations = 10, cudaStream_t stream = 0) + : PointsToGrid(Map(1.0), stream) + { + mMaxPointsPerVoxel = maxPointsPerVoxel; + mTolerance = tolerance; + mMaxIterations = maxIterations; + } + + /// @brief Destructor + ~PointsToGrid() {mMemPool.free(mDeviceData);} + + /// @brief Toggle on and off verbose mode + /// @param level Verbose level: 0=quiet, 1=timing, 2=benchmarking + void setVerbose(int level = 1) {mVerbose = level; mData.flags.setBit(7u, level); } + + /// @brief Set the mode for checksum computation, which is disabled by default + /// @param mode Mode of checksum computation + void setChecksum(CheckMode mode = CheckMode::Disable){mChecksum = mode;} + + /// @brief Toggle on and off the computation of a bounding-box + /// @param on If true bbox will be computed + void includeBBox(bool on = true) { mData.flags.setMask(GridFlags::HasBBox, on); } + + /// @brief Set the name of the output grid + /// @param name name of the output grid + void setGridName(const std::string &name) {mGridName = name;} + + // only available when BuildT == Point + template typename util::enable_if::value>::type + setPointType(PointType type) { mPointType = type; } + + /// @brief Creates a handle to a grid with the specified build type from a list of points in index or world space + /// @tparam BuildT Build type of the output grid, i.e NanoGrid + /// @tparam PtrT Template type to a raw or fancy-pointer of point coordinates in world or index space. + /// @tparam BufferT Buffer type used for allocation of the grid handle + /// @param points device point to an array of points in world space + /// @param pointCount number of input points or voxels + /// @param buffer optional buffer (currently ignored) + /// @return returns a handle with a grid of type NanoGrid + template + GridHandle getHandle(const PtrT points, + size_t pointCount, + const BufferT &buffer = BufferT()); + + template + void countNodes(const PtrT points, size_t pointCount); + + template + void processGridTreeRoot(const PtrT points, size_t pointCount); + + void processUpperNodes(); + + void processLowerNodes(); + + template + void processLeafNodes(const PtrT points); + + template + void processPoints(const PtrT points, size_t pointCount); + + void processBBox(); + + // the following methods are only defined when BuildT == Point + template typename util::enable_if::value, uint32_t>::type + maxPointsPerVoxel() const {return mMaxPointsPerVoxel;} + template typename util::enable_if::value, uint32_t>::type + maxPointsPerLeaf() const {return mMaxPointsPerLeaf;} + +private: + static constexpr unsigned int mNumThreads = 128;// seems faster than the old value of 256! 
+ static unsigned int numBlocks(unsigned int n) {return (n + mNumThreads - 1) / mNumThreads;} + + cudaStream_t mStream{0}; + util::cuda::Timer mTimer; + PointType mPointType; + std::string mGridName; + int mVerbose{0}; + Data mData, *mDeviceData; + uint32_t mMaxPointsPerVoxel{0u}, mMaxPointsPerLeaf{0u}; + int mTolerance{1}, mMaxIterations{1}; + CheckMode mChecksum{CheckMode::Disable}; + + // wrapper of AllocT, defaulting to cub::CachingDeviceAllocator, which offers a shared scratch space + struct Allocator { + AllocT mAllocator; + void* d_scratch; + size_t scratchSize, actualScratchSize; + Allocator() : d_scratch(nullptr), scratchSize(0), actualScratchSize(0) {} + ~Allocator() { + if (scratchSize > 0) this->free(d_scratch);// a bug in cub makes this necessary + mAllocator.FreeAllCached(); + } + template + T* alloc(size_t count, cudaStream_t stream) { + T* d_ptr = nullptr; + cudaCheck(mAllocator.DeviceAllocate((void**)&d_ptr, sizeof(T)*count, stream)); + return d_ptr; + } + template + T* alloc(cudaStream_t stream) {return this->template alloc(1, stream);} + void free(void *d_ptr) {if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr));} + template + void free(void *d_ptr, T... other) { + if (d_ptr) cudaCheck(mAllocator.DeviceFree(d_ptr)); + this->free(other...); + } + void adjustScratch(cudaStream_t stream){ + if (scratchSize > actualScratchSize) { + if (actualScratchSize>0) cudaCheck(mAllocator.DeviceFree(d_scratch)); + cudaCheck(mAllocator.DeviceAllocate((void**)&d_scratch, scratchSize, stream)); + actualScratchSize = scratchSize; + } + } + } mMemPool; + + template + BufferT getBuffer(const PtrT points, size_t pointCount, const BufferT &buffer); +};// tools::cuda::PointsToGrid + +namespace kernels { +/// @details Used by cuda::PointsToGrid::processLeafNodes before the computation +/// of prefix-sum for index grid. +/// Moving this away from an implementation using the lambdaKernel wrapper +/// to fix the following on Windows platform: +/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' +/// or 'else' block of a constexpr if statement. +/// function in a lambda through lambdaKernel wrapper defined in CudaUtils.h. +template +__global__ void fillValueIndexKernel(const size_t numItems, uint64_t* devValueIndex, typename PointsToGrid::Data* d_data) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + devValueIndex[tid] = static_cast(d_data->getLeaf(tid).mValueMask.countOn()); +} + +/// @details Used by PointsToGrid::processLeafNodes for the computation +/// of prefix-sum for index grid. +/// Moving this away from an implementation using the lambdaKernel wrapper +/// to fix the following on Windows platform: +/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' +/// or 'else' block of a constexpr if statement. 
+template +__global__ void leafPrefixSumKernel(const size_t numItems, uint64_t* devValueIndexPrefix, typename PointsToGrid::Data* d_data) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + + auto &leaf = d_data->getLeaf(tid); + leaf.mOffset = 1u;// will be re-set below + const uint64_t *w = leaf.mValueMask.words(); + uint64_t &prefixSum = leaf.mPrefixSum, sum = util::countOn(*w++); + prefixSum = sum; + for (int n = 9; n < 55; n += 9) {// n=i*9 where i=1,2,..6 + sum += util::countOn(*w++); + prefixSum |= sum << n;// each pre-fixed sum is encoded in 9 bits + } + if (tid==0) { + d_data->getGrid().mData1 = 1u + devValueIndexPrefix[d_data->nodeCount[0]-1];// set total count + d_data->getTree().mVoxelCount = devValueIndexPrefix[d_data->nodeCount[0]-1]; + } else { + leaf.mOffset = 1u + devValueIndexPrefix[tid-1];// background is index 0 + } +} + +/// @details Used by PointsToGrid::processLeafNodes to make sure leaf.mMask - leaf.mValueMask. +/// Moving this away from an implementation using the lambdaKernel wrapper +/// to fix the following on Windows platform: +/// error : For this host platform/dialect, an extended lambda cannot be defined inside the 'if' +/// or 'else' block of a constexpr if statement. +template +__global__ void setMaskEqValMaskKernel(const size_t numItems, typename PointsToGrid::Data* d_data) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + auto &leaf = d_data->getLeaf(tid); + leaf.mMask = leaf.mValueMask; +} +} // namespace kernels + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +// Define utility macro used to call cub functions that use dynamic temporary storage +#ifndef CALL_CUBS +#ifdef _WIN32 +#define CALL_CUBS(func, ...) \ + cudaCheck(cub::func(nullptr, mMemPool.scratchSize, __VA_ARGS__, mStream)); \ + mMemPool.adjustScratch(mStream); \ + cudaCheck(cub::func(mMemPool.d_scratch, mMemPool.scratchSize, __VA_ARGS__, mStream)); +#else// fdef _WIN32 +#define CALL_CUBS(func, args...) 
\ + cudaCheck(cub::func(nullptr, mMemPool.scratchSize, args, mStream)); \ + mMemPool.adjustScratch(mStream); \ + cudaCheck(cub::func(mMemPool.d_scratch, mMemPool.scratchSize, args, mStream)); +#endif// ifdef _WIN32 +#endif// ifndef CALL_CUBS + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline GridHandle +PointsToGrid::getHandle(const PtrT points, + size_t pointCount, + const BufferT &pool) +{ + if (mVerbose==1) mTimer.start("\nCounting nodes"); + this->countNodes(points, pointCount); + + if (mVerbose==1) mTimer.restart("Initiate buffer"); + auto buffer = this->getBuffer(points, pointCount, pool); + + if (mVerbose==1) mTimer.restart("Process grid,tree,root"); + this->processGridTreeRoot(points, pointCount); + + if (mVerbose==1) mTimer.restart("Process upper nodes"); + this->processUpperNodes(); + + if (mVerbose==1) mTimer.restart("Process lower nodes"); + this->processLowerNodes(); + + if (mVerbose==1) mTimer.restart("Process leaf nodes"); + this->processLeafNodes(points); + + if (mVerbose==1) mTimer.restart("Process points"); + this->processPoints(points, pointCount); + + if (mVerbose==1) mTimer.restart("Process bbox"); + this->processBBox(); + if (mVerbose==1) mTimer.stop(); + + return GridHandle(std::move(buffer)); +}// PointsToGrid::getHandle + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +// --- CUB helpers --- +template +struct ShiftRight +{ + __hostdev__ inline OutT operator()(const InT& v) const {return static_cast(v >> BitCount);} +}; + +template +struct ShiftRightIterator : public cub::TransformInputIterator, InT*> +{ + using BASE = cub::TransformInputIterator, InT*>; + __hostdev__ inline ShiftRightIterator(uint64_t* input_itr) : BASE(input_itr, ShiftRight()) {} +}; + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +void PointsToGrid::countNodes(const PtrT points, size_t pointCount) +{ + using Vec3T = typename util::remove_const::element_type>::type; + if constexpr(util::is_same::value) { + static_assert(util::is_same::value, "Point (vs voxels) coordinates should be represented as Vec3f or Vec3d"); + } else { + static_assert(util::is_same::value, "Voxel coordinates should be represented as Coord, Vec3f or Vec3d"); + } + + mMaxPointsPerVoxel = math::Min(mMaxPointsPerVoxel, pointCount); + int iterCounter = 0; + struct Foo {// pairs current voxel size, dx, with the corresponding particle density, i.e. maximum number of points per voxel + double dx; + uint32_t density; + bool operator<(const Foo &rhs) const {return density < rhs.density || (density == rhs.density && dx < rhs.dx);} + } min{0.0, 1}, max{0.0, 0};// min: as dx -> 0 density -> 1 point per voxel, max: density is 0 i.e. 
undefined + +jump:// this marks the beginning of the actual algorithm + + mData.d_keys = mMemPool.template alloc(pointCount, mStream); + mData.d_indx = mMemPool.template alloc(pointCount, mStream);// uint32_t can index 4.29 billion Coords, corresponding to 48 GB + cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy mData from CPU -> GPU + + if (mVerbose==2) mTimer.start("\nAllocating arrays for keys and indices"); + auto *d_keys = mMemPool.template alloc(pointCount, mStream); + auto *d_indx = mMemPool.template alloc(pointCount, mStream); + + if (mVerbose==2) mTimer.restart("Generate tile keys"); + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, const Data *d_data, const PtrT points) { + auto coordToKey = [](const Coord &ijk)->uint64_t{ + // Note: int32_t has a range of -2^31 to 2^31 - 1 whereas uint32_t has a range of 0 to 2^32 - 1 + static constexpr int64_t offset = 1 << 31; + return (uint64_t(uint32_t(int64_t(ijk[2]) + offset) >> 12) ) | // z is the lower 21 bits + (uint64_t(uint32_t(int64_t(ijk[1]) + offset) >> 12) << 21) | // y is the middle 21 bits + (uint64_t(uint32_t(int64_t(ijk[0]) + offset) >> 12) << 42); // x is the upper 21 bits + };// coordToKey lambda functor + d_indx[tid] = uint32_t(tid); + uint64_t &key = d_keys[tid]; + if constexpr(util::is_same::value) {// points are in world space + if constexpr(util::is_same::value) { + key = coordToKey(d_data->map.applyInverseMapF(points[tid]).round()); + } else {// points are Vec3d + key = coordToKey(d_data->map.applyInverseMap(points[tid]).round()); + } + } else if constexpr(util::is_same::value) {// points Coord are in index space + key = coordToKey(points[tid]); + } else {// points are Vec3f or Vec3d in index space + key = coordToKey(points[tid].round()); + } + }, mDeviceData, points); + cudaCheckError(); + if (mVerbose==2) mTimer.restart("DeviceRadixSort of "+std::to_string(pointCount)+" tile keys"); + CALL_CUBS(DeviceRadixSort::SortPairs, d_keys, mData.d_keys, d_indx, mData.d_indx, pointCount, 0, 62);// 21 bits per coord + std::swap(d_indx, mData.d_indx);// sorted indices are now in d_indx + + if (mVerbose==2) mTimer.restart("Allocate runs"); + auto *d_points_per_tile = mMemPool.template alloc(pointCount, mStream); + uint32_t *d_node_count = mMemPool.template alloc(3, mStream); + + if (mVerbose==2) mTimer.restart("DeviceRunLengthEncode tile keys"); + CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, d_points_per_tile, d_node_count+2, pointCount); + cudaCheck(cudaMemcpyAsync(mData.nodeCount+2, d_node_count+2, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mData.d_tile_keys = mMemPool.template alloc(mData.nodeCount[2], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_tile_keys, d_keys, mData.nodeCount[2]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + if (mVerbose) mTimer.restart("DeviceRadixSort of " + std::to_string(pointCount) + " voxel keys in " + std::to_string(mData.nodeCount[2]) + " tiles"); + uint32_t *points_per_tile = new uint32_t[mData.nodeCount[2]]; + cudaCheck(cudaMemcpyAsync(points_per_tile, d_points_per_tile, mData.nodeCount[2]*sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_points_per_tile); + + for (uint32_t id = 0, offset = 0; id < mData.nodeCount[2]; ++id) { + const uint32_t count = points_per_tile[id]; + util::cuda::lambdaKernel<<>>(count, [=] __device__(size_t tid, const Data *d_data) { + auto voxelKey = [] __device__ (uint64_t tileID, const Coord &ijk){ + return tileID << 36 | // upper 
offset: 64-15-12-9=28, i.e. last 28 bits + uint64_t(NanoUpper::CoordToOffset(ijk)) << 21 | // lower offset: 32^3 = 2^15, i.e. next 15 bits + uint64_t(NanoLower::CoordToOffset(ijk)) << 9 | // leaf offset: 16^3 = 2^12, i.e. next 12 bits + uint64_t(NanoLeaf< BuildT>::CoordToOffset(ijk)); // voxel offset: 8^3 = 2^9, i.e. first 9 bits + };// voxelKey lambda functor + tid += offset; + Vec3T p = points[d_indx[tid]]; + if constexpr(util::is_same::value) p = util::is_same::value ? d_data->map.applyInverseMapF(p) : d_data->map.applyInverseMap(p); + d_keys[tid] = voxelKey(id, p.round()); + }, mDeviceData); cudaCheckError(); + CALL_CUBS(DeviceRadixSort::SortPairs, d_keys + offset, mData.d_keys + offset, d_indx + offset, mData.d_indx + offset, count, 0, 36);// 9+12+15=36 + offset += count; + } + mMemPool.free(d_indx); + delete [] points_per_tile; + + if (mVerbose==2) mTimer.restart("Count points per voxel"); + + mData.pointsPerVoxel = mMemPool.template alloc(pointCount, mStream); + uint32_t *d_voxel_count = mMemPool.template alloc(mStream); + CALL_CUBS(DeviceRunLengthEncode::Encode, mData.d_keys, d_keys, mData.pointsPerVoxel, d_voxel_count, pointCount); + cudaCheck(cudaMemcpyAsync(&mData.voxelCount, d_voxel_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_voxel_count); + + if (util::is_same::value) { + if (mVerbose==2) mTimer.restart("Count max points per voxel"); + uint32_t *d_maxPointsPerVoxel = mMemPool.template alloc(mStream), maxPointsPerVoxel; + CALL_CUBS(DeviceReduce::Max, mData.pointsPerVoxel, d_maxPointsPerVoxel, mData.voxelCount); + cudaCheck(cudaMemcpyAsync(&maxPointsPerVoxel, d_maxPointsPerVoxel, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mMemPool.free(d_maxPointsPerVoxel); + double dx = mData.map.getVoxelSize()[0]; + if (++iterCounter >= mMaxIterations || pointCount == 1u || math::Abs((int)maxPointsPerVoxel - (int)mMaxPointsPerVoxel) <= mTolerance) { + mMaxPointsPerVoxel = maxPointsPerVoxel; + } else { + const Foo tmp{dx, maxPointsPerVoxel}; + if (maxPointsPerVoxel < mMaxPointsPerVoxel) { + if (min < tmp) min = tmp; + } else if (max.density == 0 || tmp < max) { + max = tmp; + } + if (max.density) { + dx = (min.dx*(max.density - mMaxPointsPerVoxel) + max.dx*(mMaxPointsPerVoxel-min.density))/double(max.density-min.density); + } else if (maxPointsPerVoxel > 1u) { + dx *= (mMaxPointsPerVoxel-1.0)/(maxPointsPerVoxel-1.0); + } else {// maxPointsPerVoxel = 1 so increase dx significantly + dx *= 10.0; + } + if (mVerbose==2) printf("\ntarget density = %u, current density = %u current dx = %f, next dx = %f\n", mMaxPointsPerVoxel, maxPointsPerVoxel, tmp.dx, dx); + mData.map = Map(dx); + mMemPool.free(mData.d_keys, mData.d_indx, d_keys, mData.d_tile_keys, d_node_count, mData.pointsPerVoxel); + goto jump; + } + } + if (iterCounter>1 && mVerbose) std::cerr << "Used " << iterCounter << " attempts to determine dx that produces a target dpoint denisty\n\n"; + + if (mVerbose==2) mTimer.restart("Compute prefix sum of points per voxel"); + mData.pointsPerVoxelPrefix = mMemPool.template alloc(mData.voxelCount, mStream); + CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.voxelCount); + + mData.pointsPerLeaf = mMemPool.template alloc(pointCount, mStream); + CALL_CUBS(DeviceRunLengthEncode::Encode, ShiftRightIterator<9>(mData.d_keys), d_keys, mData.pointsPerLeaf, d_node_count, pointCount); + cudaCheck(cudaMemcpyAsync(mData.nodeCount, d_node_count, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + + if 
constexpr(util::is_same::value) { + uint32_t *d_maxPointsPerLeaf = mMemPool.template alloc(mStream); + CALL_CUBS(DeviceReduce::Max, mData.pointsPerLeaf, d_maxPointsPerLeaf, mData.nodeCount[0]); + cudaCheck(cudaMemcpyAsync(&mMaxPointsPerLeaf, d_maxPointsPerLeaf, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + //printf("\n Leaf count = %u, max points per leaf = %u\n", mData.nodeCount[0], mMaxPointsPerLeaf); + if (mMaxPointsPerLeaf > std::numeric_limits::max()) { + throw std::runtime_error("Too many points per leaf: "+std::to_string(mMaxPointsPerLeaf)); + } + mMemPool.free(d_maxPointsPerLeaf); + } + + mData.pointsPerLeafPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); + CALL_CUBS(DeviceScan::ExclusiveSum, mData.pointsPerLeaf, mData.pointsPerLeafPrefix, mData.nodeCount[0]); + + mData.d_leaf_keys = mMemPool.template alloc(mData.nodeCount[0], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_leaf_keys, d_keys, mData.nodeCount[0]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + CALL_CUBS(DeviceSelect::Unique, ShiftRightIterator<12>(mData.d_leaf_keys), d_keys, d_node_count+1, mData.nodeCount[0]);// count lower nodes + cudaCheck(cudaMemcpyAsync(mData.nodeCount+1, d_node_count+1, sizeof(uint32_t), cudaMemcpyDeviceToHost, mStream)); + mData.d_lower_keys = mMemPool.template alloc(mData.nodeCount[1], mStream); + cudaCheck(cudaMemcpyAsync(mData.d_lower_keys, d_keys, mData.nodeCount[1]*sizeof(uint64_t), cudaMemcpyDeviceToDevice, mStream)); + + mMemPool.free(d_keys, d_node_count); + if (mVerbose==2) mTimer.stop(); + + //printf("Leaf count = %u, lower count = %u, upper count = %u\n", mData.nodeCount[0], mData.nodeCount[1], mData.nodeCount[2]); +}// PointsToGrid::countNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline BufferT PointsToGrid::getBuffer(const PtrT, size_t pointCount, const BufferT &pool) +{ + auto sizeofPoint = [&]()->size_t{ + switch (mPointType){ + case PointType::PointID: return sizeof(uint32_t); + case PointType::World64: return sizeof(Vec3d); + case PointType::World32: return sizeof(Vec3f); + case PointType::Grid64: return sizeof(Vec3d); + case PointType::Grid32: return sizeof(Vec3f); + case PointType::Voxel32: return sizeof(Vec3f); + case PointType::Voxel16: return sizeof(Vec3u16); + case PointType::Voxel8: return sizeof(Vec3u8); + case PointType::Default: return pointer_traits::element_size; + default: return size_t(0);// PointType::Disable + } + }; + + mData.grid = 0;// grid is always stored at the start of the buffer! 
+ mData.tree = NanoGrid::memUsage(); // grid ends and tree begins + mData.root = mData.tree + NanoTree::memUsage(); // tree ends and root node begins + mData.upper = mData.root + NanoRoot::memUsage(mData.nodeCount[2]); // root node ends and upper internal nodes begin + mData.lower = mData.upper + NanoUpper::memUsage()*mData.nodeCount[2]; // upper internal nodes ends and lower internal nodes begin + mData.leaf = mData.lower + NanoLower::memUsage()*mData.nodeCount[1]; // lower internal nodes ends and leaf nodes begin + mData.meta = mData.leaf + NanoLeaf::DataType::memUsage()*mData.nodeCount[0];// leaf nodes end and blind meta data begins + mData.blind = mData.meta + sizeof(GridBlindMetaData)*int( mPointType!=PointType::Disable ); // meta data ends and blind data begins + mData.size = mData.blind + pointCount*sizeofPoint();// end of buffer + + auto buffer = BufferT::create(mData.size, &pool, false);// only allocate buffer on the device + mData.d_bufferPtr = buffer.deviceData(); + if (mData.d_bufferPtr == nullptr) throw std::runtime_error("Failed to allocate grid buffer on the device"); + cudaCheck(cudaMemcpyAsync(mDeviceData, &mData, sizeof(Data), cudaMemcpyHostToDevice, mStream));// copy Data CPU -> GPU + return buffer; +}// PointsToGrid::getBuffer + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processGridTreeRoot(const PtrT points, size_t pointCount) +{ + using Vec3T = typename util::remove_const::element_type>::type; + util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data, PointType pointType) { + // process Root + auto &root = d_data->getRoot(); + root.mBBox = CoordBBox(); // init to empty + root.mTableSize = d_data->nodeCount[2]; + root.mBackground = NanoRoot::ValueType(0);// background_value + root.mMinimum = root.mMaximum = NanoRoot::ValueType(0); + root.mAverage = root.mStdDevi = NanoRoot::FloatType(0); + + // process Tree + auto &tree = d_data->getTree(); + tree.setRoot(&root); + tree.setFirstNode(&d_data->getUpper(0)); + tree.setFirstNode(&d_data->getLower(0)); + tree.setFirstNode(&d_data->getLeaf(0)); + tree.mNodeCount[2] = tree.mTileCount[2] = d_data->nodeCount[2]; + tree.mNodeCount[1] = tree.mTileCount[1] = d_data->nodeCount[1]; + tree.mNodeCount[0] = tree.mTileCount[0] = d_data->nodeCount[0]; + tree.mVoxelCount = d_data->voxelCount; + + // process Grid + auto &grid = d_data->getGrid(); + grid.init({GridFlags::HasBBox, GridFlags::IsBreadthFirst}, d_data->size, d_data->map, toGridType()); + grid.mChecksum = ~uint64_t(0);// set all bits on which means it's disabled + grid.mBlindMetadataCount = util::is_same::value;// ? 
1u : 0u; + grid.mBlindMetadataOffset = d_data->meta; + if (pointType != PointType::Disable) { + const auto lastLeaf = tree.mNodeCount[0] - 1; + grid.mData1 = d_data->pointsPerLeafPrefix[lastLeaf] + d_data->pointsPerLeaf[lastLeaf]; + auto &meta = d_data->getMeta(); + meta.mDataOffset = sizeof(GridBlindMetaData);// blind data is placed right after this meta data + meta.mValueCount = pointCount; + // Blind meta data + switch (pointType){ + case PointType::PointID: + grid.mGridClass = GridClass::PointIndex; + meta.mSemantic = GridBlindDataSemantic::PointId; + meta.mDataClass = GridBlindDataClass::IndexArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(uint32_t); + util::strcpy(meta.mName, "PointID: uint32_t indices to points"); + break; + case PointType::World64: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3d); + util::strcpy(meta.mName, "World64: Vec3 point coordinates in world space"); + break; + case PointType::World32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "World32: Vec3 point coordinates in world space"); + break; + case PointType::Grid64: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::GridCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3d); + util::strcpy(meta.mName, "Grid64: Vec3 point coordinates in grid space"); + break; + case PointType::Grid32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::GridCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "Grid32: Vec3 point coordinates in grid space"); + break; + case PointType::Voxel32: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3f); + util::strcpy(meta.mName, "Voxel32: Vec3 point coordinates in voxel space"); + break; + case PointType::Voxel16: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3u16); + util::strcpy(meta.mName, "Voxel16: Vec3 point coordinates in voxel space"); + break; + case PointType::Voxel8: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::VoxelCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3u8); + util::strcpy(meta.mName, "Voxel8: Vec3 point coordinates in voxel space"); + break; + case PointType::Default: + grid.mGridClass = GridClass::PointData; + meta.mSemantic = GridBlindDataSemantic::WorldCoords; + meta.mDataClass = GridBlindDataClass::AttributeArray; + meta.mDataType = toGridType(); + meta.mValueSize = sizeof(Vec3T); + if constexpr(util::is_same::value) { + util::strcpy(meta.mName, "World32: Vec3 point coordinates in world space"); + } else if constexpr(util::is_same::value){ + util::strcpy(meta.mName, "World64: Vec3 point 
coordinates in world space"); + } else { + printf("Error in PointsToGrid::processGridTreeRoot: expected Vec3T = Vec3f or Vec3d\n"); + } + break; + default: + printf("Error in PointsToGrid::processGridTreeRoot: invalid pointType\n"); + } + } else if constexpr(BuildTraits::is_offindex) { + grid.mData1 = 1u + 512u*d_data->nodeCount[0]; + grid.mGridClass = GridClass::IndexGrid; + } + }, mDeviceData, mPointType);// lambdaKernel + cudaCheckError(); + + char *dst = mData.getGrid().mGridName; + if (const char *src = mGridName.data()) { + cudaCheck(cudaMemcpyAsync(dst, src, GridData::MaxNameSize, cudaMemcpyHostToDevice, mStream)); + } else { + cudaCheck(cudaMemsetAsync(dst, 0, GridData::MaxNameSize, mStream)); + } +}// PointsToGrid::processGridTreeRoot + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processUpperNodes() +{ + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + auto &root = d_data->getRoot(); + auto &upper = d_data->getUpper(tid); +#if 1 + auto keyToCoord = [](uint64_t key)->nanovdb::Coord{ + static constexpr int64_t offset = 1 << 31;// max values of uint32_t is 2^31 - 1 + static constexpr uint64_t MASK = (1u << 21) - 1; // used to mask out 21 lower bits + return nanovdb::Coord(int(int64_t(((key >> 42) & MASK) << 12) - offset), // x are the upper 21 bits + int(int64_t(((key >> 21) & MASK) << 12) - offset), // y are the middle 21 bits + int(int64_t(( key & MASK) << 12) - offset)); // z are the lower 21 bits + }; + const Coord ijk = keyToCoord(d_data->d_tile_keys[tid]); +#else + const Coord ijk = NanoRoot::KeyToCoord(d_data->d_tile_keys[tid]); +#endif + root.tile(tid)->setChild(ijk, &upper, &root); + upper.mBBox[0] = ijk; + upper.mFlags = 0; + upper.mValueMask.setOff(); + upper.mChildMask.setOff(); + upper.mMinimum = upper.mMaximum = NanoLower::ValueType(0); + upper.mAverage = upper.mStdDevi = NanoLower::FloatType(0); + }, mDeviceData); + cudaCheckError(); + + mMemPool.free(mData.d_tile_keys); + + const uint64_t valueCount = mData.nodeCount[2] << 15; + util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { + auto &upper = d_data->getUpper(tid >> 15); + upper.mTable[tid & 32767u].value = NanoUpper::ValueType(0);// background + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processUpperNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processLowerNodes() +{ + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + auto &root = d_data->getRoot(); + const uint64_t lowerKey = d_data->d_lower_keys[tid]; + auto &upper = d_data->getUpper(lowerKey >> 15); + const uint32_t upperOffset = lowerKey & 32767u;// (1 << 15) - 1 = 32767 + upper.mChildMask.setOnAtomic(upperOffset); + auto &lower = d_data->getLower(tid); + upper.setChild(upperOffset, &lower); + lower.mBBox[0] = upper.offsetToGlobalCoord(upperOffset); + lower.mFlags = 0; + lower.mValueMask.setOff(); + lower.mChildMask.setOff(); + lower.mMinimum = lower.mMaximum = NanoLower::ValueType(0);// background; + lower.mAverage = lower.mStdDevi = NanoLower::FloatType(0); + }, mDeviceData); + cudaCheckError(); + + const uint64_t valueCount = mData.nodeCount[1] << 12; + 
util::cuda::lambdaKernel<<>>(valueCount, [=] __device__(size_t tid, Data *d_data) { + auto &lower = d_data->getLower(tid >> 12); + lower.mTable[tid & 4095u].value = NanoLower::ValueType(0);// background + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processLowerNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processLeafNodes(const PtrT points) +{ + const uint8_t flags = static_cast(mData.flags.data());// mIncludeStats ? 16u : 0u;// 4th bit indicates stats + + if (mVerbose==2) mTimer.start("process leaf meta data"); + // loop over leaf nodes and add it to its parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { + const uint64_t leafKey = d_data->d_leaf_keys[tid], tile_id = leafKey >> 27; + auto &upper = d_data->getUpper(tile_id); + const uint32_t lowerOffset = leafKey & 4095u, upperOffset = (leafKey >> 12) & 32767u; + auto &lower = *upper.getChild(upperOffset); + lower.mChildMask.setOnAtomic(lowerOffset); + auto &leaf = d_data->getLeaf(tid); + lower.setChild(lowerOffset, &leaf); + leaf.mBBoxMin = lower.offsetToGlobalCoord(lowerOffset); + leaf.mFlags = flags; + auto &valueMask = leaf.mValueMask; + valueMask.setOff();// initiate all bits to off + + if constexpr(util::is_same::value) { + leaf.mOffset = d_data->pointsPerLeafPrefix[tid]; + leaf.mPointCount = d_data->pointsPerLeaf[tid]; + } else if constexpr(BuildTraits::is_offindex) { + leaf.mOffset = tid*512u + 1u;// background is index 0 + leaf.mPrefixSum = 0u; + } else if constexpr(!BuildTraits::is_special) { + leaf.mAverage = leaf.mStdDevi = NanoLeaf::FloatType(0); + leaf.mMinimum = leaf.mMaximum = NanoLeaf::ValueType(0); + } + }, mDeviceData); cudaCheckError(); + + if (mVerbose==2) mTimer.restart("set active voxel state and values"); + // loop over all active voxels and set LeafNode::mValueMask and LeafNode::mValues + util::cuda::lambdaKernel<<>>(mData.voxelCount, [=] __device__(size_t tid, Data *d_data) { + const uint32_t pointID = d_data->pointsPerVoxelPrefix[tid]; + const uint64_t voxelKey = d_data->d_keys[pointID]; + auto &upper = d_data->getUpper(voxelKey >> 36); + auto &lower = *upper.getChild((voxelKey >> 21) & 32767u); + auto &leaf = *lower.getChild((voxelKey >> 9) & 4095u); + const uint32_t n = voxelKey & 511u; + leaf.mValueMask.setOnAtomic(n);// <--- slow! + if constexpr(util::is_same::value) { + leaf.mValues[n] = uint16_t(pointID + d_data->pointsPerVoxel[tid] - leaf.offset()); + } else if constexpr(!BuildTraits::is_special) { + leaf.mValues[n] = NanoLeaf::ValueType(1);// set value of active voxels that are not points (or index) + } + }, mDeviceData); cudaCheckError(); + + mMemPool.free(mData.d_keys, mData.pointsPerVoxel, mData.pointsPerVoxelPrefix, mData.pointsPerLeafPrefix, mData.pointsPerLeaf); + + if (mVerbose==2) mTimer.restart("set inactive voxel values"); + const uint64_t denseVoxelCount = mData.nodeCount[0] << 9; + util::cuda::lambdaKernel<<>>(denseVoxelCount, [=] __device__(size_t tid, Data *d_data) { + auto &leaf = d_data->getLeaf(tid >> 9u); + const uint32_t n = tid & 511u; + if (leaf.mValueMask.isOn(n)) return; + if constexpr(util::is_same::value) { + const uint32_t m = leaf.mValueMask.findPrev(n - 1); + leaf.mValues[n] = m < 512u ? 
leaf.mValues[m] : 0u; + } else if constexpr(!BuildTraits::is_special) { + leaf.mValues[n] = NanoLeaf::ValueType(0);// value of inactive voxels + } + }, mDeviceData); cudaCheckError(); + + if constexpr(BuildTraits::is_onindex) { + if (mVerbose==2) mTimer.restart("prefix-sum for index grid"); + uint64_t *devValueIndex = mMemPool.template alloc(mData.nodeCount[0], mStream); + auto devValueIndexPrefix = mMemPool.template alloc(mData.nodeCount[0], mStream); + kernels::fillValueIndexKernel<<>>(mData.nodeCount[0], devValueIndex, mDeviceData); + cudaCheckError(); + CALL_CUBS(DeviceScan::InclusiveSum, devValueIndex, devValueIndexPrefix, mData.nodeCount[0]); + mMemPool.free(devValueIndex); + kernels::leafPrefixSumKernel<<>>(mData.nodeCount[0], devValueIndexPrefix, mDeviceData); + cudaCheckError(); + mMemPool.free(devValueIndexPrefix); + } + + if constexpr(BuildTraits::is_indexmask) { + if (mVerbose==2) mTimer.restart("leaf.mMask = leaf.mValueMask"); + kernels::setMaskEqValMaskKernel<<>>(mData.nodeCount[0], mDeviceData); + cudaCheckError(); + } + if (mVerbose==2) mTimer.stop(); +}// PointsToGrid::processLeafNodes + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +template +inline void PointsToGrid::processPoints(const PtrT, size_t) +{ + mMemPool.free(mData.d_indx); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +// Template specialization with BuildT = Point +template <> +template +inline void PointsToGrid::processPoints(const PtrT points, size_t pointCount) +{ + switch (mPointType){ + case PointType::Disable: + throw std::runtime_error("PointsToGrid::processPoints: mPointType == PointType::Disable\n"); + case PointType::PointID: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->d_indx[tid]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::World64: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::World32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + case PointType::Grid64: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->map.applyInverseMap(points[d_data->d_indx[tid]]); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Grid32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint(tid) = d_data->map.applyInverseMapF(points[d_data->d_indx[tid]]); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Voxel32: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Voxel16: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); 
cudaCheckError(); + break; + case PointType::Voxel8: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + worldToVoxel(d_data->template getPoint(tid), points[d_data->d_indx[tid]], d_data->map); + }, mDeviceData); cudaCheckError(); + break; + case PointType::Default: + util::cuda::lambdaKernel<<>>(pointCount, [=] __device__(size_t tid, Data *d_data) { + d_data->template getPoint::element_type>(tid) = points[d_data->d_indx[tid]]; + }, mDeviceData); cudaCheckError(); + break; + default: + printf("Internal error in PointsToGrid::processPoints\n"); + } + mMemPool.free(mData.d_indx); +}// PointsToGrid::processPoints + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +inline void PointsToGrid::processBBox() +{ + if (mData.flags.isMaskOff(GridFlags::HasBBox)) { + mMemPool.free(mData.d_leaf_keys, mData.d_lower_keys); + return; + } + + // reset bbox in lower nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + d_data->getLower(tid).mBBox = CoordBBox(); + }, mDeviceData); + cudaCheckError(); + + // update and propagate bbox from leaf -> lower/parent nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[0], [=] __device__(size_t tid, Data *d_data) { + const uint64_t leafKey = d_data->d_leaf_keys[tid]; + auto &upper = d_data->getUpper(leafKey >> 27); + auto &lower = *upper.getChild((leafKey >> 12) & 32767u); + auto &leaf = d_data->getLeaf(tid); + leaf.updateBBox(); + lower.mBBox.expandAtomic(leaf.bbox()); + }, mDeviceData); + mMemPool.free(mData.d_leaf_keys); + cudaCheckError(); + + // reset bbox in upper nodes + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + d_data->getUpper(tid).mBBox = CoordBBox(); + }, mDeviceData); + cudaCheckError(); + + // propagate bbox from lower -> upper/parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[1], [=] __device__(size_t tid, Data *d_data) { + const uint64_t lowerKey = d_data->d_lower_keys[tid]; + auto &upper = d_data->getUpper(lowerKey >> 15); + auto &lower = d_data->getLower(tid); + upper.mBBox.expandAtomic(lower.bbox()); + }, mDeviceData); + mMemPool.free(mData.d_lower_keys); + cudaCheckError() + + // propagate bbox from upper -> root/parent node + util::cuda::lambdaKernel<<>>(mData.nodeCount[2], [=] __device__(size_t tid, Data *d_data) { + d_data->getRoot().mBBox.expandAtomic(d_data->getUpper(tid).bbox()); + }, mDeviceData); + cudaCheckError(); + + // update the world-bbox in the root node + util::cuda::lambdaKernel<<<1, 1, 0, mStream>>>(1, [=] __device__(size_t, Data *d_data) { + d_data->getGrid().mWorldBBox = d_data->getRoot().mBBox.transform(d_data->map); + }, mDeviceData); + cudaCheckError(); +}// PointsToGrid::processBBox + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle// Grid +voxelsToGrid(const PtrT d_ijk, size_t voxelCount, double voxelSize, const BufferT &buffer, cudaStream_t stream) +{ + PointsToGrid converter(voxelSize, Vec3d(0.0), stream); + return converter.getHandle(d_ijk, voxelCount, buffer); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle// Grid 
with PointType coordinates as blind data +pointsToGrid(const PtrT d_xyz, int pointCount, int maxPointsPerVoxel, int tolerance, int maxIterations, PointType type, const BufferT &buffer, cudaStream_t stream) +{ + PointsToGrid converter(maxPointsPerVoxel, tolerance, maxIterations, Vec3d(0.0), stream); + converter.setPointType(type); + return converter.getHandle(d_xyz, pointCount, buffer); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle +pointsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) +{ + std::vector> handles; + for (auto &p : vec) handles.push_back(pointsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), std::get<3>(p), buffer, stream)); + return mergeDeviceGrids(handles, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +GridHandle +voxelsToGrid(std::vector> vec, const BufferT &buffer, cudaStream_t stream) +{ + std::vector> handles; + for (auto &p : vec) handles.push_back(voxelsToGrid(std::get<0>(p), std::get<1>(p), std::get<2>(p), buffer, stream)); + return mergeDeviceGrids(handles, stream); +} + +}}// namespace tools::cuda ====================================================================================================================================== + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::pointsToGrid instead")]] +GridHandle +cudaPointsToGrid(const PtrT dWorldPoints, + int pointCount, + double voxelSize = 1.0, + PointType type = PointType::Default, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::pointsToGrid(dWorldPoints, pointCount, voxelSize, type, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::pointsToGrid instead")]] +GridHandle +cudaPointsToGrid(std::vector> pointSet, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::pointsToGrid(pointSet, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::voxelsToGrid instead")]] +GridHandle +cudaVoxelsToGrid(const PtrT dGridVoxels, + size_t voxelCount, + double voxelSize = 1.0, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::voxelsToGrid(dGridVoxels, voxelCount, voxelSize, buffer, stream); +} + +//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +template +[[deprecated("Use cuda::voxelsToGrid instead")]] +GridHandle +cudaVoxelsToGrid(std::vector> pointSet, + const BufferT &buffer = BufferT(), + cudaStream_t stream = 0) +{ + return tools::cuda::voxelsToGrid(pointSet, buffer, stream); +} + +}// namespace nanovdb + +#endif // NVIDIA_TOOLS_CUDA_POINTSTOGRID_CUH_HAS_BEEN_INCLUDED 
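Editor's note: for orientation, the sketch below shows roughly how the voxelsToGrid entry point declared at the top of this header can be called to build a NanoVDB index grid from device-resident voxel coordinates. It is not part of the patch; the include paths, the ValueOnIndex build tag, and the reliance on the default buffer/allocator template arguments are assumptions based on the surrounding code, and the snippet must be compiled with nvcc.

// Illustrative sketch (not part of this patch): building an index grid on the
// GPU from a device array of voxel coordinates via tools::cuda::voxelsToGrid.
#include <nanovdb/NanoVDB.h>
#include <nanovdb/tools/cuda/PointsToGrid.cuh>

void buildIndexGrid(const nanovdb::Coord* d_ijk, size_t voxelCount)
{
    // BuildT = ValueOnIndex requests an index grid: leaves store value indices
    // rather than values (handled by the BuildTraits::is_onindex prefix-sum
    // path in processLeafNodes above).
    auto handle = nanovdb::tools::cuda::voxelsToGrid<nanovdb::ValueOnIndex>(
        d_ijk, voxelCount, /*voxelSize=*/1.0);

    // The handle owns the device allocation; a typed device pointer for use in
    // kernels can be obtained from it.
    auto* d_grid = handle.deviceGrid<nanovdb::ValueOnIndex>();
    (void)d_grid;
}

The returned grid lives only in device memory unless it is explicitly downloaded to the host, so host-side inspection or serialization requires an additional copy step.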
diff --git a/warp/native/nanovdb/util/Util.h b/warp/native/nanovdb/util/Util.h new file mode 100644 index 00000000..e8ebfc1c --- /dev/null +++ b/warp/native/nanovdb/util/Util.h @@ -0,0 +1,657 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/util/Util.h + + \author Ken Museth + + \date January 8, 2020 + + \brief Utility functions +*/ + +#ifndef NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED + +#ifdef __CUDACC_RTC__ + +typedef signed char int8_t; +typedef short int16_t; +typedef int int32_t; +typedef long long int64_t; +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned short uint16_t; +typedef unsigned long long uint64_t; + +#define NANOVDB_ASSERT(x) + +#ifndef UINT64_C +#define UINT64_C(x) (x ## ULL) +#endif + +#else // !__CUDACC_RTC__ + +#include // for abs in clang7 +#include // for types like int32_t etc +#include // for size_t type +#include // for assert +#include // for stderr and snprintf +#include // for sqrt and fma +#include // for numeric_limits +#include // for std::move +#ifdef NANOVDB_USE_IOSTREAMS +#include // for read/writeUncompressedGrids +#endif// ifdef NANOVDB_USE_IOSTREAMS + +// All asserts can be disabled here, even for debug builds +#if 1 +#define NANOVDB_ASSERT(x) assert(x) +#else +#define NANOVDB_ASSERT(x) +#endif + +#if defined(NANOVDB_USE_INTRINSICS) && defined(_MSC_VER) +#include +#pragma intrinsic(_BitScanReverse) +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse64) +#pragma intrinsic(_BitScanForward64) +#endif + +#endif // __CUDACC_RTC__ + +#if defined(__CUDACC__) || defined(__HIP__) +// Only define __hostdev__ qualifier when using NVIDIA CUDA or HIP compilers +#ifndef __hostdev__ +#define __hostdev__ __host__ __device__ // Runs on the CPU and GPU, called from the CPU or the GPU +#endif +#else +// Dummy definitions of macros only defined by CUDA and HIP compilers +#ifndef __hostdev__ +#define __hostdev__ // Runs on the CPU and GPU, called from the CPU or the GPU +#endif +#ifndef __global__ +#define __global__ // Runs on the GPU, called from the CPU or the GPU +#endif +#ifndef __device__ +#define __device__ // Runs on the GPU, called from the GPU +#endif +#ifndef __host__ +#define __host__ // Runs on the CPU, called from the CPU +#endif + +#endif // if defined(__CUDACC__) || defined(__HIP__) + +// The following macro will suppress annoying warnings when nvcc +// compiles functions that call (host) intrinsics (which is perfectly valid) +#if defined(_MSC_VER) && defined(__CUDACC__) +#define NANOVDB_HOSTDEV_DISABLE_WARNING __pragma("hd_warning_disable") +#elif defined(__GNUC__) && defined(__CUDACC__) +#define NANOVDB_HOSTDEV_DISABLE_WARNING _Pragma("hd_warning_disable") +#else +#define NANOVDB_HOSTDEV_DISABLE_WARNING +#endif + +// Define compiler warnings that work with all compilers +//#if defined(_MSC_VER) +//#define NANO_WARNING(msg) _pragma("message" #msg) +//#else +//#define NANO_WARNING(msg) _Pragma("message" #msg) +//#endif + +//============================================== +/// @brief Defines macros that issues warnings for deprecated header files +/// @details Example: +/// @code +/// #include // for NANOVDB_DEPRECATED_HEADER +/// #include +/// NANOVDB_DEPRECATED_HEADER("This header file is deprecated, please use instead") +/// @endcode +#ifdef __GNUC__ +#define NANOVDB_PRAGMA(X) _Pragma(#X) +#define NANOVDB_DEPRECATED_HEADER(MSG) NANOVDB_PRAGMA(GCC warning MSG) +#elif defined(_MSC_VER) +#define 
NANOVDB_STRINGIZE_(MSG) #MSG +#define NANOVDB_STRINGIZE(MSG) NANOVDB_STRINGIZE_(MSG) +#define NANOVDB_DEPRECATED_HEADER(MSG) \ + __pragma(message(__FILE__ "(" NANOVDB_STRINGIZE(__LINE__) ") : Warning: " MSG)) +#endif + +// A portable implementation of offsetof - unfortunately it doesn't work with static_assert +#define NANOVDB_OFFSETOF(CLASS, MEMBER) ((int)(size_t)((char*)&((CLASS*)0)->MEMBER - (char*)0)) + +namespace nanovdb {// ================================================================= + +namespace util {// ==================================================================== + +/// @brief Minimal implementation of std::declval, which converts any type @c T to +//// a reference type, making it possible to use member functions in the operand +/// of the decltype specifier without the need to go through constructors. +/// @tparam T Template type to be converted to T&& +/// @return T&& +/// @warning Unlike std::declval, this version does not work when T = void! However, +/// NVRTC does not like std::declval, so we provide our own implementation. +template +T&& declval() noexcept; + +// --------------------------> string utility functions <------------------------------------ + +/// @brief tests if a c-string @c str is empty, that is its first value is '\0' +/// @param str c-string to be tested for null termination +/// @return true if str[0] = '\0' +__hostdev__ inline bool empty(const char* str) +{ + NANOVDB_ASSERT(str != nullptr); + return *str == '\0'; +}// util::empty + +/// @brief length of a c-sting, excluding '\0'. +/// @param str c-string +/// @return the number of characters that precede the terminating null character. +__hostdev__ inline size_t strlen(const char *str) +{ + NANOVDB_ASSERT(str != nullptr); + const char *s = str; + while(*s) ++s; ; + return (s - str); +}// util::strlen + +/// @brief Copy characters from @c src to @c dst. +/// @param dst pointer to the destination string. +/// @param src pointer to the null-terminated source string. +/// @return destination string @c dst. +/// @note Emulates the behaviour of std::strcpy, except this version also runs on the GPU. +__hostdev__ inline char* strcpy(char *dst, const char *src) +{ + NANOVDB_ASSERT(dst != nullptr && src != nullptr); + for (char *p = dst; (*p++ = *src) != '\0'; ++src); + return dst; +}// util::strcpy(char*, const char*) + +/// @brief Copies the first num characters of @c src to @c dst. +/// If the end of the source C string (which is signaled by a +/// null-character) is found before @c max characters have been +/// copied, @c dst is padded with zeros until a total of @c max +/// characters have been written to it. +/// @param dst destination string +/// @param src source string +/// @param max maximum number of character in destination string +/// @return destination string @c dst +/// @warning if strncpy(dst, src, max)[max-1]!='\0' then @c src has more +/// characters than @c max and the return string needs to be +/// manually null-terminated, i.e. 
strncpy(dst, src, max)[max-1]='\0' +__hostdev__ inline char* strncpy(char *dst, const char *src, size_t max) +{ + NANOVDB_ASSERT(dst != nullptr && src != nullptr); + size_t i = 0; + for (; i < max && src[i] != '\0'; ++i) dst[i] = src[i]; + for (; i < max; ++i) dst[i] = '\0'; + return dst; +}// util::strncpy(char *dst, const char *src, size_t max) + +/// @brief converts a number to a string using a specific base +/// @param dst destination string +/// @param num signed number to be concatenated after @c dst +/// @param bas base used when converting @c num to a string +/// @return destination string @c dst +/// @note Emulates the behaviour of itoa, except this verion also works on the GPU. +__hostdev__ inline char* strcpy(char* dst, int num, int bas = 10) +{ + NANOVDB_ASSERT(dst != nullptr && bas > 0); + int len = 0;// length of number once converted to a string + if (num == 0) dst[len++] = '0'; + for (int abs = num < 0 && bas == 10 ? -num : num; abs; abs /= bas) { + const int rem = abs % bas; + dst[len++] = rem > 9 ? rem - 10 + 'a' : rem + '0'; + } + if (num < 0) dst[len++] = '-';// append '-' if negative + for (char *a = dst, *b = a + len - 1; a < b; ++a, --b) {// reverse dst + dst[len] = *a;// use end of string as temp + *a = *b; + *b = dst[len]; + } + dst[len] = '\0';// explicitly terminate end of string + return dst; +}// util::strcpy(char*, int, int) + +/// @brief Appends a copy of the character string pointed to by @c src to +/// the end of the character string pointed to by @c dst on the device. +/// @param dst pointer to the null-terminated byte string to append to. +/// @param src pointer to the null-terminated byte string to copy from. +/// @return pointer to the character array being appended to. +/// @note Emulates the behaviour of std::strcat, except this version also runs on the GPU. +__hostdev__ inline char* strcat(char *dst, const char *src) +{ + NANOVDB_ASSERT(dst != nullptr && src != nullptr); + char *p = dst; + while (*p != '\0') ++p;// advance till end of dst + strcpy(p, src);// append src + return dst; +}// util::strcat(char*, const char*) + +/// @brief concatenates a number after a string using a specific base +/// @param dst null terminated destination string +/// @param num signed number to be concatenated after @c dst +/// @param bas base used when converting @c num to a string +/// @return destination string @c dst +__hostdev__ inline char* strcat(char* dst, int num, int bas = 10) +{ + NANOVDB_ASSERT(dst != nullptr); + char *p = dst; + while (*p != '\0') ++p; + strcpy(p, num, bas); + return dst; +}// util::strcat(char*, int, int) + +/// @brief Compares two null-terminated byte strings lexicographically. +/// @param lhs pointer to the null-terminated byte strings to compare +/// @param rhs pointer to the null-terminated byte strings to compare +/// @return Negative value if @c lhs appears before @c rhs in lexicographical order. +/// Zero if @c lhs and @c rhs compare equal. Positive value if @c lhs appears +/// after @c rhs in lexicographical order. +/// @note Emulates the behaviour of std::strcmp, except this version also runs on the GPU. 
+__hostdev__ inline int strcmp(const char *lhs, const char *rhs) +{ + while(*lhs != '\0' && (*lhs == *rhs)){ + lhs++; + rhs++; + } + return *(const unsigned char*)lhs - *(const unsigned char*)rhs;// zero if lhs == rhs +}// util::strcmp(const char*, const char*) + +/// @brief Test if two null-terminated byte strings are the same +/// @param lhs pointer to the null-terminated byte strings to compare +/// @param rhs pointer to the null-terminated byte strings to compare +/// @return true if the two c-strings are identical +__hostdev__ inline bool streq(const char *lhs, const char *rhs) +{ + return strcmp(lhs, rhs) == 0; +}// util::streq + +namespace impl {// ======================================================= +// Base-case implementation of Variadic Template function impl::sprint +__hostdev__ inline char* sprint(char *dst){return dst;} +// Variadic Template function impl::sprint +template +__hostdev__ inline char* sprint(char *dst, T var1, Types... var2) +{ + return impl::sprint(strcat(dst, var1), var2...); +} +}// namespace impl ========================================================= + +/// @brief prints a variable number of string and/or numbers to a destination string +template +__hostdev__ inline char* sprint(char *dst, T var1, Types... var2) +{ + return impl::sprint(strcpy(dst, var1), var2...); +}// util::sprint + +// --------------------------> memzero <------------------------------------ + +/// @brief Zero initialization of memory +/// @param dst pointer to destination +/// @param byteCount number of bytes to be initialized to zero +/// @return destination pointer @c dst +__hostdev__ inline static void* memzero(void *dst, size_t byteCount) +{ + NANOVDB_ASSERT(dst); + const size_t wordCount = byteCount >> 3; + if (wordCount << 3 == byteCount) { + for (auto *d = (uint64_t*)dst, *e = d + wordCount; d != e; ++d) *d = 0ULL; + } else { + for (auto *d = (char*)dst, *e = d + byteCount; d != e; ++d) *d = '\0'; + } + return dst; +}// util::memzero + +// --------------------------> util::is_same <------------------------------------ + +/// @brief C++11 implementation of std::is_same +/// @note When more than two arguments are provided value = T0==T1 || T0==T2 || ... 
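+/// @par Example
+/// Minimal sketch of the intended semantics (the trailing types are folded with logical OR):
+/// @code
+///     static_assert(util::is_same<float, float>::value, "");
+///     static_assert(util::is_same<int, float, double, int>::value, "");// int matches the trailing int
+///     static_assert(!util::is_same<int, float>::value, "");
+/// @endcode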
+template +struct is_same +{ + static constexpr bool value = is_same::value || is_same::value; +}; + +template +struct is_same {static constexpr bool value = false;}; + +template +struct is_same {static constexpr bool value = true;}; + +// --------------------------> util::is_floating_point <------------------------------------ + +/// @brief C++11 implementation of std::is_floating_point +template +struct is_floating_point {static constexpr bool value = is_same::value;}; + +// --------------------------> util::enable_if <------------------------------------ + +/// @brief C++11 implementation of std::enable_if +template +struct enable_if {}; + +template +struct enable_if {using type = T;}; + +// --------------------------> util::disable_if <------------------------------------ + +template +struct disable_if {using type = T;}; + +template +struct disable_if {}; + +// --------------------------> util::is_const <------------------------------------ + +template +struct is_const {static constexpr bool value = false;}; + +template +struct is_const {static constexpr bool value = true;}; + +// --------------------------> util::is_pointer <------------------------------------ + +/// @brief Trait used to identify template parameter that are pointers +/// @tparam T Template parameter to be tested +template +struct is_pointer {static constexpr bool value = false;}; + +/// @brief Template specialization of pointers +/// @tparam T Template parameter to be tested +/// @note T can be both a non-const and const type +template +struct is_pointer {static constexpr bool value = true;}; + +// --------------------------> util::conditional <------------------------------------ + +/// @brief C++11 implementation of std::conditional +template +struct conditional { using type = TrueT; }; + +/// @brief Template specialization of conditional +/// @tparam FalseT Type used when boolean is false +/// @tparam TrueT Type used when boolean is true +template +struct conditional { using type = FalseT; }; + +// --------------------------> util::remove_const <------------------------------------ + +/// @brief Trait use to const from type. Default implementation is just a pass-through +/// @tparam T Type +/// @details remove_pointer::type = float +template +struct remove_const {using type = T;}; + +/// @brief Template specialization of trait class use to remove const qualifier type from a type +/// @tparam T Type of the const type +/// @details remove_pointer::type = float +template +struct remove_const {using type = T;}; + +// --------------------------> util::remove_reference <------------------------------------ + +/// @brief Trait use to remove reference, i.e. "&", qualifier from a type. Default implementation is just a pass-through +/// @tparam T Type +/// @details remove_pointer::type = float +template +struct remove_reference {using type = T;}; + +/// @brief Template specialization of trait class use to remove reference, i.e. "&", qualifier from a type +/// @tparam T Type of the reference +/// @details remove_pointer::type = float +template +struct remove_reference {using type = T;}; + +// --------------------------> util::remove_pointer <------------------------------------ + +/// @brief Trait use to remove pointer, i.e. "*", qualifier from a type. Default implementation is just a pass-through +/// @tparam T Type +/// @details remove_pointer::type = float +template +struct remove_pointer {using type = T;}; + +/// @brief Template specialization of trait class use to to remove pointer, i.e. 
"*", qualifier from a type +/// @tparam T Type of the pointer +/// @details remove_pointer::type = float +template +struct remove_pointer {using type = T;}; + +// --------------------------> util::match_const <------------------------------------ + +/// @brief Trait used to transfer the const-ness of a reference type to another type +/// @tparam T Type whose const-ness needs to match the reference type +/// @tparam ReferenceT Reference type that is not const +/// @details match_const::type = int +/// match_const::type = int +template +struct match_const {using type = typename remove_const::type;}; + +/// @brief Template specialization used to transfer the const-ness of a reference type to another type +/// @tparam T Type that will adopt the const-ness of the reference type +/// @tparam ReferenceT Reference type that is const +/// @details match_const::type = const int +/// match_const::type = const int +template +struct match_const {using type = const typename remove_const::type;}; + +// --------------------------> util::is_specialization <------------------------------------ + +/// @brief Metafunction used to determine if the first template +/// parameter is a specialization of the class template +/// given in the second template parameter. +/// +/// @details is_specialization, Vec3>::value == true; +/// is_specialization::value == true; +/// is_specialization, std::vector>::value == true; +template class TemplateType> +struct is_specialization {static const bool value = false;}; +template class TemplateType> +struct is_specialization, TemplateType> +{ + static const bool value = true; +};// util::is_specialization + +// --------------------------> util::PtrDiff <------------------------------------ + +/// @brief Compute the distance, in bytes, between two pointers, dist = p - q +/// @param p fist pointer, assumed to NOT be NULL +/// @param q second pointer, assumed to NOT be NULL +/// @return signed distance between pointer, p - q, addresses in units of bytes +__hostdev__ inline static int64_t PtrDiff(const void* p, const void* q) +{ + NANOVDB_ASSERT(p && q); + return reinterpret_cast(p) - reinterpret_cast(q); +}// util::PtrDiff + +// --------------------------> util::PtrAdd <------------------------------------ + +/// @brief Adds a byte offset to a non-const pointer to produce another non-const pointer +/// @tparam DstT Type of the return pointer (defaults to void) +/// @param p non-const input pointer, assumed to NOT be NULL +/// @param offset signed byte offset +/// @return a non-const pointer defined as the offset of an input pointer +template +__hostdev__ inline static DstT* PtrAdd(void* p, int64_t offset) +{ + NANOVDB_ASSERT(p); + return reinterpret_cast(reinterpret_cast(p) + offset); +}// util::PtrAdd + +/// @brief Adds a byte offset to a const pointer to produce another const pointer +/// @tparam DstT Type of the return pointer (defaults to void) +/// @param p const input pointer, assumed to NOT be NULL +/// @param offset signed byte offset +/// @return a const pointer defined as the offset of a const input pointer +template +__hostdev__ inline static const DstT* PtrAdd(const void* p, int64_t offset) +{ + NANOVDB_ASSERT(p); + return reinterpret_cast(reinterpret_cast(p) + offset); +}// util::PtrAdd + +// -------------------> findLowestOn <---------------------------- + +/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 32 bit word +/// +/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! 
+NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t findLowestOn(uint32_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return __ffs(v) - 1; // one based indexing +#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanForward(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return static_cast(__builtin_ctzl(v)); +#else + //NANO_WARNING("Using software implementation for findLowestOn(uint32_t v)") + static const unsigned char DeBruijn[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; +// disable unary minus on unsigned warning +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + return DeBruijn[uint32_t((v & -v) * 0x077CB531U) >> 27]; +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(pop) +#endif + +#endif +}// util::findLowestOn(uint32_t) + +/// @brief Returns the index of the lowest, i.e. least significant, on bit in the specified 64 bit word +/// +/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t findLowestOn(uint64_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return __ffsll(static_cast(v)) - 1; // one based indexing +#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanForward64(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return static_cast(__builtin_ctzll(v)); +#else + //NANO_WARNING("Using software implementation for util::findLowestOn(uint64_t)") + static const unsigned char DeBruijn[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12, + }; +// disable unary minus on unsigned warning +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + return DeBruijn[uint64_t((v & -v) * UINT64_C(0x022FDD63CC95386D)) >> 58]; +#if defined(_MSC_VER) && !defined(__NVCC__) +#pragma warning(pop) +#endif + +#endif +}// util::findLowestOn(uint64_t) + +// -------------------> findHighestOn <---------------------------- + +/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 32 bit word +/// +/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t findHighestOn(uint32_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return sizeof(uint32_t) * 8 - 1 - __clz(v); // Return the number of consecutive high-order zero bits in a 32-bit integer. 
+#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanReverse(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return sizeof(unsigned long) * 8 - 1 - __builtin_clzl(v); +#else + //NANO_WARNING("Using software implementation for util::findHighestOn(uint32_t)") + static const unsigned char DeBruijn[32] = { + 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; + v |= v >> 1; // first round down to one less than a power of 2 + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return DeBruijn[uint32_t(v * 0x07C4ACDDU) >> 27]; +#endif +}// util::findHighestOn + +/// @brief Returns the index of the highest, i.e. most significant, on bit in the specified 64 bit word +/// +/// @warning Assumes that at least one bit is set in the word, i.e. @a v != uint32_t(0)! +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t findHighestOn(uint64_t v) +{ + NANOVDB_ASSERT(v); +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + return sizeof(unsigned long) * 8 - 1 - __clzll(static_cast(v)); +#elif defined(_MSC_VER) && defined(NANOVDB_USE_INTRINSICS) + unsigned long index; + _BitScanReverse64(&index, v); + return static_cast(index); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + return sizeof(unsigned long) * 8 - 1 - __builtin_clzll(v); +#else + const uint32_t* p = reinterpret_cast(&v); + return p[1] ? 32u + findHighestOn(p[1]) : findHighestOn(p[0]); +#endif +}// util::findHighestOn + +// ----------------------------> util::countOn <-------------------------------------- + +/// @return Number of bits that are on in the specified 64-bit word +NANOVDB_HOSTDEV_DISABLE_WARNING +__hostdev__ inline uint32_t countOn(uint64_t v) +{ +#if (defined(__CUDA_ARCH__) || defined(__HIP__)) && defined(NANOVDB_USE_INTRINSICS) + //#warning Using popcll for util::countOn + return __popcll(v); +// __popcnt64 intrinsic support was added in VS 2019 16.8 +#elif defined(_MSC_VER) && defined(_M_X64) && (_MSC_VER >= 1928) && defined(NANOVDB_USE_INTRINSICS) + //#warning Using popcnt64 for util::countOn + return uint32_t(__popcnt64(v)); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(NANOVDB_USE_INTRINSICS) + //#warning Using builtin_popcountll for util::countOn + return __builtin_popcountll(v); +#else // use software implementation + //NANO_WARNING("Using software implementation for util::countOn") + v = v - ((v >> 1) & uint64_t(0x5555555555555555)); + v = (v & uint64_t(0x3333333333333333)) + ((v >> 2) & uint64_t(0x3333333333333333)); + return (((v + (v >> 4)) & uint64_t(0xF0F0F0F0F0F0F0F)) * uint64_t(0x101010101010101)) >> 56; +#endif +}// util::countOn(uint64_t) + +}// namespace util ================================================================== + +[[deprecated("Use nanovdb::util::findLowestOn instead")]] +__hostdev__ inline uint32_t FindLowestOn(uint32_t v){return util::findLowestOn(v);} +[[deprecated("Use nanovdb::util::findLowestOn instead")]] +__hostdev__ inline uint32_t FindLowestOn(uint64_t v){return util::findLowestOn(v);} +[[deprecated("Use nanovdb::util::findHighestOn instead")]] +__hostdev__ inline uint32_t FindHighestOn(uint32_t v){return util::findHighestOn(v);} +[[deprecated("Use nanovdb::util::findHighestOn instead")]] +__hostdev__ inline uint32_t FindHighestOn(uint64_t v){return util::findHighestOn(v);} +[[deprecated("Use 
nanovdb::util::countOn instead")]] +__hostdev__ inline uint32_t CountOn(uint64_t v){return util::countOn(v);} + +} // namespace nanovdb =================================================================== + +#endif // end of NANOVDB_UTIL_UTIL_H_HAS_BEEN_INCLUDED diff --git a/warp/native/nanovdb/util/cuda/Timer.h b/warp/native/nanovdb/util/cuda/Timer.h new file mode 100644 index 00000000..bd73f8e3 --- /dev/null +++ b/warp/native/nanovdb/util/cuda/Timer.h @@ -0,0 +1,116 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/// @file nanovdb/util/cuda/Timer.h +/// +/// @author Ken Museth +/// +/// @brief A simple GPU timing class + +#ifndef NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED + +#include // for std::cerr +#include +#include + +namespace nanovdb { + +namespace util{ namespace cuda { + +class Timer +{ + cudaStream_t mStream{0}; + cudaEvent_t mStart, mStop; + +public: + /// @brief Default constructor + /// @param stream CUDA stream to be timed (defaults to stream 0) + /// @note Starts the timer + Timer(cudaStream_t stream = 0) : mStream(stream) + { + cudaEventCreate(&mStart); + cudaEventCreate(&mStop); + cudaEventRecord(mStart, mStream); + } + + /// @brief Construct and start the timer + /// @param msg string message to be printed when timer is started + /// @param stream CUDA stream to be timed (defaults to stream 0) + /// @param os output stream for the message above + Timer(const std::string &msg, cudaStream_t stream = 0, std::ostream& os = std::cerr) + : mStream(stream) + { + os << msg << " ... " << std::flush; + cudaEventCreate(&mStart); + cudaEventCreate(&mStop); + cudaEventRecord(mStart, mStream); + } + + /// @brief Destructor + ~Timer() + { + cudaEventDestroy(mStart); + cudaEventDestroy(mStop); + } + + /// @brief Start the timer + /// @param stream CUDA stream to be timed (defaults to stream 0) + /// @param os output stream for the message above + void start() {cudaEventRecord(mStart, mStream);} + + /// @brief Start the timer + /// @param msg string message to be printed when timer is started + + /// @param os output stream for the message above + void start(const std::string &msg, std::ostream& os = std::cerr) + { + os << msg << " ... " << std::flush; + this->start(); + } + + /// @brief Start the timer + /// @param msg string message to be printed when timer is started + /// @param os output stream for the message above + void start(const char* msg, std::ostream& os = std::cerr) + { + os << msg << " ... 
" << std::flush; + this->start(); + } + + /// @brief elapsed time (since start) in miliseconds + /// @return elapsed time (since start) in miliseconds + float elapsed() + { + cudaEventRecord(mStop, mStream); + cudaEventSynchronize(mStop); + float diff = 0.0f; + cudaEventElapsedTime(&diff, mStart, mStop); + return diff; + } + + /// @brief stop the timer + /// @param os output stream for the message above + void stop(std::ostream& os = std::cerr) + { + float diff = this->elapsed(); + os << "completed in " << diff << " milliseconds" << std::endl; + } + + /// @brief stop and start the timer + /// @param msg string message to be printed when timer is started + /// @warning Remember to call start before restart + void restart(const std::string &msg, std::ostream& os = std::cerr) + { + this->stop(); + this->start(msg, os); + } +};// Timer + +}}// namespace util::cuda + +using GpuTimer [[deprecated("Use nanovdb::util::cuda::Timer instead")]]= util::cuda::Timer; + +} // namespace nanovdb + +#endif // NANOVDB_UTIL_CUDA_TIMER_H_HAS_BEEN_INCLUDED diff --git a/warp/native/nanovdb/util/cuda/Util.h b/warp/native/nanovdb/util/cuda/Util.h new file mode 100644 index 00000000..b4391bab --- /dev/null +++ b/warp/native/nanovdb/util/cuda/Util.h @@ -0,0 +1,193 @@ +// Copyright Contributors to the OpenVDB Project +// SPDX-License-Identifier: MPL-2.0 + +/*! + \file nanovdb/util/cuda/Util.h + + \author Ken Museth + + \date December 20, 2023 + + \brief Cuda specific utility functions +*/ + +#ifndef NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED +#define NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED + +#include +#include +#include // for stderr and NANOVDB_ASSERT + +// change 1 -> 0 to only perform asserts during debug builds +#if 1 || defined(DEBUG) || defined(_DEBUG) + static inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) + { + if (code != cudaSuccess) { + fprintf(stderr, "CUDA error %u: %s (%s:%d)\n", unsigned(code), cudaGetErrorString(code), file, line); + //fprintf(stderr, "CUDA Runtime Error: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } + } + static inline void ptrAssert(const void* ptr, const char* msg, const char* file, int line, bool abort = true) + { + if (ptr == nullptr) { + fprintf(stderr, "NULL pointer error: %s %s %d\n", msg, file, line); + if (abort) exit(1); + } else if (uint64_t(ptr) % 32) { + fprintf(stderr, "Pointer misalignment error: %s %s %d\n", msg, file, line); + if (abort) exit(1); + } + } +#else + static inline void gpuAssert(cudaError_t, const char*, int, bool = true){} + static inline void ptrAssert(void*, const char*, const char*, int, bool = true){} +#endif + +// Convenience function for checking CUDA runtime API results +// can be wrapped around any runtime API call. No-op in release builds. +#define cudaCheck(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } + +#define checkPtr(ptr, msg) \ + { \ + ptrAssert((ptr), (msg), __FILE__, __LINE__); \ + } + +#define cudaSync() \ + { \ + cudaCheck(cudaDeviceSynchronize()); \ + } + +#define cudaCheckError() \ + { \ + cudaCheck(cudaGetLastError()); \ + } + +namespace nanovdb {// ========================================================= + +namespace util{ namespace cuda {// ====================================================== + +//#define NANOVDB_USE_SYNC_CUDA_MALLOC +// cudaMallocAsync and cudaFreeAsync were introduced in CUDA 11.2 so we introduce +// custom implementations that map to cudaMalloc and cudaFree below. 
If NANOVDB_USE_SYNC_CUDA_MALLOC +// is defined these implementations will also be defined, which is useful in virtualized environments +// that slice up the GPU and share it between instances as vGPU's. GPU unified memory is usually disabled +// out of security considerations. Asynchronous CUDA malloc/free depends on GPU unified memory, so it +// is not possible to use cudaMallocAsync and cudaFreeAsync in such environments. + +#if (CUDART_VERSION < 11020) || defined(NANOVDB_USE_SYNC_CUDA_MALLOC) // 11.2 introduced cudaMallocAsync and cudaFreeAsync + +/// @brief Simple wrapper that calls cudaMalloc +/// @param d_ptr Device pointer to allocated device memory +/// @param size Number of bytes to allocate +/// @param dummy The stream establishing the stream ordering contract and the memory pool to allocate from (ignored) +/// @return Cuda error code +inline cudaError_t mallocAsync(void** d_ptr, size_t size, cudaStream_t){return cudaMalloc(d_ptr, size);} + +/// @brief Simple wrapper that calls cudaFree +/// @param d_ptr Device pointer that will be freed +/// @param dummy The stream establishing the stream ordering promise (ignored) +/// @return Cuda error code +inline cudaError_t freeAsync(void* d_ptr, cudaStream_t){return cudaFree(d_ptr);} + +#else + +/// @brief Simple wrapper that calls cudaMallocAsync +/// @param d_ptr Device pointer to allocated device memory +/// @param size Number of bytes to allocate +/// @param stream The stream establishing the stream ordering contract and the memory pool to allocate from +/// @return Cuda error code +inline cudaError_t mallocAsync(void** d_ptr, size_t size, cudaStream_t stream){return cudaMallocAsync(d_ptr, size, stream);} + +/// @brief Simple wrapper that calls cudaFreeAsync +/// @param d_ptr Device pointer that will be freed +/// @param stream The stream establishing the stream ordering promise +/// @return Cuda error code +inline cudaError_t freeAsync(void* d_ptr, cudaStream_t stream){return cudaFreeAsync(d_ptr, stream);} + +#endif + +/// @brief Simple (naive) implementation of a unique device pointer +/// using stream ordered memory allocation and deallocation. 
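+/// A short usage sketch; the kernel name and the sizes below are hypothetical:
+/// @code
+///     cudaStream_t stream = 0;
+///     util::cuda::unique_ptr<float> d_data(1024, stream);// allocates 1024 floats via mallocAsync
+///     myKernel<<<util::cuda::blocksPerGrid(1024, 256), 256, 0, stream>>>(d_data.get());
+///     // the buffer is released with freeAsync on the same stream when d_data goes out of scope
+/// @endcode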
+/// @tparam T Type of the device pointer +template +class unique_ptr +{ + T *mPtr;// pointer to stream ordered memory allocation + cudaStream_t mStream; +public: + unique_ptr(size_t count = 0, cudaStream_t stream = 0) : mPtr(nullptr), mStream(stream) + { + if (count>0) cudaCheck(mallocAsync((void**)&mPtr, count*sizeof(T), stream)); + } + unique_ptr(const unique_ptr&) = delete; + unique_ptr(unique_ptr&& other) : mPtr(other.mPtr), mStream(other.mStream) + { + other.mPtr = nullptr; + } + ~unique_ptr() + { + if (mPtr) cudaCheck(freeAsync(mPtr, mStream)); + } + unique_ptr& operator=(const unique_ptr&) = delete; + unique_ptr& operator=(unique_ptr&& rhs) noexcept + { + mPtr = rhs.mPtr; + mStream = rhs.mStream; + rhs.mPtr = nullptr; + return *this; + } + void reset() { + if (mPtr) { + cudaCheck(freeAsync(mPtr, mStream)); + mPtr = nullptr; + } + } + T* get() const {return mPtr;} + explicit operator bool() const {return mPtr != nullptr;} +};// util::cuda::unique_ptr + +/// @brief Computes the number of blocks per grid given the problem size and number of threads per block +/// @param numItems Problem size +/// @param threadsPerBlock Number of threads per block (second CUDA launch parameter) +/// @return number of blocks per grid (first CUDA launch parameter) +/// @note CUDA launch parameters: kernel<<< blocksPerGrid, threadsPerBlock, sharedMemSize, streamID>>> +inline size_t blocksPerGrid(size_t numItems, size_t threadsPerBlock) +{ + NANOVDB_ASSERT(numItems > 0 && threadsPerBlock >= 32 && threadsPerBlock % 32 == 0); + return (numItems + threadsPerBlock - 1) / threadsPerBlock; +} + + +#if defined(__CUDACC__)// the following functions only run on the GPU! + +/// @brief Cuda kernel that launches device lambda functions +/// @param numItems Problem size +template +__global__ void lambdaKernel(const size_t numItems, Func func, Args... args) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + func(tid, args...); +}// util::cuda::lambdaKernel + +#endif// __CUDACC__ + +}}// namespace util::cuda ============================================================ + +}// namespace nanovdb =============================================================== + +#if defined(__CUDACC__)// the following functions only run on the GPU! +template +[[deprecated("Use nanovdb::cuda::lambdaKernel instead")]] +__global__ void cudaLambdaKernel(const size_t numItems, Func func, Args... args) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= numItems) return; + func(tid, args...); +} +#endif// __CUDACC__ + +#endif// NANOVDB_UTIL_CUDA_UTIL_H_HAS_BEEN_INCLUDED \ No newline at end of file diff --git a/warp/native/volume.cpp b/warp/native/volume.cpp index 68c3c554..36267c5e 100644 --- a/warp/native/volume.cpp +++ b/warp/native/volume.cpp @@ -6,10 +6,10 @@ * license agreement from NVIDIA CORPORATION is strictly prohibited. 
*/ -#include "volume.h" +#include "cuda_util.h" #include "volume_builder.h" +#include "volume_impl.h" #include "warp.h" -#include "cuda_util.h" #include @@ -21,40 +21,50 @@ namespace struct VolumeDesc { // NanoVDB buffer either in device or host memory - void* buffer; + void *buffer; uint64_t size_in_bytes; + bool owner; // whether the buffer should be deallocated when the volume is destroyed - // offset to the voxel values of the first leaf node relative to buffer - uint64_t first_voxel_data_offs; - - // copy of the grids's metadata to keep on the host for device volumes pnanovdb_grid_t grid_data; - - // copy of the tree's metadata to keep on the host for device volumes pnanovdb_tree_t tree_data; + // Host-accessible version of the blind metadata (copy if GPU, alias if CPU) + pnanovdb_gridblindmetadata_t *blind_metadata; + // CUDA context for this volume (NULL if CPU) - void* context; + void *context; + + pnanovdb_buf_t as_pnano() const + { + return pnanovdb_make_buf(static_cast(buffer), size_in_bytes); + } }; // Host-side volume descriptors. Maps each CPU/GPU volume buffer address (id) to a CPU desc std::map g_volume_descriptors; -bool volume_get_descriptor(uint64_t id, VolumeDesc& volumeDesc) +bool volume_get_descriptor(uint64_t id, const VolumeDesc *&volumeDesc) { - if (id == 0) return false; + if (id == 0) + return false; - const auto& iter = g_volume_descriptors.find(id); + const auto &iter = g_volume_descriptors.find(id); if (iter == g_volume_descriptors.end()) return false; else - volumeDesc = iter->second; + volumeDesc = &iter->second; return true; } -void volume_add_descriptor(uint64_t id, const VolumeDesc& volumeDesc) +bool volume_exists(const void *id) { - g_volume_descriptors[id] = volumeDesc; + const VolumeDesc *volume; + return volume_get_descriptor((uint64_t)id, volume); +} + +void volume_add_descriptor(uint64_t id, VolumeDesc &&volumeDesc) +{ + g_volume_descriptors[id] = std::move(volumeDesc); } void volume_rem_descriptor(uint64_t id) @@ -64,234 +74,456 @@ void volume_rem_descriptor(uint64_t id) } // anonymous namespace - // NB: buf must be a host pointer -uint64_t volume_create_host(void* buf, uint64_t size) +uint64_t volume_create_host(void *buf, uint64_t size, bool copy, bool owner) { - if (size < sizeof(pnanovdb_grid_t) + sizeof(pnanovdb_tree_t)) - return 0; // This cannot be a valid NanoVDB grid with data + if (size > 0 && size < sizeof(pnanovdb_grid_t) + sizeof(pnanovdb_tree_t)) + return 0; // This cannot be a valid NanoVDB grid with data - VolumeDesc volume; + if (!copy && volume_exists(buf)) + { + // descriptor already created for this volume + return 0; + } + VolumeDesc volume; volume.context = NULL; memcpy_h2h(&volume.grid_data, buf, sizeof(pnanovdb_grid_t)); - memcpy_h2h(&volume.tree_data, (pnanovdb_grid_t*)buf + 1, sizeof(pnanovdb_tree_t)); + memcpy_h2h(&volume.tree_data, (pnanovdb_grid_t *)buf + 1, sizeof(pnanovdb_tree_t)); - if (volume.grid_data.magic != PNANOVDB_MAGIC_NUMBER) + if (volume.grid_data.magic != PNANOVDB_MAGIC_NUMBER && volume.grid_data.magic != PNANOVDB_MAGIC_GRID) return 0; + if (size == 0) + { + size = volume.grid_data.grid_size; + } + + // Copy or alias buffer volume.size_in_bytes = size; - volume.buffer = alloc_host(size); - memcpy_h2h(volume.buffer, buf, size); + if (copy) + { + volume.buffer = alloc_host(size); + memcpy_h2h(volume.buffer, buf, size); + volume.owner = true; + } + else + { + volume.buffer = buf; + volume.owner = owner; + } - volume.first_voxel_data_offs = - sizeof(pnanovdb_grid_t) + volume.tree_data.node_offset_leaf + 
PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_FLOAT, leaf_off_table); + // Alias blind metadata + volume.blind_metadata = reinterpret_cast(static_cast(volume.buffer) + + volume.grid_data.blind_metadata_offset); uint64_t id = (uint64_t)volume.buffer; - volume_add_descriptor(id, volume); + volume_add_descriptor(id, std::move(volume)); return id; } // NB: buf must be a pointer on the same device -uint64_t volume_create_device(void* context, void* buf, uint64_t size) +uint64_t volume_create_device(void *context, void *buf, uint64_t size, bool copy, bool owner) { - if (size < sizeof(pnanovdb_grid_t) + sizeof(pnanovdb_tree_t)) - return 0; // This cannot be a valid NanoVDB grid with data + if (size > 0 && size < sizeof(pnanovdb_grid_t) + sizeof(pnanovdb_tree_t)) + return 0; // This cannot be a valid NanoVDB grid with data + + if (!copy && volume_exists(buf)) + { + // descriptor already created for this volume + return 0; + } ContextGuard guard(context); VolumeDesc volume; - volume.context = context ? context : cuda_context_get_current(); memcpy_d2h(WP_CURRENT_CONTEXT, &volume.grid_data, buf, sizeof(pnanovdb_grid_t)); - memcpy_d2h(WP_CURRENT_CONTEXT, &volume.tree_data, (pnanovdb_grid_t*)buf + 1, sizeof(pnanovdb_tree_t)); + memcpy_d2h(WP_CURRENT_CONTEXT, &volume.tree_data, (pnanovdb_grid_t *)buf + 1, sizeof(pnanovdb_tree_t)); + // no sync needed since the above copies are to pageable memory - if (volume.grid_data.magic != PNANOVDB_MAGIC_NUMBER) + if (volume.grid_data.magic != PNANOVDB_MAGIC_NUMBER && volume.grid_data.magic != PNANOVDB_MAGIC_GRID) return 0; + if (size == 0) + { + size = volume.grid_data.grid_size; + } + + // Copy or alias data buffer volume.size_in_bytes = size; - volume.buffer = alloc_device(WP_CURRENT_CONTEXT, size); - memcpy_d2d(WP_CURRENT_CONTEXT, volume.buffer, buf, size); + if (copy) + { + volume.buffer = alloc_device(WP_CURRENT_CONTEXT, size); + memcpy_d2d(WP_CURRENT_CONTEXT, volume.buffer, buf, size); + volume.owner = true; + } + else + { + volume.buffer = buf; + volume.owner = owner; + } - volume.first_voxel_data_offs = - sizeof(pnanovdb_grid_t) + volume.tree_data.node_offset_leaf + PNANOVDB_GRID_TYPE_GET(PNANOVDB_GRID_TYPE_FLOAT, leaf_off_table); + // Make blind metadata accessible on host + const uint64_t blindmetadata_size = volume.grid_data.blind_metadata_count * sizeof(pnanovdb_gridblindmetadata_t); + volume.blind_metadata = static_cast(alloc_pinned(blindmetadata_size)); + memcpy_d2h(WP_CURRENT_CONTEXT, volume.blind_metadata, + static_cast(volume.buffer) + volume.grid_data.blind_metadata_offset, blindmetadata_size); uint64_t id = (uint64_t)volume.buffer; - - volume_add_descriptor(id, volume); + volume_add_descriptor(id, std::move(volume)); return id; } -static void volume_get_buffer_info(uint64_t id, void** buf, uint64_t* size) +void volume_get_buffer_info(uint64_t id, void **buf, uint64_t *size) { *buf = 0; *size = 0; - VolumeDesc volume; + const VolumeDesc *volume; if (volume_get_descriptor(id, volume)) { - *buf = volume.buffer; - *size = volume.size_in_bytes; + *buf = volume->buffer; + *size = volume->size_in_bytes; } } -void volume_get_buffer_info_host(uint64_t id, void** buf, uint64_t* size) +void volume_get_voxel_size(uint64_t id, float *dx, float *dy, float *dz) { - volume_get_buffer_info(id, buf, size); + *dx = *dx = *dz = 0.0f; + + const VolumeDesc *volume; + if (volume_get_descriptor(id, volume)) + { + *dx = (float)volume->grid_data.voxel_size[0]; + *dy = (float)volume->grid_data.voxel_size[1]; + *dz = (float)volume->grid_data.voxel_size[2]; + } } -void 
volume_get_buffer_info_device(uint64_t id, void** buf, uint64_t* size) +void volume_get_tile_and_voxel_count(uint64_t id, uint32_t &tile_count, uint64_t &voxel_count) { - volume_get_buffer_info(id, buf, size); + tile_count = 0; + voxel_count = 0; + + const VolumeDesc *volume; + if (volume_get_descriptor(id, volume)) + { + tile_count = volume->tree_data.node_count_leaf; + + const uint32_t grid_type = volume->grid_data.grid_type; + + switch (grid_type) + { + case PNANOVDB_GRID_TYPE_ONINDEX: + case PNANOVDB_GRID_TYPE_ONINDEXMASK: + // number of indexable voxels is number of active voxels + voxel_count = volume->tree_data.voxel_count; + break; + default: + // all leaf voxels are indexable + voxel_count = uint64_t(tile_count) * PNANOVDB_LEAF_TABLE_COUNT; + } + } } -void volume_get_voxel_size(uint64_t id, float* dx, float* dy, float* dz) +const char *volume_get_grid_info(uint64_t id, uint64_t *grid_size, uint32_t *grid_index, uint32_t *grid_count, + float translation[3], float transform[9], char type_str[16]) { - *dx = *dx = *dz = 0.0f; + const VolumeDesc *volume; + if (volume_get_descriptor(id, volume)) + { + const pnanovdb_grid_t &grid_data = volume->grid_data; + *grid_count = grid_data.grid_count; + *grid_index = grid_data.grid_index; + *grid_size = grid_data.grid_size; - VolumeDesc volume; + memcpy(translation, grid_data.map.vecf, sizeof(grid_data.map.vecf)); + memcpy(transform, grid_data.map.matf, sizeof(grid_data.map.matf)); + + nanovdb::toStr(type_str, static_cast(grid_data.grid_type)); + return (const char *)grid_data.grid_name; + } + + *grid_size = 0; + *grid_index = 0; + *grid_count = 0; + type_str[0] = 0; + + return nullptr; +} + +uint32_t volume_get_blind_data_count(uint64_t id) +{ + const VolumeDesc *volume; if (volume_get_descriptor(id, volume)) { - *dx = (float)volume.grid_data.voxel_size[0]; - *dy = (float)volume.grid_data.voxel_size[1]; - *dz = (float)volume.grid_data.voxel_size[2]; + return volume->grid_data.blind_metadata_count; } + return 0; } -void volume_get_tiles_host(uint64_t id, void** buf, uint64_t* size) +const char *volume_get_blind_data_info(uint64_t id, uint32_t data_index, void **buf, uint64_t *value_count, + uint32_t *value_size, char type_str[16]) { - static constexpr uint32_t MASK = (1u << 3u) - 1u; // mask for bit operations + const VolumeDesc *volume; + if (volume_get_descriptor(id, volume) && data_index < volume->grid_data.blind_metadata_count) + { + const pnanovdb_gridblindmetadata_t &metadata = volume->blind_metadata[data_index]; + *value_count = metadata.value_count; + *value_size = metadata.value_size; + + nanovdb::toStr(type_str, static_cast(metadata.data_type)); + *buf = static_cast(volume->buffer) + volume->grid_data.blind_metadata_offset + + data_index * sizeof(pnanovdb_gridblindmetadata_t) + metadata.data_offset; + return (const char *)metadata.name; + } + *buf = nullptr; + *value_count = 0; + *value_size = 0; + type_str[0] = 0; + return nullptr; +} - *buf = 0; - *size = 0; +void volume_get_tiles_host(uint64_t id, void *buf) +{ + static constexpr uint32_t MASK = (1u << 3u) - 1u; // mask for bit operations - VolumeDesc volume; + const VolumeDesc *volume; if (volume_get_descriptor(id, volume)) { - const uint32_t leaf_count = volume.tree_data.node_count_leaf; - *size = leaf_count * sizeof(pnanovdb_coord_t); + const uint32_t leaf_count = volume->tree_data.node_count_leaf; + + pnanovdb_coord_t *leaf_coords = static_cast(buf); + + const uint64_t first_leaf = + (uint64_t)volume->buffer + sizeof(pnanovdb_grid_t) + volume->tree_data.node_offset_leaf; + const 
uint32_t leaf_stride = PNANOVDB_GRID_TYPE_GET(volume->grid_data.grid_type, leaf_size); + + const pnanovdb_buf_t pnano_buf = volume->as_pnano(); + + for (uint32_t i = 0; i < leaf_count; ++i) + { + pnanovdb_leaf_handle_t leaf = volume::get_leaf(pnano_buf, i); + leaf_coords[i] = volume::leaf_origin(pnano_buf, leaf); + } + } +} - pnanovdb_coord_t *leaf_coords = (pnanovdb_coord_t*)alloc_host(*size); - *buf = leaf_coords; +void volume_get_voxels_host(uint64_t id, void *buf) +{ + const VolumeDesc *volume; + if (volume_get_descriptor(id, volume)) + { + uint32_t leaf_count; + uint64_t voxel_count; + volume_get_tile_and_voxel_count(id, leaf_count, voxel_count); - const uint64_t first_leaf = (uint64_t)volume.buffer + sizeof(pnanovdb_grid_t) + volume.tree_data.node_offset_leaf; - const uint32_t leaf_stride = PNANOVDB_GRID_TYPE_GET(volume.grid_data.grid_type, leaf_size); + pnanovdb_coord_t *voxel_coords = static_cast(buf); + const pnanovdb_buf_t pnano_buf = volume->as_pnano(); for (uint32_t i = 0; i < leaf_count; ++i) { - leaf_coords[i] = ((pnanovdb_leaf_t*)(first_leaf + leaf_stride * i))->bbox_min; - leaf_coords[i].x &= ~MASK; - leaf_coords[i].y &= ~MASK; - leaf_coords[i].z &= ~MASK; + pnanovdb_leaf_handle_t leaf = volume::get_leaf(pnano_buf, i); + pnanovdb_coord_t leaf_coords = volume::leaf_origin(pnano_buf, leaf); + + for (uint32_t n = 0; n < 512; ++n) + { + pnanovdb_coord_t loc_ijk = volume::leaf_offset_to_local_coord(n); + pnanovdb_coord_t ijk = { + loc_ijk.x + leaf_coords.x, + loc_ijk.y + leaf_coords.y, + loc_ijk.z + leaf_coords.z, + }; + + const uint64_t index = volume::leaf_voxel_index(pnano_buf, i, ijk); + if (index < voxel_count) + { + voxel_coords[index] = ijk; + } + } } } } void volume_destroy_host(uint64_t id) { - free_host((void*)id); - volume_rem_descriptor(id); + const VolumeDesc *volume; + if (volume_get_descriptor(id, volume)) + { + if (volume->owner) + { + free_host(volume->buffer); + } + volume_rem_descriptor(id); + } } void volume_destroy_device(uint64_t id) { - VolumeDesc volume; + const VolumeDesc *volume; if (volume_get_descriptor(id, volume)) { - ContextGuard guard(volume.context); - free_device(WP_CURRENT_CONTEXT, volume.buffer); + ContextGuard guard(volume->context); + if (volume->owner) + { + free_device(WP_CURRENT_CONTEXT, volume->buffer); + } + free_pinned(volume->blind_metadata); volume_rem_descriptor(id); } } - #if WP_ENABLE_CUDA -uint64_t volume_f_from_tiles_device(void* context, void* points, int num_points, float voxel_size, float bg_value, float tx, float ty, float tz, bool points_in_world_space) +uint64_t volume_f_from_tiles_device(void *context, void *points, int num_points, float voxel_size, float bg_value, + float tx, float ty, float tz, bool points_in_world_space) { - nanovdb::FloatGrid* grid; + nanovdb::FloatGrid *grid; size_t gridSize; BuildGridParams params; params.voxel_size = voxel_size; params.background_value = bg_value; params.translation = nanovdb::Vec3f{tx, ty, tz}; - build_grid_from_tiles(grid, gridSize, points, num_points, points_in_world_space, params); + build_grid_from_points(grid, gridSize, points, num_points, points_in_world_space, params); - return volume_create_device(context, grid, gridSize); + return volume_create_device(context, grid, gridSize, false, true); } -uint64_t volume_v_from_tiles_device(void* context, void* points, int num_points, float voxel_size, float bg_value_x, float bg_value_y, float bg_value_z, float tx, float ty, float tz, bool points_in_world_space) +uint64_t volume_v_from_tiles_device(void *context, void *points, int 
num_points, float voxel_size, float bg_value_x, + float bg_value_y, float bg_value_z, float tx, float ty, float tz, + bool points_in_world_space) { - nanovdb::Vec3fGrid* grid; + nanovdb::Vec3fGrid *grid; size_t gridSize; BuildGridParams params; params.voxel_size = voxel_size; params.background_value = nanovdb::Vec3f{bg_value_x, bg_value_y, bg_value_z}; params.translation = nanovdb::Vec3f{tx, ty, tz}; - build_grid_from_tiles(grid, gridSize, points, num_points, points_in_world_space, params); + build_grid_from_points(grid, gridSize, points, num_points, points_in_world_space, params); - return volume_create_device(context, grid, gridSize); + return volume_create_device(context, grid, gridSize, false, true); } -uint64_t volume_i_from_tiles_device(void* context, void* points, int num_points, float voxel_size, int bg_value, float tx, float ty, float tz, bool points_in_world_space) +uint64_t volume_i_from_tiles_device(void *context, void *points, int num_points, float voxel_size, int bg_value, + float tx, float ty, float tz, bool points_in_world_space) { - nanovdb::Int32Grid* grid; + nanovdb::Int32Grid *grid; size_t gridSize; BuildGridParams params; params.voxel_size = voxel_size; params.background_value = (int32_t)(bg_value); params.translation = nanovdb::Vec3f{tx, ty, tz}; - build_grid_from_tiles(grid, gridSize, points, num_points, points_in_world_space, params); + build_grid_from_points(grid, gridSize, points, num_points, points_in_world_space, params); - return volume_create_device(context, grid, gridSize); + return volume_create_device(context, grid, gridSize, false, true); } -void launch_get_leaf_coords(void* context, const uint32_t leaf_count, pnanovdb_coord_t *leaf_coords, const uint64_t first_leaf, const uint32_t leaf_stride); +uint64_t volume_index_from_tiles_device(void *context, void *points, int num_points, float voxel_size, float tx, + float ty, float tz, bool points_in_world_space) +{ + nanovdb::IndexGrid *grid; + size_t gridSize; + BuildGridParams params; + params.voxel_size = voxel_size; + params.translation = nanovdb::Vec3f{tx, ty, tz}; -void volume_get_tiles_device(uint64_t id, void** buf, uint64_t* size) + build_grid_from_points(grid, gridSize, points, num_points, points_in_world_space, params); + + return volume_create_device(context, grid, gridSize, false, true); +} + +uint64_t volume_from_active_voxels_device(void *context, void *points, int num_points, float voxel_size, float tx, + float ty, float tz, bool points_in_world_space) { - *buf = 0; - *size = 0; + nanovdb::OnIndexGrid *grid; + size_t gridSize; + BuildGridParams params; + params.voxel_size = voxel_size; + params.translation = nanovdb::Vec3f{tx, ty, tz}; - VolumeDesc volume; + build_grid_from_points(grid, gridSize, points, num_points, points_in_world_space, params); + + return volume_create_device(context, grid, gridSize, false, true); +} + +void launch_get_leaf_coords(void *context, const uint32_t leaf_count, pnanovdb_coord_t *leaf_coords, + pnanovdb_buf_t buf); +void launch_get_voxel_coords(void *context, const uint32_t leaf_count, const uint32_t voxel_count, + pnanovdb_coord_t *voxel_coords, pnanovdb_buf_t buf); + +void volume_get_tiles_device(uint64_t id, void *buf) +{ + const VolumeDesc *volume; if (volume_get_descriptor(id, volume)) { - const uint32_t leaf_count = volume.tree_data.node_count_leaf; - *size = leaf_count * sizeof(pnanovdb_coord_t); + const uint32_t leaf_count = volume->tree_data.node_count_leaf; - pnanovdb_coord_t *leaf_coords = (pnanovdb_coord_t*)alloc_device(volume.context, *size); - *buf = 
leaf_coords; + pnanovdb_coord_t *leaf_coords = static_cast(buf); + launch_get_leaf_coords(volume->context, leaf_count, leaf_coords, volume->as_pnano()); + } +} - const uint64_t first_leaf = (uint64_t)volume.buffer + sizeof(pnanovdb_grid_t) + volume.tree_data.node_offset_leaf; - const uint32_t leaf_stride = PNANOVDB_GRID_TYPE_GET(volume.grid_data.grid_type, leaf_size); +void volume_get_voxels_device(uint64_t id, void *buf) +{ + const VolumeDesc *volume; + if (volume_get_descriptor(id, volume)) + { + uint32_t leaf_count; + uint64_t voxel_count; + volume_get_tile_and_voxel_count(id, leaf_count, voxel_count); - launch_get_leaf_coords(volume.context, leaf_count, leaf_coords, first_leaf, leaf_stride); + pnanovdb_coord_t *voxel_coords = static_cast(buf); + launch_get_voxel_coords(volume->context, leaf_count, voxel_count, voxel_coords, volume->as_pnano()); } } + #else // stubs for non-CUDA platforms -uint64_t volume_f_from_tiles_device(void* context, void* points, int num_points, float voxel_size, float bg_value, float tx, float ty, float tz, bool points_in_world_space) +uint64_t volume_f_from_tiles_device(void *context, void *points, int num_points, float voxel_size, float bg_value, + float tx, float ty, float tz, bool points_in_world_space) +{ + return 0; +} + +uint64_t volume_v_from_tiles_device(void *context, void *points, int num_points, float voxel_size, float bg_value_x, + float bg_value_y, float bg_value_z, float tx, float ty, float tz, + bool points_in_world_space) +{ + return 0; +} + +uint64_t volume_i_from_tiles_device(void *context, void *points, int num_points, float voxel_size, int bg_value, + float tx, float ty, float tz, bool points_in_world_space) { return 0; } -uint64_t volume_v_from_tiles_device(void* context, void* points, int num_points, float voxel_size, float bg_value_x, float bg_value_y, float bg_value_z, float tx, float ty, float tz, bool points_in_world_space) +uint64_t volume_index_from_tiles_device(void *context, void *points, int num_points, float voxel_size, + float tx, float ty, float tz, bool points_in_world_space) { return 0; } -uint64_t volume_i_from_tiles_device(void* context, void* points, int num_points, float voxel_size, int bg_value, float tx, float ty, float tz, bool points_in_world_space) +uint64_t volume_from_active_voxels_device(void *context, void *points, int num_points, float voxel_size, float tx, + float ty, float tz, bool points_in_world_space) { return 0; } -void volume_get_tiles_device(uint64_t id, void** buf, uint64_t* size) {} +void volume_get_tiles_device(uint64_t id, void *buf) +{ +} + +void volume_get_voxels_device(uint64_t id, void *buf) +{ +} #endif diff --git a/warp/native/volume.cu b/warp/native/volume.cu index b0c67dbe..c1586f24 100644 --- a/warp/native/volume.cu +++ b/warp/native/volume.cu @@ -6,27 +6,53 @@ * license agreement from NVIDIA CORPORATION is strictly prohibited. 
*/ -#include "volume.h" -#include "warp.h" #include "cuda_util.h" +#include "volume_impl.h" +#include "warp.h" + +__global__ void volume_get_leaf_coords(const uint32_t leaf_count, pnanovdb_coord_t *leaf_coords, + const pnanovdb_buf_t buf) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid < leaf_count) + { + pnanovdb_leaf_handle_t leaf = wp::volume::get_leaf(buf, tid); + leaf_coords[tid] = wp::volume::leaf_origin(buf, leaf); + } +} -__global__ void volume_get_leaf_coords(const uint32_t leaf_count, pnanovdb_coord_t *leaf_coords, const uint64_t first_leaf, const uint32_t leaf_stride) -{ - static constexpr uint32_t MASK = (1u << 3u) - 1u; // mask for bit operations +__global__ void volume_get_voxel_coords(const uint32_t voxel_count, pnanovdb_coord_t *voxel_coords, + const pnanovdb_buf_t buf) +{ + const uint32_t leaf_index = blockIdx.x; + pnanovdb_leaf_handle_t leaf = wp::volume::get_leaf(buf, leaf_index); + pnanovdb_coord_t leaf_coords = wp::volume::leaf_origin(buf, leaf); - const int tid = blockIdx.x*blockDim.x + threadIdx.x; + pnanovdb_coord_t ijk = { + int32_t(threadIdx.x) + leaf_coords.x, + int32_t(threadIdx.y) + leaf_coords.y, + int32_t(threadIdx.z) + leaf_coords.z, + }; - if (tid < leaf_count) { - leaf_coords[tid] = ((pnanovdb_leaf_t*)(first_leaf + leaf_stride * tid))->bbox_min; - leaf_coords[tid].x &= ~MASK; - leaf_coords[tid].y &= ~MASK; - leaf_coords[tid].z &= ~MASK; + const uint64_t index = wp::volume::leaf_voxel_index(buf, leaf_index, ijk); + if (index < voxel_count) + { + voxel_coords[index] = ijk; } } -void launch_get_leaf_coords(void* context, const uint32_t leaf_count, pnanovdb_coord_t *leaf_coords, const uint64_t first_leaf, const uint32_t leaf_stride) +void launch_get_leaf_coords(void *context, const uint32_t leaf_count, pnanovdb_coord_t *leaf_coords, pnanovdb_buf_t buf) +{ + ContextGuard guard(context); + wp_launch_device(WP_CURRENT_CONTEXT, volume_get_leaf_coords, leaf_count, (leaf_count, leaf_coords, buf)); +} + +void launch_get_voxel_coords(void *context, const uint32_t leaf_count, const uint32_t voxel_count, + pnanovdb_coord_t *voxel_coords, pnanovdb_buf_t buf) { ContextGuard guard(context); - wp_launch_device(WP_CURRENT_CONTEXT, volume_get_leaf_coords, leaf_count, (leaf_count, leaf_coords, first_leaf, leaf_stride)); + cudaStream_t stream = (cudaStream_t)cuda_stream_get_current(); + volume_get_voxel_coords<<>>(voxel_count, voxel_coords, buf); } diff --git a/warp/native/volume.h b/warp/native/volume.h index f15b398d..319f2a1f 100644 --- a/warp/native/volume.h +++ b/warp/native/volume.h @@ -8,529 +8,951 @@ #pragma once +#include "array.h" #include "builtin.h" #define PNANOVDB_C #define PNANOVDB_MEMCPY_CUSTOM #define pnanovdb_memcpy memcpy + +#if defined(WP_NO_CRT) && !defined(__CUDACC__) +// PNanoVDB will try to include unless __CUDACC_RTC__ is defined +#define __CUDACC_RTC__ +#endif + #include "nanovdb/PNanoVDB.h" -#include "nanovdb/PNanoVDBWrite.h" + +#if defined(WP_NO_CRT) && !defined(__CUDACC__) +#undef __CUDACC_RTC__ +#endif namespace wp { namespace volume { +// Need to kept in sync with constants in python-side Volume class static constexpr int CLOSEST = 0; static constexpr int LINEAR = 1; -// helper functions +// pnanovdb helper function + CUDA_CALLABLE inline pnanovdb_buf_t id_to_buffer(uint64_t id) { pnanovdb_buf_t buf; - buf.data = (uint32_t*)id; + buf.data = (uint32_t *)id; return buf; } -CUDA_CALLABLE inline pnanovdb_uint32_t get_grid_type(const pnanovdb_buf_t& buf) +CUDA_CALLABLE inline pnanovdb_grid_handle_t get_grid(pnanovdb_buf_t buf) { 
- const pnanovdb_grid_t *grid_data = (const pnanovdb_grid_t*)buf.data; - return grid_data->grid_type; + return {0u}; } -CUDA_CALLABLE inline pnanovdb_root_handle_t get_root(const pnanovdb_buf_t& buf, - const pnanovdb_grid_handle_t& grid = { 0u }) +CUDA_CALLABLE inline pnanovdb_uint32_t get_grid_type(pnanovdb_buf_t buf) { - const auto tree = pnanovdb_grid_get_tree(buf, grid); - return pnanovdb_tree_get_root(buf, tree); + return pnanovdb_grid_get_grid_type(buf, get_grid(buf)); } -} // namespace volume -CUDA_CALLABLE inline void pnano_read(float& result, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk) { - const pnanovdb_address_t address = pnanovdb_root_get_value_address(PNANOVDB_GRID_TYPE_FLOAT, buf, root, ijk); - result = pnanovdb_read_float(buf, address); +CUDA_CALLABLE inline pnanovdb_tree_handle_t get_tree(pnanovdb_buf_t buf) +{ + return pnanovdb_grid_get_tree(buf, get_grid(buf)); } -CUDA_CALLABLE inline void pnano_read(int32_t& result, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk) { - const pnanovdb_address_t address = pnanovdb_root_get_value_address(PNANOVDB_GRID_TYPE_INT32, buf, root, ijk); - result = pnanovdb_read_int32(buf, address); + +CUDA_CALLABLE inline pnanovdb_root_handle_t get_root(pnanovdb_buf_t buf) +{ + return pnanovdb_tree_get_root(buf, get_tree(buf)); } -CUDA_CALLABLE inline void pnano_read(vec3& result, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, PNANOVDB_IN(pnanovdb_coord_t) ijk) { - const pnanovdb_address_t address = pnanovdb_root_get_value_address(PNANOVDB_GRID_TYPE_VEC3F, buf, root, ijk); - const pnanovdb_vec3_t v = pnanovdb_read_vec3f(buf, address); - result = {v.x, v.y, v.z}; + +template struct pnano_traits +{ +}; + +// to add support for more grid types, extend this +// and update _volume_supported_value_types in builtins.py + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_INT32; +}; + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_INT64; +}; + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_UINT32; +}; + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_FLOAT; +}; + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_DOUBLE; +}; + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_VEC3F; +}; + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_VEC3D; +}; + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_VEC4F; +}; + +template <> struct pnano_traits +{ + static constexpr int GRID_TYPE = PNANOVDB_GRID_TYPE_VEC4D; +}; + +// common accessors over various grid types +// WARNING: implementation below only for >=32b values, but that's the case for all types above +// for smaller types add a specialization + +template CUDA_CALLABLE inline void pnano_read(T &result, pnanovdb_buf_t buf, pnanovdb_address_t address) +{ + result = *reinterpret_cast(buf.data + (address.byte_offset >> 2)); } -CUDA_CALLABLE inline void pnano_read(float& result, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk) { - pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address(PNANOVDB_GRID_TYPE_FLOAT, buf, acc, ijk); - result = pnanovdb_read_float(buf, address); +template +CUDA_CALLABLE inline void pnano_write(const T &value, pnanovdb_buf_t 
buf, pnanovdb_address_t address) +{ + *reinterpret_cast(buf.data + (address.byte_offset >> 2)) = value; } -CUDA_CALLABLE inline void pnano_read(int32_t& result, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk) { - pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address(PNANOVDB_GRID_TYPE_INT32, buf, acc, ijk); - result = pnanovdb_read_int32(buf, address); + +template +CUDA_CALLABLE inline void pnano_read(T &result, pnanovdb_buf_t buf, pnanovdb_root_handle_t root, + PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + using traits = pnano_traits; + const pnanovdb_address_t address = pnanovdb_root_get_value_address(traits::GRID_TYPE, buf, root, ijk); + pnano_read(result, buf, address); } -CUDA_CALLABLE inline void pnano_read(vec3& result, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, PNANOVDB_IN(pnanovdb_coord_t) ijk) { - pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address(PNANOVDB_GRID_TYPE_VEC3F, buf, acc, ijk); - const pnanovdb_vec3_t v = pnanovdb_read_vec3f(buf, address); - result = {v.x, v.y, v.z}; + +template +CUDA_CALLABLE inline void pnano_read(T &result, pnanovdb_buf_t buf, PNANOVDB_INOUT(pnanovdb_readaccessor_t) acc, + PNANOVDB_IN(pnanovdb_coord_t) ijk) +{ + using traits = pnano_traits; + // pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address(traits::GRID_TYPE, buf, acc, ijk); + pnanovdb_uint32_t level; + const pnanovdb_address_t address = + pnanovdb_readaccessor_get_value_address_and_level(traits::GRID_TYPE, buf, acc, ijk, PNANOVDB_REF(level)); + pnano_read(result, buf, address); } -// Sampling the volume at the given index-space coordinates, uvw can be fractional -template -CUDA_CALLABLE inline T volume_sample(uint64_t id, vec3 uvw, int sampling_mode) +/// regular grid accessor (values stored in leafs) + +struct value_accessor_base { - const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_root_handle_t root = volume::get_root(buf); - const pnanovdb_vec3_t uvw_pnano{ uvw[0], uvw[1], uvw[2] }; + pnanovdb_buf_t buf; + pnanovdb_root_handle_t root; + pnanovdb_readaccessor_t accessor; + + explicit inline CUDA_CALLABLE value_accessor_base(const pnanovdb_buf_t buf) : buf(buf), root(get_root(buf)) + { + } + + CUDA_CALLABLE inline void init_cache() + { + pnanovdb_readaccessor_init(PNANOVDB_REF(accessor), root); + } +}; + +template struct leaf_value_accessor : value_accessor_base +{ + using ValueType = T; + + explicit inline CUDA_CALLABLE leaf_value_accessor(const pnanovdb_buf_t buf) : value_accessor_base(buf) + { + } + + CUDA_CALLABLE inline bool is_valid() const + { + return get_grid_type(buf) == pnano_traits::GRID_TYPE; + } - if (sampling_mode == volume::CLOSEST) + CUDA_CALLABLE inline T read_single(const pnanovdb_coord_t &ijk) const { - const pnanovdb_coord_t ijk = pnanovdb_vec3_round_to_coord(uvw_pnano); T val; pnano_read(val, buf, root, PNANOVDB_REF(ijk)); return val; } - else if (sampling_mode == volume::LINEAR) + + CUDA_CALLABLE inline T read_cache(const pnanovdb_coord_t &ijk) { - // NB. 
linear sampling is not used on int volumes - constexpr pnanovdb_coord_t OFFSETS[] = { - { 0, 0, 0 }, { 0, 0, 1 }, { 0, 1, 0 }, { 0, 1, 1 }, { 1, 0, 0 }, { 1, 0, 1 }, { 1, 1, 0 }, { 1, 1, 1 }, - }; + T val; + pnano_read(val, buf, PNANOVDB_REF(accessor), PNANOVDB_REF(ijk)); + return val; + } - const pnanovdb_vec3_t ijk_base{ floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z) }; - const pnanovdb_vec3_t ijk_frac{ uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z }; - const pnanovdb_coord_t ijk{ (pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, (pnanovdb_int32_t)ijk_base.z }; + CUDA_CALLABLE inline void adj_read_single(const pnanovdb_coord_t &ijk, const T &adj_ret) + { + // NOP + } - pnanovdb_readaccessor_t accessor; - pnanovdb_readaccessor_init(PNANOVDB_REF(accessor), root); - T val = 0; - const float wx[2]{ 1 - ijk_frac.x, ijk_frac.x }; - const float wy[2]{ 1 - ijk_frac.y, ijk_frac.y }; - const float wz[2]{ 1 - ijk_frac.z, ijk_frac.z }; - for (int idx = 0; idx < 8; ++idx) - { - const pnanovdb_coord_t& offs = OFFSETS[idx]; - const pnanovdb_coord_t ijk_shifted = pnanovdb_coord_add(ijk, offs); - T v; - pnano_read(v, buf, PNANOVDB_REF(accessor), PNANOVDB_REF(ijk_shifted)); - val = add(val, T(wx[offs.x] * wy[offs.y] * wz[offs.z] * v)); - } - return val; + CUDA_CALLABLE inline void adj_read_cache(const pnanovdb_coord_t &ijk, const T &adj_ret) + { + // NOP } - return 0; -} +}; -// Sampling a float volume at the given index-space coordinates, uvw can be fractional -CUDA_CALLABLE inline float volume_sample_f(uint64_t id, vec3 uvw, int sampling_mode) +CUDA_CALLABLE inline pnanovdb_uint64_t leaf_regular_get_voxel_index(pnanovdb_buf_t buf, + pnanovdb_address_t value_address, + PNANOVDB_IN(pnanovdb_coord_t) ijk) { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_FLOAT) return 0.f; - return volume_sample(id, uvw, sampling_mode); + // compute leaf index from value address, assuming all leaf voxels are allocated + const pnanovdb_grid_type_t grid_type = get_grid_type(buf); + const pnanovdb_uint32_t n = pnanovdb_leaf_coord_to_offset(ijk); + const pnanovdb_uint32_t byte_offset = PNANOVDB_GRID_TYPE_GET(grid_type, leaf_off_table) + + ((PNANOVDB_GRID_TYPE_GET(grid_type, value_stride_bits) * n) >> 3u); + const pnanovdb_address_t leaf_address = pnanovdb_address_offset_neg(value_address, byte_offset); + + const pnanovdb_uint64_t first_leaf_offset = pnanovdb_tree_get_node_offset_leaf(buf, get_tree(buf)); + const pnanovdb_uint32_t leaf_size = PNANOVDB_GRID_TYPE_GET(grid_type, leaf_size); + const pnanovdb_uint64_t leaf_index = (leaf_address.byte_offset - first_leaf_offset) / leaf_size; + + return leaf_index * PNANOVDB_LEAF_TABLE_COUNT + n + 1; } -// Sampling an int volume at the given index-space coordinates, uvw can be fractional -CUDA_CALLABLE inline int32_t volume_sample_i(uint64_t id, vec3 uvw) +CUDA_CALLABLE inline pnanovdb_uint64_t get_grid_voxel_index(pnanovdb_grid_type_t grid_type, pnanovdb_buf_t buf, + pnanovdb_address_t value_address, + const pnanovdb_coord_t &ijk) { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_INT32) return 0; - return volume_sample(id, uvw, volume::CLOSEST); -} + switch (grid_type) + { + case PNANOVDB_GRID_TYPE_INDEX: + return pnanovdb_leaf_index_get_value_index(buf, value_address, PNANOVDB_REF(ijk)); + case PNANOVDB_GRID_TYPE_ONINDEX: + return pnanovdb_leaf_onindex_get_value_index(buf, value_address, PNANOVDB_REF(ijk)); + case PNANOVDB_GRID_TYPE_INDEXMASK: + return 
pnanovdb_leaf_indexmask_get_value_index(buf, value_address, PNANOVDB_REF(ijk)); + case PNANOVDB_GRID_TYPE_ONINDEXMASK: + return pnanovdb_leaf_onindexmask_get_value_index(buf, value_address, PNANOVDB_REF(ijk)); + default: + return leaf_regular_get_voxel_index(buf, value_address, PNANOVDB_REF(ijk)); + } +}; -// Sampling a vector volume at the given index-space coordinates, uvw can be fractional -CUDA_CALLABLE inline vec3 volume_sample_v(uint64_t id, vec3 uvw, int sampling_mode) +/// index grid accessor +template struct index_value_accessor : value_accessor_base { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_VEC3F) return vec3(0.f); - return volume_sample(id, uvw, sampling_mode); + using ValueType = T; + + pnanovdb_grid_type_t grid_type; + array_t data; + const T &background; + T *adj_background; + + explicit inline CUDA_CALLABLE index_value_accessor(const pnanovdb_buf_t buf, const array_t &data, + const T &background, T *adj_background = nullptr) + : value_accessor_base(buf), grid_type(get_grid_type(buf)), data(data), background(background), + adj_background(adj_background) + { + } + + CUDA_CALLABLE inline bool is_valid() const + { + // Accessor is valid for all grid types + return true; + } + + CUDA_CALLABLE inline T read_single(const pnanovdb_coord_t &ijk) const + { + pnanovdb_uint32_t level; + const pnanovdb_address_t address = + pnanovdb_root_get_value_address_and_level(grid_type, buf, root, PNANOVDB_REF(ijk), PNANOVDB_REF(level)); + return read_at(level, address, ijk); + } + + CUDA_CALLABLE inline T read_cache(const pnanovdb_coord_t &ijk) + { + pnanovdb_uint32_t level; + const pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address_and_level( + grid_type, buf, PNANOVDB_REF(accessor), PNANOVDB_REF(ijk), PNANOVDB_REF(level)); + return read_at(level, address, ijk); + } + + CUDA_CALLABLE inline T read_at(pnanovdb_uint32_t level, const pnanovdb_address_t address, + const pnanovdb_coord_t &ijk) const + { + if (level == 0) + { + pnanovdb_uint64_t voxel_index = get_grid_voxel_index(grid_type, buf, address, ijk); + + if (voxel_index > 0) + { + return *wp::address(data, voxel_index - 1); + } + } + + return background; + } + + CUDA_CALLABLE inline void adj_read_single(const pnanovdb_coord_t &ijk, const T &adj_ret) + { + pnanovdb_uint32_t level; + const pnanovdb_address_t address = + pnanovdb_root_get_value_address_and_level(grid_type, buf, root, PNANOVDB_REF(ijk), PNANOVDB_REF(level)); + adj_read_at(level, address, ijk, adj_ret); + } + + CUDA_CALLABLE inline void adj_read_cache(const pnanovdb_coord_t &ijk, const T &adj_ret) + { + pnanovdb_uint32_t level; + const pnanovdb_address_t address = pnanovdb_readaccessor_get_value_address_and_level( + grid_type, buf, PNANOVDB_REF(accessor), PNANOVDB_REF(ijk), PNANOVDB_REF(level)); + adj_read_at(level, address, ijk, adj_ret); + } + + CUDA_CALLABLE inline void adj_read_at(pnanovdb_uint32_t level, const pnanovdb_address_t address, + const pnanovdb_coord_t &ijk, const T &adj_ret) const + { + if (level == 0) + { + pnanovdb_uint64_t voxel_index = get_grid_voxel_index(grid_type, buf, address, ijk); + + if (voxel_index > 0) + { + adj_atomic_add(&index_grad(data, voxel_index - 1), adj_ret); + return; + } + } + *adj_background += adj_ret; + } +}; + +CUDA_CALLABLE inline pnanovdb_coord_t vec3_round_to_coord(const pnanovdb_vec3_t a) +{ + pnanovdb_coord_t v; + v.x = pnanovdb_float_to_int32(roundf(a.x)); + v.y = pnanovdb_float_to_int32(roundf(a.y)); + v.z = pnanovdb_float_to_int32(roundf(a.z)); + return v; } -CUDA_CALLABLE 
inline void adj_volume_sample_f( - uint64_t id, vec3 uvw, int sampling_mode, uint64_t& adj_id, vec3& adj_uvw, int& adj_sampling_mode, const float& adj_ret) +template struct val_traits { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_FLOAT) return; + using grad_t = vec_t<3, T>; + using scalar_t = T; - if (sampling_mode != volume::LINEAR) { - return; // NOP + // multiplies the gradient on the right + // needs to be specialized for scalar types as gradient is stored as column rather than row vector + static CUDA_CALLABLE inline T rmul(const grad_t &grad, const vec_t<3, scalar_t> &rhs) + { + return dot(grad, rhs); } +}; - const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_root_handle_t root = volume::get_root(buf); - const pnanovdb_vec3_t uvw_pnano{ uvw[0], uvw[1], uvw[2] }; +template struct val_traits> +{ + using grad_t = mat_t<3, Length, T>; + using scalar_t = T; - constexpr pnanovdb_coord_t OFFSETS[] = { - { 0, 0, 0 }, { 0, 0, 1 }, { 0, 1, 0 }, { 0, 1, 1 }, { 1, 0, 0 }, { 1, 0, 1 }, { 1, 1, 0 }, { 1, 1, 1 }, - }; + static CUDA_CALLABLE inline vec_t rmul(const grad_t &grad, const vec_t<3, scalar_t> &rhs) + { + return mul(grad, rhs); + } +}; - const pnanovdb_vec3_t ijk_base{ floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z) }; - const pnanovdb_vec3_t ijk_frac{ uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z }; - const pnanovdb_coord_t ijk{ (pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, (pnanovdb_int32_t)ijk_base.z }; +// Sampling the volume at the given index-space coordinates, uvw can be fractional +template +CUDA_CALLABLE inline typename Accessor::ValueType volume_sample(Accessor &accessor, vec3 uvw, int sampling_mode) +{ + using T = typename Accessor::ValueType; + using w_t = typename val_traits::scalar_t; - pnanovdb_readaccessor_t accessor; - pnanovdb_readaccessor_init(PNANOVDB_REF(accessor), root); - const float wx[2]{ 1 - ijk_frac.x, ijk_frac.x }; - const float wy[2]{ 1 - ijk_frac.y, ijk_frac.y }; - const float wz[2]{ 1 - ijk_frac.z, ijk_frac.z }; - vec3 dphi(0,0,0); - for (int idx = 0; idx < 8; ++idx) + if (!accessor.is_valid()) { - const pnanovdb_coord_t& offs = OFFSETS[idx]; - const pnanovdb_coord_t ijk_shifted = pnanovdb_coord_add(ijk, offs); - float v; - pnano_read(v, buf, PNANOVDB_REF(accessor), PNANOVDB_REF(ijk_shifted)); - const vec3 signs(offs.x * 2 - 1, offs.y * 2 - 1, offs.z * 2 - 1); - const vec3 grad_w(signs[0] * wy[offs.y] * wz[offs.z], signs[1] * wx[offs.x] * wz[offs.z], signs[2] * wx[offs.x] * wy[offs.y]); - dphi = add(dphi, mul(v, grad_w)); + return 0; + } + + const pnanovdb_buf_t buf = accessor.buf; + const pnanovdb_vec3_t uvw_pnano{uvw[0], uvw[1], uvw[2]}; + + if (sampling_mode == CLOSEST) + { + const pnanovdb_coord_t ijk = vec3_round_to_coord(uvw_pnano); + return accessor.read_single(ijk); } + else if (sampling_mode == LINEAR) + { + // NB. 
linear sampling is not used on int volumes + constexpr pnanovdb_coord_t OFFSETS[] = { + {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1}, {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1}, + }; + + const pnanovdb_vec3_t ijk_base{floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z)}; + const pnanovdb_vec3_t ijk_frac{uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z}; + const pnanovdb_coord_t ijk{(pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, + (pnanovdb_int32_t)ijk_base.z}; + + accessor.init_cache(); + T val = 0; + const float wx[2]{1 - ijk_frac.x, ijk_frac.x}; + const float wy[2]{1 - ijk_frac.y, ijk_frac.y}; + const float wz[2]{1 - ijk_frac.z, ijk_frac.z}; + for (int idx = 0; idx < 8; ++idx) + { + const pnanovdb_coord_t &offs = OFFSETS[idx]; + const pnanovdb_coord_t ijk_shifted = pnanovdb_coord_add(ijk, offs); + const T v = accessor.read_cache(ijk_shifted); - adj_uvw += mul(dphi, adj_ret); + const w_t w = wx[offs.x] * wy[offs.y] * wz[offs.z]; + val = add(val, w * v); + } + return val; + } + return 0; } -CUDA_CALLABLE inline void adj_volume_sample_v( - uint64_t id, vec3 uvw, int sampling_mode, uint64_t& adj_id, vec3& adj_uvw, int& adj_sampling_mode, const vec3& adj_ret) +template +CUDA_CALLABLE inline void adj_volume_sample(Accessor &accessor, vec3 uvw, int sampling_mode, vec3 &adj_uvw, + const typename Accessor::ValueType &adj_ret) { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_VEC3F) return; + // TODO: accessor data gradients - if (sampling_mode != volume::LINEAR) { - return; // NOP + using T = typename Accessor::ValueType; + using w_t = typename val_traits::scalar_t; + using w_grad_t = vec_t<3, w_t>; + + if (!accessor.is_valid()) + { + return; } - const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_root_handle_t root = volume::get_root(buf); - const pnanovdb_vec3_t uvw_pnano{ uvw[0], uvw[1], uvw[2] }; + const pnanovdb_buf_t buf = accessor.buf; + const pnanovdb_vec3_t uvw_pnano{uvw[0], uvw[1], uvw[2]}; + + if (sampling_mode != LINEAR) + { + const pnanovdb_coord_t ijk = vec3_round_to_coord(uvw_pnano); + accessor.adj_read_single(ijk, adj_ret); + return; + } constexpr pnanovdb_coord_t OFFSETS[] = { - { 0, 0, 0 }, { 0, 0, 1 }, { 0, 1, 0 }, { 0, 1, 1 }, { 1, 0, 0 }, { 1, 0, 1 }, { 1, 1, 0 }, { 1, 1, 1 }, + {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1}, {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1}, }; - const pnanovdb_vec3_t ijk_base{ floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z) }; - const pnanovdb_vec3_t ijk_frac{ uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z }; - const pnanovdb_coord_t ijk{ (pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, (pnanovdb_int32_t)ijk_base.z }; + const pnanovdb_vec3_t ijk_base{floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z)}; + const pnanovdb_vec3_t ijk_frac{uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z}; + const pnanovdb_coord_t ijk{(pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, + (pnanovdb_int32_t)ijk_base.z}; - pnanovdb_readaccessor_t accessor; - pnanovdb_readaccessor_init(PNANOVDB_REF(accessor), root); - const float wx[2]{ 1 - ijk_frac.x, ijk_frac.x }; - const float wy[2]{ 1 - ijk_frac.y, ijk_frac.y }; - const float wz[2]{ 1 - ijk_frac.z, ijk_frac.z }; - vec3 dphi[3] = {{0,0,0}, {0,0,0}, {0,0,0}}; + accessor.init_cache(); + + const float wx[2]{1 - ijk_frac.x, ijk_frac.x}; + const float wy[2]{1 - ijk_frac.y, ijk_frac.y}; + const float wz[2]{1 - ijk_frac.z, ijk_frac.z}; 
for (int idx = 0; idx < 8; ++idx) { - const pnanovdb_coord_t& offs = OFFSETS[idx]; + const pnanovdb_coord_t &offs = OFFSETS[idx]; const pnanovdb_coord_t ijk_shifted = pnanovdb_coord_add(ijk, offs); - vec3 v; - pnano_read(v, buf, PNANOVDB_REF(accessor), PNANOVDB_REF(ijk_shifted)); + const T v = accessor.read_cache(ijk_shifted); + const vec3 signs(offs.x * 2 - 1, offs.y * 2 - 1, offs.z * 2 - 1); - const vec3 grad_w(signs[0] * wy[offs.y] * wz[offs.z], signs[1] * wx[offs.x] * wz[offs.z], signs[2] * wx[offs.x] * wy[offs.y]); - dphi[0] = add(dphi[0], mul(v[0], grad_w)); - dphi[1] = add(dphi[1], mul(v[1], grad_w)); - dphi[2] = add(dphi[2], mul(v[2], grad_w)); - } - for (int k = 0; k < 3; ++k) - { - adj_uvw[k] += dot(dphi[k], adj_ret); - } -} + const w_t w = wx[offs.x] * wy[offs.y] * wz[offs.z]; + const w_grad_t grad_w(signs[0] * wy[offs.y] * wz[offs.z], signs[1] * wx[offs.x] * wz[offs.z], + signs[2] * wx[offs.x] * wy[offs.y]); -CUDA_CALLABLE inline void adj_volume_sample_i(uint64_t id, vec3 uvw, uint64_t& adj_id, vec3& adj_uvw, const int32_t& adj_ret) -{ - // NOP + adj_uvw += vec3(mul(w_t(dot(v, adj_ret)), grad_w)); + + const T adj_v = w * adj_ret; + accessor.adj_read_cache(ijk_shifted, adj_v); + } } // Sampling the volume at the given index-space coordinates, uvw can be fractional -CUDA_CALLABLE inline float volume_sample_grad_f(uint64_t id, vec3 uvw, int sampling_mode, vec3& grad) +template +CUDA_CALLABLE inline typename Accessor::ValueType volume_sample_grad( + Accessor &accessor, vec3 uvw, int sampling_mode, typename val_traits::grad_t &grad) { - const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_root_handle_t root = volume::get_root(buf); - const pnanovdb_vec3_t uvw_pnano{ uvw[0], uvw[1], uvw[2] }; + using T = typename Accessor::ValueType; + using grad_T = typename val_traits::grad_t; + using w_t = typename val_traits::scalar_t; + using w_grad_t = vec_t<3, w_t>; + + grad = grad_T{}; - if (sampling_mode == volume::CLOSEST) + if (!accessor.is_valid()) { - const pnanovdb_coord_t ijk = pnanovdb_vec3_round_to_coord(uvw_pnano); - float val; - pnano_read(val, buf, root, PNANOVDB_REF(ijk)); - grad = vec3(0.0f, 0.0f, 0.0f); - return val; + return 0; } - else if (sampling_mode == volume::LINEAR) + + const pnanovdb_buf_t buf = accessor.buf; + const pnanovdb_vec3_t uvw_pnano{uvw[0], uvw[1], uvw[2]}; + + if (sampling_mode == CLOSEST) + { + const pnanovdb_coord_t ijk = vec3_round_to_coord(uvw_pnano); + return accessor.read_single(ijk); + } + else if (sampling_mode == LINEAR) { // NB. 
linear sampling is not used on int volumes constexpr pnanovdb_coord_t OFFSETS[] = { - { 0, 0, 0 }, { 0, 0, 1 }, { 0, 1, 0 }, { 0, 1, 1 }, { 1, 0, 0 }, { 1, 0, 1 }, { 1, 1, 0 }, { 1, 1, 1 }, + {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1}, {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1}, }; - const pnanovdb_vec3_t ijk_base{ floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z) }; - const pnanovdb_vec3_t ijk_frac{ uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z }; - const pnanovdb_coord_t ijk{ (pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, (pnanovdb_int32_t)ijk_base.z }; + const pnanovdb_vec3_t ijk_base{floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z)}; + const pnanovdb_vec3_t ijk_frac{uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z}; + const pnanovdb_coord_t ijk{(pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, + (pnanovdb_int32_t)ijk_base.z}; - pnanovdb_readaccessor_t accessor; - pnanovdb_readaccessor_init(PNANOVDB_REF(accessor), root); - float val = 0.0f; - const float wx[2]{ 1 - ijk_frac.x, ijk_frac.x }; - const float wy[2]{ 1 - ijk_frac.y, ijk_frac.y }; - const float wz[2]{ 1 - ijk_frac.z, ijk_frac.z }; - - const float sign_dx[8] = {-1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - const float sign_dy[8] = {-1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f}; - const float sign_dz[8] = {-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f}; - - float dfdx = 0.0f; - float dfdy = 0.0f; - float dfdz = 0.0f; + accessor.init_cache(); + T val = 0; + const float wx[2]{1 - ijk_frac.x, ijk_frac.x}; + const float wy[2]{1 - ijk_frac.y, ijk_frac.y}; + const float wz[2]{1 - ijk_frac.z, ijk_frac.z}; for (int idx = 0; idx < 8; ++idx) { - const pnanovdb_coord_t& offs = OFFSETS[idx]; + const pnanovdb_coord_t &offs = OFFSETS[idx]; const pnanovdb_coord_t ijk_shifted = pnanovdb_coord_add(ijk, offs); - float v; - pnano_read(v, buf, PNANOVDB_REF(accessor), PNANOVDB_REF(ijk_shifted)); - val = add(val, wx[offs.x] * wy[offs.y] * wz[offs.z] * v); - dfdx = add(dfdx, wy[offs.y] * wz[offs.z] * sign_dx[idx] * v); - dfdy = add(dfdy, wx[offs.x] * wz[offs.z] * sign_dy[idx] * v); - dfdz = add(dfdz, wx[offs.x] * wy[offs.y] * sign_dz[idx] * v); + const T v = accessor.read_cache(ijk_shifted); + + const vec3 signs(offs.x * 2 - 1, offs.y * 2 - 1, offs.z * 2 - 1); + + const w_t w = wx[offs.x] * wy[offs.y] * wz[offs.z]; + const w_grad_t grad_w(signs[0] * wy[offs.y] * wz[offs.z], signs[1] * wx[offs.x] * wz[offs.z], + signs[2] * wx[offs.x] * wy[offs.y]); + + val = add(val, w * v); + grad += outer(v, grad_w); } - grad = vec3(dfdx, dfdy, dfdz); return val; } - return 0.0f; + return 0; } -CUDA_CALLABLE inline void adj_volume_sample_grad_f( - uint64_t id, vec3 uvw, int sampling_mode, vec3& grad, uint64_t& adj_id, vec3& adj_uvw, int& adj_sampling_mode, vec3& adj_grad, const float& adj_ret) +template +CUDA_CALLABLE inline void adj_volume_sample_grad(Accessor &accessor, vec3 uvw, int sampling_mode, + typename val_traits::grad_t &grad, + vec3 &adj_uvw, + typename val_traits::grad_t &adj_grad, + const typename Accessor::ValueType &adj_ret) { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_FLOAT) return; + // TODO: accessor data gradients - if (sampling_mode != volume::LINEAR) { - return; // NOP + using T = typename Accessor::ValueType; + using grad_T = typename val_traits::grad_t; + using w_t = typename val_traits::scalar_t; + using w_grad_t = vec_t<3, w_t>; + using w_hess_t = mat_t<3, 3, w_t>; + + if 
(!accessor.is_valid()) + { + return; } - const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_root_handle_t root = volume::get_root(buf); - const pnanovdb_vec3_t uvw_pnano{ uvw[0], uvw[1], uvw[2] }; + const pnanovdb_buf_t buf = accessor.buf; + const pnanovdb_vec3_t uvw_pnano{uvw[0], uvw[1], uvw[2]}; + + if (sampling_mode != LINEAR) + { + const pnanovdb_coord_t ijk = vec3_round_to_coord(uvw_pnano); + accessor.adj_read_single(ijk, adj_ret); + return; + } constexpr pnanovdb_coord_t OFFSETS[] = { - { 0, 0, 0 }, { 0, 0, 1 }, { 0, 1, 0 }, { 0, 1, 1 }, { 1, 0, 0 }, { 1, 0, 1 }, { 1, 1, 0 }, { 1, 1, 1 }, + {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1}, {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1}, }; - const pnanovdb_vec3_t ijk_base{ floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z) }; - const pnanovdb_vec3_t ijk_frac{ uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z }; - const pnanovdb_coord_t ijk{ (pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, (pnanovdb_int32_t)ijk_base.z }; + const pnanovdb_vec3_t ijk_base{floorf(uvw_pnano.x), floorf(uvw_pnano.y), floorf(uvw_pnano.z)}; + const pnanovdb_vec3_t ijk_frac{uvw_pnano.x - ijk_base.x, uvw_pnano.y - ijk_base.y, uvw_pnano.z - ijk_base.z}; + const pnanovdb_coord_t ijk{(pnanovdb_int32_t)ijk_base.x, (pnanovdb_int32_t)ijk_base.y, + (pnanovdb_int32_t)ijk_base.z}; - pnanovdb_readaccessor_t accessor; - pnanovdb_readaccessor_init(PNANOVDB_REF(accessor), root); - const float wx[2]{ 1 - ijk_frac.x, ijk_frac.x }; - const float wy[2]{ 1 - ijk_frac.y, ijk_frac.y }; - const float wz[2]{ 1 - ijk_frac.z, ijk_frac.z }; - const float sign_dx[8] = {-1.0f, -1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - const float sign_dy[8] = {-1.0f, -1.0f, 1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f}; - const float sign_dz[8] = {-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f}; - - float dfdxdy = 0.0f; - float dfdxdz = 0.0f; - float dfdydx = 0.0f; - float dfdydz = 0.0f; - float dfdzdx = 0.0f; - float dfdzdy = 0.0f; - vec3 dphi(0,0,0); + accessor.init_cache(); + + const float wx[2]{1 - ijk_frac.x, ijk_frac.x}; + const float wy[2]{1 - ijk_frac.y, ijk_frac.y}; + const float wz[2]{1 - ijk_frac.z, ijk_frac.z}; for (int idx = 0; idx < 8; ++idx) { - const pnanovdb_coord_t& offs = OFFSETS[idx]; + const pnanovdb_coord_t &offs = OFFSETS[idx]; const pnanovdb_coord_t ijk_shifted = pnanovdb_coord_add(ijk, offs); - float v; - pnano_read(v, buf, PNANOVDB_REF(accessor), PNANOVDB_REF(ijk_shifted)); + const T v = accessor.read_cache(ijk_shifted); + const vec3 signs(offs.x * 2 - 1, offs.y * 2 - 1, offs.z * 2 - 1); - const vec3 grad_w(signs[0] * wy[offs.y] * wz[offs.z], signs[1] * wx[offs.x] * wz[offs.z], signs[2] * wx[offs.x] * wy[offs.y]); - dphi = add(dphi, mul(v, grad_w)); - dfdxdy = add(dfdxdy, signs[1] * wz[offs.z] * sign_dx[idx] * v); - dfdxdz = add(dfdxdz, wy[offs.y] * signs[2] * sign_dx[idx] * v); + const w_t w = wx[offs.x] * wy[offs.y] * wz[offs.z]; + const w_grad_t grad_w(signs[0] * wy[offs.y] * wz[offs.z], signs[1] * wx[offs.x] * wz[offs.z], + signs[2] * wx[offs.x] * wy[offs.y]); + adj_uvw += vec3(mul(w_t(dot(v, adj_ret)), grad_w)); - dfdydx = add(dfdydx, signs[0] * wz[offs.z] * sign_dy[idx] * v); - dfdydz = add(dfdydz, wx[offs.x] * signs[2] * sign_dy[idx] * v); + const w_hess_t hess_w(0.0, signs[1] * signs[0] * wz[offs.z], signs[2] * signs[0] * wy[offs.y], + signs[0] * signs[1] * wz[offs.z], 0.0, signs[2] * signs[1] * wx[offs.x], + signs[0] * signs[2] * wy[offs.y], signs[1] * signs[2] * wx[offs.x], 0.0); + adj_uvw += vec3(mul(mul(v, 
adj_grad), hess_w)); - dfdzdx = add(dfdzdx, signs[0] * wy[offs.y] * sign_dz[idx] * v); - dfdzdy = add(dfdzdy, wx[offs.x] * signs[1] * sign_dz[idx] * v); + const T adj_v = w * adj_ret + val_traits::rmul(adj_grad, grad_w); + accessor.adj_read_cache(ijk_shifted, adj_v); } +} + +} // namespace volume + // namespace volume + +// exposed kernel builtins + +// volume_sample - adj_uvw += mul(dphi, adj_ret); - adj_uvw[0] += adj_grad[1] * dfdydx + adj_grad[2] * dfdzdx; - adj_uvw[1] += adj_grad[0] * dfdxdy + adj_grad[2] * dfdzdy; - adj_uvw[2] += adj_grad[0] * dfdxdz + adj_grad[1] * dfdydz; +template CUDA_CALLABLE inline T volume_sample(uint64_t id, vec3 uvw, int sampling_mode) +{ + volume::leaf_value_accessor accessor(volume::id_to_buffer(id)); + return volume::volume_sample(accessor, uvw, sampling_mode); } -CUDA_CALLABLE inline float volume_lookup_f(uint64_t id, int32_t i, int32_t j, int32_t k) +template +CUDA_CALLABLE inline void adj_volume_sample(uint64_t id, vec3 uvw, int sampling_mode, uint64_t &adj_id, vec3 &adj_uvw, + int &adj_sampling_mode, const T &adj_ret) { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_FLOAT) return 0.f; + volume::leaf_value_accessor accessor(volume::id_to_buffer(id)); + volume::adj_volume_sample(accessor, uvw, sampling_mode, adj_uvw, adj_ret); +} - const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_root_handle_t root = volume::get_root(buf); +template +CUDA_CALLABLE inline T volume_sample_grad(uint64_t id, vec3 uvw, int sampling_mode, + typename volume::val_traits::grad_t &grad) +{ + volume::leaf_value_accessor accessor(volume::id_to_buffer(id)); + return volume::volume_sample_grad(accessor, uvw, sampling_mode, grad); +} - const pnanovdb_coord_t ijk{ i, j, k }; - float val; - pnano_read(val, buf, root, PNANOVDB_REF(ijk)); - return val; +template +CUDA_CALLABLE inline void adj_volume_sample_grad(uint64_t id, vec3 uvw, int sampling_mode, + typename volume::val_traits::grad_t &grad, uint64_t &adj_id, + vec3 &adj_uvw, int &adj_sampling_mode, + typename volume::val_traits::grad_t &adj_grad, const T &adj_ret) +{ + volume::leaf_value_accessor accessor(volume::id_to_buffer(id)); + volume::adj_volume_sample_grad(accessor, uvw, sampling_mode, grad, adj_uvw, adj_grad, adj_ret); } -CUDA_CALLABLE inline int32_t volume_lookup_i(uint64_t id, int32_t i, int32_t j, int32_t k) +// Sampling a float volume at the given index-space coordinates, uvw can be fractional +CUDA_CALLABLE inline float volume_sample_f(uint64_t id, vec3 uvw, int sampling_mode) { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_INT32) return 0; + return volume_sample(id, uvw, sampling_mode); +} - const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_root_handle_t root = volume::get_root(buf); +// Sampling an int volume at the given index-space coordinates, uvw can be fractional +CUDA_CALLABLE inline int32_t volume_sample_i(uint64_t id, vec3 uvw) +{ + return volume_sample(id, uvw, volume::CLOSEST); +} - const pnanovdb_coord_t ijk{ i, j, k }; - int32_t val; - pnano_read(val, buf, root, PNANOVDB_REF(ijk)); - return val; +// Sampling a vector volume at the given index-space coordinates, uvw can be fractional +CUDA_CALLABLE inline vec3 volume_sample_v(uint64_t id, vec3 uvw, int sampling_mode) +{ + return volume_sample(id, uvw, sampling_mode); } -CUDA_CALLABLE inline vec3 volume_lookup_v(uint64_t id, int32_t i, int32_t j, int32_t k) +CUDA_CALLABLE inline void adj_volume_sample_f(uint64_t id, vec3 uvw, int sampling_mode, uint64_t 
&adj_id, vec3 &adj_uvw, + int &adj_sampling_mode, const float &adj_ret) +{ + adj_volume_sample(id, uvw, sampling_mode, adj_id, adj_uvw, adj_sampling_mode, adj_ret); +} + +CUDA_CALLABLE inline void adj_volume_sample_v(uint64_t id, vec3 uvw, int sampling_mode, uint64_t &adj_id, vec3 &adj_uvw, + int &adj_sampling_mode, const vec3 &adj_ret) +{ + adj_volume_sample(id, uvw, sampling_mode, adj_id, adj_uvw, adj_sampling_mode, adj_ret); +} + +CUDA_CALLABLE inline void adj_volume_sample_i(uint64_t id, vec3 uvw, uint64_t &adj_id, vec3 &adj_uvw, + const int32_t &adj_ret) +{ + // NOP +} + +// Sampling the volume at the given index-space coordinates, uvw can be fractional +CUDA_CALLABLE inline float volume_sample_grad_f(uint64_t id, vec3 uvw, int sampling_mode, vec3 &grad) +{ + return volume_sample_grad(id, uvw, sampling_mode, grad); +} + +CUDA_CALLABLE inline void adj_volume_sample_grad_f(uint64_t id, vec3 uvw, int sampling_mode, vec3 &grad, + uint64_t &adj_id, vec3 &adj_uvw, int &adj_sampling_mode, + vec3 &adj_grad, const float &adj_ret) +{ + adj_volume_sample_grad(id, uvw, sampling_mode, grad, adj_id, adj_uvw, adj_sampling_mode, adj_grad, adj_ret); +} + +// volume_sample_index + +template +CUDA_CALLABLE inline T volume_sample_index(uint64_t id, vec3 uvw, int sampling_mode, const array_t &voxel_data, + const T &background) +{ + volume::index_value_accessor accessor(volume::id_to_buffer(id), voxel_data, background); + return volume::volume_sample(accessor, uvw, sampling_mode); +} + +template +CUDA_CALLABLE inline void adj_volume_sample_index(uint64_t id, vec3 uvw, int sampling_mode, + const array_t &voxel_data, const T &background, uint64_t &adj_id, + vec3 &adj_uvw, int &adj_sampling_mode, array_t &adj_voxel_data, + T &adj_background, const T &adj_ret) { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_VEC3F) return vec3(0.f); + volume::index_value_accessor accessor(volume::id_to_buffer(id), voxel_data, background, &adj_background); + volume::adj_volume_sample(accessor, uvw, sampling_mode, adj_uvw, adj_ret); +} + +template +CUDA_CALLABLE inline T volume_sample_grad_index(uint64_t id, vec3 uvw, int sampling_mode, const array_t &voxel_data, + const T &background, typename volume::val_traits::grad_t &grad) +{ + volume::index_value_accessor accessor(volume::id_to_buffer(id), voxel_data, background); + return volume::volume_sample_grad(accessor, uvw, sampling_mode, grad); +} + +template +CUDA_CALLABLE inline void adj_volume_sample_grad_index( + uint64_t id, vec3 uvw, int sampling_mode, const array_t &voxel_data, const T &background, + typename volume::val_traits::grad_t &grad, uint64_t &adj_id, vec3 &adj_uvw, int &adj_sampling_mode, + array_t &adj_voxel_data, T &adj_background, typename volume::val_traits::grad_t &adj_grad, const T &adj_ret) +{ + volume::index_value_accessor accessor(volume::id_to_buffer(id), voxel_data, background, &adj_background); + volume::adj_volume_sample_grad(accessor, uvw, sampling_mode, grad, adj_uvw, adj_grad, adj_ret); +} + +// volume_lookup + +template CUDA_CALLABLE inline T volume_lookup(uint64_t id, int32_t i, int32_t j, int32_t k) +{ + using traits = volume::pnano_traits; const pnanovdb_buf_t buf = volume::id_to_buffer(id); + if (volume::get_grid_type(buf) != traits::GRID_TYPE) + return 0; + const pnanovdb_root_handle_t root = volume::get_root(buf); - const pnanovdb_coord_t ijk{ i, j, k }; - vec3 val; - pnano_read(val, buf, root, PNANOVDB_REF(ijk)); + const pnanovdb_coord_t ijk{i, j, k}; + T val; + volume::pnano_read(val, buf, root, 
PNANOVDB_REF(ijk)); return val; } -CUDA_CALLABLE inline void adj_volume_lookup_f( - uint64_t id, int32_t i, int32_t j, int32_t k, uint64_t& adj_id, int32_t& adj_i, int32_t& adj_j, int32_t& adj_k, const float& adj_ret) +template +CUDA_CALLABLE inline void adj_volume_lookup(uint64_t id, int32_t i, int32_t j, int32_t k, uint64_t &adj_id, + int32_t &adj_i, int32_t &adj_j, int32_t &adj_k, const T &adj_ret) { - // NOP + // NOP -- adjoint of grid values is not available +} + +CUDA_CALLABLE inline float volume_lookup_f(uint64_t id, int32_t i, int32_t j, int32_t k) +{ + return volume_lookup(id, i, j, k); } -CUDA_CALLABLE inline void adj_volume_lookup_i( - uint64_t id, int32_t i, int32_t j, int32_t k, uint64_t& adj_id, int32_t& adj_i, int32_t& adj_j, int32_t& adj_k, const int32_t& adj_ret) +CUDA_CALLABLE inline int32_t volume_lookup_i(uint64_t id, int32_t i, int32_t j, int32_t k) { - // NOP + return volume_lookup(id, i, j, k); } -CUDA_CALLABLE inline void adj_volume_lookup_v( - uint64_t id, int32_t i, int32_t j, int32_t k, uint64_t& adj_id, int32_t& adj_i, int32_t& adj_j, int32_t& adj_k, const vec3& adj_ret) +CUDA_CALLABLE inline vec3 volume_lookup_v(uint64_t id, int32_t i, int32_t j, int32_t k) { - // NOP + return volume_lookup(id, i, j, k); } -CUDA_CALLABLE inline void volume_store_f(uint64_t id, int32_t i, int32_t j, int32_t k, const float& value) +CUDA_CALLABLE inline void adj_volume_lookup_f(uint64_t id, int32_t i, int32_t j, int32_t k, uint64_t &adj_id, + int32_t &adj_i, int32_t &adj_j, int32_t &adj_k, const float &adj_ret) { - if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_FLOAT) return; + adj_volume_lookup(id, i, j, k, adj_id, adj_i, adj_j, adj_k, adj_ret); +} +CUDA_CALLABLE inline void adj_volume_lookup_i(uint64_t id, int32_t i, int32_t j, int32_t k, uint64_t &adj_id, + int32_t &adj_i, int32_t &adj_j, int32_t &adj_k, const int32_t &adj_ret) +{ + adj_volume_lookup(id, i, j, k, adj_id, adj_i, adj_j, adj_k, adj_ret); +} + +CUDA_CALLABLE inline void adj_volume_lookup_v(uint64_t id, int32_t i, int32_t j, int32_t k, uint64_t &adj_id, + int32_t &adj_i, int32_t &adj_j, int32_t &adj_k, const vec3 &adj_ret) +{ + adj_volume_lookup(id, i, j, k, adj_id, adj_i, adj_j, adj_k, adj_ret); +} + +CUDA_CALLABLE inline int32_t volume_lookup_index(uint64_t id, int32_t i, int32_t j, int32_t k) +{ const pnanovdb_buf_t buf = volume::id_to_buffer(id); const pnanovdb_root_handle_t root = volume::get_root(buf); + const pnanovdb_grid_type_t grid_type = volume::get_grid_type(buf); + + const pnanovdb_coord_t ijk{i, j, k}; + + pnanovdb_uint32_t level; + const pnanovdb_address_t address = + pnanovdb_root_get_value_address_and_level(grid_type, buf, root, PNANOVDB_REF(ijk), PNANOVDB_REF(level)); + + if (level == 0) + { + pnanovdb_uint64_t voxel_index = volume::get_grid_voxel_index(grid_type, buf, address, ijk); - const pnanovdb_coord_t ijk{ i, j, k }; - const pnanovdb_address_t address = pnanovdb_root_get_value_address(PNANOVDB_GRID_TYPE_FLOAT, buf, root, PNANOVDB_REF(ijk)); - pnanovdb_write_float(buf, address, value); + return static_cast(voxel_index) - 1; + } + return -1; } -CUDA_CALLABLE inline void adj_volume_store_f( - uint64_t id, int32_t i, int32_t j, int32_t k, const float& value, - uint64_t& adj_id, int32_t& adj_i, int32_t& adj_j, int32_t& adj_k, float& adj_value) +CUDA_CALLABLE inline void adj_volume_lookup_index(uint64_t id, int32_t i, int32_t j, int32_t k, uint64_t &adj_id, + int32_t &adj_i, int32_t &adj_j, int32_t &adj_k, const vec3 &adj_ret) { - adj_value += volume_lookup_f(id, i, j, 
k);
+    // NOP
 }
 
-CUDA_CALLABLE inline void volume_store_v(uint64_t id, int32_t i, int32_t j, int32_t k, const vec3& value)
+// volume_store
+
+template <typename T>
+CUDA_CALLABLE inline void volume_store(uint64_t id, int32_t i, int32_t j, int32_t k, const T &value)
 {
-    if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_VEC3F) return;
+    using traits = volume::pnano_traits<T>;
 
     const pnanovdb_buf_t buf = volume::id_to_buffer(id);
+    if (volume::get_grid_type(buf) != traits::GRID_TYPE)
+        return;
+
     const pnanovdb_root_handle_t root = volume::get_root(buf);
+    const pnanovdb_coord_t ijk{i, j, k};
+
+    pnanovdb_uint32_t level;
+    const pnanovdb_address_t address =
+        pnanovdb_root_get_value_address_and_level(traits::GRID_TYPE, buf, root, PNANOVDB_REF(ijk), PNANOVDB_REF(level));
 
-    const pnanovdb_coord_t ijk{ i, j, k };
-    const pnanovdb_address_t address = pnanovdb_root_get_value_address(PNANOVDB_GRID_TYPE_VEC3F, buf, root, PNANOVDB_REF(ijk));
-    const pnanovdb_vec3_t v{ value[0], value[1], value[2] };
-    pnanovdb_write_vec3(buf, address, &v);
+    if (level == 0)
+    {
+        // only write at leaf level (prevent modifying background value)
+        // TODO: is this the intended semantics? or should we allow writing to the background?
+        volume::pnano_write(value, buf, address);
+    }
 }
 
-CUDA_CALLABLE inline void adj_volume_store_v(
-    uint64_t id, int32_t i, int32_t j, int32_t k, const vec3& value,
-    uint64_t& adj_id, int32_t& adj_i, int32_t& adj_j, int32_t& adj_k, vec3& adj_value)
+template <typename T>
+CUDA_CALLABLE inline void adj_volume_store(uint64_t id, int32_t i, int32_t j, int32_t k, const T &value,
+                                           uint64_t &adj_id, int32_t &adj_i, int32_t &adj_j, int32_t &adj_k,
+                                           T &adj_value)
 {
-    adj_value = add(adj_value, volume_lookup_v(id, i, j, k));
+    // NOP -- adjoint of grid values is not available
 }
 
-CUDA_CALLABLE inline void volume_store_i(uint64_t id, int32_t i, int32_t j, int32_t k, const int32_t& value)
+CUDA_CALLABLE inline void volume_store_f(uint64_t id, int32_t i, int32_t j, int32_t k, const float &value)
 {
-    if (volume::get_grid_type(volume::id_to_buffer(id)) != PNANOVDB_GRID_TYPE_INT32) return;
+    volume_store(id, i, j, k, value);
+}
 
-    const pnanovdb_buf_t buf = volume::id_to_buffer(id);
-    const pnanovdb_root_handle_t root = volume::get_root(buf);
+CUDA_CALLABLE inline void adj_volume_store_f(uint64_t id, int32_t i, int32_t j, int32_t k, const float &value,
+                                             uint64_t &adj_id, int32_t &adj_i, int32_t &adj_j, int32_t &adj_k,
+                                             float &adj_value)
+{
+    adj_volume_store(id, i, j, k, value, adj_id, adj_i, adj_j, adj_k, adj_value);
+}
+
+CUDA_CALLABLE inline void volume_store_v(uint64_t id, int32_t i, int32_t j, int32_t k, const vec3 &value)
+{
+    volume_store(id, i, j, k, value);
+}
 
-    const pnanovdb_coord_t ijk{ i, j, k };
-    const pnanovdb_address_t address = pnanovdb_root_get_value_address(PNANOVDB_GRID_TYPE_INT32, buf, root, PNANOVDB_REF(ijk));
-    pnanovdb_write_int32(buf, address, value);
+CUDA_CALLABLE inline void adj_volume_store_v(uint64_t id, int32_t i, int32_t j, int32_t k, const vec3 &value,
+                                             uint64_t &adj_id, int32_t &adj_i, int32_t &adj_j, int32_t &adj_k,
+                                             vec3 &adj_value)
+{
+    adj_volume_store(id, i, j, k, value, adj_id, adj_i, adj_j, adj_k, adj_value);
+}
+
+CUDA_CALLABLE inline void volume_store_i(uint64_t id, int32_t i, int32_t j, int32_t k, const int32_t &value)
+{
+    volume_store(id, i, j, k, value);
 }
 
-CUDA_CALLABLE inline void adj_volume_store_i(
-    uint64_t id, int32_t i, int32_t j, int32_t k, const int32_t& value,
-    uint64_t& adj_id, int32_t& adj_i, int32_t& adj_j, int32_t& adj_k, int32_t& adj_value) 
+CUDA_CALLABLE inline void adj_volume_store_i(uint64_t id, int32_t i, int32_t j, int32_t k, const int32_t &value, + uint64_t &adj_id, int32_t &adj_i, int32_t &adj_j, int32_t &adj_k, + int32_t &adj_value) { - adj_value = add(adj_value, volume_lookup_i(id, i, j, k)); + adj_volume_store(id, i, j, k, value, adj_id, adj_i, adj_j, adj_k, adj_value); } // Transform position from index space to world space CUDA_CALLABLE inline vec3 volume_index_to_world(uint64_t id, vec3 uvw) { const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_grid_handle_t grid = { 0u }; - const pnanovdb_vec3_t pos{ uvw[0], uvw[1], uvw[2] }; + const pnanovdb_grid_handle_t grid = {0u}; + const pnanovdb_vec3_t pos{uvw[0], uvw[1], uvw[2]}; const pnanovdb_vec3_t xyz = pnanovdb_grid_index_to_worldf(buf, grid, PNANOVDB_REF(pos)); - return { xyz.x, xyz.y, xyz.z }; + return {xyz.x, xyz.y, xyz.z}; } // Transform position from world space to index space CUDA_CALLABLE inline vec3 volume_world_to_index(uint64_t id, vec3 xyz) { const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_grid_handle_t grid = { 0u }; - const pnanovdb_vec3_t pos{ xyz[0], xyz[1], xyz[2] }; + const pnanovdb_grid_handle_t grid = {0u}; + const pnanovdb_vec3_t pos{xyz[0], xyz[1], xyz[2]}; const pnanovdb_vec3_t uvw = pnanovdb_grid_world_to_indexf(buf, grid, PNANOVDB_REF(pos)); - return { uvw.x, uvw.y, uvw.z }; + return {uvw.x, uvw.y, uvw.z}; } -CUDA_CALLABLE inline void adj_volume_index_to_world(uint64_t id, vec3 uvw, uint64_t& adj_id, vec3& adj_uvw, const vec3& adj_ret) +CUDA_CALLABLE inline void adj_volume_index_to_world(uint64_t id, vec3 uvw, uint64_t &adj_id, vec3 &adj_uvw, + const vec3 &adj_ret) { const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_grid_handle_t grid = { 0u }; - const pnanovdb_vec3_t pos{ adj_ret[0], adj_ret[1], adj_ret[2] }; + const pnanovdb_grid_handle_t grid = {0u}; + const pnanovdb_vec3_t pos{adj_ret[0], adj_ret[1], adj_ret[2]}; const pnanovdb_vec3_t xyz = pnanovdb_grid_index_to_world_dirf(buf, grid, PNANOVDB_REF(pos)); - adj_uvw = add(adj_uvw, vec3{ xyz.x, xyz.y, xyz.z }); + adj_uvw = add(adj_uvw, vec3{xyz.x, xyz.y, xyz.z}); } -CUDA_CALLABLE inline void adj_volume_world_to_index(uint64_t id, vec3 xyz, uint64_t& adj_id, vec3& adj_xyz, const vec3& adj_ret) +CUDA_CALLABLE inline void adj_volume_world_to_index(uint64_t id, vec3 xyz, uint64_t &adj_id, vec3 &adj_xyz, + const vec3 &adj_ret) { const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_grid_handle_t grid = { 0u }; - const pnanovdb_vec3_t pos{ adj_ret[0], adj_ret[1], adj_ret[2] }; + const pnanovdb_grid_handle_t grid = {0u}; + const pnanovdb_vec3_t pos{adj_ret[0], adj_ret[1], adj_ret[2]}; const pnanovdb_vec3_t uvw = pnanovdb_grid_world_to_index_dirf(buf, grid, PNANOVDB_REF(pos)); - adj_xyz = add(adj_xyz, vec3{ uvw.x, uvw.y, uvw.z }); + adj_xyz = add(adj_xyz, vec3{uvw.x, uvw.y, uvw.z}); } // Transform direction from index space to world space CUDA_CALLABLE inline vec3 volume_index_to_world_dir(uint64_t id, vec3 uvw) { const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_grid_handle_t grid = { 0u }; - const pnanovdb_vec3_t pos{ uvw[0], uvw[1], uvw[2] }; + const pnanovdb_grid_handle_t grid = {0u}; + const pnanovdb_vec3_t pos{uvw[0], uvw[1], uvw[2]}; const pnanovdb_vec3_t xyz = pnanovdb_grid_index_to_world_dirf(buf, grid, PNANOVDB_REF(pos)); - return { xyz.x, xyz.y, xyz.z }; + return {xyz.x, xyz.y, xyz.z}; } // Transform direction from world space to index space CUDA_CALLABLE inline vec3 
volume_world_to_index_dir(uint64_t id, vec3 xyz) { const pnanovdb_buf_t buf = volume::id_to_buffer(id); - const pnanovdb_grid_handle_t grid = { 0u }; - const pnanovdb_vec3_t pos{ xyz[0], xyz[1], xyz[2] }; + const pnanovdb_grid_handle_t grid = {0u}; + const pnanovdb_vec3_t pos{xyz[0], xyz[1], xyz[2]}; const pnanovdb_vec3_t uvw = pnanovdb_grid_world_to_index_dirf(buf, grid, PNANOVDB_REF(pos)); - return { uvw.x, uvw.y, uvw.z }; + return {uvw.x, uvw.y, uvw.z}; } -CUDA_CALLABLE inline void adj_volume_index_to_world_dir(uint64_t id, vec3 uvw, uint64_t& adj_id, vec3& adj_uvw, const vec3& adj_ret) +CUDA_CALLABLE inline void adj_volume_index_to_world_dir(uint64_t id, vec3 uvw, uint64_t &adj_id, vec3 &adj_uvw, + const vec3 &adj_ret) { adj_volume_index_to_world(id, uvw, adj_id, adj_uvw, adj_ret); } -CUDA_CALLABLE inline void adj_volume_world_to_index_dir(uint64_t id, vec3 xyz, uint64_t& adj_id, vec3& adj_xyz, const vec3& adj_ret) +CUDA_CALLABLE inline void adj_volume_world_to_index_dir(uint64_t id, vec3 xyz, uint64_t &adj_id, vec3 &adj_xyz, + const vec3 &adj_ret) { adj_volume_world_to_index(id, xyz, adj_id, adj_xyz, adj_ret); } diff --git a/warp/native/volume_builder.cu b/warp/native/volume_builder.cu index 53449417..31c1a531 100644 --- a/warp/native/volume_builder.cu +++ b/warp/native/volume_builder.cu @@ -1,425 +1,446 @@ #include "volume_builder.h" +#include + #include #include #include -#include - -// Explanation of key types -// ------------------------ -// -// leaf_key: -// .__.__. .... .__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__.__. -// 63 62 .... 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 -// XX|< tile key >|< upper offset >|< lower offset >| -// -// tile key (36 bit): -// (uint32(ijk[2]) >> ChildT::TOTAL) | -// (uint64_t(uint32(ijk[1]) >> ChildT::TOTAL)) << 12 | -// (uint64_t(uint32(ijk[0]) >> ChildT::TOTAL)) << 24 -// -// lower_key (51 bits) == leaf_key >> 12 -// -// upper_key (36 bits) == lower_key >> 15 == leaf_key >> 27 == tile key - -CUDA_CALLABLE inline uint64_t coord_to_full_key(const nanovdb::Coord& ijk) -{ - using Tree = nanovdb::FloatTree; // any type is fine at this point - assert((abs(ijk[0]) >> 24) == 0); - assert((abs(ijk[1]) >> 24) == 0); - assert((abs(ijk[2]) >> 24) == 0); - constexpr uint32_t MASK_12BITS = (1u << 12) - 1u; - const uint64_t tile_key36 = - ((uint32_t(ijk[2]) >> 12) & MASK_12BITS) | // z is the lower 12 bits - (uint64_t((uint32_t(ijk[1]) >> 12) & MASK_12BITS) << 12) | // y is the middle 12 bits - (uint64_t((uint32_t(ijk[0]) >> 12) & MASK_12BITS) << 24); // x is the upper 12 bits - const uint32_t upper_offset = Tree::Node2::CoordToOffset(ijk); - const uint32_t lower_offset = Tree::Node1::CoordToOffset(ijk); - return (tile_key36 << 27) | (upper_offset << 12) | lower_offset; -} -__global__ -void generate_keys(size_t num_points, const nanovdb::Coord* points, uint64_t* all_leaf_keys) +#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__) +// dynamic initialization is not supported for a function-scope static __shared__ variable within a +// __device__/__global__ function +#pragma nv_diag_suppress 20054 +#elif defined(__NVCC__) +#pragma diag_suppress 20054 +#endif +namespace +{ +/// Allocator class following interface of cub::cachingDeviceAllocator, as expected by naovdb::PointsToGrid +struct Allocator { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= num_points) return; - all_leaf_keys[tid] = coord_to_full_key(points[tid]); -} + cudaError_t DeviceAllocate(void **d_ptr, ///< [out] 
Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream) ///< [in] The stream to be associated with this allocation + { + // in PointsToGrid stream argument always coincide with current stream, ignore + *d_ptr = alloc_device(WP_CURRENT_CONTEXT, bytes); + return cudaSuccess; + } + + cudaError_t DeviceFree(void *d_ptr) + { + free_device(WP_CURRENT_CONTEXT, d_ptr); + return cudaSuccess; + } + + cudaError_t FreeAllCached() + { + return cudaSuccess; + } +}; -__global__ -void generate_keys(size_t num_points, const nanovdb::Vec3f* points, uint64_t* all_leaf_keys, float one_over_voxel_size, nanovdb::Vec3f translation) +/// @brief Implementation of NanoVDB's DeviceBuffer that uses warp allocators +class DeviceBuffer { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= num_points) return; + uint64_t mSize; // total number of bytes managed by this buffer (assumed to be identical for host and device) + void *mCpuData, *mGpuData; // raw pointers to the host and device buffers + bool mManaged; + + public: + /// @brief Static factory method that return an instance of this buffer + /// @param size byte size of buffer to be initialized + /// @param dummy this argument is currently ignored but required to match the API of the HostBuffer + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @param stream optional stream argument (defaults to stream NULL) + /// @return An instance of this class using move semantics + static DeviceBuffer create(uint64_t size, const DeviceBuffer *dummy = nullptr, bool host = true, + void *stream = nullptr) + { + return DeviceBuffer(size, host, stream); + } - const nanovdb::Coord ijk = ((points[tid] - translation) * one_over_voxel_size).round(); - all_leaf_keys[tid] = coord_to_full_key(ijk); -} + /// @brief Static factory method that return an instance of this buffer that wraps externally managed memory + /// @param size byte size of buffer specified by external memory + /// @param cpuData pointer to externally managed host memory + /// @param gpuData pointer to externally managed device memory + /// @return An instance of this class using move semantics + static DeviceBuffer create(uint64_t size, void *cpuData, void *gpuData) + { + return DeviceBuffer(size, cpuData, gpuData); + } -// Convert a 36 bit tile key to the ijk origin of the addressed tile -CUDA_CALLABLE inline nanovdb::Coord tile_key36_to_coord(uint64_t tile_key36) { - auto extend_sign = [](uint32_t i) -> int32_t { return i | ((i>>11 & 1) * 0xFFFFF800);}; - constexpr uint32_t MASK_12BITS = (1u << 12) - 1u; - const int32_t i = extend_sign(uint32_t(tile_key36 >> 24) & MASK_12BITS); - const int32_t j = extend_sign(uint32_t(tile_key36 >> 12) & MASK_12BITS); - const int32_t k = extend_sign(uint32_t(tile_key36) & MASK_12BITS); - return nanovdb::Coord(i, j, k) << 12; -} + /// @brief Constructor + /// @param size byte size of buffer to be initialized + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @param stream optional stream argument (defaults to stream NULL) + DeviceBuffer(uint64_t size = 0, bool host = true, void *stream = nullptr) + : mSize(0), mCpuData(nullptr), mGpuData(nullptr), mManaged(false) + { + if (size > 0) + this->init(size, host, stream); + } + + DeviceBuffer(uint64_t size, void *cpuData, void *gpuData) + : mSize(size), mCpuData(cpuData), mGpuData(gpuData), mManaged(false) + { + } + /// @brief Disallow 
copy-construction + DeviceBuffer(const DeviceBuffer &) = delete; -// --- CUB helpers --- -template -struct ShiftRight { - CUDA_CALLABLE inline OutType operator()(const InType& v) const { - return static_cast(v >> bits); + /// @brief Move copy-constructor + DeviceBuffer(DeviceBuffer &&other) noexcept + : mSize(other.mSize), mCpuData(other.mCpuData), mGpuData(other.mGpuData), mManaged(other.mManaged) + { + other.mSize = 0; + other.mCpuData = nullptr; + other.mGpuData = nullptr; + other.mManaged = false; } -}; -template -struct ShiftRightIterator : public cub::TransformInputIterator, InType*> { - using BASE = cub::TransformInputIterator, InType*>; - CUDA_CALLABLE inline ShiftRightIterator(uint64_t* input_itr) - : BASE(input_itr, ShiftRight()) {} -}; + /// @brief Disallow copy assignment operation + DeviceBuffer &operator=(const DeviceBuffer &) = delete; + /// @brief Move copy assignment operation + DeviceBuffer &operator=(DeviceBuffer &&other) noexcept + { + this->clear(); + mSize = other.mSize; + mCpuData = other.mCpuData; + mGpuData = other.mGpuData; + mManaged = other.mManaged; + other.mSize = 0; + other.mCpuData = nullptr; + other.mGpuData = nullptr; + other.mManaged = false; + return *this; + } -// --- Atomic instructions for NanoVDB construction --- -template -CUDA_CALLABLE_DEVICE void set_mask_atomic(MaskT& mask, uint32_t n) { - unsigned long long int* words = reinterpret_cast(&mask); - atomicOr(words + (n / 64), 1ull << (n & 63)); -} + /// @brief Destructor frees memory on both the host and device + ~DeviceBuffer() + { + this->clear(); + }; + + /// @brief Initialize buffer + /// @param size byte size of buffer to be initialized + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU + /// @note All existing buffers are first cleared + /// @warning size is expected to be non-zero. Use clear() clear buffer! + void init(uint64_t size, bool host = true, void *stream = nullptr) + { + if (mSize > 0) + this->clear(stream); + NANOVDB_ASSERT(size > 0); + if (host) + { + mCpuData = + alloc_pinned(size); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned + } + else + { + mGpuData = alloc_device(WP_CURRENT_CONTEXT, size); + } + mSize = size; + mManaged = true; + } -template -CUDA_CALLABLE_DEVICE void expand_cwise_atomic(nanovdb::BBox& bbox, const Vec3T& v) { - atomicMin(&bbox.mCoord[0][0], v[0]); - atomicMin(&bbox.mCoord[0][1], v[1]); - atomicMin(&bbox.mCoord[0][2], v[2]); - atomicMax(&bbox.mCoord[1][0], v[0]); - atomicMax(&bbox.mCoord[1][1], v[1]); - atomicMax(&bbox.mCoord[1][2], v[2]); -} + /// @brief Retuns a raw pointer to the host/CPU buffer managed by this allocator. + /// @warning Note that the pointer can be NULL! + void *data() const + { + return mCpuData; + } + + /// @brief Retuns a raw pointer to the device/GPU buffer managed by this allocator. + /// @warning Note that the pointer can be NULL! + void *deviceData() const + { + return mGpuData; + } + + /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator. + uint64_t size() const + { + return mSize; + } -template -__hostdev__ const typename RootDataType::Tile* find_tile(const RootDataType* root_data, const nanovdb::Coord& ijk) + //@{ + /// @brief Returns true if this allocator is empty, i.e. 
has no allocated memory + bool empty() const + { + return mSize == 0; + } + bool isEmpty() const + { + return mSize == 0; + } + //@} + + /// @brief Detach device data so it is not dealloced when this buffer is destroyed + void detachDeviceData() + { + mGpuData = nullptr; + if (!mCpuData) + { + mSize = 0; + } + } + + /// @brief De-allocate all memory managed by this allocator and set all pointers to NULL + void clear(void *stream = nullptr) + { + if (mManaged && mGpuData) + free_device(WP_CURRENT_CONTEXT, mGpuData); + if (mManaged && mCpuData) + free_pinned(mCpuData); + mCpuData = mGpuData = nullptr; + mSize = 0; + mManaged = false; + } + +}; // DeviceBuffer class + +template __global__ void activateAllLeafVoxels(Tree *tree) { - using Tile = typename RootDataType::Tile; - const Tile *tiles = reinterpret_cast(root_data + 1); - const auto key = RootDataType::CoordToKey(ijk); + const unsigned leaf_count = tree->mNodeCount[0]; - for (uint32_t i = 0; i < root_data->mTableSize; ++i) + const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid < leaf_count) { - if (tiles[i].key == key) - return &tiles[i]; + // activate all leaf voxels + typename Tree::LeafNodeType *const leaf_nodes = tree->getFirstLeaf(); + typename Tree::LeafNodeType &leaf = leaf_nodes[tid]; + leaf.mValueMask.setOn(); + leaf.updateBBox(); + } + + if (tid == 0) + { + tree->mVoxelCount = Tree::LeafNodeType::SIZE * leaf_count; // full leaves } - return nullptr; } -// --- Wrapper for launching lambda kernels -template -__global__ void kernel(const size_t num_items, Func f, Args... args) +template +__device__ std::enable_if_t::is_index> setBackgroundValue( + Node &node, unsigned tile_id, const typename Node::BuildType background_value) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= num_items) return; - f(tid, args...); + node.setValue(tile_id, background_value); } -template -void build_grid_from_tiles(nanovdb::Grid> *&out_grid, - size_t &out_grid_size, - const void *points, - size_t num_points, - bool points_in_world_space, - const BuildGridParams ¶ms) +template +__device__ std::enable_if_t::is_index> setBackgroundValue( + Node &node, unsigned tile_id, const typename Node::BuildType background_value) { - using FloatT = typename nanovdb::FloatTraits::FloatType; - const BuildT ZERO_VAL{0}; - const FloatT ZERO_SCALAR{0}; - - // Don't want to access "params" in kernels - const double dx = params.voxel_size; - const double Tx = params.translation[0], Ty = params.translation[1], Tz = params.translation[2]; - const BuildT background_value = params.background_value; +} - const unsigned int num_threads = 256; - unsigned int num_blocks; +template +__device__ std::enable_if_t::is_index> setBackgroundValue( + Node &node, const typename Node::BuildType background_value) +{ + node.mBackground = background_value; +} - out_grid = nullptr; - out_grid_size = 0; +template +__device__ std::enable_if_t::is_index> setBackgroundValue( + Node &node, const typename Node::BuildType background_value) +{ +} - cub::CachingDeviceAllocator allocator; - - uint64_t* leaf_keys; - uint64_t* lower_keys; - uint64_t* upper_keys; - uint32_t* node_counts; - uint32_t leaf_count, lower_node_count, upper_node_count; +template +__global__ void setInternalBBoxAndBackgroundValue(Tree *tree, const typename Tree::BuildType background_value) +{ + using BBox = nanovdb::math::BBox; + __shared__ BBox bbox; - allocator.DeviceAllocate((void**)&leaf_keys, sizeof(uint64_t) * num_points); - allocator.DeviceAllocate((void**)&node_counts, sizeof(uint32_t) 
* 3); + const unsigned node_count = tree->mNodeCount[NodeT::LEVEL]; + const unsigned node_id = blockIdx.x; - // Phase 1: counting the nodes + if (node_id < node_count) { - // Generating keys from coords - uint64_t* all_leaf_keys; - uint64_t* all_leaf_keys_sorted; - allocator.DeviceAllocate((void**)&all_leaf_keys, sizeof(uint64_t) * num_points); - allocator.DeviceAllocate((void**)&all_leaf_keys_sorted, sizeof(uint64_t) * num_points); - - num_blocks = (static_cast(num_points) + num_threads - 1) / num_threads; - if (points_in_world_space) { - generate_keys<<>>(num_points, static_cast(points), all_leaf_keys, static_cast(1.0 / dx), nanovdb::Vec3f(params.translation)); - } else { - generate_keys<<>>(num_points, static_cast(points), all_leaf_keys); + + if (threadIdx.x == 0) + { + bbox = BBox(); } - void* d_temp_storage = nullptr; - size_t temp_storage_bytes; - - // Sort the keys, then get an array of unique keys - cub::DeviceRadixSort::SortKeys(nullptr, temp_storage_bytes, all_leaf_keys, all_leaf_keys_sorted, static_cast(num_points), /* begin_bit = */ 0, /* end_bit = */ 63); - allocator.DeviceAllocate((void**)&d_temp_storage, temp_storage_bytes); - cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, all_leaf_keys, all_leaf_keys_sorted, static_cast(num_points), /* begin_bit = */ 0, /* end_bit = */ 63); - allocator.DeviceFree(d_temp_storage); - - cub::DeviceSelect::Unique(nullptr, temp_storage_bytes, all_leaf_keys_sorted, leaf_keys, node_counts, static_cast(num_points)); - allocator.DeviceAllocate((void**)&d_temp_storage, temp_storage_bytes); - cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, all_leaf_keys_sorted, leaf_keys, node_counts, static_cast(num_points)); - allocator.DeviceFree(d_temp_storage); - check_cuda(cudaMemcpy(&leaf_count, node_counts, sizeof(uint32_t), cudaMemcpyDeviceToHost)); - - allocator.DeviceFree(all_leaf_keys); - all_leaf_keys = nullptr; - allocator.DeviceFree(all_leaf_keys_sorted); - all_leaf_keys_sorted = nullptr; - - - // Get the keys unique to lower nodes and the number of them - allocator.DeviceAllocate((void**)&lower_keys, sizeof(uint64_t) * leaf_count); - cub::DeviceSelect::Unique(nullptr, temp_storage_bytes, ShiftRightIterator<12>(leaf_keys), lower_keys, node_counts + 1, leaf_count); - allocator.DeviceAllocate((void**)&d_temp_storage, temp_storage_bytes); - cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, ShiftRightIterator<12>(leaf_keys), lower_keys, node_counts + 1, leaf_count); - allocator.DeviceFree(d_temp_storage); - check_cuda(cudaMemcpy(&lower_node_count, node_counts + 1, sizeof(uint32_t), cudaMemcpyDeviceToHost)); - - // Get the keys unique to upper nodes and the number of them - allocator.DeviceAllocate((void**)&upper_keys, sizeof(uint64_t) * lower_node_count); - cub::DeviceSelect::Unique(nullptr, temp_storage_bytes, ShiftRightIterator<15>(lower_keys), upper_keys, node_counts + 2, lower_node_count); - allocator.DeviceAllocate((void**)&d_temp_storage, temp_storage_bytes); - cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, ShiftRightIterator<15>(lower_keys), upper_keys, node_counts + 2, lower_node_count); - allocator.DeviceFree(d_temp_storage); - check_cuda(cudaMemcpy(&upper_node_count, node_counts + 2, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + __syncthreads(); + + NodeT &node = tree->template getFirstNode()[node_id]; + for (unsigned child_id = threadIdx.x; child_id < NodeT::SIZE; child_id += blockDim.x) + { + if (node.isChild(child_id)) + { + bbox.expandAtomic(node.getChild(child_id)->bbox()); + } + 
else + { + setBackgroundValue(node, child_id, background_value); + } + } + + __syncthreads(); + + if (threadIdx.x == 0) + { + node.mBBox = bbox; + } } +} - using Tree = nanovdb::NanoTree; - using Grid = nanovdb::Grid; - - const size_t total_bytes = - sizeof(Grid) + - sizeof(Tree) + - sizeof(typename Tree::RootType) + - sizeof(typename Tree::RootType::Tile) * upper_node_count + - sizeof(typename Tree::Node2) * upper_node_count + - sizeof(typename Tree::Node1) * lower_node_count + - sizeof(typename Tree::Node0) * leaf_count; - - const int64_t upper_mem_offset = - sizeof(nanovdb::GridData) + sizeof(Tree) + sizeof(typename Tree::RootType) + - sizeof(typename Tree::RootType::Tile) * upper_node_count; - const int64_t lower_mem_offset = upper_mem_offset + sizeof(typename Tree::Node2) * upper_node_count; - const int64_t leaf_mem_offset = lower_mem_offset + sizeof(typename Tree::Node1) * lower_node_count; - - typename Grid::DataType* grid; - check_cuda(cudaMalloc(&grid, total_bytes)); - - typename Tree::DataType* const tree = reinterpret_cast(grid + 1); // The tree is immediately after the grid - typename Tree::RootType::DataType* const root = reinterpret_cast(tree + 1); // The root is immediately after the tree - typename Tree::RootType::Tile* const tiles = reinterpret_cast(root + 1); - typename Tree::Node2::DataType* const upper_nodes = nanovdb::PtrAdd(grid, upper_mem_offset); - typename Tree::Node1::DataType* const lower_nodes = nanovdb::PtrAdd(grid, lower_mem_offset); - typename Tree::Node0::DataType* const leaf_nodes = nanovdb::PtrAdd(grid, leaf_mem_offset); - - // Phase 2: building the tree +template +__global__ void setRootBBoxAndBackgroundValue(nanovdb::Grid *grid, + const typename Tree::BuildType background_value) +{ + using BBox = typename Tree::RootNodeType::BBoxType; + __shared__ BBox bbox; + + Tree &tree = grid->tree(); + const unsigned upper_count = tree.mNodeCount[2]; + + if (threadIdx.x == 0) { - // Setting up the tree and root node - kernel<<<1, 1>>>(1, [=] __device__(size_t i) { - tree->mNodeOffset[3] = sizeof(Tree); - tree->mNodeOffset[2] = tree->mNodeOffset[3] + sizeof(typename Tree::RootType) + sizeof(typename Tree::RootType::Tile) * upper_node_count; - tree->mNodeOffset[1] = tree->mNodeOffset[2] + sizeof(typename Tree::Node2) * upper_node_count; - tree->mNodeOffset[0] = tree->mNodeOffset[1] + sizeof(typename Tree::Node1) * lower_node_count; - tree->mNodeCount[2] = tree->mTileCount[2] = upper_node_count; - tree->mNodeCount[1] = tree->mTileCount[1] = lower_node_count; - tree->mNodeCount[0] = tree->mTileCount[0] = leaf_count; - tree->mVoxelCount = Tree::Node0::SIZE * leaf_count; // assuming full leaves - - root->mBBox = nanovdb::CoordBBox(); // init to empty - root->mTableSize = upper_node_count; - root->mBackground = background_value; - root->mMinimum = ZERO_VAL; - root->mMaximum = ZERO_VAL; - root->mAverage = ZERO_SCALAR; - root->mStdDevi = ZERO_SCALAR; - }); + bbox = BBox(); } - // Add tiles and upper nodes - // i : 0 .. 
upper_node_count-1 - num_blocks = (upper_node_count + num_threads - 1) / num_threads; + __syncthreads(); + + for (unsigned upper_id = threadIdx.x; upper_id < upper_count; upper_id += blockDim.x) { - kernel<<>>(upper_node_count, [=] __device__(size_t i) { - tiles[i].key = root->CoordToKey(tile_key36_to_coord(upper_keys[i])); - tiles[i].child = sizeof(typename Tree::RootType) + sizeof(typename Tree::RootType::Tile) * upper_node_count + sizeof(typename Tree::Node2) * i; - tiles[i].state = 0; - tiles[i].value = background_value; - - assert(reinterpret_cast(root->getChild(tiles + i)) == reinterpret_cast(upper_nodes + i)); - auto& node = upper_nodes[i]; - node.mBBox = nanovdb::CoordBBox(); - node.mFlags = 0; - node.mValueMask.setOff(); - node.mChildMask.setOff(); - node.mMinimum = ZERO_VAL; - node.mMaximum = ZERO_VAL; - node.mAverage = ZERO_SCALAR; - node.mStdDevi = ZERO_SCALAR; - for (size_t n = 0; n < Tree::Node2::SIZE; ++n) { - node.mTable[n].value = background_value; - } - }); + typename Tree::UpperNodeType &upper = tree.getFirstUpper()[upper_id]; + bbox.expandAtomic(upper.bbox()); } - constexpr uint32_t MASK_15BITS = (1u << 15) - 1u; - constexpr uint32_t MASK_12BITS = (1u << 12) - 1u; + __syncthreads(); - // Init lower nodes and register to parent - // i : 0 .. lower_node_count-1 - num_blocks = (lower_node_count + num_threads - 1) / num_threads; + if (threadIdx.x == 0) { - kernel<<>>(lower_node_count, [=] __device__(size_t i) { - uint32_t upper_offset = lower_keys[i] & MASK_15BITS; - auto* upper_node = root->getChild(find_tile(root, tile_key36_to_coord(lower_keys[i] >> 15)))->data(); - set_mask_atomic(upper_node->mChildMask, upper_offset); - upper_node->setChild(upper_offset, lower_nodes + i); - - auto& node = lower_nodes[i]; - node.mBBox = nanovdb::CoordBBox(); - node.mFlags = 0; - node.mValueMask.setOff(); - node.mChildMask.setOff(); - node.mMinimum = ZERO_VAL; - node.mMaximum = ZERO_VAL; - node.mAverage = ZERO_SCALAR; - node.mStdDevi = ZERO_SCALAR; - for (size_t n = 0; n < Tree::Node1::SIZE; ++n) { - node.mTable[n].value = background_value; - } - }); + typename Tree::RootNodeType &root = tree.root(); + setBackgroundValue(root, background_value); + root.mBBox = bbox; + + grid->mWorldBBox = root.mBBox.transform(grid->map()); } +} + +template +void finalize_grid(nanovdb::Grid> &out_grid, const BuildGridParams ¶ms) +{ + // set background value, activate all voxels for allocated tiles and update bbox + + using Tree = nanovdb::NanoTree; + Tree *tree = &out_grid.tree(); + + int node_counts[3]; + memcpy_d2h(WP_CURRENT_CONTEXT, node_counts, tree->mNodeCount, sizeof(node_counts)); + // synchronization below is unnecessary as node_counts is in pageable memory. 
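+    // (CUDA copies from device memory to pageable host memory only return once
+    //  the transfer has completed, so node_counts is already valid here)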
+ // keep it for clarity + cudaStream_t stream = static_cast(cuda_stream_get_current()); + cuda_stream_synchronize(stream); + + const unsigned int leaf_count = node_counts[0]; + const unsigned int lower_count = node_counts[1]; + const unsigned int upper_count = node_counts[2]; + + constexpr unsigned NUM_THREADS = 256; + const unsigned leaf_blocks = (leaf_count + NUM_THREADS - 1) / NUM_THREADS; + activateAllLeafVoxels<<>>(tree); + + setInternalBBoxAndBackgroundValue + <<>>(tree, params.background_value); + setInternalBBoxAndBackgroundValue + <<>>(tree, params.background_value); + setRootBBoxAndBackgroundValue<<<1, NUM_THREADS, 0, stream>>>(&out_grid, params.background_value); + + check_cuda(cuda_context_check(WP_CURRENT_CONTEXT)); +} + +template <> +void finalize_grid(nanovdb::Grid> &out_grid, + const BuildGridParams ¶ms) +{ + // nothing to do for OnIndex grids +} + +/// "fancy-pointer" that transforms from world to index coordinates +struct WorldSpacePointsPtr +{ + const nanovdb::Vec3f *points; + const nanovdb::Map map; - // Init leaf nodes and register to parent - // i : 0 .. leaf_count-1 - num_blocks = (leaf_count + num_threads - 1) / num_threads; + __device__ nanovdb::Vec3f operator[](int idx) const { - kernel<<>>(leaf_count, [=] __device__(size_t i) { - uint32_t lower_offset = leaf_keys[i] & MASK_12BITS; - uint32_t upper_offset = (leaf_keys[i] >> 12) & MASK_15BITS; - const nanovdb::Coord ijk = tile_key36_to_coord(leaf_keys[i] >> 27); - - auto* upper_node = root->getChild(find_tile(root, ijk))->data(); - auto* lower_node = upper_node->getChild(upper_offset)->data(); - set_mask_atomic(lower_node->mChildMask, lower_offset); - lower_node->setChild(lower_offset, leaf_nodes + i); - - const nanovdb::Coord localUpperIjk = Tree::Node2::OffsetToLocalCoord(upper_offset) << Tree::Node1::TOTAL; - const nanovdb::Coord localLowerIjk = Tree::Node1::OffsetToLocalCoord(lower_offset) << Tree::Node0::TOTAL; - const nanovdb::Coord leafOrigin = ijk + localUpperIjk + localLowerIjk; - - auto& node = leaf_nodes[i]; - node.mBBoxMin = leafOrigin; - node.mBBoxDif[0] = leaf_nodes[i].mBBoxDif[1] = leaf_nodes[i].mBBoxDif[2] = Tree::Node0::DIM; - node.mFlags = 0; - node.mValueMask.setOn(); - node.mMinimum = ZERO_VAL; - node.mMaximum = ZERO_VAL; - node.mAverage = ZERO_SCALAR; - node.mStdDevi = ZERO_SCALAR; - // mValues is undefined - - // propagating bbox up: - expand_cwise_atomic(lower_node->mBBox, leafOrigin); - expand_cwise_atomic(lower_node->mBBox, leafOrigin + nanovdb::Coord(Tree::Node0::DIM)); - }); + return map.applyInverseMapF(points[idx]); } - // Propagating bounding boxes from lower nodes to upper nodes - // i : 0 .. 
lower_node_count-1 - num_blocks = (lower_node_count + num_threads - 1) / num_threads; + __device__ nanovdb::Vec3f operator*() const { - kernel<<>>(lower_node_count, [=] __device__(size_t i) { - auto* upper_node = root->getChild(find_tile(root, tile_key36_to_coord(lower_keys[i] >> 15)))->data(); - expand_cwise_atomic(upper_node->mBBox, lower_nodes[i].mBBox.min()); - expand_cwise_atomic(upper_node->mBBox, lower_nodes[i].mBBox.max()); - }); + return (*this)[0]; } +}; - // Setting up root bounding box and grid - { - kernel<<<1, 1>>>(1, [=] __device__(size_t i) { - for (int i = 0; i < upper_node_count; ++i) { - root->mBBox.expand(upper_nodes[i].mBBox.min()); - root->mBBox.expand(upper_nodes[i].mBBox.max()); - } +} // namespace - nanovdb::Map map; - { - const double mat[4][4] = { - {dx, 0.0, 0.0, 0.0}, // row 0 - {0.0, dx, 0.0, 0.0}, // row 1 - {0.0, 0.0, dx, 0.0}, // row 2 - {Tx, Ty, Tz, 1.0}, // row 3 - }; - const double invMat[4][4] = { - {1 / dx, 0.0, 0.0, 0.0}, // row 0 - {0.0, 1 / dx, 0.0, 0.0}, // row 1 - {0.0, 0.0, 1 / dx, 0.0}, // row 2 - {0.0, 0.0, 0.0, 0.0}, // row 3, ignored by Map::set - }; - map.set(mat, invMat, 1.0); - } +namespace nanovdb +{ +template <> struct BufferTraits +{ + static constexpr bool hasDeviceDual = true; +}; + +} // namespace nanovdb + +template +void build_grid_from_points(nanovdb::Grid> *&out_grid, size_t &out_grid_size, + const void *points, size_t num_points, bool points_in_world_space, + const BuildGridParams ¶ms) +{ + + out_grid = nullptr; + out_grid_size = 0; - grid->mMagic = NANOVDB_MAGIC_NUMBER; - grid->mChecksum = 0xFFFFFFFFFFFFFFFFull; - grid->mVersion = nanovdb::Version(); - grid->mFlags = static_cast(nanovdb::GridFlags::HasBBox) | - static_cast(nanovdb::GridFlags::IsBreadthFirst); - grid->mGridIndex = 0; - grid->mGridCount = 1; - grid->mGridSize = total_bytes; - // mGridName is set below - grid->mWorldBBox.mCoord[0] = map.applyMap(nanovdb::Vec3R(root->mBBox.mCoord[0])); - grid->mWorldBBox.mCoord[1] = map.applyMap(nanovdb::Vec3R(root->mBBox.mCoord[1])); - grid->mVoxelSize = nanovdb::Vec3d(dx); - grid->mMap = map; - grid->mGridClass = nanovdb::GridClass::Unknown; - grid->mGridType = nanovdb::mapToGridType(); - grid->mBlindMetadataOffset = total_bytes; - grid->mBlindMetadataCount = 0; - }); + cudaStream_t stream = static_cast(cuda_stream_get_current()); + nanovdb::Map map(params.voxel_size, params.translation); + nanovdb::tools::cuda::PointsToGrid p2g(map, stream); + + // p2g.setVerbose(2); + p2g.setGridName(params.name); + p2g.setChecksum(nanovdb::CheckMode::Disable); + + // Only compute bbox for OnIndex grids. 
Otherwise bbox will be computed after activating all leaf voxels + p2g.includeBBox(nanovdb::BuildTraits::is_onindex); + + nanovdb::GridHandle grid_handle; + + if (points_in_world_space) + { + grid_handle = p2g.getHandle(WorldSpacePointsPtr{static_cast(points), map}, num_points, + DeviceBuffer()); + } + else + { + grid_handle = p2g.getHandle(static_cast(points), num_points, DeviceBuffer()); } - check_cuda(cudaMemcpy(grid->mGridName, params.name, 256, cudaMemcpyHostToDevice)); + out_grid = grid_handle.deviceGrid(); + out_grid_size = grid_handle.gridSize(); - allocator.DeviceFree(lower_keys); - allocator.DeviceFree(upper_keys); - allocator.DeviceFree(leaf_keys); - allocator.DeviceFree(node_counts); + finalize_grid(*out_grid, params); - out_grid = reinterpret_cast(grid); - out_grid_size = total_bytes; + // So that buffer is not destroyed when handles goes out of scope + grid_handle.buffer().detachDeviceData(); } -template void build_grid_from_tiles(nanovdb::Grid>*&, size_t&, const void*, size_t, bool, const BuildGridParams&); -template void build_grid_from_tiles(nanovdb::Grid>*&, size_t&, const void*, size_t, bool, const BuildGridParams&); -template void build_grid_from_tiles(nanovdb::Grid>*&, size_t&, const void*, size_t, bool, const BuildGridParams&); +template void build_grid_from_points(nanovdb::Grid> *&, size_t &, const void *, size_t, bool, + const BuildGridParams &); +template void build_grid_from_points(nanovdb::Grid> *&, size_t &, const void *, + size_t, bool, const BuildGridParams &); +template void build_grid_from_points(nanovdb::Grid> *&, size_t &, const void *, size_t, bool, + const BuildGridParams &); +template void build_grid_from_points(nanovdb::Grid> *&, size_t &, const void *, + size_t, bool, const BuildGridParams &); +template void build_grid_from_points(nanovdb::Grid> *&, size_t &, const void *, + size_t, bool, const BuildGridParams &); diff --git a/warp/native/volume_builder.h b/warp/native/volume_builder.h index 2281125e..74c7d6cb 100644 --- a/warp/native/volume_builder.h +++ b/warp/native/volume_builder.h @@ -10,8 +10,23 @@ struct BuildGridParams { char name[256] = ""; }; +template<> +struct BuildGridParams { + double voxel_size = 1.0; + nanovdb::ValueIndex background_value; + nanovdb::Vec3d translation{0.0, 0.0, 0.0}; + char name[256] = ""; +}; + +template<> +struct BuildGridParams { + double voxel_size = 1.0; + nanovdb::Vec3d translation{0.0, 0.0, 0.0}; + char name[256] = ""; +}; + template -void build_grid_from_tiles(nanovdb::Grid> *&out_grid, +void build_grid_from_points(nanovdb::Grid> *&out_grid, size_t &out_grid_size, const void *points, size_t num_points, diff --git a/warp/native/volume_impl.h b/warp/native/volume_impl.h new file mode 100644 index 00000000..68e014c8 --- /dev/null +++ b/warp/native/volume_impl.h @@ -0,0 +1,61 @@ +/** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ */ + +#pragma once + +#include "volume.h" + +// Helper functions for cpp/cu files, not to be exposed to user kernels + +namespace wp +{ + +namespace volume +{ + +inline CUDA_CALLABLE pnanovdb_leaf_handle_t get_leaf(const pnanovdb_buf_t buf, const uint32_t leaf_id) +{ + const pnanovdb_tree_handle_t tree = get_tree(buf); + const uint64_t first_leaf_offset = pnanovdb_tree_get_node_offset_leaf(buf, tree); + const uint32_t leaf_stride = PNANOVDB_GRID_TYPE_GET(get_grid_type(buf), leaf_size); + return {pnanovdb_address_offset64(tree.address, first_leaf_offset + uint64_t(leaf_id) * leaf_stride)}; +} + +inline CUDA_CALLABLE pnanovdb_coord_t leaf_origin(const pnanovdb_buf_t buf, const pnanovdb_leaf_handle_t leaf) +{ + pnanovdb_coord_t origin = pnanovdb_leaf_get_bbox_min(buf, leaf); + // mask out last three bits corresponding to voxel coordinates within leaf + constexpr uint32_t MASK = (1u << 3u) - 1u; + origin.x &= ~MASK; + origin.y &= ~MASK; + origin.z &= ~MASK; + return origin; +} + +inline CUDA_CALLABLE uint64_t leaf_voxel_index(const pnanovdb_buf_t buf, const uint32_t leaf_id, + const pnanovdb_coord_t &ijk) +{ + const uint32_t grid_type = get_grid_type(buf); + + const pnanovdb_leaf_handle_t leaf = get_leaf(buf, leaf_id); + const pnanovdb_address_t value_address = pnanovdb_leaf_get_value_address(grid_type, buf, leaf, &ijk); + return volume::get_grid_voxel_index(grid_type, buf, value_address, ijk) - 1; +} + +inline CUDA_CALLABLE pnanovdb_coord_t leaf_offset_to_local_coord(uint32_t offset) +{ + pnanovdb_coord_t coord; + coord.x = (offset >> 6) & 7; + coord.y = (offset >> 3) & 7; + coord.z = (offset >> 0) & 7; + return coord; +} + +} // namespace volume + +} // namespace wp diff --git a/warp/native/warp.h b/warp/native/warp.h index 2c072b61..8af1a3ed 100644 --- a/warp/native/warp.h +++ b/warp/native/warp.h @@ -97,20 +97,28 @@ extern "C" const void* a, const void* b, const void* c, void* d, float alpha, float beta, bool row_major_a, bool row_major_b, bool allow_tf32x3_arith, int batch_count); - WP_API uint64_t volume_create_host(void* buf, uint64_t size); - WP_API void volume_get_buffer_info_host(uint64_t id, void** buf, uint64_t* size); - WP_API void volume_get_tiles_host(uint64_t id, void** buf, uint64_t* size); + WP_API uint64_t volume_create_host(void* buf, uint64_t size, bool copy, bool owner); + WP_API void volume_get_tiles_host(uint64_t id, void* buf); + WP_API void volume_get_voxels_host(uint64_t id, void* buf); WP_API void volume_destroy_host(uint64_t id); - WP_API uint64_t volume_create_device(void* context, void* buf, uint64_t size); + WP_API uint64_t volume_create_device(void* context, void* buf, uint64_t size, bool copy, bool owner); + WP_API void volume_get_tiles_device(uint64_t id, void* buf); + WP_API void volume_get_voxels_device(uint64_t id, void* buf); + WP_API void volume_destroy_device(uint64_t id); + WP_API uint64_t volume_f_from_tiles_device(void* context, void* points, int num_points, float voxel_size, float bg_value, float tx, float ty, float tz, bool points_in_world_space); WP_API uint64_t volume_v_from_tiles_device(void* context, void* points, int num_points, float voxel_size, float bg_value_x, float bg_value_y, float bg_value_z, float tx, float ty, float tz, bool points_in_world_space); WP_API uint64_t volume_i_from_tiles_device(void* context, void* points, int num_points, float voxel_size, int bg_value, float tx, float ty, float tz, bool points_in_world_space); - WP_API void volume_get_buffer_info_device(uint64_t id, void** buf, uint64_t* size); - WP_API void 
volume_get_tiles_device(uint64_t id, void** buf, uint64_t* size); - WP_API void volume_destroy_device(uint64_t id); + WP_API uint64_t volume_index_from_tiles_device(void* context, void* points, int num_points, float voxel_size, float tx, float ty, float tz, bool points_in_world_space); + WP_API uint64_t volume_from_active_voxels_device(void* context, void* points, int num_points, float voxel_size, float tx, float ty, float tz, bool points_in_world_space); + WP_API void volume_get_buffer_info(uint64_t id, void** buf, uint64_t* size); WP_API void volume_get_voxel_size(uint64_t id, float* dx, float* dy, float* dz); + WP_API void volume_get_tile_and_voxel_count(uint64_t id, uint32_t& tile_count, uint64_t& voxel_count); + WP_API const char* volume_get_grid_info(uint64_t id, uint64_t *grid_size, uint32_t *grid_index, uint32_t *grid_count, float translation[3], float transform[9], char type_str[16]); + WP_API uint32_t volume_get_blind_data_count(uint64_t id); + WP_API const char* volume_get_blind_data_info(uint64_t id, uint32_t data_index, void** buf, uint64_t* value_count, uint32_t* value_size, char type_str[16]); WP_API uint64_t marching_cubes_create_device(void* context); WP_API void marching_cubes_destroy_device(uint64_t id); diff --git a/warp/stubs.py b/warp/stubs.py index cffd825c..ac927d34 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -1112,6 +1112,17 @@ def volume_store_i(id: uint64, i: int32, j: int32, k: int32, value: int32): ... +@over +def volume_lookup_index(id: uint64, i: int32, j: int32, k: int32) -> int32: + """Returns the index associated to the voxel with coordinates ``i``, ``j``, ``k``. + + If the voxel at this index does not exist, this function returns -1. + This function is available for both index grids and classical volumes. + + """ + ... + + @over def volume_index_to_world(id: uint64, uvw: vec3f) -> vec3f: """Transform a point ``uvw`` defined in volume index space to world space given the volume's intrinsic affine transformation.""" diff --git a/warp/tests/assets/test_index_grid.nvdb b/warp/tests/assets/test_index_grid.nvdb new file mode 100644 index 00000000..142764ba --- /dev/null +++ b/warp/tests/assets/test_index_grid.nvdb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c99e8b7a3b8f49ff4f85c3759cf7c0580b4aca41398df67ae795db0ab7bcf54 +size 8034 diff --git a/warp/tests/test_volume.py b/warp/tests/test_volume.py index 56b09556..038f3c42 100644 --- a/warp/tests/test_volume.py +++ b/warp/tests/test_volume.py @@ -6,6 +6,7 @@ # license agreement from NVIDIA CORPORATION is strictly prohibited. 
import unittest +from typing import Any import numpy as np @@ -30,6 +31,7 @@ def test_volume_lookup_f(volume: wp.uint64, points: wp.array(dtype=wp.vec3)): k = int(p[2]) expect_eq(wp.volume_lookup_f(volume, i, j, k), expected) + expect_eq(wp.volume_lookup(volume, i, j, k, dtype=wp.float32), expected) @wp.kernel @@ -45,6 +47,7 @@ def test_volume_sample_closest_f(volume: wp.uint64, points: wp.array(dtype=wp.ve expected = 10.0 expect_eq(wp.volume_sample_f(volume, p, wp.Volume.CLOSEST), expected) + expect_eq(wp.volume_sample(volume, p, wp.Volume.CLOSEST, dtype=wp.float32), expected) q = wp.volume_index_to_world(volume, p) q_inv = wp.volume_world_to_index(volume, q) @@ -62,6 +65,7 @@ def test_volume_sample_linear_f(volume: wp.uint64, points: wp.array(dtype=wp.vec return # not testing against background values expect_near(wp.volume_sample_f(volume, p, wp.Volume.LINEAR), expected, 2.0e-4) + expect_near(wp.volume_sample(volume, p, wp.Volume.LINEAR, dtype=wp.float32), expected, 2.0e-4) @wp.kernel @@ -86,6 +90,13 @@ def test_volume_sample_grad_linear_f(volume: wp.uint64, points: wp.array(dtype=w expect_near(grad[1], expected_gy, 2.0e-4) expect_near(grad[2], expected_gz, 2.0e-4) + val = wp.volume_sample_grad(volume, p, wp.Volume.LINEAR, grad, dtype=wp.float32) + + expect_near(val, expected_val, 2.0e-4) + expect_near(grad[0], expected_gx, 2.0e-4) + expect_near(grad[1], expected_gy, 2.0e-4) + expect_near(grad[2], expected_gz, 2.0e-4) + @wp.kernel def test_volume_sample_local_f_linear_values( @@ -162,6 +173,7 @@ def test_volume_lookup_v(volume: wp.uint64, points: wp.array(dtype=wp.vec3)): k = int(p[2]) expect_eq(wp.volume_lookup_v(volume, i, j, k), expected) + expect_eq(wp.volume_lookup(volume, i, j, k, dtype=wp.vec3), expected) @wp.kernel @@ -177,6 +189,7 @@ def test_volume_sample_closest_v(volume: wp.uint64, points: wp.array(dtype=wp.ve expected = wp.vec3(10.8, -4.13, 10.26) expect_eq(wp.volume_sample_v(volume, p, wp.Volume.CLOSEST), expected) + expect_eq(wp.volume_sample(volume, p, wp.Volume.CLOSEST, dtype=wp.vec3), expected) q = wp.volume_index_to_world(volume, p) q_inv = wp.volume_world_to_index(volume, q) @@ -196,6 +209,30 @@ def test_volume_sample_linear_v(volume: wp.uint64, points: wp.array(dtype=wp.vec return # not testing against background values expect_near(wp.volume_sample_v(volume, p, wp.Volume.LINEAR), expected, 2.0e-4) + expect_near(wp.volume_sample(volume, p, wp.Volume.LINEAR, dtype=wp.vec3), expected, 2.0e-4) + + +@wp.kernel +def test_volume_sample_grad_linear_v(volume: wp.uint64, points: wp.array(dtype=wp.vec3)): + tid = wp.tid() + + p = points[tid] + + if abs(p[0]) > 10.0 or abs(p[1]) > 10.0 or abs(p[2]) > 10.0: + return # not testing against background values + + expected_val = wp.vec3( + p[0] + 2.0 * p[1] + 3.0 * p[2], 4.0 * p[0] + 5.0 * p[1] + 6.0 * p[2], 7.0 * p[0] + 8.0 * p[1] + 9.0 * p[2] + ) + expected_grad = wp.mat33(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0) + + grad = wp.mat33(0.0) + val = wp.volume_sample_grad(volume, p, wp.Volume.LINEAR, grad, dtype=wp.vec3) + + expect_near(val, expected_val, 2.0e-4) + expect_near(grad[0], expected_grad[0], 2.0e-4) + expect_near(grad[1], expected_grad[1], 2.0e-4) + expect_near(grad[2], expected_grad[2], 2.0e-4) @wp.kernel @@ -233,6 +270,7 @@ def test_volume_lookup_i(volume: wp.uint64, points: wp.array(dtype=wp.vec3)): expected = 10 expect_eq(wp.volume_lookup_i(volume, i, j, k), expected) + expect_eq(wp.volume_lookup(volume, i, j, k, dtype=wp.int32), expected) @wp.kernel @@ -248,6 +286,7 @@ def test_volume_sample_i(volume: wp.uint64, 
points: wp.array(dtype=wp.vec3)): expected = 10 expect_eq(wp.volume_sample_i(volume, p), expected) + expect_eq(wp.volume_sample(volume, p, wp.Volume.CLOSEST, dtype=wp.int32), expected) q = wp.volume_index_to_world(volume, p) q_inv = wp.volume_world_to_index(volume, q) @@ -293,9 +332,7 @@ def test_volume_store_f(volume: wp.uint64, points: wp.array(dtype=wp.vec3), valu j = int(p[1]) k = int(p[2]) - # NB: Writing outside the allocated domain overwrites the background value of the Volume - if abs(i) <= 11 and abs(j) <= 11 and abs(k) <= 11: - wp.volume_store_f(volume, i, j, k, float(i + 100 * j + 10000 * k)) + wp.volume_store(volume, i, j, k, float(i + 100 * j + 10000 * k)) values[tid] = wp.volume_lookup_f(volume, i, j, k) @@ -308,9 +345,7 @@ def test_volume_store_v(volume: wp.uint64, points: wp.array(dtype=wp.vec3), valu j = int(p[1]) k = int(p[2]) - # NB: Writing outside the allocated domain overwrites the background value of the Volume - if abs(i) <= 11 and abs(j) <= 11 and abs(k) <= 11: - wp.volume_store_v(volume, i, j, k, p) + wp.volume_store(volume, i, j, k, p) values[tid] = wp.volume_lookup_v(volume, i, j, k) @@ -323,9 +358,7 @@ def test_volume_store_i(volume: wp.uint64, points: wp.array(dtype=wp.vec3), valu j = int(p[1]) k = int(p[2]) - # NB: Writing outside the allocated domain overwrites the background value of the Volume - if abs(i) <= 11 and abs(j) <= 11 and abs(k) <= 11: - wp.volume_store_i(volume, i, j, k, i + 100 * j + 10000 * k) + wp.volume_store(volume, i, j, k, i + 100 * j + 10000 * k) values[tid] = wp.volume_lookup_i(volume, i, j, k) @@ -355,6 +388,7 @@ def test_volume_store_i(volume: wp.uint64, points: wp.array(dtype=wp.vec3), valu "float": os.path.abspath(os.path.join(os.path.dirname(__file__), "assets/test_grid.nvdb")), "int32": os.path.abspath(os.path.join(os.path.dirname(__file__), "assets/test_int32_grid.nvdb")), "vec3f": os.path.abspath(os.path.join(os.path.dirname(__file__), "assets/test_vec_grid.nvdb")), + "index": os.path.abspath(os.path.join(os.path.dirname(__file__), "assets/test_index_grid.nvdb")), "torus": os.path.abspath(os.path.join(os.path.dirname(__file__), "assets/torus.nvdb")), "float_write": os.path.abspath(os.path.join(os.path.dirname(__file__), "assets/test_grid.nvdb")), } @@ -492,7 +526,7 @@ def test_volume_sample_linear_v_gradient(test, device): ) tape.backward(values) - grad_expected = np.array([6.0, 15.0, 24.0]) + grad_expected = np.array([12.0, 15.0, 18.0]) grad_computed = tape.gradients[uvws].numpy()[0] np.testing.assert_allclose(grad_computed, grad_expected, rtol=1e-4) @@ -506,7 +540,7 @@ def test_volume_sample_linear_v_gradient(test, device): ) tape.backward(values) - grad_expected = np.array([6.0, 15.0, 24.0]) / 0.25 + grad_expected = np.array([12.0, 15.0, 18.0]) / 0.25 grad_computed = tape.gradients[xyzs].numpy()[0] np.testing.assert_allclose(grad_computed, grad_expected, rtol=1e-4) @@ -604,6 +638,188 @@ def test_volume_introspection(test, device): np.testing.assert_equal(test_volume_tiles, tiles_sorted) np.testing.assert_equal([0.25] * 3, voxel_size) + voxel_count = volume.get_voxel_count() + voxels_actual = volume.get_voxels().numpy() + assert voxel_count == voxels_actual.shape[0] + + # Voxel coordinates should be unique + voxels_unique = np.unique(voxels_actual, axis=0) + assert voxel_count == voxels_unique.shape[0] + + # Get back tiles from voxels, shoud match get_tiles() + voxel_tiles = 8 * (voxels_unique // 8) + voxel_tiles_sorted = voxel_tiles[np.lexsort(voxel_tiles.T[::-1])] + voxel_tiles_unique = np.unique(voxel_tiles_sorted, axis=0) 
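+    # Each tile is the origin of an 8x8x8 NanoVDB leaf node, so flooring voxel
+    # coordinates to multiples of 8 and deduplicating them should recover
+    # exactly the tile set reported by get_tiles()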
+ + np.testing.assert_equal(voxel_tiles_unique, tiles_sorted) + + +def test_volume_multiple_grids(test, device): + volume = volumes["index"][device.alias] + + volume_2 = volume.load_next_grid() + + test.assertIsNotNone(volume_2) + + test.assertNotEqual(volume.id, volume_2.id) + test.assertNotEqual(volume.get_voxel_count(), volume_2.get_voxel_count()) + + test.assertEqual(volume.get_grid_info().grid_count, volume_2.get_grid_info().grid_count) + test.assertEqual(volume.get_grid_info().grid_index + 1, volume_2.get_grid_info().grid_index) + + volume_3 = volume_2.load_next_grid() + test.assertIsNone(volume_3) + + +def test_volume_feature_array(test, device): + volume = volumes["index"][device.alias] + + test.assertEqual(volume.get_feature_array_count(), 1) + + array = volume.feature_array(0, dtype=wp.uint64) + test.assertEqual(array.device, device) + test.assertEqual(array.dtype, wp.uint64) + + # fVDB convention, data starts with array ndim + shape + np.testing.assert_equal(array.numpy()[0:4], [3, volume.get_voxel_count(), 2, 3]) + + +@wp.kernel +def fill_leaf_values_kernel(volume: wp.uint64, ijk: wp.array2d(dtype=wp.int32), values: wp.array(dtype=Any)): + tid = wp.tid() + + i = ijk[tid, 0] + j = ijk[tid, 1] + k = ijk[tid, 2] + + expect_eq(tid, wp.volume_lookup_index(volume, i, j, k)) + + values[tid] = wp.volume_lookup(volume, i, j, k, dtype=values.dtype) + + +@wp.kernel +def test_volume_sample_index_kernel( + volume: wp.uint64, + points: wp.array(dtype=wp.vec3), + values: wp.array(dtype=Any), + background: wp.array(dtype=Any), + sampled_values: wp.array(dtype=Any), +): + tid = wp.tid() + p = points[tid] + + ref = wp.volume_sample(volume, p, wp.Volume.LINEAR, dtype=values.dtype) + sampled_values[tid] = wp.volume_sample_index(volume, p, wp.Volume.LINEAR, values, background[0]) + expect_eq(sampled_values[tid], ref) + + +@wp.kernel +def test_volume_sample_grad_index_kernel( + volume: wp.uint64, + points: wp.array(dtype=wp.vec3), + values: wp.array(dtype=Any), + background: wp.array(dtype=Any), + sampled_values: wp.array(dtype=Any), + sampled_grads: wp.array(dtype=Any), +): + tid = wp.tid() + p = points[tid] + + ref_grad = sampled_grads.dtype() + ref = wp.volume_sample_grad(volume, p, wp.Volume.LINEAR, ref_grad, dtype=values.dtype) + + grad = type(ref_grad)() + sampled_values[tid] = wp.volume_sample_grad_index(volume, p, wp.Volume.LINEAR, values, background[0], grad) + expect_eq(sampled_values[tid], ref) + + expect_eq(grad[0], ref_grad[0]) + expect_eq(grad[1], ref_grad[1]) + expect_eq(grad[2], ref_grad[2]) + sampled_grads[tid] = grad + + +def test_volume_sample_index(test, device): + points = rng.uniform(-10.0, 10.0, size=(100, 3)) + points[0:10, 0] += 100.0 # ensure some points are over unallocated voxels + uvws = wp.array(points, dtype=wp.vec3, device=device) + + bg_values = { + "float": 10.0, + "vec3f": wp.vec3(10.8, -4.13, 10.26), + } + grad_types = { + "float": wp.vec3, + "vec3f": wp.mat33, + } + + for volume_names in ("float", "vec3f"): + with test.subTest(volume_names=volume_names): + volume = volumes[volume_names][device.alias] + + ijk = volume.get_voxels() + + values = wp.empty(shape=volume.get_voxel_count(), dtype=volume.dtype, device=device, requires_grad=True) + + vid = wp.uint64(volume.id) + wp.launch(fill_leaf_values_kernel, dim=values.shape, inputs=[vid, ijk, values], device=device) + + sampled_values = wp.empty(shape=points.shape[0], dtype=volume.dtype, device=device, requires_grad=True) + background = wp.array([bg_values[volume_names]], dtype=volume.dtype, device=device, 
requires_grad=True) + + tape = wp.Tape() + with tape: + wp.launch( + test_volume_sample_index_kernel, + dim=points.shape[0], + inputs=[vid, uvws, values, background, sampled_values], + device=device, + ) + + sampled_values.grad.fill_(1.0) + tape.backward() + + # test adjoint w.r.t voxel and background value arrays + # we should have sum(sampled_values) = sum(adj_values * values) + (adj_background * background) + sum_sampled_values = np.sum(sampled_values.numpy(), axis=0) + sum_values_adj = np.sum(values.numpy() * values.grad.numpy(), axis=0) + sum_background_adj = background.numpy()[0] * background.grad.numpy()[0] + + np.testing.assert_allclose(sum_sampled_values, sum_values_adj + sum_background_adj, rtol=1.0e-3) + + tape.reset() + + sampled_grads = wp.empty( + shape=points.shape[0], dtype=grad_types[volume_names], device=device, requires_grad=True + ) + + with tape: + wp.launch( + test_volume_sample_grad_index_kernel, + dim=points.shape[0], + inputs=[vid, uvws, values, background, sampled_values, sampled_grads], + device=device, + ) + + sampled_values.grad.fill_(1.0) + tape.backward() + + # we should have sum(sampled_values) = sum(adj_values * values) + (adj_background * background) + sum_sampled_values = np.sum(sampled_values.numpy(), axis=0) + sum_values_adj = np.sum(values.numpy() * values.grad.numpy(), axis=0) + sum_background_adj = background.numpy()[0] * background.grad.numpy()[0] + np.testing.assert_allclose(sum_sampled_values, sum_values_adj + sum_background_adj, rtol=1.0e-3) + + tape.zero() + sampled_values.grad.fill_(0.0) + sampled_grads.grad.fill_(1.0) + tape.backward() + + # we should have sum(sampled_grad, axes=(0, -1)) = sum(adj_values * values) + (adj_background * background) + sum_sampled_grads = np.sum(np.sum(sampled_grads.numpy(), axis=0), axis=-1) + sum_values_adj = np.sum(values.numpy() * values.grad.numpy(), axis=0) + sum_background_adj = background.numpy()[0] * background.grad.numpy()[0] + np.testing.assert_allclose(sum_sampled_grads, sum_values_adj + sum_background_adj, rtol=1.0e-3) + def test_volume_from_numpy(test, device): # Volume.allocate_from_tiles() is only available with CUDA @@ -657,6 +873,9 @@ class TestVolume(unittest.TestCase): add_function_test( TestVolume, "test_volume_from_numpy", test_volume_from_numpy, devices=get_selected_cuda_test_devices() ) +add_function_test(TestVolume, "test_volume_multiple_grids", test_volume_multiple_grids, devices=devices) +add_function_test(TestVolume, "test_volume_feature_array", test_volume_feature_array, devices=devices) +add_function_test(TestVolume, "test_volume_sample_index", test_volume_sample_index, devices=devices) points = {} points_jittered = {} @@ -715,6 +934,13 @@ class TestVolume(unittest.TestCase): inputs=[volumes["vec3f"][device.alias].id, points_jittered[device.alias]], devices=[device.alias], ) + add_kernel_test( + TestVolume, + test_volume_sample_grad_linear_v, + dim=len(point_grid), + inputs=[volumes["vec3f"][device.alias].id, points_jittered[device.alias]], + devices=[device.alias], + ) add_kernel_test( TestVolume, diff --git a/warp/tests/test_volume_write.py b/warp/tests/test_volume_write.py index a3725d23..75fcedc1 100644 --- a/warp/tests/test_volume_write.py +++ b/warp/tests/test_volume_write.py @@ -166,6 +166,10 @@ def test_volume_allocation(test, device): points_in_world_space=True, device=device, ) + + assert wp.types.types_equal(volume_a.dtype, wp.float32) + assert wp.types.types_equal(volume_b.dtype, wp.float32) + points = wp.array(points_ref, dtype=wp.vec3, device=device) values_a = 
wp.empty(num_points, dtype=wp.float32, device=device) values_b = wp.empty(num_points, dtype=wp.float32, device=device) @@ -201,6 +205,10 @@ def test_volume_allocate_by_tiles_f(test, device): points_ws_d = wp.array(points_ws, dtype=wp.vec3, device=device) volume_a = wp.Volume.allocate_by_tiles(points_is_d, voxel_size, background_value, translation, device=device) volume_b = wp.Volume.allocate_by_tiles(points_ws_d, voxel_size, background_value, translation, device=device) + + assert wp.types.types_equal(volume_a.dtype, wp.float32) + assert wp.types.types_equal(volume_b.dtype, wp.float32) + values_a = wp.empty(num_tiles * 512, dtype=wp.float32, device=device) values_b = wp.empty(num_tiles * 512, dtype=wp.float32, device=device) @@ -229,6 +237,9 @@ def test_volume_allocate_by_tiles_v(test, device): points_d = wp.array(points_is, dtype=wp.int32, device=device) volume = wp.Volume.allocate_by_tiles(points_d, 0.1, wp.vec3(1, 2, 3), device=device) + + assert wp.types.types_equal(volume.dtype, wp.vec3) + values = wp.empty(len(points_d) * 512, dtype=wp.vec3, device=device) wp.launch(test_volume_tile_store_v, dim=len(points_d), inputs=[volume.id, points_d], device=device) @@ -238,6 +249,72 @@ def test_volume_allocate_by_tiles_v(test, device): np.testing.assert_equal(values_res, values_ref) +def test_volume_allocate_by_tiles_index(test, device): + num_tiles = 10 + rng = np.random.default_rng(101215) + tiles = rng.integers(-512, 512, size=(num_tiles, 3), dtype=np.int32) + points_is = tiles * 8 + + points_d = wp.array(points_is, dtype=wp.int32, device=device) + volume = wp.Volume.allocate_by_tiles(points_d, 0.1, bg_value=None, device=device) + + assert volume.is_index + + vol_tiles = volume.get_tiles().numpy() / 8 + vol_tile_sorted = vol_tiles[np.lexsort(vol_tiles.T[::-1])] + vol_tile_unique = np.unique(vol_tile_sorted, axis=0) + + tile_sorted = tiles[np.lexsort(tiles.T[::-1])] + tile_unique = np.unique(tile_sorted, axis=0) + + np.testing.assert_equal(tile_unique, vol_tile_unique) + + +def test_volume_allocation_from_voxels(test, device): + point_count = 387 + rng = np.random.default_rng(101215) + + # Create from world-space points + points = wp.array(rng.uniform(5.0, 10.0, size=(point_count, 3)), dtype=float, device=device) + + volume = wp.Volume.allocate_by_voxels( + voxel_points=points, voxel_size=0.25, translation=(0.0, 5.0, 10.0), device=device + ) + + assert volume.is_index + + test.assertNotEqual(volume.id, 0) + + test.assertAlmostEqual(volume.get_voxel_size(), (0.25, 0.25, 0.25)) + voxel_count = volume.get_voxel_count() + test.assertGreaterEqual(point_count, voxel_count) + test.assertGreaterEqual(voxel_count, 1) + + voxels = volume.get_voxels() + + # Check that world-to-index transform has been correctly applied + voxel_low = np.min(voxels.numpy(), axis=0) + voxel_up = np.max(voxels.numpy(), axis=0) + np.testing.assert_array_less([19, -1, -21], voxel_low) + np.testing.assert_array_less(voxel_up, [41, 21, 1]) + + # Recreate the volume from ijk coords + volume_from_ijk = wp.Volume.allocate_by_voxels( + voxel_points=voxels, voxel_size=0.25, translation=(0.0, 5.0, 10.0), device=device + ) + + assert volume_from_ijk.is_index + + assert volume_from_ijk.get_voxel_count() == voxel_count + ijk_voxels = volume_from_ijk.get_voxels().numpy() + + voxels = voxels.numpy() + voxel_sorted = voxels[np.lexsort(voxels.T[::-1])] + ijk_voxel_sorted = ijk_voxels[np.lexsort(ijk_voxels.T[::-1])] + + np.testing.assert_equal(voxel_sorted, ijk_voxel_sorted) + + devices = get_selected_cuda_test_devices() @@ -248,6 +325,15 @@ 
class TestVolumeWrite(unittest.TestCase): add_function_test(TestVolumeWrite, "test_volume_allocation", test_volume_allocation, devices=devices) add_function_test(TestVolumeWrite, "test_volume_allocate_by_tiles_f", test_volume_allocate_by_tiles_f, devices=devices) add_function_test(TestVolumeWrite, "test_volume_allocate_by_tiles_v", test_volume_allocate_by_tiles_v, devices=devices) +add_function_test( + TestVolumeWrite, "test_volume_allocate_by_tiles_index", test_volume_allocate_by_tiles_index, devices=devices +) +add_function_test( + TestVolumeWrite, + "test_volume_allocation_from_voxels", + test_volume_allocation_from_voxels, + devices=devices, +) if __name__ == "__main__": diff --git a/warp/types.py b/warp/types.py index 8de0b0c1..d53946f5 100644 --- a/warp/types.py +++ b/warp/types.py @@ -12,7 +12,7 @@ import inspect import struct import zlib -from typing import Any, Callable, Generic, List, Tuple, TypeVar, Union +from typing import Any, Callable, Generic, List, NamedTuple, Optional, Tuple, TypeVar, Union import numpy as np @@ -2997,11 +2997,12 @@ class Volume: #: Enum value to specify trilinear interpolation during sampling LINEAR = constant(1) - def __init__(self, data: array): + def __init__(self, data: array, copy: bool = True): """Class representing a sparse grid. Args: data (:class:`warp.array`): Array of bytes representing the volume in NanoVDB format + copy (bool): Whether the incoming data will be copied or aliased """ self.id = 0 @@ -3011,16 +3012,16 @@ def __init__(self, data: array): if data is None: return - - if data.device is None: - raise RuntimeError("Invalid device") self.device = data.device + owner = False if self.device.is_cpu: - self.id = self.runtime.core.volume_create_host(ctypes.cast(data.ptr, ctypes.c_void_p), data.size) + self.id = self.runtime.core.volume_create_host( + ctypes.cast(data.ptr, ctypes.c_void_p), data.size, copy, owner + ) else: self.id = self.runtime.core.volume_create_device( - self.device.context, ctypes.cast(data.ptr, ctypes.c_void_p), data.size + self.device.context, ctypes.cast(data.ptr, ctypes.c_void_p), data.size, copy, owner ) if self.id == 0: @@ -3041,32 +3042,90 @@ def array(self) -> array: """Returns the raw memory buffer of the Volume as an array""" buf = ctypes.c_void_p(0) size = ctypes.c_uint64(0) + self.runtime.core.volume_get_buffer_info(self.id, ctypes.byref(buf), ctypes.byref(size)) + return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device, owner=False) + + def get_tile_count(self) -> int: + """Returns the number of tiles (NanoVDB leaf nodes) of the volume""" + + voxel_count, tile_count = ( + ctypes.c_uint64(0), + ctypes.c_uint32(0), + ) + self.runtime.core.volume_get_tile_and_voxel_count(self.id, ctypes.byref(tile_count), ctypes.byref(voxel_count)) + return tile_count.value + + def get_tiles(self, out: Optional[array] = None) -> array: + """Returns the integer coordinates of all allocated tiles for this volume. + + Args: + out (:class:`warp.array`, optional): If provided, use the `out` array to store the tile coordinates, otherwise + a new array will be allocated. `out` must be a contiguous array of ``tile_count`` ``vec3i`` or ``tile_count x 3`` ``int32`` + on the same device as this volume. 
+ """ + + if self.id == 0: + raise RuntimeError("Invalid Volume") + + tile_count = self.get_tile_count() + if out is None: + out = warp.empty(dtype=int32, shape=(tile_count, 3), device=self.device) + elif out.device != self.device or out.shape[0] < tile_count: + raise RuntimeError(f"'out' array must an array with at least {tile_count} rows on device {self.device}") + elif not _is_contiguous_vec_like_array(out, vec_length=3, scalar_types=(int32,)): + raise RuntimeError( + "'out' must be a contiguous 1D array with type vec3i or a 2D array of type int32 with shape (N, 3) " + ) + if self.device.is_cpu: - self.runtime.core.volume_get_buffer_info_host(self.id, ctypes.byref(buf), ctypes.byref(size)) + self.runtime.core.volume_get_tiles_host(self.id, out.ptr) else: - self.runtime.core.volume_get_buffer_info_device(self.id, ctypes.byref(buf), ctypes.byref(size)) - return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device) + self.runtime.core.volume_get_tiles_device(self.id, out.ptr) + + return out + + def get_voxel_count(self) -> int: + """Returns the total number of allocated voxels for this volume""" + + voxel_count, tile_count = ( + ctypes.c_uint64(0), + ctypes.c_uint32(0), + ) + self.runtime.core.volume_get_tile_and_voxel_count(self.id, ctypes.byref(tile_count), ctypes.byref(voxel_count)) + return voxel_count.value + + def get_voxels(self, out: Optional[array] = None) -> array: + """Returns the integer coordinates of all allocated voxels for this volume. + + Args: + out (:class:`warp.array`, optional): If provided, use the `out` array to store the voxel coordinates, otherwise + a new array will be allocated. `out` must be a contiguous array of ``voxel_count`` ``vec3i`` or ``voxel_count x 3`` ``int32`` + on the same device as this volume. 
+ """ - def get_tiles(self) -> array: if self.id == 0: raise RuntimeError("Invalid Volume") - buf = ctypes.c_void_p(0) - size = ctypes.c_uint64(0) + voxel_count = self.get_voxel_count() + if out is None: + out = warp.empty(dtype=int32, shape=(voxel_count, 3), device=self.device) + elif out.device != self.device or out.shape[0] < voxel_count: + raise RuntimeError(f"'out' array must an array with at least {voxel_count} rows on device {self.device}") + elif not _is_contiguous_vec_like_array(out, vec_length=3, scalar_types=(int32,)): + raise RuntimeError( + "'out' must be a contiguous 1D array with type vec3i or a 2D array of type int32 with shape (N, 3) " + ) + if self.device.is_cpu: - self.runtime.core.volume_get_tiles_host(self.id, ctypes.byref(buf), ctypes.byref(size)) - deleter = self.device.default_allocator.deleter + self.runtime.core.volume_get_voxels_host(self.id, out.ptr) else: - self.runtime.core.volume_get_tiles_device(self.id, ctypes.byref(buf), ctypes.byref(size)) - if self.device.is_mempool_supported: - deleter = self.device.mempool_allocator.deleter - else: - deleter = self.device.default_allocator.deleter - num_tiles = size.value // (3 * 4) + self.runtime.core.volume_get_voxels_device(self.id, out.ptr) - return array(ptr=buf.value, dtype=int32, shape=(num_tiles, 3), device=self.device, deleter=deleter) + return out def get_voxel_size(self) -> Tuple[float, float, float]: + """Voxel size, i.e, world coordinates of voxel's diagonal vector""" + if self.id == 0: raise RuntimeError("Invalid Volume") @@ -3074,9 +3133,181 @@ def get_voxel_size(self) -> Tuple[float, float, float]: self.runtime.core.volume_get_voxel_size(self.id, ctypes.byref(dx), ctypes.byref(dy), ctypes.byref(dz)) return (dx.value, dy.value, dz.value) + class GridInfo(NamedTuple): + """Grid metadata""" + + name: str + """Grid name""" + size_in_bytes: int + """Size of this grid's data, in bytes""" + + grid_index: int + """Index of this grid in the data buffer""" + grid_count: int + """Total number of grids in the data buffer""" + type_str: str + """String describing the type of the grid values""" + + translation: vec3f + """Index-to-world translation""" + transform_matrix: mat33f + """Linear part of the index-to-world transform""" + + def get_grid_info(self) -> Volume.GridInfo: + """Returns the metadata associated with this Volume""" + + grid_index = ctypes.c_uint32(0) + grid_count = ctypes.c_uint32(0) + grid_size = ctypes.c_uint64(0) + translation_buffer = (ctypes.c_float * 3)() + transform_buffer = (ctypes.c_float * 9)() + type_str_buffer = (ctypes.c_char * 16)() + + name = self.runtime.core.volume_get_grid_info( + self.id, + ctypes.byref(grid_size), + ctypes.byref(grid_index), + ctypes.byref(grid_count), + translation_buffer, + transform_buffer, + type_str_buffer, + ) + + if name is None: + raise RuntimeError("Invalid volume") + + return Volume.GridInfo( + name.decode("ascii"), + grid_size.value, + grid_index.value, + grid_count.value, + type_str_buffer.value.decode("ascii"), + vec3f.from_buffer_copy(translation_buffer), + mat33f.from_buffer_copy(transform_buffer), + ) + + _nvdb_type_to_dtype = { + "float": float32, + "double": float64, + "int16": int16, + "int32": int32, + "int64": int64, + "Vec3f": vec3f, + "Vec3d": vec3d, + "Half": float16, + "uint32": uint32, + "bool": bool, + "Vec4f": vec4f, + "Vec4d": vec4d, + "Vec3u8": vec3ub, + "Vec3u16": vec3us, + "uint8": uint8, + } + + @property + def dtype(self) -> type: + """Type of the Volume's values as a Warp type. + + If the grid does not contain values (e.g. 
index grids) or if the NanoVDB type is not + representable as a Warp type, returns ``None``. + """ + return Volume._nvdb_type_to_dtype.get(self.get_grid_info().type_str, None) + + _nvdb_index_types = ("Index", "OnIndex", "IndexMask", "OnIndexMask") + + @property + def is_index(self) -> bool: + """Whether this Volume contains an index grid, that is, a type of grid that does + not explicitly store values but associates each voxel to linearized index. + """ + + return self.get_grid_info().type_str in Volume._nvdb_index_types + + def get_feature_array_count(self) -> int: + """Returns the number of supplemental data arrays stored alongside the grid""" + + return self.runtime.core.volume_get_blind_data_count(self.id) + + class FeatureArrayInfo(NamedTuple): + """Metadata for a supplemental data array""" + + name: str + """Name of the data array""" + ptr: int + """Memory address of the start of the array""" + + value_size: int + """Size in bytes of the array values""" + value_count: int + """Number of values in the array""" + type_str: str + """String describing the type of the array values""" + + def get_feature_array_info(self, feature_index: int) -> Volume.FeatureArrayInfo: + """Returns the metadata associated to the feature array at `feature_index`""" + + buf = ctypes.c_void_p(0) + value_count = ctypes.c_uint64(0) + value_size = ctypes.c_uint32(0) + type_str_buffer = (ctypes.c_char * 16)() + + name = self.runtime.core.volume_get_blind_data_info( + self.id, + feature_index, + ctypes.byref(buf), + ctypes.byref(value_count), + ctypes.byref(value_size), + type_str_buffer, + ) + + if buf.value is None: + raise RuntimeError("Invalid feature array") + + return Volume.FeatureArrayInfo( + name.decode("ascii"), + buf.value, + value_size.value, + value_count.value, + type_str_buffer.value.decode("ascii"), + ) + + def feature_array(self, feature_index: int, dtype=None) -> array: + """Returns one the the grid's feature data arrays as a Warp array + + Args: + feature_index: index of the supplemental dat aarray in the grid + dtype: type for the returned warp array. If not provided, will be deduced from the array metdata. + """ + + info = self.get_feature_array_info(feature_index) + + if dtype is None: + try: + dtype = Volume._nvdb_type_to_dtype[info.type_str] + except KeyError: + # Unknown type, default to byte array + dtype = uint8 + + value_count = info.value_count + value_size = info.value_size + + if type_size_in_bytes(dtype) == 1: + # allow requesting a byte array from any type + value_count *= value_size + value_size = 1 + elif value_size == 1 and (value_count % type_size_in_bytes(dtype)) == 0: + # allow converting a byte array to any type + value_size = type_size_in_bytes(dtype) + value_count = value_count // value_size + + if type_size_in_bytes(dtype) != value_size: + raise RuntimeError(f"Cannot cast feature data of size {value_size} to array dtype {type_repr(dtype)}") + + return array(ptr=info.ptr, dtype=dtype, shape=value_count, device=self.device, owner=False) + @classmethod def load_from_nvdb(cls, file_or_buffer, device=None) -> Volume: - """Creates a Volume object from a NanoVDB file or in-memory buffer. + """Creates a Volume object from a serialized NanoVDB file or in-memory buffer. 
Returns: @@ -3088,28 +3319,117 @@ def load_from_nvdb(cls, file_or_buffer, device=None) -> Volume: data = file_or_buffer magic, version, grid_count, codec = struct.unpack("> 21 != 32: # checking major version raise RuntimeError("Unsupported NanoVDB version") - if grid_count != 1: - raise RuntimeError("Only NVDBs with exactly one grid are supported") - grid_data_offset = 192 + struct.unpack(" Volume: + """ + Creates a new :class:`Volume` aliasing an in-memory grid buffer. + + In contrast to :meth:`load_from_nvdb` which should be used to load serialized NanoVDB grids, + here the buffer must be uncompressed and must not contain file header information. + If the passed address does not contain a NanoVDB grid, the behavior of this function is undefined. + + Args: + grid_ptr: Integer address of the start of the grid buffer + buffer_size: Size of the buffer, in bytes. If not provided, the size will be assumed to be that of the single grid starting at `grid_ptr`. + device: Device of the buffer, and of the returned Volume. If not provided, the current Warp device is assumed. + + Returns the newly created Volume. + """ + + if not grid_ptr: + raise (RuntimeError, "Invalid grid buffer pointer") + + # Check that a Volume has not already been created for this address + # (to allow this we would need to ref-count the volume descriptor) + existing_buf = ctypes.c_void_p(0) + existing_size = ctypes.c_uint64(0) + warp.context.runtime.core.volume_get_buffer_info( + grid_ptr, ctypes.byref(existing_buf), ctypes.byref(existing_size) + ) + + if existing_buf.value is not None: + raise RuntimeError( + "A warp Volume has already been created for this grid, aliasing it more than once is not possible." + ) + + data_array = array(ptr=grid_ptr, dtype=uint8, shape=buffer_size, owner=False, device=device) + + return cls(data_array, copy=False) + + def load_next_grid(self) -> Volume: + """ + Tries to create a new warp Volume for the next grid that is linked to by this Volume. + + The existence of a next grid is deduced from the `grid_index` and `grid_count` metadata + as well as the size of this Volume's in-memory buffer. + + Returns the newly created Volume, or None if there is no next grid. + """ + + grid = self.get_grid_info() + + array = self.array() + + if grid.grid_index + 1 >= grid.grid_count or array.capacity <= grid.size_in_bytes: + return None + + next_volume = Volume.load_from_address( + array.ptr + grid.size_in_bytes, buffer_size=array.capacity - grid.size_in_bytes, device=self.device + ) + # makes the new Volume keep a reference to the current grid, as we're aliasing its buffer + next_volume._previous_grid = self + + return next_volume + @classmethod def load_from_numpy( cls, ndarray: np.array, min_world=(0.0, 0.0, 0.0), voxel_size=1.0, bg_value=0.0, device=None @@ -3261,11 +3581,11 @@ def allocate_by_tiles( Args: tile_points (:class:`warp.array`): Array of positions that define the tiles to be allocated. - The array can be a 2D, N-by-3 array of :class:`warp.int32` values, indicating index space positions, - or can be a 1D array of :class:`warp.vec3` values, indicating world space positions. + The array may use an integer scalar type (2D N-by-3 array of :class:`warp.int32` or 1D array of `warp.vec3i` values), indicating index space positions, + or a floating point scalar type (2D N-by-3 array of :class:`warp.float32` or 1D array of `warp.vec3f` values), indicating world space positions. Repeated points per tile are allowed and will be efficiently deduplicated. voxel_size (float): Voxel size of the new volume. 
- bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created + bg_value (array-like, float, int or None): Value of unallocated voxels of the volume, also defines the volume's type. A :class:`warp.vec3` volume is created if this is `array-like`, an index volume will be created if `bg_value` is ``None``. translation (array-like): Translation between the index and world spaces. device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0". @@ -3276,19 +3596,28 @@ def allocate_by_tiles( raise RuntimeError(f"Voxel size must be positive! Got {voxel_size}") if not device.is_cuda: raise RuntimeError("Only CUDA devices are supported for allocate_by_tiles") - if not ( - isinstance(tile_points, array) - and (tile_points.dtype == int32 and tile_points.ndim == 2) - or (tile_points.dtype == vec3 and tile_points.ndim == 1) - ): - raise RuntimeError("Expected an warp array of vec3s or of n-by-3 int32s as tile_points!") + if not _is_contiguous_vec_like_array(tile_points, vec_length=3, scalar_types=(float32, int32)): + raise RuntimeError( + "tile_points must be contiguous and either a 1D warp array of vec3f or vec3i or a 2D n-by-3 array of int32 or float32." + ) if not tile_points.device.is_cuda: - tile_points = array(tile_points, dtype=tile_points.dtype, device=device) + tile_points = tile_points.to(device) volume = cls(data=None) volume.device = device - in_world_space = tile_points.dtype == vec3 - if hasattr(bg_value, "__len__"): + in_world_space = type_scalar_type(tile_points.dtype) == float32 + if bg_value is None: + volume.id = volume.runtime.core.volume_index_from_tiles_device( + volume.device.context, + ctypes.c_void_p(tile_points.ptr), + tile_points.shape[0], + voxel_size, + translation[0], + translation[1], + translation[2], + in_world_space, + ) + elif hasattr(bg_value, "__len__"): volume.id = volume.runtime.core.volume_v_from_tiles_device( volume.device.context, ctypes.c_void_p(tile_points.ptr), @@ -3332,6 +3661,73 @@ def allocate_by_tiles( return volume + @classmethod + def allocate_by_voxels( + cls, voxel_points: array, voxel_size: float, translation=(0.0, 0.0, 0.0), device=None + ) -> Volume: + """Allocate a new Volume with active voxel for each point voxel_points. + + This function creates an *index* Volume, a special kind of volume that does not any store any + explicit payload but encodes a linearized index for each active voxel, allowing to lookup and + sample data from arbitrary external arrays. + + This function is only supported for CUDA devices. + + Args: + voxel_points (:class:`warp.array`): Array of positions that define the voxels to be allocated. + The array may use an integer scalar type (2D N-by-3 array of :class:`warp.int32` or 1D array of `warp.vec3i` values), indicating index space positions, + or a floating point scalar type (2D N-by-3 array of :class:`warp.float32` or 1D array of `warp.vec3f` values), indicating world space positions. + Repeated points per tile are allowed and will be efficiently deduplicated. + voxel_size (float): Voxel size of the new volume. + translation (array-like): Translation between the index and world spaces. + device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0". + + """ + device = warp.get_device(device) + + if voxel_size <= 0.0: + raise RuntimeError(f"Voxel size must be positive! 
Got {voxel_size}") + if not device.is_cuda: + raise RuntimeError("Only CUDA devices are supported for allocate_by_tiles") + if not (is_array(voxel_points) and voxel_points.is_contiguous): + raise RuntimeError("tile_points must be a contiguous array") + if not _is_contiguous_vec_like_array(voxel_points, vec_length=3, scalar_types=(float32, int32)): + raise RuntimeError( + "voxel_points must be contiguous and either a 1D warp array of vec3f or vec3i or a 2D n-by-3 array of int32 or float32." + ) + if not voxel_points.device.is_cuda: + voxel_points = voxel_points.to(device) + + volume = cls(data=None) + volume.device = device + in_world_space = type_scalar_type(voxel_points.dtype) == float32 + + volume.id = volume.runtime.core.volume_from_active_voxels_device( + volume.device.context, + ctypes.c_void_p(voxel_points.ptr), + voxel_points.shape[0], + voxel_size, + translation[0], + translation[1], + translation[2], + in_world_space, + ) + + if volume.id == 0: + raise RuntimeError("Failed to create volume") + + return volume + + +def _is_contiguous_vec_like_array(array, vec_length: int, scalar_types: Tuple[type]) -> bool: + if not (is_array(array) and array.is_contiguous): + return False + if type_scalar_type(array.dtype) not in scalar_types: + return False + return (array.ndim == 1 and type_length(array.dtype) == vec_length) or ( + array.ndim == 2 and array.shape[1] == vec_length and type_length(array.dtype) == 1 + ) + # definition just for kernel type (cannot be a parameter), see mesh.h # NOTE: its layout must match the corresponding struct defined in C.