[c++/python] SOMAGeometryDataFrame basic write #3687

Open · wants to merge 14 commits into base: main
1 change: 1 addition & 0 deletions apis/python/setup.py
@@ -326,6 +326,7 @@ def run(self):
"src/tiledbsoma/soma_group.cc",
"src/tiledbsoma/soma_collection.cc",
"src/tiledbsoma/managed_query.cc",
"src/tiledbsoma/transformer.cc",
"src/tiledbsoma/pytiledbsoma.cc",
],
include_dirs=INC_DIRS,
100 changes: 95 additions & 5 deletions apis/python/src/tiledbsoma/_geometry_dataframe.py
@@ -8,7 +8,7 @@
from __future__ import annotations

import warnings
from typing import Any, Sequence, Tuple, Union
from typing import Any, Sequence, Tuple, Union, cast

import pyarrow as pa
import somacore
@@ -17,7 +17,10 @@

from tiledbsoma._tdb_handles import GeometryDataFrameWrapper
from tiledbsoma.options._soma_tiledb_context import _validate_soma_tiledb_context
from tiledbsoma.options._tiledb_create_write_options import TileDBCreateOptions
from tiledbsoma.options._tiledb_create_write_options import (
TileDBCreateOptions,
TileDBWriteOptions,
)

from . import _arrow_types, _util
from . import pytiledbsoma as clib
@@ -35,7 +38,7 @@
_revise_domain_for_extent,
)
from ._exception import SOMAError, map_exception_for_create
from ._read_iters import TableReadIter
from ._read_iters import ManagedQuery, TableReadIter
from ._spatial_dataframe import SpatialDataFrame
from ._spatial_util import (
coordinate_space_from_json,
@@ -311,6 +314,17 @@

# Data operations

def __len__(self) -> int:
[Review comment] This is very convenient! But (AFAIK) not implemented for the other dataframe classes. Might be worth doing for the other dataframe classes (as a separate PR).

"""Returns the number of rows in the geometry dataframe."""
return self.count


@property
def count(self) -> int:
"""Returns the number of rows in the geometry dataframe."""
self._check_open_read()

# If open in read mode, the handle is a GeometryDataFrameWrapper.
return cast(GeometryDataFrameWrapper, self._handle).count


def read(
self,
coords: options.SparseDFCoords = (),
@@ -343,7 +357,19 @@
Lifecycle:
Experimental.
"""
raise NotImplementedError()
del batch_size # Currently unused.
_util.check_unpartitioned(partitions)
self._check_open_read()

# TODO: batch_size
return TableReadIter(
array=self,
coords=coords,
column_names=column_names,
result_order=_util.to_clib_result_order(result_order),
value_filter=value_filter,
platform_config=platform_config,
)

def read_spatial_region(
self,
@@ -414,7 +440,71 @@
Lifecycle:
Experimental.
"""
raise NotImplementedError()
_util.check_type("values", values, (pa.Table,))

write_options: Union[TileDBCreateOptions, TileDBWriteOptions]
sort_coords = None
if isinstance(platform_config, TileDBCreateOptions):
raise ValueError(

"As of TileDB-SOMA 1.13, the write method takes "
"TileDBWriteOptions instead of TileDBCreateOptions"
)
write_options = TileDBWriteOptions.from_platform_config(platform_config)
sort_coords = write_options.sort_coords

clib_dataframe = self._handle._handle

for batch in values.to_batches():
mq = ManagedQuery(self, None)
mq._handle.set_array_data(batch)
mq._handle.submit_write(sort_coords or False)

if write_options.consolidate_and_vacuum:
clib_dataframe.consolidate_and_vacuum()


return self

# Write helpers with automatic transformations

def from_outlines(
[Review comment] Should I know what an 'outline' is?

self,
values: Union[pa.RecordBatch, pa.Table],
*,
platform_config: options.PlatformConfig | None = None,
) -> Self:
"""Writes the data from an Arrow table to the persistent object
applying a data transformation to transform the given outline
of each polygon to the appropriate WKB encoded polygon.

Geometry data provided are expected to be a list of point coordinates
per polygon in the form of [x0, y0, x1, y1, ..., x0, y0] and will
be converted automatically to a list of WKB encoded polygons.

Args:
values: An Arrow table containing all columns, including
the index columns. The schema for the values must match
the schema for the ``DataFrame``. The ``soma_geometry`` column
should contain lists of floating-point numbers giving the
point coordinates of each polygon's outline in the form
[x0, y0, x1, y1, ..., x0, y0].

Returns: ``self``, to enable method chaining.

"""

outline_transformer = clib.OutlineTransformer(
coordinate_space_to_json(self._coord_space)
)

for batch in values.to_batches():
self.write(
clib.TransformerPipeline(batch)
.transform(outline_transformer)
.asTable(),
platform_config=platform_config,
)

return self

# Metadata operations

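Taken together, the changes above give GeometryDataFrame a basic write path. A minimal usage sketch, mirroring the test added later in this PR (the URI and values are illustrative, and len()/count require the dataframe to be opened for reading):

import pyarrow as pa
import tiledbsoma as soma

schema = pa.schema([("quality", pa.float32())])
with soma.GeometryDataFrame.create(
    "file:///tmp/geom-example", schema=schema, domain=[[(-10, 10), (-10, 10)], [0, 100]]
) as geom:
    outlines = pa.Table.from_pydict(
        {
            # Closed outline of a triangle: [x0, y0, x1, y1, ..., x0, y0]
            "soma_geometry": [[0.0, 0, 0, 1, 1, 0, 0, 0]],
            "soma_joinid": [1],
            "quality": [4.1],
        }
    )
    # Converts each outline to a WKB-encoded polygon before writing.
    geom.from_outlines(outlines)

with soma.GeometryDataFrame.open("file:///tmp/geom-example") as geom:
    n_rows = len(geom)              # new __len__ / count
    table = geom.read().concat()    # TableReadIter concatenated into one pa.Table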
2 changes: 2 additions & 0 deletions apis/python/src/tiledbsoma/pytiledbsoma.cc
@@ -31,6 +31,7 @@ void load_query_condition(py::module&);
void load_reindexer(py::module&);
void load_soma_vfs(py::module&);
void load_managed_query(py::module&);
void load_transformers(py::module&);

PYBIND11_MODULE(pytiledbsoma, m) {
py::register_exception<TileDBSOMAError>(m, "SOMAError");
@@ -173,6 +174,7 @@ PYBIND11_MODULE(pytiledbsoma, m) {
load_reindexer(m);
load_soma_vfs(m);
load_managed_query(m);
load_transformers(m);
}

}; // namespace libtiledbsomacpp
88 changes: 88 additions & 0 deletions apis/python/src/tiledbsoma/transformer.cc
@@ -0,0 +1,88 @@
/**
* @file transformer.cc
*
* @section LICENSE
*
* Licensed under the MIT License.
* Copyright (c) TileDB, Inc. and The Chan Zuckerberg Initiative Foundation
*
* @section DESCRIPTION
*
* This file defines the TransformerPipeline, Transformer and derived classes
* bindings.
*/

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <pybind11/stl.h>
#include <pybind11/stl_bind.h>

#include "common.h"

namespace libtiledbsomacpp {

namespace py = pybind11;
using namespace py::literals;
using namespace tiledbsoma;

void load_transformers(py::module& m) {
py::class_<TransformerPipeline>(m, "TransformerPipeline")
.def(py::init([](py::handle py_batch) {
ArrowSchema arrow_schema;
ArrowArray arrow_array;
uintptr_t arrow_schema_ptr = (uintptr_t)(&arrow_schema);
uintptr_t arrow_array_ptr = (uintptr_t)(&arrow_array);
py_batch.attr("_export_to_c")(arrow_array_ptr, arrow_schema_ptr);

auto array = std::make_unique<ArrowArray>(arrow_array);
auto schema = std::make_unique<ArrowSchema>(arrow_schema);

return TransformerPipeline(std::move(array), std::move(schema));
}))
.def(
"transform",
[](TransformerPipeline& pipeline,
std::shared_ptr<Transformer> transformer)
-> TransformerPipeline& {
return pipeline.transform(transformer);
})
.def("asTable", [](TransformerPipeline& pipeline) {
auto pa = py::module::import("pyarrow");
auto pa_table_from_arrays = pa.attr("Table").attr("from_arrays");
auto pa_array_import = pa.attr("Array").attr("_import_from_c");
auto pa_schema_import = pa.attr("Schema").attr("_import_from_c");

auto [array, schema] = pipeline.asTable();

py::list array_list;
py::list names;

for (int64_t i = 0; i < schema->n_children; ++i) {
// Must happen before the pyarrow array construction because the
// py::capsule takes ownership of the memory.
names.append(std::string(schema->children[i]->name));

auto pa_array = pa_array_import(
py::capsule(array->children[i]),
py::capsule(schema->children[i]));

array_list.append(pa_array);
}

return pa_table_from_arrays(array_list, names);
});

py::class_<Transformer, std::shared_ptr<Transformer>>(m, "Transformer");
py::class_<
OutlineTransformer,
Transformer,
std::shared_ptr<OutlineTransformer>>(m, "OutlineTransformer")
.def(py::init([](std::string coord_space) {
auto coordinate_space = SOMACoordinateSpace::from_string(
coord_space);

return std::make_shared<OutlineTransformer>(coordinate_space);
}));
}
} // namespace libtiledbsomacpp
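The bindings above are what GeometryDataFrame.from_outlines drives from the Python side. A minimal sketch of the same chain, assuming tiledbsoma re-exports Axis and CoordinateSpace and that the private helper coordinate_space_to_json keeps its current location (both are assumptions for illustration):

import pyarrow as pa
import tiledbsoma as soma
import tiledbsoma.pytiledbsoma as clib
from tiledbsoma._spatial_util import coordinate_space_to_json

coord_space = soma.CoordinateSpace((soma.Axis(name="x"), soma.Axis(name="y")))
transformer = clib.OutlineTransformer(coordinate_space_to_json(coord_space))

batch = pa.record_batch(
    {"soma_geometry": [[0.0, 0, 0, 1, 1, 0, 0, 0]], "soma_joinid": [1]}
)
# The pipeline consumes the batch through the Arrow C data interface and
# asTable() re-imports the transformed columns as a pyarrow Table.
wkb_table = clib.TransformerPipeline(batch).transform(transformer).asTable()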
37 changes: 37 additions & 0 deletions apis/python/tests/test_geometry_dataframe.py
@@ -1,6 +1,7 @@
import numpy as np
import pyarrow as pa
import pytest
import shapely

import tiledbsoma as soma

@@ -55,3 +56,39 @@ def test_geometry_coordinate_space(tmp_path):
)
assert geom.coordinate_space[0] == soma.Axis(name="x", unit="m")
assert geom.coordinate_space[1] == soma.Axis(name="y", unit="in")


def test_geometry_basic_read(tmp_path):
uri = tmp_path.as_uri()

asch = pa.schema([("quality", pa.float32())])

with soma.GeometryDataFrame.create(
uri, schema=asch, domain=[[(-10, 10), (-10, 10)], [0, 100]]
) as geom:
pydict = {}
pydict["soma_geometry"] = [
[0.0, 0, 0, 1, 1, 0, 0, 0],
[0.0, 0, 0, 1, 1, 1, 1, 0, 0, 0],
]
pydict["soma_joinid"] = [1, 2]
pydict["quality"] = [4.1, 5.2]

rb = pa.Table.from_pydict(pydict)
geom.from_outlines(rb)

with soma.GeometryDataFrame.open(uri) as geom:
result = geom.read().concat()

# Internal columns will be hidden in subsequent PR
assert (result[0].to_numpy() == [0, 0]).all()
assert (result[1].to_numpy() == [0, 0]).all()
assert (result[2].to_numpy() == [1, 1]).all()
assert (result[3].to_numpy() == [1, 1]).all()

assert shapely.from_wkb(result[5].to_numpy()[0]) == shapely.Polygon(
[(0, 0), (0, 1), (1, 0), (0, 0)]
)
assert shapely.from_wkb(result[5].to_numpy()[1]) == shapely.Polygon(
[(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)]
)
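For readers unsure what an "outline" is (see the review comment above): it is simply a polygon's closed exterior ring flattened to [x0, y0, x1, y1, ..., x0, y0]. A small sketch using shapely (already a test dependency here) to derive the first outline in the test above:

import shapely

polygon = shapely.Polygon([(0, 0), (0, 1), (1, 0), (0, 0)])
# exterior.coords is already closed (the first point is repeated at the end).
outline = [coord for point in polygon.exterior.coords for coord in point]
assert outline == [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]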
4 changes: 4 additions & 0 deletions libtiledbsoma/src/CMakeLists.txt
@@ -49,11 +49,13 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT
${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dataframe.cc
${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_dense_ndarray.cc
${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_sparse_ndarray.cc
${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_transformers.cc
${CMAKE_CURRENT_SOURCE_DIR}/soma/array_buffers.cc
${CMAKE_CURRENT_SOURCE_DIR}/soma/column_buffer.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/arrow_adapter.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/logger.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/transformer.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/util.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/version.cc
${CMAKE_CURRENT_SOURCE_DIR}/external/src/thread_pool/thread_pool.cc
@@ -221,6 +223,7 @@ install(FILES
${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_point_cloud_dataframe.h
${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_multiscale_image.h
${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_object.h
${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_transformers.h
DESTINATION "include/tiledbsoma/soma"
)

@@ -240,6 +243,7 @@ install(FILES
${CMAKE_CURRENT_SOURCE_DIR}/utils/fastercsx.h
${CMAKE_CURRENT_SOURCE_DIR}/utils/parallel_functions.h
${CMAKE_CURRENT_SOURCE_DIR}/utils/stats.h
${CMAKE_CURRENT_SOURCE_DIR}/utils/transformer.h
${CMAKE_CURRENT_SOURCE_DIR}/utils/util.h
${CMAKE_CURRENT_SOURCE_DIR}/utils/version.h

10 changes: 8 additions & 2 deletions libtiledbsoma/src/soma/soma_coordinates.cc
@@ -136,8 +136,14 @@ SOMACoordinateSpace SOMACoordinateSpace::from_metadata(
"[SOMACoordinateSpace]: Missing value for coordinate space "
"metadata.");
}
std::string value_str(static_cast<const char*>(value), value_num);
auto value_json = json::parse(value_str);

return SOMACoordinateSpace::from_string(
std::string_view(static_cast<const char*>(value), value_num));
}

SOMACoordinateSpace SOMACoordinateSpace::from_string(
std::string_view metadata) {
auto value_json = json::parse(metadata);
auto axes = value_json.template get<std::vector<SOMAAxis>>();

return SOMACoordinateSpace(axes);
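The from_string refactor above parses the same coordinate-space JSON that the Python helpers produce and consume. A hedged round-trip sketch (assuming tiledbsoma re-exports Axis and CoordinateSpace, and using the private _spatial_util helpers already imported in _geometry_dataframe.py):

import tiledbsoma as soma
from tiledbsoma._spatial_util import (
    coordinate_space_from_json,
    coordinate_space_to_json,
)

cs = soma.CoordinateSpace(
    (soma.Axis(name="x", unit="m"), soma.Axis(name="y", unit="in"))
)
encoded = coordinate_space_to_json(cs)   # JSON string stored as array metadata
decoded = coordinate_space_from_json(encoded)
assert decoded[0] == soma.Axis(name="x", unit="m")
assert decoded[1] == soma.Axis(name="y", unit="in")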
2 changes: 2 additions & 0 deletions libtiledbsoma/src/soma/soma_coordinates.h
@@ -39,6 +39,8 @@ class SOMACoordinateSpace {
static SOMACoordinateSpace from_metadata(
tiledb_datatype_t value_type, uint32_t value_num, const void* value);

static SOMACoordinateSpace from_string(std::string_view metadata);

SOMACoordinateSpace();

SOMACoordinateSpace(const std::vector<SOMAAxis>& axes);