diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.cc b/cpp/src/arrow/extension/fixed_shape_tensor.cc
index c271ef870d458..9e02d4ab38c83 100644
--- a/cpp/src/arrow/extension/fixed_shape_tensor.cc
+++ b/cpp/src/arrow/extension/fixed_shape_tensor.cc
@@ -39,14 +39,15 @@ namespace arrow {
 
 namespace internal {
 
-Status ComputeStrides(const FixedWidthType& type, const std::vector<int64_t>& shape,
+Status ComputeStrides(const std::shared_ptr<DataType>& value_type,
+                      const std::vector<int64_t>& shape,
                       const std::vector<int64_t>& permutation,
                       std::vector<int64_t>* strides) {
+  auto fixed_width_type = internal::checked_pointer_cast<FixedWidthType>(value_type);
   if (permutation.empty()) {
-    return internal::ComputeRowMajorStrides(type, shape, strides);
+    return internal::ComputeRowMajorStrides(*fixed_width_type.get(), shape, strides);
   }
-
-  const int byte_width = type.byte_width();
+  const int byte_width = value_type->byte_width();
 
   int64_t remaining = 0;
   if (!shape.empty() && shape.front() > 0) {
@@ -319,13 +320,12 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const {
   permutation.insert(permutation.begin(), 1, 0);
 
   std::vector<int64_t> tensor_strides;
-  auto value_type = internal::checked_pointer_cast<FixedWidthType>(ext_arr->value_type());
+  std::shared_ptr<DataType> type = ext_arr->value_type();
   ARROW_RETURN_NOT_OK(
-      internal::ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides));
+      internal::ComputeStrides(type, shape, permutation, &tensor_strides));
   ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten());
-  ARROW_ASSIGN_OR_RAISE(
-      auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape,
-                                tensor_strides, dim_names));
+  ARROW_ASSIGN_OR_RAISE(auto tensor, Tensor::Make(type, buffers->data()->buffers[1],
+                                                  shape, tensor_strides, dim_names));
   return tensor;
 }
 
@@ -348,9 +348,8 @@ Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make(
 
 const std::vector<int64_t>& FixedShapeTensorType::strides() {
   if (strides_.empty()) {
-    auto value_type = internal::checked_pointer_cast<FixedWidthType>(this->value_type_);
     std::vector<int64_t> tensor_strides;
-    ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), this->shape(),
+    ARROW_CHECK_OK(internal::ComputeStrides(this->value_type_, this->shape(),
                                             this->permutation(), &tensor_strides));
     strides_ = tensor_strides;
   }
diff --git a/cpp/src/arrow/extension/fixed_shape_tensor.h b/cpp/src/arrow/extension/fixed_shape_tensor.h
index 8bb5e7f37e7ad..6556675e87695 100644
--- a/cpp/src/arrow/extension/fixed_shape_tensor.h
+++ b/cpp/src/arrow/extension/fixed_shape_tensor.h
@@ -23,7 +23,8 @@ namespace arrow {
 namespace internal {
 
 ARROW_EXPORT
-Status ComputeStrides(const FixedWidthType& type, const std::vector<int64_t>& shape,
+Status ComputeStrides(const std::shared_ptr<DataType>& value_type,
+                      const std::vector<int64_t>& shape,
                       const std::vector<int64_t>& permutation,
                       std::vector<int64_t>* strides);
diff --git a/cpp/src/arrow/extension/tensor_extension_array_test.cc b/cpp/src/arrow/extension/tensor_extension_array_test.cc
index fa26a877be23c..ffb554318b10a 100644
--- a/cpp/src/arrow/extension/tensor_extension_array_test.cc
+++ b/cpp/src/arrow/extension/tensor_extension_array_test.cc
@@ -641,6 +641,11 @@ TEST_F(TestVariableShapeTensorType, ComputeStrides) {
   ASSERT_EQ(t->shape(), (std::vector<int64_t>{2, 3, 1}));
   ASSERT_EQ(t->strides(), (std::vector<int64_t>{24, 8, 8}));
 
+  ASSERT_OK_AND_ASSIGN(auto sc, ext_array->GetScalar(0));
+
+  auto vt = internal::checked_pointer_cast<VariableShapeTensorType>(sc->type);
+  auto it = vt->value_type();
+
   std::vector<int64_t> shape = {2, 3, 1};
   std::vector<int64_t> strides = {sizeof(int64_t) * 3, sizeof(int64_t) * 1,
                                   sizeof(int64_t) * 1};
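Note (not part of the patch): in the row-major case that `ComputeStrides` falls back to when the permutation is empty, each stride is the element byte width multiplied by the product of the trailing dimensions. A minimal Python sketch of that arithmetic, illustrative only and not pyarrow API, reproducing the `{24, 8, 8}` strides asserted in the test above for an int64 tensor of shape `{2, 3, 1}`:

```python
# Illustrative sketch of row-major byte strides (empty-permutation case).
# Not pyarrow API; byte_width=8 corresponds to the int64 value type in the test.
def row_major_strides(shape, byte_width):
    strides = []
    stride = byte_width
    for dim in reversed(shape):
        strides.append(stride)  # stride for this dimension
        stride *= dim           # accumulate the product of trailing dimensions
    return list(reversed(strides))

assert row_major_strides([2, 3, 1], byte_width=8) == [24, 8, 8]
```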
diff --git a/cpp/src/arrow/extension/variable_shape_tensor.cc b/cpp/src/arrow/extension/variable_shape_tensor.cc
index 9c303bc1033fc..c87dd4036ef08 100644
--- a/cpp/src/arrow/extension/variable_shape_tensor.cc
+++ b/cpp/src/arrow/extension/variable_shape_tensor.cc
@@ -41,8 +41,7 @@ const Result<std::shared_ptr<Tensor>> VariableShapeTensorArray::GetTensor(
     const int64_t i) const {
   auto ext_arr = internal::checked_pointer_cast<StructArray>(this->storage());
   auto ext_type = internal::checked_pointer_cast<VariableShapeTensorType>(this->type());
-  auto value_type =
-      internal::checked_pointer_cast<FixedWidthType>(ext_type->value_type());
+  auto value_type = ext_type->value_type();
   auto ndim = ext_type->ndim();
   auto dim_names = ext_type->dim_names();
   auto shapes =
@@ -57,16 +56,16 @@ const Result<std::shared_ptr<Tensor>> VariableShapeTensorArray::GetTensor(
 
   std::vector<int64_t> strides;
   // TODO: optimize ComputeStrides for non-uniform tensors
-  ARROW_CHECK_OK(internal::ComputeStrides(*value_type.get(), shape,
-                                          ext_type->permutation(), &strides));
+  ARROW_CHECK_OK(
+      internal::ComputeStrides(value_type, shape, ext_type->permutation(), &strides));
   auto list_arr =
       std::static_pointer_cast<ListArray>(ext_arr->field(1))->value_slice(i)->data();
-  auto bw = value_type->byte_width();
-  auto buffer =
-      SliceBuffer(list_arr->buffers[1], list_arr->offset * bw, list_arr->length * bw);
+  auto byte_width = value_type->byte_width();
+  auto buffer = SliceBuffer(list_arr->buffers[1], list_arr->offset * byte_width,
+                            list_arr->length * byte_width);
 
-  return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names);
+  return Tensor::Make(value_type, buffer, shape, strides, dim_names);
 }
 
 bool VariableShapeTensorType::ExtensionEquals(const ExtensionType& other) const {
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 9ef531be51cda..b53ab72d95e61 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -952,6 +952,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         void set_chunksize(int64_t chunksize)
 
     cdef cppclass CTensor" arrow::Tensor":
+        CTensor(const shared_ptr[CDataType]& type,
+                const shared_ptr[CBuffer]& data,
+                const vector[int64_t]& shape,
+                const vector[int64_t]& strides,
+                const vector[c_string]& dim_names)
         shared_ptr[CDataType] type()
         shared_ptr[CBuffer] data()
 
@@ -2691,6 +2696,11 @@ cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extens
         const vector[int64_t] permutation()
         const vector[c_string] dim_names()
 
+cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::internal" nogil:
+    cdef CStatus ComputeStrides(const shared_ptr[CDataType]& value_type,
+                                const vector[int64_t]& shape,
+                                const vector[int64_t]& permutation,
+                                vector[int64_t]* strides)
 
 cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
     cdef enum CCompressionType" arrow::Compression::type":
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 7e4a3bddbe300..f9fb5c66d4de3 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1027,7 +1027,7 @@ cdef class ExtensionScalar(Scalar):
         return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)
 
 
-class VariableShapeTensorScalar(ExtensionScalar):
+cdef class VariableShapeTensorScalar(ExtensionScalar):
    """
    Concrete class for variable shape tensor extension scalar.
    """
@@ -1047,6 +1047,32 @@ class VariableShapeTensorScalar(ExtensionScalar):
            raise ValueError(
                'Only non-permuted tensors can be converted to numpy tensors.')

+    def to_tensor(self):
+        """
+        Convert variable shape tensor extension scalar to a pyarrow.Tensor.
+ """ + cdef: + shared_ptr[CTensor] ctensor + vector[int64_t] strides + vector[c_string] dim_names + + shared_ptr[CVariableShapeTensorType] typ = static_pointer_cast[CVariableShapeTensorType, CDataType]( + self.wrapped.get().type) + + shared_ptr[CDataType] ty = typ.get().value_type() + # TODO: this accesses the full buffer instead of a slice + shared_ptr[CBuffer] data = pyarrow_unwrap_buffer(self.value[1].values.buffers()[1]) + vector[int64_t] shape = self.value[0].values.to_pylist() + vector[int64_t] permutation = self.type.permutation + + for name in self.type.dim_names: + dim_names.push_back(tobytes(name)) + + check_status(ComputeStrides(ty, shape, permutation, &strides)) + ctensor = make_shared[CTensor](ty, data, shape, strides, dim_names) + + return pyarrow_wrap_tensor(ctensor) + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index cf8e9c4774b30..1fe410a235527 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1439,6 +1439,13 @@ def test_variable_shape_tensor_class_methods(value_type): np.testing.assert_array_equal(arr[0].to_numpy_ndarray(), expected_0) np.testing.assert_array_equal(arr[1].to_numpy_ndarray(), expected_1) + assert arr[0].to_tensor().equals( + pa.Tensor.from_numpy(expected_0, dim_names=["H", "W"])) + + # TODO: due to wrong offset this would return [[1], [2]] instead of [[7], [8]] + assert arr[1].to_tensor().equals( + pa.Tensor.from_numpy(expected_1, dim_names=["H", "W"])) + @pytest.mark.parametrize("tensor_type", ( pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]),