Skip to content

Commit

Permalink
GH-42247: [C++] Support casting to and from utf8_view/binary_view (#4…
Browse files Browse the repository at this point in the history
…3302)

### Rationale for this change

We need casts between string (binary) and string-view (binary-view) types since they are semantically equivalent.

### What changes are included in this PR?

 - Add `is_binary_view_like()` type predicate
 - Add `BinaryViewTypes()` list including `STRING_VIEW/BINARY_VIEW`
 - New cast kernels

### Are these changes tested?

Yes, but test coverage might be improved.

### Are there any user-facing changes?

More casts are available.
* GitHub Issue: #42247

Lead-authored-by: Felipe Oliveira Carvalho <[email protected]>
Co-authored-by: mwish <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
felipecrv and mapleFU authored Sep 12, 2024
1 parent 0c2891d commit 85fc3eb
Show file tree
Hide file tree
Showing 12 changed files with 473 additions and 76 deletions.
19 changes: 18 additions & 1 deletion cpp/src/arrow/compute/kernels/codegen_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ struct GetViewType<Type, enable_if_has_c_type<Type>> {

template <typename Type>
struct GetViewType<Type, enable_if_t<is_base_binary_type<Type>::value ||
is_fixed_size_binary_type<Type>::value>> {
is_fixed_size_binary_type<Type>::value ||
is_binary_view_like_type<Type>::value>> {
using T = std::string_view;
using PhysicalType = T;

Expand Down Expand Up @@ -1265,6 +1266,22 @@ ArrayKernelExec GenerateVarBinary(detail::GetTypeId get_id) {
}
}

// Generate a kernel given a templated functor for binary-view types. Generates a
// single kernel for binary/string-view.
//
// See "Numeric" above for description of the generator functor
template <template <typename...> class Generator, typename Type0, typename... Args>
ArrayKernelExec GenerateVarBinaryViewBase(detail::GetTypeId get_id) {
switch (get_id.id) {
case Type::BINARY_VIEW:
case Type::STRING_VIEW:
return Generator<Type0, BinaryViewType, Args...>::Exec;
default:
DCHECK(false);
return nullptr;
}
}

// Generate a kernel given a templated functor for temporal types
//
// See "Numeric" above for description of the generator functor
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ std::vector<std::shared_ptr<CastFunction>> GetBooleanCasts() {
BooleanType, ParseBooleanString>(*ty);
DCHECK_OK(func->AddKernel(ty->id(), {ty}, boolean(), exec));
}
for (const auto& ty : BinaryViewTypes()) {
ArrayKernelExec exec =
GenerateVarBinaryViewBase<applicator::ScalarUnaryNotNull, BooleanType,
ParseBooleanString>(*ty);
DCHECK_OK(func->AddKernel(ty->id(), {ty}, boolean(), exec));
}
return {func};
}

Expand Down
7 changes: 2 additions & 5 deletions cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,6 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type,
// ----------------------------------------------------------------------

Status UnpackDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
// TODO: is there an implementation more friendly to the "span" data structures?

DictionaryArray dict_arr(batch[0].array.ToArrayData());
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;

Expand Down Expand Up @@ -281,6 +279,8 @@ void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_ty
}

static bool CanCastFromDictionary(Type::type type_id) {
/// TODO(GH-43010): add is_binary_view_like() here once array_take
/// can handle string-views
return (is_primitive(type_id) || is_base_binary_like(type_id) ||
is_fixed_size_binary(type_id));
}
Expand All @@ -297,9 +297,6 @@ void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* fun
// From dictionary to this type
if (CanCastFromDictionary(out_type_id)) {
// Dictionary unpacking not implemented for boolean or nested types.
//
// XXX: Uses Take and does its own memory allocation for the moment. We can
// fix this later.
DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, out_ty,
UnpackDictionary, NullHandling::COMPUTED_NO_PREALLOCATE,
MemAllocation::NO_PREALLOCATE));
Expand Down
24 changes: 21 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,9 @@ struct ParseString {

template <typename O, typename I>
struct CastFunctor<
O, I, enable_if_t<(is_number_type<O>::value && is_base_binary_type<I>::value)>> {
O, I,
enable_if_t<(is_number_type<O>::value && (is_base_binary_type<I>::value ||
is_binary_view_like_type<I>::value))>> {
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
return applicator::ScalarUnaryNotNull<O, I, ParseString<O>>::Exec(ctx, batch, out);
}
Expand Down Expand Up @@ -658,11 +660,15 @@ struct DecimalCastFunctor {
};

template <typename I>
struct CastFunctor<Decimal128Type, I, enable_if_t<is_base_binary_type<I>::value>>
struct CastFunctor<
Decimal128Type, I,
enable_if_t<is_base_binary_type<I>::value || is_binary_view_like_type<I>::value>>
: public DecimalCastFunctor<Decimal128Type, I> {};

template <typename I>
struct CastFunctor<Decimal256Type, I, enable_if_t<is_base_binary_type<I>::value>>
struct CastFunctor<
Decimal256Type, I,
enable_if_t<is_base_binary_type<I>::value || is_binary_view_like_type<I>::value>>
: public DecimalCastFunctor<Decimal256Type, I> {};

// ----------------------------------------------------------------------
Expand Down Expand Up @@ -708,6 +714,10 @@ void AddCommonNumberCasts(const std::shared_ptr<DataType>& out_ty, CastFunction*
auto exec = GenerateVarBinaryBase<CastFunctor, OutType>(*in_ty);
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, exec));
}
for (const std::shared_ptr<DataType>& in_ty : BinaryViewTypes()) {
auto exec = GenerateVarBinaryViewBase<CastFunctor, OutType>(*in_ty);
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty, exec));
}
}

template <typename OutType>
Expand Down Expand Up @@ -793,6 +803,10 @@ std::shared_ptr<CastFunction> GetCastToDecimal128() {
auto exec = GenerateVarBinaryBase<CastFunctor, Decimal128Type>(in_ty->id());
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec)));
}
for (const std::shared_ptr<DataType>& in_ty : BinaryViewTypes()) {
auto exec = GenerateVarBinaryViewBase<CastFunctor, Decimal128Type>(in_ty->id());
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec)));
}

// Cast from other decimal
auto exec = CastFunctor<Decimal128Type, Decimal128Type>::Exec;
Expand Down Expand Up @@ -828,6 +842,10 @@ std::shared_ptr<CastFunction> GetCastToDecimal256() {
auto exec = GenerateVarBinaryBase<CastFunctor, Decimal256Type>(in_ty->id());
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec)));
}
for (const std::shared_ptr<DataType>& in_ty : BinaryViewTypes()) {
auto exec = GenerateVarBinaryViewBase<CastFunctor, Decimal256Type>(in_ty->id());
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec)));
}

// Cast from other decimal
auto exec = CastFunctor<Decimal256Type, Decimal128Type>::Exec;
Expand Down
Loading

0 comments on commit 85fc3eb

Please sign in to comment.