Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-44446: [C++][Python] Add mask creation helper #44447

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
21 changes: 21 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,27 @@ TEST_F(TestArray, TestConcurrentFillFromScalar) {
}
}

TEST_F(TestArray, TestMakeMaskArray) {
ASSERT_OK_AND_ASSIGN(auto mask_array_from_vector, MakeMaskArray({5, 8}, 10));
ASSERT_OK(mask_array_from_vector->ValidateFull());
ASSERT_EQ(mask_array_from_vector->length(), 10);

// Only values at index 5 and 8 should be true.
auto expected = ArrayFromJSON(
boolean(), "[false, false, false, false, false, true, false, false, true, false]");
AssertArraysEqual(*mask_array_from_vector, *expected);

auto array_indices = ArrayFromJSON(int64(), "[5, 8]");
ASSERT_OK_AND_ASSIGN(auto mask_array_from_array, MakeMaskArray(array_indices, 10));
ASSERT_OK(mask_array_from_array->ValidateFull());
ASSERT_EQ(mask_array_from_array->length(), 10);
AssertArraysEqual(*mask_array_from_array, *expected);

// Test out of bounds indices
ASSERT_RAISES(IndexError, MakeMaskArray({5, 10}, 8));
ASSERT_RAISES(IndexError, MakeMaskArray(ArrayFromJSON(int64(), "[5, 10]"), 8));
}

TEST_F(TestArray, ExtensionSpanRoundTrip) {
// Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't
// work for extension types so we check that here
Expand Down
67 changes: 67 additions & 0 deletions cpp/src/arrow/array/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include "arrow/array.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/array/concatenate.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
Expand All @@ -51,6 +52,7 @@
namespace arrow {

using internal::checked_cast;
using internal::checked_pointer_cast;

// ----------------------------------------------------------------------
// Loading from ArrayData
Expand Down Expand Up @@ -915,6 +917,71 @@ Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
return builder->Finish();
}

Result<std::shared_ptr<Array>> MakeMaskArray(const std::vector<int64_t>& indices,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This std::vector could be a span<int64_t> for more flexibility. The selection vector could come from anywhere.

int64_t length, MemoryPool* pool) {
ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBitmap(length, pool));
bit_util::SetBitsTo(buffer->mutable_data(), 0, length, false);
amol- marked this conversation as resolved.
Show resolved Hide resolved
for (int64_t index : indices) {
if (index < 0 || index >= length) {
return Status::IndexError("Index out of bounds: ", index);
}
bit_util::SetBit(buffer->mutable_data(), index);
}
return std::make_shared<BooleanArray>(length, buffer);
}

template <typename IndexType>
Result<std::shared_ptr<Array>> MakeMaskArrayImpl(
const std::shared_ptr<NumericArray<IndexType>>& indices, int64_t length,
MemoryPool* pool) {
ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBitmap(length, pool));
bit_util::SetBitsTo(buffer->mutable_data(), 0, length, false);
amol- marked this conversation as resolved.
Show resolved Hide resolved
for (int64_t i = 0; i < indices->length(); ++i) {
int64_t index = indices->Value(i);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The value could be null and nulls must be skipped.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The outer MakeMaskArray function already prevents values from being null, that's why it's not done in the Impl again.

if (index < 0 || index >= length) {
return Status::IndexError("Index out of bounds: ", index);
}
bit_util::SetBit(buffer->mutable_data(), index);
}
return std::make_shared<BooleanArray>(length, buffer);
}

Result<std::shared_ptr<Array>> MakeMaskArray(const std::shared_ptr<Array>& indices,
int64_t length, MemoryPool* pool) {
if (indices->null_count() > 0) {
return Status::Invalid("Indices array must not contain null values");
}
Comment on lines +949 to +951
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. If it takes an Arrow array of indices it should be able to handle nulls. The loop can be specialized based on the result of indices->MayHaveNulls() so the common case doesn't have to check the validity bitmap of every iteration.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see a reason why it would make sense to accept null values, it verifies if there are null values and rejects the array in case there are because there doesn't seem to be a case where it would make sense to have nulls in an array of indices.
If the user has an array containing nulls it can be used by passing it to compute.drop_null before using MakeMaskArray.


switch (indices->type_id()) {
case Type::INT8:
return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<Int8Type>>(indices),
length, pool);
case Type::UINT8:
return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<UInt8Type>>(indices),
length, pool);
case Type::INT16:
return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<Int16Type>>(indices),
length, pool);
case Type::UINT16:
return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<UInt16Type>>(indices),
length, pool);
case Type::INT32:
return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<Int32Type>>(indices),
length, pool);
case Type::UINT32:
return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<UInt32Type>>(indices),
length, pool);
case Type::INT64:
return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<Int64Type>>(indices),
length, pool);
case Type::UINT64:
return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<UInt64Type>>(indices),
length, pool);
default:
return Status::Invalid("Indices array must be of integer type");
}
}

namespace internal {

std::vector<ArrayVector> RechunkArraysConsistently(
Expand Down
27 changes: 27 additions & 0 deletions cpp/src/arrow/array/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,33 @@ ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
MemoryPool* pool = default_memory_pool());

/// \brief Create an Array representing a boolean mask
///
/// The mask will have all elements set to false except for those
/// indices specified in the indices vector.
///
/// \param[in] indices Which indices in the mask should be set to true
/// \param[in] length The total length of the mask
/// \param[in] pool the memory pool to allocate memory from
/// \return the resulting Array
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeMaskArray(const std::vector<int64_t>& indices,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this function doesn't need to take ownership of indices, it can accept a span<int64_t> instead of a std::vector<int64_t> & so both vectors or any contiguous buffer if int64_t can be passed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that it would be a span<const int64_t> then :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought Arrow was constrained to C++17, isn't span a C++20 addition?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's why we have a util::span backport.

int64_t length,
MemoryPool* pool = default_memory_pool());

/// \brief Create an Array representing a boolean mask
///
/// The mask will have all elements set to false except for those
/// indices specified in the indices vector.
///
/// \param[in] indices Which indices in the mask should be set to true
/// \param[in] length The total length of the mask
/// \param[in] pool the memory pool to allocate memory from
/// \return the resulting Array
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeMaskArray(const std::shared_ptr<Array>& indices,
int64_t length,
MemoryPool* pool = default_memory_pool());
/// @}

namespace internal {
Expand Down
1 change: 1 addition & 0 deletions docs/source/python/api/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ These functions create new Arrow arrays:

array
nulls
mask

Array Types
-----------
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def print_entry(label, value):
schema,
unify_schemas,
Array, Tensor,
array, chunked_array, record_batch, nulls, repeat,
array, chunked_array, record_batch, nulls, repeat, mask,
SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
SparseCSFTensor,
infer_type, from_numpy_dtype,
Expand Down
45 changes: 45 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,51 @@ def repeat(value, size, MemoryPool memory_pool=None):
return pyarrow_wrap_array(c_array)


def mask(indices, length, MemoryPool memory_pool=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorisvandenbossche Does this API look ok?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Late response, and the general API looks good, my one suggestion would be to use a bit more descriptive name. mask could also be interpret in the active sense ("mask those values"), and eg pandas has a method with that name that work in that sense. Some more explicit options: create_mask, make_mask, mask_from_indices, ..

This is essentially the counterpart for indices_nonzero I think? (which converts a mask into indices)
Could maybe mention that in a "See Also" section in the docstring

"""
Create a boolean Array instance where specific indices are marked as True.

Parameters
----------
indices : array-like (a sequence, numpy.ndarray, pyarrow.Array) of integers
The indices that have to be marked as True.
All other indices will be False.
length : int
How many entries the array should have total.
memory_pool : MemoryPool, default None
Arrow MemoryPool to use for allocations. Uses the default memory
pool if not passed.

Returns
-------
arr : Array

Examples
--------
>>> import pyarrow as pa
>>> pa.mask([1, 3], length=5)
<pyarrow.lib.BooleanArray object at ...>
[
false,
true,
false,
true,
false
]
"""
cdef:
CMemoryPool* c_pool = maybe_unbox_memory_pool(memory_pool)
shared_ptr[CArray] c_indices = pyarrow_unwrap_array(asarray(indices))
int64_t c_length = length

with nogil:
c_array = GetResultValue(
MakeMaskArray(c_indices, c_length, c_pool)
)

return pyarrow_wrap_array(c_array)


def infer_type(values, mask=None, from_pandas=False):
"""
Attempt to infer Arrow data type that can hold the passed Python
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CResult[shared_ptr[CArray]] MakeArrayFromScalar(
const CScalar& scalar, int64_t length, CMemoryPool* pool)

CResult[shared_ptr[CArray]] MakeMaskArray(
const vector[int64_t]&, int64_t length, CMemoryPool* pool)

CResult[shared_ptr[CArray]] MakeMaskArray(
const shared_ptr[CArray]&, int64_t length, CMemoryPool* pool)

CStatus DebugPrint(const CArray& arr, int indent)

cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType):
Expand Down
9 changes: 9 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4228,3 +4228,12 @@ def test_non_cpu_array():
arr.tolist()
with pytest.raises(NotImplementedError):
arr.validate(full=True)


def test_mask_array():
expected = pa.array([False, False, True, False, True, False])
mask_array = pa.mask([2, 4], 6)
assert mask_array.equals(expected)

mask_array = pa.mask(pa.array([2, 4]), 6)
assert mask_array.equals(expected)
Loading