diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index 44ccf9687b774..3c2abd54f661d 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -894,6 +894,42 @@ TEST_F(TestArray, TestConcurrentFillFromScalar) {
   }
 }
 
+TEST_F(TestArray, TestMakeMaskArray) {
+  ASSERT_OK_AND_ASSIGN(auto mask_array_from_vector, MakeMaskArray({5, 8}, 10));
+  ASSERT_OK(mask_array_from_vector->ValidateFull());
+  ASSERT_EQ(mask_array_from_vector->length(), 10);
+
+  // Only values at index 5 and 8 should be true.
+  auto expected = ArrayFromJSON(
+      boolean(), "[false, false, false, false, false, true, false, false, true, false]");
+  AssertArraysEqual(*mask_array_from_vector, *expected);
+
+  auto array_indices = ArrayFromJSON(int64(), "[5, 8]");
+  ASSERT_OK_AND_ASSIGN(auto mask_array_from_array, MakeMaskArray(array_indices, 10));
+  ASSERT_OK(mask_array_from_array->ValidateFull());
+  ASSERT_EQ(mask_array_from_array->length(), 10);
+  AssertArraysEqual(*mask_array_from_array, *expected);
+
+  // Integer index types other than int64 go through the same dispatch.
+  ASSERT_OK_AND_ASSIGN(auto mask_array_from_uint8,
+                       MakeMaskArray(ArrayFromJSON(uint8(), "[5, 8]"), 10));
+  AssertArraysEqual(*mask_array_from_uint8, *expected);
+
+  // No indices at all yields an all-false mask.
+  ASSERT_OK_AND_ASSIGN(auto empty_mask, MakeMaskArray(std::vector<int64_t>{}, 3));
+  AssertArraysEqual(*empty_mask, *ArrayFromJSON(boolean(), "[false, false, false]"));
+
+  // Test out of bounds indices (too large as well as negative)
+  ASSERT_RAISES(IndexError, MakeMaskArray({5, 10}, 8));
+  ASSERT_RAISES(IndexError, MakeMaskArray(ArrayFromJSON(int64(), "[5, 10]"), 8));
+  ASSERT_RAISES(IndexError, MakeMaskArray(std::vector<int64_t>{-1, 5}, 8));
+  ASSERT_RAISES(IndexError, MakeMaskArray(ArrayFromJSON(int64(), "[-1, 5]"), 8));
+
+  // Indices containing nulls or of a non-integer type are invalid
+  ASSERT_RAISES(Invalid, MakeMaskArray(ArrayFromJSON(int64(), "[5, null]"), 8));
+  ASSERT_RAISES(Invalid, MakeMaskArray(ArrayFromJSON(float64(), "[5.0]"), 8));
+}
+
 TEST_F(TestArray, ExtensionSpanRoundTrip) {
   // Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't
   // work for extension types so we check that here
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc
index 51c27b2d9719f..b036a08aff634 100644
--- a/cpp/src/arrow/array/util.cc
+++ b/cpp/src/arrow/array/util.cc
@@ -29,6 +29,7 @@
 
 #include "arrow/array.h"
 #include "arrow/array/builder_base.h"
+#include "arrow/array/builder_primitive.h"
 #include "arrow/array/concatenate.h"
 #include "arrow/buffer.h"
 #include "arrow/buffer_builder.h"
@@ -51,6 +52,7 @@
 namespace arrow {
 
 using internal::checked_cast;
+using internal::checked_pointer_cast;
 
 // ----------------------------------------------------------------------
 // Loading from ArrayData
@@ -915,6 
+917,104 @@ Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
   return builder->Finish();
 }
 
+namespace {
+
+// Allocate the zero-initialized bitmap backing a mask of `length` entries.
+//
+// A negative length is rejected here, as it would otherwise end up as the
+// (invalid) length of the resulting BooleanArray.
+Result<std::shared_ptr<Buffer>> AllocateMaskBitmap(int64_t length, MemoryPool* pool) {
+  if (length < 0) {
+    return Status::Invalid("Mask length must be non-negative, got ", length);
+  }
+  return AllocateEmptyBitmap(length, pool);
+}
+
+// Set bit `index` of `bitmap`, or fail with IndexError unless
+// 0 <= index < length. `length` must be non-negative.
+template <typename CType>
+Status SetMaskBit(CType index, int64_t length, uint8_t* bitmap) {
+  // Widen according to signedness, so that 8-bit indices are formatted as
+  // numbers rather than characters and uint64 indices above INT64_MAX do not
+  // wrap around to negative values.
+  using Wide = std::conditional_t<std::is_signed_v<CType>, int64_t, uint64_t>;
+  const auto wide_index = static_cast<Wide>(index);
+  // Since length >= 0, a single unsigned comparison rejects negative as well
+  // as too large indices.
+  if (static_cast<uint64_t>(wide_index) >= static_cast<uint64_t>(length)) {
+    return Status::IndexError("Index out of bounds: ", wide_index);
+  }
+  bit_util::SetBit(bitmap, static_cast<int64_t>(wide_index));
+  return Status::OK();
+}
+
+// Build the mask from a null-free integer array of known concrete type.
+template <typename IndexType>
+Result<std::shared_ptr<Array>> MakeMaskArrayImpl(
+    const std::shared_ptr<NumericArray<IndexType>>& indices, int64_t length,
+    MemoryPool* pool) {
+  ARROW_ASSIGN_OR_RAISE(auto bitmap, AllocateMaskBitmap(length, pool));
+  for (int64_t i = 0; i < indices->length(); ++i) {
+    ARROW_RETURN_NOT_OK(SetMaskBit(indices->Value(i), length, bitmap->mutable_data()));
+  }
+  return std::make_shared<BooleanArray>(length, bitmap);
+}
+
+}  // namespace
+
+Result<std::shared_ptr<Array>> MakeMaskArray(const std::vector<int64_t>& indices,
+                                             int64_t length, MemoryPool* pool) {
+  ARROW_ASSIGN_OR_RAISE(auto bitmap, AllocateMaskBitmap(length, pool));
+  for (const int64_t index : indices) {
+    ARROW_RETURN_NOT_OK(SetMaskBit(index, length, bitmap->mutable_data()));
+  }
+  return std::make_shared<BooleanArray>(length, bitmap);
+}
+
+Result<std::shared_ptr<Array>> MakeMaskArray(const std::shared_ptr<Array>& indices,
+                                             int64_t length, MemoryPool* pool) {
+  // An empty shared_ptr cannot be dereferenced: report it as an error.
+  if (indices == nullptr) {
+    return Status::Invalid("Indices array must not be null");
+  }
+  if (indices->null_count() > 0) {
+    return Status::Invalid("Indices array must not contain null values");
+  }
+
+  switch (indices->type_id()) {
+    case Type::NA:
+      // A null-typed array without nulls is necessarily empty (e.g. the
+      // conversion of an empty Python list): no bit needs to be set.
+      return MakeMaskArray(std::vector<int64_t>{}, length, pool);
+    case Type::INT8:
+      return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<Int8Type>>(indices),
+                               length, pool);
+    case Type::UINT8:
+      return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<UInt8Type>>(indices),
+                               length, pool);
+    case Type::INT16:
+      return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<Int16Type>>(indices),
+                               length, pool);
+    case Type::UINT16:
+      return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<UInt16Type>>(indices),
+                               length, pool);
+    case Type::INT32:
+      return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<Int32Type>>(indices),
+                               length, pool);
+    case Type::UINT32:
+      return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<UInt32Type>>(indices),
+                               length, pool);
+    case Type::INT64:
+      return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<Int64Type>>(indices),
+                               length, pool);
+    case Type::UINT64:
+      return MakeMaskArrayImpl(checked_pointer_cast<NumericArray<UInt64Type>>(indices),
+                               length, pool);
+    default:
+      return Status::Invalid("Indices array must be of integer type");
+  }
+}
+
 namespace internal {
 
 std::vector<ArrayVector> RechunkArraysConsistently(
diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h
index fd8e75ddb8640..bbe5b3e18463e 100644
--- a/cpp/src/arrow/array/util.h
+++ b/cpp/src/arrow/array/util.h
@@ -69,6 +69,38 @@ ARROW_EXPORT
 Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
                                               MemoryPool* pool = default_memory_pool());
 
+/// \brief Create an Array representing a boolean mask
+///
+/// The mask will have all elements set to false except for those
+/// indices specified in the indices vector.
+///
+/// \param[in] indices Which indices in the mask should be set to true
+/// \param[in] length The total length of the mask, must not be negative
+/// \param[in] pool the memory pool to allocate memory from
+/// \return the resulting Array, IndexError if an index lies outside of
+/// [0, length), or Invalid if length is negative
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeMaskArray(const std::vector<int64_t>& indices,
+                                             int64_t length,
+                                             MemoryPool* pool = default_memory_pool());
+
+/// \brief Create an Array representing a boolean mask
+///
+/// The mask will have all elements set to false except for those
+/// indices specified in the indices array.
+///
+/// \param[in] indices Which indices in the mask should be set to true.
+/// Must be an integer array (of any width or signedness) without nulls;
+/// an empty array of null type is accepted as well
+/// \param[in] length The total length of the mask, must not be negative
+/// \param[in] pool the memory pool to allocate memory from
+/// \return the resulting Array, IndexError if an index lies outside of
+/// [0, length), or Invalid if indices is a null pointer, contains nulls
+/// or is not of integer type, or if length is negative
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeMaskArray(const std::shared_ptr<Array>& indices,
+                                             int64_t length,
+                                             MemoryPool* pool = default_memory_pool());
 /// @}
 
 namespace internal {
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index 4ad35b190cdd0..121b563d7cce5 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -31,6 +31,7 @@ These functions create new Arrow arrays:
 
    array
    nulls
+   mask
 
 Array Types
 -----------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 8c8c09265d0bf..c0e94377f1e23 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -193,7 +193,7 @@ def print_entry(label, value):
                          schema,
                          unify_schemas,
                          Array, Tensor,
-                         array, chunked_array, record_batch, nulls, repeat,
+                         array, chunked_array, record_batch, nulls, repeat, mask,
                          SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
                          SparseCSFTensor,
                          infer_type, from_numpy_dtype,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 8bddc34e1000b..fe42ad58101cd 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -538,6 +538,72 @@ def repeat(value, size, MemoryPool memory_pool=None):
     return pyarrow_wrap_array(c_array)
 
 
+def mask(indices, length, MemoryPool memory_pool=None):
+    """
+    Create a boolean Array instance where specific indices are marked as True.
+
+    Parameters
+    ----------
+    indices : array-like (a sequence, numpy.ndarray, pyarrow.Array) of integers
+        The indices that have to be marked as True.
+        All other indices will be False.
+    length : int
+        How many entries the array should have total.
+    memory_pool : MemoryPool, default None
+        Arrow MemoryPool to use for allocations. Uses the default memory
+        pool if not passed.
+
+    Returns
+    -------
+    arr : Array
+
+    Raises
+    ------
+    TypeError
+        If ``indices`` cannot be converted to a single pyarrow.Array
+        (for example when a ChunkedArray is passed).
+    IndexError
+        If an index is negative or not smaller than ``length``.
+    ArrowInvalid
+        If ``indices`` contains nulls or is not of integer type, or if
+        ``length`` is negative.
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> pa.mask([1, 3], length=5)
+    <pyarrow.lib.BooleanArray object at ...>
+    [
+      false,
+      true,
+      false,
+      true,
+      false
+    ]
+    """
+    cdef:
+        CMemoryPool* c_pool = maybe_unbox_memory_pool(memory_pool)
+        shared_ptr[CArray] c_indices
+        shared_ptr[CArray] c_array
+        int64_t c_length = length
+
+    indices = asarray(indices)
+    if not isinstance(indices, Array):
+        # Only a plain Array can be unwrapped into a valid C++ pointer;
+        # anything else (e.g. a ChunkedArray) must not reach the C++ layer.
+        raise TypeError(
+            "indices must be convertible to a pyarrow.Array, got "
+            + indices.__class__.__name__)
+    c_indices = pyarrow_unwrap_array(indices)
+
+    with nogil:
+        c_array = GetResultValue(
+            MakeMaskArray(c_indices, c_length, c_pool)
+        )
+
+    return pyarrow_wrap_array(c_array)
+
+
 def infer_type(values, mask=None, from_pandas=False):
     """
     Attempt to infer Arrow data type that can hold the passed Python
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8bf61b73cc211..ed0616580bcda 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -246,6 +246,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     CResult[shared_ptr[CArray]] MakeArrayFromScalar(
         const CScalar& scalar, int64_t length, CMemoryPool* pool)
 
+    CResult[shared_ptr[CArray]] MakeMaskArray(
+        const vector[int64_t]&, int64_t length, CMemoryPool* pool)
+
+    CResult[shared_ptr[CArray]] MakeMaskArray(
+        const shared_ptr[CArray]&, int64_t length, CMemoryPool* pool)
+
     CStatus DebugPrint(const CArray& arr, int indent)
 
     cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType):
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 885442b079c5b..d3bdcb6f5088d 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -4228,3 +4228,25 @@ def test_non_cpu_array():
         arr.tolist()
     with pytest.raises(NotImplementedError):
         arr.validate(full=True)
+
+
+def test_mask_array():
+    expected = pa.array([False, False, True, False, True, False])
+    mask_array = pa.mask([2, 4], 6)
+    assert mask_array.equals(expected)
+
+    mask_array = pa.mask(pa.array([2, 4]), 6)
+    assert mask_array.equals(expected)
+
+    # An empty selection produces an all-false mask.
+    assert pa.mask([], 3).equals(pa.array([False, False, False]))
+
+    # Out-of-bounds indices, nulls and negative lengths are rejected.
+    with pytest.raises(IndexError):
+        pa.mask([2, 6], 6)
+    with pytest.raises(IndexError):
+        pa.mask([-1], 6)
+    with pytest.raises(pa.ArrowInvalid):
+        pa.mask(pa.array([2, None]), 6)
+    with pytest.raises(pa.ArrowInvalid):
+        pa.mask([2, 4], -1)