-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-44446: [C++][Python] Add mask creation helper #44447
base: main
Are you sure you want to change the base?
Changes from 14 commits
2b44595
06fd1d0
9b12391
09249fb
df2a4f2
b29533e
7e45794
3396732
3e886df
e2836ec
117f68b
2934be7
040e6fe
1227f6b
7467b06
b291114
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
|
||
#include "arrow/array.h" | ||
#include "arrow/array/builder_base.h" | ||
#include "arrow/array/builder_primitive.h" | ||
#include "arrow/array/concatenate.h" | ||
#include "arrow/buffer.h" | ||
#include "arrow/buffer_builder.h" | ||
|
@@ -51,6 +52,7 @@ | |
namespace arrow { | ||
|
||
using internal::checked_cast; | ||
using internal::checked_pointer_cast; | ||
|
||
// ---------------------------------------------------------------------- | ||
// Loading from ArrayData | ||
|
@@ -915,6 +917,71 @@ Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type, | |
return builder->Finish(); | ||
} | ||
|
||
// Build a boolean mask with `length` slots that is true exactly at the given
// positions and false everywhere else.
//
// Every entry of `indices` must lie in [0, length); the first entry that does not
// makes the call fail with IndexError. A negative `length` fails with Invalid.
// Listing the same position more than once is harmless (the bit is simply set
// again).
Result<std::shared_ptr<Array>> MakeMaskArray(const std::vector<int64_t>& indices,
                                             int64_t length, MemoryPool* pool) {
  if (length < 0) {
    return Status::Invalid("Mask length must be non-negative, got ", length);
  }

  // AllocateEmptyBitmap hands back a zero-initialized bitmap, so every slot starts
  // out false and the padding bits past `length` are well defined too.
  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateEmptyBitmap(length, pool));
  uint8_t* bitmap = buffer->mutable_data();

  for (const int64_t index : indices) {
    if (index < 0 || index >= length) {
      return Status::IndexError("Index out of bounds: ", index);
    }
    bit_util::SetBit(bitmap, index);
  }
  return std::make_shared<BooleanArray>(length, buffer);
}
|
||
template <typename IndexType> | ||
Result<std::shared_ptr<Array>> MakeMaskArrayImpl( | ||
const std::shared_ptr<NumericArray<IndexType>>& indices, int64_t length, | ||
MemoryPool* pool) { | ||
ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBitmap(length, pool)); | ||
bit_util::SetBitsTo(buffer->mutable_data(), 0, length, false); | ||
amol- marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for (int64_t i = 0; i < indices->length(); ++i) { | ||
int64_t index = indices->Value(i); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The value could be null and nulls must be skipped. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The outer MakeMaskArray function already prevents values from being null, that's why it's not done in the Impl again. |
||
if (index < 0 || index >= length) { | ||
return Status::IndexError("Index out of bounds: ", index); | ||
} | ||
bit_util::SetBit(buffer->mutable_data(), index); | ||
} | ||
return std::make_shared<BooleanArray>(length, buffer); | ||
} | ||
|
||
// Build a boolean mask with `length` slots from an Arrow array of integer
// positions. The input must be one of the (u)int8/16/32/64 array types and must
// not contain nulls; otherwise the call fails with Invalid.
Result<std::shared_ptr<Array>> MakeMaskArray(const std::shared_ptr<Array>& indices,
                                             int64_t length, MemoryPool* pool) {
  // Nulls are rejected up front rather than skipped (this choice was debated in
  // review), which lets the typed implementation read every slot unconditionally.
  const bool has_nulls = indices->null_count() > 0;
  if (has_nulls) {
    return Status::Invalid("Indices array must not contain null values");
  }

  // Forwards an already-downcast indices array to the matching template
  // instantiation of the typed implementation.
  const auto build_mask = [&](const auto& typed_indices) {
    return MakeMaskArrayImpl(typed_indices, length, pool);
  };

  const Type::type index_type_id = indices->type_id();
  switch (index_type_id) {
    case Type::INT8:
      return build_mask(checked_pointer_cast<NumericArray<Int8Type>>(indices));
    case Type::UINT8:
      return build_mask(checked_pointer_cast<NumericArray<UInt8Type>>(indices));
    case Type::INT16:
      return build_mask(checked_pointer_cast<NumericArray<Int16Type>>(indices));
    case Type::UINT16:
      return build_mask(checked_pointer_cast<NumericArray<UInt16Type>>(indices));
    case Type::INT32:
      return build_mask(checked_pointer_cast<NumericArray<Int32Type>>(indices));
    case Type::UINT32:
      return build_mask(checked_pointer_cast<NumericArray<UInt32Type>>(indices));
    case Type::INT64:
      return build_mask(checked_pointer_cast<NumericArray<Int64Type>>(indices));
    case Type::UINT64:
      return build_mask(checked_pointer_cast<NumericArray<UInt64Type>>(indices));
    default:
      break;
  }
  return Status::Invalid("Indices array must be of integer type");
}
|
||
namespace internal { | ||
|
||
std::vector<ArrayVector> RechunkArraysConsistently( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -69,6 +69,33 @@ ARROW_EXPORT | |
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type, | ||
MemoryPool* pool = default_memory_pool()); | ||
|
||
/// \brief Create a boolean Array usable as a selection mask | ||
/// | ||
/// The resulting array has `length` slots. Every slot is false except for | ||
/// the positions listed in `indices`, which are set to true. | ||
/// | ||
/// \param[in] indices positions to set to true; each must be in [0, length) | ||
/// \param[in] length the total number of slots in the mask | ||
/// \param[in] pool the memory pool to allocate memory from | ||
/// \return the resulting Array, or IndexError if an index is out of bounds | ||
ARROW_EXPORT | ||
Result<std::shared_ptr<Array>> MakeMaskArray(const std::vector<int64_t>& indices, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As this function doesn't need to take ownership of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that it would be a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought Arrow was constrained to C++17, isn't span a C++20 addition? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that's why we have a |
||
int64_t length, | ||
MemoryPool* pool = default_memory_pool()); | ||
|
||
/// \brief Create a boolean Array usable as a selection mask | ||
/// | ||
/// The resulting array has `length` slots. Every slot is false except for | ||
/// the positions stored in the `indices` array, which are set to true. | ||
/// | ||
/// \param[in] indices an integer array (int8 to uint64) without nulls holding | ||
/// the positions to set to true; each must be in [0, length) | ||
/// \param[in] length the total number of slots in the mask | ||
/// \param[in] pool the memory pool to allocate memory from | ||
/// \return the resulting Array; Invalid if `indices` contains nulls or is not | ||
/// of integer type, IndexError if an index is out of bounds | ||
ARROW_EXPORT | ||
Result<std::shared_ptr<Array>> MakeMaskArray(const std::shared_ptr<Array>& indices, | ||
int64_t length, | ||
MemoryPool* pool = default_memory_pool()); | ||
/// @} | ||
|
||
namespace internal { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ These functions create new Arrow arrays: | |
|
||
array | ||
nulls | ||
mask | ||
|
||
Array Types | ||
----------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -538,6 +538,51 @@ def repeat(value, size, MemoryPool memory_pool=None): | |
return pyarrow_wrap_array(c_array) | ||
|
||
|
||
def mask(indices, length, MemoryPool memory_pool=None):
    """
    Build a boolean Array that is True at the given positions.

    Parameters
    ----------
    indices : array-like (a sequence, numpy.ndarray, pyarrow.Array) of integers
        Positions to mark as True. Every other position is False.
    length : int
        Total number of entries in the resulting array.
    memory_pool : MemoryPool, default None
        Arrow MemoryPool to use for allocations. Uses the default memory
        pool if not passed.

    Returns
    -------
    arr : Array

    Examples
    --------
    >>> import pyarrow as pa
    >>> pa.mask([1, 3], length=5)
    <pyarrow.lib.BooleanArray object at ...>
    [
      false,
      true,
      false,
      true,
      false
    ]
    """
    cdef:
        CMemoryPool* pool_ptr = maybe_unbox_memory_pool(memory_pool)
        shared_ptr[CArray] index_array = pyarrow_unwrap_array(asarray(indices))
        int64_t n_entries = length
        shared_ptr[CArray] mask_array

    # The C++ helper touches no Python objects, so the GIL can be released
    # while the mask is built.
    with nogil:
        mask_array = GetResultValue(
            MakeMaskArray(index_array, n_entries, pool_ptr)
        )

    return pyarrow_wrap_array(mask_array)
|
||
|
||
def infer_type(values, mask=None, from_pandas=False): | ||
""" | ||
Attempt to infer Arrow data type that can hold the passed Python | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This `std::vector` could be a `span<int64_t>` for more flexibility. The selection vector could come from anywhere.