Skip to content

Commit

Permalink
Merge branch 'main' into parquet-using-buffer-as-api
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Jan 5, 2024
2 parents 6afb852 + 04d7984 commit 1763cea
Show file tree
Hide file tree
Showing 81 changed files with 917 additions and 340 deletions.
2 changes: 1 addition & 1 deletion NOTICE.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Apache Arrow
Copyright 2016-2019 The Apache Software Foundation
Copyright 2016-2024 The Apache Software Foundation

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
Expand Down
2 changes: 1 addition & 1 deletion ci/scripts/integration_substrait.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ set -e
echo "Substrait Integration Tests"
echo "Validating imports"
python -c "import pyarrow.substrait"
python -c "from substrait_consumer.consumers import AceroConsumer"
python -c "from substrait_consumer.consumers.acero_consumer import AceroConsumer"

echo "Executing pytest"
cd consumer-testing
Expand Down
2 changes: 1 addition & 1 deletion cpp/cmake_modules/FindLLVMAlt.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ if(LLVM_FOUND)
debuginfodwarf
ipo
linker
mcjit
native
orcjit
target)
if(LLVM_VERSION_MAJOR GREATER_EQUAL 14)
list(APPEND LLVM_TARGET_COMPONENTS passes)
Expand Down
16 changes: 8 additions & 8 deletions cpp/src/arrow/array/array_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -894,7 +894,8 @@ const std::shared_ptr<DataType>& FixedSizeListArray::value_type() const {
const std::shared_ptr<Array>& FixedSizeListArray::values() const { return values_; }

Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
const std::shared_ptr<Array>& values, int32_t list_size) {
const std::shared_ptr<Array>& values, int32_t list_size,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count) {
if (list_size <= 0) {
return Status::Invalid("list_size needs to be a strict positive integer");
}
Expand All @@ -905,14 +906,14 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
}
int64_t length = values->length() / list_size;
auto list_type = std::make_shared<FixedSizeListType>(values->type(), list_size);
std::shared_ptr<Buffer> validity_buf;

return std::make_shared<FixedSizeListArray>(list_type, length, values, validity_buf,
/*null_count=*/0, /*offset=*/0);
return std::make_shared<FixedSizeListArray>(list_type, length, values, null_bitmap,
null_count);
}

Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type) {
const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count) {
if (type->id() != Type::FIXED_SIZE_LIST) {
return Status::TypeError("Expected fixed size list type, got ", type->ToString());
}
Expand All @@ -926,10 +927,9 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
"The length of the values Array needs to be a multiple of the list size");
}
int64_t length = values->length() / list_type.list_size();
std::shared_ptr<Buffer> validity_buf;

return std::make_shared<FixedSizeListArray>(type, length, values, validity_buf,
/*null_count=*/0, /*offset=*/0);
return std::make_shared<FixedSizeListArray>(type, length, values, null_bitmap,
null_count);
}

Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten(
Expand Down
16 changes: 12 additions & 4 deletions cpp/src/arrow/array/array_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -599,17 +599,25 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
///
/// \param[in] values Array containing list values
/// \param[in] list_size The fixed length of each list
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \return Will have length equal to values.length() / list_size
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
int32_t list_size);
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<Array>& values, int32_t list_size,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);

/// \brief Construct FixedSizeListArray from child value array and type
///
/// \param[in] values Array containing list values
/// \param[in] type The fixed sized list type
/// \param[in] null_bitmap Optional validity bitmap
/// \param[in] null_count Optional null count in null_bitmap
/// \return Will have length equal to values.length() / type.list_size()
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
std::shared_ptr<DataType> type);
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount);

protected:
void SetData(const std::shared_ptr<ArrayData>& data);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/api_aggregate.h
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ Result<Datum> TDigest(const Datum& value,
/// \brief Find the first index of a value in an array.
///
/// \param[in] value The array to search.
/// \param[in] options The array to search for. See IndexOoptions.
/// \param[in] options The array to search for. See IndexOptions.
/// \param[in] ctx the function execution context, optional
/// \return out a Scalar containing the index (or -1 if not found).
///
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions {

/// How to interpret ambiguous local times (due to DST shifts)
Ambiguous ambiguous;
/// How to interpret non-existent local times (due to DST shifts)
/// How to interpret nonexistent local times (due to DST shifts)
Nonexistent nonexistent;
};

Expand Down Expand Up @@ -1589,7 +1589,7 @@ ARROW_EXPORT Result<Datum> MonthsBetween(const Datum& left, const Datum& right,
ARROW_EXPORT Result<Datum> WeeksBetween(const Datum& left, const Datum& right,
ExecContext* ctx = NULLPTR);

/// \brief Month Day Nano Between finds the number of months, days, and nonaseconds
/// \brief Month Day Nano Between finds the number of months, days, and nanoseconds
/// between two values
///
/// \param[in] left input treated as the start time
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/compute/api_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,

/// \brief Return indices that partition an array around n-th sorted element.
///
/// This overload takes a PartitionNthOptions specifiying the pivot index
/// This overload takes a PartitionNthOptions specifying the pivot index
/// and the null handling.
///
/// \param[in] values array to be partitioned
Expand Down Expand Up @@ -452,7 +452,7 @@ Result<std::shared_ptr<Array>> SortIndices(const Array& array,

/// \brief Return the indices that would sort an array.
///
/// This overload takes a ArraySortOptions specifiying the sort order
/// This overload takes a ArraySortOptions specifying the sort order
/// and the null handling.
///
/// \param[in] array array to sort
Expand Down Expand Up @@ -486,7 +486,7 @@ Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,

/// \brief Return the indices that would sort a chunked array.
///
/// This overload takes a ArraySortOptions specifiying the sort order
/// This overload takes a ArraySortOptions specifying the sort order
/// and the null handling.
///
/// \param[in] chunked_array chunked array to sort
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/exec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1164,7 +1164,7 @@ class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
// TODO(wesm): this is odd and should be examined soon -- only one state
// "should" be needed per thread of execution

// FIXME(ARROW-11840) don't merge *any* aggegates for every batch
// FIXME(ARROW-11840) don't merge *any* aggregates for every batch
ARROW_ASSIGN_OR_RAISE(auto batch_state,
kernel_->init(kernel_ctx_, {kernel_, *input_types_, options_}));

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/exec_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class ARROW_EXPORT ExecSpanIterator {
public:
ExecSpanIterator() = default;

/// \brief Initialize itertor iterator and do basic argument validation
/// \brief Initialize iterator and do basic argument validation
///
/// \param[in] batch the input ExecBatch
/// \param[in] max_chunksize the maximum length of each ExecSpan. Depending
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/exec_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1232,7 +1232,7 @@ void TestCallScalarFunctionPreallocationCases::DoTest(FunctionCallerMaker caller
}

// Set the exec_chunksize to be smaller, so now we have several invocations
// of the kernel, but still the output is onee array
// of the kernel, but still the output is one array
{
std::vector<Datum> args = {Datum(arr)};
exec_ctx_->set_exec_chunksize(80);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/aggregate_basic.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
AddFirstLastKernels(FirstLastInit, TemporalTypes(), func.get());
DCHECK_OK(registry->AddFunction(std::move(func)));

// Add first/last as convience functions
// Add first/last as convenience functions
func = std::make_shared<ScalarAggregateFunction>("first", Arity::Unary(), first_doc,
&default_scalar_aggregate_options);
AddFirstOrLastAggKernel<FirstOrLast::First>(func.get(), first_last_func);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ struct FirstLastImpl : public ScalarAggregator {
this->count += arr.length() - null_count;

if (null_count == 0) {
// If there are no null valus, we can just merge
// If there are no null values, we can just merge
// the first and last element
this->state.MergeOne(arr.GetView(0));
this->state.MergeOne(arr.GetView(arr.length() - 1));
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/aggregate_mode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ Status Finalize(KernelContext* ctx, const DataType& type, ExecResult* out,
return Status::OK();
}

// count value occurances for integers with narrow value range
// count value occurrences for integers with narrow value range
// O(1) space, O(n) time
template <typename T>
struct CountModer {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/aggregate_quantile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ struct SortQuantiler {
});

// input array is partitioned around data point at `last_index` (pivot)
// for next quatile which is smaller, we only consider inputs left of the pivot
// for next quantile which is smaller, we only consider inputs left of the pivot
uint64_t last_index = in_buffer.size();
if (is_datapoint) {
CType* out_buffer = out_data->template GetMutableValues<CType>(1);
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/aggregate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ TEST_F(TestSumKernelRoundOff, Basics) {

// array = np.arange(321000, dtype='float64')
// array -= np.mean(array)
// array *= arrray
// array *= array
double index = 0;
ASSERT_OK_AND_ASSIGN(
auto array, ArrayFromBuilderVisitor(
Expand Down Expand Up @@ -3653,7 +3653,7 @@ class TestPrimitiveQuantileKernel : public ::testing::Test {

#define INTYPE(x) Datum(static_cast<typename TypeParam::c_type>(x))
#define DOUBLE(x) Datum(static_cast<double>(x))
// output type per interplation: linear, lower, higher, nearest, midpoint
// output type per interpolation: linear, lower, higher, nearest, midpoint
#define O(a, b, c, d, e) \
{ DOUBLE(a), INTYPE(b), INTYPE(c), INTYPE(d), DOUBLE(e) }

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/hash_aggregate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1848,8 +1848,8 @@ struct GroupedFirstLastImpl final : public GroupedAggregator {
const ArrayData& group_id_mapping) override {
// The merge is asymmetric. "first" from this state gets pick over "first" from other
// state. "last" from other state gets pick over from this state. This is so that when
// using with segmeneted aggregation, we still get the correct "first" and "last"
// value for the entire segement.
// using with segmented aggregation, we still get the correct "first" and "last"
// value for the entire segment.
auto other = checked_cast<GroupedFirstLastImpl*>(&raw_other);

auto raw_firsts = firsts_.mutable_data();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ constexpr auto kSeed = 0x94378165;
using BinaryOp = Result<Datum>(const Datum&, const Datum&, ArithmeticOptions,
ExecContext*);

// Add explicit overflow-checked shortcuts, for easy benchmark parametering.
// Add explicit overflow-checked shortcuts, for easy benchmark parameterizing.
static Result<Datum> AddChecked(const Datum& left, const Datum& right,
ArithmeticOptions options = ArithmeticOptions(),
ExecContext* ctx = NULLPTR) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1857,7 +1857,7 @@ TEST_F(TestBinaryArithmeticDecimal, DispatchBest) {
}
}

// reference result from bc (precsion=100, scale=40)
// reference result from bc (precision=100, scale=40)
TEST_F(TestBinaryArithmeticDecimal, AddSubtract) {
// array array, decimal128
{
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2842,19 +2842,19 @@ TEST(Cast, StructToDifferentNullabilityStruct) {
::testing::HasSubstr("cannot cast nullable field to non-nullable field"),
Cast(src_nullable, options1_non_nullable));

std::vector<std::shared_ptr<Field>> fields_dest2_non_nullble = {
std::vector<std::shared_ptr<Field>> fields_dest2_non_nullable = {
std::make_shared<Field>("a", int64(), false),
std::make_shared<Field>("c", int64(), false)};
const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullble);
const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullable);
const auto options2_non_nullable = CastOptions::Safe(dest2_non_nullable);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("cannot cast nullable field to non-nullable field"),
Cast(src_nullable, options2_non_nullable));

std::vector<std::shared_ptr<Field>> fields_dest3_non_nullble = {
std::vector<std::shared_ptr<Field>> fields_dest3_non_nullable = {
std::make_shared<Field>("c", int64(), false)};
const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullble);
const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullable);
const auto options3_non_nullable = CastOptions::Safe(dest3_non_nullable);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ template <typename Type>
class TestIfElsePrimitive : public ::testing::Test {};

// There are a lot of tests here if we cover all the types and it gets slow on valgrind
// so we overrdie the standard type sets with a smaller range
// so we override the standard type sets with a smaller range
#ifdef ARROW_VALGRIND
using IfElseNumericBasedTypes =
::testing::Types<UInt32Type, FloatType, Date32Type, Time32Type, TimestampType,
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/scalar_round.cc
Original file line number Diff line number Diff line change
Expand Up @@ -747,7 +747,7 @@ struct Round {
} else {
round_val = RoundImpl<CType, RndMode>::Round(round_val);
}
// Equality check is ommitted so that the common case of 10^0 (integer rounding)
// Equality check is omitted so that the common case of 10^0 (integer rounding)
// uses multiply-only
round_val = ndigits > 0 ? (round_val / pow10) : (round_val * pow10);
if (!std::isfinite(round_val)) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/scalar_string_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ struct StringSplitExec {
using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
using State = OptionsWrapper<Options>;

// Keep the temporary storage accross individual values, to minimize reallocations
// Keep the temporary storage across individual values, to minimize reallocations
std::vector<std::string_view> parts;
Options options;

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2060,7 +2060,7 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) {
this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), R"([""])",
&options_edgecase_1);

// this is a safeguard agains an optimization path possible, but actually a tricky case
// this is a safeguard against an optimization path possible, but actually a tricky case
SliceOptions options_edgecase_2{-6, -2};
this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), R"(["𝑓öõ"])",
&options_edgecase_2);
Expand Down Expand Up @@ -2189,7 +2189,7 @@ TYPED_TEST(TestBinaryKernels, SliceBytesBasic) {
"ds\"]",
this->type(), R"([""])", &options_edgecase_1);

// this is a safeguard agains an optimization path possible, but actually a tricky case
// this is a safeguard against an optimization path possible, but actually a tricky case
SliceOptions options_edgecase_2{-6, -2};
this->CheckUnary("binary_slice",
"[\"f\xc2\xa2"
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/arrow/compute/kernels/scalar_temporal_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2101,9 +2101,9 @@ TEST_F(ScalarTemporalTest, StrftimeNoTimezone) {

TEST_F(ScalarTemporalTest, StrftimeInvalidTimezone) {
const char* seconds = R"(["1970-01-01T00:00:59", null])";
auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "non-existent"), seconds);
auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "nonexistent"), seconds);
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, testing::HasSubstr("Cannot locate timezone 'non-existent'"),
Invalid, testing::HasSubstr("Cannot locate timezone 'nonexistent'"),
Strftime(arr, StrftimeOptions()));
}

Expand Down Expand Up @@ -2159,12 +2159,12 @@ TEST_F(ScalarTemporalTest, StrftimeOtherLocale) {
}

TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) {
auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "non-existent");
auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "nonexistent");
const char* seconds = R"(["1970-01-01T00:00:59", null])";
auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), seconds);

EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
testing::HasSubstr("Cannot find locale 'non-existent'"),
testing::HasSubstr("Cannot find locale 'nonexistent'"),
Strftime(arr, options));
}

Expand Down Expand Up @@ -2601,7 +2601,7 @@ TEST_F(ScalarTemporalTestStrictCeil, TestCeilTemporalStrictCeil) {
TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, CeilUTC) {
std::string op = "ceil_temporal";

// Data for tests below was generaed via lubridate with the exception
// Data for tests below was generated via lubridate with the exception
// of week data because lubridate currently does not support rounding to
// multiple of week.
const char* ceil_15_nanosecond =
Expand Down Expand Up @@ -2989,7 +2989,7 @@ TEST_F(ScalarTemporalTest, TestFloorTemporal) {
TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, FloorUTC) {
std::string op = "floor_temporal";

// Data for tests below was generaed via lubridate with the exception
// Data for tests below was generated via lubridate with the exception
// of week data because lubridate currently does not support rounding to
// multiple of week.
const char* floor_15_nanosecond =
Expand Down Expand Up @@ -3402,7 +3402,7 @@ TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalBrussels) {
TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, RoundUTC) {
std::string op = "round_temporal";

// Data for tests below was generaed via lubridate with the exception
// Data for tests below was generated via lubridate with the exception
// of week data because lubridate currently does not support rounding to
// multiple of week.
const char* round_15_nanosecond =
Expand Down
Loading

0 comments on commit 1763cea

Please sign in to comment.