Merge branch 'main' into bump-zlib-to-1.3
mapleFU committed Jan 17, 2024
2 parents 6b31549 + 6eeee3b commit 5e86d1f
Showing 294 changed files with 5,485 additions and 2,191 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/archery.yml
@@ -63,7 +63,9 @@ jobs:
       - name: Install pygit2 binary wheel
         run: pip install pygit2 --only-binary pygit2
       - name: Install Archery, Crossbow- and Test Dependencies
-        run: pip install pytest responses -e dev/archery[all]
+        run: |
+          pip install -e dev/archery[all]
+          pip install -r dev/archery/requirements-test.txt
       - name: Archery Unittests
         working-directory: dev/archery
         run: pytest -v archery
2 changes: 0 additions & 2 deletions .github/workflows/cpp.yml
@@ -340,8 +340,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - msystem_lower: mingw32
-            msystem_upper: MINGW32
           - msystem_lower: mingw64
            msystem_upper: MINGW64
           - msystem_lower: clang64
2 changes: 1 addition & 1 deletion NOTICE.txt
@@ -1,5 +1,5 @@
 Apache Arrow
-Copyright 2016-2019 The Apache Software Foundation
+Copyright 2016-2024 The Apache Software Foundation
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).
22 changes: 21 additions & 1 deletion c_glib/README.md
@@ -142,6 +142,17 @@ $ meson compile -C c_glib.build
 $ sudo meson install -C c_glib.build
 ```
 
+> [!WARNING]
+>
+> When building Arrow GLib, it typically uses the Arrow C++ installed via Homebrew. However, this can lead to build failures
+> if there are mismatches between the changes in Arrow's GLib and C++ libraries. To resolve this, you may need to
+> reference the Arrow C++ library built locally. In such cases, use the `--cmake-prefix-path` option with the `meson setup`
+> command to explicitly specify the library path.
+>
+> ```console
+> $ meson setup c_glib.build c_glib --cmake-prefix-path=${arrow_cpp_install_prefix} -Dgtk_doc=true
+> ```
+
 Others:
 
 ```console
@@ -231,9 +242,18 @@ Now, you can run unit tests by the followings:
 
 ```console
 $ cd c_glib.build
-$ bundle exec ../c_glib/test/run-test.sh
+$ BUNDLE_GEMFILE=../c_glib/Gemfile bundle exec ../c_glib/test/run-test.sh
 ```
 
+> [!NOTE]
+>
+> If debugging is necessary, you can proceed using the `DEBUGGER` option as follows:
+>
+> ```console
+> $ DEBUGGER=lldb BUNDLE_GEMFILE=../c_glib/Gemfile bundle exec ../c_glib/test/run-test.sh
+> ```
+
 ## Common build problems
 
 ### build failed - /usr/bin/ld: cannot find -larrow
6 changes: 4 additions & 2 deletions c_glib/arrow-glib/scalar.cpp
@@ -26,6 +26,8 @@
 #include <arrow-glib/interval.hpp>
 #include <arrow-glib/scalar.hpp>
 
+#include <arrow/compute/cast.h>
+
 G_BEGIN_DECLS
 
 /**
@@ -385,9 +387,9 @@ garrow_scalar_cast(GArrowScalar *scalar,
 {
   const auto arrow_scalar = garrow_scalar_get_raw(scalar);
   const auto arrow_data_type = garrow_data_type_get_raw(data_type);
-  auto arrow_casted_scalar_result = arrow_scalar->CastTo(arrow_data_type);
+  auto arrow_casted_scalar_result = arrow::compute::Cast(arrow_scalar, arrow_data_type);
   if (garrow::check(error, arrow_casted_scalar_result, "[scalar][cast]")) {
-    auto arrow_casted_scalar = *arrow_casted_scalar_result;
+    auto arrow_casted_scalar = (*arrow_casted_scalar_result).scalar();
     return garrow_scalar_new_raw(&arrow_casted_scalar,
                                  "scalar", &arrow_casted_scalar,
                                  "data-type", data_type,
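This hunk moves the GLib binding from the `arrow::Scalar::CastTo` member function to the compute module's `Cast`, which traffics in `Datum` values — hence the extra `.scalar()` unwrap on the result. A minimal standalone sketch of the new call path in plain Arrow C++ (the example values and program scaffolding are ours, not part of the commit):

```cpp
// Sketch: scalar casting via arrow::compute::Cast instead of Scalar::CastTo.
#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/cast.h>

arrow::Status RunCastExample() {
  // An int32 scalar to be cast to float64.
  std::shared_ptr<arrow::Scalar> scalar = arrow::MakeScalar(42);
  // Cast takes and returns a Datum; wrap the scalar, then unwrap with .scalar().
  ARROW_ASSIGN_OR_RAISE(arrow::Datum casted,
                        arrow::compute::Cast(arrow::Datum(scalar), arrow::float64()));
  std::cout << casted.scalar()->ToString() << std::endl;  // prints 42
  return arrow::Status::OK();
}

int main() {
  auto status = RunCastExample();
  if (!status.ok()) {
    std::cerr << status.ToString() << std::endl;
    return 1;
  }
  return 0;
}
```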
23 changes: 19 additions & 4 deletions c_glib/test/run-test.sh
@@ -34,14 +34,14 @@ for module in "${modules[@]}"; do
   module_build_dir="${build_dir}/${module}"
   if [ -d "${module_build_dir}" ]; then
     LD_LIBRARY_PATH="${module_build_dir}:${LD_LIBRARY_PATH}"
+    DYLD_LIBRARY_PATH="${module_build_dir}:${DYLD_LIBRARY_PATH}"
   fi
 done
 export LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH
 
 if [ "${BUILD}" != "no" ]; then
-  if [ -f "Makefile" ]; then
-    make -j8 > /dev/null || exit $?
-  elif [ -f "build.ninja" ]; then
+  if [ -f "build.ninja" ]; then
     ninja || exit $?
   fi
 fi
@@ -59,4 +59,19 @@ for module in "${modules[@]}"; do
 done
 export GI_TYPELIB_PATH
 
-${GDB} ruby ${test_dir}/run-test.rb "$@"
+if type rbenv > /dev/null 2>&1; then
+  RUBY="$(rbenv which ruby)"
+else
+  RUBY=ruby
+fi
+DEBUGGER_ARGS=()
+case "${DEBUGGER}" in
+  "gdb")
+    DEBUGGER_ARGS+=(--args)
+    ;;
+  "lldb")
+    DEBUGGER_ARGS+=(--one-line "env DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}")
+    DEBUGGER_ARGS+=(--)
+    ;;
+esac
+${DEBUGGER} "${DEBUGGER_ARGS[@]}" "${RUBY}" ${test_dir}/run-test.rb "$@"
2 changes: 1 addition & 1 deletion ci/conda_env_sphinx.txt
@@ -20,7 +20,7 @@ breathe
 doxygen
 ipython
 numpydoc
-pydata-sphinx-theme
+pydata-sphinx-theme=0.14
 sphinx-autobuild
 sphinx-design
 sphinx-copybutton
7 changes: 6 additions & 1 deletion ci/scripts/integration_dask.sh
@@ -34,7 +34,12 @@ python -c "import dask.dataframe"
 # The "skip_with_pyarrow_strings" marker is meant to skip automatically, but that doesn't work with --pyargs, so de-selecting manually
 # - The 'test_categorize_info' test is failing because of change in StringArray's nbytes and
 #   an upstream fix (https://github.com/apache/arrow/issues/39028)
-pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings" -k "not test_categorize_info"
+# - The 'test_describe_empty' test is flakey
+#   upstream issue: https://github.com/dask/dask/issues/10672
+# - The 'test_view' fails because we are not using the dev version of pandas
+#   where pd.Series.view is deprecated (https://pandas.pydata.org/docs/dev/reference/api/pandas.Series.view.html)
+pytest -v --pyargs dask.dataframe.tests.test_dataframe -m "not skip_with_pyarrow_strings" \
+  -k "not test_categorize_info and not test_describe_empty and not test_view"
 pytest -v --pyargs dask.dataframe.io.tests.test_orc
 pytest -v --pyargs dask.dataframe.io.tests.test_parquet \
   -m "not skip_with_pyarrow_strings and not xfail_with_pyarrow_strings"
2 changes: 1 addition & 1 deletion ci/scripts/integration_substrait.sh
@@ -24,7 +24,7 @@ set -e
 echo "Substrait Integration Tests"
 echo "Validating imports"
 python -c "import pyarrow.substrait"
-python -c "from substrait_consumer.consumers import AceroConsumer"
+python -c "from substrait_consumer.consumers.acero_consumer import AceroConsumer"
 
 echo "Executing pytest"
 cd consumer-testing
5 changes: 3 additions & 2 deletions ci/scripts/r_docker_configure.sh
@@ -91,8 +91,9 @@ if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" ]; th
   "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh"
 fi
 
-# Install rsync for bundling cpp source and curl to make sure it is installed on all images
-$PACKAGE_MANAGER install -y rsync curl
+# Install rsync for bundling cpp source and curl to make sure it is installed on all images,
+# cmake is now a listed sys req.
+$PACKAGE_MANAGER install -y rsync cmake curl
 
 # Workaround for html help install failure; see https://github.com/r-lib/devtools/issues/2084#issuecomment-530912786
 Rscript -e 'x <- file.path(R.home("doc"), "html"); if (!file.exists(x)) {dir.create(x, recursive=TRUE); file.copy(system.file("html/R.css", package="stats"), x)}'
2 changes: 1 addition & 1 deletion cpp/cmake_modules/FindLLVMAlt.cmake
@@ -93,8 +93,8 @@ if(LLVM_FOUND)
                debuginfodwarf
                ipo
                linker
-               mcjit
                native
+               orcjit
                target)
   if(LLVM_VERSION_MAJOR GREATER_EQUAL 14)
     list(APPEND LLVM_TARGET_COMPONENTS passes)
24 changes: 16 additions & 8 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -1005,16 +1005,19 @@ if("${MAKE}" STREQUAL "")
   endif()
 endif()
 
-# Using make -j in sub-make is fragile
-# see discussion https://github.com/apache/arrow/pull/2779
-if(${CMAKE_GENERATOR} MATCHES "Makefiles")
-  set(MAKE_BUILD_ARGS "")
+# Args for external projects using make
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.28")
+  # Prevent 'bad file descriptor' error see #39517 #39628
+  set(MAKE_BUILD_ARGS "-j1")
 else()
   # limit the maximum number of jobs for ninja
   set(MAKE_BUILD_ARGS "-j${NPROC}")
 endif()
 
+include(FetchContent)
+set(FC_DECLARE_COMMON_OPTIONS)
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.28)
+  list(APPEND FC_DECLARE_COMMON_OPTIONS EXCLUDE_FROM_ALL TRUE)
+endif()
+
 macro(prepare_fetchcontent)
   set(BUILD_SHARED_LIBS OFF)
@@ -2036,10 +2039,13 @@ macro(build_jemalloc)
     # Enable jemalloc debug checks when Arrow itself has debugging enabled
     list(APPEND JEMALLOC_CONFIGURE_COMMAND "--enable-debug")
   endif()
 
+  set(JEMALLOC_BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS})
+
+  if(CMAKE_OSX_SYSROOT)
+    list(APPEND JEMALLOC_BUILD_COMMAND "SDKROOT=${CMAKE_OSX_SYSROOT}")
+  endif()
+
   externalproject_add(jemalloc_ep
                       ${EP_COMMON_OPTIONS}
                       URL ${JEMALLOC_SOURCE_URL}
@@ -2146,6 +2152,9 @@ function(build_gtest)
   message(STATUS "Building gtest from source")
   set(GTEST_VENDORED TRUE)
   fetchcontent_declare(googletest
+                       # We should not specify "EXCLUDE_FROM_ALL TRUE" here.
+                       # Because we install GTest with custom path.
+                       # ${FC_DECLARE_COMMON_OPTIONS}
                        URL ${GTEST_SOURCE_URL}
                        URL_HASH "SHA256=${ARROW_GTEST_BUILD_SHA256_CHECKSUM}")
   prepare_fetchcontent()
@@ -2630,7 +2639,7 @@ macro(build_bzip2)
                       BUILD_IN_SOURCE 1
                       BUILD_COMMAND ${MAKE} libbz2.a ${MAKE_BUILD_ARGS}
                                     ${BZIP2_EXTRA_ARGS}
-                      INSTALL_COMMAND ${MAKE} install PREFIX=${BZIP2_PREFIX}
+                      INSTALL_COMMAND ${MAKE} install -j1 PREFIX=${BZIP2_PREFIX}
                                     ${BZIP2_EXTRA_ARGS}
                       INSTALL_DIR ${BZIP2_PREFIX}
                       URL ${ARROW_BZIP2_SOURCE_URL}
@@ -5096,8 +5105,7 @@ function(build_azure_sdk)
   endif()
   message(STATUS "Building Azure SDK for C++ from source")
   fetchcontent_declare(azure_sdk
-                       # EXCLUDE_FROM_ALL is available since CMake 3.28
-                       # EXCLUDE_FROM_ALL TRUE
+                       ${FC_DECLARE_COMMON_OPTIONS}
                        URL ${ARROW_AZURE_SDK_URL}
                        URL_HASH "SHA256=${ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM}")
   prepare_fetchcontent()
36 changes: 36 additions & 0 deletions cpp/src/arrow/acero/hash_aggregate_test.cc
@@ -1694,6 +1694,42 @@ TEST_P(GroupBy, SumMeanProductScalar) {
   }
 }
 
+TEST_P(GroupBy, MeanOverflow) {
+  BatchesWithSchema input;
+  // would overflow if intermediate sum is integer
+  input.batches = {
+      ExecBatchFromJSON({int64(), int64()}, {ArgShape::SCALAR, ArgShape::ARRAY},
+                        "[[9223372036854775805, 1], [9223372036854775805, 1], "
+                        "[9223372036854775805, 2], [9223372036854775805, 3]]"),
+      ExecBatchFromJSON({int64(), int64()}, {ArgShape::SCALAR, ArgShape::ARRAY},
+                        "[[null, 1], [null, 1], [null, 2], [null, 3]]"),
+      ExecBatchFromJSON({int64(), int64()},
+                        "[[9223372036854775805, 1], [9223372036854775805, 2], "
+                        "[9223372036854775805, 3]]"),
+  };
+  input.schema = schema({field("argument", int64()), field("key", int64())});
+  for (bool use_threads : {true, false}) {
+    SCOPED_TRACE(use_threads ? "parallel/merged" : "serial");
+    ASSERT_OK_AND_ASSIGN(Datum actual,
+                         RunGroupBy(input, {"key"},
+                                    {
+                                        {"hash_mean", nullptr, "argument", "hash_mean"},
+                                    },
+                                    use_threads));
+    Datum expected = ArrayFromJSON(struct_({
+                                       field("key", int64()),
+                                       field("hash_mean", float64()),
+                                   }),
+                                   R"([
+      [1, 9223372036854775805],
+      [2, 9223372036854775805],
+      [3, 9223372036854775805]
+    ])");
+    AssertDatumsApproxEqual(expected, actual, /*verbose=*/true);
+  }
+}
+
 TEST_P(GroupBy, VarianceAndStddev) {
   auto batch = RecordBatchFromJSON(
       schema({field("argument", int32()), field("key", int64())}), R"([
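The new test pins the failure mode: summing several values near INT64_MAX in a signed 64-bit integer accumulator wraps before the final division ever happens. As a minimal sketch of why a non-integer intermediate avoids this — an incremental (Welford-style) mean, illustrative only and not the actual `hash_mean` kernel code:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Incremental mean: the running value stays near the data's magnitude,
// so no 64-bit integer sum is ever formed and nothing can wrap around.
double IncrementalMean(const std::vector<int64_t>& values) {
  double mean = 0.0;
  double n = 0.0;
  for (int64_t v : values) {
    n += 1.0;
    mean += (static_cast<double>(v) - mean) / n;
  }
  return mean;
}

int main() {
  // Four values near INT64_MAX: a naive int64 sum would overflow.
  std::vector<int64_t> values(4, 9223372036854775805LL);
  std::cout << IncrementalMean(values) << std::endl;  // ~9.22337e+18
  return 0;
}
```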
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/array_binary.h
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-// Array accessor classes for Binary, LargeBinart, String, LargeString,
+// Array accessor classes for Binary, LargeBinary, String, LargeString,
 // FixedSizeBinary
 
 #pragma once
4 changes: 2 additions & 2 deletions cpp/src/arrow/array/array_dict.h
@@ -133,7 +133,7 @@ class ARROW_EXPORT DictionaryUnifier {
   static Result<std::unique_ptr<DictionaryUnifier>> Make(
       std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
 
-  /// \brief Unify dictionaries accross array chunks
+  /// \brief Unify dictionaries across array chunks
   ///
   /// The dictionaries in the array chunks will be unified, their indices
   /// accordingly transposed.
@@ -144,7 +144,7 @@
       const std::shared_ptr<ChunkedArray>& array,
       MemoryPool* pool = default_memory_pool());
 
-  /// \brief Unify dictionaries accross the chunks of each table column
+  /// \brief Unify dictionaries across the chunks of each table column
   ///
   /// The dictionaries in each table column will be unified, their indices
   /// accordingly transposed.
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/array_dict_test.cc
@@ -1130,7 +1130,7 @@ TEST(TestDictionary, Validate) {
   ASSERT_RAISES(Invalid, arr->ValidateFull());
 
 #if !defined(__APPLE__) && !defined(ARROW_VALGRIND)
-  // GH-35712: ASSERT_DEATH would make testing slow on MacOS.
+  // GH-35712: ASSERT_DEATH would make testing slow on macOS.
   ASSERT_DEATH(
       {
         std::shared_ptr<Array> null_dict_arr =
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/array_list_test.cc
@@ -300,7 +300,7 @@ class TestListArray : public ::testing::Test {
     ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *expected);
 
-    // Offets without nulls, will replace null with empty list
+    // Offsets without nulls, will replace null with empty list
     ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values));
     ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *std::dynamic_pointer_cast<ArrayType>(
20 changes: 10 additions & 10 deletions cpp/src/arrow/array/array_nested.cc
@@ -391,7 +391,7 @@ Result<std::shared_ptr<ArrayData>> ListViewFromListImpl(
   const auto* offsets = list_data->template GetValues<offset_type>(1, 0);
   auto* sizes = sizes_buffer->mutable_data_as<offset_type>();
   // Zero the initial padding area to avoid leaking any data when buffers are
-  // sent over IPC or throught the C Data interface.
+  // sent over IPC or through the C Data interface.
   memset(sizes, 0, list_data->offset * sizeof(offset_type));
   for (int64_t i = list_data->offset; i < buffer_length; i++) {
     sizes[i] = offsets[i + 1] - offsets[i];
@@ -776,7 +776,7 @@ Result<std::shared_ptr<Array>> MapArray::FromArraysInternal(
   }
 
   if (keys->null_count() != 0) {
-    return Status::Invalid("Map can not contain NULL valued keys");
+    return Status::Invalid("Map cannot contain NULL valued keys");
   }
 
   if (keys->length() != items->length()) {
@@ -894,7 +894,8 @@ const std::shared_ptr<DataType>& FixedSizeListArray::value_type() const {
 const std::shared_ptr<Array>& FixedSizeListArray::values() const { return values_; }
 
 Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
-    const std::shared_ptr<Array>& values, int32_t list_size) {
+    const std::shared_ptr<Array>& values, int32_t list_size,
+    std::shared_ptr<Buffer> null_bitmap, int64_t null_count) {
   if (list_size <= 0) {
     return Status::Invalid("list_size needs to be a strict positive integer");
   }
@@ -905,14 +906,14 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
   }
   int64_t length = values->length() / list_size;
   auto list_type = std::make_shared<FixedSizeListType>(values->type(), list_size);
-  std::shared_ptr<Buffer> validity_buf;
 
-  return std::make_shared<FixedSizeListArray>(list_type, length, values, validity_buf,
-                                              /*null_count=*/0, /*offset=*/0);
+  return std::make_shared<FixedSizeListArray>(list_type, length, values, null_bitmap,
+                                              null_count);
 }
 
 Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
-    const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type) {
+    const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type,
+    std::shared_ptr<Buffer> null_bitmap, int64_t null_count) {
   if (type->id() != Type::FIXED_SIZE_LIST) {
     return Status::TypeError("Expected fixed size list type, got ", type->ToString());
   }
@@ -926,10 +927,9 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
         "The length of the values Array needs to be a multiple of the list size");
   }
   int64_t length = values->length() / list_type.list_size();
-  std::shared_ptr<Buffer> validity_buf;
 
-  return std::make_shared<FixedSizeListArray>(list_type, length, values, validity_buf,
-                                              /*null_count=*/0, /*offset=*/0);
+  return std::make_shared<FixedSizeListArray>(list_type, length, values, null_bitmap,
+                                              null_count);
 }
 
 Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten(
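With the two new parameters, `FixedSizeListArray::FromArrays` can produce an array with top-level nulls, where the old signature always returned a fully valid array. A minimal usage sketch under the new signature (the data and bitmap are our own illustration; all arguments are passed positionally rather than relying on the header's defaults):

```cpp
#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/util/bit_util.h>

arrow::Status RunFixedSizeListExample() {
  // Six values -> three fixed-size lists of two elements each.
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3, 4, 5, 6}));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> values, builder.Finish());

  // Validity bitmap for the three lists: first and third valid, second null.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> validity,
                        arrow::AllocateEmptyBitmap(3));
  arrow::bit_util::SetBit(validity->mutable_data(), 0);
  arrow::bit_util::SetBit(validity->mutable_data(), 2);

  // New in this change: the bitmap and null count are forwarded to the array.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> list_array,
                        arrow::FixedSizeListArray::FromArrays(
                            values, /*list_size=*/2, validity, /*null_count=*/1));
  std::cout << list_array->ToString() << std::endl;
  return arrow::Status::OK();
}

int main() {
  auto status = RunFixedSizeListExample();
  if (!status.ok()) {
    std::cerr << status.ToString() << std::endl;
    return 1;
  }
  return 0;
}
```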