diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 793dbb3806f80..e72d5b4321da8 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -46,7 +46,6 @@ # Docs # /docs/ -# .readthedocs.yml # *.md # *.rmd # *.rst diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e9d7f28fa6e63..6bde1cb2964e0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -181,4 +181,5 @@ repos: files: >- ( ?^ci/scripts/c_glib_build\.sh$| + ?^ci/scripts/c_glib_test\.sh$| ) diff --git a/.readthedocs.yml b/.readthedocs.yml deleted file mode 100644 index 11a7d70c25131..0000000000000 --- a/.readthedocs.yml +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -conda: - file: python/doc/environment.yml diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp b/c_glib/parquet-glib/arrow-file-reader.cpp index 4996d7862713e..692fd0ca0a634 100644 --- a/c_glib/parquet-glib/arrow-file-reader.cpp +++ b/c_glib/parquet-glib/arrow-file-reader.cpp @@ -134,12 +134,13 @@ gparquet_arrow_file_reader_new_arrow(GArrowSeekableInputStream *source, GError * { auto arrow_random_access_file = garrow_seekable_input_stream_get_raw(source); auto arrow_memory_pool = arrow::default_memory_pool(); - std::unique_ptr parquet_arrow_file_reader; - auto status = parquet::arrow::OpenFile(arrow_random_access_file, - arrow_memory_pool, - &parquet_arrow_file_reader); - if (garrow_error_check(error, status, "[parquet][arrow][file-reader][new-arrow]")) { - return gparquet_arrow_file_reader_new_raw(parquet_arrow_file_reader.release()); + auto parquet_arrow_file_reader_result = + parquet::arrow::OpenFile(arrow_random_access_file, arrow_memory_pool); + if (garrow::check(error, + parquet_arrow_file_reader_result, + "[parquet][arrow][file-reader][new-arrow]")) { + return gparquet_arrow_file_reader_new_raw( + parquet_arrow_file_reader_result->release()); } else { return NULL; } @@ -168,12 +169,13 @@ gparquet_arrow_file_reader_new_path(const gchar *path, GError **error) std::shared_ptr arrow_random_access_file = arrow_memory_mapped_file.ValueOrDie(); auto arrow_memory_pool = arrow::default_memory_pool(); - std::unique_ptr parquet_arrow_file_reader; - auto status = parquet::arrow::OpenFile(arrow_random_access_file, - arrow_memory_pool, - &parquet_arrow_file_reader); - if (garrow::check(error, status, "[parquet][arrow][file-reader][new-path]")) { - return gparquet_arrow_file_reader_new_raw(parquet_arrow_file_reader.release()); + auto parquet_arrow_file_reader_result = + parquet::arrow::OpenFile(arrow_random_access_file, arrow_memory_pool); + if (garrow::check(error, + parquet_arrow_file_reader_result, + "[parquet][arrow][file-reader][new-path]")) { + return gparquet_arrow_file_reader_new_raw( + parquet_arrow_file_reader_result->release()); } else { return NULL; } diff 
--git a/ci/scripts/c_glib_test.sh b/ci/scripts/c_glib_test.sh index 02753872dcb2d..19131709e2b43 100755 --- a/ci/scripts/c_glib_test.sh +++ b/ci/scripts/c_glib_test.sh @@ -22,7 +22,7 @@ set -ex source_dir=${1}/c_glib build_dir=${2}/c_glib -: ${ARROW_GLIB_VAPI:=true} +: "${ARROW_GLIB_VAPI:=true}" export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib:${DYLD_LIBRARY_PATH} export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} @@ -34,7 +34,7 @@ if [ -z "${ARROW_DEBUG_MEMORY_POOL}" ]; then export ARROW_DEBUG_MEMORY_POOL=trap fi -pushd ${source_dir} +pushd "${source_dir}" ruby test/run-test.rb @@ -51,7 +51,7 @@ fi popd -pushd ${build_dir} +pushd "${build_dir}" example/build example/extension-type if [ "${ARROW_GLIB_VAPI}" = "true" ]; then diff --git a/cpp/build-support/lint_exclusions.txt b/cpp/build-support/lint_exclusions.txt index 195c3dee36a83..aa57db72ce9dd 100644 --- a/cpp/build-support/lint_exclusions.txt +++ b/cpp/build-support/lint_exclusions.txt @@ -3,7 +3,6 @@ *RcppExports.cpp* *_generated* *arrowExports.cpp* -*parquet_constants.* *parquet_types.* *pyarrow_api.h *pyarrow_lib.h diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 3b8b4c2212b75..a07c10fda5af8 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -34,7 +34,7 @@ arrow::Status ReadFullFile(std::string path_to_file) { // Open Parquet file reader std::unique_ptr arrow_reader; - ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, &arrow_reader)); + ARROW_ASSIGN_OR_RAISE(arrow_reader, parquet::arrow::OpenFile(input, pool)); // Read entire file as a single Arrow table std::shared_ptr table; diff --git a/cpp/examples/parquet/parquet_arrow/reader_writer.cc b/cpp/examples/parquet/parquet_arrow/reader_writer.cc index f5d96ec16ca64..448c9ecfb88cb 100644 --- a/cpp/examples/parquet/parquet_arrow/reader_writer.cc +++ b/cpp/examples/parquet/parquet_arrow/reader_writer.cc @@ -68,8 +68,8 @@ void read_whole_file() { arrow::default_memory_pool())); std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + PARQUET_ASSIGN_OR_THROW(reader, + parquet::arrow::OpenFile(infile, arrow::default_memory_pool())); std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() @@ -85,8 +85,8 @@ void read_single_rowgroup() { arrow::default_memory_pool())); std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + PARQUET_ASSIGN_OR_THROW(reader, + parquet::arrow::OpenFile(infile, arrow::default_memory_pool())); std::shared_ptr table; PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table)); std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns() @@ -102,8 +102,8 @@ void read_single_column() { arrow::default_memory_pool())); std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + PARQUET_ASSIGN_OR_THROW(reader, + parquet::arrow::OpenFile(infile, arrow::default_memory_pool())); std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); @@ -122,8 +122,8 @@ void read_single_column_chunk() { arrow::default_memory_pool())); std::unique_ptr reader; - PARQUET_THROW_NOT_OK( - parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + 
PARQUET_ASSIGN_OR_THROW(reader, + parquet::arrow::OpenFile(infile, arrow::default_memory_pool())); std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h index e7365a9d7f908..99b853ab0fe73 100644 --- a/cpp/src/arrow/array/statistics.h +++ b/cpp/src/arrow/array/statistics.h @@ -27,6 +27,7 @@ namespace arrow { +/// \class ArrayStatistics /// \brief Statistics for an Array /// /// Apache Arrow format doesn't have statistics but data source such diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index f848b341154b5..6b30802c78ec1 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -1310,13 +1310,13 @@ struct SchemaImporter { } bool keys_sorted = (c_struct_->flags & ARROW_FLAG_MAP_KEYS_SORTED); - bool values_nullable = value_type->field(1)->nullable(); + // Some implementations of Arrow (such as Rust) use a non-standard field name // for key ("keys") and value ("values") fields. For simplicity, we override // them on import. - auto values_field = - ::arrow::field("value", value_type->field(1)->type(), values_nullable); - type_ = map(value_type->field(0)->type(), values_field, keys_sorted); + type_ = + std::make_shared(value_type->field(0)->WithName("key"), + value_type->field(1)->WithName("value"), keys_sorted); return Status::OK(); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index bc60b587cf6f7..c1ad2ae2c320b 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -3769,6 +3769,10 @@ TEST_F(TestSchemaRoundtrip, RegisteredExtension) { TEST_F(TestSchemaRoundtrip, Map) { TestWithTypeFactory([&]() { return map(utf8(), int32()); }); TestWithTypeFactory([&]() { return map(utf8(), field("value", int32(), false)); }); + TestWithTypeFactory([&]() { + return map(utf8(), field("value", int32(), false, + KeyValueMetadata::Make({"meta key"}, {"meta value"}))); + }); // Field names are brought in line with the spec on import. TestWithTypeFactory( [&]() { @@ -5315,6 +5319,13 @@ TEST_F(TestArrayDeviceStreamRoundtrip, ChunkedArrayRoundtripEmpty) { class TestAsyncDeviceArrayStreamRoundTrip : public BaseArrayStreamTest { public: + void SetUp() override { + BaseArrayStreamTest::SetUp(); +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires ARROW_ENABLE_THREADING=ON"; +#endif + } + static Result> ToDeviceData( const std::shared_ptr& mm, const ArrayData& data) { arrow::BufferVector buffers; diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 6c1a93fb89bfa..137e9a290e916 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -877,6 +877,9 @@ struct GrouperFastImpl : public Grouper { } else { ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocatePaddedBuffer((num_groups + 1) * sizeof(uint32_t))); + // Set offset[0] to 0 so the later allocation of varlen_bufs doesn't see an + // uninitialized value when num_groups == 0. 
+ reinterpret_cast(fixedlen_bufs[i]->mutable_data())[0] = 0; } cols_[i] = KeyColumnArray(col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(), diff --git a/cpp/src/arrow/compute/row/grouper_test.cc b/cpp/src/arrow/compute/row/grouper_test.cc index 1e853be5e4af7..fcee46863fdf8 100644 --- a/cpp/src/arrow/compute/row/grouper_test.cc +++ b/cpp/src/arrow/compute/row/grouper_test.cc @@ -64,5 +64,27 @@ TEST(Grouper, ResortedColumnsWithLargeNullRows) { } } +// Reproduction of GH-43124: Provoke var length buffer size if a grouper produces zero +// groups. +TEST(Grouper, EmptyGroups) { + ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make({int32(), utf8()})); + ASSERT_OK_AND_ASSIGN(auto groups, grouper->GetUniques()); + + ASSERT_TRUE(groups[0].is_array()); + ASSERT_EQ(groups[0].array()->buffers.size(), 2); + ASSERT_EQ(groups[0].array()->buffers[0], nullptr); + ASSERT_NE(groups[0].array()->buffers[1], nullptr); + ASSERT_EQ(groups[0].array()->buffers[1]->size(), 0); + + ASSERT_TRUE(groups[1].is_array()); + ASSERT_EQ(groups[1].array()->buffers.size(), 3); + ASSERT_EQ(groups[1].array()->buffers[0], nullptr); + ASSERT_NE(groups[1].array()->buffers[1], nullptr); + ASSERT_EQ(groups[1].array()->buffers[1]->size(), 4); + ASSERT_EQ(groups[1].array()->buffers[1]->data_as()[0], 0); + ASSERT_NE(groups[1].array()->buffers[2], nullptr); + ASSERT_EQ(groups[1].array()->buffers[2]->size(), 0); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 242c2c29505ac..a04977bdee076 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -475,10 +475,10 @@ TEST(AzureFileSystem, InitializeWithDefaultCredential) { TEST(AzureFileSystem, InitializeWithDefaultCredentialImplicitly) { AzureOptions options; options.account_name = "dummy-account-name"; - AzureOptions explictly_default_options; - explictly_default_options.account_name = "dummy-account-name"; - ARROW_EXPECT_OK(explictly_default_options.ConfigureDefaultCredential()); - ASSERT_TRUE(options.Equals(explictly_default_options)); + AzureOptions explicitly_default_options; + explicitly_default_options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(explicitly_default_options.ConfigureDefaultCredential()); + ASSERT_TRUE(options.Equals(explicitly_default_options)); } TEST(AzureFileSystem, InitializeWithAnonymousCredential) { diff --git a/cpp/src/arrow/util/float16.h b/cpp/src/arrow/util/float16.h index 0a432fee2cd31..b7455dad53796 100644 --- a/cpp/src/arrow/util/float16.h +++ b/cpp/src/arrow/util/float16.h @@ -111,11 +111,7 @@ class ARROW_EXPORT Float16 { } /// \brief Return the value's bytes in little-endian byte order constexpr std::array ToLittleEndian() const { -#if ARROW_LITTLE_ENDIAN return {uint8_t(bits_ & 0xff), uint8_t(bits_ >> 8)}; -#else - return {uint8_t(bits_ >> 8), uint8_t(bits_ & 0xff)}; -#endif } /// \brief Copy the value's bytes in big-endian byte order @@ -125,11 +121,7 @@ class ARROW_EXPORT Float16 { } /// \brief Return the value's bytes in big-endian byte order constexpr std::array ToBigEndian() const { -#if ARROW_LITTLE_ENDIAN return {uint8_t(bits_ >> 8), uint8_t(bits_ & 0xff)}; -#else - return {uint8_t(bits_ & 0xff), uint8_t(bits_ >> 8)}; -#endif } constexpr Float16 operator-() const { return FromBits(bits_ ^ 0x8000); } diff --git a/cpp/src/arrow/util/float16_test.cc b/cpp/src/arrow/util/float16_test.cc index 073375882e3c2..5918381a26997 100644 --- a/cpp/src/arrow/util/float16_test.cc +++ 
b/cpp/src/arrow/util/float16_test.cc @@ -323,44 +323,53 @@ TEST(Float16Test, Compare) { TEST(Float16Test, ToBytes) { constexpr auto f16 = Float16::FromBits(0xd01c); std::array bytes; - auto load = [&bytes]() { return SafeLoadAs(bytes.data()); }; + + constexpr uint8_t expected_high = 0xd0; + constexpr uint8_t expected_low = 0x1c; // Test native-endian f16.ToBytes(bytes.data()); - ASSERT_EQ(load(), 0xd01c); +#if ARROW_LITTLE_ENDIAN + ASSERT_EQ(bytes[0], expected_low); + ASSERT_EQ(bytes[1], expected_high); +#else + ASSERT_EQ(bytes[0], expected_high); + ASSERT_EQ(bytes[1], expected_low); +#endif bytes = f16.ToBytes(); - ASSERT_EQ(load(), 0xd01c); - #if ARROW_LITTLE_ENDIAN - constexpr uint16_t expected_le = 0xd01c; - constexpr uint16_t expected_be = 0x1cd0; + ASSERT_EQ(bytes[0], expected_low); + ASSERT_EQ(bytes[1], expected_high); #else - constexpr uint16_t expected_le = 0x1cd0; - constexpr uint16_t expected_be = 0xd01c; + ASSERT_EQ(bytes[0], expected_high); + ASSERT_EQ(bytes[1], expected_low); #endif + // Test little-endian f16.ToLittleEndian(bytes.data()); - ASSERT_EQ(load(), expected_le); + ASSERT_EQ(bytes[0], expected_low); + ASSERT_EQ(bytes[1], expected_high); bytes = f16.ToLittleEndian(); - ASSERT_EQ(load(), expected_le); + ASSERT_EQ(bytes[0], expected_low); + ASSERT_EQ(bytes[1], expected_high); // Test big-endian f16.ToBigEndian(bytes.data()); - ASSERT_EQ(load(), expected_be); + ASSERT_EQ(bytes[0], expected_high); + ASSERT_EQ(bytes[1], expected_low); bytes = f16.ToBigEndian(); - ASSERT_EQ(load(), expected_be); + ASSERT_EQ(bytes[0], expected_high); + ASSERT_EQ(bytes[1], expected_low); } TEST(Float16Test, FromBytes) { - constexpr uint16_t u16 = 0xd01c; - const auto* data = reinterpret_cast(&u16); - ASSERT_EQ(Float16::FromBytes(data), Float16::FromBits(0xd01c)); + const std::array bytes = {0x1c, 0xd0}; #if ARROW_LITTLE_ENDIAN - ASSERT_EQ(Float16::FromLittleEndian(data), Float16::FromBits(0xd01c)); - ASSERT_EQ(Float16::FromBigEndian(data), Float16::FromBits(0x1cd0)); + ASSERT_EQ(Float16::FromBytes(bytes.data()), Float16::FromBits(0xd01c)); #else - ASSERT_EQ(Float16::FromLittleEndian(data), Float16(0x1cd0)); - ASSERT_EQ(Float16::FromBigEndian(data), Float16(0xd01c)); + ASSERT_EQ(Float16::FromBytes(bytes.data()), Float16::FromBits(0x1cd0)); #endif + ASSERT_EQ(Float16::FromLittleEndian(bytes.data()), Float16::FromBits(0xd01c)); + ASSERT_EQ(Float16::FromBigEndian(bytes.data()), Float16::FromBits(0x1cd0)); } } // namespace diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index 8c4d925dac541..18d7de64a0efd 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -118,7 +118,7 @@ #endif #ifdef _WIN32 -# include +# include #else # include #endif diff --git a/cpp/src/generated/parquet_constants.cpp b/cpp/src/generated/parquet_constants.cpp deleted file mode 100644 index b1b4ce6267335..0000000000000 --- a/cpp/src/generated/parquet_constants.cpp +++ /dev/null @@ -1,17 +0,0 @@ -/** - * Autogenerated by Thrift Compiler (0.13.0) - * - * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING - * @generated - */ -#include "parquet_constants.h" - -namespace parquet { namespace format { - -const parquetConstants g_parquet_constants; - -parquetConstants::parquetConstants() { -} - -}} // namespace - diff --git a/cpp/src/generated/parquet_constants.h b/cpp/src/generated/parquet_constants.h deleted file mode 100644 index 1e288c7cd1fcb..0000000000000 --- a/cpp/src/generated/parquet_constants.h +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Autogenerated by 
Thrift Compiler (0.13.0) - * - * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING - * @generated - */ -#ifndef parquet_CONSTANTS_H -#define parquet_CONSTANTS_H - -#include "parquet_types.h" - -namespace parquet { namespace format { - -class parquetConstants { - public: - parquetConstants(); - -}; - -extern const parquetConstants g_parquet_constants; - -}} // namespace - -#endif diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index e43a254fb616a..9c28b749e4319 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -140,8 +140,6 @@ set(PARQUET_THRIFT_SOURCE_DIR "${ARROW_SOURCE_DIR}/src/generated/") set_source_files_properties("${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp" "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.h" - "${PARQUET_THRIFT_SOURCE_DIR}/parquet_constants.cpp" - "${PARQUET_THRIFT_SOURCE_DIR}/parquet_constants.h" PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) @@ -178,7 +176,6 @@ set(PARQUET_SRCS metadata.cc xxhasher.cc page_index.cc - "${PARQUET_THRIFT_SOURCE_DIR}/parquet_constants.cpp" "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp" platform.cc printer.cc @@ -307,7 +304,6 @@ add_arrow_lib(parquet if(WIN32 AND NOT (ARROW_TEST_LINKAGE STREQUAL "static")) add_library(parquet_test_support STATIC - "${PARQUET_THRIFT_SOURCE_DIR}/parquet_constants.cpp" "${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp") target_link_libraries(parquet_test_support thrift::thrift) list(PREPEND PARQUET_TEST_LINK_LIBS parquet_test_support) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 73974f9b2a888..b5fd2bc2553d9 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -448,9 +448,8 @@ void DoSimpleRoundtrip(const std::shared_ptr& table, bool use_threads, ASSERT_NO_FATAL_FAILURE( WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer)); - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); reader->set_use_threads(use_threads); if (column_subset.size() > 0) { @@ -1095,8 +1094,7 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWriteArrowIO) { auto source = std::make_shared(pbuffer); std::shared_ptr<::arrow::Table> out; - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(source, ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(source, ::arrow::default_memory_pool())); ASSERT_NO_FATAL_FAILURE(this->ReadTableFromFile(std::move(reader), &out)); ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(values->length(), out->num_rows()); @@ -2295,9 +2293,8 @@ TEST(TestArrowReadWrite, ReadSingleRowGroup) { ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, num_rows / 2, default_arrow_writer_properties(), &buffer)); - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); ASSERT_EQ(2, reader->num_row_groups()); @@ -2357,9 +2354,8 @@ TEST(TestArrowReadWrite, ReadTableManually) { ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(expected, num_rows / 2, default_arrow_writer_properties(), &buffer)); - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - 
::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); ASSERT_EQ(2, reader->num_row_groups()); @@ -2476,9 +2472,8 @@ TEST(TestArrowReadWrite, CoalescedReadsAndNonCoalescedReads) { ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(expected, num_rows / 2, default_arrow_writer_properties(), &buffer)); - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); ASSERT_EQ(2, reader->num_row_groups()); @@ -2594,9 +2589,8 @@ TEST(TestArrowReadWrite, ScanContents) { ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, num_rows / 2, default_arrow_writer_properties(), &buffer)); - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); int64_t num_rows_returned = 0; ASSERT_OK_NO_THROW(reader->ScanContents({}, 256, &num_rows_returned)); @@ -2689,9 +2683,8 @@ TEST(TestArrowReadWrite, ListLargeRecords) { ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size, default_arrow_writer_properties(), &buffer)); - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); // Read everything std::shared_ptr
result; @@ -2699,8 +2692,8 @@ TEST(TestArrowReadWrite, ListLargeRecords) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); // Read 1 record at a time - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); std::unique_ptr col_reader; ASSERT_OK(reader->GetColumn(0, &col_reader)); @@ -2974,9 +2967,8 @@ TEST(ArrowReadWrite, DecimalStats) { ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, /*row_group_size=*/100, default_arrow_writer_properties(), &buffer)); - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); std::shared_ptr min, max; ReadSingleColumnFileStatistics(std::move(reader), &min, &max); @@ -3575,8 +3567,8 @@ class TestNestedSchemaRead : public ::testing::TestWithParam { void InitReader() { ASSERT_OK_AND_ASSIGN(auto buffer, nested_parquet_->Finish()); - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader_)); + ASSERT_OK_AND_ASSIGN(reader_, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); } void InitNewParquetFile(const std::shared_ptr& schema, int num_rows) { @@ -5344,8 +5336,8 @@ TEST(TestArrowReadWrite, MultithreadedWrite) { // Read to verify the data. std::shared_ptr
result; - std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), pool, &reader)); + ASSERT_OK_AND_ASSIGN(auto reader, + OpenFile(std::make_shared(buffer), pool)); ASSERT_OK_NO_THROW(reader->ReadTable(&result)); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 4f57c3f4f56f7..3002d90b5fff8 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1372,9 +1372,14 @@ Result> FileReaderBuilder::Build() { Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, std::unique_ptr* reader) { + return OpenFile(std::move(file), pool).Value(reader); +} + +Result> OpenFile( + std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool) { FileReaderBuilder builder; RETURN_NOT_OK(builder.Open(std::move(file))); - return builder.memory_pool(pool)->Build(reader); + return builder.memory_pool(pool)->Build(); } namespace internal { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 6e46ca43f7b18..ec996a5afa615 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -357,11 +357,21 @@ class PARQUET_EXPORT FileReaderBuilder { /// \brief Build FileReader from Arrow file and MemoryPool /// /// Advanced settings are supported through the FileReaderBuilder class. +/// +/// \deprecated Deprecated in 19.0.0. Use arrow::Result version instead. +ARROW_DEPRECATED("Deprecated in 19.0.0. Use arrow::Result version instead.") PARQUET_EXPORT ::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator, std::unique_ptr* reader); +/// \brief Build FileReader from Arrow file and MemoryPool +/// +/// Advanced settings are supported through the FileReaderBuilder class. 
+PARQUET_EXPORT +::arrow::Result> OpenFile( + std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator); + /// @} PARQUET_EXPORT diff --git a/cpp/src/parquet/arrow/reconstruct_internal_test.cc b/cpp/src/parquet/arrow/reconstruct_internal_test.cc index 4e1f421498e85..ecdbcc5a3da14 100644 --- a/cpp/src/parquet/arrow/reconstruct_internal_test.cc +++ b/cpp/src/parquet/arrow/reconstruct_internal_test.cc @@ -189,7 +189,9 @@ class FileTester { protected: Status Open(std::shared_ptr buffer, MemoryPool* pool) { pool_ = pool; - return OpenFile(std::make_shared(buffer), pool_, &file_reader_); + ARROW_ASSIGN_OR_RAISE(file_reader_, + OpenFile(std::make_shared(buffer), pool_)); + return Status::OK(); } MemoryPool* pool_; diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 369a49744b49d..5d8ccb861060b 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -90,8 +90,8 @@ ARROW_OPENTELEMETRY_BUILD_VERSION=v1.13.0 ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM=7735cc56507149686e6019e06f588317099d4522480be5f38a2a09ec69af1706 ARROW_OPENTELEMETRY_PROTO_BUILD_VERSION=v0.17.0 ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM=f269fbcb30e17b03caa1decd231ce826e59d7651c0f71c3b28eb5140b4bb5412 -ARROW_ORC_BUILD_VERSION=2.0.1 -ARROW_ORC_BUILD_SHA256_CHECKSUM=1ffac0228aa83f04a1b1cf2788a3af5953e82587ae3a77c41900e99f2557132d +ARROW_ORC_BUILD_VERSION=2.0.3 +ARROW_ORC_BUILD_SHA256_CHECKSUM=082cba862b5a8a0d14c225404d0b51cd8d1b64ca81b8f1e500322ce8922cb86d ARROW_PROTOBUF_BUILD_VERSION=v21.3 ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM=2f723218f6cb709ae4cdc4fb5ed56a5951fc5d466f0128ce4c946b8c78c8c49f # Because of https://github.com/Tencent/rapidjson/pull/1323, we require diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index b8f69672cbc7c..6c1b64fd1ef41 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -13,7 +13,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 7fbba13b2cc0d..944a5add28d7b 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -8,11 +8,11 @@ - + - + diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index a845f8e693695..301894f93708d 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -12,10 +12,10 @@ - - - - + + + + diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index dda1d36dc1aeb..5057667eb94d1 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -25,8 +25,6 @@ cpp/build-support/iwyu/* cpp/cmake_modules/FindPythonLibsNew.cmake cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake cpp/src/parquet/.parquetcppversion -cpp/src/generated/parquet_constants.cpp -cpp/src/generated/parquet_constants.h cpp/src/generated/parquet_types.cpp cpp/src/generated/parquet_types.h cpp/src/generated/substrait/* diff --git a/docs/source/cpp/api/array.rst b/docs/source/cpp/api/array.rst index a87e3810c47a0..b17d1957a8b66 100644 --- a/docs/source/cpp/api/array.rst +++ b/docs/source/cpp/api/array.rst @@ -22,6 +22,10 @@ Arrays Base classes ============ +.. 
doxygenclass:: arrow::ArrayStatistics + :project: arrow_cpp + :members: + .. doxygenclass:: arrow::ArrayData :project: arrow_cpp :members: diff --git a/java/bom/pom.xml b/java/bom/pom.xml index 5e118bae183c9..ccb70d5fb339d 100644 --- a/java/bom/pom.xml +++ b/java/bom/pom.xml @@ -208,7 +208,7 @@ under the License. org.codehaus.mojo versions-maven-plugin - 2.17.1 + 2.18.0 diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index 86b1e5f467c31..0c1f55dd69edb 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -32,7 +32,7 @@ under the License. ../../../cpp/release-build/ - 1.14.3 + 1.14.4 1.12.0 diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 461c415535764..374f6fcda7e09 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -134,7 +134,7 @@ under the License. com.google.api.grpc proto-google-common-protos - 2.48.0 + 2.49.0 test diff --git a/java/pom.xml b/java/pom.xml index 9e876e302cbf1..f2c8d8f1f6513 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -96,9 +96,9 @@ under the License. 5.11.3 2.0.16 33.3.1-jre - 4.1.114.Final + 4.1.115.Final 1.65.0 - 3.25.4 + 3.25.5 2.18.1 3.4.1 24.3.25 @@ -519,7 +519,7 @@ under the License. org.codehaus.mojo versions-maven-plugin - 2.17.1 + 2.18.0 pl.project13.maven diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 3a4fa1ab611a7..fd50215cee9ae 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -3716,10 +3716,13 @@ cdef class Scanner(_Weakrefable): Parameters ---------- - source : Iterator - The iterator of Batches. + source : Iterator or Arrow-compatible stream object + The iterator of Batches. This can be a pyarrow RecordBatchReader, + any object that implements the Arrow PyCapsule Protocol for + streams, or an actual Python iterator of RecordBatches. schema : Schema - The schema of the batches. + The schema of the batches (required when passing a Python + iterator). columns : list[str] or dict[str, Expression], default None The columns to project. This can be a list of column names to include (order and duplicates will be preserved), or a dictionary @@ -3775,6 +3778,12 @@ cdef class Scanner(_Weakrefable): raise ValueError('Cannot specify a schema when providing ' 'a RecordBatchReader') reader = source + elif hasattr(source, "__arrow_c_stream__"): + if schema: + raise ValueError( + 'Cannot specify a schema when providing an object ' + 'implementing the Arrow PyCapsule Protocol') + reader = pa.ipc.RecordBatchReader.from_stream(source) elif _is_iterable(source): if schema is None: raise ValueError('Must provide schema to construct scanner ' diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index dbfb6ed114553..e315dd6381f41 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -939,7 +939,7 @@ cdef class FileSystem(_Weakrefable): ... f.write(b'+newly added') 12 - Print out the content fo the file: + Print out the content to the file: >>> with local.open_input_file(path) as f: ... print(f.readall()) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index ba6603322838d..038f9109d8152 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -249,7 +249,7 @@ cdef class S3FileSystem(FileSystem): rather than issue two dependent I/O calls. If true, when creating a directory the code will only create the directory when necessary at the cost of extra I/O calls. 
This can be used for key/value cloud storage which has - a hard rate limit to number of object mutation operations or scenerios such as + a hard rate limit to number of object mutation operations or scenarios such as the directories already exist and you do not have creation access. retry_strategy : S3RetryStrategy, default AwsStandardS3RetryStrategy(max_attempts=3) The retry strategy to use with S3; fail after max_attempts. Available diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index eaedbf1e38580..8bddc34e1000b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1174,7 +1174,12 @@ cdef class Array(_PandasConvertible): "({0}) did not match the passed number " "({1}).".format(type.num_fields, len(children))) - if type.num_buffers != len(buffers): + if type.has_variadic_buffers: + if type.num_buffers > len(buffers): + raise ValueError("Type's expected number of buffers is at least " + "{0}, but the passed number is " + "{1}.".format(type.num_buffers, len(buffers))) + elif type.num_buffers != len(buffers): raise ValueError("Type's expected number of buffers " "({0}) did not match the passed number " "({1}).".format(type.num_buffers, len(buffers))) diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 1efbfe1665a75..c61e13ee75801 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -964,7 +964,11 @@ def file_visitor(written_file): elif isinstance(data, (pa.RecordBatch, pa.Table)): schema = schema or data.schema data = InMemoryDataset(data, schema=schema) - elif isinstance(data, pa.ipc.RecordBatchReader) or _is_iterable(data): + elif ( + isinstance(data, pa.ipc.RecordBatchReader) + or hasattr(data, "__arrow_c_stream__") + or _is_iterable(data) + ): data = Scanner.from_batches(data, schema=schema) schema = None elif not isinstance(data, (Dataset, Scanner)): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index a70cb91873e45..8bf61b73cc211 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -158,6 +158,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CDataTypeLayout" arrow::DataTypeLayout": vector[CBufferSpec] buffers + optional[CBufferSpec] variadic_spec c_bool has_dictionary cdef cppclass CDataType" arrow::DataType": diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 1d942e8ccabc6..b3de15067fbfa 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1761,7 +1761,7 @@ cdef class CompressedInputStream(NativeFile): Examples -------- - Create an output stream wich compresses the data: + Create an output stream which compresses the data: >>> import pyarrow as pa >>> data = b"Compressed stream" @@ -1818,7 +1818,7 @@ cdef class CompressedOutputStream(NativeFile): Examples -------- - Create an output stream wich compresses the data: + Create an output stream which compresses the data: >>> import pyarrow as pa >>> data = b"Compressed stream" @@ -2342,7 +2342,7 @@ cdef class CacheOptions(_Weakrefable): def from_network_metrics(time_to_first_byte_millis, transfer_bandwidth_mib_per_sec, ideal_bandwidth_utilization_frac=0.9, max_ideal_request_size_mib=64): """ - Create suiteable CacheOptions based on provided network metrics. + Create suitable CacheOptions based on provided network metrics. Typically this will be used with object storage solutions like Amazon S3, Google Cloud Storage and Azure Blob Storage. 
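Usage sketch for the PyCapsule stream support added above in Scanner.from_batches and ds.write_dataset (not taken from this diff; the StreamOnly wrapper, sample table, and output path are illustrative, and it assumes a pyarrow build containing these changes):

import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({"x": [1, 2, 3]})

class StreamOnly:
    # Minimal object exposing only the Arrow PyCapsule stream protocol,
    # analogous to the TableStreamWrapper helper added in test_dataset.py.
    def __init__(self, table):
        self._table = table

    def __arrow_c_stream__(self, requested_schema=None):
        return self._table.__arrow_c_stream__(requested_schema)

# Scanner.from_batches now detects __arrow_c_stream__ and wraps the source via
# RecordBatchReader.from_stream(); passing an explicit schema raises ValueError,
# mirroring the existing RecordBatchReader branch.
scanner = ds.Scanner.from_batches(StreamOnly(table))
assert scanner.to_table().equals(table)

# ds.write_dataset routes such objects through Scanner.from_batches as well
# (output path is illustrative).
ds.write_dataset(StreamOnly(table), "/tmp/stream_dataset", format="ipc")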
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 4160d64829483..885442b079c5b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -651,6 +651,32 @@ def test_string_binary_from_buffers(): assert copied.null_count == 0 +def test_string_view_from_buffers(): + array = pa.array( + [ + "String longer than 12 characters", + None, + "short", + "Length is 12" + ], type=pa.string_view()) + + buffers = array.buffers() + copied = pa.StringViewArray.from_buffers( + pa.string_view(), len(array), buffers) + copied.validate(full=True) + assert copied.to_pylist() == [ + "String longer than 12 characters", + None, + "short", + "Length is 12" + ] + + match = r"number of buffers is at least 2" + with pytest.raises(ValueError, match=match): + pa.StringViewArray.from_buffers( + pa.string_view(), len(array), buffers[0:1]) + + @pytest.mark.parametrize('list_type_factory', [ pa.list_, pa.large_list, pa.list_view, pa.large_list_view]) def test_list_from_buffers(list_type_factory): diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 772670ad79fd3..b6aaa2840d83c 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -66,6 +66,14 @@ pytestmark = pytest.mark.dataset +class TableStreamWrapper: + def __init__(self, table): + self.table = table + + def __arrow_c_stream__(self, requested_schema=None): + return self.table.__arrow_c_stream__(requested_schema) + + def _generate_data(n): import datetime import itertools @@ -2543,6 +2551,7 @@ def test_scan_iterator(use_threads): for factory, schema in ( (lambda: pa.RecordBatchReader.from_batches( batch.schema, [batch]), None), + (lambda: TableStreamWrapper(table), None), (lambda: (batch for _ in range(1)), batch.schema), ): # Scanning the fragment consumes the underlying iterator @@ -4674,15 +4683,20 @@ def test_write_iterable(tempdir): base_dir = tempdir / 'inmemory_iterable' ds.write_dataset((batch for batch in table.to_batches()), base_dir, schema=table.schema, - basename_template='dat_{i}.arrow', format="feather") + basename_template='dat_{i}.arrow', format="ipc") result = ds.dataset(base_dir, format="ipc").to_table() assert result.equals(table) base_dir = tempdir / 'inmemory_reader' reader = pa.RecordBatchReader.from_batches(table.schema, table.to_batches()) - ds.write_dataset(reader, base_dir, - basename_template='dat_{i}.arrow', format="feather") + ds.write_dataset(reader, base_dir, basename_template='dat_{i}.arrow', format="ipc") + result = ds.dataset(base_dir, format="ipc").to_table() + assert result.equals(table) + + base_dir = tempdir / 'inmemory_pycapsule' + stream = TableStreamWrapper(table) + ds.write_dataset(stream, base_dir, basename_template='dat_{i}.arrow', format="ipc") result = ds.dataset(base_dir, format="ipc").to_table() assert result.equals(table) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index fef350d5de958..de439b6bb8cd7 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -887,6 +887,14 @@ def test_types_weakref(): assert wr() is None # not a singleton +def test_types_has_variadic_buffers(): + for ty in get_many_types(): + if ty in (pa.string_view(), pa.binary_view()): + assert ty.has_variadic_buffers + else: + assert not ty.has_variadic_buffers + + def test_fields_hashable(): in_dict = {} fields = [pa.field('a', pa.int32()), diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 
4aa8238556a9c..0d6787cf2a049 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -326,6 +326,22 @@ cdef class DataType(_Weakrefable): """ return self.type.layout().buffers.size() + @property + def has_variadic_buffers(self): + """ + If True, the number of expected buffers is only + lower-bounded by num_buffers. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().has_variadic_buffers + False + >>> pa.string_view().has_variadic_buffers + True + """ + return self.type.layout().variadic_spec.has_value() + def __str__(self): return frombytes(self.type.ToString(), safe=True) diff --git a/python/pyproject.toml b/python/pyproject.toml index 32b95254f217d..85bdbec0915ed 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -20,7 +20,7 @@ requires = [ "cython >= 0.29.31", # Starting with NumPy 1.25, NumPy is (by default) as far back compatible # as oldest-support-numpy was (customizable with a NPY_TARGET_VERSION - # define). For older Python versions (where NumPy 1.25 is not yet avaiable) + # define). For older Python versions (where NumPy 1.25 is not yet available) # continue using oldest-support-numpy. "oldest-supported-numpy>=0.14; python_version<'3.9'", "numpy>=1.25; python_version>='3.9'", diff --git a/ruby/red-arrow/lib/arrow/list-array-builder.rb b/ruby/red-arrow/lib/arrow/list-array-builder.rb index d889c8a0cf411..0b1d17f0a5491 100644 --- a/ruby/red-arrow/lib/arrow/list-array-builder.rb +++ b/ruby/red-arrow/lib/arrow/list-array-builder.rb @@ -54,6 +54,7 @@ def append_value(*args) when nil append_null when ::Array + return if value.empty? append_value_raw @value_builder ||= value_builder @value_builder.append(*value) diff --git a/ruby/red-arrow/test/test-list-array-builder.rb b/ruby/red-arrow/test/test-list-array-builder.rb index aee31e73b1b96..91105e92bf8d0 100644 --- a/ruby/red-arrow/test/test-list-array-builder.rb +++ b/ruby/red-arrow/test/test-list-array-builder.rb @@ -33,6 +33,15 @@ def setup array = @builder.finish assert_equal([true, false, true], array[0].to_a) end + + test("Struct[]") do + item_type = Arrow::StructDataType.new([{name: "visible", type: :boolean}]) + data_type = Arrow::ListDataType.new(name: "struct", data_type: item_type) + builder = Arrow::ListArrayBuilder.new(data_type) + builder.append_value([]) + array = builder.finish + assert_equal([], array[0].to_a) + end end sub_test_case("#append_values") do
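Usage sketch for the DataType.has_variadic_buffers property added in types.pxi and the relaxed Array.from_buffers check exercised in test_array.py/test_types.py above (not taken from this diff; the sample strings are illustrative and it assumes a pyarrow build containing these changes):

import pyarrow as pa

# Non-view types keep a fixed buffer count; view types advertise a variadic tail.
assert not pa.int64().has_variadic_buffers
assert pa.string_view().has_variadic_buffers

arr = pa.array(["String longer than 12 characters", None, "short"],
               type=pa.string_view())
buffers = arr.buffers()  # validity + views, plus zero or more variadic data buffers

# from_buffers now accepts at least num_buffers (2) buffers for string_view ...
copied = pa.StringViewArray.from_buffers(pa.string_view(), len(arr), buffers)
copied.validate(full=True)
assert copied.to_pylist() == arr.to_pylist()

# ... and still rejects fewer than the minimum.
try:
    pa.StringViewArray.from_buffers(pa.string_view(), len(arr), buffers[:1])
except ValueError as exc:
    print(exc)  # "... number of buffers is at least 2 ..."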